In [1]:
import pandas as pd
import numpy as np
import pickle

import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Increase the amount of rows and columns that we can display #
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)

In [3]:
# Loading our data ... #
animes = pd.read_csv('./Data/anime.csv')
ratings = pd.read_csv('./Data/rating.csv')

In [4]:
# Checking the dimensions of our dataframes #
animes.shape, ratings.shape

((12294, 7), (7813737, 3))

In [5]:
# Removing Whitespace (if any) in Column Headers #
animes.columns = animes.columns.str.strip()
ratings.columns = ratings.columns.str.strip()

In [6]:
# Dropping the average user rating for the anime # 
animes.drop('rating', axis=1, inplace=True)

In [None]:
# Creating a series which shows how many animes are unrated in a user's profile #
unwatched_ratings_count = ratings.groupby('user_id')['rating'].apply(lambda x:
                                                                     x[x == -1].count())

In [None]:
unwatched_ratings_count = unwatched_ratings_count.rename('unrated_count')

In [68]:
# Creating a series which shows the total number of shows in the user's profile #
anime_watchlist_count = ratings.groupby('user_id')['rating'].count()

In [69]:
anime_watchlist_count = anime_watchlist_count.rename('watchlist_count')

In [70]:
# Creating a new dataframe which combines their total watched and watched animes #
user_ratings = pd.concat([anime_watchlist_count, unwatched_ratings_count],
               axis=1).reset_index()

In [71]:
user_ratings.head()

Unnamed: 0,user_id,watchlist_count,unrated_count
0,1,153,149
1,2,3,2
2,3,94,2
3,4,52,52
4,5,467,8


In [73]:
user_ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 73515 entries, 0 to 73514
Data columns (total 3 columns):
user_id            73515 non-null int64
watchlist_count    73515 non-null int64
unrated_count      73515 non-null int64
dtypes: int64(3)
memory usage: 1.7 MB


In [75]:
# Removing users that have not rated any anime #
user_ratings_2 = user_ratings[user_ratings.watchlist_count != user_ratings.unrated_count]

In [78]:
# Creating a new column that shows the percentage of unrated animes in a user's list #
user_ratings_2['percent_unrated'] = user_ratings_2.unrated_count / user_ratings_2.watchlist_count

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [89]:
# Creating a new rating df that removes users with more than 85% unrated anime in their list #
ratings_df = user_ratings_2.loc[user_ratings_2.percent_unrated <= 0.85]

In [94]:
ratings_df = ratings.merge(ratings_df, on='user_id', how='inner')

In [97]:
ratings_df = ratings_df.drop(['watchlist_count', 'unrated_count', 'percent_unrated'],
                             axis=1)

In [99]:
# Creating a subset dataframe which only consists of rows in which the user has rated the anime #
rated_anime_df = ratings_df[ratings_df.rating != -1]

In [100]:
rated_anime_df

Unnamed: 0,user_id,anime_id,rating
0,2,11771,10
3,3,20,8
4,3,154,6
5,3,170,9
6,3,199,10
...,...,...,...
6924595,73515,16512,7
6924596,73515,17187,9
6924597,73515,22145,10
6924598,73516,790,9


In [104]:
# Creating a series which shows the number of ratings for each anime #
anime_ratings_count = rated_anime_df.groupby('anime_id')['rating'].count().sort_values()

In [108]:
# Number of unique anime that have been rated at least once #
len(anime_ratings_count)

9925

### Given than we have about 66k unique users. We will drop the animes which do not have 300 ratings, which translates to less than 0.5% of users rating the anime. We believe that these are niche animes and would not generalise to an average user.

In [118]:
animes_to_keep = anime_ratings_count[anime_ratings_count > 300]

In [120]:
animes_to_keep = animes_to_keep.rename('total_ratings')

In [121]:
ratings_df = ratings_df.merge(animes_to_keep, on='anime_id', how='inner')

In [126]:
ratings_df = ratings_df.drop('total_ratings', axis=1)

In [129]:
ratings_df

Unnamed: 0,user_id,anime_id,rating
0,2,11771,10
1,3,11771,10
2,14,11771,7
3,17,11771,9
4,21,11771,8
...,...,...,...
6479644,73206,942,9
6479645,73301,942,8
6479646,73308,942,-1
6479647,73317,942,7


In [132]:
with open('anime_ratings.pickle', 'wb') as file_out:
    pickle.dump(ratings_df, file_out)