In [1]:
# Data processing
import pandas as pd
import numpy as np
import scipy.stats

# Visualization
import seaborn as sb

# Similarity
from sklearn.metrics.pairwise import cosine_similarity

# Other
import math
import random
import sklearn
from nltk.corpus import stopwords
from scipy.sparse import csr_matrix
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse.linalg import svds
from sklearn.preprocessing import MinMaxScaler
import matplotlib.pyplot as plt

In [2]:
ratings = pd.read_csv("../archive/rating_complete.csv")
ratings

Unnamed: 0,user_id,anime_id,rating
0,0,430,9
1,0,1004,5
2,0,3010,7
3,0,570,7
4,0,2762,9
...,...,...,...
57633273,353404,502,8
57633274,353404,987,4
57633275,353404,225,8
57633276,353404,243,7


In [3]:
anime_df = pd.read_csv("../archive/anime.csv")
anime_df["anime_id"] = anime_df["MAL_ID"]
anime_df

Unnamed: 0,MAL_ID,Name,Score,Genres,English name,Japanese name,Type,Episodes,Aired,Premiered,...,Score-9,Score-8,Score-7,Score-6,Score-5,Score-4,Score-3,Score-2,Score-1,anime_id
0,1,Cowboy Bebop,8.78,"Action, Adventure, Comedy, Drama, Sci-Fi, Space",Cowboy Bebop,カウボーイビバップ,TV,26,"Apr 3, 1998 to Apr 24, 1999",Spring 1998,...,182126.0,131625.0,62330.0,20688.0,8904.0,3184.0,1357.0,741.0,1580.0,1
1,5,Cowboy Bebop: Tengoku no Tobira,8.39,"Action, Drama, Mystery, Sci-Fi, Space",Cowboy Bebop:The Movie,カウボーイビバップ 天国の扉,Movie,1,"Sep 1, 2001",Unknown,...,49201.0,49505.0,22632.0,5805.0,1877.0,577.0,221.0,109.0,379.0,5
2,6,Trigun,8.24,"Action, Sci-Fi, Adventure, Comedy, Drama, Shounen",Trigun,トライガン,TV,26,"Apr 1, 1998 to Sep 30, 1998",Spring 1998,...,75651.0,86142.0,49432.0,15376.0,5838.0,1965.0,664.0,316.0,533.0,6
3,7,Witch Hunter Robin,7.27,"Action, Mystery, Police, Supernatural, Drama, ...",Witch Hunter Robin,Witch Hunter ROBIN (ウイッチハンターロビン),TV,26,"Jul 2, 2002 to Dec 24, 2002",Summer 2002,...,4806.0,10128.0,11618.0,5709.0,2920.0,1083.0,353.0,164.0,131.0,7
4,8,Bouken Ou Beet,6.98,"Adventure, Fantasy, Shounen, Supernatural",Beet the Vandel Buster,冒険王ビィト,TV,52,"Sep 30, 2004 to Sep 29, 2005",Fall 2004,...,529.0,1242.0,1713.0,1068.0,634.0,265.0,83.0,50.0,27.0,8
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17557,48481,Daomu Biji Zhi Qinling Shen Shu,Unknown,"Adventure, Mystery, Supernatural",Unknown,盗墓笔记之秦岭神树,ONA,Unknown,"Apr 4, 2021 to ?",Unknown,...,Unknown,Unknown,1.0,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,48481
17558,48483,Mieruko-chan,Unknown,"Comedy, Horror, Supernatural",Unknown,見える子ちゃん,TV,Unknown,2021 to ?,Unknown,...,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,48483
17559,48488,Higurashi no Naku Koro ni Sotsu,Unknown,"Mystery, Dementia, Horror, Psychological, Supe...",Higurashi:When They Cry – SOTSU,ひぐらしのなく頃に卒,TV,Unknown,"Jul, 2021 to ?",Summer 2021,...,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,48488
17560,48491,Yama no Susume: Next Summit,Unknown,"Adventure, Slice of Life, Comedy",Unknown,ヤマノススメ Next Summit,TV,Unknown,Unknown,Unknown,...,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,48491


In [4]:
df = pd.merge(ratings, anime_df[['anime_id','Name', 'Genres']], on="anime_id", how="inner")
df

Unnamed: 0,user_id,anime_id,rating,Name,Genres
0,0,430,9,Fullmetal Alchemist: The Conqueror of Shamballa,"Military, Comedy, Historical, Drama, Fantasy, ..."
1,6,430,8,Fullmetal Alchemist: The Conqueror of Shamballa,"Military, Comedy, Historical, Drama, Fantasy, ..."
2,18,430,10,Fullmetal Alchemist: The Conqueror of Shamballa,"Military, Comedy, Historical, Drama, Fantasy, ..."
3,19,430,8,Fullmetal Alchemist: The Conqueror of Shamballa,"Military, Comedy, Historical, Drama, Fantasy, ..."
4,33,430,4,Fullmetal Alchemist: The Conqueror of Shamballa,"Military, Comedy, Historical, Drama, Fantasy, ..."
...,...,...,...,...,...
57633273,315549,38853,1,Ex-Arm,"Action, Sci-Fi, Ecchi, Seinen"
57633274,350024,38853,10,Ex-Arm,"Action, Sci-Fi, Ecchi, Seinen"
57633275,311855,39435,6,Oshiri Tantei Movie 1: Curry Naru Jiken,"Mystery, Comedy, Kids, Fantasy"
57633276,334473,35746,4,6 Lovers,"Comedy, Drama, Romance, School, Shounen Ai, Sl..."


# Data Preparation
* 57,633,278 rows are to many entries to compute, let's reduce the dataset

In [5]:
# User ratings
print(df["user_id"].value_counts().mean())
df["user_id"].value_counts()

185.87842313882197


189037    15455
162615    14864
68042     13462
283786    12778
259790     9996
          ...  
136422        1
213994        1
196985        1
190834        1
206653        1
Name: user_id, Length: 310059, dtype: int64

In [6]:
# Anime ratings
print(df["anime_id"].value_counts().mean())
df["anime_id"].value_counts()

3415.912636320531


1535     182375
16498    169794
11757    161192
6547     141127
30276    138924
          ...  
39686         1
39685         1
35153         1
40594         1
39627         1
Name: anime_id, Length: 16872, dtype: int64

In [159]:
# We are left still with 38,616,425 entries even after filtering out users and animes
# so let's sample the remaining dataset so we get a smaller one

df_sample = df.sample(frac=0.05)
df_sample # 34622 entries seems more logical for first runs so we can re-run faster

Unnamed: 0,user_id,anime_id,rating,Name,Genres
16278079,65843,5205,9,Kara no Kyoukai 7: Satsujin Kousatsu (Go),"Action, Mystery, Supernatural, Romance, Thriller"
19109832,325455,853,6,Ouran Koukou Host Club,"Comedy, Harem, Romance, School, Shoujo"
32128671,8695,12531,7,Sakamichi no Apollon,"Drama, Josei, Music, Romance, School"
5698476,34334,19815,8,No Game No Life,"Game, Adventure, Comedy, Supernatural, Ecchi, ..."
38224583,78404,34104,7,Knight's & Magic,"Action, Fantasy, Mecha, School"
...,...,...,...,...,...
36793630,60045,9760,8,Hoshi wo Ou Kodomo,"Adventure, Romance, Fantasy"
38882700,247568,50,7,Aa! Megami-sama! (TV),"Comedy, Supernatural, Magic, Romance, Seinen"
25709556,130798,23321,6,Log Horizon 2nd Season,"Action, Game, Adventure, Magic, Fantasy"
30064633,52106,33080,8,Brotherhood: Final Fantasy XV,Action


In [8]:
# df.info()
# User ratings
print("USERS \n")
print(df_sample["user_id"].value_counts().mean())
print(df_sample["user_id"].value_counts())
print("\n")

print("ANIMES \n")
# Anime ratings
print(df_sample["anime_id"].value_counts().mean())
print(df_sample["anime_id"].value_counts())

USERS 

3.035680236814799
162615    160
189037    156
283786    147
68042     123
291207    101
         ... 
297286      1
128300      1
16732       1
238946      1
32615       1
Name: user_id, Length: 189853, dtype: int64


ANIMES 

49.45366397803329
1535     1797
16498    1696
11757    1562
6547     1420
19815    1405
         ... 
41802       1
6971        1
24995       1
36362       1
41421       1
Name: anime_id, Length: 11654, dtype: int64


In [163]:
# Filter out users with less than 150 ratings and animes with less than 3500 ratings
df_sample_anime = df_sample.groupby("anime_id").filter(lambda x: len(x) > 100) # 16872
df_sample_anime

Unnamed: 0,user_id,anime_id,rating,Name,Genres
16278079,65843,5205,9,Kara no Kyoukai 7: Satsujin Kousatsu (Go),"Action, Mystery, Supernatural, Romance, Thriller"
19109832,325455,853,6,Ouran Koukou Host Club,"Comedy, Harem, Romance, School, Shoujo"
32128671,8695,12531,7,Sakamichi no Apollon,"Drama, Josei, Music, Romance, School"
5698476,34334,19815,8,No Game No Life,"Game, Adventure, Comedy, Supernatural, Ecchi, ..."
38224583,78404,34104,7,Knight's & Magic,"Action, Fantasy, Mecha, School"
...,...,...,...,...,...
36793630,60045,9760,8,Hoshi wo Ou Kodomo,"Adventure, Romance, Fantasy"
38882700,247568,50,7,Aa! Megami-sama! (TV),"Comedy, Supernatural, Magic, Romance, Seinen"
25709556,130798,23321,6,Log Horizon 2nd Season,"Action, Game, Adventure, Magic, Fantasy"
30064633,52106,33080,8,Brotherhood: Final Fantasy XV,Action


In [165]:
df_sample_users = df_sample_anime.groupby("user_id").filter(lambda x: len(x) > 100) # 310059
df_sample_users

Unnamed: 0,user_id,anime_id,rating,Name,Genres
55059847,318585,26351,8,Nagato Yuki-chan no Shoushitsu,"Comedy, Romance, School, Seinen, Slice of Life"
48337942,10255,400,8,Seihou Bukyou Outlaw Star,"Action, Sci-Fi, Adventure, Space, Comedy"
31599516,330851,19021,7,Takanashi Rikka Kai: Chuunibyou demo Koi ga Sh...,"Comedy, Drama, Romance, School, Slice of Life"
18406369,127483,158,10,Maria-sama ga Miteru,"Slice of Life, Drama, Romance, Shoujo, Shoujo Ai"
48577019,189037,1412,7,Lupin III,"Action, Adventure, Mystery, Comedy, Seinen"
...,...,...,...,...,...
36079568,234983,28735,9,Shouwa Genroku Rakugo Shinjuu,"Drama, Historical, Josei"
36181167,187292,28069,7,Shigatsu wa Kimi no Uso: Moments,"Music, Shounen"
30954655,216807,10893,8,Kyousou Giga,"Action, Fantasy, Supernatural"
46434219,189037,15613,7,Hakkenden: Touhou Hakken Ibun,"Action, Demons, Supernatural, Fantasy, Shoujo"


In [166]:
# User ratings
print("USERS \n")
print(df_sample_users["user_id"].value_counts().mean())
print(df_sample_users["user_id"].value_counts().head())
print("\n")

print("ANIMES \n")
# Anime ratings
print(df_sample_users["anime_id"].value_counts().mean())
print(df_sample_users["anime_id"].value_counts().head())

USERS 

122.8433734939759
189037    198
68042     192
277841    182
283786    175
55748     171
Name: user_id, dtype: int64


ANIMES 

5.28564022809746
20785    17
5678     17
11771    16
457      16
28825    16
Name: anime_id, dtype: int64


In [167]:
df_sample = df_sample_users

# Evaluation
Divide df into training and test set (80 - 20)

In [168]:
train_df, test_df = train_test_split(df_sample.drop(columns={"Name","Genres"}),
                                #    stratify=df_sample['user_id'], 
                                   test_size=0.20,
                                   random_state=42)

print('# ratings on Train set: %d' % len(train_df))
print('# ratings on Test set: %d' % len(test_df))
train_df

# ratings on Train set: 16313
# ratings on Test set: 4079


Unnamed: 0,user_id,anime_id,rating
30829729,125249,9107,7
24485953,241697,6007,10
8473624,1946,24703,7
45905587,328397,418,8
43701999,283786,7875,7
...,...,...,...
48815319,54818,6172,7
45883253,254739,9754,5
43293297,209638,18441,9
16665333,11697,33487,6


In Recommender Systems, there are a set metrics commonly used for evaluation. We chose to work with Top-N accuracy metrics, which evaluates the accuracy of the top recommendations provided to a user, comparing to the items the user has actually interacted in test set.
This evaluation method works as follows:

* For each user
    * For each item the user has interacted in test set
        * Sample 100 other items the user has never interacted.
            
            Ps: Here we naively assume those non interacted items are not relevant to the user, which might not be true, as the user may simply not be aware of those not interacted items. But let's keep this assumption.
        * Ask the recommender model to produce a ranked list of recommended items, from a set composed one interacted item and the 100 non-interacted ("non-relevant!) items
        * Compute the Top-N accuracy metrics for this user and interacted item from the recommendations ranked list
* Aggregate the global Top-N accuracy metrics

In [169]:
#Indexing by personId to speed up the searches during evaluation
indexed_df = df_sample.set_index('user_id')
train_indexed_df = train_df.set_index('user_id')
test_indexed_df = test_df.set_index('user_id')

In [170]:
def get_items_interacted(person_id, interactions_df):
    # Get the user's data and merge in the movie information.

    if person_id in interactions_df.index :
        interacted_items = interactions_df.loc[person_id]['anime_id']
    else:
        return []
    
    # print(interacted_items.head(1))
    return set(interacted_items if type(interacted_items) == pd.Series else [interacted_items])

In [171]:
#Top-N accuracy metrics consts
EVAL_RANDOM_SAMPLE_NON_INTERACTED_ITEMS = 100

class ModelEvaluator:


    def get_not_interacted_items_sample(self, person_id, sample_size, seed=42):
        interacted_items = get_items_interacted(person_id, indexed_df.drop(columns={"Name","Genres"}))
        all_items = set(indexed_df['anime_id'])
        non_interacted_items = all_items - interacted_items
        
        random.seed(seed)
        non_interacted_items_sample = random.sample(non_interacted_items, sample_size)
        return set(non_interacted_items_sample)

    def _verify_hit_top_n(self, item_id, recommended_items, topn):        
            try:
                index = next(i for i, c in enumerate(recommended_items) if c == item_id)
            except:
                index = -1
            hit = int(index in range(0, topn))
            return hit, index

    def evaluate_model_for_user(self, model, person_id):
        #Getting the items in test set
        interacted_values_testset = test_indexed_df.loc[person_id]
        if type(interacted_values_testset['anime_id']) == pd.Series:
            person_interacted_items_testset = set(interacted_values_testset['anime_id'])
        else:
            person_interacted_items_testset = set([int(interacted_values_testset['anime_id'])])  
        interacted_items_count_testset = len(person_interacted_items_testset) 

        #Getting a ranked recommendation list from a model for a given user
        person_recs_df = model.recommend_items(person_id, 
                                               items_to_ignore=get_items_interacted(person_id, 
                                                                                    train_indexed_df), 
                                               topn=10000000000)
        hits_at_5_count = 0
        hits_at_10_count = 0

        #For each item the user has interacted in test set
        for item_id in person_interacted_items_testset:
            #Getting a random sample (100) items the user has not interacted 
            #(to represent items that are assumed to be no relevant to the user)
            non_interacted_items_sample = self.get_not_interacted_items_sample(person_id, 
                                                                          sample_size=EVAL_RANDOM_SAMPLE_NON_INTERACTED_ITEMS, 
                                                                          seed=item_id%(2**32))
            #Combining the current interacted item with the 100 random items
            items_to_filter_recs = non_interacted_items_sample.union(set([item_id]))

            #Filtering only recommendations that are either the interacted item or from a random sample of 100 non-interacted items
            valid_recs_df = person_recs_df[person_recs_df['anime_id'].isin(items_to_filter_recs)]                    
            valid_recs = valid_recs_df['anime_id'].values
            #Verifying if the current interacted item is among the Top-N recommended items
            hit_at_5, index_at_5 = self._verify_hit_top_n(item_id, valid_recs, 5)
            hits_at_5_count += hit_at_5
            hit_at_10, index_at_10 = self._verify_hit_top_n(item_id, valid_recs, 10)
            hits_at_10_count += hit_at_10

        #Recall is the rate of the interacted items that are ranked among the Top-N recommended items, 
        #when mixed with a set of non-relevant items
        recall_at_5 = hits_at_5_count / float(interacted_items_count_testset)
        recall_at_10 = hits_at_10_count / float(interacted_items_count_testset)
        person_metrics = {'hits@5_count':hits_at_5_count, 
                          'hits@10_count':hits_at_10_count, 
                          'interacted_count': interacted_items_count_testset,
                          'recall@5': recall_at_5,
                          'recall@10': recall_at_10}
        # ADEDEDEDEDD
        # print(person_metrics)
        return person_metrics

    def evaluate_model(self, model):
        #print('Running evaluation for users')
        people_metrics = []
        for idx, person_id in enumerate(list(test_indexed_df.index.unique().values)):
            if idx % 100 == 0 and idx > 0:
               print('%d users processed' % idx)
            person_metrics = self.evaluate_model_for_user(model, person_id)  
            person_metrics['user_id'] = person_id
            people_metrics.append(person_metrics)
            # if(idx > 15):
            #     break
        print('%d users processed' % idx)

        detailed_results_df = pd.DataFrame(people_metrics) \
                            .sort_values('interacted_count', ascending=False)
            
        global_recall_at_5 = detailed_results_df['hits@5_count'].sum() / float(detailed_results_df['interacted_count'].sum())
        global_recall_at_10 = detailed_results_df['hits@10_count'].sum() / float(detailed_results_df['interacted_count'].sum())
        
        global_metrics = {'modelName': model.get_model_name(),
                          'recall@5': global_recall_at_5,
                          'recall@10': global_recall_at_10}    
        return global_metrics, detailed_results_df
    
model_evaluator = ModelEvaluator()    

## Popularity model

In [172]:
#Computes the most popular items
item_popularity_df = indexed_df.groupby('anime_id')['rating'].sum().sort_values(ascending=False).reset_index()
item_popularity_df.head(10)

Unnamed: 0,anime_id,rating
0,1535,140
1,5678,135
2,457,135
3,11771,130
4,1519,118
5,2966,118
6,20785,113
7,28825,113
8,33095,110
9,4896,109


In [173]:
class PopularityRecommender:
    
    MODEL_NAME = 'Popularity'
    
    def __init__(self, popularity_df, items_df=None):
        self.popularity_df = popularity_df
        self.items_df = items_df
        
    def get_model_name(self):
        return self.MODEL_NAME
        
    def recommend_items(self, user_id, items_to_ignore=[], topn=10, verbose=False):
        # Recommend the more popular items that the user hasn't seen yet.
        recommendations_df = self.popularity_df[~self.popularity_df['anime_id'].isin(items_to_ignore)] \
                               .sort_values('rating', ascending = False) \
                               .head(topn)

        if verbose:
            if self.items_df is None:
                raise Exception('"items_df" is required in verbose mode')
    
            recommendations_df = recommendations_df.merge(self.items_df, how = 'left', 
                                                          left_on = 'anime_id', 
                                                          right_on = 'anime_id')[['rating']]


        return recommendations_df
    
popularity_model = PopularityRecommender(item_popularity_df, indexed_df)

In [174]:
# RUN POPULARITY MODEL

# import warnings
# warnings.filterwarnings('ignore')
# print('Evaluating Popularity recommendation model... (', len(test_indexed_df.value_counts()), ' users )')
# pop_global_metrics, pop_detailed_results_df = model_evaluator.evaluate_model(popularity_model)
# print('\nGlobal metrics:\n%s' % pop_global_metrics)
# pop_detailed_results_df.head(10)

## Collaborative Filtering
* For matrix factorization we use Singular Value Decomposition (SVD)
* 

In [175]:
# User-item matrix (users as rows, anime as columns)
# print(train_df)
user_anime_df = train_df.pivot(index='user_id', columns='anime_id', values='rating').fillna(0)
user_anime_df.head()

anime_id,1,5,6,7,15,16,17,18,19,20,...,41389,41433,41468,41619,41783,41930,42571,42603,42984,43555
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1397,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1469,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1946,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4773,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5648,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [176]:
user_anime_matrix = user_anime_df.values
user_anime_matrix[:10]

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [177]:
users_ids = list(user_anime_df.index)
users_ids[:10]

[1397, 1469, 1946, 4773, 5648, 7179, 10255, 10367, 11697, 15083]

In [178]:
user_anime_matrix_sparse = csr_matrix(user_anime_matrix)
user_anime_matrix_sparse

<166x3809 sparse matrix of type '<class 'numpy.float64'>'
	with 16313 stored elements in Compressed Sparse Row format>

In [179]:
#The number of factors to factor the user-anime matrix.
NUMBER_OF_FACTORS_MF = 15
#Performs matrix factorization of the original user anime matrix
U, sigma, Vt = svds(user_anime_matrix_sparse, k = NUMBER_OF_FACTORS_MF)
sigma = np.diag(sigma)
user_predicted_ratings = np.dot(np.dot(U, sigma), Vt) 

print("U shape - ",U.shape, "\n Vt shape - ", Vt.shape, "\nsigma shape - ", sigma.shape, "\n", user_predicted_ratings)


U shape -  (166, 15) 
 Vt shape -  (15, 3809) 
sigma shape -  (15, 15) 
 [[ 0.04139727  0.23827193  0.53130312 ...  0.12812951 -0.178106
   0.04741709]
 [ 0.03669346  0.16266193  0.09948462 ...  0.04990785  0.08737252
   0.03173104]
 [ 0.05993014  0.20209001  0.24153585 ...  0.08234431  0.2256836
   0.0071257 ]
 ...
 [ 0.31822065  0.37276019  0.11907862 ...  0.43063863  0.41263269
   0.07949024]
 [ 0.07846071 -0.37658552  0.17736028 ...  0.34001266 -0.35028286
   0.13768175]
 [ 0.17089001  0.19968111  0.1741492  ...  0.14747877  0.10457824
   0.04440839]]


In [180]:
#Converting the reconstructed matrix back to a Pandas dataframe
cf_preds_df = pd.DataFrame(user_predicted_ratings, columns = user_anime_df.columns, index=users_ids).transpose()
print(len(cf_preds_df.columns))
cf_preds_df.head(10)

166


Unnamed: 0_level_0,1397,1469,1946,4773,5648,7179,10255,10367,11697,15083,...,336602,336750,340073,340979,341584,345498,346353,346787,348776,351361
anime_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.041397,0.036693,0.05993,0.154898,-0.093635,0.09731,-0.033109,0.054873,0.122574,0.145004,...,0.071344,0.086325,0.024431,0.091722,0.037923,0.127671,-0.120072,0.318221,0.078461,0.17089
5,0.238272,0.162662,0.20209,-0.018376,-0.09178,-0.001429,-0.350029,0.202582,0.058691,0.231411,...,0.036142,0.24674,0.328671,0.155985,0.162681,0.154929,0.474692,0.37276,-0.376586,0.199681
6,0.531303,0.099485,0.241536,0.054336,-0.039622,0.005899,0.115358,0.108949,0.389135,0.492279,...,0.24622,0.191722,0.112686,0.306756,0.149047,0.187109,0.182663,0.119079,0.17736,0.174149
7,0.083748,0.046469,0.016233,0.034947,0.100819,0.062742,0.267866,0.06442,0.051967,0.02157,...,0.056684,0.055064,0.031526,0.016,0.042724,0.037589,0.205868,-0.144036,0.197989,0.113965
15,-0.313207,0.397545,0.17195,0.332746,-0.177034,0.534101,1.473059,0.149174,-0.19223,0.698199,...,0.486688,0.519873,0.333173,-0.08314,0.104176,0.066333,0.264945,-0.321808,0.420095,-0.080827
16,0.357141,0.290455,0.09945,0.247343,-0.041828,0.383765,0.861974,0.36068,0.35839,0.581077,...,0.332163,0.225968,0.078637,0.125084,0.217049,0.162347,0.097165,0.014124,0.67715,0.730175
17,0.010791,0.083505,0.207947,0.15627,0.576004,0.252185,0.150326,0.086827,-0.046198,-0.162447,...,0.052815,0.044516,0.09307,0.013718,0.04801,-0.110416,0.344297,0.420328,0.053948,0.310965
18,0.168906,0.196619,0.11666,0.155175,-0.244937,0.0813,0.17258,0.156165,0.256859,0.578577,...,0.252398,0.357862,0.291918,0.315821,0.282679,0.632674,0.080046,0.176216,0.086701,0.471947
19,0.217985,0.1283,0.174383,0.239207,0.254093,0.237446,0.186836,0.140434,0.210514,0.238928,...,0.13065,0.151235,0.112812,0.147708,0.063869,0.194181,0.107367,0.340791,0.31103,0.1877
20,0.152817,0.174132,-0.173669,-0.017018,-1.489953,0.032609,0.63722,0.278295,0.407858,0.906099,...,0.135671,0.157886,-0.036879,-0.00243,-0.052967,0.448031,-1.371529,-0.734609,0.168464,-0.062237


In [181]:
class CFRecommender:
    
    MODEL_NAME = 'Collaborative Filtering'
    
    def __init__(self, cf_predictions_df, items_df=None):
        self.cf_predictions_df = cf_predictions_df
        self.items_df = items_df
        
    def get_model_name(self):
        return self.MODEL_NAME
        
    def recommend_items(self, user_id, items_to_ignore=[], topn=10, verbose=False):
        # Get and sort the user's predictions
        sorted_user_predictions = self.cf_predictions_df[user_id].sort_values(ascending=False) \
                                    .reset_index().rename(columns={user_id: 'rating'})

        # Recommend the highest predicted rating movies that the user hasn't seen yet.
        recommendations_df = sorted_user_predictions[~sorted_user_predictions['anime_id'].isin(items_to_ignore)] \
                               .sort_values('rating', ascending = False) \
                               .head(topn)

        if verbose:
            if self.items_df is None:
                raise Exception('"items_df" is required in verbose mode')

            recommendations_df = recommendations_df.merge(self.items_df, how = 'left', 
                                                          left_on = 'anime_id', 
                                                          right_on = 'anime_id')[['rating', 'anime_id']]


        return recommendations_df
    
cf_recommender_model = CFRecommender(cf_preds_df, indexed_df)

In [182]:
import warnings
warnings.filterwarnings('ignore')
print('Evaluating Collaborative Filtering (SVD Matrix Factorization) model...')
cf_global_metrics, cf_detailed_results_df = model_evaluator.evaluate_model(cf_recommender_model)
print('\nGlobal metrics:\n%s' % cf_global_metrics)
cf_detailed_results_df.head(10)

Evaluating Collaborative Filtering (SVD Matrix Factorization) model...
100 users processed
165 users processed

Global metrics:
{'modelName': 'Collaborative Filtering', 'recall@5': 0.061289531747977445, 'recall@10': 0.10811473400343222}


Unnamed: 0,hits@5_count,hits@10_count,interacted_count,recall@5,recall@10,user_id
111,2,2,46,0.043478,0.043478,189037
15,3,5,42,0.071429,0.119048,332300
45,2,3,37,0.054054,0.081081,68042
85,2,3,36,0.055556,0.083333,297524
80,3,3,36,0.083333,0.083333,192123
148,3,5,35,0.085714,0.142857,4773
101,4,6,35,0.114286,0.171429,283786
52,1,3,35,0.028571,0.085714,275125
9,2,4,35,0.057143,0.114286,297715
147,1,2,34,0.029412,0.058824,333379


In [183]:
# Making anime-user matrix with predicted user ratings
user_anime_df
# Compute the Pearson correlation between users
user_item_matrix = user_anime_df.subtract(user_anime_df.mean(axis=1), axis='rows')
user_item_matrix
# user_corr = user_item_matrix.T.corr()

anime_id,1,5,6,7,15,16,17,18,19,20,...,41389,41433,41468,41619,41783,41930,42571,42603,42984,43555
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1397,-0.165923,-0.165923,-0.165923,-0.165923,-0.165923,-0.165923,-0.165923,-0.165923,-0.165923,-0.165923,...,-0.165923,-0.165923,-0.165923,-0.165923,-0.165923,-0.165923,-0.165923,-0.165923,-0.165923,-0.165923
1469,-0.127067,-0.127067,-0.127067,-0.127067,-0.127067,-0.127067,-0.127067,-0.127067,-0.127067,-0.127067,...,-0.127067,-0.127067,-0.127067,-0.127067,-0.127067,-0.127067,-0.127067,-0.127067,-0.127067,-0.127067
1946,-0.142820,-0.142820,-0.142820,-0.142820,-0.142820,-0.142820,-0.142820,-0.142820,-0.142820,-0.142820,...,-0.142820,-0.142820,-0.142820,-0.142820,-0.142820,-0.142820,-0.142820,-0.142820,-0.142820,-0.142820
4773,-0.181938,-0.181938,-0.181938,-0.181938,-0.181938,-0.181938,-0.181938,-0.181938,-0.181938,-0.181938,...,-0.181938,-0.181938,-0.181938,-0.181938,-0.181938,-0.181938,-0.181938,-0.181938,-0.181938,-0.181938
5648,-0.218167,-0.218167,-0.218167,-0.218167,-0.218167,-0.218167,-0.218167,-0.218167,-0.218167,-0.218167,...,-0.218167,-0.218167,-0.218167,-0.218167,-0.218167,-0.218167,-0.218167,-0.218167,-0.218167,-0.218167
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
345498,-0.204253,-0.204253,-0.204253,-0.204253,-0.204253,-0.204253,-0.204253,-0.204253,-0.204253,-0.204253,...,-0.204253,-0.204253,-0.204253,-0.204253,-0.204253,-0.204253,-0.204253,-0.204253,-0.204253,-0.204253
346353,-0.207929,-0.207929,-0.207929,-0.207929,-0.207929,-0.207929,-0.207929,-0.207929,-0.207929,-0.207929,...,-0.207929,-0.207929,-0.207929,-0.207929,-0.207929,-0.207929,-0.207929,-0.207929,-0.207929,-0.207929
346787,-0.196377,-0.196377,-0.196377,-0.196377,-0.196377,-0.196377,-0.196377,-0.196377,-0.196377,-0.196377,...,-0.196377,-0.196377,-0.196377,-0.196377,-0.196377,-0.196377,-0.196377,-0.196377,-0.196377,-0.196377
348776,-0.208979,-0.208979,-0.208979,-0.208979,-0.208979,8.791021,-0.208979,-0.208979,-0.208979,-0.208979,...,-0.208979,-0.208979,-0.208979,-0.208979,-0.208979,-0.208979,-0.208979,-0.208979,-0.208979,-0.208979


In [184]:
# User correlation
user_corr = user_item_matrix.T.corr()
user_corr

user_id,1397,1469,1946,4773,5648,7179,10255,10367,11697,15083,...,336602,336750,340073,340979,341584,345498,346353,346787,348776,351361
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1397,1.000000,-0.000907,0.050804,-0.003269,-0.016849,0.001883,-0.011181,0.001041,0.019075,0.010187,...,-0.018361,0.000274,0.011464,-0.015626,0.008512,0.023049,0.013672,0.000745,0.026695,0.023728
1469,-0.000907,1.000000,0.019442,-0.012516,0.017030,-0.006884,-0.003640,0.007146,0.030961,-0.004448,...,-0.011098,-0.012397,0.001858,-0.007852,0.004649,-0.020934,0.006327,-0.016251,0.007104,0.016001
1946,0.050804,0.019442,1.000000,-0.000222,0.053129,-0.022687,-0.024408,0.010371,-0.020614,-0.003342,...,-0.006924,0.036622,-0.004129,0.006186,-0.004243,-0.002899,-0.022883,0.015644,0.000482,0.019449
4773,-0.003269,-0.012516,-0.000222,1.000000,0.006702,0.033813,0.006925,-0.014708,-0.006882,0.011978,...,0.020813,0.006961,0.032847,0.004465,0.000495,-0.028448,-0.002225,-0.000016,0.003062,0.012457
5648,-0.016849,0.017030,0.053129,0.006702,1.000000,-0.008832,-0.001494,-0.028644,0.014250,0.002677,...,-0.002525,-0.005118,0.059551,-0.005293,0.005247,-0.029979,0.010541,0.010204,0.007210,0.004050
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
345498,0.023049,-0.020934,-0.002899,-0.028448,-0.029979,-0.006134,0.003718,0.010193,0.038566,-0.001575,...,0.019600,0.022443,-0.000665,0.002452,-0.010605,1.000000,0.028234,-0.026969,-0.000108,0.010640
346353,0.013672,0.006327,-0.022883,-0.002225,0.010541,0.002108,-0.013726,-0.009369,-0.016892,-0.018851,...,0.024263,0.028669,0.048739,-0.024266,-0.005353,0.028234,1.000000,-0.017211,0.020754,-0.013310
346787,0.000745,-0.016251,0.015644,-0.000016,0.010204,0.000535,-0.017922,0.052134,-0.001490,-0.004344,...,0.058918,0.007136,-0.013777,0.013188,-0.021293,-0.026969,-0.017211,1.000000,-0.004708,0.009791
348776,0.026695,0.007104,0.000482,0.003062,0.007210,0.000462,0.025410,-0.001935,-0.005433,-0.010370,...,-0.008287,0.059275,-0.011217,-0.007811,0.011852,-0.000108,0.020754,-0.004708,1.000000,-0.020078


In [186]:
# Sort the correlations in descending order and select the top n similar users
# similar_users = user_corr[251].sort_values(ascending=False)[1:10+1]
# similar_users

In [187]:
# Fill with -1 so unpredicted movies arent altered

user_pred_ratings = user_item_matrix.copy()
for col in user_pred_ratings.columns:
    user_pred_ratings[col].values[:] = -1
user_pred_ratings

anime_id,1,5,6,7,15,16,17,18,19,20,...,41389,41433,41468,41619,41783,41930,42571,42603,42984,43555
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1397,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
1469,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
1946,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
4773,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
5648,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
345498,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
346353,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
346787,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
348776,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0


In [189]:
# Get IDs of anime user '240' didn't watch
# Didn't watch
# user_anime_df.transpose()[user_anime_df.transpose()[240] == 0].index.tolist()
# # Watched
# user_anime_df.transpose()[user_anime_df.transpose()[240] != 0].index.tolist()

In [190]:
from IPython.display import clear_output, display
# make prediction dataframe 
user_corr.index.tolist()[:5]
progress = 0

# for each user
for user_id in user_corr.index.tolist():
    # top 10 similar users
    similar_users = user_corr[user_id].sort_values(ascending=False)[1:10+1].index.tolist()
    # for each anime user didn't watch
    for anime_id in user_anime_df.transpose()[user_anime_df.transpose()[user_id] == 0].index.tolist():
        total = 0
        count = 0
        # If user didn't rate that anime we don't him to predict the rating based on his value
        for user in similar_users:
            if( user_anime_df[anime_id][user] != 0):
                total += user_item_matrix[anime_id][user] * user_corr[user][user_id]
                count +=1
        if(count != 0):
            user_pred_ratings[anime_id][user_id] = total/count
    progress += 1
    clear_output(wait=True)
    print(progress, "/", len(user_corr.index.tolist()))

            

user_pred_ratings

166 / 166


anime_id,1,5,6,7,15,16,17,18,19,20,...,41389,41433,41468,41619,41783,41930,42571,42603,42984,43555
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1397,0.165156,-1.000000,0.195732,-1.00000,-1.000000,0.307263,-1.000000,-1.000000,-1.000000,-1.000000,...,-1.000000,-1.000000,-1.0,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,-1.0,-1.0
1469,-1.000000,-1.000000,-1.000000,-1.00000,0.330694,0.405735,0.100412,-1.000000,-1.000000,-1.000000,...,-1.000000,-1.000000,-1.0,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,-1.0,-1.0
1946,-1.000000,-1.000000,-1.000000,-1.00000,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,...,-1.000000,-1.000000,-1.0,0.220910,0.299464,-1.000000,-1.000000,-1.000000,-1.0,-1.0
4773,-1.000000,0.230718,0.225076,-1.00000,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,...,0.150422,-1.000000,-1.0,-1.000000,0.192229,-1.000000,-1.000000,-1.000000,-1.0,-1.0
5648,-1.000000,-1.000000,0.351699,-1.00000,-1.000000,0.259213,-1.000000,-1.000000,-1.000000,-1.000000,...,-1.000000,-1.000000,-1.0,0.040465,0.348503,-1.000000,0.199721,-1.000000,-1.0,-1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
345498,-1.000000,-1.000000,-1.000000,-1.00000,-1.000000,0.214706,-1.000000,-1.000000,-1.000000,-1.000000,...,-1.000000,-1.000000,-1.0,0.061823,-1.000000,-1.000000,-1.000000,-1.000000,-1.0,-1.0
346353,-1.000000,-1.000000,0.333969,0.19255,-1.000000,0.249229,-1.000000,-1.000000,-1.000000,-1.000000,...,-1.000000,-1.000000,-1.0,-1.000000,0.285230,-1.000000,-1.000000,-1.000000,-1.0,-1.0
346787,-1.000000,-1.000000,-1.000000,-1.00000,-1.000000,0.363842,-1.000000,0.259554,0.160405,-1.000000,...,-1.000000,-1.000000,-1.0,0.033259,-1.000000,0.226432,-1.000000,0.260158,-1.0,-1.0
348776,0.231847,-1.000000,-1.000000,-1.00000,-1.000000,-1.000000,-1.000000,-1.000000,0.296664,0.492721,...,-1.000000,0.492721,-1.0,-1.000000,0.259119,0.183055,-1.000000,-1.000000,-1.0,-1.0


In [192]:
# compare cf_preds_df with user_pred_ratings
# print(cf_preds_df[240].value_counts())
# print(user_pred_ratings.transpose()[240].value_counts())

In [193]:
# Using pearson correlation to have similarity between users, and than using that similarity as weight
# predict user rating on anime based on similar users rating

import warnings
warnings.filterwarnings('ignore')
cf_recommender_model = CFRecommender(user_pred_ratings.transpose(), indexed_df)
print('Evaluating Collaborative Filtering (SVD Matrix Factorization) model...')
cf_global_metrics, cf_detailed_results_df = model_evaluator.evaluate_model(cf_recommender_model)
print('\nGlobal metrics:\n%s' % cf_global_metrics)
cf_detailed_results_df.head(10)

Evaluating Collaborative Filtering (SVD Matrix Factorization) model...
100 users processed
165 users processed

Global metrics:
{'modelName': 'Collaborative Filtering', 'recall@5': 0.05932826673204217, 'recall@10': 0.11105663152733514}


Unnamed: 0,hits@5_count,hits@10_count,interacted_count,recall@5,recall@10,user_id
111,0,1,46,0.0,0.021739,189037
15,4,7,42,0.095238,0.166667,332300
45,3,3,37,0.081081,0.081081,68042
85,0,1,36,0.0,0.027778,297524
80,1,1,36,0.027778,0.027778,192123
148,2,5,35,0.057143,0.142857,4773
101,1,1,35,0.028571,0.028571,283786
52,2,2,35,0.057143,0.057143,275125
9,3,3,35,0.085714,0.085714,297715
147,1,3,34,0.029412,0.088235,333379
