In [1]:
# Data processing
import pandas as pd
import numpy as np
import scipy.stats

# Visualization
import seaborn as sb

# Similarity
from sklearn.metrics.pairwise import cosine_similarity

# Other
import math
import random
import sklearn
from nltk.corpus import stopwords
from scipy.sparse import csr_matrix
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse.linalg import svds
from sklearn.preprocessing import MinMaxScaler
import matplotlib.pyplot as plt

In [2]:
ratings = pd.read_csv("../archive/rating_complete.csv")
ratings

Unnamed: 0,user_id,anime_id,rating
0,0,430,9
1,0,1004,5
2,0,3010,7
3,0,570,7
4,0,2762,9
...,...,...,...
57633273,353404,502,8
57633274,353404,987,4
57633275,353404,225,8
57633276,353404,243,7


In [3]:
anime_df = pd.read_csv("../archive/anime.csv")
anime_df["anime_id"] = anime_df["MAL_ID"]
anime_df

Unnamed: 0,MAL_ID,Name,Score,Genres,English name,Japanese name,Type,Episodes,Aired,Premiered,...,Score-9,Score-8,Score-7,Score-6,Score-5,Score-4,Score-3,Score-2,Score-1,anime_id
0,1,Cowboy Bebop,8.78,"Action, Adventure, Comedy, Drama, Sci-Fi, Space",Cowboy Bebop,カウボーイビバップ,TV,26,"Apr 3, 1998 to Apr 24, 1999",Spring 1998,...,182126.0,131625.0,62330.0,20688.0,8904.0,3184.0,1357.0,741.0,1580.0,1
1,5,Cowboy Bebop: Tengoku no Tobira,8.39,"Action, Drama, Mystery, Sci-Fi, Space",Cowboy Bebop:The Movie,カウボーイビバップ 天国の扉,Movie,1,"Sep 1, 2001",Unknown,...,49201.0,49505.0,22632.0,5805.0,1877.0,577.0,221.0,109.0,379.0,5
2,6,Trigun,8.24,"Action, Sci-Fi, Adventure, Comedy, Drama, Shounen",Trigun,トライガン,TV,26,"Apr 1, 1998 to Sep 30, 1998",Spring 1998,...,75651.0,86142.0,49432.0,15376.0,5838.0,1965.0,664.0,316.0,533.0,6
3,7,Witch Hunter Robin,7.27,"Action, Mystery, Police, Supernatural, Drama, ...",Witch Hunter Robin,Witch Hunter ROBIN (ウイッチハンターロビン),TV,26,"Jul 2, 2002 to Dec 24, 2002",Summer 2002,...,4806.0,10128.0,11618.0,5709.0,2920.0,1083.0,353.0,164.0,131.0,7
4,8,Bouken Ou Beet,6.98,"Adventure, Fantasy, Shounen, Supernatural",Beet the Vandel Buster,冒険王ビィト,TV,52,"Sep 30, 2004 to Sep 29, 2005",Fall 2004,...,529.0,1242.0,1713.0,1068.0,634.0,265.0,83.0,50.0,27.0,8
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17557,48481,Daomu Biji Zhi Qinling Shen Shu,Unknown,"Adventure, Mystery, Supernatural",Unknown,盗墓笔记之秦岭神树,ONA,Unknown,"Apr 4, 2021 to ?",Unknown,...,Unknown,Unknown,1.0,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,48481
17558,48483,Mieruko-chan,Unknown,"Comedy, Horror, Supernatural",Unknown,見える子ちゃん,TV,Unknown,2021 to ?,Unknown,...,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,48483
17559,48488,Higurashi no Naku Koro ni Sotsu,Unknown,"Mystery, Dementia, Horror, Psychological, Supe...",Higurashi:When They Cry – SOTSU,ひぐらしのなく頃に卒,TV,Unknown,"Jul, 2021 to ?",Summer 2021,...,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,48488
17560,48491,Yama no Susume: Next Summit,Unknown,"Adventure, Slice of Life, Comedy",Unknown,ヤマノススメ Next Summit,TV,Unknown,Unknown,Unknown,...,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,48491


In [4]:
df = pd.merge(ratings, anime_df[['anime_id','Name', 'Genres']], on="anime_id", how="inner")
df

Unnamed: 0,user_id,anime_id,rating,Name,Genres
0,0,430,9,Fullmetal Alchemist: The Conqueror of Shamballa,"Military, Comedy, Historical, Drama, Fantasy, ..."
1,6,430,8,Fullmetal Alchemist: The Conqueror of Shamballa,"Military, Comedy, Historical, Drama, Fantasy, ..."
2,18,430,10,Fullmetal Alchemist: The Conqueror of Shamballa,"Military, Comedy, Historical, Drama, Fantasy, ..."
3,19,430,8,Fullmetal Alchemist: The Conqueror of Shamballa,"Military, Comedy, Historical, Drama, Fantasy, ..."
4,33,430,4,Fullmetal Alchemist: The Conqueror of Shamballa,"Military, Comedy, Historical, Drama, Fantasy, ..."
...,...,...,...,...,...
57633273,315549,38853,1,Ex-Arm,"Action, Sci-Fi, Ecchi, Seinen"
57633274,350024,38853,10,Ex-Arm,"Action, Sci-Fi, Ecchi, Seinen"
57633275,311855,39435,6,Oshiri Tantei Movie 1: Curry Naru Jiken,"Mystery, Comedy, Kids, Fantasy"
57633276,334473,35746,4,6 Lovers,"Comedy, Drama, Romance, School, Shounen Ai, Sl..."


# Data Preparation
* 57,633,278 rows are to many entries to compute, let's reduce the dataset

In [5]:
# User ratings
print(df["user_id"].value_counts().mean())
df["user_id"].value_counts()

185.87842313882197


189037    15455
162615    14864
68042     13462
283786    12778
259790     9996
          ...  
136422        1
213994        1
196985        1
190834        1
206653        1
Name: user_id, Length: 310059, dtype: int64

In [6]:
# Anime ratings
print(df["anime_id"].value_counts().mean())
df["anime_id"].value_counts()

3415.912636320531


1535     182375
16498    169794
11757    161192
6547     141127
30276    138924
          ...  
39686         1
39685         1
35153         1
40594         1
39627         1
Name: anime_id, Length: 16872, dtype: int64

In [152]:
# Filter out users with less than 150 ratings and animes with less than 3500 ratings
df = df.groupby("user_id").filter(lambda x: len(x) > 150) # 310059
df = df.groupby("anime_id").filter(lambda x: len(x) > 6000) # 16872

In [153]:
df.info()
# User ratings
print("USERS \n")
print(df["user_id"].value_counts().mean())
print(df["user_id"].value_counts())
print("\n")

print("ANIMES \n")
# Anime ratings
print(df["anime_id"].value_counts().mean())
print(df["anime_id"].value_counts())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 34621885 entries, 1 to 55060794
Data columns (total 5 columns):
 #   Column    Dtype 
---  ------    ----- 
 0   user_id   int64 
 1   anime_id  int64 
 2   rating    int64 
 3   Name      object
 4   Genres    object
dtypes: int64(3), object(2)
memory usage: 1.5+ GB
USERS 

296.99491310240705
277841    1902
291207    1890
189037    1889
283786    1876
68042     1870
          ... 
14236      109
5865       109
267201     106
282420     103
340534     103
Name: user_id, Length: 116574, dtype: int64


ANIMES 

17929.510616261006
16498    93005
11757    91704
1535     89019
6547     86237
4224     83214
         ...  
33012     6014
3456      6013
368       6012
990       6010
109       6005
Name: anime_id, Length: 1931, dtype: int64


In [154]:
# We are left still with 38,616,425 entries even after filtering out users and animes
# so let's sample the remaining dataset so we get a smaller one

df_sample = df.sample(frac=0.001)
df_sample # 34622 entries seems more logical for first runs so we can re-run faster

Unnamed: 0,user_id,anime_id,rating,Name,Genres
12275299,24937,6746,7,Durarara!!,"Action, Mystery, Supernatural"
10523920,179403,27787,5,Nisekoi:,"Comedy, Harem, Romance, School, Shounen"
49242568,268066,32673,8,Udon no Kuni no Kiniro Kemari,"Slice of Life, Fantasy, Seinen"
13247036,242010,23289,6,Gekkan Shoujo Nozaki-kun,"Comedy, Romance, School"
53149714,773,33558,7,Tales of Zestiria the Cross: Saiyaku no Jidai,"Action, Adventure, Magic, Fantasy"
...,...,...,...,...,...
28473107,111933,392,10,Yuu☆Yuu☆Hakusho,"Action, Comedy, Demons, Supernatural, Martial ..."
26770073,278233,28121,8,Dungeon ni Deai wo Motomeru no wa Machigatteir...,"Action, Adventure, Comedy, Romance, Fantasy"
46755408,255048,19383,8,Yami Shibai,"Dementia, Horror, Demons, Supernatural"
40088243,122012,38610,7,Tejina-senpai,"Comedy, Ecchi, School, Seinen"


# Evaluation
Divide df into training and test set (80 - 20)

In [155]:
train_df, test_df = train_test_split(df_sample.drop(columns={"Name","Genres"}),
                                #    stratify=df_sample['user_id'], 
                                   test_size=0.20,
                                   random_state=42)

print('# ratings on Train set: %d' % len(train_df))
print('# ratings on Test set: %d' % len(test_df))
train_df

# ratings on Train set: 27697
# ratings on Test set: 6925


Unnamed: 0,user_id,anime_id,rating
2530323,256579,32935,8
1004891,106136,459,9
45278465,160042,32801,5
20826655,334640,10161,7
4447019,263062,37779,8
...,...,...,...
31995983,318896,15119,9
1851712,8310,11771,10
4500020,81495,23847,9
39206693,293483,34561,6


In Recommender Systems, there are a set metrics commonly used for evaluation. We chose to work with Top-N accuracy metrics, which evaluates the accuracy of the top recommendations provided to a user, comparing to the items the user has actually interacted in test set.
This evaluation method works as follows:

* For each user
    * For each item the user has interacted in test set
        * Sample 100 other items the user has never interacted.
            
            Ps: Here we naively assume those non interacted items are not relevant to the user, which might not be true, as the user may simply not be aware of those not interacted items. But let's keep this assumption.
        * Ask the recommender model to produce a ranked list of recommended items, from a set composed one interacted item and the 100 non-interacted ("non-relevant!) items
        * Compute the Top-N accuracy metrics for this user and interacted item from the recommendations ranked list
* Aggregate the global Top-N accuracy metrics

In [156]:
#Indexing by personId to speed up the searches during evaluation
indexed_df = df_sample.set_index('user_id')
train_indexed_df = train_df.set_index('user_id')
test_indexed_df = test_df.set_index('user_id')

In [157]:
def get_items_interacted(person_id, interactions_df):
    # Get the user's data and merge in the movie information.

    if person_id in interactions_df.index :
        interacted_items = interactions_df.loc[person_id]['anime_id']
    else:
        return []
    
    # print(interacted_items.head(1))
    return set(interacted_items if type(interacted_items) == pd.Series else [interacted_items])

In [163]:
#Top-N accuracy metrics consts
EVAL_RANDOM_SAMPLE_NON_INTERACTED_ITEMS = 100

class ModelEvaluator:


    def get_not_interacted_items_sample(self, person_id, sample_size, seed=42):
        interacted_items = get_items_interacted(person_id, indexed_df.drop(columns={"Name","Genres"}))
        all_items = set(indexed_df['anime_id'])
        non_interacted_items = all_items - interacted_items
        
        random.seed(seed)
        non_interacted_items_sample = random.sample(non_interacted_items, sample_size)
        return set(non_interacted_items_sample)

    def _verify_hit_top_n(self, item_id, recommended_items, topn):        
            try:
                index = next(i for i, c in enumerate(recommended_items) if c == item_id)
            except:
                index = -1
            hit = int(index in range(0, topn))
            return hit, index

    def evaluate_model_for_user(self, model, person_id):
        #Getting the items in test set
        interacted_values_testset = test_indexed_df.loc[person_id]
        if type(interacted_values_testset['anime_id']) == pd.Series:
            person_interacted_items_testset = set(interacted_values_testset['anime_id'])
        else:
            person_interacted_items_testset = set([int(interacted_values_testset['anime_id'])])  
        interacted_items_count_testset = len(person_interacted_items_testset) 

        #Getting a ranked recommendation list from a model for a given user
        person_recs_df = model.recommend_items(person_id, 
                                               items_to_ignore=get_items_interacted(person_id, 
                                                                                    train_indexed_df), 
                                               topn=10000000000)
        hits_at_5_count = 0
        hits_at_10_count = 0

        #For each item the user has interacted in test set
        for item_id in person_interacted_items_testset:
            #Getting a random sample (100) items the user has not interacted 
            #(to represent items that are assumed to be no relevant to the user)
            non_interacted_items_sample = self.get_not_interacted_items_sample(person_id, 
                                                                          sample_size=EVAL_RANDOM_SAMPLE_NON_INTERACTED_ITEMS, 
                                                                          seed=item_id%(2**32))
            #Combining the current interacted item with the 100 random items
            items_to_filter_recs = non_interacted_items_sample.union(set([item_id]))

            #Filtering only recommendations that are either the interacted item or from a random sample of 100 non-interacted items
            valid_recs_df = person_recs_df[person_recs_df['anime_id'].isin(items_to_filter_recs)]                    
            valid_recs = valid_recs_df['anime_id'].values
            #Verifying if the current interacted item is among the Top-N recommended items
            hit_at_5, index_at_5 = self._verify_hit_top_n(item_id, valid_recs, 5)
            hits_at_5_count += hit_at_5
            hit_at_10, index_at_10 = self._verify_hit_top_n(item_id, valid_recs, 10)
            hits_at_10_count += hit_at_10

        #Recall is the rate of the interacted items that are ranked among the Top-N recommended items, 
        #when mixed with a set of non-relevant items
        recall_at_5 = hits_at_5_count / float(interacted_items_count_testset)
        recall_at_10 = hits_at_10_count / float(interacted_items_count_testset)
        person_metrics = {'hits@5_count':hits_at_5_count, 
                          'hits@10_count':hits_at_10_count, 
                          'interacted_count': interacted_items_count_testset,
                          'recall@5': recall_at_5,
                          'recall@10': recall_at_10}
        # ADEDEDEDEDD
        # print(person_metrics)
        return person_metrics

    def evaluate_model(self, model):
        #print('Running evaluation for users')
        people_metrics = []
        for idx, person_id in enumerate(list(test_indexed_df.index.unique().values)):
            if idx % 100 == 0 and idx > 0:
               print('%d users processed' % idx)
            person_metrics = self.evaluate_model_for_user(model, person_id)  
            person_metrics['user_id'] = person_id
            people_metrics.append(person_metrics)
            # if(idx > 15):
            #     break
        print('%d users processed' % idx)

        detailed_results_df = pd.DataFrame(people_metrics) \
                            .sort_values('recall@10', ascending=False)
            
        global_recall_at_5 = detailed_results_df['hits@5_count'].sum() / float(detailed_results_df['interacted_count'].sum())
        global_recall_at_10 = detailed_results_df['hits@10_count'].sum() / float(detailed_results_df['interacted_count'].sum())
        
        global_metrics = {'modelName': model.get_model_name(),
                          'recall@5': global_recall_at_5,
                          'recall@10': global_recall_at_10}    
        return global_metrics, detailed_results_df
    
model_evaluator = ModelEvaluator()    

## Popularity model

In [164]:
#Computes the most popular items
item_popularity_df = indexed_df.groupby('anime_id')['rating'].sum().sort_values(ascending=False).reset_index()
item_popularity_df.head(10)

Unnamed: 0,anime_id,rating
0,16498,852
1,6547,825
2,4224,800
3,1575,781
4,1535,778
5,5114,776
6,11757,766
7,15809,759
8,30276,719
9,19815,698


In [165]:
class PopularityRecommender:
    
    MODEL_NAME = 'Popularity'
    
    def __init__(self, popularity_df, items_df=None):
        self.popularity_df = popularity_df
        self.items_df = items_df
        
    def get_model_name(self):
        return self.MODEL_NAME
        
    def recommend_items(self, user_id, items_to_ignore=[], topn=10, verbose=False):
        # Recommend the more popular items that the user hasn't seen yet.
        recommendations_df = self.popularity_df[~self.popularity_df['anime_id'].isin(items_to_ignore)] \
                               .sort_values('rating', ascending = False) \
                               .head(topn)

        if verbose:
            if self.items_df is None:
                raise Exception('"items_df" is required in verbose mode')
    
            recommendations_df = recommendations_df.merge(self.items_df, how = 'left', 
                                                          left_on = 'anime_id', 
                                                          right_on = 'anime_id')[['rating']]


        return recommendations_df
    
popularity_model = PopularityRecommender(item_popularity_df, indexed_df)

In [166]:
# import warnings
# warnings.filterwarnings('ignore')
print('Evaluating Popularity recommendation model... (', len(train_indexed_df.value_counts()), ' users )')
pop_global_metrics, pop_detailed_results_df = model_evaluator.evaluate_model(popularity_model)
print('\nGlobal metrics:\n%s' % pop_global_metrics)
pop_detailed_results_df.head(10)

Evaluating Popularity recommendation model... ( 9062  users )
100 users processed
200 users processed
300 users processed
400 users processed
500 users processed
600 users processed
700 users processed
800 users processed
900 users processed
1000 users processed
1100 users processed
1200 users processed
1300 users processed
1400 users processed
1500 users processed
1600 users processed
1700 users processed
1800 users processed
1900 users processed
2000 users processed
2100 users processed
2200 users processed
2300 users processed
2400 users processed
2500 users processed
2600 users processed
2700 users processed
2800 users processed
2900 users processed
3000 users processed
3100 users processed
3200 users processed
3300 users processed
3400 users processed
3500 users processed
3600 users processed
3700 users processed
3800 users processed
3900 users processed
4000 users processed
4100 users processed
4200 users processed
4300 users processed
4400 users processed
4500 users processed
46

Unnamed: 0,hits@5_count,hits@10_count,interacted_count,recall@5,recall@10,user_id
5303,1,1,1,1.0,1.0,238109
5124,1,1,1,1.0,1.0,335020
5126,1,1,1,1.0,1.0,275241
2198,1,1,1,1.0,1.0,54071
2200,0,1,1,0.0,1.0,130607
2201,1,1,1,1.0,1.0,152439
2202,0,1,1,0.0,1.0,91478
2203,0,1,1,0.0,1.0,336094
2204,1,1,1,1.0,1.0,176147
2210,0,1,1,0.0,1.0,136494
