In [1]:
import pandas as pd
pd.plotting.register_matplotlib_converters()
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from collections import defaultdict
import numpy as np
import scipy
from scipy.sparse.linalg import svds
import surprise as sp
import time
print("Setup Complete")


Setup Complete


In [2]:

UsersDF = pd.read_csv('./data/full/users_cleaned.csv')
AnimesDF = pd.read_csv('./data/full/anime_cleaned.csv')
ScoresDF = pd.read_csv('./data/full/animelists_cleaned.csv')

ScoresDF_lite = ScoresDF[['username', 'anime_id', 'my_score', 'my_status']]

AnimeDF_name_only = AnimesDF[['anime_id', 'title']]
ScoresDF_lite_with_names = ScoresDF_lite.merge(AnimeDF_name_only, left_on='anime_id', right_on='anime_id')

ScoresDF_lite_with_names_no_0_score = ScoresDF_lite_with_names[ScoresDF_lite_with_names['my_score'] > 0]


UsersRatedPerAnime = ScoresDF_lite_with_names_no_0_score['anime_id'].value_counts().reset_index().rename(columns={"anime_id": "number_of_users", "index": "anime_id"})
AnimesRatedPerUser = ScoresDF_lite_with_names_no_0_score['username'].value_counts().reset_index().rename(columns={"username": "number_of_animes", "index": "username"})


UserRatedsPerAnimeNice = UsersRatedPerAnime[UsersRatedPerAnime['number_of_users'] > 10]
AnimesRatedPerUserNice = AnimesRatedPerUser[AnimesRatedPerUser['number_of_animes'] > 10]

ScoresDFFilteredNice = pd.merge(ScoresDF_lite_with_names_no_0_score, AnimesRatedPerUserNice, left_on = 'username', right_on = 'username', how = 'inner')
ScoresDFFilteredNice = pd.merge(ScoresDFFilteredNice, UserRatedsPerAnimeNice, left_on = 'anime_id', right_on = 'anime_id', how = 'inner')



- Choose the correct model to train on
- Output the nearest neighbours similar to how I did it for the others
- 

In [3]:
reader = sp.Reader(rating_scale=(1, 10))
data = sp.Dataset.load_from_df(ScoresDFFilteredNice[['username', 'title', 'my_score']], reader)

trainset, testset = sp.model_selection.split.train_test_split(data)

In [3]:
def precision_recall_at_k(predictions, k=10, threshold= 7):
    '''Return precision and recall at k metrics for each user.'''

    # First map the predictions to each user.
    estimate_actual_by_user = defaultdict(list)
    for user, _, true_rating, estimated_rating, _ in predictions:
        estimate_actual_by_user[user].append((estimated_rating, true_rating))
    # Creates a dict with the key being a user and the value bringing the estimated rating and the true rating.

    precisions = dict()
    recalls = dict()
    for user, user_ratings in estimate_actual_by_user.items():

        # Sort user ratings by estimated value
        user_ratings.sort(key=lambda x: x[0], reverse=True)

        # Number of relevant items
        n_rel = sum((true_rating >= threshold) for (_, true_rating) in user_ratings)

        # Number of recommended items in top k
        n_rec_k = sum((est >= threshold) for (est, _) in user_ratings[:k])

        # Number of relevant and recommended items in top k
        n_rel_and_rec_k = sum(((true_rating >= threshold) and (est >= threshold))
                              for (est, true_rating) in user_ratings[:k])

        # Precision@K: Proportion of recommended items that are relevant
        precisions[user] = n_rel_and_rec_k / n_rec_k if n_rec_k != 0 else 1

        # Recall@K: Proportion of relevant items that are recommended
        recalls[user] = n_rel_and_rec_k /  n_rel if n_rel != 0 else 1

    return precisions, recalls

In [11]:
from surprise import dump
import os

analysis = defaultdict(list)
model_filename_prefix = "./model_"
model_filename_suffix = ".pickle"
def get_model_filename(model_name):
    return model_filename_prefix + model_name + model_filename_suffix
sim_options = {'name': 'pearson_baseline',
               'user_based': False,  # compute  similarities between items
               }
'''sp.SVD(), sp.SlopeOne(), sp.NMF(), sp.NormalPredictor(), sp.KNNBaseline(), '''
#algorithms = [sp.SVD(), sp.SVDpp(), sp.SlopeOne(), sp.NMF(), sp.NormalPredictor(), sp.KNNBaseline(), sp.KNNBasic(), sp.KNNWithMeans(), sp.KNNWithZScore(), sp.BaselineOnly(), sp.CoClustering()]
algorithms = [sp.KNNBasic(sim_options=sim_options), sp.KNNWithMeans(sim_options=sim_options), sp.KNNWithZScore(sim_options=sim_options), sp.CoClustering()]


for algorithm in algorithms:
    name = algorithm.__class__.__name__
    print("Starting: " , name)
    start = time.time()    
    algorithm.fit(trainset)
    # Dump algorithm.
    file_name = os.path.expanduser(get_model_filename(name))
    dump.dump(file_name, algo=algorithm)
    print("Done: " , name, "\n")
print ('\n\tDONE\n')

Starting:  KNNBasic
Estimating biases using als...


For item-item similarity, we have trained a bunch of KNN models: KNNBasic, KNNWithBeans, KNNWithZScore.
Let's explore what they mean and we will use the precision and recall at k to determine its performance.

In [4]:
print (">> Loading dumps")
from surprise import dump
import os
model_filename = "./model_KNNBasic.pickle"

file_name = os.path.expanduser(model_filename)
_, loaded_model = dump.load(file_name)
print (">> Loaded dump")
print(loaded_model.__class__.__name__)

>> Loading dumps
>> Loaded dump
KNNBasic


In [14]:

def get_similar_items(loaded_knn_model, anime_title, k=30):     
    iid = loaded_knn_model.trainset.to_inner_iid(anime_title)
    neighbor_ids = loaded_knn_model.get_neighbors(iid, k=k)
    neightbors = (loaded_knn_model.trainset.to_raw_iid(inner_id) for inner_id in neighbor_ids)
    df = pd.DataFrame(neightbors, columns = ['Title'])
    return df

Simple sanity check on one of my favourites during my childhood: Detective Conan. And to compare with my DIY solution.

In [19]:
get_similar_items(loaded_model, "Detective Conan", k=10)

Unnamed: 0,Title
0,Magic Kaito
1,Detective Conan Movie 04: Captured in Her Eyes
2,Detective Conan Movie 01: The Timed Skyscraper
3,Detective Conan Movie 10: Requiem of the Detec...
4,Detective Conan Movie 06: The Phantom of Baker...
5,Detective Conan Movie 02: The Fourteenth Target
6,Detective Conan Movie 03: The Last Wizard of t...
7,Detective Conan Movie 05: Countdown to Heaven
8,Detective Conan Movie 14: The Lost Ship in the...
9,Detective Conan Movie 13: The Raven Chaser


Better than my DIY solution! Interesting to see Magic Kaito at the top.


Now let's use precision + recall at 15 to evaulate the performance of the other KNNs.

In [21]:
models_to_load = ["KNNBasic", "KNNWithMeans", "KNNWithZScore"]
loaded_models = []

for model_name in models_to_load:
    file_name = os.path.expanduser(f'./model_{model_name}.pickle')
    print(file_name)
    _, loaded_model = dump.load(file_name)
    print(loaded_model.__class__.__name__)
    loaded_models.append(loaded_model)

./model_KNNBasic.pickle
KNNBasic
./model_KNNWithMeans.pickle
KNNWithMeans
./model_KNNWithZScore.pickle


MemoryError: 

"MemoryError:" sad times.

In [22]:
# I can prepare any test set to gauge on the performance metrics.
for model in loaded_models:
    predictions = model.test(testset)
    rmse = sp.accuracy.rmse(predictions)
    precisions, recalls = precision_recall_at_k(predictions, k=10, threshold=7)
    precision_avg = sum(prec for prec in precisions.values()) / len(precisions)

    analysis[name] = (name, rmse, precision_avg, time.time() - start)
#
#
#analysis_df = pd.DataFrame.from_dict(analysis, orient = 'index', columns = ['Algorithm', 'RMSE', 'Precision@10', 'Time to run (in seconds)']).reset_index()
#
#analysis_df = analysis_df[['Algorithm', 'RMSE', 'Precision@10', 'Time to run (in seconds)']]
#analysis_df = analysis_df.sort_values(by=['Precision@10'], ascending = False)
#analysis_df['RMSE^-1'] = analysis_df['RMSE'] ** -1
#analysis_df.head(n = 15)

NameError: name 'testset' is not defined

In [5]:

#loaded_model.predict('Tomoki-sama','Bleach').est
UserRatedsPerAnimeNice.head()

NameError: name 'UserRatedsPerAnimeNice' is not defined