In [38]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from surprise import Dataset
from surprise import Reader
from surprise import SVD
from surprise.model_selection import cross_validate


# path with rating
RATINGS_PATH = './data/anime/anime_ratings.dat'
ANIME_PATH = './data/anime/anime_info.dat'

ratings_data = pd.read_csv(RATINGS_PATH, sep='\t')
anime_data = pd.read_csv(ANIME_PATH, sep='\t')

In [39]:
# Get minimum and maximum rating from the dataset
min_rating = ratings_data.Feedback.min()
max_rating = ratings_data.Feedback.max()
 
reader = Reader(rating_scale=(min_rating, max_rating))
data = Dataset.load_from_df(ratings_data[['User_ID', 'Anime_ID', 'Feedback']], reader)

## Data Exploration

In [40]:
ratings_data

Unnamed: 0,User_ID,Anime_ID,Feedback
0,1,1,8
1,1,3,5
2,1,5,9
3,1,6,9
4,1,7,8
...,...,...,...
419948,5001,405,7
419949,5001,6306,8
419950,5001,19,9
419951,5001,12,9


In [41]:
from plotly.offline import init_notebook_mode, plot, iplot
import plotly.graph_objs as go

init_notebook_mode(connected=True)

graph_data = ratings_data['Feedback'].value_counts().sort_index(ascending=False)
trace = go.Bar(x = graph_data.index,
               text = ['{:.1f} %'.format(val) for val in (graph_data.values / ratings_data.shape[0] * 100)],
               textposition = 'auto',
               textfont = dict(color = '#000000'),
               y = graph_data.values,
               )
# Create layout
layout = dict(title = 'Distribution Of anime-ratings',
              xaxis = dict(title = 'Rating'),
              yaxis = dict(title = 'Count'))
# Create plot
fig = go.Figure(data=[trace], layout=layout)
iplot(fig)

In [42]:
# Number of ratings per book
book_data = ratings_data.groupby('Anime_ID')['Feedback'].count().clip(upper=200)

# Create trace
trace = go.Histogram(x = book_data.values,
                     name = 'Ratings',
                     xbins = dict(start = 0,
                                  end = 200,
                                  size = 2))
# Create layout
layout = go.Layout(title = 'Distribution Of Number of Ratings Per Anime',
                   xaxis = dict(title = 'Number of Ratings Per Anime'),
                   yaxis = dict(title = 'Count'),
                   bargap = 0.2)

# Create plot
fig = go.Figure(data=[trace], layout=layout)
iplot(fig)

## Hyperparameter search

In [50]:
from surprise import SVD
from surprise.model_selection import GridSearchCV
 
param_grid = {
  'n_factors': [10, 20, 30, 50, 100],
  'n_epochs': [5, 10, 20, 30, 40, 50, 100]
}
 
gs = GridSearchCV(SVD, param_grid, measures=['rmse', 'mae'], cv=10)
gs.fit(data)
 
print(gs.best_score['rmse'])
print(gs.best_params['rmse'])

1.1780101229929567
{'n_factors': 20, 'n_epochs': 20}


## Cross validation with 10 Folds

In [44]:
svd = SVD(n_epochs=20,n_factors=20)
results = cross_validate(svd, data, measures=['RMSE', 'MAE'], cv=10, verbose=True)

Evaluating RMSE, MAE of algorithm SVD on 10 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Fold 6  Fold 7  Fold 8  Fold 9  Fold 10 Mean    Std     
RMSE (testset)    1.1926  1.1745  1.1731  1.1740  1.1799  1.1752  1.1753  1.1749  1.1792  1.1796  1.1778  0.0055  
MAE (testset)     0.9005  0.8872  0.8857  0.8846  0.8884  0.8870  0.8889  0.8834  0.8909  0.8909  0.8887  0.0046  
Fit time          0.81    1.00    0.87    1.22    0.85    0.84    1.14    0.84    0.97    1.20    0.98    0.15    
Test time         0.10    0.10    0.25    0.10    0.10    0.10    0.10    0.25    0.10    0.10    0.13    0.06    


In [45]:
from surprise.model_selection import train_test_split
 
# best hyperparameters
best_factor = gs.best_params['rmse']['n_factors']
best_epoch = gs.best_params['rmse']['n_epochs']
 
# sample random trainset and testset
# test set is made of 20% of the ratings.
trainset, testset = train_test_split(data, test_size=.20)
 
# We'll use the famous SVD algorithm.
svd = SVD(n_factors=best_factor, n_epochs=best_epoch)
 
# Train the algorithm on the trainset
svd.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x28130e6d0>

## Give top-k recommendation based on user

In [49]:
def generate_recommendation(model, user_id, ratings_df, anime_df, n_items):
   # Get a list of all anime IDs from dataset
    anime_ids = ratings_df["Anime_ID"].unique()
 
   # Get a list of all anime IDs that have been watched by user
    anime_ids_user = ratings_df.loc[ratings_df["User_ID"] == user_id, "Anime_ID"]
    # Get a list off all anime IDS that that have not been watched by user
    anime_ids_to_pred = np.setdiff1d(anime_ids, anime_ids_user)
 
    # Apply a rating of 4 to all interactions (only to match the Surprise dataset format)
    test_set = [[user_id, anime_id, 4] for anime_id in anime_ids_to_pred]

    # Predict the ratings and generate recommendations
    predictions = model.test(test_set)
    pred_ratings = np.array([pred.est for pred in predictions])
    print("Top {0} item recommendations for user {1}:".format(n_items, user_id))
    # Rank top-n animes based on the predicted ratings
    index_max = (-pred_ratings).argsort()[:n_items]
    for i in index_max:
        anime_id = anime_ids_to_pred[i]
        print(anime_df[anime_df["anime_ids"]==anime_id]["name"].values[0], pred_ratings[i])
 
 
# define which user ID that we want to give recommendation
userID = 5001
# define how many top-n animes that we want to recommend
n_items = 10
# generate recommendation using the model that we have trained
generate_recommendation(svd,userID,ratings_data,anime_data,n_items)

Top 10 item recommendations for user 5001:
Gintama&#039;: Enchousen 8.880735288872806
Mushishi Zoku Shou 2nd Season 8.802124644814953
Gintama Movie: Kanketsu-hen - Yorozuya yo Eien Nare 8.752242529003576
Kaiba 8.716908544037986
Hajime no Ippo: New Challenger 8.688542788177742
Gintama&#039; 8.644647237398415
Uchuu Senkan Yamato 2199 8.64130767395637
Clannad: After Story 8.628552431662168
Ginga Eiyuu Densetsu 8.598029054152834
Gintama Movie: Shinyaku Benizakura-hen 8.54111879343711
