In [1]:
# Loading the libraries
import pandas as pd
import numpy as np

In [2]:
#Path to the csv file
path = r"/home/keembo/pacmann_recommender_system/data/raw"
print(path)

/home/keembo/pacmann_recommender_system/data/raw


In [3]:
#Read csv
final_animedataset_df = pd.read_csv(path + "/final_animedataset.csv")

In [4]:
# Showing the datafram information
print(final_animedataset_df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 35305695 entries, 0 to 35305694
Data columns (total 13 columns):
 #   Column      Dtype  
---  ------      -----  
 0   username    object 
 1   anime_id    int64  
 2   my_score    int64  
 3   user_id     int64  
 4   gender      object 
 5   title       object 
 6   type        object 
 7   source      object 
 8   score       float64
 9   scored_by   int64  
 10  rank        float64
 11  popularity  int64  
 12  genre       object 
dtypes: float64(2), int64(5), object(6)
memory usage: 3.4+ GB
None


In [5]:
# Reducing the dataset to 10% of the original dataset, for memory purposes
final_animedataset_df = final_animedataset_df.sample(frac=0.1, random_state=42)

In [6]:
# Show the first few rows of the dataframe
print(final_animedataset_df.head())

                  username  anime_id  my_score  user_id  gender  \
1444015            Nioko13     11531         0    74301  Female   
32919378             Dust_     34599         0  5595739    Male   
26337444  Nishimiya_Sensei      8937         0  5808728    Male   
25294092             ddbad       317         5  2503685    Male   
2213121       natsumi-chan      4246        10   109945  Female   

                                          title   type       source  score  \
1444015                         UN-GO: Inga-ron  Movie        Novel   7.70   
32919378                          Made in Abyss     TV    Web manga   8.91   
26337444              Toaru Majutsu no Index II     TV  Light novel   7.79   
25294092     Final Fantasy VII: Advent Children  Movie         Game   7.88   
2213121   Eureka Seven: Pocket ga Niji de Ippai  Movie     Original   7.22   

          scored_by    rank  popularity  \
1444015       14566  1121.0        2229   
32919378     166905    20.0         178   

In [7]:
# Check for missing values
print(final_animedataset_df.isnull().sum())

username         24
anime_id          0
my_score          0
user_id           0
gender            0
title             0
type              0
source            0
score             0
scored_by         0
rank          75180
popularity        0
genre           225
dtype: int64


In [8]:
# Check for duplicates
print(final_animedataset_df.duplicated().sum())

0


In [9]:
# Show the shape of the dataframe
print(final_animedataset_df.shape)

(3530570, 13)


In [10]:
# Statistics of only the columns my_score, score, scored_by, rank, popularity
print(final_animedataset_df[['my_score','score','scored_by', 'rank', 'popularity']].describe())

           my_score         score     scored_by          rank    popularity
count  3.530570e+06  3.530570e+06  3.530570e+06  3.455390e+06  3.530570e+06
mean   4.595993e+00  7.528114e+00  1.097831e+05  2.044993e+03  1.334928e+03
std    3.909624e+00  7.272679e-01  1.480833e+05  1.971351e+03  1.562810e+03
min    0.000000e+00  1.900000e+00  2.000000e+02  1.000000e+00  1.000000e+00
25%    0.000000e+00  7.110000e+00  1.746700e+04  5.000000e+02  2.510000e+02
50%    6.000000e+00  7.560000e+00  5.399900e+04  1.413000e+03  7.670000e+02
75%    8.000000e+00  8.030000e+00  1.396270e+05  3.039000e+03  1.821000e+03
max    1.000000e+01  9.250000e+00  1.009477e+06  9.527000e+03  1.026500e+04


In [11]:
# Handling missing values
final_animedataset_df.dropna(inplace=True)
processed_df = final_animedataset_df

In [12]:
# creating a user matrix using sparse matrix
from scipy.sparse import csr_matrix

#create the sparse for the data
row = processed_df['user_id'].astype('category').cat.codes
col = processed_df['anime_id'].astype('category').cat.codes
processed_df['user_id_cat'] = row
processed_df['anime_id_cat'] = col
rating = processed_df['my_score']

sparse_matrix = csr_matrix((rating, (row, col)), shape=(row.max()+1, col.max()+1))

In [13]:
print(sparse_matrix)

  (0, 1729)	8
  (0, 6094)	0
  (0, 111)	8
  (0, 1340)	9
  (0, 349)	7
  (0, 1383)	5
  (0, 2840)	0
  (0, 3196)	0
  (0, 1488)	7
  (0, 3445)	0
  (0, 1000)	0
  (0, 1180)	8
  (0, 286)	5
  (0, 512)	8
  (0, 51)	9
  (0, 175)	10
  (0, 164)	8
  (0, 1574)	6
  (0, 274)	8
  (0, 708)	0
  (0, 3343)	0
  (0, 157)	7
  (0, 4544)	0
  (0, 76)	8
  (0, 33)	7
  :	:
  (110676, 3838)	0
  (110676, 6583)	0
  (110676, 3525)	0
  (110676, 1800)	0
  (110677, 5993)	8
  (110677, 6258)	8
  (110677, 6387)	8
  (110677, 7211)	0
  (110677, 469)	8
  (110677, 4366)	8
  (110677, 7214)	9
  (110677, 2871)	9
  (110677, 7260)	7
  (110677, 4104)	9
  (110677, 6287)	9
  (110678, 7012)	0
  (110678, 484)	9
  (110678, 393)	10
  (110678, 142)	7
  (110679, 1843)	0
  (110679, 6228)	0
  (110679, 6983)	0
  (110679, 7435)	0
  (110679, 6294)	0
  (110679, 1321)	0


In [14]:
# computing the similarity matric using cosine similarity
from sklearn.metrics.pairwise import cosine_similarity

item_similarity_matrix = cosine_similarity(sparse_matrix.T, dense_output=False)
print(item_similarity_matrix)

  (0, 2707)	0.00028471311850851466
  (0, 7619)	0.013120216178876463
  (0, 7256)	0.010288646729738475
  (0, 6902)	0.007997175665966116
  (0, 2913)	0.006017933304760942
  (0, 2161)	0.00475111116975817
  (0, 7116)	0.007902734905315264
  (0, 6423)	0.008142994286992345
  (0, 7465)	0.005318981595074605
  (0, 7165)	0.0030981438646147203
  (0, 7093)	0.001991508792913957
  (0, 7080)	0.0035538940670224173
  (0, 7061)	0.0015545756515292697
  (0, 6625)	0.004671047829090624
  (0, 5814)	0.001746276485515214
  (0, 4790)	0.0022363933580628118
  (0, 3886)	0.003094209712088567
  (0, 3790)	0.0011917463226826547
  (0, 3748)	0.00245520515636011
  (0, 3704)	0.007313709488902375
  (0, 5675)	0.013115322197265302
  (0, 6512)	0.004032309541058253
  (0, 5756)	0.005727286856337283
  (0, 1446)	0.0032436745529513065
  (0, 6657)	0.0048405076668452856
  :	:
  (7631, 3256)	0.01759522741919797
  (7631, 3188)	0.0034587402957251224
  (7631, 3155)	0.005167460970468587
  (7631, 3079)	0.009000572807505798
  (7631, 2981)	0.0

In [15]:
# Create a funcction to get the top 5 similar anime
def recommend_anime(user_id, sparse_matrix, item_similarity_matrix, k=5):
    # Covert the user_id to category codes
    user_id_cat = processed_df[processed_df['user_id'] == user_id]['user_id_cat'].iloc[0]
    # Get all items rated by the user
    rated_items = sparse_matrix[user_id_cat].nonzero()[1]
    # Get the similarity scores of the items rated by the user
    similarity_scores = item_similarity_matrix[rated_items]
    # Sum the similarity scores of the items rated by the user
    total_similarity_scores = similarity_scores.sum(axis=0).A.flatten()
    
    # Recommended items with the highest similarity scores
    recommendations = np.argsort(total_similarity_scores)[::-1]
    # Exclude already rated items from the recommendations and return the top k
    mask = np.isin(recommendations, rated_items, invert=True)
    recommend_animes = recommendations[mask][:k]
    
    return recommend_animes

In [16]:
# Test the function
recommended_list = recommend_anime(1, sparse_matrix, item_similarity_matrix)
recommended_list

array([  52, 1321,  100,   20,    0])

In [17]:
# Add the unique anime name to the recommended list
recommended_animes = processed_df[processed_df['anime_id_cat'].isin(recommended_list)].drop_duplicates(subset=['title'])
recommended_animes[['title', 'genre']]

Unnamed: 0,title,genre
27994411,Death Note,"Mystery, Police, Psychological, Supernatural, ..."
17856970,Fullmetal Alchemist,"Action, Adventure, Comedy, Drama, Fantasy, Mag..."
2560959,Neon Genesis Evangelion,"Action, Sci-Fi, Dementia, Psychological, Drama..."
26887068,Cowboy Bebop,"Action, Adventure, Comedy, Drama, Sci-Fi, Space"
24695977,Full Metal Panic? Fumoffu,"Action, Comedy, School"


In [18]:
#  Evaluate the model precision and recall
from sklearn.model_selection import train_test_split

# Split the data into train and test sets
train_df, test_df = train_test_split(processed_df, test_size=0.2, random_state=42)

# Create a user-item matrix for the train set
train_row = train_df['user_id_cat']
train_col = train_df['anime_id_cat']
train_matrix = csr_matrix((train_df['my_score'], (train_row, train_col)), shape=(train_row.max()+1, train_col.max()+1))

# Create a user-item matrix for the test set
test_row = test_df['user_id_cat']
test_col = test_df['anime_id_cat']
test_matrix = csr_matrix((test_df['my_score'], (test_row, test_col)), shape=(test_row.max()+1, test_col.max()+1))

In [19]:
train_similarity_matrix = cosine_similarity(train_matrix.T, dense_output=False)

In [20]:
# Test the recommendation function on the test set
test_recommended_list = recommend_anime(1, test_matrix, train_similarity_matrix)
test_recommended_list

array([ 512, 2505, 3089, 2815,  723])

In [21]:
# Add the unique anime name to the recommended list
test_recommended_animes = processed_df[processed_df['anime_id_cat'].isin(test_recommended_list)].drop_duplicates(subset=['title'])
test_recommended_animes[['title', 'genre']]

Unnamed: 0,title,genre
16253,Major S2,"Comedy, Drama, Shounen, Sports"
20975307,Major S5,"Comedy, Drama, Romance, Sports"
4937914,Major S4,"Sports, Comedy, Drama, Shounen"
15524719,Ginsoukikou Ordian,"Mecha, Sci-Fi"
30767194,Sakigake!! Otokojuku Movie,"Action, Comedy, Martial Arts, School, Shounen"


In [25]:
# Function to return the list of recomended anime for each user_id
def recommendation_list(df, sparse_matrix, item_similarity_matrix):
    # Empty dict
    results = []
    
    for user_id in df['user_id'].unique():
        recommended_anime = recommend_anime(user_id, sparse_matrix, item_similarity_matrix)
        
        # Append the results to the empty dict
        results.append({'user_id': user_id,'recommended_anime': recommended_anime})
        
    results_df = pd.DataFrame(results)
    
    return results_df

In [26]:
# Returning the recommended anime for each user in the test set
recommended_test_list = recommendation_list(test_df, test_matrix, train_similarity_matrix)
recommended_test_list

Unnamed: 0,user_id,recommended_anime
0,4662832,"[7631, 2547, 2535, 2536, 2537]"
1,26925,"[7631, 2547, 2535, 2536, 2537]"
2,1505011,"[7533, 7619, 7256, 7595, 7494]"
3,6661026,"[6685, 6082, 6985, 6852, 6402]"
4,6405969,"[6467, 6919, 6507, 5461, 6175]"
...,...,...
97584,1112123,"[2871, 2893, 2827, 3486, 3441]"
97585,3908237,"[7631, 2547, 2535, 2536, 2537]"
97586,429269,"[7631, 2547, 2535, 2536, 2537]"
97587,321826,"[3218, 686, 2064, 7090, 4207]"


In [27]:
# Returning the relevant anime for each user in the test set
distinct_ranking = test_df['my_score'].drop_duplicates()
distinct_ranking

33982588     0
32709888     8
21541801    10
19969784     9
30173199     6
14918115     7
22480136     5
13482421     3
1012239      2
9462057      4
31211667     1
Name: my_score, dtype: int64

In [28]:
# Score threshold for an anime to be recommended
score_threshold = 5

# Filter the anime list for each user that is above the score threshold
relevant_anime = test_df[test_df['my_score'] >= score_threshold]
# Create a dataframe with the relevant anime for each user
relevant_anime = relevant_anime.groupby('user_id')['anime_id'].apply(list).reset_index()
relevant_anime

Unnamed: 0,user_id,anime_id
0,1,"[627, 1735, 201, 289, 306]"
1,3,[16]
2,4,"[2251, 2167, 4985, 6793, 13659, 13655, 32601]"
3,20,"[572, 174, 1818, 457, 1956]"
4,36,"[770, 8197, 7769, 31629, 240, 34798, 3001, 322..."
...,...,...
84501,7240178,[18277]
84502,7242053,"[30711, 31251, 570, 33433, 29095, 3784]"
84503,7248525,"[8074, 30307]"
84504,7249032,[34822]


In [29]:
def evaluate_recommender_system(recommended_df, relevant_df):
    # Merge the recommended and relevant dataframes on user_id
    merged_df = pd.merge(recommended_df, relevant_df, on='user_id', how='inner', suffixes=('_rec', '_rel'))
    
    # Calculate True Positives (TP), False Positives (FP), and False Negatives (FN)
    TP = merged_df.apply(lambda row: len(set(row['recommended_anime']) & set(row['anime_id'])), axis=1).sum()
    FP = merged_df.apply(lambda row: len(set(row['recommended_anime']) - set(row['anime_id'])), axis=1).sum()
    FN = merged_df.apply(lambda row: len(set(row['anime_id']) - set(row['recommended_anime'])), axis=1).sum()
    
    # Calculate Precision, Recall, and F1 Score
    precision = TP / (TP + FP) if (TP + FP) > 0 else 0
    recall = TP / (TP + FN) if (TP + FN) > 0 else 0
    f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0
    
    return {'precision': precision, 'recall': recall, 'f1_score': f1_score}

# Assuming recommended_test_list and relevant_anime are your dataframes
evaluation_results = evaluate_recommender_system(recommended_test_list, relevant_anime)
print(evaluation_results)

{'precision': 0.00046387238775944905, 'recall': 0.0004900147004410132, 'f1_score': 0.00047658531485025255}


In [30]:
from surprise import Dataset, Reader
from surprise.model_selection import cross_validate, GridSearchCV
from surprise import SVD

In [34]:
# Assuming 'processed_df' is your DataFrame with columns ['user_id', 'anime_id', 'my_score']
reader = Reader(rating_scale=(1, 10))  # Adjust the rating_scale according to your dataset's scale
data = Dataset.load_from_df(train_df[['user_id', 'anime_id', 'my_score']], reader)

In [35]:
# Use the SVD algorithm
algo = SVD()

# Perform a 5-fold cross-validation
cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    3.5344  3.5321  3.5300  3.5328  3.5318  3.5322  0.0014  
MAE (testset)     2.8839  2.8824  2.8821  2.8834  2.8831  2.8830  0.0007  
Fit time          33.20   31.97   32.94   32.02   32.68   32.56   0.49    
Test time         4.38    4.96    5.25    4.41    5.28    4.86    0.40    


{'test_rmse': array([3.53438921, 3.53205583, 3.53003922, 3.53282336, 3.53181551]),
 'test_mae': array([2.88394924, 2.88240467, 2.88210828, 2.88344933, 2.88306504]),
 'fit_time': (33.196446895599365,
  31.967849016189575,
  32.93683338165283,
  32.02084565162659,
  32.67876195907593),
 'test_time': (4.377642869949341,
  4.959263801574707,
  5.254959344863892,
  4.4053568840026855,
  5.2798449993133545)}

In [36]:
param_grid = {
    'n_epochs': [5, 10, 20],  # Number of epochs
    'lr_all': [0.002, 0.005],  # Learning rate
    'reg_all': [0.02, 0.1]  # Regularization term
}

gs = GridSearchCV(SVD, param_grid, measures=['rmse', 'mae'], cv=3)

gs.fit(data)

# Best RMSE score
print(gs.best_score['rmse'])

# Combination of parameters that gave the best RMSE score
print(gs.best_params['rmse'])

3.3166180950490407
{'n_epochs': 20, 'lr_all': 0.002, 'reg_all': 0.1}


In [37]:
# Use the best parameters
algo = SVD(n_epochs=gs.best_params['rmse']['n_epochs'], lr_all=gs.best_params['rmse']['lr_all'], reg_all=gs.best_params['rmse']['reg_all'])

# Retrain the algorithm on the full training data
trainset = data.build_full_trainset()
algo.fit(trainset)

# Cross Validate again
cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    3.2822  3.2793  3.2852  3.2844  3.2814  3.2825  0.0021  
MAE (testset)     2.7755  2.7741  2.7776  2.7797  2.7717  2.7757  0.0028  
Fit time          31.22   31.75   32.11   31.88   32.15   31.82   0.34    
Test time         5.59    5.36    5.61    5.47    5.64    5.53    0.11    


{'test_rmse': array([3.28221386, 3.27929361, 3.28516909, 3.28436129, 3.28138222]),
 'test_mae': array([2.77548837, 2.7740694 , 2.7776254 , 2.77972567, 2.77171222]),
 'fit_time': (31.21980381011963,
  31.7461576461792,
  32.11291980743408,
  31.882842779159546,
  32.14727306365967),
 'test_time': (5.589059829711914,
  5.362762928009033,
  5.612452268600464,
  5.4657135009765625,
  5.643476247787476)}

In [38]:
def predict_ratings(user_id, anime_ids, algo):
    # List to store each anime ID and its predicted rating
    predictions = []

    for anime_id in anime_ids:
        # Predict the rating for each anime ID
        prediction = algo.predict(user_id, anime_id)
        predictions.append((anime_id, prediction.est))

    # Convert the list of predictions to a DataFrame
    predictions_df = pd.DataFrame(predictions, columns=['anime_id', 'predicted_rating'])

    # Sort the DataFrame by predicted ratings in descending order
    predictions_df = predictions_df.sort_values('predicted_rating', ascending=False)

    return predictions_df

In [39]:
# New predictions for user_id 1
new_predictions = predict_ratings(1, test_df['anime_id'].unique(), algo)
new_predictions.head(10)

Unnamed: 0,anime_id,predicted_rating
932,572,8.892566
449,1575,8.286678
448,199,8.190081
200,853,8.179647
166,120,7.947438
317,31043,7.552004
272,7311,7.21602
379,1689,7.203213
1032,317,7.148909
2701,3782,7.09032


In [45]:
# Merge the new_predictions DataFrame with test_df to get the titles and genres
new_predictions_with_info = pd.merge(new_predictions, test_df[['anime_id', 'title', 'genre']].drop_duplicates(), on='anime_id', how='left')

# Display the top 10 recommendations with titles and genres
new_predictions_with_info.head(5)

Unnamed: 0,anime_id,predicted_rating,title,genre
0,572,8.892566,Kaze no Tani no Nausicaä,"Adventure, Fantasy"
1,1575,8.286678,Code Geass: Hangyaku no Lelouch,"Action, Military, Sci-Fi, Super Power, Drama, ..."
2,199,8.190081,Sen to Chihiro no Kamikakushi,"Adventure, Supernatural, Drama"
3,853,8.179647,Ouran Koukou Host Club,"Comedy, Harem, Romance, School, Shoujo"
4,120,7.947438,Fruits Basket,"Slice of Life, Comedy, Drama, Romance, Fantasy..."


In [47]:
# Filter the processed_df for user_id 1 and sort by 'my_score' in descending order
top_rated_animes_by_user = processed_df[processed_df['user_id'] == 1].sort_values('my_score', ascending=False).head(5)

# Display the top 5 rated anime by the user
top_rated_animes_by_user[['anime_id', 'title', 'genre', 'my_score']]

Unnamed: 0,anime_id,title,genre,my_score
6586,199,Sen to Chihiro no Kamikakushi,"Adventure, Supernatural, Drama",10
6572,57,Beck,"Comedy, Drama, Music, Shounen, Slice of Life",9
6796,1559,Shijou Saikyou no Deshi Kenichi,"Action, Comedy, Martial Arts, School, Shounen",9
6841,627,Major S1,"Comedy, Sports, Drama, Shounen",9
6689,71,Full Metal Panic!,"Action, Military, Sci-Fi, Comedy, Mecha",9
