# Final Project: Recommendation System

##  Hanzhuo Gong

## COEN 140

### This recommendation system uses User Based Collabrative Filtering to computer the RMSE to make sure the recommended anime are fairy precise. Then use the KNN to find similar anime to recommend to the user.

In [1]:
import pandas as pd
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors

In [2]:

ratings = pd.read_csv('rating.csv')
ratings.head()

Unnamed: 0,user_id,anime_id,rating
0,1,20,-1
1,1,24,-1
2,1,79,-1
3,1,226,-1
4,1,241,-1


In [3]:
#take out the -1 rating
ratings = ratings[(ratings['rating'] >= 0) | (ratings['rating'].isnull())]
ratings.head()

Unnamed: 0,user_id,anime_id,rating
47,1,8074,10
81,1,11617,10
83,1,11757,10
101,1,15451,10
153,2,11771,10


In [4]:
print(ratings)

         user_id  anime_id  rating
47             1      8074      10
81             1     11617      10
83             1     11757      10
101            1     15451      10
153            2     11771      10
...          ...       ...     ...
7813732    73515     16512       7
7813733    73515     17187       9
7813734    73515     22145      10
7813735    73516       790       9
7813736    73516      8074       9

[6337241 rows x 3 columns]


In [5]:
#Read the anime file and take out the unnecesaary features 
anime = pd.read_csv('anime.csv')
anime = anime.drop(['genre','type','episodes', 'members', 'rating'],axis=1)
anime.head()

Unnamed: 0,anime_id,name
0,32281,Kimi no Na wa.
1,5114,Fullmetal Alchemist: Brotherhood
2,28977,Gintama°
3,9253,Steins;Gate
4,9969,Gintama&#039;


In [6]:
merge_ratings = pd.merge(anime,ratings, on='anime_id')
merge_ratings.head()

Unnamed: 0,anime_id,name,user_id,rating
0,32281,Kimi no Na wa.,99,5
1,32281,Kimi no Na wa.,152,10
2,32281,Kimi no Na wa.,244,10
3,32281,Kimi no Na wa.,271,10
4,32281,Kimi no Na wa.,322,10


In [7]:
#Item average rating
average_ratings = pd.DataFrame(merge_ratings.groupby('name')['rating'].mean())
average_ratings.head()

Unnamed: 0_level_0,rating
name,Unnamed: 1_level_1
&quot;0&quot;,4.764706
"&quot;Aesop&quot; no Ohanashi yori: Ushi to Kaeru, Yokubatta Inu",1.0
&quot;Bungaku Shoujo&quot; Kyou no Oyatsu: Hatsukoi,7.343307
&quot;Bungaku Shoujo&quot; Memoire,7.744713
&quot;Bungaku Shoujo&quot; Movie,7.840154


In [8]:
#sorted the mean in higher number first
average_ratings.groupby('name')['rating'].mean().sort_values(ascending=False).head()

name
What&#039;s Michael? (TV)                  10.0
Hello Kitty no Circus ga Yatte Kita        10.0
Yokohama Meibutsu: Otoko Katayama-gumi!    10.0
Midoriyama Koukou Koushien-hen             10.0
Asari-chan: Ai no Marchen Shoujo           10.0
Name: rating, dtype: float64

In [9]:
#copy the ratings to calculate the average user rating
totalRatings = ratings.drop(['user_id','anime_id'],axis=1)
totalRatings.head()

Unnamed: 0,rating
47,10
81,10
83,10
101,10
153,10


In [10]:
#user average rating
userAverage = totalRatings.mean()
print(userAverage)

rating    7.808497
dtype: float64


In [11]:
from sklearn.model_selection import train_test_split

In [12]:
X = ratings.copy()
y = ratings['user_id']
print(y)

47             1
81             1
83             1
101            1
153            2
           ...  
7813732    73515
7813733    73515
7813734    73515
7813735    73516
7813736    73516
Name: user_id, Length: 6337241, dtype: int64


In [13]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = 0.10)
print(X_test)
#X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = 0.25)
#print(X_test)

         user_id  anime_id  rating
3744694    34992     22535      10
1384260    13175      1604       8
6949947    64848      6984       7
5935963    55435       567       8
745455      6960        45      10
...          ...       ...     ...
6818860    63158     28171       8
2325506    22391      9989       9
5315316    50381        27       9
1786806    17312     32729       7
3153356    29136       440       7

[633725 rows x 3 columns]


In [14]:
import numpy as np
from sklearn.metrics import mean_squared_error

def rmse(y_true, y_pred):
    return np.sqrt(mean_squared_error(y_true,y_pred))

In [15]:
def baseline(user_id, anime_id):
    return userAverage
#predict the average number of user's rating

In [16]:
#In order to predict the RMSE, need all the user, we have 9357 users
user = pd.DataFrame(columns=['user_id'])
for i in range(9357):
    user.loc[i] = i+1

user.head()

Unnamed: 0,user_id
0,1
1,2
2,3
3,4
4,5


In [17]:
def score(cf_model): 
    # list of user-anime tuples from the testing dataset
    id_pairs = zip(X_test['user_id'], X_test['anime_id'])
    
    #predict the ratings for user-anime
    y_pred = np.array([cf_model(user, anime) for (user, anime) in id_pairs])
    print("prediction is: ", y_pred)
    
    #Find out the actual ratings given in the test data
    y_true = np.array(X_test['rating'])
    print("actual rating is: ", y_true)
    
    return rmse(y_true, y_pred)

In [18]:
score(baseline)

prediction is:  [[7.80849695]
 [7.80849695]
 [7.80849695]
 ...
 [7.80849695]
 [7.80849695]
 [7.80849695]]
actual rating is:  [10  8  7 ...  9  7  7]


1.570641943661616

In [19]:
r_matrix = X_train.pivot_table(values='rating', index='user_id', columns='anime_id')
r_matrix.head()

anime_id,1,5,6,7,8,15,16,17,18,19,...,34238,34239,34240,34252,34283,34324,34325,34349,34367,34475
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
5,,,8.0,,,6.0,,6.0,,,...,,,,,,,,,,
7,,,,,,,,,,,...,,,,,,,,,,


In [20]:
#User Based Collaborative Filter using Mean Ratings
def cf_user_mean(user_id, anime_id):
    
    #Check if anime_id exists in r_matrix
    if anime_id in r_matrix:
        #Compute the mean of all the ratings given to the anime
        mean_rating = r_matrix[anime_id].mean()
    
    else:
        #Default to a rating, if the anime not there, use the user average
        mean_rating = userAverage
    
    return mean_rating


### Lines below will take a long time to run

In [57]:
#Compute RMSE for the Mean model
#This will take around 1 hr 12 mins for my computer to run 
score(cf_user_mean)

prediction is:  [8.931249265483606 7.866099893730074 6.857651245551601 ...
 7.911372371693421 7.967542503863988 7.569737153682216]
actual rating is:  [10  7 10 ... 10  9  7]


1.4360612373407562

### Unable to test cross validation, kernel keep deading, longest run: 8 hrs, still not finished

In [21]:
'''
#If the system doesn't have surprise, need to run pip install surprise
#Import the required classes and methods from the surprise library
from surprise import Reader, Dataset, KNNBasic, SVD
from surprise.model_selection import cross_validate

#Define a Reader object
#The Reader object helps in parsing the file or dataframe containing ratings
reader = Reader()

#Create the dataset to be used for building the filter
data = Dataset.load_from_df(ratings, reader)

#Define the algorithm kNN
knn = KNNBasic()

#Calculate the RMSE, Run 5-fold cross-validation and then print results
cross_validate(knn, data, measures=['RMSE'], cv=2, verbose=True)

#reduce the testing data size, and K-fold, re-run this again.
#use KNN, remove cross_validation
'''

"\n#If the system doesn't have surprise, need to run pip install surprise\n#Import the required classes and methods from the surprise library\nfrom surprise import Reader, Dataset, KNNBasic, SVD\nfrom surprise.model_selection import cross_validate\n\n#Define a Reader object\n#The Reader object helps in parsing the file or dataframe containing ratings\nreader = Reader()\n\n#Create the dataset to be used for building the filter\ndata = Dataset.load_from_df(ratings, reader)\n\n#Define the algorithm kNN\nknn = KNNBasic()\n\n#Calculate the RMSE, Run 5-fold cross-validation and then print results\ncross_validate(knn, data, measures=['RMSE'], cv=2, verbose=True)\n\n#reduce the testing data size, and K-fold, re-run this again.\n#use KNN, remove cross_validation\n"

## Below is to find item similarity using KNN

In [22]:
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors
from fuzzywuzzy import process

In [23]:
#replace the NA for 0
user_ratings = merge_ratings.pivot_table(index='anime_id',columns='user_id', values='rating').fillna(0)
user_ratings.head()

user_id,1,2,3,5,7,8,9,10,11,12,...,73507,73508,73509,73510,73511,73512,73513,73514,73515,73516
anime_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,9.0,0.0,0.0,0.0,0.0,0.0,9.0,0.0,10.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,8.0,0.0,0.0,0.0,0.0,0.0,8.0,0.0,10.0,0.0
6,0.0,0.0,0.0,8.0,0.0,0.0,0.0,0.0,0.0,0.0,...,9.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [24]:
mat_anime_users=csr_matrix(user_ratings.values)
print(mat_anime_users)

  (0, 15)	10.0
  (0, 17)	9.0
  (0, 19)	9.0
  (0, 28)	10.0
  (0, 30)	7.0
  (0, 39)	10.0
  (0, 42)	10.0
  (0, 45)	10.0
  (0, 46)	10.0
  (0, 49)	8.0
  (0, 62)	8.0
  (0, 66)	7.0
  (0, 74)	10.0
  (0, 75)	8.0
  (0, 96)	8.0
  (0, 119)	9.0
  (0, 122)	9.0
  (0, 131)	10.0
  (0, 143)	10.0
  (0, 150)	9.0
  (0, 153)	8.0
  (0, 163)	7.0
  (0, 165)	10.0
  (0, 168)	10.0
  (0, 175)	6.0
  :	:
  (9922, 1629)	9.0
  (9922, 9650)	10.0
  (9922, 18978)	4.0
  (9922, 21571)	8.0
  (9922, 31692)	7.0
  (9922, 32053)	7.0
  (9922, 33445)	6.0
  (9922, 40075)	8.0
  (9922, 41771)	8.0
  (9922, 44732)	7.0
  (9922, 45136)	7.0
  (9922, 65172)	6.0
  (9922, 66256)	8.0
  (9922, 67125)	7.0
  (9922, 67974)	8.0
  (9923, 21198)	7.0
  (9923, 36384)	6.0
  (9923, 54550)	8.0
  (9924, 28913)	6.0
  (9924, 45136)	6.0
  (9924, 56512)	5.0
  (9924, 59127)	4.0
  (9925, 13195)	6.0
  (9925, 50622)	5.0
  (9925, 66796)	9.0


In [25]:
#use consine similarity to compute the KNN
model_knn = NearestNeighbors(metric = 'cosine', algorithm='brute', n_neighbors=10)

In [26]:
model_knn.fit(mat_anime_users)

NearestNeighbors(algorithm='brute', leaf_size=30, metric='cosine',
                 metric_params=None, n_jobs=None, n_neighbors=10, p=2,
                 radius=1.0)

In [27]:
def recommender(anime_name, data, model, n_recommendations):
    model.fit(data)
    idx=process.extractOne(anime_name, anime['name'])[2]
    print("Anime Selected: ", anime['name'][idx], 'Index: ', idx)
    print('Searching for recommendations...')
    distances, indices = model.kneighbors(data[idx], n_neighbors= n_recommendations)
    print(distances, indices)
    #print(indices)
    for i in indices:
        #print(anime['name'][i])
        print(anime['name'][i])
    
    
recommender('kimi no na wa', mat_anime_users, model_knn, 10)

Anime Selected:  Kimi no Na wa. Index:  0
Searching for recommendations...
[[0.         0.39224266 0.44976264 0.46517664 0.4931964  0.49864531
  0.52742098 0.53427002 0.54379092 0.54390219]] [[   0    1  181    2  202   19 1814   27   23  802]]
0                         Kimi no Na wa.
1       Fullmetal Alchemist: Brotherhood
181                       Major: Message
2                               Gintama°
202        Hunter x Hunter: Greed Island
19       Code Geass: Hangyaku no Lelouch
1814                     Ganbarist! Shun
27                    Mushishi Zoku Shou
23                         One Punch Man
802                    Seikai no Monshou
Name: name, dtype: object


In [28]:
recommender('One Punch Man', mat_anime_users, model_knn, 10)

Anime Selected:  One Punch Man Index:  23
Searching for recommendations...
[[7.83817455e-14 3.68694161e-01 4.60296472e-01 5.02403970e-01
  5.16921940e-01 5.43790921e-01 5.59699817e-01 5.67155364e-01
  5.78317357e-01 5.90324546e-01]] [[  23  437  436   27  726    0   19 1417    1   21]]
23                                          One Punch Man
437                                      Junjou Romantica
436                              Jigoku Shoujo Futakomori
27                                     Mushishi Zoku Shou
726                                               Jin-Rou
0                                          Kimi no Na wa.
19                        Code Geass: Hangyaku no Lelouch
1417                    Hokuto no Ken Zero: Kenshirou Den
1                        Fullmetal Alchemist: Brotherhood
21      Rurouni Kenshin: Meiji Kenkaku Romantan - Tsui...
Name: name, dtype: object


In [46]:
from sklearn import neighbors
from math import sqrt
from sklearn.metrics import mean_squared_error

In [47]:
n_neightbors = 10
rmse_val=[]
for K in range(10):
    K = K+1
    model = neighbors.KNeighborsRegressor(n_neighbors= K)
    model.fit(X_train,y_train)
    pred = model.predict(X_test)
    error = sqrt(mean_squared_error(y_test,pred))
    rmse_val.append(error)
    print('K is: ', K, ' RMSE error is: ', error)

K is:  1  RMSE error is:  6.992366345377843
K is:  2  RMSE error is:  6.376063856594053
K is:  3  RMSE error is:  6.230592147779602
K is:  4  RMSE error is:  6.2050970831911165
K is:  5  RMSE error is:  6.229196776865429
K is:  6  RMSE error is:  6.269803575786697
K is:  7  RMSE error is:  6.331273817956041
K is:  8  RMSE error is:  6.397785244269779
K is:  9  RMSE error is:  6.463722685011784
K is:  10  RMSE error is:  6.523442259057306


KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',
                    metric_params=None, n_jobs=None, n_neighbors=10, p=2,
                    weights='uniform')

K is:  10  RMSE error is:  6.523442259057306
