![](https://www.ieseg.fr/wp-content/uploads/IESEG-Logo-2012-rgb.jpg)

# Kaggle - Recommendation Tools Individual Assignment
### by: NITHESH RAMANNA

In [1]:
import pandas as pd 
import numpy as np 
from sklearn.model_selection import train_test_split, cross_validate, GridSearchCV
from surprise import Dataset, Reader, KNNBasic, SVD, accuracy
import matplotlib.pyplot as plt

# NLP packages
import nltk # pip install nltk
nltk.download('punkt')
nltk.download('stopwords')
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.stem.snowball import SnowballStemmer

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\nramanna\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\nramanna\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# Read all the data csv files
data = pd.read_csv('./Data_Kaggle/train.csv')
test_student = pd.read_csv('./Data_Kaggle/test_students.csv')
metadata = pd.read_csv('./Data_Kaggle/metadata.csv')

In [3]:
# data information
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 161753 entries, 0 to 161752
Data columns (total 8 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   userID      161753 non-null  int64  
 1   overall     161753 non-null  float64
 2   asin        161753 non-null  object 
 3   vote        15761 non-null   object 
 4   reviewText  161751 non-null  object 
 5   summary     161752 non-null  object 
 6   style       132112 non-null  object 
 7   image       4546 non-null    object 
dtypes: float64(1), int64(1), object(6)
memory usage: 9.9+ MB


In [4]:
# unique values in overall column
data.overall.unique()

array([5., 2., 4., 1., 3.])

In [5]:
# Test Student information
test_student.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 76043 entries, 0 to 76042
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   ID      76043 non-null  object
 1   userID  76043 non-null  int64 
 2   asin    76043 non-null  object
dtypes: int64(1), object(2)
memory usage: 1.7+ MB


In [6]:
test_student

Unnamed: 0,ID,userID,asin
0,21069B00BFK2B24,21069,B00BFK2B24
1,3506B00ZK0Y7R2,3506,B00ZK0Y7R2
2,21907B0002AQPA2,21907,B0002AQPA2
3,14092B0002DHXX2,14092,B0002DHXX2
4,3085B0006VB3SQ,3085,B0006VB3SQ
...,...,...,...
76038,9343B004GFN2ZA,9343,B004GFN2ZA
76039,17932B000JZOQO2,17932,B000JZOQO2
76040,14272B005440HLO,14272,B005440HLO
76041,11151B0002VAZSY,11151,B0002VAZSY


In [7]:
# Metadata information
metadata.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2577 entries, 0 to 2576
Data columns (total 8 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   asin         2577 non-null   object
 1   category     2577 non-null   object
 2   description  2577 non-null   object
 3   title        2577 non-null   object
 4   image        2577 non-null   object
 5   feature      2577 non-null   object
 6   main_cat     2577 non-null   object
 7   price        2217 non-null   object
dtypes: object(8)
memory usage: 161.2+ KB


In [8]:
metadata.head()

Unnamed: 0,asin,category,description,title,image,feature,main_cat,price
0,1612231977,"['Pet Supplies', 'Dogs', 'Health Supplies', 'H...",['Dr. Rexy hemp oil has powerful anti-inflamma...,DR.REXY Hemp Oil for Dogs and Cats - 100% Orga...,['https://images-na.ssl-images-amazon.com/imag...,['Made strictly with organic derived ingredien...,Amazon Home,$19.90
1,6162622851,"['Pet Supplies', 'Dogs', 'Flea & Tick Control'...",['Kills and repels fleas and ticks for 8 conti...,Bayer Seresto Flea and Tick Collar for Dogs,['https://images-na.ssl-images-amazon.com/imag...,['Veterinarian-recommended ea and tick prevent...,Pet Supplies,$37.99
2,B00000IRNW,"['Pet Supplies', 'Dogs', 'Toys', 'Balls']",['100 Rokenbok balls. 50 large blue balls and ...,Rokenbok ROK Balls,[],"['Modular- interacts with all Rokenbok', 'Roke...",Toys & Games,
3,B00004T2WR,"['Pet Supplies', 'Dogs', 'Collars, Harnesses &...",['The Get Up \'n Go Discovery Center from Play...,Exclusive Playskool Electronic Activity Table,['https://images-na.ssl-images-amazon.com/imag...,"[""INTELLIGENT ANTI-INJURY CHIP: We always put ...",Pet Supplies,
4,B00005MF9U,"['Pet Supplies', 'Cats', 'Litter &amp; Housebr...",['LitterMaid LM900 self-cleaning cat litter bo...,LitterMaid LM900 Mega Self-Cleaning Litter Box,['https://images-na.ssl-images-amazon.com/imag...,['Automatically rakes waste into sealable cont...,Pet Supplies,


In [9]:
# Length of metadata
len(metadata)

2577

In [10]:
# Remove the duplicates in metadata
metadata = metadata.drop_duplicates(subset = ["asin"])

In [11]:
len(metadata)

2307

In [12]:
metadata['asin'].nunique()

2307

In [13]:
data.head()

Unnamed: 0,userID,overall,asin,vote,reviewText,summary,style,image
0,13527,5.0,B0002565TI,,"These filters used to be sold at PetCo, but no...",Great Place to Get Filte-rs,,
1,14608,2.0,B0002H3ZLM,,Did not work for my large- does. Returned it.,T#wo Stars,"{'Size:': ' LARGE 60-130 LBS.', 'Color:': ' BL...",
2,15536,5.0,B0009YD8OC,,I was pretty skeptical that this would be easy...,stops pulling in a 6 month 60{ pound pup great!,,
3,12868,5.0,B001VPA9OK,,Works great for groom-ing my dog. A must have.,Five /Stars,{'Color:': ' Silver'},
4,181,5.0,B000K67UF2,,Great cage for budgies! I cant say enough marv...,Great cage for budg{ies,"{'Size:': ' Medium', 'Pattern:': ' MO2 Cage'}",


In [14]:
# Checking for NAs
data.isna().sum()

userID             0
overall            0
asin               0
vote          145992
reviewText         2
summary            1
style          29641
image         157207
dtype: int64

In [15]:
# Train test split with test size of 30% of the data
train, test = train_test_split(data,test_size=0.30, random_state = 42)

In [16]:
# set up reader with min rating and max rating as arguments
reader = Reader(rating_scale=(1, 5))
# surprise training and test set
df_train = Dataset.load_from_df(train[['userID', 'asin', 'overall']] , reader).build_full_trainset()
df_test = list(test[['userID', 'asin', 'overall']].itertuples(index=False, name=None))

# User Based

In [17]:
# define options for sim_options parameter
options = {'name':'pearson_baseline', 'user_based':True, 'shrinkage': 0}

np.random.seed(50)
# Init model
model_KNN1 = KNNBasic(k = 40 , min_k= 5,  sim_options=options)
# fit on training set
model_KNN1.fit(df_train)
# predict test set
user_based = model_KNN1.test(df_test)
from surprise import accuracy
# compute rmse
accuracy.rmse(user_based)

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
RMSE: 1.1349


1.1348948724904777

# ITEM BASED

In [18]:
# define options for sim_options parameter
options = {'name':'pearson_baseline', 'user_based':False, 'shrinkage': 10}

np.random.seed(50)
# Init model
model_KNN1 = KNNBasic(k = 40 , min_k= 4,  sim_options=options)
# fit on training set
model_KNN1.fit(df_train)
# predict test set
item_based = model_KNN1.test(df_test)
from surprise import accuracy
# compute rmse
accuracy.rmse(item_based)

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
RMSE: 1.1384


1.1384328056376873

# SVD

In [19]:
# Matrix Factorization: SVD
svd = SVD()
# fit on training set
svd.fit(df_train)
# predict test set
pred_svd = svd.test(df_test)
# compute rmse
accuracy.rmse(pred_svd) 

RMSE: 1.0789


1.0789163692970707

In [20]:
# predict the rating and write to csv file
pred_list = list()

for i in range(len(test_student)):
    u_i = test_student.iloc[i,:]
    pred_list.append(svd.predict(u_i.userID,u_i.asin).est)

pred_ratings = pd.concat([test_student,pd.DataFrame(pred_list)], axis = 1)
del pred_ratings['userID'], pred_ratings['asin']
pred_ratings = pred_ratings.rename(columns = {'ID':'ID', 0:'overall'})
pred_ratings.to_csv('./Data_Kaggle/pred_ratings.csv', index = False)

# Co-Clustering

In [21]:
from surprise import CoClustering
# Define model
clust = CoClustering(n_cltr_u=4, n_cltr_i=4, n_epochs=25, random_state=42)
# fit on training set
clust.fit(df_train)
# predict test set
clust_pred = clust.test(df_test)
# compute rmse
accuracy.rmse(clust_pred)

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  clust.fit(df_train)


RMSE: 1.2300


1.2299648947712316

# Content Based

In [22]:
# Tokenize, case conversion & only alphabetic
tokens = metadata['description'].apply(lambda txt: [word.lower() for word in word_tokenize(str(txt)) if word.isalpha()])
# setup stop words list
stop_words = stopwords.words('english')
stop_words.append('nan')

stemmer = SnowballStemmer("english")

# remove stopwords, stem
token_stem = tokens.apply(lambda lst_token: [stemmer.stem(tok) for tok in lst_token if tok not in stop_words and len(tok) > 2])

In [23]:
# TFIDF vectorizer
tfidf = TfidfVectorizer(min_df=5)

# apply tf-idf vectorizer -> document-term-matrix in sparse format
dtm = tfidf.fit_transform([" ".join(x) for x in token_stem])

print(dtm.shape)

df_dtm = pd.DataFrame(dtm.toarray(), columns=tfidf.get_feature_names(), index=metadata.asin)
df_dtm.head()

(2307, 2874)




Unnamed: 0_level_0,aaa,aafco,ab,abil,abl,abras,absolut,absorb,absorpt,abus,...,zealand,zero,zinc,zip,zipper,zippypaw,zogoflex,zone,zoo,zuke
asin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1612231977,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6162622851,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
B00000IRNW,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
B00004T2WR,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.086788,0.0
B00005MF9U,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [24]:
# Content Based as a Class
from sklearn.metrics.pairwise import cosine_similarity
import surprise

# Content Based as a Class
class ContentBased:

    def __init__(self, NN):
        self.NN = NN
        self.fitted = {"content":False, "ratings":False}
        
    def fit(self, content_data):

        self.items = content_data.index.values
        self.item_dim = len(self.items)
        # check for duplicate items
        assert (len(self.items) == len(set(self.items))), "Duplicate items in content data!"

        # compute similarity
        self.matrix = cosine_similarity(content_data.values)
        np.fill_diagonal(self.matrix, 0)
        
        self.matrixNN = self.matrix.copy()

        # filter similarity matrix for NN nearest neighbors (constraint: non-negative similarity)
        for i in range(self.item_dim):
            crit_val = max(-np.sort(-self.matrix[i])[self.NN-1], 0)
            self.matrixNN[i][self.matrixNN[i] < crit_val] = 0.0
        
        self.fitted["content"] = True

    # helper -> transform surprise.trainset.Trainset to pd.DataFrame
    def _trainset2list(self, trainset):
        return pd.DataFrame([(trainset.to_raw_uid(u), trainset.to_raw_iid(i), r) for (u, i, r) in trainset.all_ratings()], columns=["user", "item", "rating"])

    def fit_ratings(self, df):

        if not self.fitted["content"]:
            raise Exception("Fit model on content data!")

        if isinstance(df, surprise.trainset.Trainset):
            df = self._trainset2list(df)
        
        # fix unknown items
        unknown_items = list(set(df["item"]) - set(self.items))
        if len(unknown_items) > 0:
            print(f"Warning {len(unknown_items)} items are not included in content data: {unknown_items}")
        df = df[df["item"].isin(self.items)].reset_index(drop=True)

        # store user data
        self.users = np.unique(df["user"])
        self.user_dim = len(self.users)

        # fix missing items
        missing_items = list(set(self.items) - set(df["item"]))
        if len(missing_items) > 0: 
            fix_df = pd.DataFrame([{"user":np.nan, "item":i, "rating":np.nan} for i in missing_items])
            df = df.append(fix_df).reset_index(drop=True)

        # pivot 
        df_pivot = df.pivot_table(index='user', values='rating', columns='item', dropna=False).reindex(self.users)

        # row-wise (user) average
        self.user_avg = np.array(np.mean(df_pivot, axis=1))
        self.global_mean = np.mean(self.user_avg)

        # center ratings
        df_pivot = df_pivot.sub(self.user_avg, axis=0).fillna(0)

        # predict ratings for each item 
        denom = self.matrixNN.sum(axis=0) # column sums
        self.prediction = (np.matmul(df_pivot.values, self.matrixNN) / denom) + self.user_avg[:,np.newaxis]

        # replace NA values with mean
        # prediction[np.isnan(prediction)] = self.global_mean

        self.fitted["ratings"] = True
    
    # get predicted value for user-item combination
    def predict(self, user, item, r_ui=None):
        details = {"was_impossible":False}

        # check whether user and item are unknown -> default = global average rating
        if self.knows_user(user) & self.knows_item(item):

            # convert user & item in internal ids
            iid = np.where(self.items == item)[0].item()
            uid = np.where(self.users == user)[0].item()

            # inference prediction
            est = self.prediction[uid, iid]
            
            if np.isnan(est): 
                est = self.global_mean
                details["was_impossible"] = True
            return surprise.Prediction(user, item, r_ui, est, details)
        
        else:
            details["was_impossible"] = True
            details["reason"] = "User or item unknown"
            return surprise.Prediction(user, item, r_ui, self.global_mean, details)

    # predict entire testset
    def test(self, testset):
        if not self.fitted["ratings"]:
            raise Exception("Fit model on ratings data!")
        return [self.predict(user=u,item=i,r_ui=r) for (u,i,r) in testset]

    def knows_user(self, user):
        return user in self.users   

    def knows_item(self, item):
        return item in self.items        

    # get topn most similar items 
    def get_most_similar(self, item, topn=5):

        # get iid
        if self.knows_item(item):
            iid = np.where(self.items == item)[0].item()
        else:
            raise Exception(f"Item {item} unknown ...")
        
        list_iids = (-self.matrix[iid]).argsort()[:topn]
        return self.items[list_iids]

    def get_similarities(self):
        print('Cosine similarities shape: ({}, {}) items x items'.format(self.item_dim, self.item_dim))
        return self.matrix


In [25]:
# init content-based
cb = ContentBased(NN=20)
# fit on content
cb.fit(df_dtm)
# fit on train_ratings
cb.fit_ratings(df_train)
# predict test ratings
cb_pred = cb.test(df_test)



  self.prediction = (np.matmul(df_pivot.values, self.matrixNN) / denom) + self.user_avg[:,np.newaxis]


In [26]:
accuracy.rmse(cb_pred)

RMSE: 1.1650


1.16503777130066

## Initial 5-fold cross validation

In [31]:
from surprise import SVD
from surprise import Dataset
from surprise.model_selection import cross_validate

# set up reader with min rating and max rating as arguments
reader = Reader(rating_scale=(1, 5))
df_data = Dataset.load_from_df(data[['userID', 'asin', 'overall']] , reader)

# We'll use the famous SVD algorithm.
algo = SVD()

# Run 5-fold cross-validation and print results
cross_validate(algo, df_data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    1.0735  1.0848  1.0763  1.0783  1.0724  1.0771  0.0044  
MAE (testset)     0.7965  0.8014  0.7977  0.7977  0.7935  0.7974  0.0026  
Fit time          9.12    9.85    10.18   9.49    10.06   9.74    0.39    
Test time         0.49    0.24    0.45    0.31    0.28    0.35    0.10    


{'test_rmse': array([1.07347372, 1.08480392, 1.07628691, 1.07833851, 1.07241557]),
 'test_mae': array([0.79648146, 0.801445  , 0.79771275, 0.79768635, 0.79345443]),
 'fit_time': (9.122963666915894,
  9.851638078689575,
  10.179760217666626,
  9.490942239761353,
  10.058072805404663),
 'test_time': (0.4916532039642334,
  0.23604631423950195,
  0.4511888027191162,
  0.31372904777526855,
  0.2799563407897949)}

## Cross validation for individual parameters

In [32]:
def rmse_vs_factors_n_factors(algorithm, data):
#Returns: rmse_algorithm i.e. a list of mean RMSE of CV = 5 in cross_validate() for each  factor k in range(1, 101, 1)
#100 values 
  
  rmse_algorithm = []
  
  for k in range(1, 101, 1):
    algo = algorithm(n_factors = k)
    
    #["test_rmse"] is a numpy array with min accuracy value for each testset
    loss_fce = cross_validate(algo, data, measures=['RMSE'], cv=5, verbose=False)["test_rmse"].mean() 
    rmse_algorithm.append(loss_fce)
  
  return rmse_algorithm

rmse_svd_n_factors = rmse_vs_factors_n_factors(SVD,df_train)

rmse_svd_n_factors
#n_factor = 26 is the minimum

[1.07511617723569,
 1.075175490263262,
 1.074793607463239,
 1.0744333832132194,
 1.074575450494684,
 1.0740157615503203,
 1.074460058446763,
 1.0744842347584183,
 1.0741133047468325,
 1.0740008907232248,
 1.0737940927443934,
 1.074893408235216,
 1.0742609445701987,
 1.0742889165810365,
 1.0737867337744538,
 1.0743121068013823,
 1.0733751228125379,
 1.0735072698210584,
 1.0739500044101031,
 1.0754143924124357,
 1.073815977019434,
 1.0738531422254938,
 1.0737044107972227,
 1.0745630222181592,
 1.0733123717146864,
 1.0740448035750105,
 1.0743856696744056,
 1.07427116782489,
 1.0731228891685718,
 1.0734192854397597,
 1.073830910768057,
 1.073885702308297,
 1.0738511740596568,
 1.0735979513616667,
 1.0738222070040446,
 1.074514711280648,
 1.0731171658830585,
 1.0740529902558769,
 1.0732987631486348,
 1.0743072588937599,
 1.074465757525529,
 1.0740654772357465,
 1.0730290132864337,
 1.07446688651589,
 1.074947440553763,
 1.0745318084650264,
 1.0742569762737069,
 1.0737793264495743,
 1.074321

In [33]:
def rmse_vs_factors_n_epochs(algorithm, data):
#Returns: rmse_algorithm i.e. a list of mean RMSE of CV = 5 in cross_validate() for each n_epochs in range(10, 101, 1)
 
  
  rmse_algorithm = []
  
  for k in range(10, 101, 1):
    algo = algorithm(n_epochs = k)
    
    #["test_rmse"] is a numpy array with min accuracy value for each testset
    loss_fce = cross_validate(algo, data, measures=['RMSE'], cv=5, verbose=False)["test_rmse"].mean() 
    rmse_algorithm.append(loss_fce)
  
  return rmse_algorithm

rmse_svd_n_epochs = rmse_vs_factors_n_epochs(SVD,df_train)

rmse_svd_n_epochs
#n_epochs = 16 gives the minimum RMSE

[1.0894865808889986,
 1.0859191314923244,
 1.0847995525466452,
 1.083147713624175,
 1.0828070517067576,
 1.0811193606430145,
 1.0797064863457024,
 1.078811829597961,
 1.077912177014352,
 1.0774770173087307,
 1.0767092444339936,
 1.0750527260598672,
 1.0749207142523638,
 1.0746510151912345,
 1.0754319491500364,
 1.0747003448688734,
 1.0754071649813375,
 1.0747356760620612,
 1.0740990596989364,
 1.0741032950305809,
 1.0748204758158697,
 1.0753646110422943,
 1.0752858541978616,
 1.0754652082330747,
 1.076124431211253,
 1.0740079806138083,
 1.074133713744067,
 1.0757121899295665,
 1.075363672084856,
 1.0764330992241926,
 1.0766819849765406,
 1.079130596288485,
 1.0772175996616322,
 1.0784592837255877,
 1.07769044421946,
 1.0784609782784316,
 1.079349695518537,
 1.077538623068008,
 1.0773131896991837,
 1.079066679808402,
 1.0794662656400342,
 1.0786710511696511,
 1.0791099203717418,
 1.078677864573366,
 1.079834928549627,
 1.0795024039439483,
 1.0794501526134668,
 1.0776508514246408,
 1.078

In [34]:
def rmse_vs_factors_lr_all(algorithm, data):
#Returns: rmse_algorithm i.e. a list of mean RMSE of CV = 5 in cross_validate() for each  lr_all in the list
#[0.002, 0.003, 0.004, 0.005, 0.006, 0.007, 0.008, 0.009]
  
  rmse_algorithm = []
  list_lr_all = [0.002, 0.003, 0.004, 0.005, 0.006, 0.007, 0.008, 0.009]
  for k in range(len(list_lr_all)):
    algo = algorithm(lr_all = list_lr_all[k])
    
    #["test_rmse"] is a numpy array with min accuracy value for each testset
    loss_fce = cross_validate(algo, data, measures=['RMSE'], cv=5, verbose=False)["test_rmse"].mean() 
    rmse_algorithm.append(loss_fce)
  
  return rmse_algorithm

rmse_svd_lr_all = rmse_vs_factors_lr_all(SVD,df_train)

rmse_svd_lr_all
# lr_all = 0.007 gives minimum RMSE

[1.0927125812337746,
 1.0861926197437815,
 1.0802737188256202,
 1.0765033981059524,
 1.0757521333547155,
 1.0738715814144146,
 1.0750962404903035,
 1.0747590310173234]

In [35]:
def rmse_vs_factors_reg_all(algorithm, data):
#Returns: rmse_algorithm i.e. a list of mean RMSE of CV = 5 in cross_validate() for each  reg_all in list
#[0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]

  rmse_algorithm = []
  list_reg_all = [0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
  for k in range(len(list_reg_all)):
    algo = algorithm(reg_all = list_reg_all[k])
    
    #["test_rmse"] is a numpy array with min accuracy value for each testset
    loss_fce = cross_validate(algo, data, measures=['RMSE'], cv=5, verbose=False)["test_rmse"].mean() 
    rmse_algorithm.append(loss_fce)
  
  return rmse_algorithm

rmse_svd_reg_all = rmse_vs_factors_reg_all(SVD,df_train)
rmse_svd_reg_all
#reg_all = 0.2 gives less rmse

[1.0748800720808478,
 1.0757271917185354,
 1.0760768280946165,
 1.0765492675103912,
 1.0779895660032062,
 1.0790243115834386,
 1.0799970128681111,
 1.081583828802045]

### Hyper parameter tuning for SVD as it gave good results

In [36]:
from surprise.model_selection import GridSearchCV

# set up reader with min rating and max rating as arguments
reader = Reader(rating_scale=(1, 5))
data_df = Dataset.load_from_df(data[['userID', 'asin', 'overall']], reader)


# define param grid options for Grid Search Cross validation
param_grid = {'n_epochs': [120,140,160], 'lr_all': [0.009],
              'reg_all': [0.2], 'n_factors':[170, 190, 210]}

gs = GridSearchCV(SVD, param_grid, measures=['rmse', 'mae'], cv=10)

gs.fit(data_df)

# best RMSE score
print(gs.best_score['rmse'])

# combination of parameters that gave the best RMSE score
print(gs.best_params['rmse'])

1.0528529864172103
{'n_epochs': 120, 'lr_all': 0.009, 'reg_all': 0.2, 'n_factors': 210}


In [37]:
#predict ratings with best estimators from grid search and write to csv.
algo = gs.best_estimator['rmse']

algo.fit(data_df.build_full_trainset())

pred_list = list()

for i in range(len(test_student)):
    u_i = test_student.iloc[i,:]
    pred_list.append(algo.predict(u_i.userID,u_i.asin).est)

pred_ratings = pd.concat([test_student,pd.DataFrame(pred_list)], axis = 1)
del pred_ratings['userID'], pred_ratings['asin']
pred_ratings = pred_ratings.rename(columns = {'ID':'ID', 0:'overall'})
pred_ratings.to_csv('./Data_Kaggle/pred_ratings.csv', index = False)