In [1]:
import pandas as pd
import numpy as np
import surprise

In [2]:
df = pd.read_csv(r"Cases\Association Datasets\Amazon Movie Ratings\Amazon.csv")

ratings = pd.melt(df, id_vars='user_id', 
                  var_name="item_id",value_name="rating")

ratings = ratings[ratings['rating'].notna()]


In [3]:
ratings

Unnamed: 0,user_id,item_id,rating
0,A3R5OBKS7OM2IR,Movie1,5.0
4848,A3R5OBKS7OM2IR,Movie2,5.0
9697,AH3QC2PC1VTGP,Movie3,2.0
14546,A3LKP6WPMP9UKX,Movie4,5.0
14547,AVIY68KEPQ5ZD,Movie4,5.0
...,...,...,...
998683,A1IMQ9WMFYKWH5,Movie206,5.0
998684,A1KLIKPUF5E88I,Movie206,5.0
998685,A5HG6WFZLO10D,Movie206,5.0
998686,A3UU690TWXCG1X,Movie206,5.0


In [4]:
lowest_rating = ratings['rating'].min()
highest_rating = ratings['rating'].max()
print("Ratings range between {0} and {1}".format(lowest_rating,highest_rating))
reader = surprise.Reader(rating_scale = (lowest_rating,highest_rating))

Ratings range between 1.0 and 5.0


In [5]:
data = surprise.Dataset.load_from_df(ratings, reader)

In [6]:
similarity_options = {'names':'cosine', 'user_based': True}
similarity_options


{'names': 'cosine', 'user_based': True}

In [7]:
algo = surprise.KNNBasic(sim_options = similarity_options)
output = algo.fit(data.build_full_trainset())

Computing the msd similarity matrix...
Done computing similarity matrix.


In [8]:
pred = algo.predict(uid = "A3R5OBKS7OM2IR", iid = "Movie1")
score = pred.est
print(score)

5.0


In [9]:
iids = ratings['item_id'].unique()
iids

array(['Movie1', 'Movie2', 'Movie3', 'Movie4', 'Movie5', 'Movie6',
       'Movie7', 'Movie8', 'Movie9', 'Movie10', 'Movie11', 'Movie12',
       'Movie13', 'Movie14', 'Movie15', 'Movie16', 'Movie17', 'Movie18',
       'Movie19', 'Movie20', 'Movie21', 'Movie22', 'Movie23', 'Movie24',
       'Movie25', 'Movie26', 'Movie27', 'Movie28', 'Movie29', 'Movie30',
       'Movie31', 'Movie32', 'Movie33', 'Movie34', 'Movie35', 'Movie36',
       'Movie37', 'Movie38', 'Movie39', 'Movie40', 'Movie41', 'Movie42',
       'Movie43', 'Movie44', 'Movie45', 'Movie46', 'Movie47', 'Movie48',
       'Movie49', 'Movie50', 'Movie51', 'Movie52', 'Movie53', 'Movie54',
       'Movie55', 'Movie56', 'Movie57', 'Movie58', 'Movie59', 'Movie60',
       'Movie61', 'Movie62', 'Movie63', 'Movie64', 'Movie65', 'Movie66',
       'Movie67', 'Movie68', 'Movie69', 'Movie70', 'Movie71', 'Movie72',
       'Movie73', 'Movie74', 'Movie75', 'Movie76', 'Movie77', 'Movie78',
       'Movie79', 'Movie80', 'Movie81', 'Movie82', 'Movie83'

In [10]:
rec_13 = ratings[ratings['user_id'] == 13 ]
iids13 = rec_13['item_id']
print("List of iid that uid={0} has rated:".format(13))
print(iids13)

List of iid that uid=13 has rated:
Series([], Name: item_id, dtype: object)


In [11]:
iids_to_predict = np.setdiff1d(iids,iids13)
print("List of iid which uid={0} did not rate(in all {1}) :".format(13,len(iids_to_predict)))
print(iids_to_predict)

List of iid which uid=13 did not rate(in all 206) :
['Movie1' 'Movie10' 'Movie100' 'Movie101' 'Movie102' 'Movie103' 'Movie104'
 'Movie105' 'Movie106' 'Movie107' 'Movie108' 'Movie109' 'Movie11'
 'Movie110' 'Movie111' 'Movie112' 'Movie113' 'Movie114' 'Movie115'
 'Movie116' 'Movie117' 'Movie118' 'Movie119' 'Movie12' 'Movie120'
 'Movie121' 'Movie122' 'Movie123' 'Movie124' 'Movie125' 'Movie126'
 'Movie127' 'Movie128' 'Movie129' 'Movie13' 'Movie130' 'Movie131'
 'Movie132' 'Movie133' 'Movie134' 'Movie135' 'Movie136' 'Movie137'
 'Movie138' 'Movie139' 'Movie14' 'Movie140' 'Movie141' 'Movie142'
 'Movie143' 'Movie144' 'Movie145' 'Movie146' 'Movie147' 'Movie148'
 'Movie149' 'Movie15' 'Movie150' 'Movie151' 'Movie152' 'Movie153'
 'Movie154' 'Movie155' 'Movie156' 'Movie157' 'Movie158' 'Movie159'
 'Movie16' 'Movie160' 'Movie161' 'Movie162' 'Movie163' 'Movie164'
 'Movie165' 'Movie166' 'Movie167' 'Movie168' 'Movie169' 'Movie17'
 'Movie170' 'Movie171' 'Movie172' 'Movie173' 'Movie174' 'Movie175'
 'Movie17

In [12]:
testset = [[13,iid,0.] for iid in iids_to_predict]
testset

[[13, 'Movie1', 0.0],
 [13, 'Movie10', 0.0],
 [13, 'Movie100', 0.0],
 [13, 'Movie101', 0.0],
 [13, 'Movie102', 0.0],
 [13, 'Movie103', 0.0],
 [13, 'Movie104', 0.0],
 [13, 'Movie105', 0.0],
 [13, 'Movie106', 0.0],
 [13, 'Movie107', 0.0],
 [13, 'Movie108', 0.0],
 [13, 'Movie109', 0.0],
 [13, 'Movie11', 0.0],
 [13, 'Movie110', 0.0],
 [13, 'Movie111', 0.0],
 [13, 'Movie112', 0.0],
 [13, 'Movie113', 0.0],
 [13, 'Movie114', 0.0],
 [13, 'Movie115', 0.0],
 [13, 'Movie116', 0.0],
 [13, 'Movie117', 0.0],
 [13, 'Movie118', 0.0],
 [13, 'Movie119', 0.0],
 [13, 'Movie12', 0.0],
 [13, 'Movie120', 0.0],
 [13, 'Movie121', 0.0],
 [13, 'Movie122', 0.0],
 [13, 'Movie123', 0.0],
 [13, 'Movie124', 0.0],
 [13, 'Movie125', 0.0],
 [13, 'Movie126', 0.0],
 [13, 'Movie127', 0.0],
 [13, 'Movie128', 0.0],
 [13, 'Movie129', 0.0],
 [13, 'Movie13', 0.0],
 [13, 'Movie130', 0.0],
 [13, 'Movie131', 0.0],
 [13, 'Movie132', 0.0],
 [13, 'Movie133', 0.0],
 [13, 'Movie134', 0.0],
 [13, 'Movie135', 0.0],
 [13, 'Movie136', 0.0]

In [13]:
predictions = algo.test(testset)
predictions[5]

Prediction(uid=13, iid='Movie103', r_ui=0.0, est=4.3856, details={'was_impossible': True, 'reason': 'User and/or item is unknown.'})

In [14]:
pred_ratings = np.array([pred.est for pred in predictions])
pred_ratings

array([4.3856, 4.3856, 4.3856, 4.3856, 4.3856, 4.3856, 4.3856, 4.3856,
       4.3856, 4.3856, 4.3856, 4.3856, 4.3856, 4.3856, 4.3856, 4.3856,
       4.3856, 4.3856, 4.3856, 4.3856, 4.3856, 4.3856, 4.3856, 4.3856,
       4.3856, 4.3856, 4.3856, 4.3856, 4.3856, 4.3856, 4.3856, 4.3856,
       4.3856, 4.3856, 4.3856, 4.3856, 4.3856, 4.3856, 4.3856, 4.3856,
       4.3856, 4.3856, 4.3856, 4.3856, 4.3856, 4.3856, 4.3856, 4.3856,
       4.3856, 4.3856, 4.3856, 4.3856, 4.3856, 4.3856, 4.3856, 4.3856,
       4.3856, 4.3856, 4.3856, 4.3856, 4.3856, 4.3856, 4.3856, 4.3856,
       4.3856, 4.3856, 4.3856, 4.3856, 4.3856, 4.3856, 4.3856, 4.3856,
       4.3856, 4.3856, 4.3856, 4.3856, 4.3856, 4.3856, 4.3856, 4.3856,
       4.3856, 4.3856, 4.3856, 4.3856, 4.3856, 4.3856, 4.3856, 4.3856,
       4.3856, 4.3856, 4.3856, 4.3856, 4.3856, 4.3856, 4.3856, 4.3856,
       4.3856, 4.3856, 4.3856, 4.3856, 4.3856, 4.3856, 4.3856, 4.3856,
       4.3856, 4.3856, 4.3856, 4.3856, 4.3856, 4.3856, 4.3856, 4.3856,
      

In [15]:
iids_to_predict

array(['Movie1', 'Movie10', 'Movie100', 'Movie101', 'Movie102',
       'Movie103', 'Movie104', 'Movie105', 'Movie106', 'Movie107',
       'Movie108', 'Movie109', 'Movie11', 'Movie110', 'Movie111',
       'Movie112', 'Movie113', 'Movie114', 'Movie115', 'Movie116',
       'Movie117', 'Movie118', 'Movie119', 'Movie12', 'Movie120',
       'Movie121', 'Movie122', 'Movie123', 'Movie124', 'Movie125',
       'Movie126', 'Movie127', 'Movie128', 'Movie129', 'Movie13',
       'Movie130', 'Movie131', 'Movie132', 'Movie133', 'Movie134',
       'Movie135', 'Movie136', 'Movie137', 'Movie138', 'Movie139',
       'Movie14', 'Movie140', 'Movie141', 'Movie142', 'Movie143',
       'Movie144', 'Movie145', 'Movie146', 'Movie147', 'Movie148',
       'Movie149', 'Movie15', 'Movie150', 'Movie151', 'Movie152',
       'Movie153', 'Movie154', 'Movie155', 'Movie156', 'Movie157',
       'Movie158', 'Movie159', 'Movie16', 'Movie160', 'Movie161',
       'Movie162', 'Movie163', 'Movie164', 'Movie165', 'Movie166',
    

In [16]:
i_max = pred_ratings.argmax()
i_max

0

In [17]:
iid_recommend_most = iids_to_predict[i_max] 
print("Top item to be recommended for user {0} is {1} with predicted rating as {2}".format(13,iid_recommend_most,pred_ratings[i_max]))

Top item to be recommended for user 13 is Movie1 with predicted rating as 4.3856


In [18]:
import heapq
i_sorted_10 = heapq.nlargest(10,  range(len(pred_ratings)), pred_ratings.take)
top_10_items = iids_to_predict[i_sorted_10]
print(top_10_items)

['Movie1' 'Movie10' 'Movie100' 'Movie101' 'Movie102' 'Movie103' 'Movie104'
 'Movie105' 'Movie106' 'Movie107']


In [19]:
np.arange(20,110,10)

array([ 20,  30,  40,  50,  60,  70,  80,  90, 100])

In [20]:
from surprise.model_selection import GridSearchCV
from surprise.model_selection.split import KFold

param_grid = {'k': np.arange(30,110,10)}
kfold = KFold(n_splits=5, random_state=2021, shuffle=True)
gs = GridSearchCV(surprise.KNNBasic, param_grid, measures=['rmse', 'mae'], cv=kfold)

In [21]:
gs.fit(data)

Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computi

## Best score

In [24]:
print(gs.best_score['rmse'])

1.1999972199459112


## Best params

In [25]:
print(gs.best_params['rmse'])

{'k': 30}
