In [1]:
import pandas as pd
import numpy as np
import surprise

In [2]:
ratings = pd.read_csv(r"C:\Kaustubh Vaibhav\Machine Learning\Cases\Association Datasets\ml-100k\u.data",
                      sep='\t',
                      names = ['uid','iid','rating','timestamp'])
ratings.drop('timestamp', axis = 1, inplace= True)
ratings

Unnamed: 0,uid,iid,rating
0,196,242,3
1,186,302,3
2,22,377,1
3,244,51,2
4,166,346,1
...,...,...,...
99995,880,476,3
99996,716,204,5
99997,276,1090,1
99998,13,225,2


In [3]:
lowest_rating = ratings['rating'].min()
highest_rating = ratings['rating'].max()
print("Ratings range between {0} and {1}".format(lowest_rating,highest_rating))
reader = surprise.Reader(rating_scale = (lowest_rating,highest_rating))

Ratings range between 1 and 5


In [4]:
data = surprise.Dataset.load_from_df(ratings, reader)

In [5]:
similarity_options = {'names':'cosine', 'user_based': True}
similarity_options


{'names': 'cosine', 'user_based': True}

In [6]:
algo = surprise.KNNBasic(sim_options = similarity_options)
output = algo.fit(data.build_full_trainset())

Computing the msd similarity matrix...
Done computing similarity matrix.


In [7]:
pred = algo.predict(uid = '13', iid = '225')
score = pred.est
print(score)

3.52986


In [8]:
iids = ratings['iid'].unique()
iids

array([ 242,  302,  377, ..., 1637, 1630, 1641], dtype=int64)

In [9]:
rec_13 = ratings[ratings['uid'] == 13 ]
iids13 = rec_13['iid']
print("List of iid that uid={0} has rated:".format(13))
print(iids13)

List of iid that uid=13 has rated:
63       526
144      836
145      272
185       98
205      360
        ... 
98615    822
99170    432
99306    904
99801    446
99998    225
Name: iid, Length: 636, dtype: int64


In [10]:
iids_to_predict = np.setdiff1d(iids,iids13)
print("List of iid which uid={0} did not rate(in all {1}) :".format(13,len(iids_to_predict)))
print(iids_to_predict)

List of iid which uid=13 did not rate(in all 1046) :
[   3    6   10 ... 1680 1681 1682]


In [11]:
testset = [[13,iid,0.] for iid in iids_to_predict]
testset

[[13, 3, 0.0],
 [13, 6, 0.0],
 [13, 10, 0.0],
 [13, 15, 0.0],
 [13, 16, 0.0],
 [13, 18, 0.0],
 [13, 19, 0.0],
 [13, 20, 0.0],
 [13, 26, 0.0],
 [13, 30, 0.0],
 [13, 31, 0.0],
 [13, 34, 0.0],
 [13, 35, 0.0],
 [13, 36, 0.0],
 [13, 41, 0.0],
 [13, 43, 0.0],
 [13, 44, 0.0],
 [13, 46, 0.0],
 [13, 47, 0.0],
 [13, 52, 0.0],
 [13, 54, 0.0],
 [13, 55, 0.0],
 [13, 57, 0.0],
 [13, 63, 0.0],
 [13, 65, 0.0],
 [13, 74, 0.0],
 [13, 75, 0.0],
 [13, 76, 0.0],
 [13, 77, 0.0],
 [13, 80, 0.0],
 [13, 81, 0.0],
 [13, 84, 0.0],
 [13, 85, 0.0],
 [13, 93, 0.0],
 [13, 101, 0.0],
 [13, 102, 0.0],
 [13, 103, 0.0],
 [13, 104, 0.0],
 [13, 105, 0.0],
 [13, 106, 0.0],
 [13, 107, 0.0],
 [13, 108, 0.0],
 [13, 112, 0.0],
 [13, 113, 0.0],
 [13, 114, 0.0],
 [13, 115, 0.0],
 [13, 119, 0.0],
 [13, 120, 0.0],
 [13, 122, 0.0],
 [13, 123, 0.0],
 [13, 125, 0.0],
 [13, 126, 0.0],
 [13, 129, 0.0],
 [13, 130, 0.0],
 [13, 131, 0.0],
 [13, 133, 0.0],
 [13, 134, 0.0],
 [13, 136, 0.0],
 [13, 139, 0.0],
 [13, 140, 0.0],
 [13, 142, 0.0],

In [12]:
predictions = algo.test(testset)
predictions[5]

Prediction(uid=13, iid=18, r_ui=0.0, est=2.857306662720775, details={'actual_k': 10, 'was_impossible': False})

In [13]:
pred_ratings = np.array([pred.est for pred in predictions])
pred_ratings

array([2.84871025, 3.64335308, 3.82015525, ..., 2.        , 3.        ,
       3.        ])

In [14]:
iids_to_predict

array([   3,    6,   10, ..., 1680, 1681, 1682], dtype=int64)

In [15]:
i_max = pred_ratings.argmax()
i_max

485

In [16]:
iid_recommend_most = iids_to_predict[i_max] 
print("Top item to be recommended for user {0} is {1} with predicted rating as {2}".format(13,iid_recommend_most,pred_ratings[i_max]))

Top item to be recommended for user 13 is 1122 with predicted rating as 5.0


In [17]:
import heapq
i_sorted_10 = heapq.nlargest(10,  range(len(pred_ratings)), pred_ratings.take)
top_10_items = iids_to_predict[i_sorted_10]
print(top_10_items)

[1122 1189 1201 1293 1467 1500 1536 1599 1653 1449]


In [18]:
np.arange(30,110,10)


array([ 30,  40,  50,  60,  70,  80,  90, 100])

In [19]:
from surprise.model_selection import GridSearchCV
from surprise.model_selection.split import KFold

param_grid = {'k': np.arange(30,110,10)}
kfold = KFold(n_splits=5, random_state=2021, shuffle=True)
gs = GridSearchCV(surprise.KNNBasic, param_grid, measures=['rmse', 'mae'], cv=kfold)

In [20]:
gs.fit(data)

Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computi

## Best Score

In [21]:
print(gs.best_score['rmse'])

0.9770385408008142


## Best Params

In [22]:
print(gs.best_params['rmse'])

{'k': 30}
