## Book-Crossing Recommender System: KNN Playground

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns; sns.set()

from lib.preprocessing import data_for_training as data
from math import sqrt
from scipy import sparse
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.neighbors import NearestNeighbors

In [2]:
# Seed NumPy's global RNG so that the later train_test_split(...) and .sample(...) calls,
# which are given no random_state of their own, are repeatable across runs.
np.random.seed(0)


In [3]:
# Load the ratings table through the project helper imported as `data`.
# Presumably returns a DataFrame with 'User-ID', 'ISBN' and 'Book-Rating' columns
# (those are the columns used in the cells below) — TODO confirm against lib.preprocessing.
ratings_new = data.match_uid_and_isbn()

In [4]:
# Keep only the rows whose rating is strictly positive.
# (Presumably a rating of 0 marks an implicit interaction — TODO confirm.)
ratings_explicit = ratings_new.loc[ratings_new['Book-Rating'] > 0]

In [5]:
# Count the explicit ratings of every user and keep the IDs of the users
# that have at least 200 of them.
user_with_treshold = (
    ratings_explicit
    .groupby('User-ID')['Book-Rating']
    .count()
    .loc[lambda rating_counts: rating_counts >= 200]
    .index
)
user_with_treshold

Int64Index([  4385,   6251,   6575,   7346,  11676,  13552,  16634,  16795,
             23768,  23872,
            ...
            257204, 258185, 261829, 262998, 264321, 265115, 265889, 269566,
            270713, 274061],
           dtype='int64', name='User-ID', length=118)

In [6]:
# Count the explicit ratings of every book and keep the ISBNs of the books
# that received at least 300 of them.
isbn_with_treshold = (
    ratings_explicit
    .groupby('ISBN')['Book-Rating']
    .count()
    .loc[lambda rating_counts: rating_counts >= 300]
    .index
)
isbn_with_treshold

Index(['0060928336', '0142001740', '0312195516', '0316666343', '0385504209',
       '059035342X', '0971880107'],
      dtype='object', name='ISBN')

In [7]:
# Restrict the explicit ratings using the user / ISBN lists computed in the previous cells.
# NOTE(review): the second statement filters `ratings_explicit` again rather than the
# result of the first statement, so the user filter is overwritten and only the ISBN
# filter takes effect. If both filters are meant to apply, the masks must be combined —
# but verify the later cells first: the hard-coded `user_id` used further down is not
# guaranteed to survive the user filter, and the pivot lookup would then fail.
ratings_above_count_threshold = ratings_explicit[ratings_explicit['User-ID'].isin(user_with_treshold)]
ratings_above_count_threshold = ratings_explicit[ratings_explicit['ISBN'].isin(isbn_with_treshold)]
ratings_above_count_threshold.shape

(3098, 3)

In [8]:
# Hold out 10% of the filtered ratings as the test set. No random_state is passed,
# so the split relies on NumPy's global RNG state.
# A variant stratified on 'User-ID' was tried earlier and is left disabled.
train_set, test_set = train_test_split(
    ratings_above_count_threshold,
    test_size=0.1,
)

# Show the shapes of the train set, the test set and the full filtered set, in that order.
display(train_set.shape, test_set.shape, ratings_above_count_threshold.shape)

(2788, 3)

(310, 3)

(3098, 3)

In [9]:
# Work on an independent (deep) copy of the training ratings and preview its first rows.
ratings_copy = train_set.copy(deep=True)
ratings_copy.head(n=5)

Unnamed: 0,User-ID,ISBN,Book-Rating
448798,107718,971880107,4
263666,60633,971880107,8
73273,15602,142001740,10
382822,92810,385504209,9
455547,109502,312195516,8


In [10]:
# Draw one random row from the filtered ratings; its user is read out in the next cell.
rating = ratings_above_count_threshold.sample(n=1)

In [11]:
# Take the user ID of the single sampled rating row.
user_id = rating['User-ID'].iat[0]

In [12]:
# NOTE(review): this overrides the user ID sampled in the previous cell with a fixed
# value, so every later cell works on user 277042 regardless of the random draw.
user_id = 277042

In [13]:
# One-column frame with every distinct ISBN present in the filtered ratings.
isbn_df = ratings_above_count_threshold[['ISBN']].drop_duplicates()
isbn_df.shape

(7, 1)

In [14]:
# All ratings of the target user in the filtered set (train and test rows alike).
ratings_by_user = pd.DataFrame(
    ratings_above_count_threshold.loc[ratings_above_count_threshold['User-ID'] == user_id]
)
ratings_by_user.head()

Unnamed: 0,User-ID,ISBN,Book-Rating
812,277042,971880107,2


In [15]:
# ISBNs of the target user's ratings that ended up in the held-out test set.
isbn_from_test_set = test_set.loc[test_set['User-ID'] == user_id, ['ISBN']]
isbn_from_test_set

Unnamed: 0,ISBN
812,971880107


In [16]:
# Candidate ISBNs to predict for the target user: every book the user has no rating for
# in the filtered set, followed by the user's books that were held out in the test set.
isbn_not_rated_by_user = isbn_df[~isbn_df['ISBN'].isin(ratings_by_user['ISBN'])]
# DataFrame.append was removed in pandas 2.0; pd.concat produces the same stacked frame
# (original index labels kept, no re-sorting) and works on old and new pandas alike.
isbn_not_rated_by_user = pd.concat([isbn_not_rated_by_user, isbn_from_test_set])
isbn_not_rated_by_user

Unnamed: 0,ISBN
429,0385504209
1388,0312195516
1965,0316666343
2892,0142001740
3775,059035342X
11551,0060928336
812,0971880107


In [17]:
# Use the first candidate ISBN as the book whose rating will be predicted.
isbn_for_prediction = isbn_not_rated_by_user['ISBN'].iloc[0]
isbn_for_prediction

'0385504209'

In [18]:
# IDs of all users who rated the book being predicted; these are the neighbour candidates.
users_for_knn = ratings_above_count_threshold.loc[
    ratings_above_count_threshold['ISBN'] == isbn_for_prediction,
    'User-ID',
]
users_for_knn.shape

(487,)

In [19]:
# Every rating (for any book in the filtered set) given by the neighbour candidates.
ratings_by_other_users = ratings_above_count_threshold.loc[
    ratings_above_count_threshold['User-ID'].isin(users_for_knn)
]
ratings_by_other_users

Unnamed: 0,User-ID,ISBN,Book-Rating
429,276925,0385504209,8
1584,277427,0385504209,8
11078,638,0316666343,10
11096,638,0385504209,10
11752,882,0385504209,10
12198,1075,0142001740,10
12199,1075,0316666343,7
12201,1075,0385504209,7
12440,1211,0385504209,9
13196,1652,0385504209,7


In [20]:
# Stack the target user's ratings on top of the ratings of all neighbour candidates.
# When the book being predicted is one the target user has rated (the candidate list
# includes the user's held-out test-set books), the user is also among the neighbour
# candidates and their rows would appear twice; DataFrame.pivot raises on duplicate
# (User-ID, ISBN) pairs, so exact duplicate rows are dropped here.
# NOTE(review): in that case the user's own rating for the predicted book is still part
# of the KNN input — confirm whether it should be masked out before fitting.
ratings_for_knn = pd.concat([ratings_by_user, ratings_by_other_users]).drop_duplicates()
ratings_for_knn

Unnamed: 0,User-ID,ISBN,Book-Rating
812,277042,0971880107,2
429,276925,0385504209,8
1584,277427,0385504209,8
11078,638,0316666343,10
11096,638,0385504209,10
11752,882,0385504209,10
12198,1075,0142001740,10
12199,1075,0316666343,7
12201,1075,0385504209,7
12440,1211,0385504209,9


In [21]:
# Build the user x book rating matrix: one row per user, one column per ISBN,
# with 0 wherever a user has no rating for a book.
ratings_pivot = (
    ratings_for_knn
    .pivot(index='User-ID', columns='ISBN', values='Book-Rating')
    .fillna(0)
)
ratings_pivot

ISBN,0060928336,0142001740,0312195516,0316666343,0385504209,059035342X,0971880107
User-ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
638,0.0,0.0,0.0,10.0,10.0,0.0,0.0
882,0.0,0.0,0.0,0.0,10.0,0.0,0.0
1075,0.0,10.0,0.0,7.0,7.0,0.0,0.0
1211,0.0,0.0,0.0,0.0,9.0,0.0,0.0
1652,0.0,0.0,0.0,0.0,7.0,0.0,0.0
1803,0.0,0.0,0.0,0.0,9.0,0.0,0.0
2179,0.0,0.0,0.0,0.0,10.0,0.0,0.0
3556,0.0,0.0,0.0,0.0,10.0,0.0,0.0
3999,0.0,0.0,0.0,0.0,10.0,0.0,0.0
4098,0.0,0.0,0.0,0.0,10.0,0.0,0.0


In [22]:
# Ask for up to five neighbours plus one extra slot; the later cells skip the first hit,
# presumably because the query user's own row is part of the fitted matrix.
neighbour_slots = min(users_for_knn.count(), 5) + 1
item_knn = NearestNeighbors(
    n_neighbors=neighbour_slots,
    metric='cosine',
    algorithm='brute',
)
# Rows of ratings_pivot are users, so the neighbours found are users.
item_knn.fit(ratings_pivot)

NearestNeighbors(algorithm='brute', leaf_size=30, metric='cosine',
         metric_params=None, n_jobs=1, n_neighbors=6, p=2, radius=1.0)

In [23]:
# Query the fitted model with the target user's rating row as a (1, n_books) array.
query_vector = ratings_pivot.loc[[user_id]].values
distances, indices = item_knn.kneighbors(query_vector)

In [24]:
# Cosine distances to the neighbours, leaving out the first (closest) hit.
distances[0, 1:]

array([0.2       , 0.34149539, 0.4       , 0.4452998 , 0.47000106])

In [25]:
# Row positions (within ratings_pivot) of the neighbours, leaving out the first hit.
indices[0, 1:]

array([178, 426, 331, 427, 413])

In [26]:
# Rating rows of the neighbours, selected by position in the fitted matrix.
# Position 0 is skipped on the assumption that the closest hit is the query user itself;
# with ties at distance 0 that ordering is not guaranteed — TODO confirm.
neighbour_positions = indices[0, 1:]
knn_ratings = ratings_pivot.iloc[neighbour_positions]
knn_ratings

ISBN,0060928336,0142001740,0312195516,0316666343,0385504209,059035342X,0971880107
User-ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
95025,0.0,0.0,0.0,0.0,6.0,0.0,8.0
241265,0.0,0.0,0.0,0.0,8.0,0.0,7.0
196052,0.0,0.0,0.0,0.0,8.0,0.0,6.0
241432,0.0,0.0,0.0,0.0,9.0,0.0,6.0
237271,0.0,0.0,0.0,0.0,8.0,0.0,5.0


In [27]:
# Re-display the ISBN whose rating is being predicted (no computation in this cell).
isbn_for_prediction

'0385504209'

In [28]:
# Unweighted prediction: plain mean of the neighbours' ratings for the target book,
# shown rounded to the nearest whole rating.
neighbour_ratings = knn_ratings.loc[:, isbn_for_prediction]
predicted_mean_rating = neighbour_ratings.mean()
round(predicted_mean_rating)

8.0

In [29]:
# Turn cosine distances into cosine similarities (similarity = 1 - distance).
similarities = np.subtract(1, distances)
similarities

array([[1.        , 0.8       , 0.65850461, 0.6       , 0.5547002 ,
        0.52999894]])

In [30]:
# Similarity-weighted prediction: average the neighbours' ratings for the target book,
# weighting each neighbour by its cosine similarity to the target user (the first
# entry of `similarities` is skipped, matching the neighbour selection above).
neighbour_ratings = knn_ratings[isbn_for_prediction]
neighbour_weights = similarities.flatten()[1:]
if np.isclose(neighbour_weights.sum(), 0):
    # np.average raises ZeroDivisionError when the weights sum to zero, which happens
    # when the target user shares no rated book with any neighbour; fall back to the
    # unweighted mean in that case.
    predicted_weighted_avg = neighbour_ratings.mean()
else:
    predicted_weighted_avg = np.average(neighbour_ratings, weights=neighbour_weights)
round(predicted_weighted_avg)

8.0

In [57]:
# Wrap the rounded prediction in a one-row frame that has the same columns as the
# training ratings.
predicted_ratings = pd.DataFrame(
    [[user_id, isbn_for_prediction, round(predicted_weighted_avg)]],
    columns=list(ratings_copy.columns),
)
# Show the training ratings with the prediction added as a last row. The result is only
# displayed; ratings_copy itself is not modified.
# DataFrame.append was removed in pandas 2.0, so pd.concat is used for the same stacking.
pd.concat([ratings_copy, predicted_ratings])

Unnamed: 0,User-ID,ISBN,Book-Rating
448798,107718,0971880107,4.0
263666,60633,0971880107,8.0
73273,15602,0142001740,10.0
382822,92810,0385504209,9.0
455547,109502,0312195516,8.0
380886,92048,0142001740,10.0
900656,217971,0060928336,8.0
958749,231635,0316666343,8.0
351771,84427,0142001740,9.0
710920,172512,0316666343,9.0
