# Import Libraries


In [1]:
# import libraries
import numpy as np
import pandas as pd

# Data Preprocessing

In [2]:
# Read book dataset and assign it to the variable b
b = pd.read_csv('books.csv')
print(b.shape)
b.head()

(2312, 5)


Unnamed: 0,book_id,avg_rating,no_of_ratings,user_id,user_rating
0,4833,4.25,7156.0,3466,0
1,590,4.31,7821.0,3466,5
2,4264,4.08,3836.0,3453,5
3,3361,3.52,1245.0,3453,4
4,4535,4.13,3107.0,3453,0


In [3]:
# There is no need for the columns "no_of_ratings" and "avg_rating"
# in our implementation therefore we drop these columns
# we assigned remain data to the variable ratings
ratings = b.drop(['no_of_ratings', 'avg_rating'], axis=1)

In [4]:
# Print first 5 rows from ratings dataset
ratings.head()

Unnamed: 0,book_id,user_id,user_rating
0,4833,3466,0
1,590,3466,5
2,4264,3453,5
3,3361,3453,4
4,4535,3453,0


In [5]:
# Print shape of ratings dataset
ratings.shape

(2312, 3)

In [6]:
# If there is no NaN value in any row then remove that row
ratings.dropna(inplace=True)

In [7]:
# Print shape of ratings dataset
ratings.shape

(2312, 3)

In [8]:
# We group rows by user_id column and remove the groups which has only one element
# We split using scikit-leanr's train_test_split function and set stratify property 
# of that function to user_id, therefore it isn't possible to split rows that belongs
# to exactly one user
ratings = ratings.groupby(['user_id']).filter(lambda x: len(x) > 1)

In [9]:
# Prints number of unique books from ratings dataset 
len(ratings.book_id.unique())

710

In [10]:
# Prints number of unique users from ratings dataset 
len(ratings.user_id.unique())

84

In [11]:
# Check how many Null value he have in ratings dataset
ratings.isnull().sum()

book_id        0
user_id        0
user_rating    0
dtype: int64

In [12]:
# Print shape of ratings dataset
ratings.shape

(2206, 3)

In [13]:
# Drop duplicate rows from the dataset ratings
ratings.drop_duplicates(subset=['book_id', 'user_rating', 'user_id'], inplace=True)

In [14]:
# Print shape of ratings dataset
ratings.shape

(2189, 3)

In [15]:
# Reset index for the dataset ratings
ratings.reset_index(inplace=True)

In [16]:
# Print first five rows from the dataset ratings
ratings.head()

Unnamed: 0,index,book_id,user_id,user_rating
0,0,4833,3466,0
1,1,590,3466,5
2,2,4264,3453,5
3,3,3361,3453,4
4,4,4535,3453,0


In [17]:
# Print shape of ratings dataset
ratings.shape

(2189, 4)

# User Based Collaorative Filtering

In [18]:
#Import the train_test_split function
from sklearn.model_selection import train_test_split

In [19]:
#Assign X as the original ratings dataframe and y as the user_id column of ratings.
X = ratings.copy()
y = ratings['user_id']

In [20]:
#Split into training and test datasets, stratified along user_id
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, stratify=y, random_state=42)

In [21]:
#Import the mean_squared_error function
from sklearn.metrics import mean_squared_error

#Function that computes the root mean squared error (or RMSE)
def rmse(y_true, y_pred):
  return np.sqrt(mean_squared_error(y_true, y_pred))

In [22]:
#Function to compute the RMSE score obtained on the testing set by a model
def score(cf_model):
  #Construct a list of user-movie tuples from the testing dataset
  id_pairs = zip(X_test['user_id'], X_test['book_id'])
  #Predict the rating for every user-movie tuple
  y_pred = np.array([cf_model(user, book) for (user, book) in id_pairs])
  #Extract the actual ratings given by the users in the test data
  y_true = np.array(X_test['user_rating'])
  #Return the final RMSE score
  return rmse(y_true, y_pred)

In [23]:
#Build the ratings matrix using pivot_table function
r_matrix = X_train.pivot_table(values='user_rating', index='user_id', columns='book_id')

r_matrix.sample(20)

book_id,6,7,9,15,21,29,43,47,61,72,74,81,84,89,90,91,93,99,104,107,110,119,130,144,145,149,152,154,189,196,200,202,203,208,209,225,226,239,241,263,...,4744,4751,4755,4758,4771,4772,4777,4780,4805,4827,4830,4832,4833,4845,4852,4853,4854,4868,4881,4882,4885,4886,4889,4897,4901,4902,4904,4907,4919,4921,4923,4925,4941,4942,4968,4971,4975,4978,4991,4995
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1
2283,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
7130,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,4.0,,,,,,,,,,,,,,,
3229,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,3.0,,,,,,2.0,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3262,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3480,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,4.0,,,,,,,,,,,,,,,,,
117,,,,,,,,,,,,,,,3.0,,,,,,,,,,,,,,,,,,,3.0,,,,,,,...,,,,,,,,,,,3.0,,,,,,,,,,,,5.0,,,,,2.0,,,,,,,,,,,,
3115,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3364,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,4.0,,,,,,,
2672,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
330,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [24]:
# Print shape of ratings matrix
r_matrix.shape

(84, 677)

In [25]:
#Create a dummy ratings matrix with all null values imputed to 0
r_matrix_dummy = r_matrix.copy().fillna(0)

In [26]:
# Import cosine_score
from sklearn.metrics.pairwise import cosine_similarity

In [27]:
#Compute the cosine similarity matrix using the dummy ratings matrix
cosine_sim = cosine_similarity(r_matrix_dummy, r_matrix_dummy)

In [28]:
#Convert into pandas dataframe
cosine_sim = pd.DataFrame(cosine_sim, index=r_matrix.index, columns=r_matrix.index)

In [29]:
cosine_sim.head(10)

user_id,117,176,232,295,330,397,484,703,853,1083,1125,1183,1221,1283,1328,1412,1460,1482,1496,1586,1689,1784,2061,2078,2108,2122,2215,2222,2283,2460,2474,2478,2547,2549,2624,2672,2689,2760,2928,2931,...,3066,3115,3207,3217,3221,3229,3262,3270,3291,3292,3331,3364,3393,3403,3421,3426,3430,3444,3449,3453,3461,3466,3468,3469,3470,3471,3472,3474,3475,3476,3478,3479,3480,3482,3483,3484,3486,3497,7130,7131
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1
117,1.0,0.0,0.0,0.025962,0.0,0.0,0.0,0.084509,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.238976,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.167248,0.0,0.0,0.023682,0.0,0.0,0.0,0.0,0.030184,0.029509,0.0,0.0,0.09754,0.0,0.084374,0.161522,0.118858,0.069701,0.088929,0.083798,0.0,0.0,0.0,0.0
176,0.0,1.0,0.0,0.181356,0.0,0.0,0.0,0.209076,0.089245,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.059032,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.09179,0.0,0.0,0.0,0.0,0.0,0.0,0.087255,0.0,0.0,0.0,0.0,0.082611,0.147046,0.0,0.0,0.0,0.0,0.0,0.008589,0.0,0.0,0.0,0.0,0.0,0.061055,0.242164,0.046371,0.0,0.124168,0.0,0.0,0.0,0.0
232,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.177043,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
295,0.025962,0.181356,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.012146,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.074214,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.175623,0.097689,0.0,0.0,0.0,0.044522,0.070971,0.0,0.040787,0.0,0.0,0.0,0.0,0.031151,0.06619,0.0,0.0,0.0,0.0,0.0,0.0,0.0
330,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.081574,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.04473,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.068945,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.011266,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
397,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.031364,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.05595,0.0,0.0,0.0,0.0,0.151296,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
484,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.025901,0.029292,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.100125
703,0.084509,0.209076,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.076847,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.063375,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
853,0.0,0.089245,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.172919,0.057166,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.31427,0.0,0.0,0.0,0.0,0.0,0.14072,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.067647,0.0,0.0,0.0,0.052137,0.0,0.079374,0.0,0.0,0.0,0.0,0.05435,0.0,0.158763,0.0,0.174604,0.0,0.0,0.0,0.0
1083,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.244266,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [30]:
#User Based Collaborative Filter using Weighted Mean Ratings
def cf_user_wmean(user_id, book_id):
  #Check if movie_id exists in r_matrix
  if book_id in r_matrix:
    #Get the similarity scores for the user in question with every other user
    sim_scores = cosine_sim[user_id]
    #Get the user ratings for the movie in question
    m_ratings = r_matrix[book_id]

    #Extract the indices containing NaN in the m_ratings series
    idx = m_ratings[m_ratings.isnull()].index
    #Drop the NaN values from the m_ratings Series
    m_ratings = m_ratings.dropna()
    m_ratings.reset_index()
    #Drop the corresponding cosine scores from the sim_scores series
    sim_scores = sim_scores.drop(idx)

    #Compute the final weighted mean
    wmean_rating = np.dot(sim_scores, m_ratings)/ (sim_scores.sum()  + 0.00000001)
  else:
    #Default to a rating of 3.0 in the absence of any information
    wmean_rating = 3.0
  return wmean_rating

In [31]:
score(cf_user_wmean)

2.001154409808239

# Testing

In [32]:
# user based
b_id = 862
u_id = 3461

In [33]:
ratings[(ratings['book_id'] == b_id) & (ratings['user_id'] == u_id)]

Unnamed: 0,index,book_id,user_id,user_rating
176,176,862,3461,3


In [34]:
# Predicts ratings for the user id equal to 3461 and book id equal to 862
cf_user_wmean(u_id, b_id)

2.94207940734678

In [48]:
# Get all 5 rated books
tmp5 = X_test[X_test["user_rating"]==5]
tmp5

Unnamed: 0,index,book_id,user_id,user_rating
1953,1970,3900,3482,5
1278,1285,4240,3471,5
544,548,1180,3476,5
1281,1288,3857,3471,5
283,283,2276,3470,5
...,...,...,...,...
473,477,1773,3217,5
1343,1350,4354,3470,5
1955,1972,6,3484,5
1159,1164,3757,3426,5


In [49]:
# Calculate the mean of predicted values for this models based on ratings rated 
# by users as 5
list = [] 
def avg_rating(cf_model):   
  id_pairs = zip(tmp5['user_id'], tmp5['book_id'])
  y_pred = np.array([cf_model(user, book) for (user, book) in id_pairs])   
  lst = list.append(y_pred)   
  return np.mean(list)

avg_rating(cf_user_wmean)

2.7150544847285945