# Import Libraries

In [1]:
# import libraries
import numpy as np
import pandas as pd

# Data Preprocessing

In [2]:
# Read book dataset and assign it to the variable b
b = pd.read_csv('books.csv')
print(b.shape)
b.head()

(2312, 5)


Unnamed: 0,book_id,avg_rating,no_of_ratings,user_id,user_rating
0,4833,4.25,7156.0,3466,0
1,590,4.31,7821.0,3466,5
2,4264,4.08,3836.0,3453,5
3,3361,3.52,1245.0,3453,4
4,4535,4.13,3107.0,3453,0


In [3]:
# There is no need for the columns "no_of_ratings" and "avg_rating"
# in our implementation therefore we drop these columns
# we assigned remain data to the variable ratings
ratings = b.drop(['no_of_ratings', 'avg_rating'], axis=1)

In [4]:
# Print first 5 rows from ratings dataset
ratings.head()

Unnamed: 0,book_id,user_id,user_rating
0,4833,3466,0
1,590,3466,5
2,4264,3453,5
3,3361,3453,4
4,4535,3453,0


In [5]:
# Print shape of ratings dataset
ratings.shape

(2312, 3)

In [6]:
# If there is no NaN value in any row then remove that row
ratings.dropna(inplace=True)

In [7]:
# Print shape of ratings dataset
ratings.shape

(2312, 3)

In [8]:
# We group rows by book_id column and remove the groups which has only one element
# We split using scikit-leanr's train_test_split function and set stratify property 
# of that function to book_id, therefore it isn't possible to split rows that is 
# unique by book id
ratings = ratings.groupby(['book_id']).filter(lambda x: len(x) > 1)

In [9]:
_# Prints number of unique books from ratings dataset 
len(ratings.book_id.unique())

657

In [10]:
# Prints number of unique users from ratings dataset 
len(ratings.user_id.unique())

190

In [11]:
# Check how many Null value he have in ratings dataset
ratings.isnull().sum()

book_id        0
user_id        0
user_rating    0
dtype: int64

In [12]:
# Print shape of ratings dataset
ratings.shape

(2259, 3)

In [13]:
# Drop duplicate rows from the dataset ratings
ratings.drop_duplicates(subset=['book_id', 'user_rating', 'user_id'], inplace=True)

In [14]:
ratings.shape

(2242, 3)

In [15]:
# Reset index for the dataset ratings
ratings.reset_index(inplace=True)

In [16]:
# Print first five rows from the dataset ratings
ratings.head()

Unnamed: 0,index,book_id,user_id,user_rating
0,0,4833,3466,0
1,1,590,3466,5
2,2,4264,3453,5
3,3,3361,3453,4
4,4,4535,3453,0


In [17]:
# Print shape of ratings dataset
ratings.shape

(2242, 4)

# Book Based Collaborative Filtering

In [18]:
#Import the train_test_split function
from sklearn.model_selection import train_test_split

In [19]:
#Assign X as the original ratings dataframe and y as the book_id column of ratings.
X = ratings.copy()
y = ratings['book_id']

In [20]:
#Split into training and test datasets, stratified along user_id
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.31, stratify=y, random_state=42)

In [21]:
#Import the mean_squared_error function
from sklearn.metrics import mean_squared_error

#Function that computes the root mean squared error (or RMSE)
def rmse(y_true, y_pred):
  return np.sqrt(mean_squared_error(y_true, y_pred))

In [22]:
#Function to compute the RMSE score obtained on the testing set by a model
def score(cf_model):
  #Construct a list of user-movie tuples from the testing dataset
  id_pairs = zip(X_test['user_id'], X_test['book_id'])
  #Predict the rating for every user-movie tuple
  y_pred = np.array([cf_model(user, book) for (user, book) in id_pairs])
  #Extract the actual ratings given by the users in the test data
  y_true = np.array(X_test['user_rating'])
  #Return the final RMSE score
  return rmse(y_true, y_pred)

In [23]:
#Build the ratings matrix using pivot_table function
r_matrix = X_train.pivot_table(values='user_rating', index='book_id', columns='user_id')

r_matrix.sample(20)

user_id,117,176,232,295,318,330,386,397,446,484,514,577,578,585,605,680,703,806,820,853,967,1007,1044,1046,1049,1083,1119,1125,1144,1151,1152,1183,1203,1219,1283,1328,1331,1412,1438,1460,...,3421,3426,3430,3444,3449,3453,3461,3466,3468,3469,3470,3471,3472,3474,3475,3476,3478,3479,3480,3482,3483,3484,3486,3497,3498,3500,3521,3566,3568,3570,3648,3655,3795,3830,3849,3913,3937,3973,7130,7131
book_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1
2560,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,2.0,,,,,,,,,,,,3.0,,,,,,,,,,,,,,,,,,,,,,
3500,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,0.0,,,,,,,,,,,,,,,,,,,,,,
4332,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,1.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3280,2.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,0.0,,,,,,,,,,,,,,,,,,,,,,
3033,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,3.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2616,,,,,,,,,,,,,,,,,,2.0,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,4.0,,,2.0,,,,,,,,,,,,,,,,,,,,,,,,
686,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,0.0,,,,3.0,,,,,,,,,,,,,,,,,,,,,,,,,
4830,3.0,,,,,,,,,,,,,,,,2.0,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3677,,,,,,1.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,5.0,,,,,,...,,4.0,,,,,,,,,,,2.0,,,,,,,,,,,,,,,,,,,,,,,,,,,
3533,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,1.5,,,,,2.0,,,,,,,,,,,,,,,,,,,,,,,


In [24]:
r_matrix.shape

(657, 161)

In [25]:
#Create a dummy ratings matrix with all null values imputed to 0
r_matrix_dummy = r_matrix.copy().fillna(0)

In [26]:
# Import cosine_score
from sklearn.metrics.pairwise import cosine_similarity

In [27]:
#Compute the cosine similarity matrix using the dummy ratings matrix
cosine_sim = cosine_similarity(r_matrix_dummy, r_matrix_dummy)

In [28]:
#Convert into pandas dataframe
cosine_sim = pd.DataFrame(cosine_sim, index=r_matrix.index, columns=r_matrix.index)

In [29]:
cosine_sim.head(10)

book_id,6,7,9,21,29,43,45,47,72,74,81,89,90,93,99,104,107,110,119,130,144,145,149,152,154,189,196,200,202,203,208,209,225,226,241,263,265,278,298,323,...,4734,4744,4751,4755,4758,4771,4777,4780,4805,4823,4827,4830,4832,4833,4845,4852,4853,4854,4864,4881,4882,4886,4889,4897,4901,4902,4904,4907,4919,4921,4923,4925,4941,4942,4971,4975,4978,4991,4995,4999
book_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1
6,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.217491,0.0,0.580259,0.0,0.0,0.0,0.0,0.0,0.151947,0.0,0.0,0.0,0.0,0.0,0.0,0.574427,0.0,0.0,0.0,0.0,0.0,0.0,0.246183,0.0,0.0,0.0,0.0,0.0,0.204837,0.0,0.0,0.615457,...,0.079872,0.0,0.0,0.0,0.142134,0.348155,0.0,0.0,0.0,0.348155,0.0,0.0,0.0,0.0,0.0,0.0,0.127128,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.217491,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,1.0,0.811107,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.912871,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.408248,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.182574,0.182574,0.0,0.0,0.0,0.436436,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.948683,0.0,0.0,0.0,0.0,0.0
9,0.0,0.811107,1.0,0.0,0.0,0.0,0.811107,0.0,0.0,0.0,0.0,0.0,0.0,0.740436,0.0,0.0,0.0,0.301238,0.811107,0.0,0.187317,0.0,0.170054,0.0,0.0,0.324443,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.219363,0.0,...,0.334945,0.0,0.0,0.59604,0.0,0.0,0.0,0.0,0.324443,0.0,0.0,0.0,0.0,0.0,0.463586,0.0,0.148087,0.266557,0.086711,0.09673,0.074432,0.353996,0.0,0.0,0.0,0.148431,0.0,0.0,0.0,0.39736,0.0,0.23694,0.228013,0.290191,0.769484,0.0,0.0,0.264906,0.0,0.0
21,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.317821,0.0,0.0,0.0,0.0,0.0,0.0,0.796819,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.981495,0.0,0.583874,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.092582,0.0,...,0.0,0.0,0.0,0.0,0.745356,0.0,0.634335,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.066667,0.0,0.0,0.0,0.544331,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.354246,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.074536,0.0,0.0
29,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.514496,0.411597,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.363803,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.460179,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.417311,0.514496
43,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.870388,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.707107,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.912871,0.0,0.0,0.0,0.218218,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
45,0.0,1.0,0.811107,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.912871,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.408248,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.182574,0.182574,0.0,0.0,0.0,0.436436,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.948683,0.0,0.0,0.0,0.0,0.0
47,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.162088,0.0,0.0,0.0,0.0,0.297381,0.308607,0.0,0.0,0.0,0.28227,0.0,0.0,0.308607,0.0,0.0,0.0,0.0,0.0,0.0,0.118783,0.0,0.0,...,0.0,0.698297,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.221313,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.699854,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.105851,0.0,0.0,0.318728,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
72,0.217491,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.272639,0.0,0.0,0.0,0.0,0.0,0.0,0.294484,0.0,0.0,0.0,0.0,0.0,0.0,0.441726,0.0,0.0,0.487805,0.0,0.69843,0.367538,0.0,0.0,0.0,...,0.143315,0.0,0.0,0.0,0.255031,0.624695,0.0,0.0,0.0,0.624695,0.0,0.0,0.0,0.0,0.0,0.712832,0.228106,0.0,0.0,0.0,0.0,0.0,0.0,0.290007,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.390244,0.0,0.0,0.0,0.0,0.0,0.0,0.0
74,0.0,0.0,0.0,0.317821,0.0,0.870388,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.303895,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.330289,0.0,0.222681,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.615457,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.284268,0.0,0.093048,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.794552,0.0,0.2076,0.0,0.189934,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [30]:
#Item Based Collaborative Filter using Weighted Mean Ratings
(10, 6)
def cf_user_wmean(user_id, book_id):
  #Check if user exists in r_matrix
  if user_id in r_matrix:
    #Get the similarity scores for the book in question with every other book
    sim_scores = cosine_sim[book_id]
    #Get the book ratings for the user in question
    m_ratings = r_matrix[user_id]


    #Extract the indices containing NaN in the m_ratings series
    idx = m_ratings[m_ratings.isnull()].index
    #Drop the NaN values from the m_ratings Series
    m_ratings = m_ratings.dropna()
    m_ratings.reset_index()
    #Drop the corresponding cosine scores from the sim_scores series
    sim_scores = sim_scores.drop(idx)


    wmean_rating = np.dot(sim_scores, m_ratings)/ (sim_scores.sum()  + 0.000000001)
  else:
    #Default to a rating of 3.0 in the absence of any information
    wmean_rating = 3.0
  return wmean_rating

In [31]:
score(cf_user_wmean)

1.9027768256942574

# Testing

In [32]:
# item based
b_id = 862
u_id = 3461

In [33]:
ratings[(ratings['book_id'] == b_id) & (ratings['user_id'] == u_id)]

Unnamed: 0,index,book_id,user_id,user_rating
123,176,862,3461,3


In [34]:
# Predicts ratings for the user id equal to 3461 and book id equal to 862
cf_user_wmean(u_id, b_id)

2.6849896994941056

In [47]:
# Get all 5 rated books
tmp5 = X_test[X_test["user_rating"]==5]
tmp5

Unnamed: 0,index,book_id,user_id,user_rating
1405,1464,1209,3472,5
2126,2196,278,3480,5
1844,1913,4364,3482,5
1917,1986,3516,3483,5
152,205,1344,3471,5
...,...,...,...,...
945,1002,47,176,5
924,981,3393,3469,5
2054,2123,868,3479,5
2032,2101,4420,3291,5


In [48]:
# Calculate the mean of predicted values for this models based on ratings rated 
# by users as 5
list = [] 
def avg_rating(cf_model):   
  id_pairs = zip(tmp5['user_id'], tmp5['book_id'])
  y_pred = np.array([cf_model(user, book) for (user, book) in id_pairs])   
  lst = list.append(y_pred)   
  return np.mean(list)

avg_rating(cf_user_wmean)

2.4392515300953836