In [98]:
import numpy as np
import pandas as pd
from math import sqrt
from sklearn.metrics import mean_squared_error

In [72]:
df_ratings = pd.read_csv("BX-Book-Ratings.csv", encoding="latin1")
df_ratings.sort_values(["user_id", "isbn"], inplace=True)
df_ratings = df_ratings.head(10000)
df_ratings.reset_index()
df_ratings.head()
#df_ratings.shape

Unnamed: 0,user_id,isbn,rating
9561,2,195153448,0
9562,7,34542252,0
9571,8,074322678X,5
9574,8,080652121X,0
9576,8,1552041778,5


In [73]:
from sklearn.preprocessing import LabelEncoder

labelencoder = LabelEncoder()
df_ratings["isbn"] = labelencoder.fit_transform(df_ratings["isbn"])
df_ratings.head()

Unnamed: 0,user_id,isbn,rating
9561,2,1467,0
9562,7,2490,0
9571,8,443,5
9574,8,475,0
9576,8,877,5


In [74]:
df_books = pd.read_csv("BX-Books.csv", encoding="latin1", low_memory=False)
df_books.head()

Unnamed: 0,isbn,book_title,book_author,year_of_publication,publisher
0,195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press
1,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada
2,60973129,Decision in Normandy,Carlo D'Este,1991,HarperPerennial
3,374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999,Farrar Straus Giroux
4,393045218,The Mummies of Urumchi,E. J. W. Barber,1999,W. W. Norton &amp; Company


In [75]:
df_users = pd.read_csv("BX-Users.csv", encoding="latin1", low_memory=False)
df_users.head()

Unnamed: 0,user_id,Location,Age
0,1,"nyc, new york, usa",
1,2,"stockton, california, usa",18.0
2,3,"moscow, yukon territory, russia",
3,4,"porto, v.n.gaia, portugal",17.0
4,5,"farnborough, hants, united kingdom",


In [76]:
n_users = df_ratings["user_id"].unique().shape[0]
n_users

1323

In [77]:
n_books = df_ratings["isbn"].unique().shape[0]
n_books

8742

In [79]:
data_matrix = np.zeros((n_users, n_books))
for line in df_ratings.head().itertuples():
    #print(line)
    data_matrix[line[1]-1, line[2]-1] = line[3]

In [80]:
data_matrix.shape

(1323, 8742)

In [81]:
from sklearn.metrics.pairwise import pairwise_distances 
user_similarity = pairwise_distances(data_matrix, metric='cosine')
item_similarity = pairwise_distances(data_matrix.T, metric='cosine')

In [85]:
def predict(ratings, similarity, type='user'):
    if type == 'user':
        mean_user_rating = ratings.mean(axis=1)
        #We use np.newaxis so that mean_user_rating has same format as ratings
        ratings_diff = (ratings - mean_user_rating[:, np.newaxis])
        pred = mean_user_rating[:, np.newaxis] + similarity.dot(ratings_diff) / np.array([np.abs(similarity).sum(axis=1)]).T
    elif type == 'book':
        pred = ratings.dot(similarity) / np.array([np.abs(similarity).sum(axis=1)])
    return pred

In [87]:
user_prediction = predict(data_matrix, user_similarity, type='user')
item_prediction = predict(data_matrix, item_similarity, type='book')

In [89]:
user_prediction[]

(1323, 8742)

In [99]:
#root mean square error
def rmse(pred, test):
    pred = pred[test.nonzero()].flatten()
    test = test[test.nonzero()].flatten()
    return sqrt(mean_squared_error(pred, test))

In [100]:
rmse(user_prediction, data_matrix)

4.9988560970029745

In [101]:
rmse(item_prediction, data_matrix)

5.0