In [None]:
# importing the required libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from collections import Counter, defaultdict
from sklearn.metrics import accuracy_score
import matplotlib.ticker as ticker
from math import sqrt
from sklearn.metrics import mean_squared_error

In [2]:
#reading the csv file into books dataframe
books = pd.read_csv('../data/books.csv', sep=',')

In [3]:
# dropping the unwanted columns in book data frame
books = books.iloc[:, :16]
books = books.drop(columns=['original_title', 'best_book_id', 'work_id', 'books_count', 'isbn', 'isbn13', 'original_publication_year','language_code','work_ratings_count','work_text_reviews_count'])
books.head(5)

Unnamed: 0,id,book_id,authors,title,average_rating,ratings_count
0,1,2767052,Suzanne Collins,"The Hunger Games (The Hunger Games, #1)",4.34,4780653
1,2,3,"J.K. Rowling, Mary GrandPré",Harry Potter and the Sorcerer's Stone (Harry P...,4.44,4602479
2,3,41865,Stephenie Meyer,"Twilight (Twilight, #1)",3.57,3866839
3,4,2657,Harper Lee,To Kill a Mockingbird,4.25,3198671
4,5,4671,F. Scott Fitzgerald,The Great Gatsby,3.89,2683664


In [4]:
#checking for null values
books.isna().sum()

id                0
book_id           0
authors           0
title             0
average_rating    0
ratings_count     0
dtype: int64

In [5]:
# reading the ratings csv file
ratings = pd.read_csv('../data/ratings.csv', sep=',')
ratings.head(5)

Unnamed: 0,book_id,user_id,rating
0,1,314,5
1,1,439,3
2,1,588,5
3,1,1169,4
4,1,1185,4


In [6]:
#merging both the files 
df = pd.merge(ratings, books, on="book_id")
df.head(5)

Unnamed: 0,book_id,user_id,rating,id,authors,title,average_rating,ratings_count
0,1,314,5,27,"J.K. Rowling, Mary GrandPré",Harry Potter and the Half-Blood Prince (Harry ...,4.54,1678823
1,1,439,3,27,"J.K. Rowling, Mary GrandPré",Harry Potter and the Half-Blood Prince (Harry ...,4.54,1678823
2,1,588,5,27,"J.K. Rowling, Mary GrandPré",Harry Potter and the Half-Blood Prince (Harry ...,4.54,1678823
3,1,1169,4,27,"J.K. Rowling, Mary GrandPré",Harry Potter and the Half-Blood Prince (Harry ...,4.54,1678823
4,1,1185,4,27,"J.K. Rowling, Mary GrandPré",Harry Potter and the Half-Blood Prince (Harry ...,4.54,1678823


In [7]:
# droping the duplicated columns in the files
df1= df.drop_duplicates(['user_id','title','book_id'])
df1.to_csv('book_edit.csv')
df1.head(10) #went down from 79701 to 79531 
df1.shape #(79531, 8)

(79531, 8)

In [8]:
# creating the matrix and filling the nan values to be 0
books_matrix = df1.pivot_table(index = 'user_id', columns = 'book_id', values = 'rating')
books_matrix.shape #(28554, 794)
books_matrix.head()

book_id,1,2,3,5,6,8,10,11,13,21,...,9854,9864,9865,9912,9913,9914,9915,9943,9957,9998
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
7,,,,,,,,,,,...,,,,,,,,,,
9,,,,,,,,,,,...,,,,,,,,,,


In [9]:
books_matrix_ratings = books_matrix.fillna(0).values

In [10]:
#slipting the matrix to test and train. We are spliting the matrix in a 
#peculiar way as we had made sure that we encounter some values in the test and train data instead of complete 0

MIN_USER_RATINGS = 5
DELETE_RATING_COUNT = 1

def train_test_split(ratings):
    validation = np.zeros(ratings.shape)
    train = ratings.copy()
    
    for user in np.arange(ratings.shape[0]):
        if len(ratings[user,:].nonzero()[0]) >= MIN_USER_RATINGS:
            val_ratings = np.random.choice(
                ratings[user, :].nonzero()[0], 
                size=DELETE_RATING_COUNT,
                replace=False
            )
            train[user, val_ratings] = 0
            validation[user, val_ratings] = ratings[user, val_ratings]
    return train, validation

In [11]:
train, val = train_test_split(books_matrix_ratings)

In [1]:
train

NameError: name 'train' is not defined

In [12]:
train.shape

(28906, 812)

In [13]:
val.shape

(28906, 812)

In [14]:
# used a simple rsme calculation for error prediction
def rmse(prediction, ground_truth):
    prediction = prediction[ground_truth.nonzero()].flatten() 
    ground_truth = ground_truth[ground_truth.nonzero()].flatten()
    return sqrt(mean_squared_error(prediction, ground_truth))

In [15]:
# main class where the model training and prediction take place

class Recommender:
    
  
    def __init__(self, n_epochs=200, n_latent_features=3, lmbda=0.1, learning_rate=0.001):
        self.n_epochs = n_epochs
        self.n_latent_features = n_latent_features
        self.lmbda = lmbda
        self.learning_rate = learning_rate
  
    def predictions(self, P, Q):
        return np.dot(P.T, Q)
  
    def fit(self, X_train, X_val):
        m, n = X_train.shape
        print(X_val.shape)
        self.P = 3 * np.random.rand(self.n_latent_features, m)
        self.Q = 3 * np.random.rand(self.n_latent_features, n)
        print(self.P)
        print(self.Q)
        
        self.train_error = []
        self.val_error = []

        users, items = X_train.nonzero()

        for epoch in range(self.n_epochs):
            for u, i in zip(users, items):
                error = X_train[u, i] - self.predictions(self.P[:,u], self.Q[:,i])
                self.P[:, u] += self.learning_rate * (error * self.Q[:, i] - self.lmbda * self.P[:, u])
                self.Q[:, i] += self.learning_rate * (error * self.P[:, u] - self.lmbda * self.Q[:, i])

            train_rmse = rmse(self.predictions(self.P, self.Q), X_train)
            val_rmse = rmse(self.predictions(self.P, self.Q), X_val)
            self.train_error.append(train_rmse)
            self.val_error.append(val_rmse)

        return self
  
    def predict(self, X_train, user_index):
        y_hat = self.predictions(self.P, self.Q)
        predictions_index = np.where(X_train[user_index, :] == 0)[0]
        return y_hat[user_index, predictions_index].flatten()

In [16]:
val.shape

(28906, 812)

In [17]:
recommender = Recommender().fit(train, val)

(28906, 812)


In [18]:
# we have chosen a particular user id 173 and calculated the predicted values for the user
user_id = 173
user_index = books_matrix.index.get_loc(user_id)
predictions_index = np.where(train[user_index, :] == 0)[0]

rating_predictions = recommender.predict(train, user_index)

In [19]:
def create_book_ratings(books_df, books_index, ratings, n=10):
    books_ids = books_matrix.columns[books_index]
    book_ratings = pd.DataFrame(data=dict(book_id=books_ids, rating=ratings))
    top_n_books = book_ratings.sort_values("rating", ascending=False).head(n)

    book_recommendations = books_df[books_df.id.isin(top_n_books.book_id)].reset_index(drop=True)
    book_recommendations['rating'] = pd.Series(top_n_books.rating.values)
    return book_recommendations.sort_values("rating", ascending=False)

In [20]:
existing_ratings_index = np.where(train[user_index, :] > 0)[0]
existing_ratings = train[user_index, existing_ratings_index]

create_book_ratings(books, existing_ratings_index, existing_ratings)

Unnamed: 0,id,book_id,authors,title,average_rating,ratings_count,rating
0,6,11870085,John Green,The Fault in Our Stars,4.26,2346404,5.0
1,106,9418327,Tina Fey,Bossypants,3.94,506250,5.0
2,112,15507958,Jojo Moyes,"Me Before You (Me Before You, #1)",4.27,587647,5.0
3,249,4588,Jonathan Safran Foer,Extremely Loud and Incredibly Close,3.97,294726,5.0
4,291,3591262,Abraham Verghese,Cutting for Stone,4.28,258319,4.0
5,320,13526165,Maria Semple,"Where'd You Go, Bernadette",3.9,215453,4.0
6,357,355697,"Erich Maria Remarque, A.W. Wheen",All Quiet on the Western Front,3.92,249113,4.0
7,968,7968243,Stacy Schiff,Cleopatra: A Life,3.62,73994,3.0
8,1990,11331421,"Jan-Philipp Sendker, Kevin Wiliarty",The Art of Hearing Heartbeats,3.98,41647,2.0
9,6613,79421,"Paul Hattaway, Brother Yun",The Heavenly Man: The Remarkable True Story of...,4.33,12537,2.0


In [21]:
create_book_ratings(books, predictions_index, rating_predictions)

Unnamed: 0,id,book_id,authors,title,average_rating,ratings_count,rating
0,1,2767052,Suzanne Collins,"The Hunger Games (The Hunger Games, #1)",4.34,4780653,4.343315
1,13,5470,"George Orwell, Erich Fromm, Celâl Üster",1984,4.14,1956832,4.171455
2,25,136251,"J.K. Rowling, Mary GrandPré",Harry Potter and the Deathly Hallows (Harry Po...,4.61,1746574,4.138148
3,93,2998,Frances Hodgson Burnett,The Secret Garden,4.12,639357,4.121989
4,250,11387515,R.J. Palacio,Wonder,4.43,228538,4.106927
5,295,10644930,Stephen King,11/22/63,4.29,258464,4.088251
6,840,402093,James Clavell,"Shōgun (Asian Saga, #1)",4.37,104339,4.050868
7,1274,196970,"Clement C. Moore, Jan Brett",The Night Before Christmas,4.36,81553,4.014781
8,3885,29780253,Trevor Noah,Born a Crime: Stories From a South African Chi...,4.49,42573,4.012862
9,6862,17346698,"John Lewis, Andrew Aydin, Nate Powell","March: Book One (March, #1)",4.33,19430,4.009002
