In [1]:
# importing the required libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from collections import Counter, defaultdict
from sklearn.metrics import accuracy_score
import matplotlib.ticker as ticker
from math import sqrt
from sklearn.metrics import mean_squared_error

In [2]:
#reading the csv file into books dataframe
books = pd.read_csv('../data/books.csv', sep=',')

In [3]:
# dropping the unwanted columns in book data frame
books = books.iloc[:, :16]
books = books.drop(columns=['original_title', 'best_book_id', 'work_id', 'books_count', 'isbn', 'isbn13', 'original_publication_year','language_code','work_ratings_count','work_text_reviews_count'])
books.head(5)

Unnamed: 0,id,book_id,authors,title,average_rating,ratings_count
0,1,2767052,Suzanne Collins,"The Hunger Games (The Hunger Games, #1)",4.34,4780653
1,2,3,"J.K. Rowling, Mary GrandPré",Harry Potter and the Sorcerer's Stone (Harry P...,4.44,4602479
2,3,41865,Stephenie Meyer,"Twilight (Twilight, #1)",3.57,3866839
3,4,2657,Harper Lee,To Kill a Mockingbird,4.25,3198671
4,5,4671,F. Scott Fitzgerald,The Great Gatsby,3.89,2683664


In [4]:
#checking for null values
books.isna().sum()

id                0
book_id           0
authors           0
title             0
average_rating    0
ratings_count     0
dtype: int64

In [5]:
# reading the ratings csv file
ratings = pd.read_csv('../data/ratings.csv', sep=',')
ratings.head(5)

Unnamed: 0,book_id,user_id,rating
0,1,314,5
1,1,439,3
2,1,588,5
3,1,1169,4
4,1,1185,4


In [6]:
#merging both the files 
df = pd.merge(ratings, books, on="book_id")
df.head(5)

Unnamed: 0,book_id,user_id,rating,id,authors,title,average_rating,ratings_count
0,1,314,5,27,"J.K. Rowling, Mary GrandPré",Harry Potter and the Half-Blood Prince (Harry ...,4.54,1678823
1,1,439,3,27,"J.K. Rowling, Mary GrandPré",Harry Potter and the Half-Blood Prince (Harry ...,4.54,1678823
2,1,588,5,27,"J.K. Rowling, Mary GrandPré",Harry Potter and the Half-Blood Prince (Harry ...,4.54,1678823
3,1,1169,4,27,"J.K. Rowling, Mary GrandPré",Harry Potter and the Half-Blood Prince (Harry ...,4.54,1678823
4,1,1185,4,27,"J.K. Rowling, Mary GrandPré",Harry Potter and the Half-Blood Prince (Harry ...,4.54,1678823


In [7]:
# droping the duplicated columns in the files
df1= df.drop_duplicates(['user_id','title','book_id'])
df1.to_csv('book_edit.csv')
df1.head(10) #went down from 79701 to 79531 
df1.shape #(79531, 8)

(79531, 8)

In [19]:
# creating the matrix and filling the nan values to be 0
books_matrix = df1.pivot_table(index = 'user_id', columns = 'book_id', values = 'rating')
books_matrix.shape #(28554, 794)
books_matrix

book_id,1,2,3,5,6,8,10,11,13,21,...,9854,9864,9865,9912,9913,9914,9915,9943,9957,9998
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
7,,,,,,,,,,,...,,,,,,,,,,
9,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
53419,,,,,,,,,,,...,,,,,,,,,,
53420,,,,,,,,,,,...,,,,,,,,,,
53422,,,,,,,,,,,...,,,,,,,,,,
53423,,,,,,,,,,,...,,,,,,,,,,


In [9]:
#slipting the matrix to test and train. We are spliting the matrix in a 
#peculiar way as we had made sure that we encounter some values in the test and train data instead of complete 0

MIN_USER_RATINGS = 5
DELETE_RATING_COUNT = 1

def train_test_split(ratings):
    validation = np.zeros(ratings.shape)
    print(validation)
    train = ratings.copy()
    print(train)
    
    for user in np.arange(ratings.shape[0]):
        if len(ratings[user,:].nonzero()[0]) >= MIN_USER_RATINGS:
            val_ratings = np.random.choice(
                ratings[user, :].nonzero()[0], 
                size=DELETE_RATING_COUNT,
                replace=False
            )
            train[user, val_ratings] = 0
            validation[user, val_ratings] = ratings[user, val_ratings]
    return train, validation

In [10]:
train, val = train_test_split(books_matrix)

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [11]:
train

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [12]:
train.shape

(28906, 812)

In [13]:
val.shape

(28906, 812)

In [14]:
# used a simple rsme calculation for error prediction
def rmse(prediction, ground_truth):
    prediction = prediction[ground_truth.nonzero()].flatten() 
    ground_truth = ground_truth[ground_truth.nonzero()].flatten()
    return sqrt(mean_squared_error(prediction, ground_truth))

In [15]:
# main class where the model training and prediction take place

class Recommender:
    
  
    def __init__(self, n_epochs=200, n_latent_features=3, lmbda=0.1, learning_rate=0.001):
        self.n_epochs = n_epochs
        self.n_latent_features = n_latent_features
        self.lmbda = lmbda
        self.learning_rate = learning_rate
  
    def predictions(self, P, Q):
        return np.dot(P.T, Q)
  
    def fit(self, X_train, X_val):
        m, n = X_train.shape
        print(X_train)
        print(X_val)
        self.P = 3 * np.random.rand(self.n_latent_features, m)
        self.Q = 3 * np.random.rand(self.n_latent_features, n)
        print(self.P)
        print(self.Q)
        
        self.train_error = []
        self.val_error = []

        users, items = X_train.nonzero()

        for epoch in range(self.n_epochs):
            for u, i in zip(users, items):
                error = X_train[u, i] - self.predictions(self.P[:,u], self.Q[:,i])
                self.P[:, u] += self.learning_rate * (error * self.Q[:, i] - self.lmbda * self.P[:, u])
                self.Q[:, i] += self.learning_rate * (error * self.P[:, u] - self.lmbda * self.Q[:, i])

            train_rmse = rmse(self.predictions(self.P, self.Q), X_train)
            val_rmse = rmse(self.predictions(self.P, self.Q), X_val)
            self.train_error.append(train_rmse)
            self.val_error.append(val_rmse)

        return self
  
    def predict(self, X_train, user_index):
        y_hat = self.predictions(self.P, self.Q)
        predictions_index = np.where(X_train[user_index, :] == 0)[0]
        return y_hat[user_index, predictions_index].flatten()

In [16]:
recommender = Recommender().fit(train, val)

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
[[1.6053157  2.70797233 2.36864063 ... 2.94111661 2.01635864 2.51950705]
 [2.5703215  0.25236298 0.80911891 ... 2.13543293 2.36883595 0.07275262]
 [0.14100892 0.42916283 2.44377315 ... 1.10019869 0.66303294 2.36414725]]
[[0.58917348 1.3161524  2.68206931 ... 0.47215228 2.88842944 2.12164871]
 [0.14024504 1.07906415 2.29966548 ... 2.10639616 0.7860691  0.15478666]
 [2.50065839 1.79930502 0.1642384  ... 0.41950797 0.29175677 1.12090103]]


In [17]:
recommender.train_error

[2.7066139275738865,
 2.0140838585834477,
 1.7749864963185105,
 1.6816189970748205,
 1.6316714695901249,
 1.5958491418934437,
 1.5660202569481039,
 1.5397744599826093,
 1.5162359560498162,
 1.4949554858012661,
 1.4756190899378685,
 1.4579712906283906,
 1.4417937051099081,
 1.4268975681158629,
 1.4131196191696282,
 1.4003188520882224,
 1.388373620276913,
 1.3771790329280122,
 1.3666446444364326,
 1.3566924316356226,
 1.347255039910051,
 1.3382742716468456,
 1.3296997882250647,
 1.3214879976045042,
 1.3136011019264036,
 1.3060062824177388,
 1.2986750018029125,
 1.2915824071414546,
 1.2847068184392598,
 1.2780292905059363,
 1.2715332373667492,
 1.2652041101125107,
 1.2590291204162491,
 1.252997003092501,
 1.2470978120517697,
 1.2413227448339776,
 1.2356639916121692,
 1.230114605159806,
 1.2246683887875507,
 1.2193197996919072,
 1.2140638655298803,
 1.2088961123506814,
 1.2038125022856059,
 1.1988093796276154,
 1.193883424128705,
 1.1890316105109584,
 1.1842511733304866,
 1.179539576455894

In [24]:
recommender.val_error

[2.700031093797787,
 2.0192950964955285,
 1.780216232914088,
 1.685675219853823,
 1.6349306869117877,
 1.5984030906336613,
 1.567718361196673,
 1.5403907267842933,
 1.5155487299673749,
 1.4927739009906964,
 1.47178826958189,
 1.4523704740609544,
 1.434332431575433,
 1.4175115933587774,
 1.4017670747489899,
 1.3869767875698125,
 1.373034952730515,
 1.3598498896662934,
 1.3473420703618395,
 1.335442426598191,
 1.3240908895849177,
 1.3132351359324175,
 1.3028295131274323,
 1.2928341193814066,
 1.283214015464272,
 1.2739385490998383,
 1.2649807752990505,
 1.2563169584967289,
 1.24792614450486,
 1.2397897921206575,
 1.231891455768431,
 1.2242165118520625,
 1.2167519225880288,
 1.2094860320108272,
 1.2024083896216125,
 1.1955095978102854,
 1.188781179740633,
 1.1822154648634227,
 1.1758054896269556,
 1.1695449112994447,
 1.1634279331119848,
 1.157449239182463,
 1.1516039378959906,
 1.1458875126017614,
 1.140295778644225,
 1.1348248458819914,
 1.12947108596422,
 1.1242311037342123,
 1.1191017

In [20]:
# we have chosen a particular user id 173 and calculated the predicted values for the user
user_id = 173
user_index = books_matrix.index.get_loc(user_id)
predictions_index = np.where(train[user_index, :] == 0)[0]
print(predictions_index)
rating_predictions = recommender.predict(train, user_index)

[  0   1   2   3   5   6   7   8   9  10  11  12  13  14  15  16  17  19
  20  21  22  23  24  27  28  29  30  33  34  36  37  39  40  41  42  43
  45  46  47  48  49  50  51  52  53  54  55  56  57  58  59  60  61  62
  63  64  65  66  67  68  69  70  71  72  73  74  75  76  77  78  79  80
  81  82  83  84  85  86  87  88  89  90  91  92  93  94  95  96  97  98
  99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116
 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134
 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152
 153 154 155 156 157 159 160 161 162 163 164 165 166 167 168 169 170 171
 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189
 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207
 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225
 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243
 244 245 246 247 248 249 250 251 252 253 254 255 25

In [21]:
def create_book_ratings(books_df, books_index, ratings, n=10):
    books_ids = books_matrix.columns[books_index]
    book_ratings = pd.DataFrame(data=dict(book_id=books_ids, rating=ratings))
    top_n_books = book_ratings.sort_values("rating", ascending=False).head(n)

    book_recommendations = books_df[books_df.id.isin(top_n_books.book_id)].reset_index(drop=True)
    book_recommendations['rating'] = pd.Series(top_n_books.rating.values)
    return book_recommendations.sort_values("rating", ascending=False)

In [22]:
existing_ratings_index = np.where(train[user_index, :] > 0)[0]
existing_ratings = train[user_index, existing_ratings_index]

create_book_ratings(books, existing_ratings_index, existing_ratings)

Unnamed: 0,id,book_id,authors,title,average_rating,ratings_count,rating
0,6,11870085,John Green,The Fault in Our Stars,4.26,2346404,5.0
1,106,9418327,Tina Fey,Bossypants,3.94,506250,5.0
2,112,15507958,Jojo Moyes,"Me Before You (Me Before You, #1)",4.27,587647,5.0
3,249,4588,Jonathan Safran Foer,Extremely Loud and Incredibly Close,3.97,294726,5.0
4,250,11387515,R.J. Palacio,Wonder,4.43,228538,4.0
5,291,3591262,Abraham Verghese,Cutting for Stone,4.28,258319,4.0
6,320,13526165,Maria Semple,"Where'd You Go, Bernadette",3.9,215453,4.0
7,357,355697,"Erich Maria Remarque, A.W. Wheen",All Quiet on the Western Front,3.92,249113,4.0
8,1990,11331421,"Jan-Philipp Sendker, Kevin Wiliarty",The Art of Hearing Heartbeats,3.98,41647,3.0
9,6613,79421,"Paul Hattaway, Brother Yun",The Heavenly Man: The Remarkable True Story of...,4.33,12537,2.0


In [23]:
create_book_ratings(books, predictions_index, rating_predictions)

Unnamed: 0,id,book_id,authors,title,average_rating,ratings_count,rating
0,1,2767052,Suzanne Collins,"The Hunger Games (The Hunger Games, #1)",4.34,4780653,4.60976
1,2,3,"J.K. Rowling, Mary GrandPré",Harry Potter and the Sorcerer's Stone (Harry P...,4.44,4602479,4.458729
2,11,77203,Khaled Hosseini,The Kite Runner,4.26,1813044,4.443612
3,25,136251,"J.K. Rowling, Mary GrandPré",Harry Potter and the Deathly Hallows (Harry Po...,4.61,1746574,4.438808
4,27,1,"J.K. Rowling, Mary GrandPré",Harry Potter and the Half-Blood Prince (Harry ...,4.54,1678823,4.371738
5,117,18131,Madeleine L'Engle,"A Wrinkle in Time (A Wrinkle in Time Quintet, #1)",4.04,615907,4.359092
6,2978,13578175,Brandon Sanderson,The Emperor's Soul,4.33,33634,4.304838
7,7677,184644,"Dorie Greenspan, Alan Richardson",Baking: From My Home to Yours,4.18,11662,4.291431
8,8852,12390650,Kristen Ashley,"Wildest Dreams (Fantasyland, #1)",4.19,19790,4.28846
9,9569,32075671,Angie Thomas,The Hate U Give,4.62,32610,4.285123
