In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pickle
import warnings

In [85]:
# Load the ratings table from the cleaned-data directory (path is relative to
# the notebook's working directory). A later cell pivots this frame on
# user_id / isbn / book_rating.
# NOTE(review): the preview rendered further down shows a `unique_isbn` column
# rather than `isbn` — confirm which column name the CSV really carries.
reduced_books_users_ratings = pd.read_csv("data/clean/reduced_books_users_ratings.csv")

In [195]:
# DataFrame.assign returns a NEW frame; the original cell discarded it, so the
# downcast never took effect. Bind the result back to the name.
# The guard keeps the cell runnable when the CSV has no `year_of_publication`
# column (the preview rendered for this cell shows none).
if "year_of_publication" in reduced_books_users_ratings.columns:
    reduced_books_users_ratings = reduced_books_users_ratings.assign(
        year_of_publication=pd.to_numeric(
            reduced_books_users_ratings.year_of_publication, downcast="integer"
        )
    )
reduced_books_users_ratings.head()

Unnamed: 0,user_id,unique_isbn,book_rating
0,11676,038550120X,10
1,11676,0671537458,8
2,11676,0679776818,8
3,11676,0684867621,3
4,11676,8437606322,8


In [335]:
# ISBN -> title lookup table, one row per distinct (isbn, book_title) pair.
# Columns are selected by label instead of position ([1, 3]) so the cell keeps
# working if the column order of `books_users_ratings` changes.
# NOTE(review): `books_users_ratings` is not created in the visible cells —
# confirm it is loaded before this cell on a fresh kernel.
book_titles = books_users_ratings[["isbn", "book_title"]].drop_duplicates()
book_titles.head()

Unnamed: 0,isbn,book_title
0,0155061224,Rites of Passage
1,052165615X,Help!: Level 1
2,0521795028,The Amsterdam Connection : Level 4 (Cambridge ...
3,038550120X,A Painted House
5,0671537458,Waiting to Exhale


In [341]:
class MF():
    """Biased matrix factorization of a user-item rating matrix, fitted by SGD.

    A rating is predicted as ``b + b_u[i] + b_i[j] + P[i] . Q[j]`` where ``b``
    is the mean of the non-zero ratings, ``b_u`` / ``b_i`` are per-user and
    per-item biases, and ``P`` / ``Q`` hold ``K`` latent features per user and
    per item. Entries of ``R`` equal to 0 are treated as "not rated".
    """

    def __init__(self, R, K, alpha, beta, iterations):
        """Store the rating matrix and the hyperparameters.

        Parameters
        ----------
        R : 2-D numpy array, one row per user and one column per item.
        K : number of latent features.
        alpha : multiplier applied to every SGD update (step size).
        beta : multiplier of the shrinkage (regularization) term.
        iterations : number of shuffled passes over the training samples.
        """
        self.R = R
        self.num_users, self.num_items = R.shape
        self.K = K
        self.alpha = alpha
        self.beta = beta
        self.iterations = iterations

    def train(self):
        """Initialize the parameters and run SGD for ``self.iterations`` passes.

        Returns
        -------
        list of ``(iteration_index, error)`` tuples, one per pass, where
        ``error`` is the value returned by :meth:`mse` after that pass.
        Progress is printed every 20th pass.
        """
        # User-feature and item-feature matrices, small random start values
        self.P = np.random.normal(scale=1./self.K, size=(self.num_users, self.K))
        self.Q = np.random.normal(scale=1./self.K, size=(self.num_items, self.K))

        # Bias terms; the global bias is the mean over the non-zero ratings
        self.b_u = np.zeros(self.num_users)
        self.b_i = np.zeros(self.num_items)
        self.b = np.mean(self.R[np.where(self.R != 0)])

        # Training samples (user index, item index, rating) for every positive
        # entry. np.nonzero walks the matrix in row-major order, i.e. the same
        # order as a nested user/item loop, without the Python-level scan.
        rows, cols = np.nonzero(self.R > 0)
        self.samples = [(i, j, self.R[i, j]) for i, j in zip(rows, cols)]

        # Stochastic gradient descent for the given number of iterations
        training_process = []
        for i in range(self.iterations):
            np.random.shuffle(self.samples)
            self.sgd()
            mse = self.mse()
            training_process.append((i, mse))
            if (i+1) % 20 == 0:
                print("Iteration: %d ; error = %.4f" % (i+1, mse))

        return training_process

    def mse(self):
        """Return the total error over the non-zero entries of ``R``.

        Despite the name, this is the square root of the SUMMED squared errors
        (no division by the number of ratings); the definition is kept so the
        values stay comparable with earlier runs.
        """
        xs, ys = self.R.nonzero()
        predicted = self.full_matrix()
        residuals = self.R[xs, ys] - predicted[xs, ys]
        return np.sqrt(np.sum(residuals ** 2))

    def sgd(self):
        """Run one pass of SGD over ``self.samples``, updating biases, P and Q."""
        for i, j, r in self.samples:
            prediction = self.get_rating(i, j)
            e = (r - prediction)

            self.b_u[i] += self.alpha * (e - self.beta * self.b_u[i])
            self.b_i[j] += self.alpha * (e - self.beta * self.b_i[j])

            # Snapshot the user's factors first: the item update must use the
            # value from before this step, not the freshly updated row.
            p_i = self.P[i, :].copy()
            self.P[i, :] += self.alpha * (e * self.Q[j, :] - self.beta * self.P[i, :])
            self.Q[j, :] += self.alpha * (e * p_i - self.beta * self.Q[j, :])

    def get_rating(self, i, j):
        """Return the predicted rating of user index ``i`` for item index ``j``."""
        prediction = self.b + self.b_u[i] + self.b_i[j] + self.P[i, :].dot(self.Q[j, :].T)
        return prediction

    def full_matrix(self):
        """Return the predicted rating for every (user, item) pair as a 2-D array."""
        # Uses this instance's parameters (not a notebook-level `mf`), and
        # broadcasts the biases explicitly: b_u down the rows, b_i across columns.
        return (
            self.b
            + self.b_u[:, np.newaxis]
            + self.b_i[np.newaxis, :]
            + self.P.dot(self.Q.T)
        )

    def full_dataframe(self, ratings_df=None):
        """Return all predictions in long format.

        Parameters
        ----------
        ratings_df : DataFrame, optional
            The pivoted ratings frame ``R`` was built from; its column labels
            and index values label the predictions. When omitted, the
            notebook-level ``r_df`` is used.

        Returns
        -------
        DataFrame with a ``user_id`` column plus the variable/value columns
        produced by ``DataFrame.melt``.
        """
        if ratings_df is None:
            # Fall back to the pivot table defined at notebook level.
            ratings_df = r_df
        result_matrix = pd.DataFrame(self.full_matrix())
        result_matrix.columns = ratings_df.columns
        result_matrix = result_matrix.assign(user_id=ratings_df.index.values)
        return result_matrix.melt(id_vars=["user_id"])

In [258]:
# Wide user x book table: one row per user_id, one column per isbn, cells hold
# book_rating (NaN where the user has not rated the book).
r_df = reduced_books_users_ratings.pivot(
    values="book_rating",
    index="user_id",
    columns="isbn",
)
# Dense numeric matrix for the factorization; unrated cells become 0.
R = np.array(r_df.fillna(value=0))

In [81]:
# Configure the factorization: 20 latent features per user/item, alpha scales
# every SGD update, beta scales the shrinkage term, and training makes 100
# shuffled passes over the observed ratings. No fitting happens until train().
mf = MF(R, K=20, alpha=0.001, beta=0.01, iterations=100)

In [83]:
training_process = mf.train()
print()
print("P x Q:")
print(mf.full_matrix())
print()
%time

Iteration: 20 ; error = 542.3172
Iteration: 40 ; error = 516.5185
Iteration: 60 ; error = 495.7338
Iteration: 80 ; error = 480.6897
Iteration: 100 ; error = 466.1268

P x Q:
[[7.801397   7.4851016  7.5479903  ... 7.55785513 7.69406574 7.56020718]
 [8.52520558 8.236721   8.26961773 ... 8.25975787 8.3842095  8.26311067]
 [7.57061691 7.28504311 7.34418632 ... 7.31970641 7.43461142 7.32015768]
 ...
 [8.17859599 7.91839269 7.95111911 ... 7.92944884 8.05400597 7.95076548]
 [7.55525814 7.31695502 7.33235362 ... 7.33523301 7.47733905 7.34535906]
 [8.05554403 7.73867561 7.81113586 ... 7.80454815 7.93799655 7.8041228 ]]



In [343]:
# Predicted rating for every (user, book) pair, labelled with the pivot
# table's column labels and user ids, then reshaped to long format:
# one row per user/book pair with the prediction in `value`.
full_matrix_result = mf.full_matrix()
result_matrix = pd.DataFrame(full_matrix_result, columns=r_df.columns)
result_matrix = result_matrix.assign(user_id=r_df.index.values)
result_matrix_melted = pd.melt(result_matrix, id_vars=["user_id"])

In [349]:
def top_recs(user_id, df, book_titles_df=None, n=10):
    """Return the highest-scoring predictions for one user, with book titles.

    Parameters
    ----------
    user_id : value matched against ``df.user_id``.
    df : long-format predictions with ``user_id``, ``isbn`` and ``value`` columns.
    book_titles_df : DataFrame with an ``isbn`` column plus title information.
        Defaults to the notebook-level ``book_titles``, looked up when the
        function is called so a re-created lookup table is picked up.
    n : number of top-scoring rows to keep before joining titles (default 10).

    Returns
    -------
    DataFrame of the selected rows joined to ``book_titles_df`` on ``isbn``.
    The join is an inner join, so rows whose isbn has no entry in
    ``book_titles_df`` are dropped.
    """
    if book_titles_df is None:
        # Resolved at call time rather than frozen at definition time.
        book_titles_df = book_titles
    user_rows = df[df.user_id == user_id]
    recs = user_rows.sort_values(by="value", ascending=False).iloc[:n]
    return pd.merge(recs, book_titles_df, how="inner", on="isbn")

In [350]:
# Highest predicted ratings for user 16877, joined to book titles. The
# predictions cover every (user, book) pair in the melted matrix.
# NOTE(review): nothing here excludes books the user has already rated — confirm
# whether that is intended for a recommendation list.
top_recs(16877, result_matrix_melted)

Unnamed: 0,user_id,isbn,value,book_title
0,16877,0439425220,9.64009,Harry Potter and the Chamber of Secrets Postca...
1,16877,0743454529,9.542374,"My Sister's Keeper : A Novel (Picoult, Jodi)"
2,16877,0836213319,9.495734,Dilbert: A Book of Postcards
3,16877,0836218620,9.40336,Weirdos From Another Planet!
4,16877,0345339738,9.401936,"The Return of the King (The Lord of the Rings,..."
5,16877,0060256672,9.398157,Where the Sidewalk Ends : Poems and Drawings
6,16877,0618002235,9.396621,"The Two Towers (The Lord of the Rings, Part 2)"
7,16877,0140143505,9.39424,84 Charing Cross Road
8,16877,0836220889,9.377099,Calvin and Hobbes
9,16877,067168390X,9.358269,Lonesome Dove
