In [1]:
import pandas as pd
import numpy as np
import math
import copy
from sklearn.metrics import *
from surprise import Dataset, Reader
from surprise.model_selection import cross_validate

<h1> Get Dataset </h1>

In [4]:
from sklearn.model_selection import train_test_split
from pandas import DataFrame
ratings_data = pd.read_csv("./Books Dataset/Amazon Book Reviews.csv", usecols = [0,1,2,6], header=0, names = ["user_id", "book_id", "book_name", "rating"])
ratings_data = ratings_data.assign(newUserId = ratings_data['user_id'].astype('category').cat.codes)
ratings_data = ratings_data.assign(newBookId = ratings_data['book_id'].astype('category').cat.codes)
ratings_data.drop(["user_id","book_id"], axis=1, inplace=True)
cols = ["newBookId","book_name"]
books = ratings_data[cols].copy()
books.drop_duplicates(inplace=True)
print(books)
reader = Reader(rating_scale=(1,5))
train_data = Dataset.load_from_df(ratings_data[['newUserId', 'newBookId', 'rating']], reader)
trainset = train_data.build_full_trainset()

      newBookId                                          book_name
0            19                                           Red Tide
11           20                                      No Man's Land
36           21                                   Day of Atonement
61           22  he Jaguar Knights: A Chronicle of the King's B...
72           23                                   The Last Goodbye
...         ...                                                ...
7268        165  The Brother of Jesus: The Dramatic Story &amp;...
7279        166                               The Great Unraveling
7331        167        By the Shores of Silver Lake (Little House)
7353        168                     The Long Winter (Little House)
7383        169                A Good Yarn (Blossom Street, No. 2)

[266 rows x 2 columns]


<h1> SVD (Surprise lib) </h1>

In [5]:
from surprise import SVD
from surprise.model_selection import cross_validate
svd = SVD(n_factors=100, n_epochs=5, biased=True, random_state=15, verbose=True)
results = cross_validate(svd, train_data, measures=['RMSE', 'MAE'], cv=3, verbose=True)

Processing epoch 0
Processing epoch 1
Processing epoch 2
Processing epoch 3
Processing epoch 4
Processing epoch 0
Processing epoch 1
Processing epoch 2
Processing epoch 3
Processing epoch 4
Processing epoch 0
Processing epoch 1
Processing epoch 2
Processing epoch 3
Processing epoch 4
Evaluating RMSE, MAE of algorithm SVD on 3 split(s).

                  Fold 1  Fold 2  Fold 3  Mean    Std     
RMSE (testset)    1.0299  1.0298  1.0642  1.0413  0.0162  
MAE (testset)     0.8104  0.8078  0.8456  0.8212  0.0173  
Fit time          0.10    0.08    0.09    0.09    0.00    
Test time         0.02    0.01    0.02    0.02    0.00    


In [6]:
model=svd.fit(trainset)

Processing epoch 0
Processing epoch 1
Processing epoch 2
Processing epoch 3
Processing epoch 4


In [7]:
def estSort(pred):
    return pred.est
def recommend_books(user_id) :
    predictions = []
    for x in pd.unique(ratings_data['newBookId']):
        predictions.append(svd.predict(uid=0, iid=x))
    predictions.sort(key = estSort, reverse=True)
    recommendations = [x.iid for x in predictions[:7]]
    return recommendations

In [8]:
book_indices=recommend_books(user_id=0)
recommendations = [books.values[x] for x in book_indices]
recommendations = DataFrame(recommendations, columns=["book_id", "book_name"])

In [9]:
print(recommendations)

   book_id                                          book_name
0       30                The Stingray Shuffle (Serge Storms)
1       96  A Year with C. S. Lewis: Daily Readings from H...
2      119  A Crack in the Edge of the World: America and ...
3       44                                        Goldengrove
4      119  A Crack in the Edge of the World: America and ...
5      119  A Crack in the Edge of the World: America and ...
6        9                                      Deadly Double


<h1> SVD (Scipy lib) </h1>

In [9]:
movie_ratings = ratings_data.pivot(
    index='newUserId',
    columns='newBookId',
    values='rating'
).fillna(0)

In [10]:
R = movie_ratings.values
from scipy.sparse.linalg import svds
U, sigma, Vt = svds(R, k = 50)

In [11]:
sigma = np.diag(sigma)
all_user_predicted_ratings = np.dot(np.dot(U, sigma), Vt) 
print(all_user_predicted_ratings)

[[-3.75112507e-03 -2.89592643e-03 -1.32330333e-03 ...  1.00985774e-03
   1.54498895e-03 -4.99066258e-03]
 [-1.13728763e-02  1.70883417e-02 -6.67785829e-03 ...  2.89990740e-02
  -3.64796942e-02  5.35367015e-03]
 [-5.27405513e-04 -3.33739289e-04 -5.90329619e-04 ...  3.92585431e-04
  -3.79413993e-04 -2.17999577e-05]
 ...
 [-3.40141125e-03 -6.65392984e-04  1.07192237e-02 ...  2.08001877e-02
  -1.51528338e-02 -2.57933247e-04]
 [-2.63702757e-03 -1.66869645e-03 -2.95164809e-03 ...  1.96292715e-03
  -1.89706997e-03 -1.08999789e-04]
 [ 8.90993930e-04  2.16761741e-04  1.14542531e-04 ... -3.47980808e-04
   2.26759025e-04 -5.62255188e-04]]


In [12]:
def recommend_books(predictions, user_index, books, org_matrix, num_recommendations=5):
    sorted_books = (-predictions[user_index]).argsort()
    boolArr = (org_matrix[user_index] == 0)
    unknown_sorted_books = [x for x in sorted_books if boolArr[x]]
    recommendations = [books[x] for x in unknown_sorted_books][:num_recommendations]               
    return recommendations

In [40]:
top_books = recommend_books(all_user_predicted_ratings, 1, books.values, R, 4)
top_books = DataFrame(top_books, columns=["book_id", "book_name"])
print(top_books)

   book_id                                          book_name
0       26  The Intelligent Investor: The Definitive Book ...
1       23                                   The Last Goodbye
2       20                                      No Man's Land
3       21                                   Day of Atonement


<h2> Similar books </h2>

In [30]:
from numpy import dot
from numpy.linalg import norm

def cosine_similarity(list_1, list_2):
    cos_sim = dot(list_1, list_2) / (norm(list_1) * norm(list_2))
    return cos_sim

def top_cosine_similarity(data, book_id, top_n=10):
    index = book_id - 1 
    book_row = data[index, :]
    similarity_values = np.zeros(shape=[data.shape[0]-1,2])
    k=0
    for i in range(data.shape[0]):
        if i != index :
            val = cosine_similarity(book_row, data[i, :])
            similarity_values[k] = np.array([i, val])
            k = k + 1
            
    sort_indices = np.argsort(-similarity_values[:, 1])
    return sort_indices[:top_n]

In [35]:
top_book_indices = top_cosine_similarity(Vt.T, 13, 4)
similar_books = [books.values[x] for x in top_book_indices]
similar_books = DataFrame(similar_books, columns=["book_id", "book_name"])

In [36]:
print("Books similar to "+books.book_name[12]+"are:\n")
print(similar_books)

Books similar to No Man's Landare:

   book_id                                          book_name
0       26  The Intelligent Investor: The Definitive Book ...
1       20                                      No Man's Land
2       23                                   The Last Goodbye
3       19                                           Red Tide
