In [1]:
import pandas as pd
import numpy as np
import math
import copy
from sklearn.metrics import *
from surprise import Dataset, Reader
from surprise.model_selection import cross_validate

<h1> Get Dataset </h1>

In [22]:
from sklearn.model_selection import train_test_split
from pandas import DataFrame
ratings_data = pd.read_csv("./Recommendation System - Sentiment analysis/newAmazonBooksRatings.csv", usecols = [0,1,2,5,6], header=0, names = ["user_id", "book_id", "book_name", "user_rating", "sentiment_rating"])
ratings_data = ratings_data.assign(newUserId = ratings_data['user_id'].astype('category').cat.codes)
ratings_data = ratings_data.assign(newBookId = ratings_data['book_id'].astype('category').cat.codes)
newRatingCol = ratings_data.loc[:, "user_rating":"sentiment_rating"]
ratings_data["mean_rating"] = newRatingCol.mean(axis=1)
ratings_data.drop(["user_id","book_id", "user_rating", "sentiment_rating"], axis=1, inplace=True)
cols = ["newBookId","book_name"]
books = ratings_data[cols].copy()
books.drop_duplicates(inplace=True)
books = books.reset_index(drop=True)
print(ratings_data)
print(books)
reader = Reader(rating_scale=(1,5))
train_data = Dataset.load_from_df(ratings_data[["newUserId", "newBookId", "mean_rating"]], reader)
trainset = train_data.build_full_trainset()

                                 book_name  newUserId  newBookId  mean_rating
0          Red Tide: A Novel (Ford, G. M.)        243         19          3.5
1          Red Tide: A Novel (Ford, G. M.)       5467         19          4.5
2          Red Tide: A Novel (Ford, G. M.)       1549         19          4.5
3          Red Tide: A Novel (Ford, G. M.)        545         19          3.0
4          Red Tide: A Novel (Ford, G. M.)       2719         19          3.5
...                                    ...        ...        ...          ...
7431  A Good Yarn CD (The Knitting Series)       1839        169          4.5
7432  A Good Yarn CD (The Knitting Series)       4562        169          3.5
7433  A Good Yarn CD (The Knitting Series)       2329        169          5.0
7434  A Good Yarn CD (The Knitting Series)       5575        169          4.5
7435  A Good Yarn CD (The Knitting Series)       1449        169          5.0

[7436 rows x 4 columns]
     newBookId                         

<h1> SVD (Surprise lib) </h1>

In [4]:
from surprise import SVD
from surprise.model_selection import cross_validate
svd = SVD(n_factors=100, n_epochs=5, biased=True, random_state=15, verbose=True)
results = cross_validate(svd, train_data, measures=['RMSE', 'MAE'], cv=3, verbose=True)

Processing epoch 0
Processing epoch 1
Processing epoch 2
Processing epoch 3
Processing epoch 4
Processing epoch 0
Processing epoch 1
Processing epoch 2
Processing epoch 3
Processing epoch 4
Processing epoch 0
Processing epoch 1
Processing epoch 2
Processing epoch 3
Processing epoch 4
Evaluating RMSE, MAE of algorithm SVD on 3 split(s).

                  Fold 1  Fold 2  Fold 3  Mean    Std     
RMSE (testset)    0.9223  0.9495  0.8914  0.9211  0.0237  
MAE (testset)     0.7310  0.7503  0.7193  0.7335  0.0128  
Fit time          0.09    0.09    0.09    0.09    0.00    
Test time         0.01    0.02    0.02    0.02    0.00    


In [5]:
model=svd.fit(trainset)

Processing epoch 0
Processing epoch 1
Processing epoch 2
Processing epoch 3
Processing epoch 4


In [10]:
def estSort(pred):
    return pred.est
def recommend_books(user_id, num_recommendations) :
    predictions = []
    for x in pd.unique(ratings_data['newBookId']):
        predictions.append(svd.predict(uid=0, iid=x))
    predictions.sort(key = estSort, reverse=True)
    recommendations = [x.iid for x in predictions[:num_recommendations]]
    return recommendations

In [11]:
book_indices=recommend_books(user_id=101, num_recommendations=7)
recommendations = [books.values[x] for x in book_indices]
recommendations = DataFrame(recommendations, columns=["book_id", "book_name"])

In [12]:
print(recommendations)

   book_id                                          book_name
0      112                            Autobiography of a Face
1      169               A Good Yarn CD (The Knitting Series)
2      102                        Hell at the Breech: A Novel
3      133      The Novice: The Black Magician Trilogy Book 2
4       18  Sacred Causes: The Clash of Religion and Polit...
5       44                               Goldengrove: A Novel
6      105                                 Indelible: A Novel


<h1> SVD (Scipy lib) </h1>

In [13]:
movie_ratings = ratings_data.pivot(
    index='newUserId',
    columns='newBookId',
    values='mean_rating'
).fillna(0)

In [14]:
R = movie_ratings.values
from scipy.sparse.linalg import svds
U, sigma, Vt = svds(R, k = 50)

In [15]:
sigma = np.diag(sigma)
all_user_predicted_ratings = np.dot(np.dot(U, sigma), Vt) 
print(all_user_predicted_ratings)

[[-7.67883834e-03 -2.46037610e-03 -1.53547246e-03 ... -7.59475765e-04
  -2.14462131e-04 -3.39814750e-03]
 [-2.30580147e-03  1.62222794e-03 -5.68060208e-03 ...  3.13255724e-02
  -2.91525306e-02  5.51109552e-03]
 [-1.03490609e-03 -4.28966687e-04 -1.14257363e-03 ...  2.93458937e-04
   1.42909121e-04 -6.58336153e-05]
 ...
 [-7.18186732e-03 -4.92080564e-03  1.28712916e-02 ...  2.66802475e-02
  -1.85588184e-02  1.15133956e-03]
 [-2.06981217e-03 -8.57933373e-04 -2.28514725e-03 ...  5.86917874e-04
   2.85818241e-04 -1.31667231e-04]
 [-4.06273071e-04 -4.34993365e-04 -3.47192929e-04 ... -2.69687018e-04
  -2.78287377e-05 -1.15233850e-03]]


In [16]:
def recommend_books(predictions, user_index, books, org_matrix, num_recommendations=5):
    sorted_books = (-predictions[user_index]).argsort()
    boolArr = (org_matrix[user_index] == 0)
    unknown_sorted_books = [x for x in sorted_books if boolArr[x]]
    recommendations = [books[x] for x in unknown_sorted_books][:num_recommendations]               
    return recommendations

In [17]:
top_books = recommend_books(all_user_predicted_ratings, 1, books.values, R, 7)
top_books = DataFrame(top_books, columns=["book_id", "book_name"])
print(top_books)

   book_id                                          book_name
0      144  Septimus Heap, Book Two: Flyte: 02 (Septimus H...
1       46                                          The Nanny
2       95  Eels: An Exploration, from New Zealand to the ...
3       57  The Barefoot Princess: The Lost Princesses #2 ...
4       58  The Prince Kidnaps a Bride: The Lost Princesse...
5       90  A Dark Champion: 3 (Brotherhood of the Sword S...
6      167  By the Shores of Silver Lake: Full Color Editi...


<h2> Similar books </h2>

In [18]:
from numpy import dot
from numpy.linalg import norm

def cosine_similarity(list_1, list_2):
    cos_sim = dot(list_1, list_2) / (norm(list_1) * norm(list_2))
    return cos_sim

def top_cosine_similarity(data, book_id, top_n=10):
    index = book_id - 1 
    book_row = data[index, :]
    similarity_values = np.zeros(shape=[data.shape[0]-1,2])
    k=0
    for i in range(data.shape[0]):
        if i != index :
            val = cosine_similarity(book_row, data[i, :])
            similarity_values[k] = np.array([i, val])
            k = k + 1
            
    sort_indices = np.argsort(-similarity_values[:, 1])
    return sort_indices[:top_n]

In [23]:
top_book_indices = top_cosine_similarity(Vt.T, 13)
similar_books = [books.values[x] for x in top_book_indices]
similar_books = DataFrame(similar_books, columns=["book_id", "book_name"])

In [25]:
print("Books similar to "+books.book_name[12]+" are:\n")
print(similar_books)

Books similar to Valdez Is Coming Low Price are:

   book_id                                          book_name
0      137  Warlord: A Life of Winston Churchill at War, 1...
1       31          Cadillac Beach: A Novel: 6 (Serge Storms)
2       96  A Year with C. S. Lewis: Daily Readings from H...
3       41                     Gentlemen and Players: A Novel
4        0                                Mission Compromised
5       99  Scots on the Rocks: A Bed-and-Breakfast Myster...
6      128  Where God Was Born: A Journey by Land to the R...
7      158  Useful Idiots: How Liberals Got It Wrong in th...
8      131  Busting Vegas: The MIT Whiz Kid Who Brought th...
9       11       Fluke: Or, I Know Why the Winged Whale Sings
