In [2]:
import pandas as pd
import numpy as np
import math
import copy
from sklearn.metrics import *
from surprise import Dataset, Reader
from surprise.model_selection import cross_validate

<h1> Get Dataset </h1>

In [7]:
from sklearn.model_selection import train_test_split
from pandas import DataFrame
ratings_data = pd.read_csv("./Recommendation System - Sentiment analysis/newAmazonBooksRatings.csv", usecols = [0,1,2,5,6], header=0, names = ["user_id", "book_id", "book_name", "user_rating", "sentiment_rating"])
ratings_data = ratings_data.assign(newUserId = ratings_data['user_id'].astype('category').cat.codes)
ratings_data = ratings_data.assign(newBookId = ratings_data['book_id'].astype('category').cat.codes)
newRatingCol = ratings_data.loc[:, "user_rating":"sentiment_rating"]
ratings_data["mean_rating"] = newRatingCol.mean(axis=1)
ratings_data.drop(["user_id","book_id", "user_rating", "sentiment_rating"], axis=1, inplace=True)
cols = ["newBookId","book_name"]
books = ratings_data[cols].copy()
print(ratings_data)
print(books)
reader = Reader(rating_scale=(1,5))
train_data = Dataset.load_from_df(ratings_data[["newUserId", "newBookId", "mean_rating"]], reader)
trainset = train_data.build_full_trainset()

                                book_name  newUserId  newBookId  mean_rating
0                                Red Tide        243         19          3.5
1                                Red Tide       5467         19          4.5
2                                Red Tide       1549         19          4.5
3                                Red Tide        545         19          3.0
4                                Red Tide       2719         19          3.5
...                                   ...        ...        ...          ...
7431  A Good Yarn (Blossom Street, No. 2)       1839        169          4.5
7432  A Good Yarn (Blossom Street, No. 2)       4562        169          3.5
7433  A Good Yarn (Blossom Street, No. 2)       2329        169          5.0
7434  A Good Yarn (Blossom Street, No. 2)       5575        169          4.5
7435  A Good Yarn (Blossom Street, No. 2)       1449        169          5.0

[7436 rows x 4 columns]
      newBookId                            book_nam

<h1> SVD (Surprise lib) </h1>

In [8]:
from surprise import SVD
from surprise.model_selection import cross_validate
svd = SVD(n_factors=100, n_epochs=5, biased=True, random_state=15, verbose=True)
results = cross_validate(svd, train_data, measures=['RMSE', 'MAE'], cv=3, verbose=True)

Processing epoch 0
Processing epoch 1
Processing epoch 2
Processing epoch 3
Processing epoch 4
Processing epoch 0
Processing epoch 1
Processing epoch 2
Processing epoch 3
Processing epoch 4
Processing epoch 0
Processing epoch 1
Processing epoch 2
Processing epoch 3
Processing epoch 4
Evaluating RMSE, MAE of algorithm SVD on 3 split(s).

                  Fold 1  Fold 2  Fold 3  Mean    Std     
RMSE (testset)    0.9035  0.9020  0.9580  0.9212  0.0261  
MAE (testset)     0.7229  0.7221  0.7577  0.7342  0.0166  
Fit time          0.39    0.24    0.25    0.29    0.07    
Test time         0.05    0.03    0.02    0.03    0.01    


In [9]:
model=svd.fit(trainset)

Processing epoch 0
Processing epoch 1
Processing epoch 2
Processing epoch 3
Processing epoch 4


In [13]:
def estSort(pred):
    return pred.est
def recommend_books(user_id) :
    predictions = []
    for x in pd.unique(ratings_data['newBookId']):
        predictions.append(svd.predict(uid=0, iid=x))
    predictions.sort(key = estSort, reverse=True)
    recommendations = [x.iid for x in predictions[:4]]
    return recommendations

In [21]:
book_indices=recommend_books(user_id=101)
recommendations = [books.values[x] for x in book_indices]
recommendations = DataFrame(recommendations, columns=["book_id", "book_name"])

In [22]:
print(recommendations)

   book_id                                          book_name
0       24  The Secret Life of Lobsters: How Fishermen and...
1       26  The Intelligent Investor: The Definitive Book ...
2       23                                   The Last Goodbye
3       26  The Intelligent Investor: The Definitive Book ...


<h1> SVD (Scipy lib) </h1>

In [23]:
movie_ratings = ratings_data.pivot(
    index='newUserId',
    columns='newBookId',
    values='mean_rating'
).fillna(0)

In [24]:
R = movie_ratings.values
from scipy.sparse.linalg import svds
U, sigma, Vt = svds(R, k = 50)

In [25]:
sigma = np.diag(sigma)
all_user_predicted_ratings = np.dot(np.dot(U, sigma), Vt) 
print(all_user_predicted_ratings)

[[-7.67297279e-03 -2.46203644e-03 -1.53549611e-03 ... -7.59305541e-04
  -2.14229876e-04 -3.39776509e-03]
 [-2.30649191e-03  1.62221239e-03 -5.68062077e-03 ...  3.13255962e-02
  -2.91524783e-02  5.51116878e-03]
 [-1.03476451e-03 -4.28978803e-04 -1.14257654e-03 ...  2.93433371e-04
   1.42871626e-04 -6.59499951e-05]
 ...
 [-7.17942602e-03 -4.92225219e-03  1.28713142e-02 ...  2.66799780e-02
  -1.85592558e-02  1.15254267e-03]
 [-2.06952902e-03 -8.57957606e-04 -2.28515309e-03 ...  5.86866742e-04
   2.85743251e-04 -1.31899990e-04]
 [-4.09180659e-04 -4.35679812e-04 -3.47127739e-04 ... -2.69955653e-04
  -2.82658456e-05 -1.15012785e-03]]


In [26]:
def recommend_books(predictions, user_index, books, org_matrix, num_recommendations=5):
    sorted_books = (-predictions[user_index]).argsort()
    boolArr = (org_matrix[user_index] == 0)
    unknown_sorted_books = [x for x in sorted_books if boolArr[x]]
    recommendations = [books[x] for x in unknown_sorted_books][:num_recommendations]               
    return recommendations

In [27]:
top_books = recommend_books(all_user_predicted_ratings, 1, books.values, R, 4)
top_books = DataFrame(top_books, columns=["book_id", "book_name"])
print(top_books)

   book_id                                          book_name
0       26  The Intelligent Investor: The Definitive Book ...
1       20                                      No Man's Land
2       23                                   The Last Goodbye
3       21                                   Day of Atonement


<h2> Similar books </h2>

In [28]:
from numpy import dot
from numpy.linalg import norm

def cosine_similarity(list_1, list_2):
    cos_sim = dot(list_1, list_2) / (norm(list_1) * norm(list_2))
    return cos_sim

def top_cosine_similarity(data, book_id, top_n=10):
    index = book_id - 1 
    book_row = data[index, :]
    similarity_values = np.zeros(shape=[data.shape[0]-1,2])
    k=0
    for i in range(data.shape[0]):
        if i != index :
            val = cosine_similarity(book_row, data[i, :])
            similarity_values[k] = np.array([i, val])
            k = k + 1
            
    sort_indices = np.argsort(-similarity_values[:, 1])
    return sort_indices[:top_n]

In [29]:
top_book_indices = top_cosine_similarity(Vt.T, 13, 4)
similar_books = [books.values[x] for x in top_book_indices]
similar_books = DataFrame(similar_books, columns=["book_id", "book_name"])

In [30]:
print("Books similar to "+books.book_name[12]+"are:\n")
print(similar_books)

Books similar to No Man's Landare:

   book_id                                          book_name
0       26  The Intelligent Investor: The Definitive Book ...
1       20                                      No Man's Land
2       23                                   The Last Goodbye
3       20                                      No Man's Land
