In [None]:
!pip install scikit-surprise

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
from surprise import Dataset
from surprise import Reader
from surprise import KNNWithMeans
from surprise import accuracy
from surprise.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from collections import defaultdict
import pandas as pd
import numpy as np

from google.colab import drive
drive.mount('/content/drive')

pd.options.display.max_rows = 999

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Model Building - Non-negative matrix factorization

In [None]:
from sklearn.decomposition import NMF
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [None]:

df = pd.read_csv('/content/drive/My Drive/final_rating_content_based.csv')

df.head()

Unnamed: 0,book_id,book_title,user_id,helpfulness,rating,review
0,B000N6DDJQ,The Scarlet Letter A Romance,AUM3YMZ0YRJE0,0.5,5.0,"WHen I finally started reading this classic, I..."
1,B000N6DDJQ,The Scarlet Letter A Romance,AMKZHBOK7VMQR,1.0,5.0,Hawthorne wrote a masterful work of historical...
2,B000N6DDJQ,The Scarlet Letter A Romance,AWLFVCT9128JV,,4.0,I don't suppose anything can ruin a book more ...
3,B000N6DDJQ,The Scarlet Letter A Romance,ABN5K7K1TM1QA,0.904762,5.0,"""All have sinned and fall short of the glory o..."
4,B000N6DDJQ,The Scarlet Letter A Romance,A3IKBHODOTYYHM,0.866667,4.0,I think the many readers who were forced to pa...


In [None]:
user_ids = list(df.user_id)
book_ids = list(df.book_id)
ratings = list(df.rating)

In [None]:
unique_user_ids = list(set(user_ids))
unique_book_ids = list(set(book_ids))

num_users = len(unique_user_ids)
num_books = len(set(book_ids))

user_item_matrix = np.zeros((num_users, num_books))

In [None]:
user_item_matrix

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [None]:
for user_id, book_id, rating in zip(user_ids, book_ids, ratings):
  user_index = unique_user_ids.index(user_id)
  book_index = unique_book_ids.index(book_id)
  user_item_matrix[user_index, book_index] = rating

In [None]:
user_item_matrix

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 5., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [None]:
k=3
model = NMF(n_components = k, init='random', random_state=42)
user_features = model.fit_transform(user_item_matrix)
book_features = model.components_

In [None]:
book_similarity = cosine_similarity(book_features.T)

In [None]:
def get_recommendations(user_id, num_recommendations):
    user_index = user_id  # If using user index instead of user ID
    user_ratings = user_item_matrix[user_index, :]

    # Find unrated books by the user
    unrated_books = np.where(user_ratings == 0)[0]
    book_similarity_scores = book_similarity[:, unrated_books]
    average_similarity_scores = np.mean(book_similarity_scores, axis=0)
    top_books_indices = np.argsort(average_similarity_scores)[::-2][:num_recommendations]

    unique_book_titles = df.book_title

    recommended_books = [unique_book_titles[unrated_books[book_index]] for book_index in top_books_indices]
    return recommended_books

In [None]:
user_id = 4  # Index of the user for whom to generate recommendations
num_recommendations = 5
recommendations = get_recommendations(user_id, num_recommendations)

print("Recommended Books for User", user_id, ":")
for book in recommendations:
    print("Book", book)

Recommended Books for User 4 :
Book Lord of the flies
Book The Mayor of Casterbridge
Book The Stranger
Book Hard Times
Book Hound of the Baskervilles (Lrs Large Print Heritage Series)


# Model Building - Tfidf Vectorizer

In [None]:
df = pd.read_csv('/content/drive/My Drive/final_ratings.csv')
df.columns

Index(['book_id', 'book_title', 'user_id', 'helpfulness', 'rating', 'review'], dtype='object')

In [None]:
reader = Reader(rating_scale=(1,5))
data = Dataset.load_from_df(df[['user_id', 'book_id','rating']], reader) #kad dodam helfulness kaze too many to unpack

In [None]:
trainset,testset = train_test_split(data, test_size=0.2, random_state=42)

In [None]:
trainset.all_users()

range(0, 193)

In [None]:
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(df['review'])

In [None]:
cosine_sim = cosine_similarity(tfidf_matrix) #sklearn

In [None]:
cosine_sim

array([[1.        , 0.10812172, 0.19099125, ..., 0.15778707, 0.08694615,
        0.17846448],
       [0.10812172, 1.        , 0.28979513, ..., 0.11010165, 0.06841154,
        0.10474602],
       [0.19099125, 0.28979513, 1.        , ..., 0.23149459, 0.12882581,
        0.25186359],
       ...,
       [0.15778707, 0.11010165, 0.23149459, ..., 1.        , 0.35497681,
        0.33343506],
       [0.08694615, 0.06841154, 0.12882581, ..., 0.35497681, 1.        ,
        0.30080535],
       [0.17846448, 0.10474602, 0.25186359, ..., 0.33343506, 0.30080535,
        1.        ]])

In [None]:
from surprise.model_selection.search import GridSearchCV

k_params = list(range(0, 11, 1))
knn_params = {'k' : k_params}
grid = GridSearchCV(KNNWithMeans, knn_params, cv = 3, measures = ['rmse', 'mse'])
grid.fit(data)

Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computi

In [None]:
mse_mean = grid.cv_results['mean_test_mse'].round(3)
rmse_mean = grid.cv_results['mean_test_rmse'].round(3)
param_k = grid.cv_results['param_k']

print("\tRMSE\t\tMSE")
for k in param_k:
  print(f"k : {k}\t{rmse_mean[k]}\t\t{mse_mean[k]}")

print(f"\nBest score is for k = {grid.best_params['rmse']['k']}")
print(f"Best value of RMSE is {round(grid.best_score['rmse'], 4)}")
print(f"Best value of MSE is {round(grid.best_score['mse'], 4)}")

	RMSE		MSE
k : 0	0.603		0.364
k : 1	0.545		0.297
k : 2	0.506		0.256
k : 3	0.498		0.248
k : 4	0.498		0.249
k : 5	0.501		0.251
k : 6	0.506		0.256
k : 7	0.51		0.26
k : 8	0.515		0.266
k : 9	0.52		0.27
k : 10	0.524		0.275

Best score is for k = 3
Best value of RMSE is 0.4977
Best value of MSE is 0.2477


In [None]:
def precision_recall_at_k(predictions, k=10, threshold=3):
    """Return precision and recall at k metrics for each user"""

    # First map the predictions to each user.
    user_pred_true = defaultdict(list)
    for uid, _, true_r, pred, _ in predictions:
        user_pred_true[uid].append((pred, true_r))

    precisions = dict()
    recalls = dict()
    for uid, user_ratings in user_pred_true.items():

        # Sort user ratings by estimated value
        user_ratings.sort(key=lambda x: x[0], reverse=True)

        # Number of relevant items
        n_rel = sum((true_r >= threshold) for (_, true_r) in user_ratings)

        # Number of recommended items in top k
        n_rec_k = sum((pred >= threshold) for (pred, _) in user_ratings[:k])

        # Number of relevant and recommended items in top k
        n_rel_and_rec_k = sum(
            ((true_r >= threshold) and (pred >= threshold))
            for (pred, true_r) in user_ratings[:k]
        )

        # Precision@K: Proportion of recommended items that are relevant
        # When n_rec_k is 0, Precision is undefined. We here set it to 0.

        precisions[uid] = n_rel_and_rec_k / n_rec_k if n_rec_k != 0 else 0

        # Recall@K: Proportion of relevant items that are recommended
        # When n_rel is 0, Recall is undefined. We here set it to 0.

        recalls[uid] = n_rel_and_rec_k / n_rel if n_rel != 0 else 0

    return precisions, recalls

Model User Based With Cosine Similarities

In [None]:
algo = KNNWithMeans(sim_options={'name' : 'cosine','user_based' : True, 'k':3})
algo.fit(trainset)

Computing the cosine similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNWithMeans at 0x7fefc2daecb0>

In [None]:
predictions = algo.test(testset)

In [None]:
precisions_user_based, recalls_user_based = precision_recall_at_k(predictions, k=10, threshold=3.5)
print(f"Precision at k: {round(sum(prec for prec in precisions_user_based.values()) / len(precisions_user_based), 5) * 100}%")
print(f"Recall at k: {round(sum(rec for rec in recalls_user_based.values()) / len(recalls_user_based), 5) * 100}%")

Precision at k: 87.726%
Recall at k: 79.338%


In [None]:
rmse = accuracy.rmse(predictions)
mse = accuracy.mse(predictions)

RMSE: 0.5747
MSE: 0.3302


In [None]:
def get_recommendations(user_id, n=20):
    # Find the top n most similar users based on their textual reviews
    user_index = df[df['user_id'] == user_id].index[0]
    sim_scores = list(enumerate(cosine_sim[user_index]))
    #print(sim_scores)
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    #print(sim_scores)
    sim_scores = sim_scores[1:n+1]
    #print(sim_scores)
    sim_users = [df['user_id'][i[0]] for i in sim_scores]
   # print(sim_users)

    # Find the items that the similar users have rated highly
    top_items = {}
    for user in sim_users:
        items = df[df['user_id'] == user]['book_id']
        for item in items:
            if item not in top_items:
                top_items[item] = 1
            else:
                top_items[item] += 1

    # Sort the items by the number of times they have been recommended
    top_items = sorted(top_items.items(), key=lambda x: x[1], reverse=True)
    #print(top_items)

    # Return the top n items as recommendations
    lst = [i[0] for i in top_items[:n]]
    rec_books = set()
    for i in lst:
        book_name = df[df['book_id']==i].iloc[0]['book_title']
        rec_books.add(book_name)
    return list(rec_books)[:10]

In [None]:
get_recommendations('A2SHQJP6PNQTLD')


['The Postman Always Rings Twice',
 'Slaughterhouse-Five',
 'The Scarlet Letter',
 '1984',
 'Lord of the flies',
 'The Old Man and the Sea',
 'The Scarlet Letter A Romance',
 'The Great Gatsby',
 'Hiroshima']

Model User Based With Pearson Correlations

In [None]:
model_user_based = KNNWithMeans(k=70, sim_options={'name': 'pearson_baseline', 'user_based': True})
model_user_based.fit(trainset)

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNWithMeans at 0x7fefc2c43340>

In [None]:
from surprise import accuracy
predictions_user_based = model_user_based.test(testset)
rmse = accuracy.rmse(predictions_user_based)
mae = accuracy.mae

RMSE: 0.4428


In [None]:
from collections import defaultdict

In [None]:
def precision_recall_at_k(predictions, k=10, threshold=3):
    """Return precision and recall at k metrics for each user"""

    # First map the predictions to each user.
    user_pred_true = defaultdict(list)
    for uid, _, true_r, pred, _ in predictions:
        user_pred_true[uid].append((pred, true_r))

    precisions = dict()
    recalls = dict()
    for uid, user_ratings in user_pred_true.items():

        # Sort user ratings by estimated value
        user_ratings.sort(key=lambda x: x[0], reverse=True)

        # Number of relevant items
        n_rel = sum((true_r >= threshold) for (_, true_r) in user_ratings)

        # Number of recommended items in top k
        n_rec_k = sum((pred >= threshold) for (pred, _) in user_ratings[:k])

        # Number of relevant and recommended items in top k
        n_rel_and_rec_k = sum(
            ((true_r >= threshold) and (pred >= threshold))
            for (pred, true_r) in user_ratings[:k]
        )

        # Precision@K: Proportion of recommended items that are relevant
        # When n_rec_k is 0, Precision is undefined. We here set it to 0.

        precisions[uid] = n_rel_and_rec_k / n_rec_k if n_rec_k != 0 else 0

        # Recall@K: Proportion of relevant items that are recommended
        # When n_rel is 0, Recall is undefined. We here set it to 0.

        recalls[uid] = n_rel_and_rec_k / n_rel if n_rel != 0 else 0

    return precisions, recalls

Predictions User Based

In [None]:
precisions_user_based, recalls_user_based = precision_recall_at_k(predictions_user_based, k=10, threshold=3.5)
print(f"Precision at k: {round(sum(prec for prec in precisions_user_based.values()) / len(precisions_user_based), 5) * 100}%")
print(f"Recall at k: {round(sum(rec for rec in recalls_user_based.values()) / len(recalls_user_based), 5) * 100}%")

Precision at k: 91.683%
Recall at k: 79.335%


Model Item Based

In [None]:
model_item_based = KNNWithMeans(k=50, sim_options={'name': 'pearson_baseline', 'user_based': False})
model_item_based.fit(trainset)

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNWithMeans at 0x7fefc2f16a10>

Predictions Item Based

In [None]:
predictions_item_based = model_item_based.test(testset)
rmse = accuracy.rmse(predictions_item_based)

precisions_item_based, recalls_item_based = precision_recall_at_k(predictions_item_based, k=10, threshold=3.5)

print(f"Precision at k: {round(sum(prec for prec in precisions_item_based.values()) / len(precisions_item_based), 5) * 100}%")


print(f"Recall at k: {round(sum(rec for rec in recalls_item_based.values()) / len(recalls_item_based), 5) * 100}%")

RMSE: 0.3383
Precision at k: 95.55%
Recall at k: 83.208%


In [None]:
# Get the user ID for whom you want to make recommendations
user_id = 'A1X8VZWTOG8IS6'

# Get the list of all items (books) in the dataset
items = df['book_id'].unique()

# Predict the rating the user would give to each item and store in a dictionary
item_ratings = {}
for item in items:
    predicted_rating = model_item_based.predict(user_id, item).est
    item_ratings[item] = predicted_rating

# Sort the items by predicted rating in descending order and select the top 5
top_items = sorted(item_ratings.items(), key=lambda x: x[1], reverse=True)[:5]

# Print the top 5 recommended books
for i, item in enumerate(top_items):
    print(f"{i+1}. {item[0]}({df[df['book_id']==item[0]].iloc[0].book_title}) with predicted rating of {item[1]:.2f}")

1. 0395423317(The Stranger) with predicted rating of 5.00
2. 0821772287(The Awakening) with predicted rating of 5.00
3. 9562910334(1984) with predicted rating of 4.88
4. B000BKUZYA(The Hitchhiker's Guide to the Galaxy) with predicted rating of 4.81
5. B000J36YJE(The Two Towers) with predicted rating of 4.77


# Model Building - Content Based Filtering

In [None]:
from sklearn.metrics.pairwise import linear_kernel

Importing and cleaning the data.

In [None]:
books = pd.read_csv('/content/drive/My Drive/Colab Notebooks/popular_books_clean.csv')

Converting missing values in `Description`, `Categories` and `Authors` with empty strings `''`.

In [None]:
books['categories'] = books['categories'].fillna('')
books['authors'] = books['authors'].fillna('')
books['description'] = books['description'].fillna('')

print(f"There are {books['categories'].nunique()} unique categories.")
print(f"There are {books['authors'].nunique()} unique authors.")

There are 82 unique categories.
There are 458 unique authors.


Converting `Categories` to lowecase and removing unnecessary characters in between.

In [None]:
import re

def clean_categories(categories):
  for i in range(len(categories)):
    categories[i] = ''.join(re.split('\ &\ |\ ',categories[i].lower()))
  return categories

books['categories'] = clean_categories(books['categories'])



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



`Books` is ready to make a soup!

In [None]:
books.shape

(789, 6)

In [None]:
def create_soup(data):
    return data['authors'] + ' ' + data['categories'] + ' ' + data['Title'] + ' ' + data['description']

books['soup'] = create_soup(books)

Building the model.

In [None]:
tfidf = TfidfVectorizer(stop_words='english')

In [None]:
matrix = tfidf.fit_transform(books['soup'])

In [None]:
matrix.shape

(789, 11701)

In [None]:
cos_sim = cosine_similarity(matrix)

In [None]:
indices = pd.Series(books.index, index = books['Title'])

In [None]:
def get_recommendation_by_description(title, cos_sim = cos_sim):
  idx = indices[title]
  sim_scores = enumerate(cos_sim[idx])
  sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
  sim_scores = sim_scores[2:7]
  sim_index = [i[0] for i in sim_scores]
  for i in sim_index:
    print(f"Title:\t{books['Title'].iloc[i]}\nAuthor:\t{books['categories'].iloc[i]}\nAuthor:\t{books['authors'].iloc[i]}\n")

In [None]:
get_recommendation_by_description('East of Eden')

Title:	Outside Of Eden: Part One
Author:	fiction
Author:	johnsteinbeck

Title:	The Wrath of Grapes
Author:	fiction
Author:	johnsteinbeck

Title:	The Grapes of Wrath: Tie-In Edition
Author:	fiction
Author:	johnsteinbeck

Title:	CliffsNotes on Hemingway's The Old Man And The Sea (Dummies Trade)
Author:	fiction
Author:	johnsteinbeck

Title:	Angela's Ashes (Turtleback School & Library Binding Edition)
Author:	fiction
Author:	abrahamverghese



In [None]:
books[['publisher', 'ratingsCount', 'soup']].head()

Unnamed: 0,publisher,ratingsCount,soup
0,Bantam Books,1180.0,edgarallanpoe fiction The Best Short Stories o...
1,William Morrow Paperbacks,142.0,"a.j.finn fiction Woman at the window, a novel ..."
2,Modern Library,375.0,sigmundfreud psychology The interpretation of ...
3,Hendrickson Publishers,106.0,c.h.spurgeon religion Spurgeon's Sermons Volum...
4,,145.0,jeanm.auel americanfiction The Clan of the Cav...




# Data Visualization

In [None]:
import plotly.express as px
import plotly.graph_objects as go
import numpy as np

In [None]:
ratings_per_book = df.groupby('book_title')['user_id'].count().sort_values(ascending=True)

In [None]:

fig = px.bar(x=ratings_per_book.values[-50:], y=ratings_per_book.index[-50:], orientation='h')

fig.update_layout(title={'text': 'Number of Ratings per Book', 'font': {'size': 22}, 'x': 0.5},
                  xaxis_title={'text': 'Ratings Count', 'font': {'size': 16}},
                  yaxis_title={'text': 'Book Title', 'font': {'size': 16}})

# Show the bar chart
fig.show()

In [None]:


average_ratings = df.groupby('book_title')['rating'].mean()
average_ratings = average_ratings.to_frame()
average_ratings['num_ratings'] = df.groupby('book_title')['user_id'].count()
sorted_books = average_ratings.sort_values(by=['num_ratings'])
average_ratings_top_20 = sorted_books.tail(50)

In [None]:
fig = px.bar(x=average_ratings_top_20['num_ratings'], y=average_ratings_top_20.index, orientation='h',
             color=average_ratings_top_20['rating'], color_continuous_scale=['#EF553B','#636EFA'])

fig.update_layout(
    title={'text': 'Average Rating for the 50 Most Read Books', 'font': {'size': 22}, 'x': 0.5},
    xaxis_title={'text': 'Number Of Ratings', 'font': {'size': 16}},
    yaxis_title={'text': 'Book Title', 'font': {'size': 16}})

fig.show()

In [None]:
user_helpfulness = df.groupby('user_id')['helpfulness'].mean()
user_num = df.groupby('user_id')['rating'].count()
user_df = user_helpfulness.to_frame()
user_df['num_ratings'] = user_num
top20_users_by_helpfulness = user_df.sort_values(by='num_ratings').tail(50)

In [None]:
fig = px.bar(x=top20_users_by_helpfulness['num_ratings'], y=top20_users_by_helpfulness.index, orientation='h',
             color=top20_users_by_helpfulness['helpfulness'], color_continuous_scale=['#EF553B','#636EFA'])

fig.update_layout(
    title={'text': 'Average Helpfulness for the 20 Most Active Users', 'font': {'size': 22}, 'x': 0.5},
    xaxis_title={'text': 'Number Of Ratings', 'font': {'size': 16}},
    yaxis_title={'text': 'User ID', 'font': {'size': 16}})

fig.show()

In [None]:
fig = px.pie(df['rating'], values='rating', names='rating', title='Distribution Of Given Ratings', color_discrete_sequence=px.colors.sequential.Emrld, hole=0.4)
fig.show()