In [None]:
!pip install keybert

In [1]:
import pandas as pd
import numpy as np

# For plotting purposes
import matplotlib.pyplot as plt
import seaborn as sns

# RegEx and String Manipulation
import re
import string

# BERT-Embeddings
# from keybert import KeyBERT

# TF-IDF Vectorization
from sklearn.feature_extraction.text import TfidfVectorizer

# Cosine Similarity
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/input/goodreads-books-descriptions-keywords/keywords92730.csv
/kaggle/input/goodreads-books-100k/GoodReads_100k_books.csv
/kaggle/input/goodreads-books-preprocessed/books_processed.csv


In [29]:
books = pd.read_csv('/kaggle/input/goodreads-books-100k/GoodReads_100k_books.csv')

In [30]:
books.head()

Unnamed: 0,author,bookformat,desc,genre,img,isbn,isbn13,link,pages,rating,reviews,title,totalratings
0,Laurence M. Hauptman,Hardcover,Reveals that several hundred thousand Indians ...,"History,Military History,Civil War,American Hi...",https://i.gr-assets.com/images/S/compressed.ph...,002914180X,9780000000000.0,https://goodreads.com/book/show/1001053.Betwee...,0,3.52,5,Between Two Fires: American Indians in the Civ...,33
1,"Charlotte Fiell,Emmanuelle Dirix",Paperback,Fashion Sourcebook - 1920s is the first book i...,"Couture,Fashion,Historical,Art,Nonfiction",https://i.gr-assets.com/images/S/compressed.ph...,1906863482,9780000000000.0,https://goodreads.com/book/show/10010552-fashi...,576,4.51,6,Fashion Sourcebook 1920s,41
2,Andy Anderson,Paperback,The seminal history and analysis of the Hungar...,"Politics,History",https://i.gr-assets.com/images/S/compressed.ph...,948984147,9780000000000.0,https://goodreads.com/book/show/1001077.Hungar...,124,4.15,2,Hungary 56,26
3,Carlotta R. Anderson,Hardcover,"""All-American Anarchist"" chronicles the life a...","Labor,History",https://i.gr-assets.com/images/S/compressed.ph...,814327079,9780000000000.0,https://goodreads.com/book/show/1001079.All_Am...,324,3.83,1,All-American Anarchist: Joseph A. Labadie and ...,6
4,Jean Leveille,,"Aujourdâ€™hui, lâ€™oiseau nous invite Ã sa ta...",,https://i.gr-assets.com/images/S/compressed.ph...,2761920813,,https://goodreads.com/book/show/10010880-les-o...,177,4.0,1,Les oiseaux gourmands,1


In [31]:
books.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 13 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   author        100000 non-null  object 
 1   bookformat    96772 non-null   object 
 2   desc          93228 non-null   object 
 3   genre         89533 non-null   object 
 4   img           96955 non-null   object 
 5   isbn          85518 non-null   object 
 6   isbn13        88565 non-null   object 
 7   link          100000 non-null  object 
 8   pages         100000 non-null  int64  
 9   rating        100000 non-null  float64
 10  reviews       100000 non-null  int64  
 11  title         99999 non-null   object 
 12  totalratings  100000 non-null  int64  
dtypes: float64(1), int64(3), object(9)
memory usage: 9.9+ MB


### Filtering Columns

In [32]:
books = books[['author', 'desc', 'genre', 'isbn', 'pages', 'rating', 'reviews', 'title', 'totalratings']]
books.head()

Unnamed: 0,author,desc,genre,isbn,pages,rating,reviews,title,totalratings
0,Laurence M. Hauptman,Reveals that several hundred thousand Indians ...,"History,Military History,Civil War,American Hi...",002914180X,0,3.52,5,Between Two Fires: American Indians in the Civ...,33
1,"Charlotte Fiell,Emmanuelle Dirix",Fashion Sourcebook - 1920s is the first book i...,"Couture,Fashion,Historical,Art,Nonfiction",1906863482,576,4.51,6,Fashion Sourcebook 1920s,41
2,Andy Anderson,The seminal history and analysis of the Hungar...,"Politics,History",948984147,124,4.15,2,Hungary 56,26
3,Carlotta R. Anderson,"""All-American Anarchist"" chronicles the life a...","Labor,History",814327079,324,3.83,1,All-American Anarchist: Joseph A. Labadie and ...,6
4,Jean Leveille,"Aujourdâ€™hui, lâ€™oiseau nous invite Ã sa ta...",,2761920813,177,4.0,1,Les oiseaux gourmands,1


In [33]:
books.shape

(100000, 9)

## Data Cleaning

In [34]:
books.isna().sum()

author              0
desc             6772
genre           10467
isbn            14482
pages               0
rating              0
reviews             0
title               1
totalratings        0
dtype: int64

### Removing Books with no Description

In [35]:
books.dropna(subset=['desc'], inplace=True)

### Remove Punctuation from the Descriptions

In [36]:
import string
punctuations = string.punctuation
def remove_punctuations(text):
    return text.translate(str.maketrans('', '', punctuations))

books.desc = books.desc.apply(remove_punctuations)

### Remove URLs from the description

In [37]:
import re
url_pattern = re.compile(r'https?://\S+|www\.\S+')
def remove_url(text):   
    return re.sub(url_pattern, r'', text)

books.desc = books.desc.apply(remove_url)

### Remove extra spaces from the text columns and convert the lettercase to lower

In [38]:
books[["title", "author", "desc", "genre"]] = pd.concat([books[col].astype(str).str.lower().str.strip() for col in ["title", "author", "desc", "genre"]], axis=1)
books.head()

Unnamed: 0,author,desc,genre,isbn,pages,rating,reviews,title,totalratings
0,laurence m. hauptman,reveals that several hundred thousand indians ...,"history,military history,civil war,american hi...",002914180X,0,3.52,5,between two fires: american indians in the civ...,33
1,"charlotte fiell,emmanuelle dirix",fashion sourcebook 1920s is the first book in...,"couture,fashion,historical,art,nonfiction",1906863482,576,4.51,6,fashion sourcebook 1920s,41
2,andy anderson,the seminal history and analysis of the hungar...,"politics,history",948984147,124,4.15,2,hungary 56,26
3,carlotta r. anderson,allamerican anarchist chronicles the life and ...,"labor,history",814327079,324,3.83,1,all-american anarchist: joseph a. labadie and ...,6
4,jean leveille,aujourdâ€™hui lâ€™oiseau nous invite ã sa tab...,,2761920813,177,4.0,1,les oiseaux gourmands,1


### Remove Book Descriptions With Shorter Length.

In [39]:
# Find description word count
books["length"] = [len(d.split()) for d in books['desc'].tolist()]

len(set(books.desc[books.length.isin(range(0,4))]))

432

In [40]:
# Replace empty strings of description with NaN
books.desc = books.desc.replace(r'^\s*$', np.nan, regex=True)

books[books.length.isin(range(1,4))][["isbn", "title", "desc", "length"]]\
.sort_values(by=["length"], ascending=True).head(5)

Unnamed: 0,isbn,title,desc,length
28959,897471180,lockheed pv-1 ventura in action,squadronsignal,1
33067,9002126565,de lachende wolf,ballonstrip,1
33081,9002106882,het zoemende ei,ballonstrip,1
33093,9002118139,het rijmende paard,ballonstrip,1
33185,4309264247,ãƒ—ãƒ©ã‚¹ãƒãƒƒã‚¯ã‚¬ãƒ¼ãƒ« [purasuchikku gäru],æ†žæ‚ªã¨ç„¡åž¢ã§ã§ããÿã€å¥³ã§ã‚‚ãªã„ä...,1


In [41]:
books.dropna(subset=["desc"], inplace=True)

# Drop records with very short description
books.drop(books.index[books.length.isin(range(0,4))], inplace = True)
del books["length"]

### Drop Variants of the Same Book


In [42]:
books.drop_duplicates(subset=['title', 'desc', 'author'], keep='first', inplace=True)

### Extract and Remove Book Series Information from the Book Name

In [43]:
series_pattern =  "(?:[;]\s*|\(\s*)([^\(;]*\s*#\s*\d+(?:\.?\d+|\\&\d+|-?\d*))"
def get_book_series_info(text):
    series_info = re.findall(series_pattern, text)
    if series_info:
        series_info = " ".join([i.replace(" ", "_") for i in series_info])
        return series_info
    else:
        return np.nan
    
books['book_series_info'] = books.title.apply(get_book_series_info)

In [44]:
books[books['book_series_info'].notnull()]['book_series_info'].head()

545                  uniformly_hot!,_#15
1135                harlequin_blaze_#593
1145                    mule_hollow,_#17
1296                          misfile_#1
1856    uncle_john's_bathroom_reader_#10
Name: book_series_info, dtype: object

In [45]:
series_remove_pattern = re.compile("(?:[\(]\s*[^\(;]*\s*#\s*\d+(?:\.?\d+|\\&\d+|-?\d*)(?:;|\))|\s*[^\(;]*\s*#\s*\d+(?:\.?\d+|\\&\d+|-?\d*)\))")       
books["title"]= books["title"].str.replace(series_remove_pattern, r'', regex=True).str.strip()

In [46]:
books.isna().sum()

author                  0
desc                    0
genre                   0
isbn                12341
pages                   0
rating                  0
reviews                 0
title                   0
totalratings            0
book_series_info    92414
dtype: int64

### Transform Book and Author Names into Single Token

In [47]:
books["author"] = books["author"].str.strip().str.replace(' ','_')
books.head(5)

Unnamed: 0,author,desc,genre,isbn,pages,rating,reviews,title,totalratings,book_series_info
0,laurence_m._hauptman,reveals that several hundred thousand indians ...,"history,military history,civil war,american hi...",002914180X,0,3.52,5,between two fires: american indians in the civ...,33,
1,"charlotte_fiell,emmanuelle_dirix",fashion sourcebook 1920s is the first book in...,"couture,fashion,historical,art,nonfiction",1906863482,576,4.51,6,fashion sourcebook 1920s,41,
2,andy_anderson,the seminal history and analysis of the hungar...,"politics,history",948984147,124,4.15,2,hungary 56,26,
3,carlotta_r._anderson,allamerican anarchist chronicles the life and ...,"labor,history",814327079,324,3.83,1,all-american anarchist: joseph a. labadie and ...,6,
4,jean_leveille,aujourdâ€™hui lâ€™oiseau nous invite ã sa tab...,,2761920813,177,4.0,1,les oiseaux gourmands,1,


### Keyword Extraction Using KeyBERT

In [48]:
# descriptions = books['desc'].tolist()
# descriptions[:10]

In [49]:
# kw_model = KeyBERT()

In [50]:
# for i in range(0, len(descriptions)):
#     keywords = kw_model.extract_keywords(descriptions[i], top_n = 10, keyphrase_ngram_range=(1, 1), stop_words="english")
#     keywords = " ".join([k[0] for k in keywords])
#     keywords_list.append(keywords)

## Keywords enhancement with authors and genre

I have already extracted keywords using KeyBERT and saved them in a csv file. I am reading that csv here.

In [51]:
keywords_list = pd.read_csv('/kaggle/input/goodreads-books-descriptions-keywords/keywords92730.csv')

In [52]:
books = books.reset_index().drop('index', axis=1)

Adding that column to books dataframe

In [53]:
books['keywords'] = keywords_list['keywords']
books.isna().sum()

author                  0
desc                    0
genre                   0
isbn                12341
pages                   0
rating                  0
reviews                 0
title                   0
totalratings            0
book_series_info    92414
keywords                1
dtype: int64

The genre column contains multiple genres. We need to fix that

In [54]:
books.genre = books['genre'].apply(lambda text: ' '.join(list(set(text.replace(',', ' ').split()))))

In [55]:
# books.loc[books.genre == 'nan']['genre'] = ''
books.genre = books.genre.replace('nan', np.nan)

In [56]:
# Fill missing genres with a placeholder and split multiple genres
books['genre'] = books['genre'].fillna('Unknown')
books['genre'] = books['genre'].apply(lambda x: x.split() if x != 'Unknown' else [])
books.genre.head(10)

0    [civil, americans, history, military, american...
1      [historical, fashion, couture, nonfiction, art]
2                                  [history, politics]
3                                     [history, labor]
4                                                   []
5    [human, romance, management, resources, busine...
6    [romance, management, business, leadership, hi...
7                                [nonfiction, history]
8                                                   []
9    [psychology, spirituality, buddhism, religion,...
Name: genre, dtype: object

In [57]:
books.isna().sum()

author                  0
desc                    0
genre                   0
isbn                12341
pages                   0
rating                  0
reviews                 0
title                   0
totalratings            0
book_series_info    92414
keywords                1
dtype: int64

In [58]:
books = books[~books.keywords.isna()]

In [59]:
# books.keywords = books.keywords + ' ' + books.genre

In [60]:
# books.keywords = books.keywords + " " + books.author

In [61]:
# books.loc[books.book_series_info.isnull(), 'book_series_info'] = ''

In [62]:
# books.keywords = books.keywords + " " + books.book_series_info

Lets select only those columns which will help us build the recommendation system

In [63]:
books_processed = books[['title', 'genre', 'keywords']]
books_processed.head(10)

Unnamed: 0,title,genre,keywords
0,between two fires: american indians in the civ...,"[civil, americans, history, military, american...",indians war enlisted civil land thousand sides...
1,fashion sourcebook 1920s,"[historical, fashion, couture, nonfiction, art]",fashion 1920s fashionistas fashionable 1930s d...
2,hungary 56,"[history, politics]",hungarian revolution revolutionary councils hi...
3,all-american anarchist: joseph a. labadie and ...,"[history, labor]",anarchist 18501933 activists protest 19th alla...
4,les oiseaux gourmands,[],table anatomie photos morphologie dans oiseau ...
5,the human equation: building profits by puttin...,"[human, romance, management, resources, busine...",managing management managers manage organizati...
6,competitive advantage through people: unleashi...,"[romance, management, business, leadership, hi...",strategic firms competitive pfeffer advantage ...
7,hawaii: an uncommon history,"[nonfiction, history]",hawaiian islands history jacob knowledge adler...
8,r101: a pictorial history,[],airship titanic 1920s aircraft aviation flying...
9,genuine happiness: meditation as the path to f...,"[psychology, spirituality, buddhism, religion,...",meditation meditations meditationâ meditative ...


Let's save the preprocessed dataframe into a csv for later retreival

In [64]:
books_processed.to_csv('books_preprocessed.csv')

## Encoding Keywords Using Tfidf Vectors

In [65]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(stop_words='english', min_df=5, max_df=0.6, max_features=7000)
keywords_tfidf = tfidf.fit_transform(books_processed['keywords'])

In [66]:
keywords_tfidf.shape

(92729, 7000)

## Encoding Genres Using OneHot Vectors

In [67]:
from sklearn.preprocessing import MultiLabelBinarizer

mlb = MultiLabelBinarizer()
genres_onehot = mlb.fit_transform(books_processed['genre'])

In [68]:
genres_onehot.shape

(92729, 1094)

## Creating Features from Tfidf and Genres

In [69]:
# Combine TF-IDF features and one-hot encoded genres
features = np.hstack((keywords_tfidf.toarray(), genres_onehot))

# Create a DataFrame for the combined features
feature_columns = tfidf.get_feature_names_out().tolist() + mlb.classes_.tolist()
feature_df = pd.DataFrame(features, columns=feature_columns)

# Display the first few rows of the combined feature DataFrame
feature_df.head()

Unnamed: 0,10,12,1776,1812,1850s,1861,1862,1863,1864,1865,...,you,young,your,yuri,zambia,zen,zeppelin,zimbabwe,zombies,æ¼«ç”»
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Getting Recommendations

In [70]:
# Example user ratings (book_id: rating)
user_ratings = {29845: 5, 41582: 5, 86466: 4, 1477: 3.4, 39537: 4.5, 50950: 4, 39536:3}

# Initialize an empty user profile
user_profile = np.zeros(features.shape[1]) #8049 dimensional vector

# Update user profile based on ratings
for book_id, rating in user_ratings.items():
    book_index = books_processed[books_processed.index == book_id].index[0]
    book_features = features[book_index]
    user_profile += rating * book_features

# Normalize the user profile
user_profile = user_profile / sum(user_ratings.values())

# Calculate similarity between user profile and each book
from sklearn.metrics.pairwise import cosine_similarity

similarities = cosine_similarity([user_profile], features)[0]

# Sort books based on similarity scores
books_processed['similarity'] = similarities
recommended_books = books_processed.sort_values(by='similarity', ascending=False).head(10)

recommended_books[['title', 'similarity']]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  books_processed['similarity'] = similarities


Unnamed: 0,title,similarity
74437,the space book: from the beginning to the end ...,0.72684
84653,archives of the universe: 100 discoveries that...,0.718367
86101,the planet mars: a history of observation and ...,0.716007
5243,the living cosmos: our search for life in the ...,0.714418
50950,destination mars: new explorations of the red ...,0.710974
965,mars: uncovering the secrets of the red planet,0.703985
39537,exoplanets and alien solar systems,0.699517
64091,stars: a very short introduction,0.698397
41156,how it began: a time-traveler's guide to the u...,0.692816
84495,lives of the planets: a natural history of the...,0.690249


## Recommendation Pipeline For Any User

The personalized book recommender system pipeline begins with data preprocessing, where missing values in the genre column are handled, TF-IDF is applied to the keywords, and genres are one-hot encoded. User profiles are initialized as weighted vectors based on user ratings, updating dynamically as new ratings are added. Cosine similarity is calculated between the user profile and the feature vectors of all books to generate recommendations, while excluding books the user has already rated. This ensures that the recommendations are personalized and relevant to the user's preferences.

In [78]:
user_profiles = {}  # Dictionary to store user profiles
user_ratings_data = {}  # Dictionary to store user ratings

In [72]:
def update_user_profile(user_id, book_id, rating):
    global user_profiles, user_ratings_data

    if user_id not in user_profiles:
        user_profiles[user_id] = np.zeros(features.shape[1])
        user_ratings_data[user_id] = {}

    # Update ratings
    user_ratings_data[user_id][book_id] = rating
    
    
    user_profiles[user_id] = np.zeros(features.shape[1])
    for book_id, rating in user_ratings_data[user_id].items():
        book_index = books_processed[books_processed.index == book_id].index[0]
        book_features = features[book_index]
        user_profiles[user_id] += rating * book_features

    # Normalize the user profile
    num_ratings = sum(user_ratings_data[user_id])
    user_profiles[user_id] = user_profiles[user_id] / num_ratings

    
def get_recommendations(user_id, top_n=10):
    user_profile = user_profiles[user_id]
    similarities = cosine_similarity([user_profile], features)[0]
    
    books_processed['similarity'] = similarities
    
     # Exclude books the user has already rated
    rated_books = user_ratings_data[user_id].keys()
    recommendations = books_processed[~books_processed.index.isin(rated_books)]
    
    recommended_books = recommendations.sort_values(by='similarity', ascending=False).head(top_n)
    
    return recommended_books[['title', 'genre', 'similarity']]

## Testing

In [79]:
# Example of adding a new rating and getting updated recommendations
#update_user_profile(user_id=1, book_id=39536, rating=5)
#update_user_profile(user_id=1, book_id=86466, rating=5)
update_user_profile(user_id=1, book_id=1477, rating=4)
update_user_profile(user_id=1, book_id=39537, rating=4)
update_user_profile(user_id=1, book_id=216, rating=5)
update_user_profile(user_id=1, book_id=91700, rating=5)
update_user_profile(user_id=1, book_id=35780, rating=4)
update_user_profile(user_id=1, book_id=86556, rating=4)

In [80]:
recommendations = get_recommendations(user_id=1, top_n=50)
recommendations

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  books_processed['similarity'] = similarities


Unnamed: 0,title,genre,similarity
45273,the young atheist's handbook: lessons for livi...,"[skepticism, science, religion, nonfiction, me...",0.557287
1165,"looking for a miracle: weeping icons, relics, ...","[skepticism, paranormal, fantasy, religion, no...",0.533095
42601,god and the folly of faith: the incompatibilit...,"[skepticism, physics, spirituality, history, r...",0.524769
66513,conversations with carl sagan,"[astronomy, memoir, nonfiction, science, philo...",0.52431
66507,physics and philosophy,"[philosophy, physics, nonfiction, science]",0.52122
18170,"tribal science: brains, beliefs and bad ideas","[philosophy, skepticism, nonfiction, science]",0.519289
5049,"science, order and creativity","[philosophy, physics, nonfiction, science]",0.519179
4923,cosmic blueprint: new discoveries in natures a...,"[physics, astronomy, nonfiction, science, phil...",0.517075
7095,the hidden heart of the cosmos: humanity and t...,"[spirituality, astronomy, nonfiction, science,...",0.517075
6409,the book of universes: exploring the limits of...,"[physics, astronomy, nonfiction, science, phil...",0.517075


Our recommender is working exactly as expected. This brings us to the end of the project. If you liked this project. Please upvote this notebook and also share it with other ML Geeks out there.