In [None]:
!pip install keybert

In [1]:
import pandas as pd
import numpy as np

# For plotting purposes
import matplotlib.pyplot as plt
import seaborn as sns

# RegEx and String Manipulation
import re
import string

# BERT-Embeddings
# from keybert import KeyBERT

# TF-IDF Vectorization
from sklearn.feature_extraction.text import TfidfVectorizer

# Cosine Similarity
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
pd.options.display.max_columns = None # show all cols
pd.set_option('max_colwidth', None) # show full width of showing cols
pd.set_option("expand_frame_repr", False) # print cols side by side as it's supposed to be

In [3]:
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/input/goodreads-books-100k/GoodReads_100k_books.csv
/kaggle/input/goodreads-books-preprocessed/books_processed.csv
/kaggle/input/goodreads-books-descriptions-keywords/keywords92730.csv


In [4]:
books = pd.read_csv('/kaggle/input/goodreads-books-100k/GoodReads_100k_books.csv')

In [5]:
books.head()

Unnamed: 0,author,bookformat,desc,genre,img,isbn,isbn13,link,pages,rating,reviews,title,totalratings
0,Laurence M. Hauptman,Hardcover,"Reveals that several hundred thousand Indians were affected by the Civil War and that twenty thousand Indians enlisted on both sides in an attempt to gain legitimacy, autonomy, or simply land.","History,Military History,Civil War,American History,American Civil War,Nonfiction,North American Hi...,American History,Native Americans",https://i.gr-assets.com/images/S/compressed.photo.goodreads.com/books/1387738765l/1001053.jpg,002914180X,9780000000000.0,https://goodreads.com/book/show/1001053.Between_Two_Fires,0,3.52,5,Between Two Fires: American Indians in the Civil War,33
1,"Charlotte Fiell,Emmanuelle Dirix",Paperback,"Fashion Sourcebook - 1920s is the first book in a brand-new series by Fiell Publishing that documents comprehensively the seasonal fashion styles of the 20th century, decade by decade. Sumptuously illustrated with over 600 original photographs, drawings and prints, this title is a must-have reference work for not only students of fashion, but for all fashionistas. Fashion Sourcebook - 1920s focuses on the Art Deco period with its beautiful beaded dresses, cloche hats and t-bar shoes as worn by the fashionable flappers and the ""bright young things"" of the time. An accompanying introduction outlines the major themes within fashion during this period and introduces its most famous designers and assesses their creative contributions. Text in English, French & German. Also Available: Fashion Sourcebook - 1930s ISBN: 9781906863586 24.95""","Couture,Fashion,Historical,Art,Nonfiction",https://i.gr-assets.com/images/S/compressed.photo.goodreads.com/books/1421011497l/10010552.jpg,1906863482,9780000000000.0,https://goodreads.com/book/show/10010552-fashion-sourcebook-1920s,576,4.51,6,Fashion Sourcebook 1920s,41
2,Andy Anderson,Paperback,"The seminal history and analysis of the Hungarian Revolution and the workers' councils, perhaps the single most important revolutionary event ever, and this is simply the best book on it.","Politics,History",https://i.gr-assets.com/images/S/compressed.photo.goodreads.com/books/1348117708l/1001077.jpg,948984147,9780000000000.0,https://goodreads.com/book/show/1001077.Hungary_56,124,4.15,2,Hungary 56,26
3,Carlotta R. Anderson,Hardcover,"""All-American Anarchist"" chronicles the life and work of Joseph A. Labadie (1850-1933), Detroit's prominent labor organizer and one of early labor's most influential activists. A dynamic participant in the major social reform movements of the Gilded Age, Labadie was a central figure in the pervasive struggle for a new social order as the American Midwest underwent rapid industrialization at the end of the 19th century. This engaging biography follows Labadie's colorful career from a childhood among a Pottawatomi tribe in the Michigan woods through his local and national involvement in a maze of late 19th-century labor and reform activities, including participation in the Socialist Labor party, Knights of Labor, Greenback movement, trades councils, typographical union, eight-hour-day campaigns, and the rise of the American Federation of Labor. In writing this biography of her grandfather, Carlotta R. Anderson consulted the renowned Labadie Collection at the University of Michigan, a unique collection of protest literature which extensively documents pivotal times in American labor history and radical history.","Labor,History",https://i.gr-assets.com/images/S/compressed.photo.goodreads.com/books/1356461214l/1001079.jpg,814327079,9780000000000.0,https://goodreads.com/book/show/1001079.All_American_Anarchist,324,3.83,1,All-American Anarchist: Joseph A. Labadie and the Labor Movement,6
4,Jean Leveille,,"Aujourdâ€™hui, lâ€™oiseau nous invite Ã sa table, table surprenante par sa diversitÃ© et son originalitÃ©. Tous initient leurs petits Ã la vie gourmande en puisant dans un panier aux ressources immenses. Pour y parvenir, lâ€™oiseau a modifiÃ© son anatomie, sa morphologie, mais surtout il a radicalement adaptÃ© son organisme Ã ses choix. Par ses photos magnifiques et ses textes fascinants, lâ€™auteur nous invite Ã dÃ©couvrir les innombrables et subtiles facettes de lâ€™alimentation des oiseaux., - ,www.jeanleveille.org",,https://i.gr-assets.com/images/S/compressed.photo.goodreads.com/books/1293221069l/10010880.jpg,2761920813,,https://goodreads.com/book/show/10010880-les-oiseaux-gourmands,177,4.0,1,Les oiseaux gourmands,1


In [6]:
books.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 13 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   author        100000 non-null  object 
 1   bookformat    96772 non-null   object 
 2   desc          93228 non-null   object 
 3   genre         89533 non-null   object 
 4   img           96955 non-null   object 
 5   isbn          85518 non-null   object 
 6   isbn13        88565 non-null   object 
 7   link          100000 non-null  object 
 8   pages         100000 non-null  int64  
 9   rating        100000 non-null  float64
 10  reviews       100000 non-null  int64  
 11  title         99999 non-null   object 
 12  totalratings  100000 non-null  int64  
dtypes: float64(1), int64(3), object(9)
memory usage: 9.9+ MB


### Filtering Columns

In [7]:
books = books[['author', 'desc', 'genre', 'isbn', 'pages', 'rating', 'reviews', 'title', 'totalratings']]
books.head()

Unnamed: 0,author,desc,genre,isbn,pages,rating,reviews,title,totalratings
0,Laurence M. Hauptman,"Reveals that several hundred thousand Indians were affected by the Civil War and that twenty thousand Indians enlisted on both sides in an attempt to gain legitimacy, autonomy, or simply land.","History,Military History,Civil War,American History,American Civil War,Nonfiction,North American Hi...,American History,Native Americans",002914180X,0,3.52,5,Between Two Fires: American Indians in the Civil War,33
1,"Charlotte Fiell,Emmanuelle Dirix","Fashion Sourcebook - 1920s is the first book in a brand-new series by Fiell Publishing that documents comprehensively the seasonal fashion styles of the 20th century, decade by decade. Sumptuously illustrated with over 600 original photographs, drawings and prints, this title is a must-have reference work for not only students of fashion, but for all fashionistas. Fashion Sourcebook - 1920s focuses on the Art Deco period with its beautiful beaded dresses, cloche hats and t-bar shoes as worn by the fashionable flappers and the ""bright young things"" of the time. An accompanying introduction outlines the major themes within fashion during this period and introduces its most famous designers and assesses their creative contributions. Text in English, French & German. Also Available: Fashion Sourcebook - 1930s ISBN: 9781906863586 24.95""","Couture,Fashion,Historical,Art,Nonfiction",1906863482,576,4.51,6,Fashion Sourcebook 1920s,41
2,Andy Anderson,"The seminal history and analysis of the Hungarian Revolution and the workers' councils, perhaps the single most important revolutionary event ever, and this is simply the best book on it.","Politics,History",948984147,124,4.15,2,Hungary 56,26
3,Carlotta R. Anderson,"""All-American Anarchist"" chronicles the life and work of Joseph A. Labadie (1850-1933), Detroit's prominent labor organizer and one of early labor's most influential activists. A dynamic participant in the major social reform movements of the Gilded Age, Labadie was a central figure in the pervasive struggle for a new social order as the American Midwest underwent rapid industrialization at the end of the 19th century. This engaging biography follows Labadie's colorful career from a childhood among a Pottawatomi tribe in the Michigan woods through his local and national involvement in a maze of late 19th-century labor and reform activities, including participation in the Socialist Labor party, Knights of Labor, Greenback movement, trades councils, typographical union, eight-hour-day campaigns, and the rise of the American Federation of Labor. In writing this biography of her grandfather, Carlotta R. Anderson consulted the renowned Labadie Collection at the University of Michigan, a unique collection of protest literature which extensively documents pivotal times in American labor history and radical history.","Labor,History",814327079,324,3.83,1,All-American Anarchist: Joseph A. Labadie and the Labor Movement,6
4,Jean Leveille,"Aujourdâ€™hui, lâ€™oiseau nous invite Ã sa table, table surprenante par sa diversitÃ© et son originalitÃ©. Tous initient leurs petits Ã la vie gourmande en puisant dans un panier aux ressources immenses. Pour y parvenir, lâ€™oiseau a modifiÃ© son anatomie, sa morphologie, mais surtout il a radicalement adaptÃ© son organisme Ã ses choix. Par ses photos magnifiques et ses textes fascinants, lâ€™auteur nous invite Ã dÃ©couvrir les innombrables et subtiles facettes de lâ€™alimentation des oiseaux., - ,www.jeanleveille.org",,2761920813,177,4.0,1,Les oiseaux gourmands,1


In [8]:
books.shape

(100000, 9)

## Data Cleaning

In [9]:
books.isna().sum()

author              0
desc             6772
genre           10467
isbn            14482
pages               0
rating              0
reviews             0
title               1
totalratings        0
dtype: int64

### Removing Books with no Description

In [10]:
books.dropna(subset=['desc'], inplace=True)

### Remove Punctuation from the Descriptions

In [11]:
import string
punctuations = string.punctuation
def remove_punctuations(text):
    return text.translate(str.maketrans('', '', punctuations))

books.desc = books.desc.apply(remove_punctuations)

### Remove URLs from the description

In [12]:
import re
url_pattern = re.compile(r'https?://\S+|www\.\S+')
def remove_url(text):   
    return re.sub(url_pattern, r'', text)

books.desc = books.desc.apply(remove_url)

### Remove extra spaces from the text columns and convert the lettercase to lower

In [13]:
books[["title", "author", "desc", "genre"]] = pd.concat([books[col].astype(str).str.lower().str.strip() for col in ["title", "author", "desc", "genre"]], axis=1)
books.head()

Unnamed: 0,author,desc,genre,isbn,pages,rating,reviews,title,totalratings
0,laurence m. hauptman,reveals that several hundred thousand indians were affected by the civil war and that twenty thousand indians enlisted on both sides in an attempt to gain legitimacy autonomy or simply land,"history,military history,civil war,american history,american civil war,nonfiction,north american hi...,american history,native americans",002914180X,0,3.52,5,between two fires: american indians in the civil war,33
1,"charlotte fiell,emmanuelle dirix",fashion sourcebook 1920s is the first book in a brandnew series by fiell publishing that documents comprehensively the seasonal fashion styles of the 20th century decade by decade sumptuously illustrated with over 600 original photographs drawings and prints this title is a musthave reference work for not only students of fashion but for all fashionistas fashion sourcebook 1920s focuses on the art deco period with its beautiful beaded dresses cloche hats and tbar shoes as worn by the fashionable flappers and the bright young things of the time an accompanying introduction outlines the major themes within fashion during this period and introduces its most famous designers and assesses their creative contributions text in english french german also available fashion sourcebook 1930s isbn 9781906863586 2495,"couture,fashion,historical,art,nonfiction",1906863482,576,4.51,6,fashion sourcebook 1920s,41
2,andy anderson,the seminal history and analysis of the hungarian revolution and the workers councils perhaps the single most important revolutionary event ever and this is simply the best book on it,"politics,history",948984147,124,4.15,2,hungary 56,26
3,carlotta r. anderson,allamerican anarchist chronicles the life and work of joseph a labadie 18501933 detroits prominent labor organizer and one of early labors most influential activists a dynamic participant in the major social reform movements of the gilded age labadie was a central figure in the pervasive struggle for a new social order as the american midwest underwent rapid industrialization at the end of the 19th century this engaging biography follows labadies colorful career from a childhood among a pottawatomi tribe in the michigan woods through his local and national involvement in a maze of late 19thcentury labor and reform activities including participation in the socialist labor party knights of labor greenback movement trades councils typographical union eighthourday campaigns and the rise of the american federation of labor in writing this biography of her grandfather carlotta r anderson consulted the renowned labadie collection at the university of michigan a unique collection of protest literature which extensively documents pivotal times in american labor history and radical history,"labor,history",814327079,324,3.83,1,all-american anarchist: joseph a. labadie and the labor movement,6
4,jean leveille,aujourdâ€™hui lâ€™oiseau nous invite ã sa table table surprenante par sa diversitã© et son originalitã© tous initient leurs petits ã la vie gourmande en puisant dans un panier aux ressources immenses pour y parvenir lâ€™oiseau a modifiã© son anatomie sa morphologie mais surtout il a radicalement adaptã© son organisme ã ses choix par ses photos magnifiques et ses textes fascinants lâ€™auteur nous invite ã dã©couvrir les innombrables et subtiles facettes de lâ€™alimentation des oiseaux wwwjeanleveilleorg,,2761920813,177,4.0,1,les oiseaux gourmands,1


### Remove Book Descriptions With Shorter Length.

In [14]:
# Find description word count
books["length"] = [len(d.split()) for d in books['desc'].tolist()]

print(set(books.desc[books.length.isin(range(0,4))]))

{'', 'æ´¸ã\x81®é•·å´žæ™‚ä»£ã\x81®å\x8f‹äººã€\x81æˆ\x90æµ·å”¯ã\x81œå\x8fœè‘‰ã\x81ÿã\x81¡ã\x81®é«˜æ\xa0¡ã\x81®æ–‡åœ–ç¥\xadã\x81«ã‚„ã\x81£ã\x81¦ã\x81\x8dã\x81ÿã€‚æ´¸ã\x81œå”¯ã\x81«ã€\x81é›»è©±ã‚„ãƒ¡ãƒ¼ãƒ«ã‚’é\xa0»ç¹\x81ã\x81«ã\x81™ã‚‹ã‚ˆã\x81†ã\x81«ã\x81ªã\x81£ã\x81¦ã\x81‹ã‚‰è·\x9dé›¢ã‚’æ„ÿã\x81˜ã\x81¦ã\x81„ã\x81ÿå\x8fœè‘‰ã\x81¯ã€\x81è¤‡é›‘ã\x81ªæ°—æœ\x81ã\x81¡ã\x81§å”¯ã‚’è¿žã\x81ˆã‚‹ã€‚èƒ¸ã\x81‹ã\x81\x8dä¹±ã\x81™å‡ºæ\x9d¥äº‹ç¶šã\x81\x8dã\x81®æ–‡åœ–ç¥\xadâ€¦â€¦ã€‚', 'é›ªé¢¨ã\x81®ç‹\xadé–“ã€€åˆ¹é‚£ã\x81®è¦šæ‚ÿã€\x81ç¥žã\x81®ä¸€æ‰‹ã€€æ\xad¤å‡¦ã\x81«åˆ°ã‚‹ã€‚å¹³å®‰ã\x81®ç¢\x81è\x81–ãƒ»è—¤åžÿä½\x90ç‚ºã\x81¨å¹³æˆ\x90ã\x81®å\x90\x8däººãƒ»å¡”çÿ¢è¡œæ´‹ã\x81œã‚¤ãƒ³ã‚¿ãƒ¼ãƒ\x8dãƒƒãƒˆã\x81«æµ®ã\x81‹ã\x81¶ç¢\x81ç›¤ã\x81®ä¸šã\x81§ç›¸å¯¾ã\x81™ã‚‹ã€‚æ·±ã\x81\x8fæ·±ã\x81\x8fåºƒã\x81œã\x81£ã\x81¦ã\x81„ã\x81\x8få¯¾å±€ã\x81®å®‡å®™â€•â€•â€•å\x8dƒå¹´ã\x81®æ™‚ã‚’çµœã\x81¦å®ÿç\x8f¾ã\x81™ã‚‹ã€\x81ç¥žã\x81®ä¸€æ‰‹ã‚’å·¡ã‚‹ä¸€å±€ã€€å\x8dƒå¹´ã\x81®æ™‚æ¸¡ã‚‹å›²ç¢\x81ãƒ\xadãƒžãƒ³ã€\x81å¾…æœ›ã\x81®å®œå…¨ç‰ˆåœ–ï¼\x81', 

In [15]:
# Replace empty strings of description with NaN
books.desc = books.desc.replace(r'^\s*$', np.nan, regex=True)

books[books.length.isin(range(1,4))][["isbn", "title", "desc", "length"]]\
.sort_values(by=["length"], ascending=True).head(5)

Unnamed: 0,isbn,title,desc,length
11715,9025414079,de kamer hiernaast,verhalen,1
90361,4063145840,ã‚ªã‚¯ã‚¿ãƒ¼ãƒ´ 3,èª°ã‹ã‹ã‚‰æ„›ã•ã‚œãÿã„â”€â”€ãã†é¡˜ã£ã¦ã„ãÿå®®ä¸‹é›ªä¹ƒã€‚ãã‚“ãªå½¼å¥³ã‚’è¦‹ã¤ã‘ãÿã€å²©äº•ç¯€å­ã€‚ãµãÿã‚šã¯å‡ºä¼šã„ã€ä½“ã‚’é‡ã­ã€ãã‚œã‹ã‚‰æ‹ã«è½ã¡ãÿã€‚ç•°æ€§ã‚’çÿ¥ã‚‰ãªã„é›ªä¹ƒã¨ã€ç•°æ€§ã¨ã®éžåž»ã‚’æœã¤ç¯€å­ã€‚ãã—ã¦ã™ã‚œã¡ãœã£ã¦ã—ã¾ã£ãÿå¤œã€ãµãÿã‚šã®â€é–¢ä¿‚â€ã«è½ã¡ãÿå½±â”€â”€ãã‚œã¯ã€æ±ºã—ã¦æ¶ˆã—åž»ã‚‹ã“ã¨ã®ã§ããªã„â€çµœé¨“â€ã€‚,1
90384,434482380X,èš±ã¯å’²ãã‹ 3 [hana wa saku ka 3],ç„¡æ„›æƒ³ãªç¾žå¤§ç”ÿãƒ»è“‰ä¸€ã«æƒ¹ã‹ã‚œã¯ã˜ã‚ãÿæ¡œäº•ã¯ã€æ€ã‚ãšè“‰ä¸€ã«ã‚­ã‚¹ã‚’ã€‚è“‰ä¸€ã‚‚ã¾ãÿæ¡œäº•ã¸ã®æ‹ã‚’è‡ªè¦šã—ã€æ¬¡ç¬¬ã«å¤‰ã‚ã£ã¦ã„ããœâ€¦ã€‚,1
90666,4063417808,ãã‚‡ã†ã®ã‚­ãƒ©ãã‚“ ï¼‘[kyou no kira-kun 1],â€•365æ—¥ã€çž¬ãã™ã‚‹ã®ã‚‚ãšã—ã„ãã‚‰ã„ã€ã‚ãªãÿã‚’è¦‹ã¤ã‚ã¦ã„ãã‹ã‚‰ã€‚â€•è‚©ã«ã‚¤ãƒ³ã‚³ã‚’ä¹—ã›ãÿå¤‰ã‚ã‚šè€…ã®ãƒ‹ãƒžã¨ç„¡æ„å‘³ãªæ¯žæ—¥ã‚’éžã”ã™éšã³äººã®ã‚­ãƒ©ã€‚å®¶ãœéš£åœå£«ãªã®ã«è©±ã—ãÿã“ã¨ã™ã‚‰ãªã‹ã£ãÿâ€¦ã€‚ã‘ã‚œã©ã€ãƒ‹ãƒžãœã‚­ãƒ©ã®ç§˜å¯†ã‚’çÿ¥ã£ãÿã“ã¨ã‹ã‚‰é‹å‘½ã¯äº¤éœ¯ã—ã€ç…œã‚ãç”ÿã®æ™‚ã‚’åˆ»ã¿ã¯ã˜ã‚ã‚‹â€•2äººãœç´¡ãã€å¤©å›½ã«ä¸€ç•ªè¿‘ã„æ‹ã€ç¬¬1å·»ã€‚,1
95070,919803930X,this is lean: resolving the efficiency paradox,thisislean,1


In [16]:
books.dropna(subset=["desc"], inplace=True)

# Drop records with very short description
books.drop(books.index[books.length.isin(range(0,4))], inplace = True)
del books["length"]

#### Drop Variants of the Same Book


In [17]:
books.drop_duplicates(subset=['title', 'desc', 'author'], keep='first', inplace=True)

### Extract and Remove Book Series Information from the Book Name

In [18]:
series_pattern =  "(?:[;]\s*|\(\s*)([^\(;]*\s*#\s*\d+(?:\.?\d+|\\&\d+|-?\d*))"
def get_book_series_info(text):
    series_info = re.findall(series_pattern, text)
    if series_info:
        series_info = " ".join([i.replace(" ", "_") for i in series_info])
        return series_info
    else:
        return np.nan
    
books['book_series_info'] = books.title.apply(get_book_series_info)

In [19]:
books[books['book_series_info'].notnull()]['book_series_info'].head()

545                  uniformly_hot!,_#15
1135                harlequin_blaze_#593
1145                    mule_hollow,_#17
1296                          misfile_#1
1856    uncle_john's_bathroom_reader_#10
Name: book_series_info, dtype: object

In [20]:
series_remove_pattern = re.compile("(?:[\(]\s*[^\(;]*\s*#\s*\d+(?:\.?\d+|\\&\d+|-?\d*)(?:;|\))|\s*[^\(;]*\s*#\s*\d+(?:\.?\d+|\\&\d+|-?\d*)\))")       
books["title"]= books["title"].str.replace(series_remove_pattern, r'', regex=True).str.strip()

In [21]:
books.isna().sum()

author                  0
desc                    0
genre                   0
isbn                12341
pages                   0
rating                  0
reviews                 0
title                   0
totalratings            0
book_series_info    92414
dtype: int64

### Transform Book and Author Names into Single Token

In [22]:
books["author"] = books["author"].str.strip().str.replace(' ','_')
books.head(5)

Unnamed: 0,author,desc,genre,isbn,pages,rating,reviews,title,totalratings,book_series_info
0,laurence_m._hauptman,reveals that several hundred thousand indians were affected by the civil war and that twenty thousand indians enlisted on both sides in an attempt to gain legitimacy autonomy or simply land,"history,military history,civil war,american history,american civil war,nonfiction,north american hi...,american history,native americans",002914180X,0,3.52,5,between two fires: american indians in the civil war,33,
1,"charlotte_fiell,emmanuelle_dirix",fashion sourcebook 1920s is the first book in a brandnew series by fiell publishing that documents comprehensively the seasonal fashion styles of the 20th century decade by decade sumptuously illustrated with over 600 original photographs drawings and prints this title is a musthave reference work for not only students of fashion but for all fashionistas fashion sourcebook 1920s focuses on the art deco period with its beautiful beaded dresses cloche hats and tbar shoes as worn by the fashionable flappers and the bright young things of the time an accompanying introduction outlines the major themes within fashion during this period and introduces its most famous designers and assesses their creative contributions text in english french german also available fashion sourcebook 1930s isbn 9781906863586 2495,"couture,fashion,historical,art,nonfiction",1906863482,576,4.51,6,fashion sourcebook 1920s,41,
2,andy_anderson,the seminal history and analysis of the hungarian revolution and the workers councils perhaps the single most important revolutionary event ever and this is simply the best book on it,"politics,history",948984147,124,4.15,2,hungary 56,26,
3,carlotta_r._anderson,allamerican anarchist chronicles the life and work of joseph a labadie 18501933 detroits prominent labor organizer and one of early labors most influential activists a dynamic participant in the major social reform movements of the gilded age labadie was a central figure in the pervasive struggle for a new social order as the american midwest underwent rapid industrialization at the end of the 19th century this engaging biography follows labadies colorful career from a childhood among a pottawatomi tribe in the michigan woods through his local and national involvement in a maze of late 19thcentury labor and reform activities including participation in the socialist labor party knights of labor greenback movement trades councils typographical union eighthourday campaigns and the rise of the american federation of labor in writing this biography of her grandfather carlotta r anderson consulted the renowned labadie collection at the university of michigan a unique collection of protest literature which extensively documents pivotal times in american labor history and radical history,"labor,history",814327079,324,3.83,1,all-american anarchist: joseph a. labadie and the labor movement,6,
4,jean_leveille,aujourdâ€™hui lâ€™oiseau nous invite ã sa table table surprenante par sa diversitã© et son originalitã© tous initient leurs petits ã la vie gourmande en puisant dans un panier aux ressources immenses pour y parvenir lâ€™oiseau a modifiã© son anatomie sa morphologie mais surtout il a radicalement adaptã© son organisme ã ses choix par ses photos magnifiques et ses textes fascinants lâ€™auteur nous invite ã dã©couvrir les innombrables et subtiles facettes de lâ€™alimentation des oiseaux wwwjeanleveilleorg,,2761920813,177,4.0,1,les oiseaux gourmands,1,


### Keyword Extraction Using KeyBERT

In [23]:
# descriptions = books['desc'].tolist()
# descriptions[:10]

In [24]:
# kw_model = KeyBERT()

In [25]:
# for i in range(0, len(descriptions)):
#     keywords = kw_model.extract_keywords(descriptions[i], top_n = 10, keyphrase_ngram_range=(1, 1), stop_words="english")
#     keywords = " ".join([k[0] for k in keywords])
#     keywords_list.append(keywords)

## Keywords enhancement with authors and genre

In [26]:
keywords_list = pd.read_csv('/kaggle/input/goodreads-books-descriptions-keywords/keywords92730.csv')

In [27]:
books = books.reset_index().drop('index', axis=1)

In [28]:
books['keywords'] = keywords_list['keywords']
books.isna().sum()

author                  0
desc                    0
genre                   0
isbn                12341
pages                   0
rating                  0
reviews                 0
title                   0
totalratings            0
book_series_info    92414
keywords                1
dtype: int64

In [29]:
books.genre = books['genre'].apply(lambda text: ' '.join(list(set(text.replace(',', ' ').split()))))

In [30]:
# books.loc[books.genre == 'nan']['genre'] = ''
books.genre = books.genre.replace('nan', np.nan)

In [31]:
books.isna().sum()

author                  0
desc                    0
genre                7690
isbn                12341
pages                   0
rating                  0
reviews                 0
title                   0
totalratings            0
book_series_info    92414
keywords                1
dtype: int64

In [32]:
books = books[~books.keywords.isna()]

In [33]:
# books.keywords = books.keywords + ' ' + books.genre

In [34]:
# books.keywords = books.keywords + " " + books.author

In [35]:
# books.loc[books.book_series_info.isnull(), 'book_series_info'] = ''

In [36]:
# books.keywords = books.keywords + " " + books.book_series_info

In [37]:
books_processed = books[['title', 'genre', 'keywords']]
books_processed.head(10)

Unnamed: 0,title,genre,keywords
0,between two fires: american indians in the civil war,nonfiction history americans war hi... military american civil native north,indians war enlisted civil land thousand sides autonomy legitimacy affected
1,fashion sourcebook 1920s,nonfiction art couture fashion historical,fashion 1920s fashionistas fashionable 1930s designers dresses styles brandnew fiell
2,hungary 56,history politics,hungarian revolution revolutionary councils history book event workers important analysis
3,all-american anarchist: joseph a. labadie and the labor movement,history labor,anarchist 18501933 activists protest 19th allamerican 19thcentury labor socialist industrialization
4,les oiseaux gourmands,,table anatomie photos morphologie dans oiseau wwwjeanleveilleorg petits et leurs
5,the human equation: building profits by putting people first,nonfiction human business romance leadership resources management historical,managing management managers manage organizational profits organizations highperformance corporate pfeffer
6,competitive advantage through people: unleashing the power of the work force,business romance leadership management historical,strategic firms competitive pfeffer advantage management industry organizations industries success
7,hawaii: an uncommon history,history nonfiction,hawaiian islands history jacob knowledge adler know understanding new love
8,r101: a pictorial history,,airship titanic 1920s aircraft aviation flying photographs skies construction trenches
9,genuine happiness: meditation as the path to fulfillment,nonfiction buddhism psychology philosophy religion spirituality,meditation meditations meditationâ meditative buddhist blissdrawing happiness blissthis wisdom wellbeing


In [38]:
# Fill missing genres with a placeholder and split multiple genres
books_processed['genre'] = books_processed['genre'].fillna('Unknown')
books_processed['genre'] = books_processed['genre'].apply(lambda x: x.split() if x != 'Unknown' else [])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  books_processed['genre'] = books_processed['genre'].fillna('Unknown')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  books_processed['genre'] = books_processed['genre'].apply(lambda x: x.split() if x != 'Unknown' else [])


In [72]:
books_processed.to_csv('books_preprocessed.csv')

## Encoding Keywords Using Tfidf Vectors

In [46]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(stop_words='english', min_df=5, max_df=0.5, max_features=7000)
keywords_tfidf = tfidf.fit_transform(books_processed['keywords'])

In [47]:
keywords_tfidf.shape

(92729, 7000)

## Encoding Genres Using OneHot Vectors

In [41]:
from sklearn.preprocessing import MultiLabelBinarizer

mlb = MultiLabelBinarizer()
genres_onehot = mlb.fit_transform(books_processed['genre'])

In [48]:
genres_onehot.nbytes

811564208

## Creating Features from Tfidf and Genres

In [51]:
# Combine TF-IDF features and one-hot encoded genres
features = np.hstack((keywords_tfidf.toarray(), genres_onehot))

# Create a DataFrame for the combined features
feature_columns = tfidf.get_feature_names_out().tolist() + mlb.classes_.tolist()
feature_df = pd.DataFrame(features, columns=feature_columns)

# Display the first few rows of the combined feature DataFrame
print(feature_df.head())

    10   12  1776  1812  1850s  1860  1861  1862  1863  1864  1865  1880s  1900s  1912  1913  1914  1915  1916  1917  1918  1919  1920     1920s  1923  1925  1929     1930s  1931  1936  1937  1938  1939  1940  1940s  1941  1942  1943  1944  1945  1948  1949  1950s  1953  1957  1960  1960s  1961  1963  1965  1967  1970s  1973  1976  1980s  1984  1999      19th  2000  2001  2002  2003  2004  2005  2006   24   25   3d  911  aaron  abandon  abandoned  abandonment  abbess  abbey  abby  abducted  abduction  abigail  abilities  ability  aboard  abolition  abolitionist  abolitionists  aboriginal  abortion  abraham  abroad  abstract  abuse  abused  abusive  abyss  academic  academy  accessories  accident  accidents  acclaimed  accomplish  accomplishments  account  accountability  accountants  accounting  accounts  accused  ace  aceastäƒ  achieve  achievement  achieving  achilles  act  acting  action  actions  activism  activist  activists  activities  activity  actor  actors  actress  acts  ada

In [78]:
# 29845 86466 1477 39537 50950 39536 41582
books_processed[books_processed.title.str.contains('planet')].sample(10)

Unnamed: 0,title,genre,keywords,similarity
63197,doctor who and the planet of the spiders,"[fiction, who, media, in, tie, science, doctor, adventure, audiobook]",ships 2535 days usa,0.104985
49794,planet earth macmillan world atlas,[],atlas earth planet world features macmillan,0.015256
55856,lonely planet travel survival kit: central africa,[],congo gabon africa guinea african jungle canoe equatorial paddle gorilla,0.0
55408,our worlds: the magnetism and thrill of planetary exploration,[],planetary astrophysicist nasas astrophysical mars galileo venus asteroids solar explore,0.0
45847,storm warning: gambling with the climate of our planet,[],warming storms climate storm flooding warning weather greenhouse catastrophic severity,0.0
82759,groundwater for the 21st century: a primer for citizens of planet earth,"[science, geology]",groundwater freshwater wwwmwpubcocomtitlesgroundwaterhtm water contaminated literacy nature earth depleted fresh,0.108338
24571,conspiracy of the planet of the apes,"[fiction, science]",apes paintings illustrated chimpanzee illustrations gorilla astronaut 1968 archaias portraits,0.108338
11311,"wildly affordable organic: eat fabulous food, get healthy, and save the planet -- all on $5 a day or less","[nonfiction, cookbooks, food, reference, and, nutrition, drink, environment, sustainability, health, cooking]",meals sustainable organic meal eat lifestyle greener affordable grocery recipes,0.220843
1953,the compassionate diet: how what you eat can change your life and save the planet,"[nonfiction, spirituality, vegan, food, and, drink, activism, social, health, issues]",vegetarian diet vegetariana hunger compassionate compassion eat eating organic natureâ,0.200199
41146,"the man who planted trees: lost groves, champion trees, and an urgent plan to save the planet","[nonfiction, ecology, plants, history, science, biology, green, environment, gardening, nature, biography]",trees tree forests treesâ forest cloning treeâ planting planted redwoods,0.22501


## Getting Recommendations

In [90]:
# Example user ratings (book_id: rating)
user_ratings = {29845: 5, 41582: 5, 86466: 4, 1477: 3.4, 39537: 4.5, 50950: 4, 39536:3}

# Initialize an empty user profile
user_profile = np.zeros(features.shape[1]) #8049 dimensional vector

# Update user profile based on ratings
for book_id, rating in user_ratings.items():
    book_index = books_processed[books_processed.index == book_id].index[0]
    book_features = features[book_index]
    user_profile += rating * book_features

# Normalize the user profile
user_profile = user_profile / sum(user_ratings.values())

# Calculate similarity between user profile and each book
from sklearn.metrics.pairwise import cosine_similarity

similarities = cosine_similarity([user_profile], features)[0]

# Sort books based on similarity scores
books_processed['similarity'] = similarities
recommended_books = books_processed.sort_values(by='similarity', ascending=False).head(10)

recommended_books[['title', 'similarity']]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  books_processed['similarity'] = similarities


Unnamed: 0,title,similarity
74437,"the space book: from the beginning to the end of time, 250 milestones in the history of space astronomy",0.72684
84653,archives of the universe: 100 discoveries that transformed our understanding of the cosmos,0.718367
86101,the planet mars: a history of observation and discovery,0.716007
5243,the living cosmos: our search for life in the universe,0.714418
50950,destination mars: new explorations of the red planet,0.710974
965,mars: uncovering the secrets of the red planet,0.703985
39537,exoplanets and alien solar systems,0.699517
64091,stars: a very short introduction,0.698397
41156,how it began: a time-traveler's guide to the universe,0.692816
84495,lives of the planets: a natural history of the solar system,0.690249


## Recommendation Pipeline For Any User

In [82]:
user_profiles = {}  # Dictionary to store user profiles
user_ratings_data = {}  # Dictionary to store user ratings

def update_user_profile(user_id, book_id, rating):
    global user_profiles, user_ratings_data

    if user_id not in user_profiles:
        user_profiles[user_id] = np.zeros(features.shape[1])
        user_ratings_data[user_id] = {}
    
    # Update ratings
    user_ratings_data[user_id][book_id] = rating

    # Recompute user profile
    for book_id, rating in user_ratings_data[user_id].items():
        book_index = books_processed[books_processed.index == book_id].index[0]
        book_features = features[book_index]
        user_profiles[user_id] += rating * book_features
    
    # Normalize the user profile
    user_profiles[user_id] = user_profiles[user_id] / sum(user_ratings_data[user_id].values())

    
def get_recommendations(user_id, top_n=10):
    user_profile = user_profiles[user_id]
    similarities = cosine_similarity([user_profile], features)[0]
    
     # Exclude books the user has already rated
    rated_books = user_ratings_data[user_id].keys()
    recommendations = books_processed[~books_processed.index.isin(rated_books)]
    
    recommended_books = recommendations.sort_values(by='similarity', ascending=False).head(top_n)
    
    return recommended_books[['title', 'similarity']]

## Testing
Lets test our recommendation pipeline on astronomy books. Here are the book ids of popular astronomy and space related books
29845 86466 1477 39537 50950 39536 41582

In [91]:
# Example of adding a new rating and getting updated recommendations
update_user_profile(user_id=1, book_id=29845, rating=5)
update_user_profile(user_id=1, book_id=86466, rating=5)
update_user_profile(user_id=1, book_id=1477, rating=4)
update_user_profile(user_id=1, book_id=39537, rating=3.4)
update_user_profile(user_id=1, book_id=50950, rating=4.5)
update_user_profile(user_id=1, book_id=39536, rating=4)
update_user_profile(user_id=1, book_id=41582, rating=3)

recommendations = get_recommendations(user_id=1, top_n=10)
recommendations

Unnamed: 0,title,similarity
74437,"the space book: from the beginning to the end of time, 250 milestones in the history of space astronomy",0.72684
84653,archives of the universe: 100 discoveries that transformed our understanding of the cosmos,0.718367
86101,the planet mars: a history of observation and discovery,0.716007
5243,the living cosmos: our search for life in the universe,0.714418
965,mars: uncovering the secrets of the red planet,0.703985
64091,stars: a very short introduction,0.698397
41156,how it began: a time-traveler's guide to the universe,0.692816
84495,lives of the planets: a natural history of the solar system,0.690249
34540,an intimate look at the night sky,0.686368
84755,the cambridge guide to the solar system,0.685767


In [93]:
user_ratings_data

{1: {29845: 5, 86466: 5, 1477: 4, 39537: 3.4, 50950: 4.5, 39536: 4, 41582: 3}}

In [88]:
user_profiles

{1: array([0., 0., 0., ..., 0., 0., 0.])}

Our recommender is working exactly as expected. This brings us to the end of the project. If you liked this project. Please upvote this notebook and also share it with other ML Geeks out there.