In [1]:
import pandas as pd
import numpy as np

# For plotting purposes
import matplotlib.pyplot as plt
import seaborn as sns

# RegEx and String Manipulation
import re
import string

# TF-IDF Vectorization
from sklearn.feature_extraction.text import TfidfVectorizer

# Cosine Similarity
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# Display configuration. Every option is addressed by its full "display.*" key:
# a bare "max_columns" is resolved by pattern matching and, on pandas versions
# that also define "styler.render.max_columns", raises OptionError because the
# pattern matches more than one key.
pd.set_option("display.max_columns", None)  # show all cols
pd.set_option("display.max_colwidth", None)  # show full width of showing cols
pd.set_option("display.expand_frame_repr", False)  # print cols side by side as it's supposed to be
pd.set_option("display.max_seq_items", 200000)
pd.set_option("display.max_rows", 400000)

In [2]:
import os

# Print the full path of every file found under the Kaggle input directory.
for folder, _, names in os.walk('/kaggle/input'):
    for path in (os.path.join(folder, name) for name in names):
        print(path)

/kaggle/input/goodreads-books-100k/GoodReads_100k_books.csv


In [3]:
# Load the GoodReads 100k CSV from the Kaggle input directory into a DataFrame.
books = pd.read_csv(
    '/kaggle/input/goodreads-books-100k/GoodReads_100k_books.csv'
)

In [4]:
# Preview the first rows to check how the file was parsed.
books.head()

Unnamed: 0,author,bookformat,desc,genre,img,isbn,isbn13,link,pages,rating,reviews,title,totalratings
0,Laurence M. Hauptman,Hardcover,Reveals that several hundred thousand Indians ...,"History,Military History,Civil War,American Hi...",https://i.gr-assets.com/images/S/compressed.ph...,002914180X,9780000000000.0,https://goodreads.com/book/show/1001053.Betwee...,0,3.52,5,Between Two Fires: American Indians in the Civ...,33
1,"Charlotte Fiell,Emmanuelle Dirix",Paperback,Fashion Sourcebook - 1920s is the first book i...,"Couture,Fashion,Historical,Art,Nonfiction",https://i.gr-assets.com/images/S/compressed.ph...,1906863482,9780000000000.0,https://goodreads.com/book/show/10010552-fashi...,576,4.51,6,Fashion Sourcebook 1920s,41
2,Andy Anderson,Paperback,The seminal history and analysis of the Hungar...,"Politics,History",https://i.gr-assets.com/images/S/compressed.ph...,948984147,9780000000000.0,https://goodreads.com/book/show/1001077.Hungar...,124,4.15,2,Hungary 56,26
3,Carlotta R. Anderson,Hardcover,"""All-American Anarchist"" chronicles the life a...","Labor,History",https://i.gr-assets.com/images/S/compressed.ph...,814327079,9780000000000.0,https://goodreads.com/book/show/1001079.All_Am...,324,3.83,1,All-American Anarchist: Joseph A. Labadie and ...,6
4,Jean Leveille,,"Aujourdâ€™hui, lâ€™oiseau nous invite Ã sa ta...",,https://i.gr-assets.com/images/S/compressed.ph...,2761920813,,https://goodreads.com/book/show/10010880-les-o...,177,4.0,1,Les oiseaux gourmands,1


In [5]:
# Column dtypes and non-null counts for the raw frame.
books.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 13 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   author        100000 non-null  object 
 1   bookformat    96772 non-null   object 
 2   desc          93228 non-null   object 
 3   genre         89533 non-null   object 
 4   img           96955 non-null   object 
 5   isbn          85518 non-null   object 
 6   isbn13        88565 non-null   object 
 7   link          100000 non-null  object 
 8   pages         100000 non-null  int64  
 9   rating        100000 non-null  float64
 10  reviews       100000 non-null  int64  
 11  title         99999 non-null   object 
 12  totalratings  100000 non-null  int64  
dtypes: float64(1), int64(3), object(9)
memory usage: 9.9+ MB


### Filtering Columns

In [9]:
# Keep only the columns of interest. The explicit .copy() makes `books` an
# independent frame, so later in-place edits and column assignments do not act
# on a selection of the originally loaded DataFrame (the usual source of
# SettingWithCopyWarning).
keep_cols = ['author', 'desc', 'genre', 'isbn', 'pages', 'rating', 'reviews', 'title', 'totalratings']
books = books[keep_cols].copy()
books.head()

Unnamed: 0,author,desc,genre,isbn,pages,rating,reviews,title,totalratings
0,Laurence M. Hauptman,Reveals that several hundred thousand Indians ...,"History,Military History,Civil War,American Hi...",002914180X,0,3.52,5,Between Two Fires: American Indians in the Civ...,33
1,"Charlotte Fiell,Emmanuelle Dirix",Fashion Sourcebook - 1920s is the first book i...,"Couture,Fashion,Historical,Art,Nonfiction",1906863482,576,4.51,6,Fashion Sourcebook 1920s,41
2,Andy Anderson,The seminal history and analysis of the Hungar...,"Politics,History",948984147,124,4.15,2,Hungary 56,26
3,Carlotta R. Anderson,"""All-American Anarchist"" chronicles the life a...","Labor,History",814327079,324,3.83,1,All-American Anarchist: Joseph A. Labadie and ...,6
4,Jean Leveille,"Aujourdâ€™hui, lâ€™oiseau nous invite Ã sa ta...",,2761920813,177,4.0,1,Les oiseaux gourmands,1


In [10]:
# (rows, columns) of the frame after the column filter.
books.shape

(100000, 9)

## Data Cleaning

In [11]:
# Number of missing values in each column.
books.isna().sum()

author              0
desc             6772
genre           10467
isbn            14482
pages               0
rating              0
reviews             0
title               1
totalratings        0
dtype: int64

### Removing Books with no Description

In [12]:
# Drop rows that have no description. The result is rebound to `books` rather
# than using inplace=True: `books` was itself produced by column selection, and
# mutating such a frame in place is a common trigger for SettingWithCopyWarning.
books = books.dropna(subset=['desc'])

### Remove Punctuation from the Descriptions

In [21]:
punctuations = string.punctuation
# Translation table that deletes every character in `punctuations`. It is built
# once here instead of on every call, because the function is applied row by row.
_PUNCTUATION_TABLE = str.maketrans('', '', punctuations)


def remove_punctuations(text):
    """Return ``text`` with every ``string.punctuation`` character deleted.

    Characters are removed, not replaced by a space, so "all-american"
    becomes "allamerican". Characters outside ``string.punctuation``
    (for example curly quotes) are left untouched.
    """
    return text.translate(_PUNCTUATION_TABLE)

# Strip URLs first, then punctuation. The order matters: once ':', '/' and '.'
# have been deleted, "https://..." and "www...." can no longer be recognised
# as URLs, and the link text would stay in the description as one long token.
books.desc = (
    books.desc
    .str.replace(r'https?://\S+|www\.\S+', '', regex=True)
    .apply(remove_punctuations)
)

### Remove URLs from the description

In [32]:
# Matches http:// or https:// links and bare www. links, up to the next whitespace.
url_pattern = re.compile(r'https?://\S+|www\.\S+')


def remove_url(text):
    """Return ``text`` with every substring matched by ``url_pattern`` deleted."""
    return url_pattern.sub('', text)

# Run the URL scrubber over every description.
books['desc'] = books['desc'].map(remove_url)

### Strip leading/trailing whitespace from the text columns and convert them to lowercase

In [33]:
# Normalise the free-text columns: lowercase and trim surrounding whitespace.
# Missing values are filled with an empty string first; a bare astype(str)
# would turn them into the literal text "nan", which downstream text processing
# would treat as a real word. Every value stays a string either way.
text_cols = ["title", "author", "desc", "genre"]
books[text_cols] = books[text_cols].apply(
    lambda col: col.fillna("").astype(str).str.lower().str.strip()
)
books.head()

Unnamed: 0,author,desc,genre,isbn,pages,rating,reviews,title,totalratings
0,laurence m. hauptman,reveals that several hundred thousand indians ...,"history,military history,civil war,american hi...",002914180X,0,3.52,5,between two fires: american indians in the civ...,33
1,"charlotte fiell,emmanuelle dirix",fashion sourcebook 1920s is the first book in...,"couture,fashion,historical,art,nonfiction",1906863482,576,4.51,6,fashion sourcebook 1920s,41
2,andy anderson,the seminal history and analysis of the hungar...,"politics,history",948984147,124,4.15,2,hungary 56,26
3,carlotta r. anderson,allamerican anarchist chronicles the life and ...,"labor,history",814327079,324,3.83,1,all-american anarchist: joseph a. labadie and ...,6
4,jean leveille,aujourdâ€™hui lâ€™oiseau nous invite ã sa tab...,,2761920813,177,4.0,1,les oiseaux gourmands,1
