In [8]:
import re
import csv
from tqdm import tqdm
import pandas as pd
import json

# Loading the Dataset

In [9]:
data = []

with open("booksummaries.txt", 'r') as f:
    reader = csv.reader(f, dialect='excel-tab')
    for row in tqdm(reader):
        data.append(row)

16559it [00:01, 14785.74it/s]


In [10]:
#storing the data in a dataframe 
# Initialize empty lists to store the data
book_id = []
book_name = []
summary = []
genre = []
# Iterate over the rows in the data

for i in tqdm(data):
  # Extract the information for each column and store it in the corresponding list
    book_id.append(i[0])
    book_name.append(i[2])
    genre.append(i[5])
    summary.append(i[6])

# Create a Pandas DataFrame from the lists
books = pd.DataFrame({'book_id': book_id, 'book_name': book_name,
                       'genre': genre, 'summary': summary})
books.head(2)

100%|██████████| 16559/16559 [00:00<00:00, 608371.19it/s]


Unnamed: 0,book_id,book_name,genre,summary
0,620,Animal Farm,"{""/m/016lj8"": ""Roman \u00e0 clef"", ""/m/06nbt"":...","Old Major, the old boar on the Manor Farm, ca..."
1,843,A Clockwork Orange,"{""/m/06n90"": ""Science Fiction"", ""/m/0l67h"": ""N...","Alex, a teenager living in near-future Englan..."


In [12]:
#to check for duplicates
books.duplicated().sum() 

0

In [13]:
# to check for null values
books.isnull().sum()

book_id      0
book_name    0
genre        0
summary      0
dtype: int64

In [11]:
# preprocessing
import re
import nltk
nltk.download('omw-1.4')
nltk.download('punkt')
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

nltk.download('stopwords')
nltk.download('wordnet')
# Initialize the lemmatizer
lemmatizer = WordNetLemmatizer()

# Initialize the stop words
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    # Convert the input text to string
    text = str(text)
    
    # Convert to lowercase
    text = text.lower()
    
    # Remove special characters and digits
    text = re.sub(r'[^a-zA-Z]', ' ', text)
    
    # Tokenize the text
    tokens = nltk.word_tokenize(text)
    
    # Remove stop words
    tokens = [word for word in tokens if word not in stop_words]
    
    # Lemmatize the tokens
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    
    # Return the processed text as a string
    return " ".join(tokens)

def preprocess_dataframe(df, column_name):
    df[column_name] = df[column_name].apply(preprocess_text)
    return df

# Apply the preprocess_dataframe function to the books DataFrame

books_df = preprocess_dataframe(books, 'book_name')
books_df = preprocess_dataframe(books, 'genre')
books_df = preprocess_dataframe(books, 'summary')
# Display the first five rows of the processed dataframe
books_df.head()

[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Unnamed: 0,book_id,book_name,genre,summary
0,620,animal farm,lj roman u e clef nbt satire dwly child litera...,old major old boar manor farm call animal farm...
1,843,clockwork orange,n science fiction l h novella dfn speculative ...,alex teenager living near future england lead ...
2,986,plague,existentialism xlf fiction pym absurdist ficti...,text plague divided five part town oran thousa...
3,1756,enquiry concerning human understanding,,argument enquiry proceeds series incremental s...
4,2080,fire upon deep,lrw hard science fiction n science fiction dfn...,novel posit space around milky way divided con...


In [12]:
books_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16559 entries, 0 to 16558
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   book_id    16559 non-null  object
 1   book_name  16559 non-null  object
 2   genre      16559 non-null  object
 3   summary    16559 non-null  object
dtypes: object(4)
memory usage: 517.6+ KB


In [13]:
#placing the entries in the summary and genre columns in a column called book info
books_df["book_info"] = books_df["summary"] + " " + books_df["genre"] 
#deleting the summary and genre columns
books_df.drop(['summary','genre'],inplace=True, axis=1)
books_df.sample(3)

Unnamed: 0,book_id,book_name,book_info
1047,423607,valley moon,novel valley moon story working class couple b...
13403,20798854,handle,erica timperley city girl love motorcycle bore...
2394,1337192,fence,focus wilson attention fence troy year old hea...


In [14]:
#dropping the book id column
books_df.drop(['book_id'],inplace=True, axis=1)

In [16]:
# vectorizing the book info column using TFidf Vectorizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity


tf = TfidfVectorizer(analyzer = "word", ngram_range=(1,2), min_df=0, stop_words='english')

tfidf_matrix = tf.fit_transform(books_df['book_info'])

cosine_sim =  cosine_similarity(tfidf_matrix, tfidf_matrix)

In [17]:
print(cosine_sim)

[[1.         0.00852435 0.00695293 ... 0.00413351 0.0016602  0.00621277]
 [0.00852435 1.         0.01304514 ... 0.00442298 0.00266182 0.00762438]
 [0.00695293 0.01304514 1.         ... 0.00626294 0.00404595 0.01404262]
 ...
 [0.00413351 0.00442298 0.00626294 ... 1.         0.00259177 0.00937035]
 [0.0016602  0.00266182 0.00404595 ... 0.00259177 1.         0.00300258]
 [0.00621277 0.00762438 0.01404262 ... 0.00937035 0.00300258 1.        ]]


In [20]:
indices = pd.Series(books_df['book_name'])
indices[:5]

0                               animal farm
1                          clockwork orange
2                                    plague
3    enquiry concerning human understanding
4                            fire upon deep
Name: book_name, dtype: object

In [21]:
def recommend(title, cosine_sim = cosine_sim):
    if title not in indices.values:
        return "Title not found in the database."
    recommended_books = []
    idx = indices[indices == title].index[0]   # to get the index of book name matching the input book_name
    score_series = pd.Series(cosine_sim[idx]).sort_values(ascending = False)   # similarity scores in descending order
    top_10_indices = list(score_series.iloc[1:11].index)   # to get the indices of top 10 most similar books
    # [1:11] to exclude 0 (index 0 is the input book itself)
    
    for i in top_10_indices:   # to append the titles of top 10 similar booksto the recommended_books list
        recommended_books.append(list(books_df['book_name'])[i])
        
    return recommended_books

In [22]:
recommend('plague')

['gregor curse warmbloods',
 'octagonal raven',
 'sleepy',
 'year wonder',
 'fablehaven grip shadow plague',
 'vampire plague london',
 'death winter',
 'white plague',
 'forest mage',
 'plague lord ruel']

In [24]:
recommend('the stand')

'Title not found in the database.'

In [25]:
recommend('fire upon deep')

['death glory',
 'jesus incident',
 'marrow',
 'singularity sky',
 'deepness sky',
 'roadside picnic',
 'marriage zone three four five',
 'voyage space beagle',
 'stroke night',
 'mote god eye']