In [1]:
# Import necessary libraries
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer
import numpy as np
from sentence_transformers import SentenceTransformer
import pickle
import warnings
warnings.filterwarnings("ignore")

  from .autonotebook import tqdm as notebook_tqdm





In [2]:
# load data set do eda
df=pd.read_csv("books_summary.csv",index_col=None)
print(df.columns)
df=df[['book_name', 'summaries', 'categories']]
print(df.columns)
print(df.shape)

Index(['book_name', 'summaries', 'categories'], dtype='object')
Index(['book_name', 'summaries', 'categories'], dtype='object')
(5201, 3)


EDA

In [3]:
# Dropping duplicates
df=df.drop_duplicates()
df.shape

(4979, 3)

In [4]:
#Checking null values
df.isna().sum()

book_name     0
summaries     7
categories    0
dtype: int64

In [5]:
temp_df=df[["book_name","summaries"]]
temp_df=temp_df.drop_duplicates()
temp_df.shape

(1231, 2)

In [6]:
# Checing different type of categories and book names
for col in df.columns:
    print(col)
    print(df[col].nunique())

book_name
1231
summaries
1227
categories
23


** EDA Finding : there are same books in multiple categories so iam gonna grp by and aggregate all categories for a book by comma in between them

In [7]:
grouped_df=df.groupby(['book_name', 'summaries'])['categories'].apply(', '.join).reset_index()
print(grouped_df.shape)
grouped_df.head()

(1230, 3)


Unnamed: 0,book_name,summaries,categories
0,"Outer Order, Inner Calm",gives you advice to declutter your space and ...,"happiness, productivity, psychology, health, w..."
1,The Book,is a spiritual exploration of true human natur...,mindfulness
2,#GIRLBOSS,shows that even an unconventional life can le...,"motivation, business, creativity, work"
3,10 Days To Faster Reading,helps you bring your reading skills to the cu...,"productivity, psychology, education"
4,10% Happier,"gives skeptics an easy “in” to meditation, by...","happiness, psychology, mindfulness"


# My approach for Content-Based Book Recommendation:



Step 1:  Preprocessing: Clean the dataset by removing duplicates, handling missing values, and combining relevant textual features (e.g., book summaries and categories) into a single column.

Step 2: Vectorization: Convert the textual data into numerical representations using methods like TF-IDF, CountVectorizer, or BERT embeddings to capture content features.

Step 3: Similarity Calculation: Compute the cosine similarity between the input book's vector and the vectors of all other books in the dataset.

Step 4: Recommendation: Identify and rank the top 5 most similar books based on the highest similarity scores, ensuring the input book itself is excluded.

# Step 1: Preprocessing_data

Step-1 : Adding book name ,summary and category into one text column it will help to build a much more accurate recommender system after Renaming columns 

Step-2 : Removing double spaces , converting to lower and removing stopwords

In [8]:
def preprocess_data(data):
    """
    Preprocess the dataset by handling missing values and combining columns.
    - data: DataFrame, raw dataset.
    - Returns: DataFrame, preprocessed dataset with a 'content' column.
    """
    # Renaming columns
    data.rename(columns={"summaries":"summary","categories":"category"},inplace=True)
    data = data.dropna(subset=['summary','category'])  # Remove rows with missing data
    data = data.groupby(['book_name', 'summary'])['category'].apply(', '.join).reset_index()
    # Combine summary and category and book_name as content and embeddings for this is generated at once
    data['content'] = data['book_name']+" "+data['summary'] +" "+data['category']
    print("Preprocessing data is complete")
    return data

In [9]:
df=preprocess_data(df)

Preprocessing data is complete


# Step 2: Vectors, Embeddings

Tried count or tfidf vectorizer and bert ebeddings to see which one performs better and gives better recommendations based on content

In [10]:
# 1. Count Vectorizer
def compute_count_vectorizer(data, column='content'):
    """
    Compute the Count Vectorizer representation for the dataset.
    - data: DataFrame, preprocessed dataset.
    - column: str, name of the column to vectorize.
    - Returns: tuple, (Count Vectorizer matrix, fitted CountVectorizer object).
    """
    vectorizer = CountVectorizer(stop_words='english')
    count_matrix = vectorizer.fit_transform(data[column])
    return count_matrix, vectorizer

# 2. TF-IDF Vectorizer
def compute_tfidf_vectorizer(data, column='content'):
    """
    Compute the TF-IDF representation for the dataset.
    - data: DataFrame, preprocessed dataset.
    - column: str, name of the column to vectorize.
    - Returns: tuple, (TF-IDF matrix, fitted TfidfVectorizer object).
    """
    vectorizer = TfidfVectorizer(stop_words='english')
    tfidf_matrix = vectorizer.fit_transform(data[column])
    return tfidf_matrix, vectorizer

# 3. BERT Embeddings
def compute_bert_embeddings(data, column='content', model_name='all-MiniLM-L6-v2'):
    """
    Compute BERT embeddings for the dataset using SentenceTransformer.
    - data: DataFrame, preprocessed dataset.
    - column: str, name of the column to encode.
    - model_name: str, name of the SentenceTransformer model.
    - Returns: numpy.ndarray, BERT embeddings.
    """
    model = SentenceTransformer(model_name)
    embeddings = model.encode(data[column].tolist(), show_progress_bar=True)
    return np.array(embeddings), model


In [11]:
count_matrix, count_vectorizer=compute_count_vectorizer(df)
tfidf_matrix, tfidf_vectorizer=compute_tfidf_vectorizer(df)
# bert  
bert_embeddings, bert_model=compute_bert_embeddings(df)


Batches: 100%|██████████| 39/39 [00:17<00:00,  2.24it/s]


# Step 3: Cosine Similarity and top 5 recommendations

Now to calculate similarity score, I did a bit of research on which are the best methods for the same. I found several techniques like euclidean, the Pearson and the cosine similarity scores. I will be using cosine similarity scores to find the similarity between two movies
One advantage of cosine similarity score is that it is independent of magnitude and is relatively easy and fast to calculate.

In [22]:

def recommend_books_vectorizer(book_title, data, vectorizer_output, vectorizer, vectorizer_type='TF-IDF'):
    """
    Recommend books using the specified vectorizer output.
    - book_title: str, title of the book entered by the user.
    - data: DataFrame, preprocessed dataset.
    - vectorizer_output: array or sparse matrix, vectorized representation of the content.
    - vectorizer: the fitted vectorizer used on the dataset.
    - vectorizer_type: str, type of vectorizer ('Count', 'TF-IDF', 'BERT').
    - Returns: list, top 5 recommended book titles.
    """
    # Vectorize the input book title using the already fitted vectorizer
    if vectorizer_type == 'BERT':
        input_vec = vectorizer.encode([book_title])
    else:
        input_vec = vectorizer.transform([book_title])

    # Compute cosine similarity
    cosine_sim = cosine_similarity(input_vec, vectorizer_output).flatten()

    # Get top 5 recommendations, excluding the input book itself
    similar_indices = cosine_sim.argsort()[-6:-1][::-1]  # Top 6 books to ensure we exclude the input book itself

    # Ensure the recommendations are distinct
    recommendations = []
    for idx in similar_indices:
        if data.iloc[idx]['book_name'].lower() != book_title.lower():
            recommendations.append(data.iloc[idx]['book_name'])
        if len(recommendations) == 5:  # Only take the first 5 distinct books
            break

    print(recommendations)  # Optional: Print to debug
    # return recommendations

Testing how similar the top5 books are 

In [23]:
# example 1
x="How to make friends"
print("count")
recommend_books_vectorizer(book_title=x,data=df,vectorizer_output=count_matrix,vectorizer=count_vectorizer)
print("tfidf")
recommend_books_vectorizer(book_title=x,data=df,vectorizer_output=tfidf_matrix,vectorizer=tfidf_vectorizer)
print("bert")
recommend_books_vectorizer(book_title=x,data=df,vectorizer_output=bert_embeddings,vectorizer=bert_model, vectorizer_type='BERT')

count
['Brandwashed', 'Everything Is Obvious', 'The Art Of Choosing', 'The Mom Test', 'Journey of Awakening']
tfidf
['The Mom Test', 'How To Win Friends And Influence People', 'The Art Of Asking', 'The Data Detective', 'Brandwashed']
bert
['How To Talk To Anyone', 'How to Talk to Anyone, Anytime, Anywhere', 'The Fine Art Of Small Talk', 'Survival Of The Friendliest', 'Talking To Strangers']


In [24]:
# example 2
x="Factfulness: Ten Reasons We're Wrong About the World"
print("count")
recommend_books_vectorizer(book_title=x,data=df,vectorizer_output=count_matrix,vectorizer=count_vectorizer)
print("tfidf")
recommend_books_vectorizer(book_title=x,data=df,vectorizer_output=tfidf_matrix,vectorizer=tfidf_vectorizer)
print("bert")
recommend_books_vectorizer(book_title=x,data=df,vectorizer_output=bert_embeddings,vectorizer=bert_model, vectorizer_type='BERT')

count
['How Not To Be Wrong', 'The Man Who Fed The World', 'Capitalism', 'Prisoners Of Geography', 'The Facebook Effect']
tfidf
['Factfulness', 'The Joy Of Movement', 'Why Nations Fail', 'The Antidote', 'Happier']
bert
['Merchants of Doubt', 'The Uninhabitable Earth', 'Mistakes Were Made, But Not By Me', 'When Bad Things Happen To Good People', 'One Decision']


In [25]:
# example 3
x="End of a journey"
print("count")
recommend_books_vectorizer(book_title=x,data=df,vectorizer_output=count_matrix,vectorizer=count_vectorizer)
print("tfidf")
recommend_books_vectorizer(book_title=x,data=df,vectorizer_output=tfidf_matrix,vectorizer=tfidf_vectorizer)
print("bert")
recommend_books_vectorizer(book_title=x,data=df,vectorizer_output=bert_embeddings,vectorizer=bert_model, vectorizer_type='BERT')

count
['Storyworthy', 'The Fine Art Of Small Talk', 'A Beginner’s Guide To The End', 'Loving What Is', 'Finding My Virginity']
tfidf
['Storyworthy', 'Loving What Is', 'The Fine Art Of Small Talk', 'A Beginner’s Guide To The End', 'Finding My Virginity']
bert
['The Art Of Travel', 'Never Finished', 'Necessary Endings', 'Ending Aging', 'The Last Lecture']


In [26]:
# Example 4
x="Outliers: The Story of Success"
print("count")
recommend_books_vectorizer(book_title=x,data=df,vectorizer_output=count_matrix,vectorizer=count_vectorizer)
print("tfidf")
recommend_books_vectorizer(book_title=x,data=df,vectorizer_output=tfidf_matrix,vectorizer=tfidf_vectorizer)
print("bert")
recommend_books_vectorizer(book_title=x,data=df,vectorizer_output=bert_embeddings,vectorizer=bert_model, vectorizer_type='BERT')

count
['Outliers', 'The Man Who Solved The Market', 'Predictable Success', 'Titan', 'Lessons from the Titans']
tfidf
['A Beautiful Mind', 'Lessons from the Titans', 'Titan', 'The Man Who Solved The Market', 'Predictable Success']
bert
['Good To Great', 'Predictable Success', 'The Rise', 'Winners Take All', 'The Innovator’s Dilemma']


In [27]:
# example 5
x="Sapiens: A Brief History of Humankind"
print("count")
recommend_books_vectorizer(book_title=x,data=df,vectorizer_output=count_matrix,vectorizer=count_vectorizer)
print("tfidf")
recommend_books_vectorizer(book_title=x,data=df,vectorizer_output=tfidf_matrix,vectorizer=tfidf_vectorizer)
print("bert")
recommend_books_vectorizer(book_title=x,data=df,vectorizer_output=bert_embeddings,vectorizer=bert_model, vectorizer_type='BERT')

count
['Brief Answers To The Big Questions', 'Chernobyl', 'A Brief History Of Time', 'The Lessons Of History', 'The Better Angels Of Our Nature']
tfidf
['A Brief History Of Everyone Who Ever Lived', 'A Brief History Of Time', 'On The Origin Of Species', 'Seven Brief Lessons On Physics', 'The Hero With a Thousand Faces']
bert
['A Brief History Of Everyone Who Ever Lived', 'The Social Leap', 'Homo Deus', 'The Lessons Of History', 'The World Until Yesterday']


In [28]:
# example 6
x="The Courage to Be Disliked"
print("count")
recommend_books_vectorizer(book_title=x,data=df,vectorizer_output=count_matrix,vectorizer=count_vectorizer)
print("tfidf")
recommend_books_vectorizer(book_title=x,data=df,vectorizer_output=tfidf_matrix,vectorizer=tfidf_vectorizer)
print("bert")
recommend_books_vectorizer(book_title=x,data=df,vectorizer_output=bert_embeddings,vectorizer=bert_model, vectorizer_type='BERT')

count
['Lives of the Stoics', 'Daring Greatly', 'Imperfect Courage', 'The Compound Effect', 'Big Magic']
tfidf
['Daring Greatly', 'Big Magic', 'Imperfect Courage', 'Lives of the Stoics', 'The Compound Effect']
bert
['The Courage Habit', 'Courage Is Calling', 'Brave', 'Be Fearless', 'The Road to Character']


In [29]:
# example 6
x="The Courage to Be Disiked"
print("count")
recommend_books_vectorizer(book_title=x,data=df,vectorizer_output=count_matrix,vectorizer=count_vectorizer)
print("tfidf")
recommend_books_vectorizer(book_title=x,data=df,vectorizer_output=tfidf_matrix,vectorizer=tfidf_vectorizer)
print("bert")
recommend_books_vectorizer(book_title=x,data=df,vectorizer_output=bert_embeddings,vectorizer=bert_model, vectorizer_type='BERT')

count
['Daring Greatly', 'Imperfect Courage', 'The Compound Effect', 'Big Magic', 'The Courage Habit']
tfidf
['Big Magic', 'Imperfect Courage', 'Lives of the Stoics', 'The Compound Effect', 'The Courage Habit']
bert
['The Courage Habit', 'Daring Greatly', 'Dare To Lead', 'Be Fearless', 'The Execution Factor']


# Final Reasoning

After testing for 7 examples from file and other new book names , i have noticed better results from bert as u can see from above cell tf idf, count also performed very well but i finalized with bert becoz


Reason 1: BERT Gets semantic context right and gives similar books based on context instead of getting unique or highest freuqnecy words and 

Reason 2: Pre-trained Knowledge,Leverages rich embeddings trained on large datasets.(example 5 for input sapiens breif history of time others didnot input homodeus book which contains matches exactly with input but bert did)
 

# Final vectorizer - BERT

Saving model and embeddings in pickle file 
(Reduces latency by instead of generating embeddings and loadin model everytime user inputs)


In [17]:
def compute_and_store_embeddings(data, model_name="all-MiniLM-L6-v2", output_file="book_embeddings.pkl"):
    """
    Compute BERT embeddings and store them alongside book names.
    - data: DataFrame, preprocessed dataset.
    - model_name: str, name of the SentenceTransformer model.
    - output_file: str, path to store embeddings and book names.
    - Returns: dict, mapping of book names to their embeddings.
    """
    model = SentenceTransformer(model_name)
    embeddings = model.encode(data["content"].tolist(), show_progress_bar=True)
    embedding_dict = {data.iloc[i]["book_name"]: embeddings[i] for i in range(len(data))}
    
    # Save to file
    with open(output_file, "wb") as file:
        pickle.dump(embedding_dict, file)

    pickle.dump(model, open("bert_model.pkl", "wb"))  # Save the BERT model
    print("Precomputed embeddings and model saved!")
    
    return embedding_dict, model

In [None]:
# compute_and_store_embeddings(df)

Batches: 100%|██████████| 39/39 [00:17<00:00,  2.25it/s]


Precomputed embeddings and model saved!


({' Outer Order, Inner Calm': array([ 4.81784232e-02, -4.56361137e-02,  8.52670744e-02,  6.01411574e-02,
          3.81252803e-02, -3.55789736e-02,  2.31547523e-02, -6.08980358e-02,
          2.88387537e-02,  9.79044288e-03,  4.93386984e-02,  3.29411365e-02,
         -2.97105927e-02, -8.58163387e-02,  4.68594916e-02,  1.50180059e-02,
          4.63182591e-02, -9.90799069e-03, -8.95929411e-02, -1.16825346e-02,
         -4.96215932e-02, -2.12809294e-02, -3.73211429e-02,  1.62679069e-02,
         -9.16831568e-02,  7.40778074e-02, -5.20297736e-02, -3.84528976e-04,
          1.00931853e-01, -5.07958159e-02,  3.88944149e-02,  7.12155923e-02,
          5.96567281e-02,  1.17291138e-02, -2.51658615e-02,  2.93798838e-02,
          5.04935300e-03,  5.43966796e-03,  7.22206160e-02, -4.66279984e-02,
         -2.23912057e-02,  1.20066646e-02,  1.69134215e-02, -3.89036350e-02,
         -1.73102859e-02, -5.75706922e-02, -1.74035672e-02, -6.94119334e-02,
         -3.44116800e-03, -1.07398428e-01, -9.93

In [19]:
#Done