In [1]:
import pandas as pd

# Read the raw books CSV into a DataFrame
df = pd.read_csv(filepath_or_buffer='../data/books.csv')

# Report the frame's dimensions, then preview its first five rows
print(f"Shape: {df.shape}")
df.head(n=5)


Shape: (10000, 30)


Unnamed: 0.1,Unnamed: 0,index,authors,average_rating,best_book_id,book_id,books_count,description,genres,goodreads_book_id,...,ratings_3,ratings_4,ratings_5,ratings_count,small_image_url,title,work_id,work_ratings_count,work_text_reviews_count,authors_2
0,0,0,['Suzanne Collins'],4.34,2767052,1,272,WINNING MEANS FAME AND FORTUNE.LOSING MEANS CE...,"['young-adult', 'fiction', 'fantasy', 'science...",2767052,...,560092,1481305,2706317,4780653,https://images.gr-assets.com/books/1447303603s...,"The Hunger Games (The Hunger Games, #1)",2792775,4942365,155254,['Suzanne Collins']
1,1,1,"['J.K. Rowling', 'Mary GrandPré']",4.44,3,2,491,Harry Potter's life is miserable. His parents ...,"['fantasy', 'fiction', 'young-adult', 'classics']",3,...,455024,1156318,3011543,4602479,https://images.gr-assets.com/books/1474154022s...,Harry Potter and the Sorcerer's Stone (Harry P...,4640799,4800065,75867,"['J.K. Rowling', 'Mary GrandPré']"
2,2,2,['Stephenie Meyer'],3.57,41865,3,226,About three things I was absolutely positive.\...,"['young-adult', 'fantasy', 'romance', 'fiction...",41865,...,793319,875073,1355439,3866839,https://images.gr-assets.com/books/1361039443s...,"Twilight (Twilight, #1)",3212258,3916824,95009,['Stephenie Meyer']
3,3,3,['Harper Lee'],4.25,2657,4,487,The unforgettable novel of a childhood in a sl...,"['classics', 'fiction', 'historical-fiction', ...",2657,...,446835,1001952,1714267,3198671,https://images.gr-assets.com/books/1361975680s...,To Kill a Mockingbird,3275794,3340896,72586,['Harper Lee']
4,4,4,['F. Scott Fitzgerald'],3.89,4671,5,1356,Alternate Cover Edition ISBN: 0743273567 (ISBN...,"['classics', 'fiction', 'historical-fiction', ...",4671,...,606158,936012,947718,2683664,https://images.gr-assets.com/books/1490528560s...,The Great Gatsby,245494,2773745,51992,['F. Scott Fitzgerald']


In [2]:
# Single pipeline: discard rows lacking a description, narrow to the columns
# used downstream, keep the first row per title, and renumber rows from 0.
df = (
    df.dropna(subset=['description'])
      .loc[:, ['index', 'title', 'authors', 'average_rating', 'ratings_count', 'description', 'genres']]
      .drop_duplicates(subset='title')
      .reset_index(drop=True)
)

print(f"Shape: {df.shape}")
df.head(n=5)

Shape: (9909, 7)


Unnamed: 0,index,title,authors,average_rating,ratings_count,description,genres
0,0,"The Hunger Games (The Hunger Games, #1)",['Suzanne Collins'],4.34,4780653,WINNING MEANS FAME AND FORTUNE.LOSING MEANS CE...,"['young-adult', 'fiction', 'fantasy', 'science..."
1,1,Harry Potter and the Sorcerer's Stone (Harry P...,"['J.K. Rowling', 'Mary GrandPré']",4.44,4602479,Harry Potter's life is miserable. His parents ...,"['fantasy', 'fiction', 'young-adult', 'classics']"
2,2,"Twilight (Twilight, #1)",['Stephenie Meyer'],3.57,3866839,About three things I was absolutely positive.\...,"['young-adult', 'fantasy', 'romance', 'fiction..."
3,3,To Kill a Mockingbird,['Harper Lee'],4.25,3198671,The unforgettable novel of a childhood in a sl...,"['classics', 'fiction', 'historical-fiction', ..."
4,4,The Great Gatsby,['F. Scott Fitzgerald'],3.89,2683664,Alternate Cover Edition ISBN: 0743273567 (ISBN...,"['classics', 'fiction', 'historical-fiction', ..."


In [3]:
# Flatten descriptions onto one line: every newline becomes a single space
# (literal replacement, not a regex).
df = df.assign(description=df['description'].str.replace('\n', ' ', regex=False))

def clean_string(x):
    """Turn a stringified list such as "['a', 'b']" into plain text "a, b".

    When ``x`` is the literal representation of a list of strings, it is
    parsed and its items are joined with ", ". Parsing (rather than deleting
    every single quote) keeps apostrophes that belong to the values, e.g.
    ``["Bill O'Reilly"]`` becomes ``Bill O'Reilly``.

    Args:
        x: Value to clean. Non-string values (e.g. NaN) are returned unchanged.

    Returns:
        The cleaned string, or ``x`` itself when it is not a string. Strings
        that are not a parseable list of strings fall back to the legacy
        behaviour: leading/trailing brackets stripped and all single quotes
        removed.
    """
    import ast  # local import keeps this helper self-contained

    if not isinstance(x, str):
        return x

    try:
        parsed = ast.literal_eval(x)
    except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
        # Not a Python literal (ordinary free text) -> use the legacy path.
        parsed = None

    if isinstance(parsed, list) and all(isinstance(item, str) for item in parsed):
        return ", ".join(parsed)

    # Legacy fallback for anything that is not a clean list of strings.
    return x.strip("[]").replace("'", "")

# How many times the genre string is repeated inside 'text'.
GENRE_WEIGHT = 5

# clean_string returns non-string values (e.g. NaN) unchanged; replace those
# with '' so the concatenation below cannot yield a missing 'text' value.
df['authors'] = df['authors'].apply(clean_string).fillna('')
df['genres'] = df['genres'].apply(clean_string).fillna('')

# Recreate clean text column: description, authors, then the repeated genres
df['text'] = (
    df['description'] + " " + df['authors'] + " " + (df['genres'] + " ") * GENRE_WEIGHT
)

# Fail here rather than later if any row still lacks text
assert df['text'].notna().all(), "'text' contains missing values"

df.to_csv('../data/cleaned_books.csv', index=False)

In [4]:
# Reload the cleaned dataset written by the previous step
df = pd.read_csv(filepath_or_buffer='../data/cleaned_books.csv')
print(df.shape)

# Preview only the title and the combined text column
df.loc[:, ['title', 'text']].head(n=5)

(9909, 8)


Unnamed: 0,title,text
0,"The Hunger Games (The Hunger Games, #1)",WINNING MEANS FAME AND FORTUNE.LOSING MEANS CE...
1,Harry Potter and the Sorcerer's Stone (Harry P...,Harry Potter's life is miserable. His parents ...
2,"Twilight (Twilight, #1)",About three things I was absolutely positive. ...
3,To Kill a Mockingbird,The unforgettable novel of a childhood in a sl...
4,The Great Gatsby,Alternate Cover Edition ISBN: 0743273567 (ISBN...


In [6]:
import re
from rapidfuzz import fuzz, process

def extract_series_name(title):
    """Extract the series name from a title like 'Book Title (Series Name, #2)'.

    The volume designator after ``#`` may be a single number (``#2``), a
    range (``#1-3``) or a decimal (``#0.5``).

    Args:
        title: Book title. Non-string values (e.g. NaN) are treated as having
            no series.

    Returns:
        The series name, stripped of surrounding whitespace and lower-cased,
        or None when no series marker is found.
    """
    if not isinstance(title, str):
        return None

    # "(" + series name (no commas) + "," + optional space + "#<num>" with
    # optional ".<num>" / "-<num>" continuations + ")".
    match = re.search(r'\(([^,]+),\s?#\d+(?:[.\-]\d+)*\)', title)
    if match is None:
        return None
    return match.group(1).strip().lower()

In [16]:
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Instantiate the 'all-MiniLM-L6-v2' sentence-transformer model
model = SentenceTransformer('all-MiniLM-L6-v2')

# Encode every row's combined text and persist the resulting array
corpus = df['text'].tolist()
book_embeddings = model.encode(corpus, show_progress_bar=True)
np.save(file='../models/embeddings.npy', arr=book_embeddings)

# Cosine similarity of the embeddings against themselves, also persisted
semantic_sim = cosine_similarity(book_embeddings)
np.save(file='../models/semantic_sim_matrix.npy', arr=semantic_sim)

Batches: 100%|██████████| 310/310 [00:44<00:00,  6.93it/s]


In [17]:
from rapidfuzz import fuzz, process

def recommend_semantic_fuzzy(user_input_title, df, sim_matrix, top_n=5, score_threshold=60,
                             exclusion_keywords=None):
    """Recommend titles similar to the book that best fuzzy-matches the query.

    The query is fuzzy-matched against ``df['title']`` using
    ``fuzz.token_set_ratio``. The matched book's row of ``sim_matrix`` is then
    walked from highest to lowest score, skipping the matched title itself,
    titles containing an exclusion keyword, and titles whose series name
    equals the matched book's series name.

    Args:
        user_input_title: Title text to look up.
        df: DataFrame with a ``title`` column. Row *positions* are used to
            index ``sim_matrix``, so the two must be aligned positionally
            (the DataFrame's index labels are not used).
        sim_matrix: Similarity scores indexable as ``sim_matrix[i][j]``.
        top_n: Maximum number of recommendations. ``None`` returns every
            eligible title; zero or a negative value returns an empty list.
        score_threshold: Minimum fuzzy-match score needed to accept a match.
        exclusion_keywords: Optional iterable of substrings; a candidate whose
            lower-cased title contains any of them is skipped. Defaults to
            the built-in list of box-set / guide / parody style keywords.

    Returns:
        A list of recommended titles, or a one-element list containing a
        "No close match found" message when no title reaches
        ``score_threshold``.
    """
    if exclusion_keywords is None:
        exclusion_keywords = ['boxset', 'box set', 'guide', 'parody', 'authors on', 'world of']
    # Candidates are compared lower-cased, so normalise the keywords too.
    exclusion_keywords = [kw.lower() for kw in exclusion_keywords]

    titles = df['title'].tolist()
    best_match = process.extractOne(user_input_title, titles, scorer=fuzz.token_set_ratio)

    if best_match is None or best_match[1] < score_threshold:
        return [f"No close match found for: '{user_input_title}'"]

    # For a list of choices extractOne yields (choice, score, list position);
    # using that position keeps the lookup correct even when df's index is
    # not the default 0..n-1 range.
    matched_title, matched_pos = best_match[0], best_match[2]
    print(f"Best match: {matched_title}")

    if top_n is not None and top_n <= 0:
        return []

    original_series = extract_series_name(matched_title)

    # Positions ordered by descending similarity to the matched book
    # (stable sort: ties keep their original relative order).
    row = sim_matrix[matched_pos]
    ranked_positions = sorted(range(len(row)), key=lambda pos: row[pos], reverse=True)

    recommendations = []
    for pos in ranked_positions:
        candidate_title = titles[pos]
        # Missing / non-text titles cannot be recommended or keyword-checked.
        if not isinstance(candidate_title, str):
            continue
        if candidate_title == matched_title:
            continue
        lowered = candidate_title.lower()
        if any(kw in lowered for kw in exclusion_keywords):
            continue
        if original_series and extract_series_name(candidate_title) == original_series:
            continue

        recommendations.append(candidate_title)
        if top_n is not None and len(recommendations) >= top_n:
            break

    return recommendations

In [18]:
# Example query: lower-case input is resolved to a title by fuzzy matching
recommend_semantic_fuzzy("to kill a mockingbird", df=df, sim_matrix=semantic_sim)

Best match: To Kill a Mockingbird


['Go Set a Watchman',
 'Gilead (Gilead, #1)',
 'The Grapes of Wrath',
 'Ellen Foster',
 'Beloved']