In [35]:
import os
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Folder (relative to the working directory) that the loading cell joins with
# the CSV file names to read the input data.
DATASET_DIR = 'dataset'
# Folder (relative to the working directory) where generated artefacts such as
# the cosine-similarity matrix are written.
MODEL_DIR = 'model_assets'

In [48]:
# --- Load the lean dataset used to build the recommender (mandatory) ---------
# A missing file surfaces as FileNotFoundError (not FileExistsError), so that is
# what must be caught for the hint below to ever be printed. The exception is
# re-raised because nothing downstream can run without this frame.
try:
    df_recommendation = pd.read_csv(os.path.join(DATASET_DIR, 'books_cleaned.csv'))
except FileNotFoundError:
    print("Dataset not found. Please ensure the dataset is in the correct directory.")
    raise
print("Dataset loaded Successfully.")

# --- Load the full metadata used to decorate recommendations (optional) ------
# When the file is absent the notebook keeps going with df_full_metadata = None.
try:
    df_full_metadata = pd.read_csv(os.path.join(DATASET_DIR, "books.csv"))
    print("Full Metadata Dataset loaded successfully.")
except FileNotFoundError:
    print("Error: Full metadata dataset file not found.")
    df_full_metadata = None

print(f"\nTotal Number of books (rows): {len(df_recommendation)}")
print("-"*40)

Dataset loaded Successfully.
Full Metadata Dataset loaded successfully.

Total Number of books (rows): 6810
----------------------------------------


In [49]:
# Configure the TF-IDF vectorizer:
#   * max_df=0.85  -> drop terms present in more than 85% of the books
#   * min_df=5     -> keep only terms present in at least 5 books
#   * max_features -> cap the vocabulary at the 8000 highest-frequency terms
#   * stop_words   -> remove common English stop words
tdidf = TfidfVectorizer(
    max_df=0.85,
    min_df=5,
    max_features=8000,
    stop_words='english',
)

print("starting TF-IDF vectorization...")
# Learn the vocabulary and transform the combined text column in one pass.
tfidf_matrix = tdidf.fit_transform(df_recommendation.loc[:, 'combined_features'])

print(f"TF-IDF matrix shape: {tfidf_matrix.shape}")

starting TF-IDF vectorization...
TF-IDF matrix shape: (6810, 8000)


In [50]:
# Pairwise cosine similarity between every pair of books (dense n x n array).
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

print("cosine similarity matrix shape: ", cosine_sim.shape)

# np.save does not create missing parent directories, so make sure the output
# folder exists before writing (no-op when it is already there).
os.makedirs(MODEL_DIR, exist_ok=True)

output_similarity_path = os.path.join(MODEL_DIR, "cosine_sim.npy")
np.save(output_similarity_path, cosine_sim)
print(f"Cosine similarity matrix saved to {output_similarity_path}")

cosine similarity matrix shape:  (6810, 6810)
Cosine similarity matrix saved to model_assets\cosine_sim.npy


In [51]:
def get_recommendations(title, cosine_sim, df_lean, df_full, N=10, print_output=True):
    """Return the ``N`` books most similar to ``title``.

    Parameters
    ----------
    title : str
        Exact title to look up in ``df_lean['title']``. If the title occurs
        more than once, the first occurrence is used as the query book.
    cosine_sim : 2-D array-like
        Square similarity matrix whose row/column ``i`` corresponds to the
        ``i``-th row (by position) of ``df_lean``.
    df_lean : pandas.DataFrame
        Frame the similarity matrix was built from; must contain ``title``
        and ``isbn13`` columns.
    df_full : pandas.DataFrame or None
        Metadata frame keyed by ``isbn13`` providing the display columns.
        When ``None``, ``df_lean`` is used instead and any display column it
        lacks is left empty.
    N : int, default 10
        Maximum number of recommendations to return.
    print_output : bool, default True
        When true, also print a human-readable listing.

    Returns
    -------
    pandas.DataFrame
        One row per recommendation, ordered by descending similarity, with
        ``isbn13``, ``Similarity Score`` and the display columns. An empty
        ``DataFrame`` is returned when ``title`` is not found.
    """
    display_cols = [
        'title', 'subtitle', 'authors',
        'categories', 'published_year', 'average_rating'
    ]

    # Look the title up by *position*: the similarity matrix and .iloc are both
    # positional, so index labels must not be used here.
    title_mask = df_lean['title'].eq(title).fillna(False).to_numpy(dtype=bool)
    positions = np.flatnonzero(title_mask)

    # Check if the given title exists in the dataset
    if positions.size == 0:
        print(f"Error: Book title '{title}' not found.")
        return pd.DataFrame()

    # Duplicate titles are resolved to their first occurrence.
    idx = int(positions[0])

    # Rank every book by similarity (stable sort keeps dataset order on ties),
    # then drop the query book explicitly instead of assuming it sorts first.
    scores = np.asarray(cosine_sim[idx]).ravel()
    ranked = np.argsort(-scores, kind='stable')
    top = ranked[ranked != idx][:max(int(N), 0)]

    recs = df_lean.iloc[top][['isbn13']].copy()
    recs['Similarity Score'] = scores[top]

    # Pick the metadata source; fall back to the lean frame when the full
    # metadata could not be loaded, padding missing display columns with NaN.
    if df_full is None:
        metadata = df_lean.reindex(columns=['isbn13'] + display_cols)
    else:
        metadata = df_full[['isbn13'] + display_cols]

    # Remove duplicate ISBNs so the left merge cannot multiply rows.
    metadata = metadata.drop_duplicates('isbn13')

    final_df = recs.merge(
        metadata,
        on='isbn13',
        how='left'
    ).fillna({
        'subtitle': '',
        'authors': '',
        'categories': ''
    })

    if print_output:
        print(f"\nTop {len(final_df)} Recommendations for: **{title}**")
        print("-" * 50)

        for rank, (_, row) in enumerate(final_df.iterrows(), start=1):
            print(f"{rank}. **{row['title']}**")

            # Print subtitle only if available
            if row['subtitle']:
                print(f"    ({row['subtitle']})")

            # Clean category formatting and show authors; str() guards against
            # non-string category values.
            categories_cleaned = str(row['categories']).strip('[]\"\'')
            print(f"    {categories_cleaned}, by {row['authors']}")

            # Show rating only if present
            if not pd.isna(row['average_rating']):
                print(f"    ⭐ {row['average_rating']:.2f}")

            # Show similarity score
            print(f"    (Similarity: {row['Similarity Score']:.4f})\n")

    # Return final recommendations as a clean DataFrame
    return final_df.reset_index(drop=True)

In [52]:
# Example: fetch and print the ten closest matches for a single title.
recommedation_df = get_recommendations(
    "The One Tree",
    cosine_sim,
    df_recommendation,
    df_full_metadata,
    N=10,
    print_output=True,
)


Top 10 Recommendations for: **The One Tree**
--------------------------------------------------
1. **A Dark and Hungry God Arises**
    (The Gap Into Power)
    Fiction, by Stephen R. Donaldson
    ⭐ 4.08
    (Similarity: 0.4668)

2. **Reave the Just and Other Tales**
    Fiction, by Stephen R. Donaldson
    ⭐ 3.90
    (Similarity: 0.4494)

3. **Forbidden Knowledge**
    (The Gap Into Vision)
    Fiction, by Stephen R. Donaldson
    ⭐ 3.99
    (Similarity: 0.3960)

4. **Crime Novels**
    (American Noir of the 1930s and 40s)
    American fiction, by 
    ⭐ 4.36
    (Similarity: 0.3168)

5. **The Bachman Books**
    (Four Early Novels)
    Fiction, by Stephen King
    ⭐ 4.11
    (Similarity: 0.2698)

6. **The Gap Into Madness**
    (Chaos and Order)
    Hyland, Morn (Fictitious character), by Stephen R. Donaldson
    ⭐ 4.15
    (Similarity: 0.2498)

7. **Assassin's Apprentice**
    American fiction, by Robin Hobb
    ⭐ 4.15
    (Similarity: 0.2407)

8. **Best Science Fiction Stories of