<a href="https://colab.research.google.com/github/Jiyabisht/Book_Recommender/blob/main/code.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
# Install required libraries and handle the NumPy compatibility issue
!pip uninstall -y numpy
!pip install numpy==1.26.4
!pip install scikit-surprise

Found existing installation: numpy 1.26.4
Uninstalling numpy-1.26.4:
  Successfully uninstalled numpy-1.26.4
Collecting numpy==1.26.4
  Using cached numpy-1.26.4-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
Using cached numpy-1.26.4-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (18.0 MB)
Installing collected packages: numpy
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
opencv-python 4.12.0.88 requires numpy<2.3.0,>=2; python_version >= "3.9", but you have numpy 1.26.4 which is incompatible.
opencv-contrib-python 4.12.0.88 requires numpy<2.3.0,>=2; python_version >= "3.9", but you have numpy 1.26.4 which is incompatible.
opencv-python-headless 4.12.0.88 requires numpy<2.3.0,>=2; python_version >= "3.9", but you have numpy 1.26.4 which is incompatible.
thinc 8.3.6 requires numpy<3.0.0,>=2.0.0, but you have numpy 



In [2]:
import pandas as pd
from google.colab import drive

# Mount Google Drive
drive.mount('/content/drive')

# Define the full path to the single CSV file
file_path = '/content/drive/MyDrive/Colab Notebooks/book recom./BookCrossingThemes.csv'

# Load the single dataset file with the correct separator (semicolon)
df = pd.read_csv(file_path, sep=';', on_bad_lines='skip', encoding='latin-1')

print("DataFrame loaded successfully with correct columns!")
print(df.columns)
print(df.head())

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
DataFrame loaded successfully with correct columns!
Index(['Book-Title', 'Book-Author', 'User-ID', 'ISBN', 'Book-Rating',
       'Year-Of-Publication', 'Publisher', 'Location', 'Age', 'category',
       'description', 'num_words', 'num_chars', 'cleaned_description',
       'Theme'],
      dtype='object')
                                Book-Title       Book-Author  User-ID  \
0                         The Terminal Man  Michael Crichton   276964   
1                              The Chamber      John Grisham   276964   
2  The Girl Who Loved Tom Gordon : A Novel      Stephen King   276964   
3                              In the Dark    Richard Laymon   276964   
4                        Tailchaser's Song      Tad Williams   276964   

        ISBN  Book-Rating  Year-Of-Publication                Publisher  \
0  345354621           10                 1988     

In [3]:
# The loaded DataFrame from the previous step is named 'df'

# Clean up column names by stripping any leading or trailing whitespace
df.columns = df.columns.str.strip()

# Filter out users and books with a low number of interactions
user_counts = df['User-ID'].value_counts()
book_counts = df['Book-Title'].value_counts()
active_users = user_counts[user_counts > 100].index
popular_books = book_counts[book_counts > 50].index

filtered_df = df[df['User-ID'].isin(active_users)]
# Create a full copy to prevent the SettingWithCopyWarning
final_data = filtered_df[filtered_df['Book-Title'].isin(popular_books)].copy()

# Select only the columns needed for the collaborative filtering model
model_data = final_data[['User-ID', 'Book-Title', 'Book-Rating']]

print("Filtered Data:")
print(model_data.head())

# Save the filtered data to a CSV file for later use in the web app
final_data.to_csv('final_data.csv')

Filtered Data:
      User-ID                                         Book-Title  Book-Rating
5020    21014                                     The Bean Trees            8
5022    21014    Divine Secrets of the Ya-Ya Sisterhood: A Novel            8
5023    21014                           Prodigal Summer: A Novel            8
5035    21014  The Bad Beginning (A Series of Unfortunate Eve...            8
5036    21014      Fried Green Tomatoes at the Whistle Stop Cafe            8


In [4]:
!pip install scikit-surprise



In [5]:
from surprise import Dataset, Reader, SVD, dump
from surprise.model_selection import train_test_split # Correct function name
from surprise import accuracy

# The Reader class parses the DataFrame, specifying the rating scale.
reader = Reader(rating_scale=(1, 10))

# Load the DataFrame into the Surprise Dataset format
data = Dataset.load_from_df(model_data, reader)

# Split the data into a training set and a testing set
# CORRECTED LINE: Used 'train_test_split'
trainset, testset = train_test_split(data, test_size=0.25)

# Use the SVD algorithm, a popular matrix factorization technique.
algo = SVD()

# Train the model on the training data
algo.fit(trainset)

# Evaluate the model's performance on the test set
predictions = algo.test(testset)
print("Model accuracy (RMSE):")
accuracy.rmse(predictions)

# Save the trained model to a file for later use in the web app
dump.dump('book_recommender_model.pkl', algo=algo)
print("Collaborative filtering model saved.")

Model accuracy (RMSE):
RMSE: 1.3662
Collaborative filtering model saved.


In [6]:
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd

# Use the filtered data for this step
# This variable should be available from the previous steps
# A mapping from book title to its index in the DataFrame
indices = pd.Series(final_data.index, index=final_data['Book-Title']).drop_duplicates()

# Combine all content-based features into a single string for each book
features = ['Theme', 'category', 'Book-Author', 'description']
for feature in features:
    if feature in final_data.columns:
        final_data[feature] = final_data[feature].fillna('')

def create_combined_features(row):
    return ' '.join(str(row[f]) for f in features)

final_data['combined_features'] = final_data.apply(create_combined_features, axis=1)

# Create the TF-IDF vectorizer and calculate the cosine similarity matrix
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf_vectorizer.fit_transform(final_data['combined_features'])

# Calculate the cosine similarity matrix
cosine_sim_matrix = cosine_similarity(tfidf_matrix, tfidf_matrix)

print("Content-based filtering setup complete.")

Content-based filtering setup complete.


In [7]:
from surprise import Dataset, Reader, SVD, dump
from surprise.model_selection import train_test_split
from surprise import accuracy

# The Reader class parses the DataFrame, specifying the rating scale.
reader = Reader(rating_scale=(1, 10))

# Load the DataFrame into the Surprise Dataset format
data = Dataset.load_from_df(model_data, reader)

# Split the data into a training set and a testing set
trainset, testset = train_test_split(data, test_size=0.25)

# Use the SVD algorithm, a popular matrix factorization technique.
algo = SVD()

# Train the model on the training data
algo.fit(trainset)

# Evaluate the model's performance on the test set
predictions = algo.test(testset)
print("Model accuracy (RMSE):")
accuracy.rmse(predictions)

# Save the trained model to a file for later use in the web app
dump.dump('book_recommender_model.pkl', algo=algo)
print("Collaborative filtering model saved.")

Model accuracy (RMSE):
RMSE: 1.9423
Collaborative filtering model saved.


In [8]:
import pandas as pd
import numpy as np
from surprise import dump
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import os

# --- DEFINE YOUR FILE PATHS HERE ---
BOOK_THEMES_PATH = '/content/drive/MyDrive/Colab Notebooks/book recom./BookCrossingThemes.csv'
FINAL_DATA_PATH = 'final_data.csv'
MODEL_PATH = 'book_recommender_model.pkl'

# --- 1. Load Data and Model ---
print("--- Initializing Hybrid Recommender ---")
try:
    df = pd.read_csv(BOOK_THEMES_PATH, sep=';', on_bad_lines='skip', encoding='latin-1')
    final_data = pd.read_csv(FINAL_DATA_PATH, index_col=0)
    algo = dump.load(MODEL_PATH)[0]
    print("Files loaded successfully. Setting up content matrix...")
except FileNotFoundError as e:
    print(f"\nCRITICAL ERROR: File not found: {e.filename}.")
    print("Please ensure you have run the model training steps (Part 2) and that files are in the /content directory.")
    exit()

# --- 2. Content Matrix Setup (Memory Optimized) ---
df.columns = df.columns.str.strip()
final_data.columns = final_data.columns.str.strip()
indices = pd.Series(df.index, index=df['Book-Title']).drop_duplicates()
features = ['Theme', 'category', 'Book-Author', 'description']

for feature in features:
    if feature in df.columns:
        df[feature] = df[feature].fillna('')
def create_combined_features(row):
    return ' '.join(str(row[f]) for f in features)
df['combined_features'] = df.apply(create_combined_features, axis=1)

content_base = final_data[['Book-Title'] + features].drop_duplicates(subset=['Book-Title']).reset_index(drop=True).copy()
content_base.columns = content_base.columns.str.strip()
content_base['combined_features'] = content_base.apply(create_combined_features, axis=1)
indices_small = pd.Series(content_base.index, index=content_base['Book-Title']).drop_duplicates()

tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_features=5000, max_df=0.85)
tfidf_matrix_small = tfidf_vectorizer.fit_transform(content_base['combined_features'])
cosine_sim_matrix_small = cosine_similarity(tfidf_matrix_small, tfidf_matrix_small)
print("Content matrix setup complete.")


# --- 3. Hybrid Recommendation Function (Kept for completeness) ---
def hybrid_recommendations(user_id, num_recommendations=10, weight_collaborative=0.7, weight_content=0.3):
    try:
        all_books = df['Book-Title'].unique()
        user_rated_books = final_data[final_data['User-ID'] == user_id]['Book-Title'].tolist()

        recommendations = []
        for book_title in all_books:
            if book_title not in user_rated_books:
                collab_score = algo.predict(user_id, book_title).est
                content_score = 0
                if user_rated_books:
                    current_book_index = indices_small.get(book_title)

                    if current_book_index is not None:
                        rated_indices_small = [indices_small[title] for title in user_rated_books if title in indices_small]

                        if rated_indices_small and current_book_index < cosine_sim_matrix_small.shape[0]:
                            content_scores = [cosine_sim_matrix_small[current_book_index][idx] for idx in rated_indices_small if idx < cosine_sim_matrix_small.shape[1]]
                            content_score = np.mean(content_scores) if content_scores else 0

                hybrid_score = (weight_collaborative * collab_score) + (weight_content * content_score * 10)
                recommendations.append((book_title, hybrid_score))

        recommendations.sort(key=lambda x: x[1], reverse=True)
        return recommendations[:num_recommendations]
    except Exception as e:
        print(f"An error occurred during hybrid recommendation: {e}")
        return []

# --- NEW: Content-Based Recommendation by Title ---
def get_content_recommendations_by_title(title, num_recommendations=10):
    """Generates recommendations for a book title based on content similarity."""

    # Find the best matching title (case-insensitive/partial match for user friendliness)
    matches = [t for t in indices_small.index if title.lower().strip() in t.lower()]

    if not matches:
        return "Book not found in our popular list. Try another title."

    # Use the first best match found
    title = matches[0]
    print(f"-> Found best match: '{title}'")

    # Get the index of the book in the small matrix
    idx = indices_small[title]

    # Get the similarity scores for that book with all other books
    sim_scores = list(enumerate(cosine_sim_matrix_small[idx]))

    # Sort the scores in descending order
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # Get the scores of the top N books (excluding the book itself at index 0)
    sim_scores = sim_scores[1:num_recommendations + 1]

    # Get the corresponding book titles and similarity scores
    book_indices = [i[0] for i in sim_scores]
    similar_titles = content_base['Book-Title'].iloc[book_indices].tolist()
    scores = [score[1] for score in sim_scores]

    return list(zip(similar_titles, scores))


# --- 4. NEW Simple Command Line Interface (CLI) by Title ---
if __name__ == '__main__':
    while True:
        try:
            user_input = input("\nEnter Book Title (or 'quit' to exit, e.g., 'The Green Mile'): ")

            if user_input.lower() == 'quit':
                print("Exiting application. Goodbye!")
                break

            if not user_input.strip():
                continue

            print(f"\n--- Searching for books similar to '{user_input.strip()}' ---")
            recs = get_content_recommendations_by_title(user_input.strip())

            if isinstance(recs, str):
                print(f"ERROR: {recs}")
            elif recs:
                print(f"| Rank | Book Title | Similarity Score |")
                print(f"| :--- | :--- | :--- |")
                for i, (title, score) in enumerate(recs):
                    print(f"| {i+1} | {title} | {score:.4f} |")
            else:
                print("Could not generate recommendations for this title.")

        except Exception as e:
            print(f"An unexpected error occurred: {e}")


--- Initializing Hybrid Recommender ---
Files loaded successfully. Setting up content matrix...
Content matrix setup complete.

Enter Book Title (or 'quit' to exit, e.g., 'The Green Mile'): The Green Mile

--- Searching for books similar to 'The Green Mile' ---
-> Found best match: 'The Green Mile'
| Rank | Book Title | Similarity Score |
| :--- | :--- | :--- |
| 1 | Jurassic Park | 0.2112 |
| 2 | Dreamcatcher | 0.2036 |
| 3 | Desperation | 0.1659 |
| 4 | Pet Sematary | 0.1336 |
| 5 | Cause of Death | 0.1027 |
| 6 | A Heartbreaking Work of Staggering Genius | 0.0690 |
| 7 | The Perfect Storm : A True Story of Men Against the Sea | 0.0614 |
| 8 | The Hours: A Novel | 0.0527 |
| 9 | The Fellowship of the Ring (The Lord of the Rings, Part 1) | 0.0520 |
| 10 | Full House (Janet Evanovich's Full Series) | 0.0251 |

Enter Book Title (or 'quit' to exit, e.g., 'The Green Mile'): quit
Exiting application. Goodbye!


In [9]:
!pip install gradio



In [12]:
import pandas as pd
import numpy as np
import os
import gradio as gr
from surprise import dump
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import warnings

# Suppress warnings that clutter the interface output
warnings.filterwarnings('ignore')

# --- CONFIGURATION & FILE PATHS ---
BOOK_THEMES_PATH = '/content/drive/MyDrive/Colab Notebooks/book recom./BookCrossingThemes.csv'
FINAL_DATA_PATH = 'final_data.csv'
MODEL_PATH = 'book_recommender_model.pkl' # Model is loaded but not used in this specific content demo

# --- 1. Load Data and Model ---
print("--- Initializing Hybrid Recommender ---")
try:
    df = pd.read_csv(BOOK_THEMES_PATH, sep=';', on_bad_lines='skip', encoding='latin-1')
    final_data = pd.read_csv(FINAL_DATA_PATH, index_col=0)
    # Load model but suppress its output for a clean interface
    algo = dump.load(MODEL_PATH)[0]
    print("Files loaded successfully. Setting up content matrix...")
except FileNotFoundError as e:
    print(f"\nCRITICAL ERROR: File not found: {e.filename}.")
    print("Please ensure you have run the model training steps and that files are in the /content directory.")
    exit()

# --- 2. Content Matrix Setup (Memory Optimized) ---
df.columns = df.columns.str.strip()
final_data.columns = final_data.columns.str.strip()
indices = pd.Series(df.index, index=df['Book-Title']).drop_duplicates()
features = ['Theme', 'category', 'Book-Author', 'description']

for feature in features:
    if feature in df.columns:
        df[feature] = df[feature].fillna('')
def create_combined_features(row):
    return ' '.join(str(row[f]) for f in features)
df['combined_features'] = df.apply(create_combined_features, axis=1)

content_base = final_data[['Book-Title'] + features].drop_duplicates(subset=['Book-Title']).reset_index(drop=True).copy()
content_base.columns = content_base.columns.str.strip()
content_base['combined_features'] = content_base.apply(create_combined_features, axis=1)
indices_small = pd.Series(content_base.index, index=content_base['Book-Title']).drop_duplicates()

tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_features=5000, max_df=0.85)
tfidf_matrix_small = tfidf_vectorizer.fit_transform(content_base['combined_features'])
cosine_sim_matrix_small = cosine_similarity(tfidf_matrix_small, tfidf_matrix_small)
print("Content matrix setup complete. Model ready for Gradio.")


# --- 3. Gradio Interface Function ---
def get_recommendations_for_gradio(title):
    """
    Generates recommendations and formats output for Gradio.
    This function is wrapped around the core logic.
    """
    if not title:
        return pd.DataFrame()

    # --- Core Content-Based Logic (from previous steps) ---
    title = title.strip()

    # Find the best matching title (case-insensitive/partial match for user friendliness)
    matches = [t for t in indices_small.index if title.lower() in t.lower()]

    if not matches:
        return pd.DataFrame({"Result": ["Book not found in our popular list."]})

    # Use the first best match found
    match_title = matches[0]

    # Get the index of the book in the small matrix
    idx = indices_small[match_title]

    # Get the similarity scores for that book with all other books
    sim_scores = list(enumerate(cosine_sim_matrix_small[idx]))

    # Sort the scores in descending order
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # Get the scores of the top 10 books (excluding the book itself at index 0)
    sim_scores = sim_scores[1:11]

    # Get the corresponding book titles and similarity scores
    book_indices = [i[0] for i in sim_scores]
    similar_titles = content_base['Book-Title'].iloc[book_indices].tolist()
    scores = [score[1] for score in sim_scores]

    # --- Format output for Gradio ---
    results_df = pd.DataFrame({
        "#": range(1, len(similar_titles) + 1), # Changed "Rank" to "#"
        "Title": similar_titles,
        "Similarity Score": [f"{s:.4f}" for s in scores],
        "Author": content_base['Book-Author'].iloc[book_indices].tolist()
    })

    # Add a header string for display
    header_df = pd.DataFrame([{"#": f"Recommended based on match: '{match_title}'", # Changed "Rank" to "#"
                               "Title": "", "Similarity Score": "", "Author": ""}],
                               columns=results_df.columns)

    return pd.concat([header_df, results_df], ignore_index=True)


# --- 4. Launch Gradio Interface ---

if __name__ == '__main__':
    # Define interface components
    input_text = gr.Textbox(label="Enter Book Title", placeholder="e.g., The Green Mile, Harry Potter, or a keyword like 'pirates'")
    # Updated headers to use '#'
    output_table = gr.Dataframe(label="Top 10 Content-Similar Books", headers=["#", "Title", "Similarity Score", "Author"], row_count=(11, 'fixed'), wrap=True)

    # Create the interface block
    iface = gr.Interface(
        fn=get_recommendations_for_gradio,
        inputs=input_text,
        outputs=output_table,
        title="📚 Simple Book Content Recommender",
        description="This demo uses TF-IDF and Cosine Similarity (Content-Based Filtering) on themes, categories, and descriptions to find similar books. The final Hybrid model logic is integrated in the application structure."
    )

    # Launch the interface, which creates a public link usable in Colab
    iface.launch(share=True)


--- Initializing Hybrid Recommender ---
Files loaded successfully. Setting up content matrix...
Content matrix setup complete. Model ready for Gradio.
Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://82678b034ca783722c.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)
