In [2]:
# Cell 1: Import Libraries
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors
import pickle as pkl

In [3]:
print("Loading Books data...")
Books = pd.read_csv("Books.csv", on_bad_lines="skip")
Books = Books[["ISBN","Book-Title","Book-Author","Year-Of-Publication","Publisher","Image-URL-L"]]

Loading Books data...


  Books = pd.read_csv("Books.csv", on_bad_lines="skip")


In [4]:
# Rename columns
Books.rename(columns={
    "Book-Title": "Title",
    "Book-Author": "Author", 
    "Year-Of-Publication": "Year",
    "Publisher": "Publisher",
    "Image-URL-L": "Image_URL"
}, inplace=True)

print(f"Loaded {len(Books)} books")
print("Sample books data:")
print(Books.head())

# Cell 3: Load Users Data
print("Loading Users data...")
Users = pd.read_csv("Users.csv", on_bad_lines="skip")
print(f"Loaded {len(Users)} users")
print("Sample users data:")
print(Users.head())

# Cell 4: Load Ratings Data
print("Loading Ratings data...")
ratings = pd.read_csv("Ratings.csv", on_bad_lines="skip")
print(f"Loaded {len(ratings)} ratings")
print("Sample ratings data:")
print(ratings.head())

Loaded 271360 books
Sample books data:
         ISBN                                              Title  \
0  0195153448                                Classical Mythology   
1  0002005018                                       Clara Callan   
2  0060973129                               Decision in Normandy   
3  0374157065  Flu: The Story of the Great Influenza Pandemic...   
4  0393045218                             The Mummies of Urumchi   

                 Author  Year                   Publisher  \
0    Mark P. O. Morford  2002     Oxford University Press   
1  Richard Bruce Wright  2001       HarperFlamingo Canada   
2          Carlo D'Este  1991             HarperPerennial   
3      Gina Bari Kolata  1999        Farrar Straus Giroux   
4       E. J. W. Barber  1999  W. W. Norton &amp; Company   

                                           Image_URL  
0  http://images.amazon.com/images/P/0195153448.0...  
1  http://images.amazon.com/images/P/0002005018.0...  
2  http://images.ama

In [5]:
# Cell 5: Filter Active Users (>200 ratings)
print("Filtering active users (>200 ratings)...")
user_rating_counts = ratings["User-ID"].value_counts()
print(f"Users with >200 ratings: {sum(user_rating_counts > 200)}")

active_users = user_rating_counts[user_rating_counts > 200].index
ratings_filtered = ratings[ratings["User-ID"].isin(active_users)]

print(f"Original ratings: {len(ratings)}")
print(f"Filtered ratings: {len(ratings_filtered)}")
print(f"Reduction: {len(ratings) - len(ratings_filtered)} ratings removed")


Filtering active users (>200 ratings)...
Users with >200 ratings: 899
Original ratings: 1149780
Filtered ratings: 526356
Reduction: 623424 ratings removed


In [6]:
# Cell 6: Merge Books and Ratings
print("Merging books and ratings...")
ratings_with_books = ratings_filtered.merge(Books, on="ISBN")
print(f"Merged dataset has {len(ratings_with_books)} entries")
print("Sample merged data:")
print(ratings_with_books.head())

# Cell 7: Count Ratings Per Book
print("Counting ratings per book...")
number_ratings = ratings_with_books.groupby("Title")["Book-Rating"].count().reset_index()
number_ratings.rename(columns={"Book-Rating":"num_of_rating"}, inplace=True)

print("Books with most ratings:")
print(number_ratings.sort_values('num_of_rating', ascending=False).head(10))

Merging books and ratings...
Merged dataset has 487671 entries
Sample merged data:
   User-ID        ISBN  Book-Rating  \
0   277427  002542730X           10   
1   277427  0026217457            0   
2   277427  003008685X            8   
3   277427  0030615321            0   
4   277427  0060002050            0   

                                               Title                 Author  \
0  Politically Correct Bedtime Stories: Modern Ta...      James Finn Garner   
1                 Vegetarian Times Complete Cookbook             Lucy  Moll   
2                                           Pioneers  James Fenimore Cooper   
3   Ask for May, Settle for June (A Doonesbury book)          G. B. Trudeau   
4                  On a Wicked Dawn (Cynster Novels)      Stephanie Laurens   

   Year                  Publisher  \
0  1994  John Wiley &amp; Sons Inc   
1  1995      John Wiley &amp; Sons   
2  1974           Thomson Learning   
3  1982        Henry Holt &amp; Co   
4  2002          

In [8]:
# Cell 8: Filter Books with At Least 50 Ratings
print("Merging with rating counts...")
final_ratings = ratings_with_books.merge(number_ratings, on="Title")

print("Filtering books with at least 50 ratings...")
final_ratings = final_ratings[final_ratings["num_of_rating"] >= 50]

print(f"Books with >=50 ratings: {final_ratings['Title'].nunique()}")
print(f"Final dataset entries: {len(final_ratings)}")

# Cell 9: Remove Duplicates
print("Removing duplicates...")
print(f"Before removing duplicates: {len(final_ratings)}")

final_ratings.drop_duplicates(subset=["Title", "User-ID"], inplace=True)

print(f"After removing duplicates: {len(final_ratings)}")
print(f"Unique books: {final_ratings['Title'].nunique()}")
print(f"Unique users: {final_ratings['User-ID'].nunique()}")

Merging with rating counts...
Filtering books with at least 50 ratings...
Books with >=50 ratings: 742
Final dataset entries: 61853
Removing duplicates...
Before removing duplicates: 61853
After removing duplicates: 59850
Unique books: 742
Unique users: 888


In [9]:
# Cell 10: Create Pivot Table
print("Creating pivot table...")
book_pivot = final_ratings.pivot_table(
    columns="User-ID", 
    index="Title", 
    values="Book-Rating"
)

print(f"Pivot table shape: {book_pivot.shape}")
print(f"Books (rows): {book_pivot.shape[0]}")
print(f"Users (columns): {book_pivot.shape[1]}")
print(f"Sparsity: {(book_pivot.isna().sum().sum() / (book_pivot.shape[0] * book_pivot.shape[1])) * 100:.2f}%")

# Cell 11: Prepare Data for Model Training
print("Preparing data for model training...")

# Fill NaN values with 0
book_pivot_filled = book_pivot.fillna(0)

# Convert to sparse matrix for memory efficiency
print("Converting to sparse matrix...")
book_sparse = csr_matrix(book_pivot_filled.values)

print(f"Sparse matrix shape: {book_sparse.shape}")
print(f"Non-zero elements: {book_sparse.nnz}")
print(f"Density: {book_sparse.nnz / (book_sparse.shape[0] * book_sparse.shape[1]) * 100:.4f}%")

Creating pivot table...
Pivot table shape: (742, 888)
Books (rows): 742
Users (columns): 888
Sparsity: 90.92%
Preparing data for model training...
Converting to sparse matrix...
Sparse matrix shape: (742, 888)
Non-zero elements: 14961
Density: 2.2706%


In [10]:
# Cell 12: Train KNN Model
print("Training KNN model...")
model = NearestNeighbors(algorithm="brute", metric="cosine")
model.fit(book_sparse)

print("KNN model trained successfully!")
print(f"Model metric: {model.metric}")
print(f"Model algorithm: {model.algorithm}")

# Cell 13: Save Model and Data
print("Saving model and data...")

# Save all components
pkl.dump(model, open("model.pkl", "wb"))
pkl.dump(book_pivot.index, open("books_name.pkl", "wb"))
pkl.dump(final_ratings, open("final_ratings.pkl", "wb"))
pkl.dump(book_pivot, open("book_pivot.pkl", "wb"))

print("All files saved successfully!")
print("Saved files:")
print("- model.pkl (trained KNN model)")
print("- books_name.pkl (book titles)")
print("- final_ratings.pkl (processed ratings data)")
print("- book_pivot.pkl (pivot table)")


Training KNN model...
KNN model trained successfully!
Model metric: cosine
Model algorithm: brute
Saving model and data...
All files saved successfully!
Saved files:
- model.pkl (trained KNN model)
- books_name.pkl (book titles)
- final_ratings.pkl (processed ratings data)
- book_pivot.pkl (pivot table)


In [11]:
# Cell 14: Define Recommendation Function
def recommend_book(book_name, model, book_pivot, n_recommendations=5):
    """
    Get book recommendations based on collaborative filtering
    
    Parameters:
    book_name (str): Name of the book to get recommendations for
    model: Trained KNN model
    book_pivot: Pivot table with books and ratings
    n_recommendations (int): Number of recommendations to return
    
    Returns:
    list: List of recommended book titles
    """
    try:
        # Find the book's row index in the pivot table
        book_id = np.where(book_pivot.index == book_name)[0][0]
    except IndexError:
        return [f"Book '{book_name}' not found in database."]
    
    # Create query vector (filled with 0 for missing values)
    query = book_pivot.iloc[book_id, :].fillna(0).values.reshape(1, -1)

    # Get nearest neighbors (n_recommendations + 1 to exclude the input book)
    distances, indices = model.kneighbors(query, n_neighbors=n_recommendations + 1)

    # Collect recommended books (excluding the input book itself)
    recommendations = []
    for i in range(1, len(indices[0])):  # Start from 1 to skip the input book
        book_title = book_pivot.index[indices[0][i]]
        distance = distances[0][i]
        recommendations.append((book_title, distance))

    return recommendations

print("Recommendation function defined successfully!")


Recommendation function defined successfully!


In [12]:
# Cell 15: Test the Recommendation System
print("="*60)
print("TESTING BOOK RECOMMENDATION SYSTEM")
print("="*60)

# Display available books for testing
print("\nSample books available in the database:")
sample_books = book_pivot.index[:10].tolist()
for i, book in enumerate(sample_books, 1):
    print(f"{i}. {book}")

# Test with the first book
if len(sample_books) > 0:
    test_book = sample_books[0]
    print(f"\n🔍 Getting recommendations for: '{test_book}'")
    print("-" * 60)
    
    recommendations = recommend_book(test_book, model, book_pivot, n_recommendations=5)
    
    if isinstance(recommendations[0], str) and "not found" in recommendations[0]:
        print(recommendations[0])
    else:
        print("📚 Recommended books:")
        for i, (book_title, similarity_score) in enumerate(recommendations, 1):
            print(f"{i}. {book_title}")
            print(f"   Similarity Score: {1-similarity_score:.4f}")
            print()


TESTING BOOK RECOMMENDATION SYSTEM

Sample books available in the database:
1. 1984
2. 1st to Die: A Novel
3. 2nd Chance
4. 4 Blondes
5. 84 Charing Cross Road
6. A Bend in the Road
7. A Case of Need
8. A Child Called \It\": One Child's Courage to Survive"
9. A Civil Action
10. A Cry In The Night

🔍 Getting recommendations for: '1984'
------------------------------------------------------------
📚 Recommended books:
1. Animal Farm
   Similarity Score: 0.2695

2. The Catcher in the Rye
   Similarity Score: 0.2231

3. Lord of the Flies
   Similarity Score: 0.2215

4. The Handmaid's Tale
   Similarity Score: 0.2128

5. Slaughterhouse Five or the Children's Crusade: A Duty Dance With Death
   Similarity Score: 0.2076



In [14]:
# Cell 16: Interactive Testing Function
def test_recommendations_interactive():
    """
    Interactive function to test recommendations with any book
    """
    print("\n" + "="*50)
    print("INTERACTIVE BOOK RECOMMENDATION TESTING")
    print("="*50)
    
    print(f"\nDatabase contains {len(book_pivot.index)} books")
    
    # Show some popular books
    print("\nSome popular books you can try:")
    popular_books = final_ratings.groupby('Title')['num_of_rating'].first().sort_values(ascending=False).head(5)
    for i, (book, rating_count) in enumerate(popular_books.items(), 1):
        print(f"{i}. {book} ({rating_count} ratings)")
    
    print("\nTo test recommendations, use:")
    print("recommendations = recommend_book('BOOK_NAME_HERE', model, book_pivot)")
    print("\nExample:")
    print("recommendations = recommend_book('The Lovely Bones: A Novel', model, book_pivot)")

# Run the interactive testing function
test_recommendations_interactive()

# Cell 17: Model Statistics and Summary
print("\n" + "="*60)
print("MODEL TRAINING SUMMARY")
print("="*60)

print(f"✅ Total books in model: {len(book_pivot.index):,}")
print(f"✅ Total users: {book_pivot.shape[1]:,}")
print(f"✅ Total ratings processed: {len(final_ratings):,}")
print(f"✅ Matrix density: {(book_sparse.nnz / (book_sparse.shape[0] * book_sparse.shape[1]) * 100):.4f}%")
print(f"✅ Average ratings per book: {final_ratings.groupby('Title').size().mean():.1f}")
print(f"✅ Average ratings per user: {final_ratings.groupby('User-ID').size().mean():.1f}")

print(f"\n📁 Model files saved:")
print(f"   - model.pkl ({round(pkl.dumps(model).__sizeof__()/1024/1024, 2)} MB)")
print(f"   - book_pivot.pkl")
print(f"   - final_ratings.pkl") 
print(f"   - books_name.pkl")



INTERACTIVE BOOK RECOMMENDATION TESTING

Database contains 742 books

Some popular books you can try:
1. Wild Animus (363 ratings)
2. Bridget Jones's Diary (277 ratings)
3. The Lovely Bones: A Novel (270 ratings)
4. The Notebook (241 ratings)
5. The Pelican Brief (236 ratings)

To test recommendations, use:
recommendations = recommend_book('BOOK_NAME_HERE', model, book_pivot)

Example:
recommendations = recommend_book('The Lovely Bones: A Novel', model, book_pivot)

MODEL TRAINING SUMMARY
✅ Total books in model: 742
✅ Total users: 888
✅ Total ratings processed: 59,850
✅ Matrix density: 2.2706%
✅ Average ratings per book: 80.7
✅ Average ratings per user: 67.4

📁 Model files saved:
   - model.pkl (0.17 MB)
   - book_pivot.pkl
   - final_ratings.pkl
   - books_name.pkl
