<a href="https://colab.research.google.com/github/Hafizur-Rahman-SD/ML-with-Python-FCC-Course-/blob/main/Book_Recommendation_Engine_KNN_for_FCC.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [40]:
# Cell 1 - Import libraries and download dataset

import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors

# Download the dataset
!wget https://cdn.freecodecamp.org/project-data/books/book-crossings.zip -O book-crossings.zip

# Unzip the dataset
!unzip -o book-crossings.zip


--2025-10-06 14:50:37--  https://cdn.freecodecamp.org/project-data/books/book-crossings.zip
Resolving cdn.freecodecamp.org (cdn.freecodecamp.org)... 172.67.70.149, 104.26.3.33, 104.26.2.33, ...
Connecting to cdn.freecodecamp.org (cdn.freecodecamp.org)|172.67.70.149|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 26085508 (25M) [application/zip]
Saving to: ‘book-crossings.zip’


2025-10-06 14:50:37 (143 MB/s) - ‘book-crossings.zip’ saved [26085508/26085508]

Archive:  book-crossings.zip
  inflating: BX-Book-Ratings.csv     
  inflating: BX-Books.csv            
  inflating: BX-Users.csv            


In [49]:
# Cell 2 - Load CSV files into DataFrames

books_filename = 'BX-Books.csv'
ratings_filename = 'BX-Book-Ratings.csv'

# Load books data
df_books = pd.read_csv(
    books_filename,
    encoding="ISO-8859-1",
    sep=";",
    usecols=['ISBN', 'Book-Title', 'Book-Author'],
    dtype={'ISBN': 'str', 'Book-Title': 'str', 'Book-Author': 'str'}
)

# Load ratings data
df_ratings = pd.read_csv(
    ratings_filename,
    encoding="ISO-8859-1",
    sep=";",
    usecols=['User-ID', 'ISBN', 'Book-Rating'],
    dtype={'User-ID': 'int32', 'ISBN': 'str', 'Book-Rating': 'float32'}
)

print("Books shape:", df_books.shape)
print("Ratings shape:", df_ratings.shape)
df_books.head()


Books shape: (271379, 3)
Ratings shape: (1149780, 3)


Unnamed: 0,ISBN,Book-Title,Book-Author
0,195153448,Classical Mythology,Mark P. O. Morford
1,2005018,Clara Callan,Richard Bruce Wright
2,60973129,Decision in Normandy,Carlo D'Este
3,374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata
4,393045218,The Mummies of Urumchi,E. J. W. Barber


In [50]:
# Cell 3 - Filter for statistical significance
# Only users with >=200 ratings
user_counts = df_ratings['User-ID'].value_counts()
df_ratings = df_ratings[df_ratings['User-ID'].isin(user_counts[user_counts >= 200].index)]

# Only books with >=100 ratings
book_counts = df_ratings['ISBN'].value_counts()
df_ratings = df_ratings[df_ratings['ISBN'].isin(book_counts[book_counts >= 100].index)]

print("Filtered Ratings:", df_ratings.shape)


Filtered Ratings: (13793, 3)


In [51]:
# Cell 4 - Create pivot table (Book x User)
book_ratings = df_ratings.pivot_table(
    index='ISBN',
    columns='User-ID',
    values='Book-Rating'
).fillna(0)

print("Pivot table shape:", book_ratings.shape)


Pivot table shape: (100, 857)


In [52]:
# Cell 5 - Train KNN model
model = NearestNeighbors(metric='cosine', algorithm='brute')
model.fit(book_ratings.values)
print("KNN model trained!")


KNN model trained!


In [56]:
# Cell 6 - Robust recommendation function
def get_recommends(book=""):
    # Try exact match first (strip whitespace)
    book_strip = book.strip()
    matches = df_books[df_books['Book-Title'].str.strip() == book_strip]

    # Case-insensitive match
    if matches.empty:
        matches = df_books[df_books['Book-Title'].str.lower().str.strip() == book_strip.lower()]

    # Partial match
    if matches.empty:
        matches = df_books[df_books['Book-Title'].str.lower().str.contains(book_strip.lower(), na=False)]

    if matches.empty:
        return [book, []]  # Not found

    # Take first matching ISBN
    book_id = matches['ISBN'].values[0]

    if book_id not in book_ratings.index:
        return [book, []]  # Not enough ratings

    book_idx = book_ratings.index.get_loc(book_id)

    # Find 6 nearest neighbors (skip self)
    distances, indices = model.kneighbors(book_ratings.iloc[book_idx, :].values.reshape(1, -1), n_neighbors=6)

    recommended_books = []
    for i in range(1, min(6, len(distances.flatten()))):
        idx = indices.flatten()[i]
        isbn_sim = book_ratings.index[idx]
        title_vals = df_books[df_books['ISBN'] == isbn_sim]['Book-Title'].values
        title_sim = title_vals[0] if len(title_vals) > 0 else isbn_sim
        recommended_books.append([title_sim, round(float(distances.flatten()[i]), 2)])

    return [book, recommended_books]


In [54]:
# Cell 7 - Manual test
print(get_recommends("Where the Heart Is (Oprah's Book Club (Paperback))"))


["Where the Heart Is (Oprah's Book Club (Paperback))", [['The Lovely Bones: A Novel', 0.72], ["The Pilot's Wife : A Novel", 0.82], ['The Joy Luck Club', 0.82], ['The Notebook', 0.82], ['Bel Canto: A Novel', 0.82]]]


In [53]:
# Cell 8 - Validation test
def test_book_recommendation():
    title = "Where the Heart Is (Oprah's Book Club (Paperback))"
    recommends = get_recommends(title)

    test_pass = True
    if recommends[0] != title:
        test_pass = False

    if not (isinstance(recommends, list) and len(recommends) > 1 and isinstance(recommends[1], list) and len(recommends[1]) >= 2):
        test_pass = False
    else:
        recommended_books = ["I'll Be Seeing You", 'The Weight of Water', 'The Surgeon', 'I Know This Much Is True']
        recommended_books_dist = [0.8, 0.77, 0.77, 0.77]
        for i in range(2):
            rec_title = recommends[1][i][0]
            rec_dist = recommends[1][i][1]
            if rec_title not in recommended_books:
                test_pass = False
            if abs(rec_dist - recommended_books_dist[i]) >= 0.05:
                test_pass = False

    if test_pass:
        print("✅ You passed the challenge! 🎉🎉🎉")
    else:
        print("❌ You haven't passed yet. Keep trying!")

# Run the test
test_book_recommendation()


❌ You haven't passed yet. Keep trying!
