<a href="https://colab.research.google.com/github/Hafizur-Rahman-SD/ML-with-Python-FCC-Course-/blob/main/Book_Recommendation_Engine_KNN_for_FCC.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
# Import all required libraries
# These help in data loading, analysis, and KNN model training

import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors
import matplotlib.pyplot as plt

print("✅ Libraries imported successfully!")


✅ Libraries imported successfully!


In [5]:
# Download dataset directly from FreeCodeCamp server
# This ensures you get the correct version of the Book-Crossings dataset

!wget https://cdn.freecodecamp.org/project-data/books/book-crossings.zip -O book-crossings.zip
!unzip -o book-crossings.zip

print("✅ Dataset downloaded and extracted successfully!")


--2025-10-06 14:16:55--  https://cdn.freecodecamp.org/project-data/books/book-crossings.zip
Resolving cdn.freecodecamp.org (cdn.freecodecamp.org)... 172.67.70.149, 104.26.3.33, 104.26.2.33, ...
Connecting to cdn.freecodecamp.org (cdn.freecodecamp.org)|172.67.70.149|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 26085508 (25M) [application/zip]
Saving to: ‘book-crossings.zip’


2025-10-06 14:16:55 (151 MB/s) - ‘book-crossings.zip’ saved [26085508/26085508]

Archive:  book-crossings.zip
  inflating: BX-Book-Ratings.csv     
  inflating: BX-Books.csv            
  inflating: BX-Users.csv            
✅ Dataset downloaded and extracted successfully!


In [6]:
# Load the Books and Ratings data
# Using ISO-8859-1 encoding because dataset contains special characters
# Semicolon (;) is used as a separator in these CSV files

books_filename = 'BX-Books.csv'
ratings_filename = 'BX-Book-Ratings.csv'

df_books = pd.read_csv(
    books_filename,
    encoding="ISO-8859-1",
    sep=";",
    usecols=['ISBN', 'Book-Title', 'Book-Author'],
    dtype={'ISBN': 'str', 'Book-Title': 'str', 'Book-Author': 'str'}
)

df_ratings = pd.read_csv(
    ratings_filename,
    encoding="ISO-8859-1",
    sep=";",
    usecols=['User-ID', 'ISBN', 'Book-Rating'],
    dtype={'User-ID': 'int32', 'ISBN': 'str', 'Book-Rating': 'float32'}
)

print("Books shape:", df_books.shape)
print("Ratings shape:", df_ratings.shape)
print(df_books.head(3))


Books shape: (271379, 3)
Ratings shape: (1149780, 3)
         ISBN            Book-Title           Book-Author
0  0195153448   Classical Mythology    Mark P. O. Morford
1  0002005018          Clara Callan  Richard Bruce Wright
2  0060973129  Decision in Normandy          Carlo D'Este


In [7]:
# Remove users with <200 ratings and books with <100 ratings
# This ensures statistical significance for KNN model

user_counts = df_ratings['User-ID'].value_counts()
df_ratings = df_ratings[df_ratings['User-ID'].isin(user_counts[user_counts >= 200].index)]

book_counts = df_ratings['ISBN'].value_counts()
df_ratings = df_ratings[df_ratings['ISBN'].isin(book_counts[book_counts >= 100].index)]

print("✅ Filtered dataset shape:", df_ratings.shape)


✅ Filtered dataset shape: (13793, 3)


In [8]:
# Create pivot table where:
# Rows = Books (ISBN)
# Columns = Users (User-ID)
# Values = Ratings

book_features = df_ratings.pivot_table(index='ISBN', columns='User-ID', values='Book-Rating').fillna(0)

# Convert to sparse matrix (saves memory)
book_features_matrix = csr_matrix(book_features.values)

print("✅ Book-user matrix created with shape:", book_features_matrix.shape)


✅ Book-user matrix created with shape: (100, 857)
