# Import Libraries

In [235]:
import pandas as pd
import numpy as np
import os
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors
import pickle

In [236]:
#Load the datasets
direc = r"C:\Users\handd\OneDrive\Desktop\Semester 6 (2025)\DL\PSET3 - collaborative filtering\book recom"

books = pd.read_csv(os.path.join(direc, "Books.csv"), low_memory=False, encoding='ISO-8859-1')
ratings = pd.read_csv(os.path.join(direc, "Ratings.csv"), encoding='ISO-8859-1')
users = pd.read_csv(os.path.join(direc, "Users.csv"), encoding='ISO-8859-1')

### Data review

In [237]:
# Display the first few rows of each dataset
print("\nBooks sample:")
print(books.head())
print("\nRatings sample:")
print(ratings.head())
print("\nUsers sample:")
print(users.head())


Books sample:
         ISBN  \
0  0195153448   
1  0002005018   
2  0060973129   
3  0374157065   
4  0393045218   

                                                                                           Book-Title  \
0                                                                                 Classical Mythology   
1                                                                                        Clara Callan   
2                                                                                Decision in Normandy   
3  Flu: The Story of the Great Influenza Pandemic of 1918 and the Search for the Virus That Caused It   
4                                                                              The Mummies of Urumchi   

            Book-Author Year-Of-Publication                   Publisher  \
0    Mark P. O. Morford                2002     Oxford University Press   
1  Richard Bruce Wright                2001       HarperFlamingo Canada   
2          Carlo D'Este   

In [238]:
# Display basic information about the datasets
print("Books dataset info:")
print(books.info())
print("\nRatings dataset info:")
print(ratings.info())
print("\nUsers dataset info:")
print(users.info())

Books dataset info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 271360 entries, 0 to 271359
Data columns (total 8 columns):
 #   Column               Non-Null Count   Dtype 
---  ------               --------------   ----- 
 0   ISBN                 271360 non-null  object
 1   Book-Title           271360 non-null  object
 2   Book-Author          271358 non-null  object
 3   Year-Of-Publication  271360 non-null  object
 4   Publisher            271358 non-null  object
 5   Image-URL-S          271360 non-null  object
 6   Image-URL-M          271360 non-null  object
 7   Image-URL-L          271357 non-null  object
dtypes: object(8)
memory usage: 16.6+ MB
None

Ratings dataset info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1149780 entries, 0 to 1149779
Data columns (total 3 columns):
 #   Column       Non-Null Count    Dtype 
---  ------       --------------    ----- 
 0   User-ID      1149780 non-null  int64 
 1   ISBN         1149780 non-null  object
 2   Book-Rating 

In [239]:
# Display basic information
print("Books dataset shape:", books.shape)
print("Ratings dataset shape:", ratings.shape)
print("Users dataset shape:", users.shape)

Books dataset shape: (271360, 8)
Ratings dataset shape: (1149780, 3)
Users dataset shape: (278858, 3)


In [240]:
# Check for duplicates
print("Duplicate books:", books.duplicated().sum())
print("Duplicate ratings:", ratings.duplicated().sum())
print("Duplicate users:", users.duplicated().sum())

Duplicate books: 0
Duplicate ratings: 0
Duplicate users: 0


# Clean

In [241]:
active_users = ratings['User-ID'].value_counts() > 200
filtered_users = active_users[active_users].index
ratings = ratings[ratings['User-ID'].isin(filtered_users)]

In [242]:
ratings.head()

Unnamed: 0,User-ID,ISBN,Book-Rating
1456,277427,002542730X,10
1457,277427,0026217457,0
1458,277427,003008685X,8
1459,277427,0030615321,0
1460,277427,0060002050,0


In [243]:
ratings.shape

(526356, 3)

In [244]:
# Merge datasets
r_b = ratings.merge(books, on='ISBN')

In [245]:
# Count ratings per book
num_rating = r_b.groupby('Book-Title')['Book-Rating'].count().reset_index()
num_rating.rename(columns={'Book-Rating':'Number_Of_Rating'},inplace=True)
num_rating.head()

Unnamed: 0,Book-Title,Number_Of_Rating
0,"A Light in the Storm: The Civil War Diary of Amelia Martin, Fenwick Island, Delaware, 1861 (Dear America)",2
1,Always Have Popsicles,1
2,Apple Magic (The Collector's series),1
3,Beyond IBM: Leadership Marketing and Finance for the 1990s,1
4,Clifford Visita El Hospital (Clifford El Gran Perro Colorado),1


In [246]:
# Merge to keep books with at least 50 ratings
Rating = r_b.merge(num_rating, on='Book-Title')
Rating = Rating[Rating['Number_Of_Rating'] >= 50]
Rating.head()

Unnamed: 0,User-ID,ISBN,Book-Rating,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-S,Image-URL-M,Image-URL-L,Number_Of_Rating
0,277427,002542730X,10,Politically Correct Bedtime Stories: Modern Tales for Our Life and Times,James Finn Garner,1994,John Wiley &amp; Sons Inc,http://images.amazon.com/images/P/002542730X.01.THUMBZZZ.jpg,http://images.amazon.com/images/P/002542730X.01.MZZZZZZZ.jpg,http://images.amazon.com/images/P/002542730X.01.LZZZZZZZ.jpg,82
13,277427,0060930535,0,The Poisonwood Bible: A Novel,Barbara Kingsolver,1999,Perennial,http://images.amazon.com/images/P/0060930535.01.THUMBZZZ.jpg,http://images.amazon.com/images/P/0060930535.01.MZZZZZZZ.jpg,http://images.amazon.com/images/P/0060930535.01.LZZZZZZZ.jpg,133
15,277427,0060934417,0,Bel Canto: A Novel,Ann Patchett,2002,Perennial,http://images.amazon.com/images/P/0060934417.01.THUMBZZZ.jpg,http://images.amazon.com/images/P/0060934417.01.MZZZZZZZ.jpg,http://images.amazon.com/images/P/0060934417.01.LZZZZZZZ.jpg,108
18,277427,0061009059,9,One for the Money (Stephanie Plum Novels (Paperback)),Janet Evanovich,1995,HarperTorch,http://images.amazon.com/images/P/0061009059.01.THUMBZZZ.jpg,http://images.amazon.com/images/P/0061009059.01.MZZZZZZZ.jpg,http://images.amazon.com/images/P/0061009059.01.LZZZZZZZ.jpg,108
24,277427,006440188X,0,The Secret Garden,Frances Hodgson Burnett,1998,HarperTrophy,http://images.amazon.com/images/P/006440188X.01.THUMBZZZ.jpg,http://images.amazon.com/images/P/006440188X.01.MZZZZZZZ.jpg,http://images.amazon.com/images/P/006440188X.01.LZZZZZZZ.jpg,79


In [247]:
# Dropping the duplicates
Rating.drop_duplicates(['User-ID','Book-Title'],inplace=True)

In [248]:
# Create a pivot table
book_pivot = Rating.pivot_table(columns='User-ID', index='Book-Title', values='Book-Rating', fill_value=0)
book_sparse = csr_matrix(book_pivot)

# Recommendation

In [249]:
model = NearestNeighbors(algorithm= 'brute')
model.fit(book_sparse)

In [250]:
# Function to recommend books
def recommend_book(book_name):
    if book_name not in book_pivot.index:
        print("Book not found in dataset.")
        return
    
    book_id = np.where(book_pivot.index == book_name)[0][0]
    distances, suggestions = model.kneighbors(book_pivot.iloc[book_id, :].values.reshape(1, -1), n_neighbors=6)
    
    print(f"You searched for '{book_name}'\n")
    print("Recommended books:\n")
    
    for i in suggestions[0]:
        if book_pivot.index[i] != book_name:
            print(book_pivot.index[i])

In [254]:
# Save model and data using pickle
pickle.dump(model, open(os.path.join(direc, 'model.pkl'), 'wb'))
pickle.dump(book_pivot.index, open(os.path.join(direc, 'book_names.pkl'), 'wb'))
pickle.dump(Rating, open(os.path.join(direc, 'Rating.pkl'), 'wb'))
pickle.dump(book_pivot, open(os.path.join(direc, 'book_pivot.pkl'), 'wb'))

In [255]:
# Example usage
recommend_book("Harry Potter and the Chamber of Secrets (Book 2)")

You searched for 'Harry Potter and the Chamber of Secrets (Book 2)'

Recommended books:

Harry Potter and the Goblet of Fire (Book 4)
Harry Potter and the Prisoner of Azkaban (Book 3)
Harry Potter and the Sorcerer's Stone (Book 1)
Exclusive
The Cradle Will Fall


In [185]:
for i in range(len(suggestion)):
    print(book_pivot.index[suggestion[i]])

Index(['Harry Potter and the Chamber of Secrets (Book 2)',
       'Harry Potter and the Goblet of Fire (Book 4)',
       'Harry Potter and the Prisoner of Azkaban (Book 3)',
       'Harry Potter and the Sorcerer's Stone (Book 1)', 'Exclusive',
       'The Cradle Will Fall'],
      dtype='object', name='Book-Title')


In [186]:
#keeping books name
book_names = book_pivot.index

In [198]:
book_names

Index(['1984', '1st to Die: A Novel', '2nd Chance', '4 Blondes',
       '84 Charing Cross Road', 'A Bend in the Road', 'A Case of Need',
       'A Child Called \It\": One Child's Courage to Survive"',
       'A Civil Action', 'A Cry In The Night',
       ...
       'Winter Solstice', 'Wish You Well', 'Without Remorse',
       'Wizard and Glass (The Dark Tower, Book 4)', 'Wuthering Heights',
       'Year of Wonders', 'You Belong To Me',
       'Zen and the Art of Motorcycle Maintenance: An Inquiry into Values',
       'Zoya', '\O\" Is for Outlaw"'],
      dtype='object', name='Book-Title', length=742)

In [187]:
np.where(book_pivot.index == '4 Blondes')[0][0]

np.int64(3)

In [188]:
# final_rating['title'].value_counts()
ids = np.where(Rating['Book-Title'] == "Harry Potter and the Chamber of Secrets (Book 2)")[0][0]

In [189]:
Rating.iloc[ids]['Image-URL-L']

'http://images.amazon.com/images/P/0439064872.01.LZZZZZZZ.jpg'

In [190]:
book_name = []
for book_id in suggestion:
    book_name.append(book_pivot.index[book_id])
    

In [191]:
book_name[0]

Index(['Harry Potter and the Chamber of Secrets (Book 2)',
       'Harry Potter and the Goblet of Fire (Book 4)',
       'Harry Potter and the Prisoner of Azkaban (Book 3)',
       'Harry Potter and the Sorcerer's Stone (Book 1)', 'Exclusive',
       'The Cradle Will Fall'],
      dtype='object', name='Book-Title')

In [192]:

ids_index = []
for name in book_name[0]: 
    ids = np.where(Rating['Book-Title'] == name)[0][0]
    ids_index.append(ids)

In [193]:

for idx in ids_index:
    url = Rating.iloc[idx]['Image-URL-L']
    print(url)

http://images.amazon.com/images/P/0439064872.01.LZZZZZZZ.jpg
http://images.amazon.com/images/P/0439139597.01.LZZZZZZZ.jpg
http://images.amazon.com/images/P/0439136369.01.LZZZZZZZ.jpg
http://images.amazon.com/images/P/043936213X.01.LZZZZZZZ.jpg
http://images.amazon.com/images/P/0446604232.01.LZZZZZZZ.jpg
http://images.amazon.com/images/P/0440115450.01.LZZZZZZZ.jpg


In [200]:

import pickle
pickle.dump(model,open(r"C:\Users\handd\OneDrive\Desktop\Semester 6 (2025)\DL\PSET3 - collaborative filtering\book recom\model.pkl",'wb'))
pickle.dump(book_names,open(r"C:\Users\handd\OneDrive\Desktop\Semester 6 (2025)\DL\PSET3 - collaborative filtering\book recom\book_names.pkl",'wb'))
pickle.dump(Rating,open(r"C:\Users\handd\OneDrive\Desktop\Semester 6 (2025)\DL\PSET3 - collaborative filtering\book recom\Rating.pkl",'wb'))
pickle.dump(book_pivot,open(r"C:\Users\handd\OneDrive\Desktop\Semester 6 (2025)\DL\PSET3 - collaborative filtering\book recom\book_pivot.pkl",'wb'))

# Testing

In [195]:

def recommend_book(book_name):
    book_id = np.where(book_pivot.index == book_name)[0][0]
    distance, suggestion = model.kneighbors(book_pivot.iloc[book_id,:].values.reshape(1,-1), n_neighbors=6 )
    
    for i in range(len(suggestion)):
            books = book_pivot.index[suggestion[i]]
            for j in books:
                if j == book_name:
                    print(f"You searched '{book_name}'\n")
                    print("The suggestion books are: \n")
                else:
                    print(j)

In [196]:
book_name = "Harry Potter and the Chamber of Secrets (Book 2)"
recommend_book(book_name)

You searched 'Harry Potter and the Chamber of Secrets (Book 2)'

The suggestion books are: 

Harry Potter and the Goblet of Fire (Book 4)
Harry Potter and the Prisoner of Azkaban (Book 3)
Harry Potter and the Sorcerer's Stone (Book 1)
Exclusive
The Cradle Will Fall


In [197]:
# Analyze the ratings distribution
plt.figure(figsize=(10, 6))
sns.countplot(x='Book-Rating', data=ratings_df)
plt.title('Distribution of Ratings')
plt.xlabel('Rating')
plt.ylabel('Count')
plt.show()

NameError: name 'ratings_df' is not defined

<Figure size 1000x600 with 0 Axes>

In [None]:
# Check the year of publication distribution
books_df['Year-Of-Publication'] = pd.to_numeric(books_df['Year-Of-Publication'], errors='coerce')
plt.figure(figsize=(12, 6))
year_counts = books_df['Year-Of-Publication'].value_counts().sort_index()
year_counts = year_counts[(year_counts.index >= 1950) & (year_counts.index <= 2006)]
plt.plot(year_counts.index, year_counts.values)
plt.title('Number of Books by Publication Year (1950-2006)')
plt.xlabel('Year')
plt.ylabel('Count')
plt.grid(True)
plt.show()

In [None]:
# Let's check how many ratings users typically give
user_rating_counts = ratings_df.groupby('User-ID').size()
plt.figure(figsize=(12, 6))
sns.histplot(user_rating_counts, bins=50, kde=True)
plt.title('Number of Ratings per User')
plt.xlabel('Number of Ratings')
plt.ylabel('Count of Users')
plt.xlim(0, 100)  # Limiting x-axis for better visualization
plt.show()

In [None]:
# Print some statistics about user rating behaviors
print(f"Average ratings per user: {user_rating_counts.mean():.2f}")
print(f"Median ratings per user: {user_rating_counts.median():.2f}")
print(f"Max ratings per user: {user_rating_counts.max()}")
print(f"Users with only one rating: {(user_rating_counts == 1).sum()} ({(user_rating_counts == 1).sum() / len(user_rating_counts) * 100:.2f}%)")

In [None]:
# Check the rating value distribution
explicit_ratings = ratings_df[ratings_df['Book-Rating'] != 0]
implicit_ratings = ratings_df[ratings_df['Book-Rating'] == 0]
print(f"Explicit ratings (1-10): {len(explicit_ratings)} ({len(explicit_ratings) / len(ratings_df) * 100:.2f}%)")
print(f"Implicit ratings (0): {len(implicit_ratings)} ({len(implicit_ratings) / len(ratings_df) * 100:.2f}%)")

In [None]:
# Check how many books have at least 10 ratings
book_rating_counts = ratings_df.groupby('ISBN').size()
books_with_enough_ratings = book_rating_counts[book_rating_counts >= 10].index
print(f"Books with at least 10 ratings: {len(books_with_enough_ratings)} ({len(books_with_enough_ratings) / len(books_df) * 100:.2f}% of all books)")