<a href="https://colab.research.google.com/github/Haithem999/Book-Recommendation-Engine-using-KNN/blob/main/book_recommendation_knn.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
# import libraries (you may add additional imports but you may not have to)
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors
import matplotlib.pyplot as plt

In [3]:
# get data files
!wget https://cdn.freecodecamp.org/project-data/books/book-crossings.zip

!unzip book-crossings.zip

books_filename = 'BX-Books.csv'
ratings_filename = 'BX-Book-Ratings.csv'

--2023-10-30 16:10:22--  https://cdn.freecodecamp.org/project-data/books/book-crossings.zip
Resolving cdn.freecodecamp.org (cdn.freecodecamp.org)... 104.26.3.33, 104.26.2.33, 172.67.70.149, ...
Connecting to cdn.freecodecamp.org (cdn.freecodecamp.org)|104.26.3.33|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 26085508 (25M) [application/zip]
Saving to: ‘book-crossings.zip’


2023-10-30 16:10:22 (284 MB/s) - ‘book-crossings.zip’ saved [26085508/26085508]

Archive:  book-crossings.zip
  inflating: BX-Book-Ratings.csv     
  inflating: BX-Books.csv            
  inflating: BX-Users.csv            


In [4]:
# import csv data into dataframes
# Column name -> dtype for each file. The dict keys double as the column names
# handed to `names` and `usecols`, so the three settings cannot drift apart.
book_dtypes = {'isbn': 'str', 'title': 'str', 'author': 'str'}
rating_dtypes = {'user': 'int32', 'isbn': 'str', 'rating': 'float32'}

# Both files are read as semicolon-separated ISO-8859-1 text. header=0 combined
# with names=... replaces the header row in the file with the names above.
df_books = pd.read_csv(
    books_filename,
    sep=";",
    encoding="ISO-8859-1",
    header=0,
    names=list(book_dtypes),
    usecols=list(book_dtypes),
    dtype=book_dtypes,
)

df_ratings = pd.read_csv(
    ratings_filename,
    sep=";",
    encoding="ISO-8859-1",
    header=0,
    names=list(rating_dtypes),
    usecols=list(rating_dtypes),
    dtype=rating_dtypes,
)

In [None]:
# add your code here - consider creating a new cell for each section of code

In [5]:
# Preview the first five rows of the books table to confirm the columns parsed as expected.
df_books.head()

Unnamed: 0,isbn,title,author
0,195153448,Classical Mythology,Mark P. O. Morford
1,2005018,Clara Callan,Richard Bruce Wright
2,60973129,Decision in Normandy,Carlo D'Este
3,374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata
4,393045218,The Mummies of Urumchi,E. J. W. Barber


In [6]:
# Preview the first five rows of the ratings table (user, isbn, rating).
df_ratings.head()

Unnamed: 0,user,isbn,rating
0,276725,034545104X,0.0
1,276726,0155061224,5.0
2,276727,0446520802,0.0
3,276729,052165615X,3.0
4,276729,0521795028,6.0


In [7]:
# Summarise the books table: row count, per-column non-null counts and dtypes.
df_books.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 271379 entries, 0 to 271378
Data columns (total 3 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   isbn    271379 non-null  object
 1   title   271379 non-null  object
 2   author  271378 non-null  object
dtypes: object(3)
memory usage: 6.2+ MB


In [8]:
# Summarise the ratings table: row count, per-column non-null counts and dtypes.
df_ratings.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1149780 entries, 0 to 1149779
Data columns (total 3 columns):
 #   Column  Non-Null Count    Dtype  
---  ------  --------------    -----  
 0   user    1149780 non-null  int32  
 1   isbn    1149780 non-null  object 
 2   rating  1149780 non-null  float32
dtypes: float32(1), int32(1), object(1)
memory usage: 17.5+ MB


In [9]:
# Distinct user IDs present in the ratings table, in order of first appearance.
df_ratings.user.unique()


array([276725, 276726, 276727, ..., 276709, 276721, 276723], dtype=int32)

In [10]:
# Ratings per user and per ISBN, counted on the full ratings table.
# value_counts() returns a Series indexed by user ID / ISBN whose values are the counts.
counts1 = df_ratings['user'].value_counts()
counts2 = df_ratings['isbn'].value_counts()

# IDs below the activity thresholds: users with fewer than 200 ratings and
# ISBNs with fewer than 100 ratings.
low_activity_users = counts1[counts1 < 200].index
rarely_rated_isbns = counts2[counts2 < 100].index

# A rating row survives only if neither its user nor its ISBN is in those sets.
keep_row = ~df_ratings['user'].isin(low_activity_users) & ~df_ratings['isbn'].isin(rarely_rated_isbns)

# Join the surviving ratings onto the book table by ISBN (book columns come
# first), then keep only the first row for each (title, user) pair so the
# pivot below sees at most one rating per cell.
df = (
    df_books
    .merge(df_ratings[keep_row], on="isbn")
    .drop_duplicates(["title", "user"])
)

# Reshape to a title x user table of ratings; a missing rating becomes 0.
piv = df.pivot(index='title', columns='user', values='rating').fillna(0)

In [11]:
# Pull the pivot table out as a plain 2D array (one row per title, one column
# per user) to hand to scikit-learn.
matrix = piv.values


In [12]:
# Check the matrix dimensions: (number of titles, number of users).
matrix.shape


(673, 888)

In [13]:
# Display the title x user pivot table built above.
piv

user,254,2276,2766,2977,3363,4017,4385,6242,6251,6323,...,274004,274061,274301,274308,274808,275970,277427,277478,277639,278418
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1984,9.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1st to Die: A Novel,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2nd Chance,0.0,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4 Blondes,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A Beautiful Mind: The Life of Mathematical Genius and Nobel Laureate John Nash,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Without Remorse,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Year of Wonders,0.0,0.0,0.0,7.0,0.0,0.0,0.0,7.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
You Belong To Me,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Zen and the Art of Motorcycle Maintenance: An Inquiry into Values,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [14]:
# NearestNeighbors looks up the k closest rows of a dataset to a query row;
# here every row is one book's vector of user ratings.
from sklearn.neighbors import NearestNeighbors

# algorithm='brute': compare the query against every stored row at query time;
#   no search structure such as a KD-tree or ball tree is built.
# metric='cosine': use cosine distance (1 - cosine similarity), which depends on
#   the angle between two rating vectors and not on their magnitude.
model_knn = NearestNeighbors(algorithm='brute', metric='cosine')

# "Fitting" involves no iterative training: the estimator keeps the matrix so
# that later kneighbors() calls can measure distances against it.
model_knn.fit(matrix)

In [15]:
# function to return recommended books - this will be tested
def get_recommends(book = "", n_recommends = 5):
  """Return the books whose rating vectors are closest to `book`.

  Reads the module-level `piv` (title x user ratings table) and `model_knn`
  (the fitted NearestNeighbors model).

  Args:
    book: Title to look up; it must be one of the labels in `piv.index`.
    n_recommends: Maximum number of recommendations to return (default 5).

  Returns:
    A two-element list `[book, recommendations]`. `recommendations` is a list
    of `[title, distance]` pairs in the reverse of the order returned by
    `kneighbors`, i.e. the most distant of the selected neighbours comes first.

  Raises:
    KeyError: if `book` is not a title in `piv`.
  """
  if book not in piv.index:
    raise KeyError(f"Book title not found in the ratings table: {book!r}")

  # kneighbors expects a 2D array, so turn the book's ratings into a single row.
  x = piv.loc[book].to_numpy().reshape(1, -1)

  # Request one neighbour more than needed because the query book is expected
  # to come back as its own nearest neighbour. Never request more neighbours
  # than there are titles, which kneighbors would reject.
  n_neighbors = min(n_recommends + 1, len(piv.index))
  distances, indices = model_knn.kneighbors(x, n_neighbors=n_neighbors)

  # Drop the query book by title rather than by `distance != 0`: rounding can
  # leave the book a tiny non-zero distance from itself, and a different title
  # with an identical rating vector legitimately sits at distance 0.
  R_books = [
      [piv.index[indice], distance]
      for distance, indice in zip(distances[0], indices[0])
      if piv.index[indice] != book
  ][:n_recommends]

  return [book, R_books[::-1]]


In [16]:
# Spot-check the recommender on a single title.
get_recommends('The Queen of the Damned (Vampire Chronicles (Paperback))')


['The Queen of the Damned (Vampire Chronicles (Paperback))',
 [['Catch 22', 0.7939835],
  ['The Witching Hour (Lives of the Mayfair Witches)', 0.74486566],
  ['Interview with the Vampire', 0.73450685],
  ['The Tale of the Body Thief (Vampire Chronicles (Paperback))', 0.53763384],
  ['The Vampire Lestat (Vampire Chronicles, Book II)', 0.51784116]]]

In [17]:
# Print the raw recommendation list for the title that the challenge check uses.
books = get_recommends("Where the Heart Is (Oprah's Book Club (Paperback))")
print(books)

def test_book_recommendation():
  test_pass = True
  recommends = get_recommends("Where the Heart Is (Oprah's Book Club (Paperback))")
  if recommends[0] != "Where the Heart Is (Oprah's Book Club (Paperback))":
    test_pass = False
  recommended_books = ["I'll Be Seeing You", 'The Weight of Water', 'The Surgeon', 'I Know This Much Is True']
  recommended_books_dist = [0.8, 0.77, 0.77, 0.77]
  for i in range(2):
    if recommends[1][i][0] not in recommended_books:
      test_pass = False
    if abs(recommends[1][i][1] - recommended_books_dist[i]) >= 0.05:
      test_pass = False
  if test_pass:
    print("You passed the challenge! 🎉🎉🎉🎉🎉")
  else:
    print("You haven't passed yet. Keep trying!")

# Run the challenge check; it prints a pass/fail message.
test_book_recommendation()

["Where the Heart Is (Oprah's Book Club (Paperback))", [["I'll Be Seeing You", 0.8016211], ['The Weight of Water', 0.77085835], ['The Surgeon', 0.7699411], ['I Know This Much Is True', 0.7677075], ['The Lovely Bones: A Novel', 0.7234864]]]
You passed the challenge! 🎉🎉🎉🎉🎉
