In [1]:
# import libraries (you may add additional imports but you may not have to)
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors
import matplotlib.pyplot as plt

In [3]:
# get data files
!wget https://cdn.freecodecamp.org/project-data/books/book-crossings.zip

!unzip book-crossings.zip

books_filename = 'BX-Books.csv'
ratings_filename = 'BX-Book-Ratings.csv'

--2025-03-14 11:27:21--  https://cdn.freecodecamp.org/project-data/books/book-crossings.zip
Resolving cdn.freecodecamp.org (cdn.freecodecamp.org)... 104.26.3.33, 104.26.2.33, 172.67.70.149, ...
Connecting to cdn.freecodecamp.org (cdn.freecodecamp.org)|104.26.3.33|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 26085508 (25M) [application/zip]
Saving to: ‘book-crossings.zip’


2025-03-14 11:27:23 (25.8 MB/s) - ‘book-crossings.zip’ saved [26085508/26085508]

Archive:  book-crossings.zip
  inflating: BX-Book-Ratings.csv     
  inflating: BX-Books.csv            
  inflating: BX-Users.csv            


In [2]:
# Same two file names as assigned in the download cell above, repeated in
# their own cell - presumably so the CSV-loading cell can be run without
# executing the download again (TODO confirm; otherwise this cell is redundant).
books_filename = 'BX-Books.csv'
ratings_filename = 'BX-Book-Ratings.csv'

In [20]:
# import csv data into dataframes
# Both files are read as ';'-separated text in ISO-8859-1 encoding.
# header=0 combined with names= replaces the file's own header row with the
# short column names given here; usecols keeps only those columns.
# Book metadata: every kept column is loaded as a string.
df_books = pd.read_csv(
    books_filename,
    encoding = "ISO-8859-1",
    sep=";",
    header=0,
    names=['isbn', 'title', 'author'],
    usecols=['isbn', 'title', 'author'],
    dtype={'isbn': 'str', 'title': 'str', 'author': 'str'})

# Ratings: one row per (user, isbn) rating; user ids are int32, ratings float32.
df_ratings = pd.read_csv(
    ratings_filename,
    encoding = "ISO-8859-1",
    sep=";",
    header=0,
    names=['user', 'isbn', 'rating'],
    usecols=['user', 'isbn', 'rating'],
    dtype={'user': 'int32', 'isbn': 'str', 'rating': 'float32'})

In [21]:
# Per-book statistics over df_ratings: number of ratings and mean rating for
# each ISBN, restricted to ISBNs that have more than 100 ratings.
df_rate_per_book = (
    df_ratings
    .groupby('isbn', as_index=False)
    .agg(
        n_rating=('rating', 'count'),
        rating_avg=('rating', 'mean'),
    )
    .loc[lambda stats: stats['n_rating'] > 100]
)
df_rate_per_book

Unnamed: 0,isbn,n_rating,rating_avg
3800,002542730X,171,3.514620
5416,0060008032,104,2.442308
5683,0060096195,107,4.028038
6204,006016848X,147,2.693877
6338,0060173289,130,3.453846
...,...,...,...
262739,1573227331,105,3.904762
262803,1573229326,217,3.198157
262813,1573229571,106,4.179245
268744,1592400876,120,3.966667


In [22]:
# Attach title/author to the per-book statistics; the inner join keeps only
# ISBNs present in both df_books and df_rate_per_book.
df = pd.merge(
    left=df_books,
    right=df_rate_per_book,
    how='inner',
    on='isbn',
)
df

Unnamed: 0,isbn,title,author,n_rating,rating_avg
0,0440234743,The Testament,John Grisham,422,3.085308
1,0452264464,Beloved (Plume Contemporary Fiction),Toni Morrison,180,3.411111
2,0971880107,Wild Animus,Rich Shapero,2502,1.019584
3,0345402871,Airframe,Michael Crichton,207,2.845411
4,0345417623,Timeline,MICHAEL CRICHTON,407,3.761671
...,...,...,...,...,...
712,0425178765,Easy Prey,John Sandford,113,2.778761
713,0449223604,M Is for Malice,Sue Grafton,151,2.331126
714,0345444884,The Talisman,STEPHEN KING,103,4.145631
715,0060008032,Angels,Marian Keyes,104,2.442308


In [23]:
# Keep only ratings from users that have at least 200 rows in df_ratings.
# NOTE: df_ratings is overwritten here, while df_rate_per_book (built in an
# earlier cell) was computed from the unfiltered ratings.
count_user = df_ratings['user'].value_counts()
df_ratings = df_ratings[
    df_ratings['user'].isin(count_user[count_user >= 200].index)
]
df_ratings

Unnamed: 0,user,isbn,rating
1456,277427,002542730X,10.0
1457,277427,0026217457,0.0
1458,277427,003008685X,8.0
1459,277427,0030615321,0.0
1460,277427,0060002050,0.0
...,...,...,...
1147612,275970,3829021860,0.0
1147613,275970,4770019572,0.0
1147614,275970,896086097,0.0
1147615,275970,9626340762,8.0


In [24]:
# Long-format table: every retained book (left side) paired with the ratings
# that remain in df_ratings for its ISBN.
pd_matrix = pd.merge(
    left=df_rate_per_book,
    right=df_ratings,
    how="left",
    left_on="isbn",
    right_on="isbn",
)

pd_matrix

Unnamed: 0,isbn,n_rating,rating_avg,user,rating
0,002542730X,171,3.514620,277427,10.0
1,002542730X,171,3.514620,3363,0.0
2,002542730X,171,3.514620,11676,6.0
3,002542730X,171,3.514620,12538,10.0
4,002542730X,171,3.514620,13552,0.0
...,...,...,...,...,...
49328,1878424319,133,3.496241,229313,0.0
49329,1878424319,133,3.496241,252222,0.0
49330,1878424319,133,3.496241,252695,0.0
49331,1878424319,133,3.496241,254971,0.0


In [25]:
# Reshape to wide format: one row per ISBN, one column per user, cell = rating.
# Missing (isbn, user) pairs become 0 and the result is stored as int8.
# NOTE: this reads the long-format pd_matrix from the merge cell and then
# overwrites it, so the merge cell must be re-run before re-running this one.
pd_matrix = (
    pd_matrix
    .pivot(index='isbn', columns='user', values='rating')
    .fillna(0)
    .astype("int8")
)
pd_matrix

user,254,2276,2766,2977,3363,4017,4385,6242,6251,6323,...,274004,274061,274301,274308,274808,275970,277427,277478,277639,278418
isbn,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
002542730X,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,10,0,0,0
0060008032,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
0060096195,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
006016848X,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
0060173289,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1573227331,0,0,0,0,0,0,0,6,0,0,...,0,0,0,0,0,0,0,0,0,0
1573229326,0,0,0,0,0,0,0,6,0,0,...,0,0,0,0,0,0,0,0,0,0
1573229571,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1592400876,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [26]:
# Sparse (CSR) copy of the ISBN x user rating table; row order follows pd_matrix.index.
matrix = csr_matrix(pd_matrix.values)

In [27]:
# Create a model
# Nearest-neighbour search over the rows of `matrix` using cosine distance.
# N_predicted_neighbours is also read by get_recommends, so it stays a
# module-level name.
N_predicted_neighbours = 6
KNN = NearestNeighbors(
    n_neighbors=N_predicted_neighbours,
    metric='cosine',
    n_jobs=-1,
)
KNN.fit(matrix)

In [28]:
# Predict
# Query the fitted model with the same matrix it was trained on; both results
# have one row per book and one column per requested neighbour.
distances, indices = KNN.kneighbors(X=matrix, return_distance=True)

In [29]:
def get_book_index(title):
    """Return the row position in pd_matrix of the book with the given title.

    The title is compared case-insensitively against df['title']. When several
    rows match, the ISBN of the first match is used. Returns None when no title
    matches.
    """
    lowered_titles = df['title'].str.lower()
    matches = df[lowered_titles == title.lower()]
    if matches.empty:
        return None

    isbn = matches.iloc[0]['isbn']
    # Positions in the matrix index whose label equals the matched ISBN.
    positions = np.where(pd_matrix.index == isbn)[0]
    return positions[0]

In [30]:
# function to return recommended books - this will be tested
def get_recommends(book = ""):
    """Return the nearest-neighbour recommendations for a book title.

    Parameters
    ----------
    book : str
        Title to look up. It is resolved to a matrix row by get_book_index,
        which compares titles case-insensitively.

    Returns
    -------
    list
        ``[book, [[title, distance], ...]]`` with N_predicted_neighbours - 1
        entries, taken from neighbour column N_predicted_neighbours - 1 down
        to column 1 of the precomputed ``indices`` / ``distances`` arrays.

    Raises
    ------
    ValueError
        If ``book`` does not match any title known to get_book_index.
    """
    book_index = get_book_index(book)
    if book_index is None:
        # Fail with a clear message instead of an opaque IndexError from
        # indexing the neighbour arrays with None.
        raise ValueError(f"Book title not found: {book!r}")

    # The result is built from scratch on every call. Accumulating into
    # module-level lists made each call return the results of all previous
    # calls as well.
    neighbours = []

    # Column 0 is skipped - presumably it is the queried book itself, since
    # the model was queried with its own training matrix.
    for rank in range(N_predicted_neighbours - 1, 0, -1):
        neighbour_isbn = pd_matrix.index[indices[book_index][rank]]
        # .values[0] raises IndexError if this ISBN has no row in df.
        neighbour_title = df.loc[df['isbn'] == neighbour_isbn, 'title'].values[0]
        neighbour_distance = float(distances[book_index][rank])
        neighbours.append([neighbour_title, neighbour_distance])

    return [book, neighbours]

In [31]:
# Manual smoke check: fetch and print the recommendations for one known title.
books = get_recommends("Where the Heart Is (Oprah's Book Club (Paperback))")
print(books)

def test_book_recommendation():
  test_pass = True
  recommends = get_recommends("Where the Heart Is (Oprah's Book Club (Paperback))")
  if recommends[0] != "Where the Heart Is (Oprah's Book Club (Paperback))":
    test_pass = False
  recommended_books = ["I'll Be Seeing You", 'The Weight of Water', 'The Surgeon', 'I Know This Much Is True']
  recommended_books_dist = [0.8, 0.77, 0.77, 0.77]
  for i in range(2): 
    if recommends[1][i][0] not in recommended_books:
      test_pass = False
    if abs(recommends[1][i][1] - recommended_books_dist[i]) >= 0.05:
      test_pass = False
  if test_pass:
    print("You passed the challenge! 🎉🎉🎉🎉🎉")
  else:
    print("You haven't passed yet. Keep trying!")

# Run the check; it prints a pass/fail message and returns nothing.
test_book_recommendation()

["Where the Heart Is (Oprah's Book Club (Paperback))", [["I'll Be Seeing You", 0.8016210581447822], ['The Weight of Water', 0.7708583572697412], ['The Surgeon', 0.7699410973804288], ['I Know This Much Is True', 0.7677075092617776], ['The Lovely Bones: A Novel', 0.7234864549790632]]]
You passed the challenge! 🎉🎉🎉🎉🎉
