In [13]:
# import libraries (you may add additional imports but you may not have to)
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors
import matplotlib.pyplot as plt
from zipfile import ZipFile
import os

pd.options.display.float_format = '{:.0f}'.format # Supress scientific notation

In [136]:
# get data files
!wget https://cdn.freecodecamp.org/project-data/books/book-crossings.zip -nv

with ZipFile('book-crossings.zip', 'r') as zObject:
    zObject.extractall()

os.remove('book-crossings.zip')

filenames = ['BX-Book-Ratings.csv', 'BX-Books.csv', 'BX-Users.csv']

2024-01-11 22:18:01 URL:https://cdn.freecodecamp.org/project-data/books/book-crossings.zip [26085508/26085508] -> "book-crossings.zip" [1]


In [137]:
# import csv data into dataframes
df_books = pd.read_csv(
    filenames[1],
    encoding = "ISO-8859-1",
    sep=";",
    header=0,
    names=['isbn', 'title', 'author'],
    usecols=['isbn', 'title', 'author'],
    dtype={'isbn': 'str', 'title': 'str', 'author': 'str'})

df_ratings = pd.read_csv(
    filenames[0],
    encoding = "ISO-8859-1",
    sep=";",
    header=0,
    names=['user', 'isbn', 'rating'],
    usecols=['user', 'isbn', 'rating'],
    dtype={'user': 'int32', 'isbn': 'str', 'rating': 'float32'})

for file in filenames:
    os.remove(file)

In [138]:
df = df_ratings.merge(df_books)

df.describe()

Unnamed: 0,user,rating
count,1031175,1031175
mean,140594,3
std,80524,4
min,2,0
25%,70415,0
50%,141210,0
75%,211426,7
max,278854,10


In [145]:
df.head(10)

Unnamed: 0,user,isbn,rating,title,author
63,278418,446520802,0,The Notebook,Nicholas Sparks
65,3363,446520802,0,The Notebook,Nicholas Sparks
66,7158,446520802,10,The Notebook,Nicholas Sparks
69,11676,446520802,10,The Notebook,Nicholas Sparks
74,23768,446520802,6,The Notebook,Nicholas Sparks
77,27617,446520802,9,The Notebook,Nicholas Sparks
78,28204,446520802,0,The Notebook,Nicholas Sparks
79,29855,446520802,0,The Notebook,Nicholas Sparks
81,30711,446520802,6,The Notebook,Nicholas Sparks
82,32440,446520802,0,The Notebook,Nicholas Sparks


In [140]:
# Get list of users that posted 200+ reviews
users = df.groupby('user').count().title.sort_values()
users_few = users.loc[users >= 200].index.to_list()

# Get list of books that have 100+ reviews
books = df.groupby('title').count().user.sort_values()
books_few = books.loc[books >= 100].index.to_list()

# Filter dataframe with previously generated lists
df = df.loc[(df.title.isin(books_few)) & (df.user.isin(users_few))]

df.describe()

Unnamed: 0,user,rating
count,66730,66730
mean,140319,2
std,81015,4
min,254,0
25%,69697,0
50%,139742,0
75%,212923,2
max,278418,10


In [149]:
df['title']

63                   The Notebook
65                   The Notebook
66                   The Notebook
69                   The Notebook
74                   The Notebook
                    ...          
1027962                    Echoes
1028816             The Rainmaker
1029109            Fahrenheit 451
1030863            Stormy Weather
1030907    Me Talk Pretty One Day
Name: title, Length: 66730, dtype: object

In [150]:
# Convert dataframe into 2D matrix
matrix = df.pivot_table(index='title', columns='user', values='rating').fillna(0)

matrix.head()

user,254,2276,2766,2977,3363,4017,4385,6251,6323,6543,...,271705,273979,274004,274061,274301,274308,275970,277427,277639,278418
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1984,9,0,0,0,0,0,0,0,0,0,...,10,0,0,0,0,0,0,0,0,0
1st to Die: A Novel,0,0,0,0,0,0,0,0,0,9,...,0,0,0,0,0,0,0,0,0,0
24 Hours,0,0,0,0,0,0,0,0,2,0,...,0,0,0,0,0,0,0,10,0,0
2nd Chance,0,10,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4 Blondes,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [206]:
N_RECOMMENDS = 5

In [207]:
neighbors = NearestNeighbors(algorithm='auto', n_neighbors=N_RECOMMENDS + 1)
neighbors.fit(matrix)

In [236]:
# function to return recommended books - this will be tested
def get_recommends(book = ""):
  list = [book, []]
  
  if book in df['title'].values:
    distances, indices = neighbors.kneighbors(matrix.loc[[book]])
    
    for i in range(1, N_RECOMMENDS + 1):
      recomm = [matrix.index[indices.flatten()[i]], distances.flatten()[i]]
      list[1].append(recomm)
  
  return list

In [239]:
get_recommends('The Queen of the Damned (Vampire Chronicles (Paperback))')

['The Queen of the Damned (Vampire Chronicles (Paperback))',
 [['The Vendetta Defense', 35.443617196894564],
  ['No Safe Place', 35.4682957019364],
  ['Jacob Have I Loved', 35.94440151122286],
  ['Long After Midnight', 35.9722114972099],
  ['The Prometheus Deception', 36.193922141707716]]]

In [238]:
books = get_recommends("Where the Heart Is (Oprah's Book Club (Paperback))")
print(books)

def test_book_recommendation():
  test_pass = True
  recommends = get_recommends("Where the Heart Is (Oprah's Book Club (Paperback))")
  if recommends[0] != "Where the Heart Is (Oprah's Book Club (Paperback))":
    test_pass = False
  recommended_books = ["I'll Be Seeing You", 'The Weight of Water', 'The Surgeon', 'I Know This Much Is True']
  recommended_books_dist = [0.8, 0.77, 0.77, 0.77]
  for i in range(2):
    if recommends[1][i][0] not in recommended_books:
      test_pass = False
    if abs(recommends[1][i][1] - recommended_books_dist[i]) >= 0.05:
      test_pass = False
  if test_pass:
    print("You passed the challenge! 🎉🎉🎉🎉🎉")
  else:
    print("You haven't passed yet. Keep trying!")

test_book_recommendation()

["Where the Heart Is (Oprah's Book Club (Paperback))", [['The Perks of Being a Wallflower', 60.76183012385325], ['Silent Night : A Christmas Suspense Story', 60.78651166171653], ['Blue Diary', 60.83584469702052], ['Gap Creek: The Story Of A Marriage', 60.860496218811754], ['Jacob Have I Loved', 60.88513775955508]]]
You haven't passed yet. Keep trying!
