<a href="https://colab.research.google.com/github/Isaiah-Gonzales/book-recommend-KNN/blob/main/IG_fcc_book_recommendation_knn.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# import libraries (you may add additional imports but you may not have to)
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors
import matplotlib.pyplot as plt

In [None]:
# get data files
!wget https://cdn.freecodecamp.org/project-data/books/book-crossings.zip

!unzip book-crossings.zip

books_filename = 'BX-Books.csv'
ratings_filename = 'BX-Book-Ratings.csv'

--2023-10-02 04:54:59--  https://cdn.freecodecamp.org/project-data/books/book-crossings.zip
Resolving cdn.freecodecamp.org (cdn.freecodecamp.org)... 104.26.2.33, 172.67.70.149, 104.26.3.33, ...
Connecting to cdn.freecodecamp.org (cdn.freecodecamp.org)|104.26.2.33|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 26085508 (25M) [application/zip]
Saving to: ‘book-crossings.zip.1’


2023-10-02 04:54:59 (115 MB/s) - ‘book-crossings.zip.1’ saved [26085508/26085508]

Archive:  book-crossings.zip
replace BX-Book-Ratings.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: A
  inflating: BX-Book-Ratings.csv     
  inflating: BX-Books.csv            
  inflating: BX-Users.csv            


In [None]:
# Load both CSV files into dataframes.
# The files are ';'-separated and ISO-8859-1 encoded; header=0 together with
# `names` swaps the file's own header row for the short column names below,
# and `usecols` keeps only those columns.
df_books = pd.read_csv(
    books_filename,
    sep=';',
    encoding='ISO-8859-1',
    header=0,
    names=['isbn', 'title', 'author'],
    usecols=['isbn', 'title', 'author'],
    dtype=dict(isbn='str', title='str', author='str'),
)

df_ratings = pd.read_csv(
    ratings_filename,
    sep=';',
    encoding='ISO-8859-1',
    header=0,
    names=['user', 'isbn', 'rating'],
    usecols=['user', 'isbn', 'rating'],
    dtype=dict(user='int32', isbn='str', rating='float32'),
)

In [None]:
# Clean data: keep only ratings of books that have at least 100 ratings,
# made by users who have given at least 200 ratings.

# How many rating rows each isbn / each user appears in.
isbn_counts = df_ratings['isbn'].value_counts()
user_counts = df_ratings['user'].value_counts()

# Attach those counts to every rating row so the filter is a plain comparison.
df_ratings['isbn_freq'] = df_ratings['isbn'].map(isbn_counts)
df_ratings['user_freq'] = df_ratings['user'].map(user_counts)

print(df_ratings.head())

has_enough_book_ratings = df_ratings['isbn_freq'] >= 100
has_enough_user_ratings = df_ratings['user_freq'] >= 200
df_clean = df_ratings[has_enough_book_ratings & has_enough_user_ratings]

print(df_clean)

     user        isbn  rating  isbn_freq  user_freq
0  276725  034545104X     0.0         60          1
1  276726  0155061224     5.0          2          1
2  276727  0446520802     0.0        116          1
3  276729  052165615X     3.0          1          2
4  276729  0521795028     6.0          1          2
           user        isbn  rating  isbn_freq  user_freq
1456     277427  002542730X    10.0        171        497
1469     277427  0060930535     0.0        494        497
1471     277427  0060934417     0.0        350        497
1474     277427  0061009059     9.0        291        497
1484     277427  0140067477     0.0        189        497
...         ...         ...     ...        ...        ...
1147304  275970  0804111359     0.0        167       1376
1147436  275970  140003065X     0.0        157       1376
1147439  275970  1400031346     0.0        106       1376
1147440  275970  1400031354     0.0        202       1376
1147441  275970  1400031362     0.0        128    

In [None]:
# Sanity check on the filter: the smallest remaining frequencies should be
# the thresholds themselves (100 ratings per book, 200 ratings per user).
print(
    df_clean['isbn_freq'].min(),
    df_clean['user_freq'].min(),
    sep='\n',
)

100
200


In [None]:
#scipy expects features in 'scipy sparse matrix'
# Pivot the cleaned ratings into a book x user table: one row per isbn, one
# column per user, with 0 filled in where a user has no rating for a book.
df_book_features = df_clean.pivot(
    index='isbn',
    columns='user',
    values='rating'
).fillna(0)
print(df_book_features.head())
mat_book_features = csr_matrix(df_book_features.values)

#define model
model_knn = NearestNeighbors(metric='cosine', algorithm='auto', n_neighbors=5, n_jobs=-1)

# Row order of the pivot: position i in this list is row i of mat_book_features,
# which is how a model row index is turned back into an isbn.
isbn_in_pivot = df_book_features.index.to_list()

# isbn -> title for every book in the catalogue.
isbn_to_title = dict(zip(df_books['isbn'], df_books['title']))

# title -> isbn. When several ISBNs share one title, a dict keeps the entry it
# sees last. Feeding the books that are NOT in the pivot first and the ones
# that ARE in the pivot last makes an edition with a row in the matrix win, so
# looking a title up no longer lands on an ISBN that was filtered out while
# another edition of the same title is available. Titles with a single ISBN,
# and titles whose last ISBN was already in the pivot, map exactly as before.
books_in_pivot = df_books['isbn'].isin(isbn_in_pivot)
books_by_priority = pd.concat([df_books[~books_in_pivot], df_books[books_in_pivot]])
title_to_isbn = dict(zip(books_by_priority['title'], books_by_priority['isbn']))

user        254     2276    2766    2977    3363    4017    4385    6242    \
isbn                                                                         
002542730X     0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0   
0060008032     0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0   
0060096195     0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0   
006016848X     0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0   
0060173289     0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0   

user        6251    6323    ...  274004  274061  274301  274308  274808  \
isbn                        ...                                           
002542730X     0.0     0.0  ...     0.0     0.0     0.0     0.0     0.0   
0060008032     0.0     0.0  ...     0.0     0.0     0.0     0.0     0.0   
0060096195     0.0     0.0  ...     0.0     0.0     0.0     0.0     0.0   
006016848X     0.0     0.0  ...     0.0     0.0     0.0     0.0     0.0   
006

In [None]:
# function to return recommended books - this will be tested
def get_recommends(book = "", n_recommendations = 5):
  """Return the books whose rating vectors are nearest to `book`.

  Args:
    book: Exact book title; it is looked up in `title_to_isbn`.
    n_recommendations: Maximum number of neighbours to return. Fewer are
      returned when the feature matrix has fewer other rows.

  Returns:
    `[book, [[title, distance], ...]]`. The inner list is ordered from the
    largest distance to the smallest, and never contains `book`'s own row.

  Raises:
    KeyError: `book` is not a key of `title_to_isbn`.
    ValueError: the ISBN found for `book` has no row in the feature matrix.
  """
  #find index of book in scipy sparse matrix
  if book not in title_to_isbn:
    raise KeyError(f"Unknown book title: {book!r}")
  isbn = title_to_isbn[book]
  try:
    book_index = isbn_in_pivot.index(isbn)
  except ValueError:
    raise ValueError(
        f"{book!r} (isbn {isbn}) is not in the feature matrix"
    ) from None

  #fit model to data
  model_knn.fit(mat_book_features)

  #the query row is normally returned as its own nearest neighbour, so ask for
  #one extra; never ask for more neighbours than there are rows
  n_query = min(n_recommendations + 1, mat_book_features.shape[0])
  distances, indices = model_knn.kneighbors(
      mat_book_features[book_index],
      n_neighbors=n_query
  )

  #drop the query row by index rather than by position: when another row ties
  #with it at the same distance the query is not guaranteed to come first
  neighbors = [
      (int(row), float(dist))
      for row, dist in zip(indices[0], distances[0])
      if row != book_index
  ][:n_recommendations]

  #knn returned distances low-to-high, test expects distance high-to-low
  neighbors.reverse()

  #find isbn of each row in the sparse matrix, convert to title
  recommendations = [
      [isbn_to_title[isbn_in_pivot[row]], dist]
      for row, dist in neighbors
  ]
  return [book, recommendations]

In [None]:
# Run the recommender once for a sample title and print the raw result,
# which has the form [title, [[neighbour_title, distance], ...]].
books = get_recommends("Where the Heart Is (Oprah's Book Club (Paperback))")
print(books)

def test_book_recommendation():
  test_pass = True
  recommends = get_recommends("Where the Heart Is (Oprah's Book Club (Paperback))")
  if recommends[0] != "Where the Heart Is (Oprah's Book Club (Paperback))":
    test_pass = False
  recommended_books = ["I'll Be Seeing You", 'The Weight of Water', 'The Surgeon', 'I Know This Much Is True']
  recommended_books_dist = [0.8, 0.77, 0.77, 0.77]
  for i in range(2):
    if recommends[1][i][0] not in recommended_books:
      test_pass = False
    if abs(recommends[1][i][1] - recommended_books_dist[i]) >= 0.05:
      test_pass = False
  if test_pass:
    print("You passed the challenge! 🎉🎉🎉🎉🎉")
  else:
    print("You haven't passed yet. Keep trying!")

# Run the challenge check; it prints whether the recommendations passed.
test_book_recommendation()

["Where the Heart Is (Oprah's Book Club (Paperback))", [["I'll Be Seeing You", 0.8016210794448853], ['The Weight of Water', 0.7708583474159241], ['The Surgeon', 0.7699410915374756], ['I Know This Much Is True', 0.7677075266838074], ['The Lovely Bones: A Novel', 0.7234864234924316]]]
You passed the challenge! 🎉🎉🎉🎉🎉
