<a href="https://colab.research.google.com/github/KeremAydin98/machine-learning-with-python-projects/blob/main/BookRecommendation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In this challenge, you will create a book recommendation algorithm using K-Nearest Neighbors.

You will use the Book-Crossings dataset. This dataset contains 1.1 million ratings (scale of 1-10) of 270,000 books by 90,000 users.

After importing and cleaning the data, use NearestNeighbors from sklearn.neighbors to develop a model that shows books that are similar to a given book. The Nearest Neighbors algorithm measures distance to determine the “closeness” of instances.

Create a function named get_recommends that takes a book title (from the dataset) as an argument and returns a list of 5 similar books with their distances from the book argument.

In [1]:
# import libraries (you may add additional imports but you may not have to)
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors
import matplotlib.pyplot as plt

In [2]:
# get data files
!wget https://cdn.freecodecamp.org/project-data/books/book-crossings.zip

!unzip book-crossings.zip

# Names of the three CSV files extracted by the unzip step above.
books_filename = 'BX-Books.csv'
ratings_filename = 'BX-Book-Ratings.csv'
users_filename = 'BX-Users.csv'

--2022-04-20 09:54:06--  https://cdn.freecodecamp.org/project-data/books/book-crossings.zip
Resolving cdn.freecodecamp.org (cdn.freecodecamp.org)... 104.26.2.33, 172.67.70.149, 104.26.3.33, ...
Connecting to cdn.freecodecamp.org (cdn.freecodecamp.org)|104.26.2.33|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 26085508 (25M) [application/zip]
Saving to: ‘book-crossings.zip’


2022-04-20 09:54:06 (128 MB/s) - ‘book-crossings.zip’ saved [26085508/26085508]

Archive:  book-crossings.zip
  inflating: BX-Book-Ratings.csv     
  inflating: BX-Books.csv            
  inflating: BX-Users.csv            


In [3]:
# import csv data into dataframes
#
# All three files are read the same way: ';'-separated, ISO-8859-1 encoded.
# `header=0` treats the first row of each file as a header row and `names`
# replaces it with the short column names used in the rest of this notebook;
# `usecols` restricts the resulting frame to exactly those columns.

# Book metadata. Everything is read as a string; ISBNs are identifiers, not
# numbers (they can end in 'X', e.g. 034545104X).
df_books = pd.read_csv(
    books_filename,
    encoding = "ISO-8859-1",
    sep=";",
    header=0,
    names=['isbn', 'title', 'author'],
    usecols=['isbn', 'title', 'author'],
    dtype={'isbn': 'str', 'title': 'str', 'author': 'str'})

# One row per (user, isbn) rating event.
df_ratings = pd.read_csv(
    ratings_filename,
    encoding = "ISO-8859-1",
    sep=";",
    header=0,
    names=['user', 'isbn', 'rating'],
    usecols=['user', 'isbn', 'rating'],
    dtype={'user': 'int32', 'isbn': 'str', 'rating': 'float32'})

# User details. Age is read as float32; presumably because it can be missing
# (a float column can hold NaN, an int32 column cannot).
df_users = pd.read_csv(
    users_filename,
    encoding = "ISO-8859-1",
    sep=";",
    header=0,
    names=['user', 'Location', 'Age'],
    usecols=['user', 'Location', 'Age'],
    dtype={'user': 'int32', 'Location': 'str', 'Age': 'float32'})

In [4]:
df_books.head()

Unnamed: 0,isbn,title,author
0,195153448,Classical Mythology,Mark P. O. Morford
1,2005018,Clara Callan,Richard Bruce Wright
2,60973129,Decision in Normandy,Carlo D'Este
3,374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata
4,393045218,The Mummies of Urumchi,E. J. W. Barber


In [5]:
df_ratings.head()

Unnamed: 0,user,isbn,rating
0,276725,034545104X,0.0
1,276726,0155061224,5.0
2,276727,0446520802,0.0
3,276729,052165615X,3.0
4,276729,0521795028,6.0


In [6]:
df_ratings.shape

(1149780, 3)

In [7]:
# To ensure statistical significance, keep only users who have submitted
# at least `user_popularity_threshold` ratings.
user_popularity_threshold = 200

# One row per user (index 'user') holding that user's number of ratings,
# restricted to the users that reach the threshold.
user_rating_count = (
    df_ratings.groupby('user')['rating']
    .count()
    .to_frame(name='userRatingCount')
    .loc[lambda counts: counts['userRatingCount'] >= user_popularity_threshold]
)
user_rating_count.head()

Unnamed: 0_level_0,userRatingCount
user,Unnamed: 1_level_1
254,314
2276,498
2766,274
2977,232
3363,901


In [8]:
# Inner join on 'user': ratings from users that are not present in
# user_rating_count are dropped, and every remaining rating gains that
# user's userRatingCount.
df_ratings = df_ratings.merge(user_rating_count, on='user')
df_ratings.head()

Unnamed: 0,user,isbn,rating,userRatingCount
0,277427,002542730X,10.0,497
1,277427,0026217457,0.0,497
2,277427,003008685X,8.0,497
3,277427,0030615321,0.0,497
4,277427,0060002050,0.0,497


In [9]:
df_users.head()

Unnamed: 0,user,Location,Age
0,1,"nyc, new york, usa",
1,2,"stockton, california, usa",18.0
2,3,"moscow, yukon territory, russia",
3,4,"porto, v.n.gaia, portugal",17.0
4,5,"farnborough, hants, united kingdom",


In [10]:
# Number of ratings each ISBN received (index 'isbn', column 'bookRatingCount').
rating_count = (
    df_ratings.groupby('isbn')['rating']
    .count()
    .to_frame(name='bookRatingCount')
)
# The five ISBNs with the most ratings.
top_5 = rating_count.sort_values(by='bookRatingCount', ascending=False).head(5)
top_5

Unnamed: 0_level_0,bookRatingCount
isbn,Unnamed: 1_level_1
971880107,365
316666343,272
60928336,221
440214041,218
385504209,217


In [11]:
# Look up title and author for the five most-rated ISBNs.
most_rated_books = top_5.merge(df_books, on='isbn')
most_rated_books

Unnamed: 0,isbn,bookRatingCount,title,author
0,971880107,365,Wild Animus,Rich Shapero
1,316666343,272,The Lovely Bones: A Novel,Alice Sebold
2,60928336,221,Divine Secrets of the Ya-Ya Sisterhood: A Novel,Rebecca Wells
3,440214041,218,The Pelican Brief,John Grisham
4,385504209,217,The Da Vinci Code,Dan Brown


In [12]:
# Attach title and author to every rating. The join is an inner join on
# 'isbn', so ratings whose ISBN does not appear in df_books are dropped.
df = df_ratings.merge(df_books, on='isbn')
df.head()

Unnamed: 0,user,isbn,rating,userRatingCount,title,author
0,277427,002542730X,10.0,497,Politically Correct Bedtime Stories: Modern Ta...,James Finn Garner
1,3363,002542730X,0.0,901,Politically Correct Bedtime Stories: Modern Ta...,James Finn Garner
2,11676,002542730X,6.0,13602,Politically Correct Bedtime Stories: Modern Ta...,James Finn Garner
3,12538,002542730X,10.0,1351,Politically Correct Bedtime Stories: Modern Ta...,James Finn Garner
4,13552,002542730X,0.0,709,Politically Correct Bedtime Stories: Modern Ta...,James Finn Garner


In [13]:
# Add the per-ISBN rating count (bookRatingCount) to every rating row.
df_total_rating = df.merge(rating_count, on='isbn')
df_total_rating.head()

Unnamed: 0,user,isbn,rating,userRatingCount,title,author,bookRatingCount
0,277427,002542730X,10.0,497,Politically Correct Bedtime Stories: Modern Ta...,James Finn Garner,80
1,3363,002542730X,0.0,901,Politically Correct Bedtime Stories: Modern Ta...,James Finn Garner,80
2,11676,002542730X,6.0,13602,Politically Correct Bedtime Stories: Modern Ta...,James Finn Garner,80
3,12538,002542730X,10.0,1351,Politically Correct Bedtime Stories: Modern Ta...,James Finn Garner,80
4,13552,002542730X,0.0,709,Politically Correct Bedtime Stories: Modern Ta...,James Finn Garner,80


In [14]:
# Add each user's Location to the rating rows; the Age column that comes
# along with df_users is discarded immediately.
df = df_total_rating.merge(df_users, on='user').drop(columns='Age')
df.head()

Unnamed: 0,user,isbn,rating,userRatingCount,title,author,bookRatingCount,Location
0,277427,002542730X,10.0,497,Politically Correct Bedtime Stories: Modern Ta...,James Finn Garner,80,"gilbert, arizona, usa"
1,277427,0026217457,0.0,497,Vegetarian Times Complete Cookbook,Lucy Moll,7,"gilbert, arizona, usa"
2,277427,003008685X,8.0,497,Pioneers,James Fenimore Cooper,1,"gilbert, arizona, usa"
3,277427,0030615321,0.0,497,"Ask for May, Settle for June (A Doonesbury book)",G. B. Trudeau,1,"gilbert, arizona, usa"
4,277427,0060002050,0.0,497,On a Wicked Dawn (Cynster Novels),Stephanie Laurens,13,"gilbert, arizona, usa"


In [15]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 488756 entries, 0 to 488755
Data columns (total 8 columns):
 #   Column           Non-Null Count   Dtype  
---  ------           --------------   -----  
 0   user             488756 non-null  int32  
 1   isbn             488756 non-null  object 
 2   rating           488756 non-null  float32
 3   userRatingCount  488756 non-null  int64  
 4   title            488756 non-null  object 
 5   author           488756 non-null  object 
 6   bookRatingCount  488756 non-null  int64  
 7   Location         488756 non-null  object 
dtypes: float32(1), int32(1), int64(2), object(4)
memory usage: 29.8+ MB


In [16]:
# Keep a single row (the first) per (user, title) pair, so that each pair
# is unique when the title-by-user rating matrix is built.
df = df.drop_duplicates(subset=['user', 'title'])

In [17]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 484507 entries, 0 to 488755
Data columns (total 8 columns):
 #   Column           Non-Null Count   Dtype  
---  ------           --------------   -----  
 0   user             484507 non-null  int32  
 1   isbn             484507 non-null  object 
 2   rating           484507 non-null  float32
 3   userRatingCount  484507 non-null  int64  
 4   title            484507 non-null  object 
 5   author           484507 non-null  object 
 6   bookRatingCount  484507 non-null  int64  
 7   Location         484507 non-null  object 
dtypes: float32(1), int32(1), int64(2), object(4)
memory usage: 29.6+ MB


In [18]:
# The author column is not needed for the rating matrix.
df = df.drop(columns='author')
df.head()

Unnamed: 0,user,isbn,rating,userRatingCount,title,bookRatingCount,Location
0,277427,002542730X,10.0,497,Politically Correct Bedtime Stories: Modern Ta...,80,"gilbert, arizona, usa"
1,277427,0026217457,0.0,497,Vegetarian Times Complete Cookbook,7,"gilbert, arizona, usa"
2,277427,003008685X,8.0,497,Pioneers,1,"gilbert, arizona, usa"
3,277427,0030615321,0.0,497,"Ask for May, Settle for June (A Doonesbury book)",1,"gilbert, arizona, usa"
4,277427,0060002050,0.0,497,On a Wicked Dawn (Cynster Novels),13,"gilbert, arizona, usa"


In [19]:
# To ensure statistical significance, keep only books that have at least
# `popularity_threshold` (25) ratings.
popularity_threshold = 25

df = df[df['bookRatingCount'] >= popularity_threshold]
df.head()

Unnamed: 0,user,isbn,rating,userRatingCount,title,bookRatingCount,Location
0,277427,002542730X,10.0,497,Politically Correct Bedtime Stories: Modern Ta...,80,"gilbert, arizona, usa"
9,277427,0060542128,7.0,497,When the Storm Breaks,26,"gilbert, arizona, usa"
12,277427,006092988X,0.0,497,A Tree Grows in Brooklyn,31,"gilbert, arizona, usa"
13,277427,0060930535,0.0,497,The Poisonwood Bible: A Novel,133,"gilbert, arizona, usa"
15,277427,0060934417,0.0,497,Bel Canto: A Novel,108,"gilbert, arizona, usa"


In [20]:
# Discard any rating row that has no title.
df = df.dropna(subset=['title'])

In [21]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 87023 entries, 0 to 488102
Data columns (total 7 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   user             87023 non-null  int32  
 1   isbn             87023 non-null  object 
 2   rating           87023 non-null  float32
 3   userRatingCount  87023 non-null  int64  
 4   title            87023 non-null  object 
 5   bookRatingCount  87023 non-null  int64  
 6   Location         87023 non-null  object 
dtypes: float32(1), int32(1), int64(2), object(3)
memory usage: 4.6+ MB


In [22]:
# Title-by-user rating matrix: one row per title, one column per user.
# (title, user) pairs without a rating are filled with 0.
df_pivot = (
    df.pivot(index='title', columns='user', values='rating')
    .fillna(0)
)

# Sparse (CSR) copy of the same matrix, used to fit the neighbours model.
df_matrix = csr_matrix(df_pivot.to_numpy())

In [23]:
# Brute-force nearest-neighbour search using cosine distance between the
# rows (books) of the rating matrix.
model_knn = NearestNeighbors(algorithm='brute', metric='cosine')
model_knn.fit(df_matrix)

NearestNeighbors(algorithm='brute', metric='cosine')

In [24]:
# function to return recommended books - this will be tested
def get_recommends(book = ""):
  """Return the nearest neighbours of `book` in the fitted KNN model.

  The rating vector of `book` (its row in `df_pivot`) is used to query
  `model_knn` for 6 neighbours. Because the queried row is itself part of
  the fitted data, the first entry is normally the book itself with a
  distance of 0, followed by the 5 most similar books.

  Args:
    book: Title of a book; must be one of the row labels of `df_pivot`.

  Returns:
    A list of 6 `[title, distance]` pairs, in the order returned by
    `model_knn.kneighbors` (ascending distance).

  Raises:
    KeyError: if `book` is not a title in `df_pivot`.
  """
  if book not in df_pivot.index:
    raise KeyError(f"{book!r} is not a title in the rating matrix")

  # Look the title up directly in the row index; transposing the whole
  # matrix just to find a label position is unnecessary work.
  row_position = df_pivot.index.get_loc(book)
  query_vector = df_pivot.iloc[row_position, :].values.reshape(1, -1)

  distances, indices = model_knn.kneighbors(query_vector, n_neighbors = 6)

  # A single query row yields arrays of shape (1, 6); row 0 holds the result.
  return [
      [df_pivot.index[neighbor_position], distance]
      for neighbor_position, distance in zip(indices[0], distances[0])
  ]



---





In [30]:
books = get_recommends("Where the Heart Is (Oprah's Book Club (Paperback))")
print(books)

def test_book_recommendation():
  """Check `get_recommends` against the expected neighbours of a known book.

  The first entry must be the queried title itself, and each of the next
  five entries must be one of the expected titles with a distance within
  0.05 of 0.75. Prints a diagnostic line for every failed check and a final
  pass/fail message; returns nothing.
  """
  query_title = "Where the Heart Is (Oprah's Book Club (Paperback))"
  recommended_books = ["Blue Diary", 'The Weight of Water', 'The Lovely Bones: A Novel', 'The Surgeon', 'I Know This Much Is True']
  expected_distance = 0.75
  tolerance = 0.05

  test_pass = True
  recommends = get_recommends(query_title)

  # The query book plus one entry per expected recommendation.
  required_entries = len(recommended_books) + 1
  if len(recommends) < required_entries:
    print(f"expected at least {required_entries} entries, got {len(recommends)}")
    test_pass = False
  else:
    if recommends[0][0] != query_title:
      print(f"first entry should be the queried book, got {recommends[0][0]!r}")
      test_pass = False
    # Check all five recommendations (entries 1..5), not just the first four.
    for title, distance in recommends[1:required_entries]:
      if title not in recommended_books:
        print(f"unexpected recommendation: {title!r}")
        test_pass = False
      if abs(distance - expected_distance) >= tolerance:
        print(f"distance {distance} for {title!r} is not within {tolerance} of {expected_distance}")
        test_pass = False

  if test_pass:
    print("You passed the challenge! 🎉🎉🎉🎉🎉")
  else:
    print("You haven't passed yet. Keep trying!")

test_book_recommendation()

[["Where the Heart Is (Oprah's Book Club (Paperback))", 0.0], ['Blue Diary', 0.71828747], ['The Lovely Bones: A Novel', 0.7234864], ['I Know This Much Is True', 0.7677075], ['The Surgeon', 0.7699411], ['The Weight of Water', 0.77085835]]
You passed the challenge! 🎉🎉🎉🎉🎉
