In [16]:
# import libraries (you may add additional imports but you may not have to)
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors
import matplotlib.pyplot as plt
from IPython.display import display
import jupyter_black

# Activate jupyter_black for this session.
# NOTE(review): presumably this auto-formats code cells with Black, and
# lab=False targets the classic Notebook front end rather than JupyterLab —
# confirm against the jupyter_black documentation.
jupyter_black.load(lab=False)

In [17]:
# get data files
!wget https://cdn.freecodecamp.org/project-data/books/book-crossings.zip

!unzip book-crossings.zip

books_filename = "BX-Books.csv"
ratings_filename = "BX-Book-Ratings.csv"

In [18]:
# Load both CSV exports into dataframes.
# The two files are read with the same encoding, delimiter and header handling.
shared_csv_options = {"encoding": "ISO-8859-1", "sep": ";", "header": 0}

# Book catalogue: keep three columns, all read as text.
book_columns = ["isbn", "title", "author"]
df_books = pd.read_csv(
    books_filename,
    names=book_columns,
    usecols=book_columns,
    dtype={column: "str" for column in book_columns},
    **shared_csv_options,
)

# Ratings: 32-bit numeric dtypes for the user id and the rating value.
rating_dtypes = {"user": "int32", "isbn": "str", "rating": "float32"}
df_ratings = pd.read_csv(
    ratings_filename,
    names=list(rating_dtypes),
    usecols=list(rating_dtypes),
    dtype=rating_dtypes,
    **shared_csv_options,
)

In [19]:
# Books (isbn) that were rated at least 100 times
num_of_ratings_per_book = df_ratings["isbn"].value_counts()
popular_book_mask = num_of_ratings_per_book.ge(100)
books_to_keep = num_of_ratings_per_book[popular_book_mask].index.to_list()

# Users who submitted at least 200 ratings
num_of_ratings_per_user = df_ratings["user"].value_counts()
active_user_mask = num_of_ratings_per_user.ge(200)
users_to_keep = num_of_ratings_per_user[active_user_mask].index.to_list()

# Keep a rating only when both its book and its user passed the thresholds.
# NOTE: df_ratings is rebound to the filtered frame, so running this cell a
# second time without reloading the CSV recomputes the counts on filtered data.
rating_is_for_kept_book = df_ratings["isbn"].isin(books_to_keep)
rating_is_from_kept_user = df_ratings["user"].isin(users_to_keep)
df_ratings = df_ratings[rating_is_for_kept_book & rating_is_from_kept_user]

# Distinct user ids that remain after filtering
final_users = df_ratings["user"].unique()

print("Shape of df_ratings data frame:", df_ratings.shape)

Shape of df_ratings data frame: (49781, 3)


In [20]:
# Join the tables such that only books with known title
# and available ratings are kept (inner join).
# The long-format frame gets its own name so that the frame displayed below is
# not silently replaced by the wide-format `books` table later in this cell.
books_long = pd.merge(left=df_ratings, right=df_books, how="inner", on="isbn")

# Remove duplicates ('title' is not unique in df_books!!!)
books_long = books_long.drop_duplicates(subset=["user", "title"], ignore_index=True)

print("Shape of books data frame:", books_long.shape)
display(books_long.head())

# Convert to 'wide' format using pandas' pivot: one row per
# (isbn, author, title), one column per user, cell value = rating.
books = pd.pivot(
    books_long, index=["isbn", "author", "title"], columns="user", values="rating"
).reset_index()

# Check if the 'check books' (below) are still in the data set
required_books = [
    "Where the Heart Is (Oprah's Book Club (Paperback))",
    "I'll Be Seeing You",
    "The Weight of Water",
    "The Surgeon",
    "I Know This Much Is True",
]
# Build the title lookup once instead of re-listing the column per required book.
available_titles = set(books["title"])
present = [title in available_titles for title in required_books]

print("Required books are present:", all(present))

# Impute missing ratings with 0.0 (user/book pairs without a rating are NaN
# after the pivot).
books[final_users] = books[final_users].fillna(0.0)

display(books.head())

Shape of books data frame: (49136, 5)


Unnamed: 0,user,isbn,rating,title,author
0,277427,002542730X,10.0,Politically Correct Bedtime Stories: Modern Ta...,James Finn Garner
1,3363,002542730X,0.0,Politically Correct Bedtime Stories: Modern Ta...,James Finn Garner
2,11676,002542730X,6.0,Politically Correct Bedtime Stories: Modern Ta...,James Finn Garner
3,12538,002542730X,10.0,Politically Correct Bedtime Stories: Modern Ta...,James Finn Garner
4,13552,002542730X,0.0,Politically Correct Bedtime Stories: Modern Ta...,James Finn Garner


Required books are present: True


user,isbn,author,title,254,2276,2766,2977,3363,4017,4385,...,274004,274061,274301,274308,274808,275970,277427,277478,277639,278418
0,002542730X,James Finn Garner,Politically Correct Bedtime Stories: Modern Ta...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,10.0,0.0,0.0,0.0
1,0060008032,Marian Keyes,Angels,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0060096195,Meggin Cabot,The Boy Next Door,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,006016848X,John Gray,"Men Are from Mars, Women Are from Venus: A Pra...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0060173289,Rebecca Wells,Divine Secrets of the Ya-Ya Sisterhood : A Novel,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [21]:
# Create the NearestNeighbors estimator (brute-force search, cosine metric) and
# fit it on the per-user rating columns; each row of `books` is one sample.
neigh = NearestNeighbors(
    n_neighbors=6, metric="cosine", algorithm="brute", n_jobs=-1
).fit(books[final_users])

# Sanity check: query the estimator with the ratings of one known title
test_book = "The Queen of the Damned (Vampire Chronicles (Paperback))"
is_test_book = books["title"] == test_book
test_ratings = books.loc[is_test_book, final_users]

# (distances, row positions) of the neighbors found for the test book
test_neighbors = neigh.kneighbors(test_ratings, return_distance=True)
display(test_neighbors)

# Translate the neighbor row positions of the first query row into titles
test_titles = books["title"].iloc[test_neighbors[1][0]]
display(test_titles)

(array([[0.        , 0.51784116, 0.5376338 , 0.73450685, 0.74486566,
         0.7939835 ]], dtype=float32),
 array([[136, 126, 152, 127, 151, 641]]))

136    The Queen of the Damned (Vampire Chronicles (P...
126     The Vampire Lestat (Vampire Chronicles, Book II)
152    The Tale of the Body Thief (Vampire Chronicles...
127                           Interview with the Vampire
151     The Witching Hour (Lives of the Mayfair Witches)
641                                             Catch 22
Name: title, dtype: object

In [22]:
# function to return recommended books - this will be tested
def get_recommends(book=""):
    """Return the books whose rating vectors are closest to those of ``book``.

    Relies on the notebook-level objects ``books`` (wide ratings table),
    ``final_users`` (the rating columns) and ``neigh`` (the fitted
    NearestNeighbors estimator).

    Args:
        book: Exact title to look up in ``books["title"]``.

    Returns:
        ``[book, [[title, distance], ...]]``. The inner list contains the
        estimator's neighbors for the first matching row, minus the closest
        hit (expected to be ``book`` itself), ordered from the largest to the
        smallest distance.

    Raises:
        ValueError: If ``book`` does not match any title in ``books``.
    """
    # One pass over the title column serves both the existence check and the
    # row selection.
    title_matches = books["title"] == book
    if not title_matches.any():
        raise ValueError("Requested book not in database.")

    # Get the requested book's ratings
    ratings = books.loc[title_matches, final_users]

    # Query the fitted estimator for the nearest neighbors
    distances, indexes = neigh.kneighbors(ratings, return_distance=True)

    # The challenge's checker compares results from the most distant neighbor
    # to the closest one, so walk the first result row backwards. Stopping
    # before position 0 drops the closest hit, i.e. the requested book itself.
    distances = distances[0][-1:0:-1]
    indexes = indexes[0][-1:0:-1]

    # kneighbors reports row *positions* of the fitted matrix, so resolve them
    # positionally (.iloc) rather than by index label (.loc).
    neighbor_titles = books["title"].iloc[indexes]
    recommended_books = [
        [title, distance] for title, distance in zip(neighbor_titles, distances)
    ]

    return [book, recommended_books]

In [23]:
# Print the raw recommendation structure for the challenge's reference title
reference_title = "Where the Heart Is (Oprah's Book Club (Paperback))"
recommends = get_recommends(reference_title)
print(recommends)


def test_book_recommendation():
    """Print whether ``get_recommends`` passes the challenge's reference case.

    The reference title must be echoed back as the first element. The first
    two recommendations must each carry one of the expected titles and a
    distance within 0.05 of the expected value at the same position. Only
    those first two entries are checked, although four expectations are listed.
    """
    query_title = "Where the Heart Is (Oprah's Book Club (Paperback))"
    recommended_books = [
        "I'll Be Seeing You",
        "The Weight of Water",
        "The Surgeon",
        "I Know This Much Is True",
    ]
    recommended_books_dist = [0.8, 0.77, 0.77, 0.77]

    recommends = get_recommends(query_title)

    # Collect every failed condition; any single failure fails the challenge.
    failures = [recommends[0] != query_title]
    for position in (0, 1):
        entry = recommends[1][position]
        failures.append(entry[0] not in recommended_books)
        failures.append(abs(entry[1] - recommended_books_dist[position]) >= 0.05)

    if any(failures):
        print("You haven't passed yet. Keep trying!")
    else:
        print("You passed the challenge! 🎉🎉🎉🎉🎉")


# Run the challenge check; it prints a pass/fail message.
test_book_recommendation()

["Where the Heart Is (Oprah's Book Club (Paperback))", [["I'll Be Seeing You", 0.8016211], ['The Weight of Water', 0.77085835], ['The Surgeon', 0.7699411], ['I Know This Much Is True', 0.76770747], ['The Lovely Bones: A Novel', 0.7234864]]]
You passed the challenge! 🎉🎉🎉🎉🎉
