In [1]:
# import libraries (you may add additional imports but you may not have to)
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.sparse import csr_matrix

In [None]:
# get data files
!wget https://cdn.freecodecamp.org/project-data/books/book-crossings.zip

!unzip book-crossings.zip

books_filename = 'BX-Books.csv'
ratings_filename = 'BX-Book-Ratings.csv'

--2024-09-20 12:56:01--  https://cdn.freecodecamp.org/project-data/books/book-crossings.zip
Resolving cdn.freecodecamp.org (cdn.freecodecamp.org)... 104.26.2.33, 104.26.3.33, 172.67.70.149, ...
Connecting to cdn.freecodecamp.org (cdn.freecodecamp.org)|104.26.2.33|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 26085508 (25M) [application/zip]
Saving to: ‘book-crossings.zip.7’


2024-09-20 12:56:02 (107 MB/s) - ‘book-crossings.zip.7’ saved [26085508/26085508]

Archive:  book-crossings.zip
replace BX-Book-Ratings.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: 

In [None]:
# Load both CSV files into dataframes.
# The two files are read with the same encoding, separator and header row;
# only the column names and dtypes differ, so each frame is described by a
# column -> dtype mapping that also supplies `names` and `usecols`.
csv_format = {'encoding': "ISO-8859-1", 'sep': ";", 'header': 0}

book_dtypes = {'isbn': 'str', 'title': 'str', 'author': 'str'}
rating_dtypes = {'user': 'int32', 'isbn': 'str', 'rating': 'float32'}

df_books = pd.read_csv(
    books_filename,
    names=list(book_dtypes),
    usecols=list(book_dtypes),
    dtype=book_dtypes,
    **csv_format)

df_ratings = pd.read_csv(
    ratings_filename,
    names=list(rating_dtypes),
    usecols=list(rating_dtypes),
    dtype=rating_dtypes,
    **csv_format)

In [None]:
# First look at the loaded data: the ratings frame, then the first book rows.
# The literal '\n' item combined with sep='\n' leaves two blank lines between
# the two printouts.
print(df_ratings, '\n', df_books.head(), sep='\n')

In [None]:
#### DATA CLEANING ####

# --- Books: report missing values, then drop every incomplete row ---
df_miss_chck = df_books  # second name bound to the same frame (no copy)
print('Count of values missing in books df', df_miss_chck.isna().sum(), sep='\n')

df_books = df_books.dropna()
print('\n')  # spacing
print('Count of values missing after dropping incomplete rows', df_books.isna().sum(), sep='\n')

print('\n')  # spacing

# --- Ratings: only report; nothing is dropped from this frame ---
print('Count of values missing in ratings df', df_ratings.isna().sum(), sep='\n')

print('\n')  # spacing

# --- Duplicates: (title, author) pairs that occur more than once ---
pre_dup_chck = df_books.count()  # per-column non-null counts (amount of books)
title_author_counts = (
    df_books
    .groupby(['title', 'author'])['title']
    .agg(['count'])
    .reset_index()
)
dup_book = title_author_counts[title_author_counts['count'] > 1]
print(dup_book.count())

In [None]:
# Re-inspect the first rows of both frames to confirm the column names need no
# renaming. The literal '\n' item with sep='\n' keeps two blank lines between
# the two printouts.
print(df_ratings.head(), '\n', df_books.head(), sep='\n')

In [None]:
####DATA ANALYSIS/VISUALISATION#####

def _plot_histogram(values, title, xlabel, color, bins=30, log_y=True):
    """Draw and show one 8x6 histogram figure.

    Parameters:
        values: data passed straight to `sns.histplot`.
        title:  figure title.
        xlabel: label of the x axis (the y axis is always 'Count').
        color:  bar colour.
        bins:   number of histogram bins.
        log_y:  when True the count axis uses a log scale, which keeps the
                long tail of a heavily skewed distribution visible.
    """
    plt.figure(figsize=(8, 6))
    sns.histplot(values, bins=bins, kde=False, color=color)
    plt.title(title, fontsize=14)
    plt.xlabel(xlabel)
    plt.ylabel('Count')
    if log_y:
        plt.yscale('log')
    plt.show()


def _plot_top_counts(counts, title, ylabel, palette):
    """Draw and show one 10x6 horizontal bar chart of a value-count series.

    Parameters:
        counts:  series whose index holds the labels and whose values hold the
                 number of books for each label.
        title:   figure title.
        ylabel:  label of the category axis.
        palette: seaborn palette name used for the bars.
    """
    plt.figure(figsize=(10, 6))
    # NOTE(review): recent seaborn releases may warn when `palette` is passed
    # without `hue` - confirm against the installed version before changing.
    sns.barplot(y=counts.index, x=counts.values, palette=palette)
    plt.title(title, fontsize=14)
    plt.xlabel('Number of Books')
    plt.ylabel(ylabel)
    plt.show()


# --- Analysis of ratings data ---

# Distribution of the rating values themselves (linear count axis)
_plot_histogram(df_ratings['rating'], 'Distribution of Book Ratings', 'Rating',
                color='blue', bins=10, log_y=False)

# Number of ratings given by each user
user_ratings_count = df_ratings['user'].value_counts()
_plot_histogram(user_ratings_count, 'Distribution of Ratings Per User',
                'Number of Ratings per User', color='green')

# Number of ratings received by each book (ISBN)
book_ratings_count = df_ratings['isbn'].value_counts()
_plot_histogram(book_ratings_count, 'Distribution of Ratings Per Book',
                'Number of Ratings per Book', color='purple')

# --- Analysis of books data ---

# Books per author, computed once and reused by the two author figures
books_per_author = df_books['author'].value_counts()

# Top 10 authors by number of books
author_counts = books_per_author.head(10)
_plot_top_counts(author_counts, 'Top 10 Authors by Number of Books', 'Author',
                 palette='coolwarm')

# Distribution of books per author (all authors, including those with a
# single book - no filtering is applied)
_plot_histogram(books_per_author, 'Distribution of Books Per Author',
                'Number of Books per Author', color='orange')

# Top 10 most common book titles
title_counts = df_books['title'].value_counts().head(10)
_plot_top_counts(title_counts, 'Top 10 Most Common Book Titles', 'Title',
                 palette='viridis')


In [None]:
# function to return recommended books - this will be tested

def _build_title_user_matrix(books, ratings, min_user_ratings, min_book_ratings):
    """Return a title x user rating table limited to well-covered data.

    Users with fewer than `min_user_ratings` ratings and ISBNs with fewer than
    `min_book_ratings` ratings are removed (both counts are taken on the
    unfiltered `ratings` frame) before the table is pivoted, so the dense
    pivot only covers the rows and columns that survive the filter.
    Cells without a rating are filled with 0.
    """
    user_counts = ratings['user'].value_counts()
    isbn_counts = ratings['isbn'].value_counts()
    active_users = user_counts[user_counts >= min_user_ratings].index
    popular_isbns = isbn_counts[isbn_counts >= min_book_ratings].index

    kept = ratings[ratings['user'].isin(active_users) & ratings['isbn'].isin(popular_isbns)]
    merged = pd.merge(books[['isbn', 'title']], kept[['isbn', 'user', 'rating']], on='isbn')

    # pivot_table averages the ratings when one user rated several ISBNs that
    # share a title.
    return merged.pivot_table(index='title', columns='user', values='rating', fill_value=0)


def get_recommends(book="Where the Heart Is (Oprah's Book Club (Paperback))",
                   min_user_ratings=200, min_book_ratings=100, n_recommendations=5):
    """Recommend books whose rating pattern is closest to that of `book`.

    Reads the notebook-level `df_books` and `df_ratings` frames, builds a
    filtered title x user matrix and queries a cosine-distance
    nearest-neighbour model with the row of `book`.

    Parameters:
        book: title to find neighbours for.
        min_user_ratings: users with fewer ratings are ignored. The default
            follows the challenge statement (drop users with fewer than 200
            ratings).
        min_book_ratings: ISBNs with fewer ratings are ignored. The default
            follows the challenge statement (drop books with fewer than 100
            ratings).
        n_recommendations: maximum number of recommended titles.

    Returns:
        `[book, [[title, distance], ...]]` - the queried title followed by up
        to `n_recommendations` `[title, cosine distance]` pairs. The pairs are
        the nearest neighbours ordered from the largest distance to the
        smallest; the notebook's test cell reads `result[0]` as the title and
        compares `result[1][0]` against its largest expected distance.

    Raises:
        KeyError: if `book` is not among the titles left after filtering.
    """
    title_user = _build_title_user_matrix(df_books, df_ratings, min_user_ratings, min_book_ratings)
    if book not in title_user.index:
        raise KeyError(
            f"{book!r} is not among the titles available after filtering "
            f"(min_user_ratings={min_user_ratings}, min_book_ratings={min_book_ratings})")

    ratings_sparse = csr_matrix(title_user.values)

    # The tree-based neighbour searches do not offer the cosine metric, so the
    # brute-force search is used.
    model = NearestNeighbors(metric='cosine', algorithm='brute')
    model.fit(ratings_sparse)

    book_index = title_user.index.get_loc(book)

    # Ask for one extra neighbour because the queried book is normally its own
    # nearest neighbour; never ask for more rows than the matrix holds.
    n_neighbors = min(n_recommendations + 1, ratings_sparse.shape[0])
    distances, indices = model.kneighbors(ratings_sparse[book_index], n_neighbors=n_neighbors)

    # kneighbors returns nearest first: drop the queried book, keep the
    # closest n, then flip to the farthest-first order of the result format.
    neighbours = [
        [title_user.index[i], float(d)]
        for d, i in zip(distances.flatten(), indices.flatten())
        if i != book_index
    ][:n_recommendations]
    neighbours.reverse()

    return [book, neighbours]

In [None]:
# Smoke check: print what get_recommends returns for the challenge's
# reference title.
sample_title = "Where the Heart Is (Oprah's Book Club (Paperback))"
books = get_recommends(sample_title)
print(books)

def test_book_recommendation():
  """Compare get_recommends' answer for the reference title with the expected one.

  Expects `recommends[0]` to be the queried title and `recommends[1]` to be a
  sequence of (title, distance) pairs. Only the first two pairs are checked:
  each title must be one of the expected titles and each distance must lie
  within 0.05 of the expected distance at the same position. Prints a pass or
  fail message; returns nothing.
  """
  title = "Where the Heart Is (Oprah's Book Club (Paperback))"
  expected_titles = ["I'll Be Seeing You", 'The Weight of Water', 'The Surgeon', 'I Know This Much Is True']
  expected_dists = [0.8, 0.77, 0.77, 0.77]

  recommends = get_recommends(title)

  # Every check is evaluated (no short-circuit); one truthy entry fails the test.
  failures = [recommends[0] != title]
  for rank in range(2):
    failures.append(recommends[1][rank][0] not in expected_titles)
    failures.append(abs(recommends[1][rank][1] - expected_dists[rank]) >= 0.05)

  if any(failures):
    print("You haven't passed yet. Keep trying!")
  else:
    print("You passed the challenge! 🎉🎉🎉🎉🎉")

test_book_recommendation()