In [79]:
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics import mean_squared_error
from math import sqrt
import pickle

In [80]:
# Load the data
books = pd.read_csv('books.csv', sep=";", error_bad_lines=False, encoding='latin-1')
users = pd.read_csv('users.csv', sep=';', error_bad_lines=False, encoding='latin-1')
ratings = pd.read_csv('ratings.csv', sep=';', error_bad_lines=False, encoding='latin-1')



  books = pd.read_csv('books.csv', sep=";", error_bad_lines=False, encoding='latin-1')
Skipping line 6452: expected 8 fields, saw 9
Skipping line 43667: expected 8 fields, saw 10
Skipping line 51751: expected 8 fields, saw 9

Skipping line 92038: expected 8 fields, saw 9
Skipping line 104319: expected 8 fields, saw 9
Skipping line 121768: expected 8 fields, saw 9

Skipping line 144058: expected 8 fields, saw 9
Skipping line 150789: expected 8 fields, saw 9
Skipping line 157128: expected 8 fields, saw 9
Skipping line 180189: expected 8 fields, saw 9
Skipping line 185738: expected 8 fields, saw 9

Skipping line 209388: expected 8 fields, saw 9
Skipping line 220626: expected 8 fields, saw 9
Skipping line 227933: expected 8 fields, saw 11
Skipping line 228957: expected 8 fields, saw 10
Skipping line 245933: expected 8 fields, saw 9
Skipping line 251296: expected 8 fields, saw 9
Skipping line 259941: expected 8 fields, saw 9
Skipping line 261529: expected 8 fields, saw 9

  books = pd.rea

In [81]:
# Rename columns for consistency
books = books[['ISBN', 'Book-Title', 'Book-Author', 'Year-Of-Publication', 'Publisher', 'Image-URL-L']]
books.rename(columns={
    "Book-Title": "title",
    "Book-Author": "author",
    "Year-Of-Publication": "year",
    "Publisher": "publisher",
    "Image-URL-L": "img_url"
}, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  books.rename(columns={


In [82]:
# Filter users with more than 200 ratings
x = ratings['User-ID'].value_counts() > 200
y = x[x].index
ratings = ratings[ratings['User-ID'].isin(y)]

In [83]:
# Merge dataframes
ratings_with_books = ratings.merge(books, on="ISBN")
num_rating = ratings_with_books.groupby('title')['Book-Rating'].count().reset_index()
num_rating.rename(columns={'Book-Rating': 'num_of_rating'}, inplace=True)
final_rating = ratings_with_books.merge(num_rating, on='title')
final_rating = final_rating[final_rating['num_of_rating'] >= 50]
final_rating.drop_duplicates(['User-ID', 'title'], inplace=True)

In [84]:
# Create a pivot table
book_pivot = final_rating.pivot_table(columns="User-ID", index='title', values='Book-Rating')
book_pivot.fillna(0, inplace=True)
book_sparse = csr_matrix(book_pivot)

In [85]:
# Train the model
model = NearestNeighbors(algorithm='brute')
model.fit(book_sparse)

In [86]:
# Save the model and related data
books_name = book_pivot.index
pickle.dump(model, open('artifacts/model.pkl', 'wb'))
pickle.dump(books_name, open('artifacts/books_name.pkl', 'wb'))
pickle.dump(final_rating, open('artifacts/final_rating.pkl', 'wb'))
pickle.dump(book_pivot, open('artifacts/book_pivot.pkl', 'wb'))

In [87]:
def recommend_books(book_name):
    # Make recommendations
    book_id_candidates = book_pivot.index[book_pivot.index.str.contains(book_name, case=False, regex=False)]

    if len(book_id_candidates) == 0:
        print(f"Book '{book_name}' not found in the dataset.")
        return

    book_id = book_id_candidates[0]

    print(f"Making recommendations for '{book_name}' (Index: {book_id})")

    try:
        _, suggestions = model.kneighbors(book_pivot.loc[book_id, :].values.reshape(1, -1), n_neighbors=6)
        suggestions = [book_pivot.index[i] for i in suggestions[0][1:]]  # Map the indices to the original DataFrame
        recommendations = {book_name: suggestions}  # Use book_name as the key
    except KeyError as e:
        print(f"Error: {e}")
        return

    # Print recommendations
    print("Recommended books:")
    for book_title in suggestions:
        print(book_title)

    return recommendations

In [88]:
def evaluate_recommendations(book_name, recommendations, test_set_size=0.1):
    # Split the data into training and test sets
    train_set_size = 1 - test_set_size
    train_set = book_pivot.sample(frac=train_set_size, random_state=42)
    test_set = book_pivot.drop(train_set.index)

    # Evaluate accuracy using MSE
    mse_values = []
    for book_title in test_set.index:  # Iterate using book titles
        actual_ratings = test_set.loc[book_title, :].values
        predicted_ratings = np.mean(book_pivot.loc[recommendations[book_name], :].values, axis=0)
        mse = mean_squared_error(actual_ratings, predicted_ratings)
        mse_values.append(mse)

    overall_mse = np.mean(mse_values)
    print(f'Mean Squared Error (MSE) for Recommendations: {overall_mse}')

In [89]:
# Example usage
book_to_recommend = '1st to Die: A Novel'
recommendations = recommend_books(book_to_recommend)

Making recommendations for '1st to Die: A Novel' (Index: 1st to Die: A Novel)
Recommended books:
Exclusive
The Cradle Will Fall
The Clinic (Alex Delaware Novels (Paperback))
No Safe Place
The Sum of All Fears (Jack Ryan Novels)


In [90]:
evaluate_recommendations(book_to_recommend, recommendations)

Mean Squared Error (MSE) for Recommendations: 1.4215041392744094
