In [2]:
# Book Recommendation System 

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import requests
import os
from sklearn.metrics.pairwise import cosine_similarity as c_score
from sklearn.metrics import mean_squared_error, mean_absolute_error, precision_score, recall_score, f1_score, roc_curve, auc, make_scorer
from sklearn.model_selection import cross_val_score
import warnings
warnings.filterwarnings("ignore")

# Create directory for dataset if it doesn't exist.
# exist_ok=True makes the call idempotent and avoids the check-then-create
# race of a separate os.path.exists() test.
os.makedirs('book-recommendation-dataset', exist_ok=True)

# Download datasets if they don't exist
def download_file(url, filename, timeout=30):
    """Download ``url`` to ``filename`` unless the file is already present.

    Args:
        url: Address of the resource to fetch.
        filename: Local path the response body is written to.
        timeout: Seconds to wait for the server before giving up.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection problems or timeout.
    """
    if os.path.exists(filename):
        print(f"{filename} already exists")
        return

    print(f"Downloading {filename}...")
    response = requests.get(url, timeout=timeout)
    # Without this check an error page (e.g. a 404 body) would be saved as the
    # dataset and then be treated as "already exists" on every later run.
    response.raise_for_status()

    # Write to a sibling temp file first so an interrupted write never leaves
    # a truncated file under the final name.
    partial_path = f"{filename}.part"
    with open(partial_path, 'wb') as f:
        f.write(response.content)
    os.replace(partial_path, filename)
    print(f"Downloaded {filename}")

# Remote location and names of the three raw CSV files
base_url = "https://raw.githubusercontent.com/arashnic/book-recommendation-dataset/main/"
files = ['Books.csv', 'Users.csv', 'Ratings.csv']

# Fetch each file into the local dataset directory (download_file skips
# files that are already present)
for file in files:
    filename = f'book-recommendation-dataset/{file}'
    url = base_url + file
    download_file(url, filename)

# Read the local CSVs into DataFrames
books, users, ratings = (
    pd.read_csv(f'book-recommendation-dataset/{csv_name}')
    for csv_name in ('Books.csv', 'Users.csv', 'Ratings.csv')
)

# Fix image URLs: upgrade only a leading "http://" scheme to "https://".
# An unanchored 'http' -> 'https' replacement would turn URLs that are already
# "https://..." into "httpss://..." and would also rewrite any later "http"
# occurrence inside the URL.
books['Image-URL-M'] = books['Image-URL-M'].str.replace(r'^http://', 'https://', regex=True)

# Popularity-based recommendation
# Join every rating to its book, then aggregate the ratings per title.
temp_br = books.merge(ratings, on="ISBN")
ratings_by_title = temp_br.groupby('Book-Title')['Book-Rating']
temp_num = ratings_by_title.count().reset_index(name='Votes')
temp_avg = ratings_by_title.mean().reset_index(name='Avg-rating')
pop_ = temp_num.merge(temp_avg, on='Book-Title')

# Keep titles with at least 250 votes, best average rating first, top 50 only.
temp__ = pop_.loc[pop_['Votes'] >= 250].sort_values('Avg-rating', ascending=False)
pop = temp__.head(50)

# Attach author and cover image; one row per title.
top50_columns = ['Book-Title', 'Book-Author', 'Image-URL-M', 'Votes', 'Avg-rating']
top50 = pop.merge(books, on='Book-Title')[top50_columns].drop_duplicates('Book-Title')
top50['Avg-rating'] = top50['Avg-rating'].round(2)
# Trim whitespace and drop a trailing parenthesised suffix from the title.
top50['Book-Title'] = top50['Book-Title'].str.strip().replace(r'\s{1,}\(.*\)', '', regex=True)

# Create processed-dataset directory if it doesn't exist
if not os.path.exists('processed-dataset'):
    os.makedirs('processed-dataset')

top50.to_csv('processed-dataset/top50.csv')

# Collaborative Filtering
# Keep users with more than 200 rating rows in the merged table.
x = temp_br.groupby('User-ID').count()['Book-Rating']
top_users = x[x > 200].index
filtered_users = temp_br[temp_br['User-ID'].isin(top_users)]

# Among those users, keep titles that received at least 50 ratings.
y = filtered_users.groupby('Book-Title').count()['Book-Rating']
famous_books = y[y >= 50].reset_index()['Book-Title'].values
# .copy() gives an independent frame: assigning columns on a slice of
# filtered_users is chained assignment (SettingWithCopyWarning, hidden here by
# the global warnings filter) and is not guaranteed to take effect.
filtered_books = filtered_users[filtered_users['Book-Title'].isin(famous_books)].copy()

# Normalise titles: trim whitespace and drop a trailing parenthesised suffix.
filtered_books['Book-Title'] = filtered_books['Book-Title'].str.strip().replace(r'\s{1,}\(.*\)', '', regex=True)
# The two replacements below are literal. regex=False is passed explicitly
# because the default of Series.str.replace differs between pandas versions,
# and the second pattern contains "\O", which is an invalid regex escape.
filtered_books['Book-Title'] = filtered_books['Book-Title'].str.replace('&amp;', 'and', regex=False)
filtered_books['Book-Title'] = filtered_books['Book-Title'].str.replace('\\O\\\" Is for Outlaw"', "O is for Outlaw", regex=False)

# Title x user rating matrix; missing ratings become 0.0. pivot_table averages
# duplicate (title, user) pairs by default.
pt = filtered_books.pivot_table(index='Book-Title', columns='User-ID', values='Book-Rating').fillna(0.0)
# Pairwise cosine similarity between the title rows of pt.
sim_scores = c_score(pt)

# Recommender function
def recommend(book, top_n=5):
    """Return the titles most similar to ``book`` according to ``sim_scores``.

    Args:
        book: A title exactly as it appears in ``pt.index``.
        top_n: Maximum number of suggestions to return (default 5).

    Returns:
        A list of up to ``top_n`` titles, most similar first. The queried
        book itself is never part of the result.

    Raises:
        IndexError: If ``book`` is not present in ``pt.index``.
    """
    matches = np.where(pt.index == book)[0]
    if matches.size == 0:
        raise IndexError(f"Book not found in the rating matrix: {book!r}")
    index = matches[0]

    ranked = sorted(enumerate(sim_scores[index]), key=lambda pair: pair[1], reverse=True)
    # Exclude the query row explicitly instead of dropping the first ranked
    # entry: with tied scores (identical or all-zero rating vectors) the query
    # book is not guaranteed to sort first and would otherwise be returned as
    # its own recommendation.
    neighbours = [position for position, _ in ranked if position != index]
    return [pt.index[position] for position in neighbours[:top_n]]

# Suggestion table: one row per title holding its recommended titles
all_suggestions = {title: recommend(title) for title in pt.index}
suggestion_columns = {'index': 'book-title', 0: "1st", 1: "2nd", 2: "3rd", 3: "4th", 4: "5th"}
suggestions = pd.DataFrame(all_suggestions).T.reset_index()
suggestions = suggestions.rename(columns=suggestion_columns)
suggestions.to_csv('processed-dataset/sugg.csv')

# Book metadata: one row per cleaned title, restricted to the display columns
metadata_columns = ['Book-Title', 'Book-Author', 'Publisher', 'Year-Of-Publication', 'Image-URL-M']
unique_titles = filtered_books.drop_duplicates('Book-Title')
temp_books = unique_titles[metadata_columns]
temp_books.to_csv('processed-dataset/final.csv')

# --- EDA & Evaluation ---
# Print the (rows, columns) shape of each raw table.
print("\n--- Dataset Shapes ---")
for label, frame in (("Books:", books), ("Users:", users), ("Ratings:", ratings)):
    print(label, frame.shape)

# Summary statistics; these are taken before the year column is coerced below.
print("\n--- Basic Stats ---")
for summary in (books.describe(include='all'), ratings.describe()):
    print(summary)

# Convert 'Year-Of-Publication' to numeric; values that cannot be parsed
# become NaN
year_as_number = pd.to_numeric(books['Year-Of-Publication'], errors='coerce')
books['Year-Of-Publication'] = year_as_number

# Rating Distribution: histogram of all rating values, saved as a PNG
# NOTE(review): if ratings are integers on a 0-10 scale (11 distinct values),
# bins=10 merges two of them into one bar - confirm whether that is intended.
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(data=ratings, x='Book-Rating', bins=10, ax=ax)
ax.set_title('Distribution of Book Ratings')
ax.set_xlabel('Rating')
ax.set_ylabel('Count')
fig.savefig('processed-dataset/rating_distribution.png')
plt.close(fig)

# Top Authors: the ten most frequent authors in the books table
top_authors = books['Book-Author'].value_counts().head(10)
fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(x=top_authors.values, y=top_authors.index, ax=ax)
ax.set_title('Top 10 Authors by Number of Books')
ax.set_xlabel('Number of Books')
ax.set_ylabel('Author')
fig.savefig('processed-dataset/top_authors.png')
plt.close(fig)

# Publication Year: histogram of the (numeric) publication year column
fig, ax = plt.subplots(figsize=(12, 6))
sns.histplot(data=books, x='Year-Of-Publication', bins=50, ax=ax)
ax.set_title('Distribution of Publication Years')
ax.set_xlabel('Year')
ax.set_ylabel('Count')
fig.savefig('processed-dataset/publication_years.png')
plt.close(fig)

# Evaluation metrics
# NOTE: sample_pred is synthetic noise drawn around the mean/std of the sampled
# true ratings; it is not produced by the recommender above.
sample_true = ratings['Book-Rating'].head(1000)
# A seeded generator makes the reported metrics reproducible between runs.
rng = np.random.default_rng(42)
# Size follows the actual sample so the arrays always align, even when the
# ratings table has fewer than 1000 rows.
sample_pred = rng.normal(sample_true.mean(), sample_true.std(), len(sample_true))

def evaluate_rating_predictions(y_true, y_pred):
    """Print MSE, RMSE and MAE of ``y_pred`` against ``y_true``.

    Args:
        y_true: Observed rating values.
        y_pred: Predicted rating values, aligned with ``y_true``.
    """
    squared_error = mean_squared_error(y_true, y_pred)
    absolute_error = mean_absolute_error(y_true, y_pred)
    # RMSE is the square root of the MSE computed above.
    root_error = np.sqrt(squared_error)
    print(f"MSE: {squared_error:.4f}, RMSE: {root_error:.4f}, MAE: {absolute_error:.4f}")

def evaluate_recommendations(true_ratings, predicted_ratings, threshold=7):
    """Print precision, recall and F1 after binarising ratings at ``threshold``.

    A rating counts as positive when it is greater than or equal to
    ``threshold``; the same cut-off is applied to both inputs.

    Args:
        true_ratings: Observed rating values.
        predicted_ratings: Predicted rating values, aligned with the above.
        threshold: Minimum rating treated as a positive label.
    """
    liked_true, liked_pred = (
        (values >= threshold).astype(int)
        for values in (true_ratings, predicted_ratings)
    )
    precision, recall, f1 = (
        metric(liked_true, liked_pred)
        for metric in (precision_score, recall_score, f1_score)
    )
    print(f"Precision: {precision:.4f}, Recall: {recall:.4f}, F1: {f1:.4f}")

def plot_roc_curve(true_ratings, predicted_ratings, threshold=7):
    """Plot the ROC curve and save it to 'processed-dataset/roc_curve.png'.

    True ratings are binarised with ``>= threshold``; the predicted ratings
    are passed to ``roc_curve`` unchanged, as continuous scores.

    Args:
        true_ratings: Observed rating values.
        predicted_ratings: Predicted rating values used as scores.
        threshold: Minimum true rating treated as a positive label.
    """
    positives = (true_ratings >= threshold).astype(int)
    false_pos_rate, true_pos_rate, _ = roc_curve(positives, predicted_ratings)
    roc_auc = auc(false_pos_rate, true_pos_rate)

    fig, ax = plt.subplots(figsize=(8, 6))
    ax.plot(false_pos_rate, true_pos_rate, color='darkorange', lw=2,
            label=f'ROC curve (AUC = {roc_auc:.2f})')
    # Diagonal reference line from (0, 0) to (1, 1).
    ax.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
    ax.set_xlabel('False Positive Rate')
    ax.set_ylabel('True Positive Rate')
    ax.set_title('ROC Curve')
    ax.legend(loc="lower right")
    fig.savefig('processed-dataset/roc_curve.png')
    plt.close(fig)

# Run Evaluations: each step receives the same true/predicted sample pair
print("\n--- Evaluation Metrics ---")
for run_step in (evaluate_rating_predictions, evaluate_recommendations, plot_roc_curve):
    run_step(sample_true, sample_pred)


book-recommendation-dataset/Books.csv already exists
book-recommendation-dataset/Users.csv already exists
book-recommendation-dataset/Ratings.csv already exists

--- Dataset Shapes ---
Books: (271360, 8)
Users: (278858, 3)
Ratings: (1149780, 3)

--- Basic Stats ---
              ISBN      Book-Title      Book-Author  Year-Of-Publication  \
count       271360          271360           271358               271360   
unique      271360          242135           102022                  202   
top     020130998X  Selected Poems  Agatha Christie                 2002   
freq             1              27              632                13903   

        Publisher                                        Image-URL-S  \
count      271358                                             271360   
unique      16807                                             271044   
top     Harlequin  http://images.amazon.com/images/P/155936078X.0...   
freq         7535                                                