# Book Recommendation AI System

This notebook implements a hybrid book recommendation system with two phases:
1. Content-based filtering using BART summaries and metadata
2. Neural Collaborative Filtering with content embeddings

Key features:
- Bias reduction and diversity promotion
- Comprehensive evaluation metrics
- Hybrid approach combining content and collaborative filtering

In [None]:
%pip install transformers sentence-transformers torch pandas numpy scikit-learn mongoengine



In [3]:
import torch
import torch.nn as nn
import numpy as np
import pandas as pd
from transformers import BartTokenizer, BartForConditionalGeneration
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, mean_absolute_error, mean_squared_error
import matplotlib.pyplot as plt
import seaborn as sns

# Load Data

## Phase 1: Content-Based Filtering

In this phase, we'll:
1. Generate book summaries using BART
2. Create embeddings using Sentence-BERT
3. Implement content-based recommendations
4. Add diversity-aware re-ranking

In [4]:
class BARTSummarizer:
    """Bundle a BART summarisation model with a sentence-embedding encoder.

    Attributes:
        tokenizer: tokenizer loaded from the 'facebook/bart-large-cnn' checkpoint.
        model: conditional-generation model loaded from the same checkpoint.
        sentence_transformer: 'all-MiniLM-L6-v2' encoder used by ``get_embedding``.
    """

    def __init__(self):
        # All three pretrained components are loaded eagerly at construction.
        self.tokenizer = BartTokenizer.from_pretrained('facebook/bart-large-cnn')
        self.model = BartForConditionalGeneration.from_pretrained('facebook/bart-large-cnn')
        self.sentence_transformer = SentenceTransformer('all-MiniLM-L6-v2')

    def generate_summary(self, text):
        """Return an abstractive summary of ``text``.

        Args:
            text: source text; it is truncated to 1024 tokens before generation.

        Returns:
            The decoded summary string, or ``''`` when ``text`` is ``None`` or
            contains only whitespace (there is nothing to summarise, and
            generating from empty input would only produce made-up text).
        """
        if text is None or not str(text).strip():
            return ''

        inputs = self.tokenizer(text, max_length=1024, truncation=True, return_tensors='pt')
        summary_ids = self.model.generate(
            inputs['input_ids'],
            # Pass the mask explicitly instead of letting generate() infer it.
            attention_mask=inputs['attention_mask'],
            num_beams=4,
            max_length=100,
            early_stopping=True,
        )
        return self.tokenizer.decode(summary_ids[0], skip_special_tokens=True)

    def get_embedding(self, text):
        """Encode ``text`` with the sentence transformer and return the result."""
        return self.sentence_transformer.encode(text)

In [5]:
class ContentBasedRecommender:
    """Content-based recommender built on sentence embeddings of book metadata.

    Attributes:
        books_df: DataFrame of books; ``prepare_embeddings`` reads its
            'title', 'author', 'genre' and 'description' columns.
        summarizer: ``BARTSummarizer`` used to embed the combined text.
        embeddings: 2-D array with one row per book, or ``None`` until
            ``prepare_embeddings`` has been called.
    """

    def __init__(self, books_df):
        self.books_df = books_df
        self.summarizer = BARTSummarizer()
        self.embeddings = None  # filled in by prepare_embeddings()

    def prepare_embeddings(self):
        """Embed every book's combined metadata text into ``self.embeddings``.

        Raises:
            ValueError: if ``books_df`` has no rows.
            KeyError: if one of the four metadata columns is missing.
        """
        # Combine book metadata and description into one text per book.
        combined_features = [
            f"{row['title']} {row['author']} {row['genre']} {row['description']}"
            for _, row in self.books_df.iterrows()
        ]
        if not combined_features:
            raise ValueError("books_df is empty; there is nothing to embed.")

        # Encode the whole list in one call so the encoder can batch the work
        # instead of running one forward pass per book.
        self.embeddings = np.asarray(self.summarizer.get_embedding(combined_features))

    def get_recommendations(self, book_idx, n=5, diversity_weight=0.3):
        """Recommend up to ``n`` books similar to ``book_idx`` with diversity re-ranking.

        The first pick is the most similar book. Each later pick maximises
        ``(1 - diversity_weight) * similarity_to_query
        + diversity_weight * (1 - max similarity to already picked books)``.

        Args:
            book_idx: positional index of the query book in ``self.embeddings``.
            n: maximum number of recommendations.
            diversity_weight: 0 ranks purely by similarity; larger values
                favour books unlike those already picked.

        Returns:
            A list of distinct positional indices (Python ints). The query
            book itself is never returned, and the list is shorter than ``n``
            when the catalogue does not hold enough other books.

        Raises:
            ValueError: if ``prepare_embeddings`` has not been called.
        """
        if self.embeddings is None:
            raise ValueError("Embeddings are not prepared; call prepare_embeddings() first.")

        embeddings = np.asarray(self.embeddings, dtype=float)
        num_books = embeddings.shape[0]

        # Normalise once so every cosine similarity is a plain dot product.
        # Zero vectors keep a norm of 1 and therefore score 0 against everything.
        norms = np.linalg.norm(embeddings, axis=1, keepdims=True)
        norms[norms == 0] = 1.0
        unit = embeddings / norms

        similarities = unit @ unit[book_idx]

        # The query book must not be recommended back to the caller.
        available = np.ones(num_books, dtype=bool)
        available[book_idx] = False
        n_picks = max(0, min(int(n), int(available.sum())))

        recommendations = []
        # Highest similarity of each book to anything picked so far.
        max_sim_to_picked = np.full(num_books, -np.inf)
        for _ in range(n_picks):
            if not recommendations:
                scores = similarities.copy()
            else:
                # Balance between similarity and diversity.
                diversity_scores = 1.0 - max_sim_to_picked
                scores = (
                    (1 - diversity_weight) * similarities +
                    diversity_weight * diversity_scores
                )
            scores[~available] = -np.inf  # hard exclusion, independent of score range
            idx = int(np.argmax(scores))

            recommendations.append(idx)
            available[idx] = False
            max_sim_to_picked = np.maximum(max_sim_to_picked, unit @ unit[idx])

        return recommendations

## Phase 2: Neural Collaborative Filtering

Implementing a hybrid NCF model that incorporates:
1. User-item interactions
2. Content embeddings from Phase 1
3. Fairness and bias checks (measured at evaluation time via demographic parity in `evaluate_recommendations`)

In [6]:
class NCF(nn.Module):
    """Neural collaborative filtering network with an extra content branch.

    A user embedding, an item embedding and a linear projection of the item's
    content embedding are concatenated and scored by a small MLP that ends in
    a sigmoid.

    Args:
        num_users: number of rows in the user embedding table.
        num_items: number of rows in the item embedding table.
        embedding_dim: width of each of the three concatenated branches.
        content_dim: width of the incoming content embedding.
    """

    def __init__(self, num_users, num_items, embedding_dim, content_dim):
        super().__init__()

        # Learned lookup tables for users and items.
        self.user_embedding = nn.Embedding(num_users, embedding_dim)
        self.item_embedding = nn.Embedding(num_items, embedding_dim)

        # Maps the content embedding to the same width as the lookup tables.
        self.content_projection = nn.Linear(content_dim, embedding_dim)

        # MLP head: (3 * embedding_dim) -> 128 -> 64 -> 1, dropout 0.2 after
        # each hidden activation.
        mlp = []
        in_features = embedding_dim * 3
        for hidden_width in (128, 64):
            mlp.append(nn.Linear(in_features, hidden_width))
            mlp.append(nn.ReLU())
            mlp.append(nn.Dropout(0.2))
            in_features = hidden_width
        mlp.append(nn.Linear(in_features, 1))
        mlp.append(nn.Sigmoid())
        self.fc_layers = nn.Sequential(*mlp)

    def forward(self, user_idx, item_idx, content_embedding):
        """Score user/item pairs; the result has one sigmoid output per row."""
        branches = (
            self.user_embedding(user_idx),
            self.item_embedding(item_idx),
            self.content_projection(content_embedding),
        )
        # Concatenate the three branches along the feature axis.
        return self.fc_layers(torch.cat(branches, dim=1))

In [7]:
def train_ncf(model, train_loader, val_loader, num_epochs=10):
    """Train ``model`` with Adam and binary cross-entropy, printing losses per epoch.

    Args:
        model: module called as ``model(user, item, content)`` that returns
            probabilities (BCELoss is applied directly to its output).
        train_loader: iterable of ``(user, item, content, label)`` batches
            that supports ``len()``.
        val_loader: same structure as ``train_loader``; used without gradients.
        num_epochs: number of passes over ``train_loader``.

    Returns:
        None. The model is updated in place and progress is printed.
    """
    optimizer = torch.optim.Adam(model.parameters())
    criterion = nn.BCELoss()

    def batch_loss(user, item, content, label):
        pred = model(user, item, content)
        # BCELoss requires identical shapes: align labels such as (B,) with
        # predictions such as (B, 1) instead of failing on the mismatch.
        target = label.to(pred.dtype).reshape(pred.shape)
        return criterion(pred, target)

    # Guard the averages so an empty loader reports 0 instead of dividing by zero.
    n_train_batches = max(len(train_loader), 1)
    n_val_batches = max(len(val_loader), 1)

    for epoch in range(num_epochs):
        # Training
        model.train()
        train_loss = 0
        for user, item, content, label in train_loader:
            loss = batch_loss(user, item, content, label)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            train_loss += loss.item()

        # Validation
        model.eval()
        val_loss = 0
        with torch.no_grad():
            for user, item, content, label in val_loader:
                val_loss += batch_loss(user, item, content, label).item()

        print(f'Epoch {epoch+1}/{num_epochs}')
        print(f'Training Loss: {train_loss/n_train_batches:.4f}')
        print(f'Validation Loss: {val_loss/n_val_batches:.4f}')

def evaluate_recommendations(predictions, true_labels, user_demographics=None):
    """Compute classification, error and (optionally) fairness metrics.

    Args:
        predictions: predicted scores; any array-like (list, ndarray, Series).
            Scores above 0.5 count as positive predictions.
        true_labels: ground-truth binary labels, same length as ``predictions``.
        user_demographics: optional array-like of group labels, one per
            prediction. When given, demographic parity is reported.

    Returns:
        Dict with 'precision', 'recall', 'rmse' and 'mae', plus
        'demographic_parity' (1 minus the largest gap in positive-prediction
        rate between groups) when ``user_demographics`` is provided.

    Raises:
        ValueError: if ``user_demographics`` has a different length than
            ``predictions``.
    """
    # Work on flat numpy arrays so lists work and pandas index alignment
    # cannot silently reorder or misalign the inputs.
    scores = np.asarray(predictions, dtype=float).ravel()
    labels = np.asarray(true_labels).ravel()
    predicted_positive = (scores > 0.5).astype(int)

    metrics = {
        # zero_division=0 makes the "no positive predictions/labels" result explicit.
        'precision': precision_score(labels, predicted_positive, zero_division=0),
        'recall': recall_score(labels, predicted_positive, zero_division=0),
        'rmse': np.sqrt(mean_squared_error(labels, scores)),
        'mae': mean_absolute_error(labels, scores)
    }

    if user_demographics is not None:
        groups = np.asarray(user_demographics).ravel()
        if groups.shape[0] != scores.shape[0]:
            raise ValueError(
                "user_demographics must have one entry per prediction "
                f"({groups.shape[0]} != {scores.shape[0]})."
            )

        # Positive-prediction rate per demographic group.
        predictions_by_group = {}
        for demo_group in pd.unique(groups):
            group_mask = groups == demo_group
            if group_mask.any():  # e.g. NaN never equals itself; skip empty groups
                predictions_by_group[demo_group] = predicted_positive[group_mask].mean()

        if predictions_by_group:
            # Calculate max difference between groups.
            rates = list(predictions_by_group.values())
            max_disparity = max(rates) - min(rates)
            metrics['demographic_parity'] = 1 - max_disparity

    return metrics

In [8]:
def calculate_diversity_metrics(recommendations, book_embeddings):
    """Measure how varied a recommendation list is.

    Args:
        recommendations: sequence of indices into ``book_embeddings``.
        book_embeddings: sequence or 2-D array with one embedding per book.

    Returns:
        Dict with:
            'diversity_score': mean pairwise cosine distance between the
                recommended books (0 when fewer than two are given).
            'coverage': number of distinct recommended books divided by the
                catalogue size (0 for an empty catalogue).
    """
    recommendations = list(recommendations)
    catalogue_size = len(book_embeddings)

    # Coverage is well defined for any list length, including a single item.
    coverage = len(set(recommendations)) / catalogue_size if catalogue_size else 0

    if len(recommendations) < 2:
        # No pairs exist, so there is no pairwise distance to average.
        return {
            'diversity_score': 0,
            'coverage': coverage
        }

    selected = np.asarray([book_embeddings[idx] for idx in recommendations], dtype=float)

    # Normalise rows so one matrix product yields every pairwise cosine
    # similarity. Zero vectors keep norm 1 and score 0 against everything.
    norms = np.linalg.norm(selected, axis=1, keepdims=True)
    norms[norms == 0] = 1.0
    unit = selected / norms
    similarity = unit @ unit.T

    # Each unordered pair (i < j) exactly once.
    upper = np.triu_indices(len(recommendations), k=1)
    diversity_score = float(np.mean(1.0 - similarity[upper]))

    return {
        'diversity_score': diversity_score,
        'coverage': coverage
    }

## Data Loading and Preprocessing

Load book data and user interactions from the database, then prepare it for both recommendation approaches.

In [37]:
from google.colab import userdata
import os

# The connection string is read from the Colab secret named 'MONGO_URI'.
mongo_uri = userdata.get('MONGO_URI')

if not mongo_uri:
    # Nothing is exported in this case; a later database connection will have
    # to obtain the URI some other way.
    print("Warning: 'MONGO_URI' secret not found in Colab Secrets.")
else:
    # Expose the URI through the process environment.
    os.environ['MONGO_URI'] = mongo_uri
    print("MONGO_URI environment variable set from Colab Secrets.")

MONGO_URI environment variable set from Colab Secrets.


In [38]:
def load_data():
    """Load book data and user interactions from MongoDB.

    Returns:
        (books, interactions_df):
            books: DataFrame of Book documents with 'description' NaNs
                replaced by ''. When an '_id' column is present, the index
                is set to the stringified document id so interaction
                ``book_id`` values can be matched against it.
            interactions_df: DataFrame with columns 'user_id', 'book_id'
                (both strings) and 'rating' (1 = favorite, 0 = viewed but not
                favorited). The columns exist even when no rows were loaded.

    Raises:
        ModuleNotFoundError: if the project's ``models`` / ``config`` packages
            cannot be imported.
    """
    _add_project_root_to_path(max_levels_up=5)

    # Import models - these resolve through the directory added above.
    try:
        from models.Book import Book
        # User is not referenced below; presumably importing it registers the
        # document class needed to dereference profile.user - confirm before removing.
        from models.User import User  # noqa: F401
        from models.ReadingProfile import ReadingProfile
        from config.db import connect_db  # Assuming config is also in the parent dir
        print("Successfully imported models and config.")
    except ModuleNotFoundError as e:
        import sys
        print(f"Failed to import models or config: {e}")
        print("Please ensure the correct path containing the 'models' and 'config' directories is added to sys.path.")
        print(f"Current sys.path: {sys.path}")
        raise

    # Connect to database
    connect_db()

    books = _load_books_frame(Book)

    if 'description' in books.columns:
        books['description'] = books['description'].fillna('')
    else:
        books['description'] = ''

    if '_id' in books.columns:
        # Interactions reference books by stringified document id, so expose
        # the same ids as the index (preprocess_data builds its map from it).
        books.index = books['_id'].map(_stringify_document_id)
        books.index.name = 'book_id'

    interactions_df = pd.DataFrame(
        _collect_interactions(ReadingProfile),
        columns=['user_id', 'book_id', 'rating'],  # keep the schema even when empty
    )

    # Ensure book_id and user_id are consistent object types (strings)
    if not interactions_df.empty:
        interactions_df['book_id'] = interactions_df['book_id'].astype(str)
        interactions_df['user_id'] = interactions_df['user_id'].astype(str)

    return books, interactions_df


def _add_project_root_to_path(max_levels_up=5):
    """Put the nearest ancestor directory containing 'models' on sys.path.

    Returns the directory that was found, or None when the search failed.
    """
    import inspect
    import os
    import sys

    # In a notebook the code's "file" is a synthetic name that may not map to
    # a real directory, so the working directory is always searched as well.
    start_dirs = []
    source_file = inspect.getfile(inspect.currentframe())
    if os.path.isfile(source_file):
        start_dirs.append(os.path.dirname(os.path.abspath(source_file)))
    start_dirs.append(os.getcwd())

    for start_dir in start_dirs:
        for levels in range(max_levels_up):
            candidate_path = os.path.abspath(os.path.join(start_dir, *(['..'] * levels)))
            models_path_candidate = os.path.join(candidate_path, 'models')
            if os.path.isdir(models_path_candidate):
                print(f"Found 'models' directory at: {models_path_candidate}")
                if candidate_path not in sys.path:
                    sys.path.insert(0, candidate_path)
                    print(f"Added '{candidate_path}' to sys.path")
                return candidate_path

    print(f"Error: Could not automatically find the directory containing 'models' within {max_levels_up} levels up.")
    print("Please manually specify the correct 'path_to_add'.")
    return None


def _stringify_document_id(value):
    """Return a document id as a plain string (unwraps extended-JSON {'$oid': ...})."""
    if isinstance(value, dict) and '$oid' in value:
        return str(value['$oid'])
    return str(value)


def _load_books_frame(Book):
    """Read all Book documents into a DataFrame, via JSON first, raw dicts second."""
    from io import StringIO

    try:
        # Wrap the JSON text in a buffer: passing a literal JSON string to
        # pd.read_json is deprecated in recent pandas releases.
        return pd.read_json(StringIO(Book.objects.to_json()))
    except ValueError:
        print("Attempting to load books using list(Book.objects.as_pymongo())")
        try:
            # as_pymongo() yields plain dicts, which DataFrame accepts directly.
            return pd.DataFrame(list(Book.objects.as_pymongo()))
        except Exception as e_values:
            print(f"Error loading books using .as_pymongo(): {e_values}")
            print("Could not load book data.")
            raise  # Re-raise: book loading is essential


def _collect_interactions(ReadingProfile):
    """Build interaction records from every ReadingProfile (best effort).

    A failure while iterating is printed and whatever was collected so far is
    returned, so the result may be partial or empty.
    """
    interactions = []
    try:
        for profile in ReadingProfile.objects:
            if not profile.user:
                continue  # Skip profiles with no user
            user_id = str(profile.user.id)  # Convert ObjectId to string

            favorites = [book for book in (getattr(profile, 'favorites', None) or []) if book]
            # Built once per profile rather than once per viewed book.
            favorite_ids = {book.id for book in favorites}

            # Positive interactions from favorites
            for book in favorites:
                interactions.append({
                    'user_id': user_id,
                    'book_id': str(book.id),
                    'rating': 1
                })

            # Implicit negatives: viewed but not favorited
            for book in (getattr(profile, 'viewed_books', None) or []):
                if book and book.id not in favorite_ids:
                    interactions.append({
                        'user_id': user_id,
                        'book_id': str(book.id),
                        'rating': 0
                    })
    except Exception as e_interactions:
        print(f"Error loading interactions from ReadingProfile: {e_interactions}")
    return interactions

In [10]:
def preprocess_data(books_df, interactions_df):
    """Prepare data for both recommendation approaches.

    Args:
        books_df: DataFrame of books whose index holds the book ids referenced
            by ``interactions_df['book_id']``.
        interactions_df: DataFrame with 'user_id', 'book_id' and 'rating'
            columns. It is not modified.

    Returns:
        Dict with 'user_map' and 'book_map' (id -> contiguous index),
        'train_data' and 'val_data' (80/20 split carrying integer 'user_idx'
        and 'book_idx' columns), and 'n_users' / 'n_items'.

    Raises:
        ValueError: if a required column is missing or no interaction refers
            to a known book.
    """
    missing = {'user_id', 'book_id', 'rating'} - set(interactions_df.columns)
    if missing:
        raise ValueError(f"interactions_df is missing required columns: {sorted(missing)}")

    # Work on a copy so re-running the cell never sees half-processed input.
    interactions = interactions_df.copy()

    book_map = {id_: idx for idx, id_ in enumerate(books_df.index)}
    interactions['book_idx'] = interactions['book_id'].map(book_map)

    # Rows that point at unknown books (or have no user) would otherwise turn
    # into NaN indices and break the embedding lookups later on.
    unusable = interactions['book_idx'].isna() | interactions['user_id'].isna()
    if unusable.any():
        print(f"Dropping {int(unusable.sum())} interactions with unknown book or user ids.")
        interactions = interactions[~unusable]
    if interactions.empty:
        raise ValueError(
            "No interaction refers to a book in books_df.index; "
            "check that the index holds the same ids as interactions_df['book_id']."
        )

    user_map = {id_: idx for idx, id_ in enumerate(interactions['user_id'].unique())}
    interactions['user_idx'] = interactions['user_id'].map(user_map)
    interactions = interactions.astype({'user_idx': 'int64', 'book_idx': 'int64'})

    # Stratify on the label when every class has at least two rows;
    # train_test_split rejects stratification on singleton classes.
    label_counts = interactions['rating'].value_counts()
    stratify = interactions['rating'] if label_counts.min() >= 2 else None

    train_data, val_data = train_test_split(
        interactions, test_size=0.2, random_state=42,
        stratify=stratify
    )

    return {
        'user_map': user_map,
        'book_map': book_map,
        'train_data': train_data,
        'val_data': val_data,
        'n_users': len(user_map),
        'n_items': len(book_map)
    }

## Public Datasets Integration

Enrich our recommendation system with public datasets:
1. Goodreads Dataset (Kaggle): 10M+ book ratings and metadata
2. Amazon Customer Reviews: Book reviews and ratings
3. Project Gutenberg: Text content for style analysis
4. Open Library API: Rich book metadata

This helps address the cold-start problem and improves recommendation quality.

In [11]:
def load_goodreads_data(data_path='../data/goodreads', n_users=10000, n_books=5000):
    """Load and preprocess Goodreads dataset
    Download from: https://www.kaggle.com/datasets/bahramjannesarr/goodreads-book-datasets-10m

    Args:
        data_path: directory containing ``ratings.csv`` and ``books.csv``.
        n_users: keep only the ``n_users`` users with the most ratings.
        n_books: keep only the ``n_books`` books with the most ratings.

    Returns:
        (books_subset, ratings_subset): the books among the top ``n_books``,
        and the ratings where both user and book are in the top sets.
        'rating' is binarised (1 when the original rating is >= 4, else 0).
    """
    # Load ratings and books
    ratings = pd.read_csv(f'{data_path}/ratings.csv')
    books = pd.read_csv(f'{data_path}/books.csv')

    # Clean and preprocess: text columns must not contain NaN. A column that
    # is absent from the file is created empty rather than raising KeyError.
    for text_column in ('description', 'genres'):
        if text_column in books.columns:
            books[text_column] = books[text_column].fillna('')
        else:
            books[text_column] = ''

    # Convert ratings to binary interactions (rating >= 4 considered positive)
    ratings['rating'] = (ratings['rating'] >= 4).astype(int)

    # Sample a manageable subset: top users and books by interaction count
    top_users = ratings['user_id'].value_counts().nlargest(n_users).index
    top_books = ratings['book_id'].value_counts().nlargest(n_books).index

    # Filter ratings; copy so callers can modify the result without warnings
    keep = ratings['user_id'].isin(top_users) & ratings['book_id'].isin(top_books)
    ratings_subset = ratings[keep].copy()

    # Get corresponding books
    books_subset = books[books['book_id'].isin(top_books)].copy()

    return books_subset, ratings_subset

In [12]:
def load_amazon_reviews(data_path='../data/amazon', n_users=10000, n_books=5000):
    """Load and preprocess Amazon book reviews dataset
    Download from: https://s3.amazonaws.com/amazon-reviews-pds/tsv/amazon_reviews_us_Books_v1_02.tsv.gz

    Args:
        data_path: directory containing ``amazon_reviews_us_Books_v1_02.tsv``.
        n_users: keep only the ``n_users`` customers with the most reviews.
        n_books: keep only the ``n_books`` products with the most reviews.

    Returns:
        (books_subset, reviews_subset):
            books_subset: columns 'book_id' and 'title', one row per book id.
            reviews_subset: columns 'customer_id', 'product_id', 'rating'
                (1 when star_rating >= 4, else 0) and 'review_body'.
    """
    # Load reviews
    reviews = pd.read_csv(
        f'{data_path}/amazon_reviews_us_Books_v1_02.tsv',
        sep='\t',
        usecols=[
            'customer_id', 'product_id', 'product_title',
            'star_rating', 'review_body'
        ]
    )

    # Clean and preprocess
    reviews = reviews.dropna(subset=['review_body']).copy()

    # Convert ratings to binary interactions (rating >= 4 considered positive).
    # Non-numeric star values become NaN and therefore count as not positive.
    stars = pd.to_numeric(reviews['star_rating'], errors='coerce')
    reviews['rating'] = (stars >= 4).astype(int)

    # Create books dataframe: one row per product id, keeping the first title
    # seen, so a product listed under several titles is not duplicated.
    books = (
        reviews[['product_id', 'product_title']]
        .drop_duplicates(subset='product_id')
        .rename(columns={'product_id': 'book_id', 'product_title': 'title'})
    )

    # Sample a manageable subset: top users and books by review count
    top_users = reviews['customer_id'].value_counts().nlargest(n_users).index
    top_books = reviews['product_id'].value_counts().nlargest(n_books).index

    # Filter reviews
    keep = reviews['customer_id'].isin(top_users) & reviews['product_id'].isin(top_books)
    reviews_subset = reviews.loc[keep, ['customer_id', 'product_id', 'rating', 'review_body']].copy()

    # Get corresponding books
    books_subset = books[books['book_id'].isin(top_books)].copy()

    return books_subset, reviews_subset

In [13]:
def enrich_with_open_library(books_df, request_delay=1, timeout=10):
    """Enrich book metadata using Open Library API.

    One search request is made per row, pausing ``request_delay`` seconds
    between rows, so the runtime grows linearly with ``len(books_df)``.

    Args:
        books_df: DataFrame of books; 'title' and, when present, 'author'
            are used to build the search query.
        request_delay: seconds to sleep after each row (rate limiting).
        timeout: per-request timeout in seconds.

    Returns:
        A new DataFrame with the rows and index of ``books_df`` plus the
        columns 'ol_id', 'subjects', 'first_publish_year', 'language' and
        'ebook_available'. Rows without a match get empty placeholder values.
    """
    import requests
    import time
    from tqdm import tqdm

    enrich_columns = ['ol_id', 'subjects', 'first_publish_year', 'language', 'ebook_available']

    def is_blank(value):
        # True for None, pd.NA, float NaN and empty/whitespace-only text.
        if value is None or value is pd.NA:
            return True
        if isinstance(value, float) and value != value:
            return True
        return not str(value).strip()

    def placeholder():
        return {
            'ol_id': '',
            'subjects': [],
            'first_publish_year': None,
            'language': [],
            'ebook_available': False
        }

    def fetch_book_data(title, author=None):
        if is_blank(title):
            return None  # nothing to search for

        # Search Open Library
        query = f'title:{title}'
        if not is_blank(author):
            query += f' author:{author}'

        try:
            response = requests.get(
                'https://openlibrary.org/search.json',
                params={'q': query},
                timeout=timeout  # never hang the whole run on one request
            )
        except requests.RequestException as exc:
            # One failed lookup must not abort the enrichment of every book.
            print(f"Open Library request failed for {title!r}: {exc}")
            return None

        if response.status_code != 200:
            return None
        try:
            docs = response.json().get('docs') or []
        except ValueError:
            return None  # body was not valid JSON
        if not docs:
            return None

        book = docs[0]
        return {
            'ol_id': book.get('key', ''),
            'subjects': book.get('subject', []),
            'first_publish_year': book.get('first_publish_year', None),
            'language': book.get('language', []),
            'ebook_available': book.get('ebook_access', '') != 'no_ebook'
        }

    # Enrich each book
    enriched_data = []
    for _, book in tqdm(books_df.iterrows(), total=len(books_df)):
        data = fetch_book_data(book.get('title'), book.get('author'))
        enriched_data.append(data if data else placeholder())
        time.sleep(request_delay)  # Rate limiting

    # Add enriched data to dataframe. The rows are joined by position and the
    # original index restored afterwards, because aligning on labels breaks
    # when books_df has a non-default or duplicated index.
    enriched_df = pd.DataFrame(enriched_data, columns=enrich_columns)
    combined = pd.concat([books_df.reset_index(drop=True), enriched_df], axis=1)
    combined.index = books_df.index
    return combined

In [14]:
def combine_data_sources(use_public_data=True, enrich=True):
    """Combine internal database with public datasets.

    Args:
        use_public_data: when False, only the internal data is returned.
        enrich: when True, the combined books are passed through
            ``enrich_with_open_library`` (one web request per book).

    Returns:
        (books, interactions). With public data, both frames are row-wise
        concatenations with a fresh 0..n-1 index. Goodreads and Amazon ids
        are prefixed with 'gr_' / 'amzn_' so they cannot collide with
        internal ids, and the Amazon interaction columns are renamed to
        'user_id' / 'book_id'. If the public datasets cannot be loaded, the
        internal data is returned unchanged.
    """
    # Load internal data
    internal_books, internal_interactions = load_data()

    if not use_public_data:
        return internal_books, internal_interactions

    def namespace_ids(frame, prefix, id_columns):
        # Prefix the id VALUES; column names stay shared so the frames stack
        # into the same columns instead of side-by-side blocks of NaN.
        out = frame.copy()
        for column in id_columns:
            if column in out.columns:
                out[column] = prefix + out[column].astype(str)
        return out

    def with_book_id(books):
        # Give internal books an explicit 'book_id' column like the public sets.
        if 'book_id' in books.columns:
            return books
        if '_id' in books.columns:
            ids = books['_id'].map(
                lambda v: str(v['$oid']) if isinstance(v, dict) and '$oid' in v else str(v)
            )
        else:
            ids = books.index.astype(str)
        return books.assign(book_id=list(ids))

    # Load public datasets
    try:
        goodreads_books, goodreads_ratings = load_goodreads_data()
        amazon_books, amazon_reviews = load_amazon_reviews()

        # Combine books
        all_books = pd.concat([
            with_book_id(internal_books),
            namespace_ids(goodreads_books, 'gr_', ['book_id']),
            namespace_ids(amazon_books, 'amzn_', ['book_id'])
        ], axis=0, ignore_index=True)

        # Combine interactions
        amazon_interactions = amazon_reviews.rename(
            columns={'customer_id': 'user_id', 'product_id': 'book_id'}
        )
        all_interactions = pd.concat([
            internal_interactions,
            namespace_ids(goodreads_ratings, 'gr_', ['user_id', 'book_id']),
            namespace_ids(amazon_interactions, 'amzn_', ['user_id', 'book_id'])
        ], axis=0, ignore_index=True)

    except Exception as e:
        print(f"Warning: Could not load public datasets: {str(e)}")
        print("Falling back to internal data only.")
        return internal_books, internal_interactions

    if enrich:
        # Enrich with Open Library data
        print("Enriching with Open Library data...")
        try:
            all_books = enrich_with_open_library(all_books)
        except Exception as e:
            # Keep the combined data that was already loaded; only the extra
            # Open Library columns are missing.
            print(f"Warning: Open Library enrichment failed: {str(e)}")

    return all_books, all_interactions

## Main Execution

Put everything together and train both recommendation systems.

In [None]:
%pip install --upgrade mongoengine



In [48]:
# Build the combined catalogue (internal + public datasets) and report its size.
books_df, interactions_df = combine_data_sources(use_public_data=True)
print(f"Total books: {len(books_df)}")
print(f"Total interactions: {len(interactions_df)}")

# Show the first rows of the core metadata next to the Open Library fields.
preview_columns = ['title', 'author', 'genre', 'ol_id', 'subjects', 'first_publish_year']
print("\nSample of enriched book data:")
enriched_preview = books_df[preview_columns].head()
print(enriched_preview)

Found 'models' directory at: /content/models
Successfully imported models and config.
Error connecting to MongoDB: A different connection with alias `default` was already registered. Use disconnect() first


KeyboardInterrupt: 

In [None]:
if __name__ == '__main__':
    import os

    import joblib
    from torch.utils.data import TensorDataset, DataLoader

    # Load and preprocess data
    print("Loading data...")
    books_df, interactions_df = load_data()
    data = preprocess_data(books_df, interactions_df)

    # Initialize and train content-based system
    print("\nTraining content-based system...")
    content_recommender = ContentBasedRecommender(books_df)
    content_recommender.prepare_embeddings()

    # Initialize and train NCF model
    print("\nTraining neural collaborative filtering model...")
    EMBEDDING_DIM = 64
    CONTENT_DIM = 384  # dimension of Sentence-BERT embeddings
    MODEL_DIR = '../models'

    # Fix the seed so weight initialisation and batch order are reproducible.
    torch.manual_seed(42)

    model = NCF(
        num_users=data['n_users'],
        num_items=data['n_items'],
        embedding_dim=EMBEDDING_DIM,
        content_dim=CONTENT_DIM
    )

    def create_loader(df, batch_size=32, shuffle=True):
        """Wrap an interactions frame into a DataLoader of (user, item, content, label)."""
        # Casting to int64 fails loudly if any index is missing (NaN).
        user_indices = df['user_idx'].to_numpy(dtype='int64')
        book_indices = df['book_idx'].to_numpy(dtype='int64')

        users = torch.LongTensor(user_indices)
        items = torch.LongTensor(book_indices)
        # Look up every book's content embedding in one indexing operation.
        contents = torch.FloatTensor(np.asarray(content_recommender.embeddings)[book_indices])
        labels = torch.FloatTensor(df['rating'].values)
        return DataLoader(
            TensorDataset(users, items, contents, labels),
            batch_size=batch_size,
            shuffle=shuffle
        )

    train_loader = create_loader(data['train_data'])
    # Validation order does not affect the mean loss, so it is not shuffled.
    val_loader = create_loader(data['val_data'], shuffle=False)

    # Train the model
    train_ncf(model, train_loader, val_loader, num_epochs=10)

    print("\nTraining complete! Models are ready for making recommendations.")

    # Save the trained models; create the target directory if it is missing.
    os.makedirs(MODEL_DIR, exist_ok=True)

    # Save content-based recommender.
    # NOTE(review): this pickles the whole object, including the summarizer's
    # loaded models - consider saving only books_df and the embeddings.
    joblib.dump(content_recommender, f'{MODEL_DIR}/content_recommender.pkl')

    # Save NCF model together with the id mappings needed at inference time
    torch.save({
        'model_state_dict': model.state_dict(),
        'user_map': data['user_map'],
        'book_map': data['book_map']
    }, f'{MODEL_DIR}/ncf_model.pth')

    print("\nModels saved successfully!")

In [29]:
import os

# Directories (relative to the working directory) that must exist so the
# project's 'models' and 'config' packages can be placed in them.
models_dir = 'models'
config_dir = 'config'

# One loop instead of a copy-pasted stanza per directory; the messages are
# the same as before.
for directory in (models_dir, config_dir):
    if os.path.exists(directory):
        print(f"Directory '{directory}' already exists.")
    else:
        os.makedirs(directory)
        print(f"Directory '{directory}' created.")

Directory 'models' created.
Directory 'config' created.


config	models	sample_data
