In [1]:
import numpy as np
import pandas as pd

import pickle
import joblib




# Content Based

In [2]:

# Load the preprocessed content-based table from a pickle file.
# It is later passed as `metadata_df` to generate_recommendations_augmented,
# which reads its 'book_id', 'title' and 'authors' columns.
# NOTE(review): unpickling executes arbitrary code embedded in the file; only
# load 'processed_df.pkl' if it was produced by a trusted pipeline.
content_based_data = pd.read_pickle('processed_df.pkl')


# Collaborative Filtering

In [3]:

# Read the combined ratings + book-metadata table from disk.
colaborative_data = pd.read_csv('colaborative.csv')

# One row per rating event: who rated which book, and the score given.
ratings_df = colaborative_data.loc[:, ['user_id', 'book_id', 'rating']]

# One row per book (first occurrence wins), indexed by book_id for direct lookup.
book_metadata = (
    colaborative_data
    .loc[:, ['book_id', 'title', 'authors', 'categories']]
    .drop_duplicates(subset='book_id')
    .set_index('book_id')
)



In [5]:
import numpy as np

class BayesianSVD:
    """Biased matrix-factorization rating predictor trained with per-rating SGD.

    A rating is modelled as
    ``global_mean + user_bias + item_bias + dot(user_factors, item_factors)``.
    After every epoch the latent factors are shrunk toward ``prior_mean`` and,
    when content features were supplied to ``fit``, item factors are also
    interpolated toward their content vector.

    Parameters
    ----------
    n_factors : int
        Number of latent dimensions per user / item.
    n_epochs : int
        Number of passes over the ratings.
    lr : float
        Initial learning rate; epoch ``e`` uses ``lr * lr_decay ** e``.
    reg : float
        L2 regularization coefficient used in the SGD updates.
    prior_mean, prior_std : float
        Mean / standard deviation of the normal distribution used to initialize
        the latent factors. ``prior_mean`` is also the target of the per-epoch
        shrinkage.
    content_weight : float
        Blend weight of the content vector when initializing an item's factors.
    content_rate : float
        Per-epoch interpolation rate of item factors toward their content vector.
    lr_decay : float
        Multiplicative learning-rate decay applied once per epoch.
    clip_value : float
        Maximum L2 norm allowed for a single-sample factor gradient.
    random_state : int or None
        Seed for a private ``numpy.random.RandomState`` used by ``fit``.
        ``None`` (default) keeps drawing from NumPy's global random state.
    """

    # At prediction time the learned item bias is damped by this factor, whereas
    # training uses the full bias. NOTE(review): looks like a deliberate way to
    # reduce item-popularity dominance in the ranking -- confirm with the author.
    ITEM_BIAS_PREDICT_WEIGHT = 0.2

    # Predictions for known (user, item) pairs are clamped to this rating scale.
    RATING_MIN = 1
    RATING_MAX = 5

    def __init__(self, n_factors=20, n_epochs=20, lr=0.008, reg=0.002,
                 prior_mean=0, prior_std=0.1, content_weight=0.5, content_rate=0.1,
                 lr_decay=0.99, clip_value=5.0, random_state=None):
        self.n_factors = n_factors
        self.n_epochs = n_epochs
        self.lr = lr                # initial learning rate
        self.reg = reg
        self.prior_mean = prior_mean
        self.prior_std = prior_std
        self.content_weight = content_weight
        self.content_rate = content_rate
        # Hyperparameters for adaptive learning rate and gradient clipping.
        self.lr_decay = lr_decay    # exponential decay factor applied each epoch
        self.clip_value = clip_value
        self.random_state = random_state

        # Learned state; populated by fit().
        self.user_factors = None
        self.item_factors = None
        self.global_mean = None
        self.user_biases = None
        self.item_biases = None
        self.user_mapping = None
        self.item_mapping = None
        self.item_features = None  # optional content features for items

    def _get_rng(self):
        """Return the random source used by fit().

        ``getattr`` is used so that instances unpickled from files written
        before ``random_state`` existed still work.
        """
        seed = getattr(self, 'random_state', None)
        return np.random if seed is None else np.random.RandomState(seed)

    def fit(self, ratings_df, item_features=None):
        """Train the model on a ratings DataFrame.

        Parameters
        ----------
        ratings_df : DataFrame
            Must contain the columns 'user_id', 'book_id' and 'rating'.
        item_features : dict or None
            Optional mapping ``book_id -> content feature vector``. Each vector
            is blended with, and assigned into, a factor row of length
            ``n_factors``, so it must be broadcast-compatible with that shape.

        Raises
        ------
        ValueError
            If ``ratings_df`` contains no rows.
        """
        if len(ratings_df) == 0:
            raise ValueError("ratings_df is empty; at least one rating is required to fit the model.")

        rng = self._get_rng()

        # Always reset the content features: a refit without features must not
        # keep pulling item factors toward vectors from a previous fit.
        self.item_features = item_features

        # Create mappings from user and item IDs to contiguous indices.
        unique_users = ratings_df['user_id'].unique()
        unique_items = ratings_df['book_id'].unique()
        self.user_mapping = {user: i for i, user in enumerate(unique_users)}
        self.item_mapping = {item: i for i, item in enumerate(unique_items)}

        # Convert ratings to internal indices.
        user_indices = ratings_df['user_id'].map(self.user_mapping).values
        item_indices = ratings_df['book_id'].map(self.item_mapping).values
        ratings = ratings_df['rating'].values

        # Global mean.
        self.global_mean = np.mean(ratings)

        n_users = len(unique_users)
        n_items = len(unique_items)

        # Initialize user factors and biases.
        self.user_factors = rng.normal(self.prior_mean, self.prior_std, (n_users, self.n_factors))
        self.user_biases = np.zeros(n_users)

        # Initialize item factors: a blend of content vector and random noise
        # where content is available, pure random noise otherwise.
        if item_features is not None:
            self.item_factors = np.zeros((n_items, self.n_factors))
            for item in unique_items:
                idx = self.item_mapping[item]
                noise = rng.normal(self.prior_mean, self.prior_std, self.n_factors)
                if item in item_features:
                    self.item_factors[idx] = (self.content_weight * np.array(item_features[item]) +
                                              (1 - self.content_weight) * noise)
                else:
                    self.item_factors[idx] = noise
        else:
            self.item_factors = rng.normal(self.prior_mean, self.prior_std, (n_items, self.n_factors))
        self.item_biases = np.zeros(n_items)

        # Training loop.
        for epoch in range(self.n_epochs):
            # Adaptive learning rate: decaying exponentially.
            current_lr = self.lr * (self.lr_decay ** epoch)
            indices = np.arange(len(ratings))
            rng.shuffle(indices)

            for idx in indices:
                u = user_indices[idx]
                i = item_indices[idx]
                r = ratings[idx]

                # Prediction (training uses the full, undamped item bias).
                pred = self.global_mean + self.user_biases[u] + self.item_biases[i] + \
                       np.dot(self.user_factors[u], self.item_factors[i])
                err = r - pred

                # Update biases (no clipping for biases, since they are scalars).
                self.user_biases[u] += current_lr * (err - self.reg * self.user_biases[u])
                self.item_biases[i] += current_lr * (err - self.reg * self.item_biases[i])

                # Both gradients are computed from the pre-update factors, so
                # the item update never sees the already-updated user factors.
                grad_user = err * self.item_factors[i] - self.reg * self.user_factors[u]
                grad_item = err * self.user_factors[u] - self.reg * self.item_factors[i]

                # Gradient clipping: rescale to at most clip_value in L2 norm.
                norm_grad_user = np.linalg.norm(grad_user)
                if norm_grad_user > self.clip_value:
                    grad_user = grad_user * (self.clip_value / norm_grad_user)
                norm_grad_item = np.linalg.norm(grad_item)
                if norm_grad_item > self.clip_value:
                    grad_item = grad_item * (self.clip_value / norm_grad_item)

                # Update latent factors with the current learning rate.
                self.user_factors[u] += current_lr * grad_user
                self.item_factors[i] += current_lr * grad_item

            # Adaptive regularization: pull factors toward the prior (and content).
            self._apply_adaptive_regularization(epoch)
            # Center item biases so they sum to zero after every epoch.
            self._center_item_biases()

            # Model inspection: print mean and std of latent factors.
            user_factors_mean = np.mean(self.user_factors)
            user_factors_std = np.std(self.user_factors)
            item_factors_mean = np.mean(self.item_factors)
            item_factors_std = np.std(self.item_factors)
            print(f"Epoch {epoch + 1}/{self.n_epochs} | LR: {current_lr:.5f} | "
                  f"User Factors: mean={user_factors_mean:.4f}, std={user_factors_std:.4f} | "
                  f"Item Factors: mean={item_factors_mean:.4f}, std={item_factors_std:.4f}")

    def _apply_adaptive_regularization(self, epoch):
        """Shrink factors toward ``prior_mean`` (and item factors toward content).

        The shrink weight starts at 0.1 for epoch 0 and decreases linearly by
        ``(0.1 - 0.01) / n_epochs`` per epoch. Because ``epoch`` is zero-based,
        the last epoch uses ``0.01 + (0.1 - 0.01) / n_epochs``, not exactly 0.01.
        """
        initial_prior_weight = 0.1
        final_prior_weight = 0.01
        decay_rate = (initial_prior_weight - final_prior_weight) / self.n_epochs
        prior_weight = initial_prior_weight - (epoch * decay_rate)

        self.user_factors = (1 - prior_weight) * self.user_factors + prior_weight * self.prior_mean
        self.item_factors = (1 - prior_weight) * self.item_factors + prior_weight * self.prior_mean

        # If content features are available, further pull item factors toward their content vector.
        if self.item_features is not None:
            for item, idx in self.item_mapping.items():
                if item in self.item_features:
                    content_vector = np.array(self.item_features[item])
                    self.item_factors[idx] = (1 - self.content_rate) * self.item_factors[idx] + \
                                             self.content_rate * content_vector

    def _center_item_biases(self):
        """Subtract the mean item bias from every item bias (in place)."""
        mean_item_bias = np.mean(self.item_biases)
        self.item_biases -= mean_item_bias

    def predict(self, user_id, book_id):
        """Predict the rating ``user_id`` would give ``book_id``.

        Returns the unclamped global mean when either the user or the book was
        not seen during ``fit``. Otherwise returns the model score, with the
        item bias damped by ``ITEM_BIAS_PREDICT_WEIGHT``, clamped to
        ``[RATING_MIN, RATING_MAX]``.
        """
        if user_id not in self.user_mapping or book_id not in self.item_mapping:
            return self.global_mean

        u = self.user_mapping[user_id]
        i = self.item_mapping[book_id]
        pred = self.global_mean + self.user_biases[u] + self.ITEM_BIAS_PREDICT_WEIGHT * self.item_biases[i] + \
               np.dot(self.user_factors[u], self.item_factors[i])
        # Ensure the predicted rating stays within the rating scale.
        return max(self.RATING_MIN, min(self.RATING_MAX, pred))

    def predict_for_user(self, user_id, book_ids):
        """Return ``[(book_id, predicted_rating), ...]`` for each id in ``book_ids``, in input order."""
        return [(book_id, self.predict(user_id, book_id)) for book_id in book_ids]


In [6]:
import pickle

# Load the augmented hybrid model from disk.
# pickle stores class instances by reference to their defining module, so any
# class used by the pickled object must already be defined/importable here.
# NOTE(review): presumably the file holds a BayesianSVD instance -- confirm.
# SECURITY: pickle.load can execute arbitrary code; only load this file if it
# comes from a trusted source.
with open("augmented_hybrid_model.pkl", "rb") as f:
    augmented_model = pickle.load(f)




In [7]:
def generate_recommendations_augmented(model, user_id, top_n=10, rated_books=None, metadata_df=None):
    """
    Generate the top-N recommendations for a user from a trained model.

    Every item known to the model is scored with ``model.predict`` unless the
    user already rated it or it has no metadata. Only items with available
    metadata are returned, so the result is empty when ``metadata_df`` is None.

    Parameters:
      model       : Trained model exposing ``item_mapping`` (book_id -> index)
                    and ``predict(user_id, book_id)``.
      user_id     : The user ID for which to generate recommendations.
      top_n       : Number of recommendations to return.
      rated_books : Optional iterable (set, list, ...) of book IDs the user has
                    already rated; these are excluded.
      metadata_df : DataFrame with 'title' and 'authors' columns and the book
                    ID either in a 'book_id' column or in an index named
                    'book_id'. If a book ID occurs more than once, the first
                    row is used.

    Returns:
      A list of at most ``top_n`` dictionaries sorted by predicted rating in
      descending order (ties keep the model's item-index order), each with:
         - 'book_id'
         - 'title'
         - 'authors'
         - 'predicted_rating'
    """
    # Without metadata no candidate can be shown, so there is nothing to rank.
    if metadata_df is None:
        return []

    # Accept metadata keyed by a 'book_id' index as well as a 'book_id' column.
    if 'book_id' not in metadata_df.columns and metadata_df.index.name == 'book_id':
        metadata_df = metadata_df.reset_index()

    # Build the book_id -> (title, authors) lookup once instead of scanning the
    # whole DataFrame for every candidate item.
    first_rows = metadata_df.drop_duplicates(subset='book_id', keep='first')
    metadata_lookup = dict(zip(first_rows['book_id'],
                               zip(first_rows['title'], first_rows['authors'])))

    # Normalize to a set so membership checks are O(1) even if a list was passed.
    already_rated = set(rated_books) if rated_books is not None else set()

    recommendations = []
    # Visit items in model-index order so ties in the final sort are deterministic.
    for book_id in sorted(model.item_mapping, key=model.item_mapping.get):
        if book_id in already_rated:
            continue

        metadata = metadata_lookup.get(book_id)
        if metadata is None:
            # No metadata for this book: it cannot be displayed, so skip it.
            continue
        title, authors = metadata

        recommendations.append({
            'book_id': book_id,
            'title': title,
            'authors': authors,
            'predicted_rating': model.predict(user_id, book_id),
        })

    # Sort recommendations by predicted rating in descending order.
    recommendations.sort(key=lambda rec: rec['predicted_rating'], reverse=True)
    return recommendations[:top_n]


# --- Example usage ---
# Relies on objects created in earlier cells:
# - `augmented_model`: the trained augmented BayesianSVD model.
# - `ratings_df`: ratings DataFrame with at least 'user_id', 'book_id', 'rating'.
# - `content_based_data`: metadata DataFrame, assumed to provide the
#   'book_id', 'title' and 'authors' columns.

user_id_example = 'A2F6N60Z96CAJI'

# Books this user has already rated; they are excluded from the candidates.
rated_books = set(
    ratings_df.loc[ratings_df['user_id'] == user_id_example, 'book_id'].tolist()
)

# Rank the remaining books; only those with metadata (known titles) are returned.
top_recommendations = generate_recommendations_augmented(
    augmented_model,
    user_id_example,
    top_n=15,
    rated_books=rated_books,
    metadata_df=content_based_data,
)

# Print one line per recommendation.
for rec in top_recommendations:
    print(
        f"Book ID: {rec['book_id']}, Title: {rec['title']}, Authors: {rec['authors']}, "
        f"Predicted Rating: {rec['predicted_rating']:.3f}"
    )


Book ID: B000MZW2AO, Title: Charlotte's Web, Authors: ['E. B. White'], Predicted Rating: 4.536
Book ID: 0736694242, Title: Under the Banner of Heaven, Authors: ['Jon Krakauer'], Predicted Rating: 4.455
Book ID: B000NPRINY, Title: All Creatures Great and Small, Authors: ['James Herriot'], Predicted Rating: 4.453
Book ID: 0446781819, Title: Gone with the Wind, Authors: ['Margaret Mitchell'], Predicted Rating: 4.436
Book ID: B000KHZ3QE, Title: The Great Divorce, Authors: ['C. S. Lewis'], Predicted Rating: 4.431
Book ID: B000L346OM, Title: Man's Search for Meaning, Authors: ['Viktor E Frankl'], Predicted Rating: 4.418
Book ID: 0460872702, Title: Great Gatsby (Everyman), Authors: ['F. Scott Fitzgerald'], Predicted Rating: 4.413
Book ID: B0000632ZJ, Title: Lonesome Dove, Authors: ['Larry McMurtry'], Predicted Rating: 4.410
Book ID: B000J1OR0Y, Title: The Lord of the Rings (3 Volume Set), Authors: ['J. R. R. Tolkien'], Predicted Rating: 4.409
Book ID: B00017JIQE, Title: Under the Banner of He