# Amazon Sales Dataset Recommendation System using Collaborative Filtering and Autoencoders

**Group members:**
- Tasneem Shaheen, 107279
- Mostafa Khalid 106699
- Medhansh Ahuja 105982

The aim for this project is to Build a recommender system to predict user ratings and recommend top products using two approaches: Collaborative Filtering (CF) and Autoencoders.

In [2]:
# Amazon Sales Dataset Recommendation System
# Group members: Tasneem Shaheen (107279), Mostafa Khalid (106699), Medhansh Ahuja (105982)
# This script builds a recommender system using SVD (baseline), kNN, and Autoencoder.
# Evaluates models with 5-fold cross-validation on RMSE, Precision@10, and Recall@10.

import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix
from sklearn.preprocessing import LabelEncoder
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import matplotlib.pyplot as plt
import seaborn as sns
from surprise import SVD, KNNBasic, Dataset, Reader
from surprise.model_selection import cross_validate, train_test_split
from surprise.accuracy import rmse
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Input
from sklearn.model_selection import KFold
import json
import os
import surprise
import tensorflow as tf

# Suppress CUDA warnings by forcing CPU usage
os.environ['CUDA_VISIBLE_DEVICES'] = ''  # Disable GPU

# Print library version for debugging
print(f"Surprise library version: {surprise.__version__}")

# 1. Data Preprocessing
def preprocess_data(input_file='amazon.csv', output_file='amazon_preprocessed.csv'):
    print("Starting data preprocessing...")
    
    # Load dataset
    try:
        df = pd.read_csv(input_file)
    except FileNotFoundError:
        print(f"Error: {input_file} not found. Please ensure the dataset is in the correct directory.")
        return None, None, None, None, None
    
    # Handle mixed types
    df['rating'] = pd.to_numeric(df['rating'], errors='coerce')
    df['rating_count'] = df['rating_count'].str.replace(',', '').astype(float, errors='ignore')
    
    # Drop missing critical fields
    df = df.dropna(subset=['user_id', 'product_id', 'rating'])
    print(f"Dataset shape after dropping missing values: {df.shape}")
    
    # Split comma-separated user_id into individual rows
    df_expanded = df.assign(user_id=df['user_id'].str.split(',')).explode('user_id')
    df_expanded = df_expanded.dropna(subset=['user_id'])
    print(f"Dataset shape after splitting user_id: {df_expanded.shape}")
    
    # Sentiment analysis on review_content
    analyzer = SentimentIntensityAnalyzer()
    df_expanded['sentiment_score'] = df_expanded['review_content'].apply(
        lambda x: analyzer.polarity_scores(str(x))['compound'] if pd.notnull(x) else 0
    )
    
    # Encode user_id and product_id
    user_encoder = LabelEncoder()
    item_encoder = LabelEncoder()
    df_expanded['user_idx'] = user_encoder.fit_transform(df_expanded['user_id'])
    df_expanded['item_idx'] = item_encoder.fit_transform(df_expanded['product_id'])
    
    # Check for duplicate user_idx, item_idx pairs
    duplicates = df_expanded.duplicated(subset=['user_idx', 'item_idx'], keep=False)
    if duplicates.any():
        print(f"Found {duplicates.sum()} duplicate user-item interactions.")
        # Aggregate duplicates by taking the mean rating
        df_expanded = df_expanded.groupby(['user_idx', 'item_idx']).agg({
            'rating': 'mean',
            'user_id': 'first',
            'product_id': 'first',
            'sentiment_score': 'mean'
        }).reset_index()
        print(f"Dataset shape after aggregating duplicates: {df_expanded.shape}")
    
    # Create user-item matrix
    try:
        pivot_table = df_expanded.pivot(index='user_idx', columns='item_idx', values='rating').fillna(0)
    except ValueError as e:
        print(f"Error during pivot: {e}")
        return None, None, None, None, None
    sparse_matrix = csr_matrix(pivot_table.values)
    
    # Save preprocessed dataset
    df_expanded.to_csv(output_file, index=False)
    
    # Display basic info
    print(f"Dataset shape: {df_expanded.shape}")
    print(f"Number of unique products: {df_expanded['product_id'].nunique()}")
    print(f"Number of unique users: {df_expanded['user_id'].nunique()}")
    print(f"Sparse matrix shape: {sparse_matrix.shape}, non-zero entries: {sparse_matrix.nnz}")
    
    return df_expanded, sparse_matrix, pivot_table, user_encoder, item_encoder

# 2. Visualizations
def plot_visualizations(df_expanded, pivot_table):
    if df_expanded is None or pivot_table is None:
        print("Skipping visualizations due to preprocessing error.")
        return
    
    # Create results directory if it doesn't exist
    os.makedirs('results', exist_ok=True)
    
    # Rating distribution
    plt.figure(figsize=(10, 6))
    sns.histplot(df_expanded['rating'], bins=10, kde=True)
    plt.title('Distribution of Ratings')
    plt.xlabel('Rating')
    plt.ylabel('Count')
    plt.grid(True)
    plt.savefig('results/rating_distribution.png')
    plt.close()
    
    # Sparsity visualization (subset for readability)
    plt.figure(figsize=(10, 6))
    sns.heatmap(pivot_table.iloc[:100, :100], cmap='Blues', cbar_kws={'label': 'Rating'})
    plt.title('User-Item Matrix Sparsity (First 100 Users and Items)')
    plt.xlabel('Item Index')
    plt.ylabel('User Index')
    plt.savefig('results/sparsity_heatmap.png')
    plt.close()

# 3. SVD Model (Baseline)
def train_svd(df_expanded):
    if df_expanded is None:
        print("Skipping SVD training due to preprocessing error.")
        return None, None
    
    print("Training SVD model...")
    reader = Reader(rating_scale=(1, 5))
    data = Dataset.load_from_df(df_expanded[['user_id', 'product_id', 'rating']], reader)
    
    # 5-fold Cross-Validation
    param_grid = {
        'n_factors': [50, 100, 200],
        'lr_all': [0.005, 0.01],
        'reg_all': [0.02, 0.1]
    }
    gs_svd = GridSearchCV(SVD, param_grid, measures=['rmse', 'mae'], cv=5)
    gs_svd.fit(data)
    print(f"Best SVD RMSE: {gs_svd.best_score['rmse']:.4f}")
    print(f"Best SVD Parameters: {gs_svd.best_params['rmse']}")
    
    # Train final model
    svd = SVD(**gs_svd.best_params['rmse'])
    trainset = data.build_full_trainset()
    svd.fit(trainset)
    return svd, data

# 4. kNN Model (User-Based)
def train_knn(df_expanded):
    if df_expanded is None:
        print("Skipping kNN training due to preprocessing error.")
        return None
    
    print("Training kNN model...")
    reader = Reader(rating_scale=(1, 5))
    data = Dataset.load_from_df(df_expanded[['user_id', 'product_id', 'rating']], reader)
    
    # Manual cross-validation to avoid GridSearchCV issues
    k_values = [10, 20, 40]
    sim_options_list = [
        {'name': 'cosine', 'user_based': True},
        {'name': 'pearson', 'user_based': True}
    ]
    best_rmse = float('inf')
    best_params = None
    best_knn = None
    
    for k in k_values:
        for sim_options in sim_options_list:
            print(f"Evaluating kNN with k={k}, sim_options={sim_options}")
            knn = KNNBasic(k=k, sim_options=sim_options)
            cv_results = cross_validate(knn, data, measures=['rmse', 'mae'], cv=5, verbose=False)
            mean_rmse = np.mean(cv_results['test_rmse'])
            print(f"Mean RMSE: {mean_rmse:.4f}")
            if mean_rmse < best_rmse:
                best_rmse = mean_rmse
                best_params = {'k': k, 'sim_options': sim_options}
                best_knn = knn
    
    print(f"Best kNN RMSE: {best_rmse:.4f}")
    print(f"Best kNN Parameters: {best_params}")
    
    # Train final model
    trainset = data.build_full_trainset()
    best_knn.fit(trainset)
    return best_knn

# 5. Autoencoder Model
def train_autoencoder(sparse_matrix):
    if sparse_matrix is None:
        print("Skipping Autoencoder training due to preprocessing error.")
        return None
    
    print("Training Autoencoder model...")
    X = sparse_matrix.toarray()
    
    def build_autoencoder(hidden_units=100, dropout_rate=0.2):
        model = Sequential([
            Input(shape=(X.shape[1],)),
            Dense(hidden_units, activation='relu'),
            Dropout(dropout_rate),
            Dense(hidden_units // 2, activation='relu'),
            Dense(X.shape[1], activation='linear')
        ])
        model.compile(optimizer='adam', loss='mse')
        return model
    
    # 5-Fold Cross-Validation
    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    rmse_scores = []
    for train_idx, val_idx in kf.split(X):
        X_train, X_val = X[train_idx], X[val_idx]
        model = build_autoencoder()
        model.fit(X_train, X_train, epochs=10, batch_size=32, validation_data=(X_val, X_val), verbose=0)
        val_pred = model.predict(X_val, verbose=0)
        rmse = np.sqrt(np.mean((X_val - val_pred) ** 2))
        rmse_scores.append(rmse)
    print(f"Autoencoder Mean RMSE (5-fold CV): {np.mean(rmse_scores):.4f}")
    
    # Train final model
    autoencoder = build_autoencoder()
    autoencoder.fit(X, X, epochs=10, batch_size=32, verbose=0)
    return autoencoder

# 6. Evaluation
def evaluate_models(svd, knn, autoencoder, data, pivot_table, user_encoder, item_encoder):
    if data is None or pivot_table is None:
        print("Skipping evaluation due to preprocessing error.")
        return [None] * 9
    if svd is None or knn is None or autoencoder is None:
        print("Warning: One or more models failed to train. Evaluating available models.")
    
    print("Evaluating models...")
    trainset, testset = train_test_split(data, test_size=0.2, random_state=42)
    
    # SVD Evaluation
    svd_rmse, svd_prec, svd_recall = None, None, None
    if svd is not None:
        svd.fit(trainset)
        svd_predictions = svd.test(testset)
        svd_rmse = rmse(svd_predictions, verbose=False)
        svd_prec, svd_recall = precision_recall_at_k(svd_predictions)
    
    # kNN Evaluation
    knn_rmse, knn_prec, knn_recall = None, None, None
    if knn is not None:
        knn.fit(trainset)
        knn_predictions = knn.test(testset)
        knn_rmse = rmse(knn_predictions, verbose=False)
        knn_prec, knn_recall = precision_recall_at_k(knn_predictions)
    
    # Autoencoder Evaluation
    auto_rmse, auto_prec, auto_recall = None, None, None
    if autoencoder is not None:
        # Map user_id to user_idx
        test_indices = []
        skipped = 0
        for x in testset:
            try:
                user_idx = user_encoder.transform([x[0]])[0]
                test_indices.append(user_idx)
            except ValueError:
                skipped += 1
                continue
        if skipped > 0:
            print(f"Warning: Skipped {skipped} testset entries due to unknown user_id.")
        if test_indices:
            test_users = pivot_table.iloc[test_indices].values
            auto_pred = autoencoder.predict(test_users, verbose=0)
            auto_rmse = np.sqrt(np.mean((test_users - auto_pred) ** 2))
            auto_prec, auto_recall = autoencoder_precision_recall(autoencoder, pivot_table, testset, user_encoder, item_encoder)
        else:
            print("Warning: No valid test indices for Autoencoder evaluation.")
    
    # Print metrics
    print(f"SVD Test RMSE: {svd_rmse if svd_rmse is not None else 'N/A'}")
    print(f"SVD Precision@10: {svd_prec if svd_prec is not None else 'N/A'}")
    print(f"SVD Recall@10: {svd_recall if svd_recall is not None else 'N/A'}")
    print(f"kNN Test RMSE: {knn_rmse if knn_rmse is not None else 'N/A'}")
    print(f"kNN Precision@10: {knn_prec if knn_prec is not None else 'N/A'}")
    print(f"kNN Recall@10: {knn_recall if knn_recall is not None else 'N/A'}")
    print(f"Autoencoder Test RMSE: {auto_rmse if auto_rmse is not None else 'N/A'}")
    print(f"Autoencoder Precision@10: {auto_prec if auto_prec is not None else 'N/A'}")
    print(f"Autoencoder Recall@10: {auto_recall if auto_recall is not None else 'N/A'}")
    
    return svd_rmse, svd_prec, svd_recall, knn_rmse, knn_prec, knn_recall, auto_rmse, auto_prec, auto_recall

# Precision@10 and Recall@10 for Autoencoder
def autoencoder_precision_recall(autoencoder, pivot_table, testset, user_encoder, item_encoder, k=10, threshold=4.0):
    test_indices = []
    user_item_pairs = []
    skipped = 0
    for x in testset:
        try:
            user_idx = user_encoder.transform([x[0]])[0]
            test_indices.append(user_idx)
            user_item_pairs.append((x[0], x[1], x[2]))  # user_id, item_id, true_rating
        except ValueError:
            skipped += 1
            continue
    if skipped > 0:
        print(f"Warning: Skipped {skipped} testset entries due to unknown user_id in Autoencoder precision/recall.")
    if not test_indices:
        print("Warning: No valid test indices for Autoencoder precision/recall.")
        return 0, 0
    
    test_users = pivot_table.iloc[test_indices].values
    auto_pred = autoencoder.predict(test_users, verbose=0)
    user_pred = {}
    skipped_items = 0
    for idx, (uid, iid, true_r) in enumerate(user_item_pairs):
        if uid not in user_pred:
            user_pred[uid] = []
        try:
            item_idx = item_encoder.transform([iid])[0]
            est = auto_pred[idx, item_idx]
            user_pred[uid].append((est, true_r))
        except ValueError:
            skipped_items += 1
            continue
    if skipped_items > 0:
        print(f"Warning: Skipped {skipped_items} testset entries due to unknown item_id.")
    
    precision, recall = [], []
    for uid, preds in user_pred.items():
        preds.sort(reverse=True)
        top_k = preds[:k]
        n_rel = sum(1 for _, r in top_k if r >= threshold)
        n_rec = len(top_k)
        n_rel_total = sum(1 for _, r in preds if r >= threshold)
        precision.append(n_rel / n_rec if n_rec > 0 else 0)
        recall.append(n_rel / n_rel_total if n_rel_total > 0 else 0)
    return np.mean(precision) if precision else 0, np.mean(recall) if recall else 0

# Precision@10 and Recall@10 for SVD and kNN
def precision_recall_at_k(predictions, k=10, threshold=4.0):
    user_pred = {}
    for uid, _, true_r, est, _ in predictions:
        if uid not in user_pred:
            user_pred[uid] = []
        user_pred[uid].append((est, true_r))
    precision, recall = [], []
    for uid, preds in user_pred.items():
        preds.sort(reverse=True)
        top_k = preds[:k]
        n_rel = sum(1 for _, r in top_k if r >= threshold)
        n_rec = len(top_k)
        n_rel_total = sum(1 for _, r in preds if r >= threshold)
        precision.append(n_rel / n_rec if n_rec > 0 else 0)
        recall.append(n_rel / n_rel_total if n_rel_total > 0 else 0)
    return np.mean(precision), np.mean(recall)

# 7. Model Comparison
def plot_model_comparison(svd_rmse, svd_prec, svd_recall, knn_rmse, knn_prec, knn_recall, auto_rmse, auto_prec, auto_recall):
    if all(x is None for x in [svd_rmse, knn_rmse, auto_rmse]):
        print("Skipping model comparison due to evaluation error.")
        return
    
    # Chart.js configuration
    chart_config = {
        "type": "bar",
        "data": {
            "labels": ["SVD", "kNN", "Autoencoder"],
            "datasets": [
                {
                    "label": "Test RMSE",
                    "data": [
                        svd_rmse if svd_rmse is not None else None,
                        knn_rmse if knn_rmse is not None else None,
                        auto_rmse if auto_rmse is not None else None
                    ],
                    "backgroundColor": "#FF6B6B"
                },
                {
                    "label": "Precision@10",
                    "data": [
                        svd_prec if svd_prec is not None else None,
                        knn_prec if knn_prec is not None else None,
                        auto_prec if auto_prec is not None else None
                    ],
                    "backgroundColor": "#4ECDC4"
                },
                {
                    "label": "Recall@10",
                    "data": [
                        svd_recall if svd_recall is not None else None,
                        knn_recall if knn_recall is not None else None,
                        auto_recall if auto_recall is not None else None
                    ],
                    "backgroundColor": "#45B7D1"
                }
            ]
        },
        "options": {
            "scales": {
                "y": {"beginAtZero": True, "title": {"display": True, "text": "Scores"}},
                "x": {"title": {"display": True, "text": "Models"}}
            },
            "plugins": {
                "title": {"display": True, "text": "Model Comparison: RMSE, Precision@10, Recall@10"},
                "legend": {"display": True}
            }
        }
    }
    print("\nChart.js Configuration for Model Comparison:")
    print(json.dumps(chart_config, indent=2))
    
    # Detailed analysis
    print("\nDetailed Analysis:")
    if svd_rmse is not None and auto_rmse is not None:
        print(f"- RMSE Difference (SVD vs Autoencoder): {abs(svd_rmse - auto_rmse):.4f}")
    if svd_prec is not None and auto_prec is not None and svd_prec > 0 and auto_prec > 0:
        print(f"- Precision@10 Difference (SVD vs Autoencoder): {abs(svd_prec - auto_prec):.4f}")
        if svd_prec > auto_prec:
            print(f"- SVD has {((svd_prec - auto_prec) / auto_prec * 100):.1f}% higher Precision@10")
        else:
            print(f"- Autoencoder has {((auto_prec - svd_prec) / svd_prec * 100):.1f}% higher Precision@10")
    if svd_recall is not None and auto_recall is not None and svd_recall > 0 and auto_recall > 0:
        print(f"- Recall@10 Difference (SVD vs Autoencoder): {abs(svd_recall - auto_recall):.4f}")
        if svd_recall > auto_recall:
            print(f"- SVD has {((svd_recall - auto_recall) / auto_recall * 100):.1f}% higher Recall@10")
        else:
            print(f"- Autoencoder has {((auto_recall - svd_recall) / svd_recall * 100):.1f}% higher Recall@10")

# Main execution
if __name__ == "__main__":
    # Preprocess data
    df_expanded, sparse_matrix, pivot_table, user_encoder, item_encoder = preprocess_data()
    
    # Plot visualizations
    plot_visualizations(df_expanded, pivot_table)
    
    # Train models
    svd, data = train_svd(df_expanded)
    knn = train_knn(df_expanded)
    autoencoder = train_autoencoder(sparse_matrix)
    
    # Evaluate models
    metrics = evaluate_models(svd, knn, autoencoder, data, pivot_table, user_encoder, item_encoder)
    svd_rmse, svd_prec, svd_recall, knn_rmse, knn_prec, knn_recall, auto_rmse, auto_prec, auto_recall = metrics
    
    # Plot model comparison
    plot_model_comparison(svd_rmse, svd_prec, svd_recall, knn_rmse, knn_prec, knn_recall, auto_rmse, auto_prec, auto_recall)
    
    # Save metrics
    if any(x is not None for x in metrics):
        metrics_df = pd.DataFrame({
            'Model': ['SVD', 'kNN', 'Autoencoder'],
            'RMSE': [svd_rmse if svd_rmse is not None else 'N/A',
                     knn_rmse if knn_rmse is not None else 'N/A',
                     auto_rmse if auto_rmse is not None else 'N/A'],
            'Precision@10': [svd_prec if svd_prec is not None else 'N/A',
                             knn_prec if knn_prec is not None else 'N/A',
                             auto_prec if auto_prec is not None else 'N/A'],
            'Recall@10': [svd_recall if svd_recall is not None else 'N/A',
                          knn_recall if knn_recall is not None else 'N/A',
                          auto_recall if auto_recall is not None else 'N/A']
        })
        os.makedirs('results', exist_ok=True)
        metrics_df.to_csv('results/metrics.csv', index=False)
        print("\nMetrics saved to results/metrics.csv")

Surprise library version: 1.1.4
Starting data preprocessing...
Dataset shape after dropping missing values: (1464, 16)
Dataset shape after splitting user_id: (11495, 16)
Found 1622 duplicate user-item interactions.
Dataset shape after aggregating duplicates: (10596, 6)
Dataset shape: (10596, 6)
Number of unique products: 1350
Number of unique users: 9042
Sparse matrix shape: (9042, 1350), non-zero entries: 10596
Training SVD model...
Best SVD RMSE: 0.1346
Best SVD Parameters: {'n_factors': 50, 'lr_all': 0.01, 'reg_all': 0.02}
Training kNN model...
Evaluating kNN with k=10, sim_options={'name': 'cosine', 'user_based': True}
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similari

2025-07-16 07:22:06.940755: E external/local_xla/xla/stream_executor/cuda/cuda_platform.cc:51] failed call to cuInit: INTERNAL: CUDA error: Failed call to cuInit: UNKNOWN ERROR (303)


Autoencoder Mean RMSE (5-fold CV): 0.1155
Evaluating models...
Computing the cosine similarity matrix...
Done computing similarity matrix.
SVD Test RMSE: 0.13065526873700165
SVD Precision@10: 0.7345588235294118
SVD Recall@10: 0.7352941176470589
kNN Test RMSE: 0.27625971386341736
kNN Precision@10: 0.7345588235294118
kNN Recall@10: 0.7352941176470589
Autoencoder Test RMSE: 0.1135829801247042
Autoencoder Precision@10: 0.7345588235294118
Autoencoder Recall@10: 0.7352941176470589

Chart.js Configuration for Model Comparison:
{
  "type": "bar",
  "data": {
    "labels": [
      "SVD",
      "kNN",
      "Autoencoder"
    ],
    "datasets": [
      {
        "label": "Test RMSE",
        "data": [
          0.13065526873700165,
          0.27625971386341736,
          0.1135829801247042
        ],
        "backgroundColor": "#FF6B6B"
      },
      {
        "label": "Precision@10",
        "data": [
          0.7345588235294118,
          0.7345588235294118,
          0.7345588235294118
    