# Amazon Sales Dataset Recommendation System using Collaborative Filtering and Autoencoders

**Group members:**
- Tasneem Shaheen, 107279
- Mostafa Khalid 106699
- Medhansh Ahuja 105982

The aim of this project is to build a recommender system that predicts user ratings and recommends top products using two approaches: Collaborative Filtering (CF) and Autoencoders.

In [24]:
# Amazon Sales Dataset Recommendation System
# Group members: Tasneem Shaheen (107279), Mostafa Khalid (106699), Medhansh Ahuja (105982)
# This script builds a recommender system using SVD (baseline), kNN, and Autoencoder.
# Evaluates models with 5-fold cross-validation on RMSE, Precision@10, and Recall@10.

import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix
from sklearn.preprocessing import LabelEncoder
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import matplotlib.pyplot as plt
import seaborn as sns
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from sklearn.model_selection import KFold
import json
from surprise import SVD, KNNBasic, Dataset, Reader
from surprise.model_selection import cross_validate, GridSearchCV, train_test_split
from surprise.accuracy import rmse

# 1. Data Preprocessing
def preprocess_data(input_file='amazon.csv', output_file='amazon_preprocessed.csv'):
    """Clean the raw sales CSV and build the user-item rating matrix.

    The input file must provide the columns ``rating``, ``rating_count``,
    ``user_id``, ``product_id`` and ``review_content``. Rows whose
    ``user_id`` holds several comma-separated ids are expanded to one row
    per id.

    Args:
        input_file: Path of the raw CSV to read.
        output_file: Path the expanded, enriched frame is written to.

    Returns:
        A tuple ``(df_expanded, sparse_matrix, pivot_table)``:
        the expanded frame (with ``sentiment_score``, ``user_idx`` and
        ``item_idx`` columns added), the CSR form of the rating matrix, and
        the dense users x items frame with unrated cells filled with 0.
    """
    print("Starting data preprocessing...")

    # Load dataset
    df = pd.read_csv(input_file)

    # Handle mixed types. Unparseable values become NaN instead of silently
    # leaving the whole column as strings; astype(str) keeps this working
    # when pandas already inferred a numeric dtype for rating_count.
    df['rating'] = pd.to_numeric(df['rating'], errors='coerce')
    df['rating_count'] = pd.to_numeric(
        df['rating_count'].astype(str).str.replace(',', '', regex=False),
        errors='coerce',
    )

    # Drop missing critical fields
    df = df.dropna(subset=['user_id', 'product_id', 'rating'])

    # Split comma-separated user_id into individual rows
    df_expanded = df.assign(user_id=df['user_id'].str.split(',')).explode('user_id')
    # Strip padding so "A, B" and "A,B" yield the same ids, and discard the
    # empty tokens produced by stray commas (dropna alone does not catch '').
    df_expanded['user_id'] = df_expanded['user_id'].str.strip()
    df_expanded = df_expanded[
        df_expanded['user_id'].notna() & (df_expanded['user_id'] != '')
    ]

    # Sentiment analysis on review_content
    analyzer = SentimentIntensityAnalyzer()
    df_expanded['sentiment_score'] = df_expanded['review_content'].apply(
        lambda x: analyzer.polarity_scores(str(x))['compound'] if pd.notnull(x) else 0
    )

    # Create sparse user-item matrix
    user_encoder = LabelEncoder()
    item_encoder = LabelEncoder()
    df_expanded['user_idx'] = user_encoder.fit_transform(df_expanded['user_id'])
    df_expanded['item_idx'] = item_encoder.fit_transform(df_expanded['product_id'])
    # pivot() raises ValueError as soon as one (user, product) pair occurs
    # twice; pivot_table() averages repeated ratings for the pair instead.
    pivot_table = df_expanded.pivot_table(
        index='user_idx', columns='item_idx', values='rating', aggfunc='mean'
    ).fillna(0)
    sparse_matrix = csr_matrix(pivot_table.values)

    # Save preprocessed dataset
    df_expanded.to_csv(output_file, index=False)

    # Display basic info
    print(f"Dataset shape: {df_expanded.shape}")
    print(f"Number of unique products: {df_expanded['product_id'].nunique()}")
    print(f"Number of unique users: {df_expanded['user_id'].nunique()}")
    print(f"Sparse matrix shape: {sparse_matrix.shape}, non-zero entries: {sparse_matrix.nnz}")

    return df_expanded, sparse_matrix, pivot_table

# 2. Visualizations
def plot_visualizations(df_expanded, pivot_table):
    """Save the rating histogram and a sparsity heatmap under ``results/``.

    Args:
        df_expanded: Frame with a ``rating`` column to histogram.
        pivot_table: Users x items rating frame; only its first 100 rows
            and 100 columns are drawn.

    Writes ``results/rating_distribution.png`` and
    ``results/sparsity_heatmap.png``; returns nothing.
    """
    import os

    # savefig() does not create missing directories, so make sure the
    # output folder exists before the first figure is written.
    os.makedirs('results', exist_ok=True)

    # Rating distribution
    plt.figure(figsize=(10, 6))
    sns.histplot(df_expanded['rating'], bins=10, kde=True)
    plt.title('Distribution of Ratings')
    plt.xlabel('Rating')
    plt.ylabel('Count')
    plt.grid(True)
    plt.savefig('results/rating_distribution.png')
    plt.close()

    # Sparsity visualization (subset for readability)
    plt.figure(figsize=(10, 6))
    sns.heatmap(pivot_table.iloc[:100, :100], cmap='Blues', cbar_kws={'label': 'Rating'})
    plt.title('User-Item Matrix Sparsity (First 100 Users and Items)')
    plt.xlabel('Item Index')
    plt.ylabel('User Index')
    plt.savefig('results/sparsity_heatmap.png')
    plt.close()

# 3. SVD Model (Baseline)
def train_svd(df_expanded):
    """Tune an SVD recommender with a 5-fold grid search and fit the winner.

    Args:
        df_expanded: Frame providing ``user_id``, ``product_id`` and
            ``rating`` columns.

    Returns:
        ``(svd, data)``: the SVD model fitted on the full trainset with the
        lowest-RMSE parameters, and the Surprise dataset built from the frame.
    """
    print("Training SVD model...")
    ratings_frame = df_expanded[['user_id', 'product_id', 'rating']]
    data = Dataset.load_from_df(ratings_frame, Reader(rating_scale=(1, 5)))

    # Hyper-parameter candidates explored by the 5-fold search.
    search_space = dict(
        n_factors=[50, 100, 200],
        lr_all=[0.005, 0.01],
        reg_all=[0.02, 0.1],
    )
    grid_search = GridSearchCV(SVD, search_space, measures=['rmse', 'mae'], cv=5)
    grid_search.fit(data)

    best_rmse = grid_search.best_score['rmse']
    best_params = grid_search.best_params['rmse']
    print(f"Best SVD RMSE: {best_rmse:.4f}")
    print(f"Best SVD Parameters: {best_params}")

    # Refit on every available rating using the winning configuration.
    svd = SVD(**best_params)
    svd.fit(data.build_full_trainset())
    return svd, data

# 4. kNN Model (User-Based)
def train_knn(df_expanded):
    """Tune a user-based kNN recommender with a 5-fold grid search.

    Args:
        df_expanded: Frame providing ``user_id``, ``product_id`` and
            ``rating`` columns.

    Returns:
        The ``KNNBasic`` model fitted on the full trainset with the
        lowest-RMSE parameters found by the search.
    """
    print("Training kNN model...")
    reader = Reader(rating_scale=(1, 5))
    data = Dataset.load_from_df(df_expanded[['user_id', 'product_id', 'rating']], reader)

    # 5-fold Cross-Validation
    # Surprise expands ``sim_options`` itself and therefore expects a dict
    # mapping each option to a LIST of candidates. Passing a list of
    # ready-made dicts fails with AttributeError inside GridSearchCV.
    param_grid_knn = {
        'k': [10, 20, 40],
        'sim_options': {
            'name': ['cosine', 'pearson'],
            'user_based': [True],
        },
    }
    gs_knn = GridSearchCV(KNNBasic, param_grid_knn, measures=['rmse', 'mae'], cv=5)
    gs_knn.fit(data)
    print(f"Best kNN RMSE: {gs_knn.best_score['rmse']:.4f}")
    print(f"Best kNN Parameters: {gs_knn.best_params['rmse']}")

    # Train final model
    knn = KNNBasic(**gs_knn.best_params['rmse'])
    trainset = data.build_full_trainset()
    knn.fit(trainset)
    return knn

# 5. Autoencoder Model
def train_autoencoder(sparse_matrix):
    """Cross-validate and train a dense autoencoder on the rating matrix.

    Args:
        sparse_matrix: Users x items sparse rating matrix; a stored 0 /
            missing entry is treated as "not rated".

    Returns:
        The Keras model fitted on the full matrix for 10 epochs.

    The printed cross-validation RMSE is computed on observed (non-zero)
    ratings only. Averaging over every cell of the zero-filled matrix would
    mostly reward reproducing zeros and is not comparable with the RMSE
    reported for the SVD and kNN models.
    """
    print("Training Autoencoder model...")
    X = sparse_matrix.toarray()

    def build_autoencoder(hidden_units=100, dropout_rate=0.2):
        """Build the item-vector -> bottleneck -> item-vector network."""
        model = Sequential([
            Dense(hidden_units, activation='relu', input_shape=(X.shape[1],)),
            Dropout(dropout_rate),
            Dense(hidden_units // 2, activation='relu'),
            Dense(X.shape[1], activation='linear')
        ])
        model.compile(optimizer='adam', loss='mse')
        return model

    # 5-Fold Cross-Validation (folds are made of whole user rows)
    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    fold_rmse_scores = []
    for train_idx, val_idx in kf.split(X):
        X_train, X_val = X[train_idx], X[val_idx]
        model = build_autoencoder()
        model.fit(X_train, X_train, epochs=10, batch_size=32, validation_data=(X_val, X_val), verbose=0)
        val_pred = model.predict(X_val)

        # Score only the cells that hold a real rating.
        observed = X_val != 0
        if not observed.any():
            # A fold without any rating cannot be scored.
            continue
        # Named fold_rmse so the imported surprise ``rmse`` is not shadowed.
        fold_rmse = np.sqrt(np.mean((X_val[observed] - val_pred[observed]) ** 2))
        fold_rmse_scores.append(fold_rmse)

    mean_rmse = np.mean(fold_rmse_scores) if fold_rmse_scores else float('nan')
    print(f"Autoencoder Mean RMSE (5-fold CV): {mean_rmse:.4f}")

    # Train final model
    autoencoder = build_autoencoder()
    autoencoder.fit(X, X, epochs=10, batch_size=32, verbose=0)
    return autoencoder

# 6. Evaluation
def _precision_recall_at_k(predictions, k=10, threshold=4.0):
    """Return mean per-user (Precision@k, Recall@k) for 5-field prediction tuples.

    Each prediction is ``(uid, iid, true_rating, estimate, details)``. An
    item counts as relevant when its true rating is >= ``threshold``.
    Returns ``(0.0, 0.0)`` when ``predictions`` is empty.
    """
    per_user = {}
    for uid, _, true_r, est, _ in predictions:
        per_user.setdefault(uid, []).append((est, true_r))

    precisions, recalls = [], []
    for scored in per_user.values():
        # Rank by the estimate only: sorting the whole tuple would use the
        # true rating as a tie-breaker and leak the label into the ranking.
        scored.sort(key=lambda pair: pair[0], reverse=True)
        top_k = scored[:k]
        n_rel_in_top = sum(1 for _, r in top_k if r >= threshold)
        n_rel_total = sum(1 for _, r in scored if r >= threshold)
        precisions.append(n_rel_in_top / len(top_k) if top_k else 0)
        recalls.append(n_rel_in_top / n_rel_total if n_rel_total > 0 else 0)

    if not precisions:
        return 0.0, 0.0
    return float(np.mean(precisions)), float(np.mean(recalls))


def _autoencoder_holdout_predictions(autoencoder, data, pivot_table, testset, rating_scale):
    """Predict every held-out (user, item) rating with the autoencoder.

    The user's row is fed to the network with the held-out cells blanked to
    0, and the reconstruction at those cells is the estimate (clipped to
    ``rating_scale``). Returns 5-field tuples shaped like Surprise
    predictions.
    """
    # Rows/columns of pivot_table are label-encoded ids. This assumes the
    # encoding is the rank of the raw id among the sorted unique ids (what
    # LabelEncoder produces) and that ``data`` was built from the same frame
    # as ``pivot_table`` -- TODO confirm if the preprocessing changes.
    user_pos = {uid: i for i, uid in enumerate(sorted({r[0] for r in data.raw_ratings}))}
    item_pos = {iid: j for j, iid in enumerate(sorted({r[1] for r in data.raw_ratings}))}

    matrix = pivot_table.to_numpy(dtype=float)
    if matrix.shape != (len(user_pos), len(item_pos)):
        raise ValueError(
            "pivot_table shape does not match the users/items found in data"
        )

    held_out = {}
    for uid, iid, true_r in testset:
        held_out.setdefault(uid, []).append((iid, true_r))
    if not held_out:
        return []

    test_uids = list(held_out)
    inputs = matrix[[user_pos[uid] for uid in test_uids]].copy()
    for row, uid in enumerate(test_uids):
        # Hide the ratings being predicted from the network's input.
        inputs[row, [item_pos[iid] for iid, _ in held_out[uid]]] = 0.0

    reconstructed = autoencoder.predict(inputs)

    low, high = rating_scale
    predictions = []
    for row, uid in enumerate(test_uids):
        for iid, true_r in held_out[uid]:
            est = float(np.clip(reconstructed[row, item_pos[iid]], low, high))
            predictions.append((uid, iid, true_r, est, {}))
    return predictions


def evaluate_models(svd, knn, autoencoder, data, pivot_table):
    """Score SVD, kNN and the autoencoder on one shared 80/20 hold-out split.

    Args:
        svd: Surprise SVD model; it is refit in place on the 80% train split.
        knn: Surprise kNN model; it is refit in place on the 80% train split.
        autoencoder: Trained Keras model mapping a user row to a
            reconstructed user row.
        data: Surprise dataset the split is drawn from.
        pivot_table: Users x items rating frame with 0 for unrated cells.

    Returns:
        ``(svd_rmse, svd_prec, svd_recall, knn_rmse, knn_prec, knn_recall,
        auto_rmse, auto_prec, auto_recall)``.

    Raises:
        ValueError: If ``pivot_table`` does not have one row per user and
            one column per item found in ``data``.

    NOTE(review): the autoencoder is not retrained here, so its weights may
    already have seen the held-out ratings during training. Only its input
    is masked, so its scores are optimistic compared with SVD/kNN.
    """
    print("Evaluating models...")
    trainset, testset = train_test_split(data, test_size=0.2, random_state=42)

    # SVD Evaluation
    svd.fit(trainset)
    svd_predictions = svd.test(testset)
    svd_rmse = rmse(svd_predictions, verbose=False)

    # kNN Evaluation
    knn.fit(trainset)
    knn_predictions = knn.test(testset)
    knn_rmse = rmse(knn_predictions, verbose=False)

    # Autoencoder Evaluation: same held-out pairs as the other two models,
    # looked up by raw id (testset ids are raw strings, not row numbers).
    auto_predictions = _autoencoder_holdout_predictions(
        autoencoder, data, pivot_table, testset, trainset.rating_scale
    )
    squared_errors = [(true_r - est) ** 2 for _, _, true_r, est, _ in auto_predictions]
    auto_rmse = float(np.sqrt(np.mean(squared_errors))) if squared_errors else float('nan')

    # Precision@10 and Recall@10, computed the same way for all three models
    svd_prec, svd_recall = _precision_recall_at_k(svd_predictions)
    knn_prec, knn_recall = _precision_recall_at_k(knn_predictions)
    auto_prec, auto_recall = _precision_recall_at_k(auto_predictions)

    # Print metrics
    print(f"SVD Test RMSE: {svd_rmse:.4f}")
    print(f"SVD Precision@10: {svd_prec:.4f}")
    print(f"SVD Recall@10: {svd_recall:.4f}")
    print(f"kNN Test RMSE: {knn_rmse:.4f}")
    print(f"kNN Precision@10: {knn_prec:.4f}")
    print(f"kNN Recall@10: {knn_recall:.4f}")
    print(f"Autoencoder Test RMSE: {auto_rmse:.4f}")
    print(f"Autoencoder Precision@10: {auto_prec:.4f}")
    print(f"Autoencoder Recall@10: {auto_recall:.4f}")

    return svd_rmse, svd_prec, svd_recall, knn_rmse, knn_prec, knn_recall, auto_rmse, auto_prec, auto_recall

# 7. Model Comparison
def plot_model_comparison(svd_rmse, svd_prec, svd_recall, knn_rmse, knn_prec, knn_recall, auto_rmse, auto_prec, auto_recall):
    """Print a Chart.js bar-chart config and an SVD-vs-Autoencoder summary.

    Nothing is drawn or returned: the chart configuration is printed as
    indented JSON, followed by the absolute metric gaps. For Precision@10
    and Recall@10 the gap lines are printed only when both the SVD and the
    Autoencoder values are greater than zero.
    """

    def bar_series(label, values, colour):
        # One Chart.js dataset (a group of bars sharing a colour).
        return {"label": label, "data": values, "backgroundColor": colour}

    def shown_text(text):
        # Chart.js "title" object that is displayed with the given text.
        return {"display": True, "text": text}

    datasets = [
        bar_series("Test RMSE", [svd_rmse, knn_rmse, auto_rmse], "#FF6B6B"),
        bar_series("Precision@10", [svd_prec, knn_prec, auto_prec], "#4ECDC4"),
        bar_series("Recall@10", [svd_recall, knn_recall, auto_recall], "#45B7D1"),
    ]
    scales = {
        "y": {"beginAtZero": True, "title": shown_text("Scores")},
        "x": {"title": shown_text("Models")},
    }
    plugins = {
        "title": shown_text("Model Comparison: RMSE, Precision@10, Recall@10"),
        "legend": {"display": True},
    }
    # Chart.js configuration (printed as JSON for use in HTML)
    chart_config = {
        "type": "bar",
        "data": {"labels": ["SVD", "kNN", "Autoencoder"], "datasets": datasets},
        "options": {"scales": scales, "plugins": plugins},
    }
    print("\nChart.js Configuration for Model Comparison:")
    print(json.dumps(chart_config, indent=2))

    def report_gap(metric, svd_value, auto_value):
        # Relative gaps divide by the smaller side, so both must be positive.
        if not (svd_value > 0 and auto_value > 0):
            return
        print(f"- {metric} Difference (SVD vs Autoencoder): {abs(svd_value - auto_value):.4f}")
        if svd_value > auto_value:
            leader, lead, baseline = "SVD", svd_value - auto_value, auto_value
        else:
            leader, lead, baseline = "Autoencoder", auto_value - svd_value, svd_value
        print(f"- {leader} has {(lead / baseline * 100):.1f}% higher {metric}")

    # Detailed analysis
    print("\nDetailed Analysis:")
    print(f"- RMSE Difference (SVD vs Autoencoder): {abs(svd_rmse - auto_rmse):.4f}")
    report_gap("Precision@10", svd_prec, auto_prec)
    report_gap("Recall@10", svd_recall, auto_recall)

# Main execution
if __name__ == "__main__":
    import os

    # The figures and the metrics CSV are all written under results/;
    # neither savefig() nor to_csv() creates a missing directory.
    os.makedirs('results', exist_ok=True)

    # Preprocess data
    df_expanded, sparse_matrix, pivot_table = preprocess_data()

    # Plot visualizations
    plot_visualizations(df_expanded, pivot_table)

    # Train models
    svd, data = train_svd(df_expanded)
    knn = train_knn(df_expanded)
    autoencoder = train_autoencoder(sparse_matrix)

    # Evaluate models
    metrics = evaluate_models(svd, knn, autoencoder, data, pivot_table)
    svd_rmse, svd_prec, svd_recall, knn_rmse, knn_prec, knn_recall, auto_rmse, auto_prec, auto_recall = metrics

    # Plot model comparison
    plot_model_comparison(svd_rmse, svd_prec, svd_recall, knn_rmse, knn_prec, knn_recall, auto_rmse, auto_prec, auto_recall)

    # Save metrics (one row per model)
    metrics_df = pd.DataFrame({
        'Model': ['SVD', 'kNN', 'Autoencoder'],
        'RMSE': [svd_rmse, knn_rmse, auto_rmse],
        'Precision@10': [svd_prec, knn_prec, auto_prec],
        'Recall@10': [svd_recall, knn_recall, auto_recall]
    })
    metrics_df.to_csv('results/metrics.csv', index=False)
    print("\nMetrics saved to results/metrics.csv")


A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.1.3 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _run_code
  File "/storage/courses/venv/lib/python3.11/site-packages/ipykernel_launcher.py", line 18, in <module>
  File "/storage/courses/venv/lib/python3.11/site-packages/traitlets/config/application.py", line 1075, in launch_instance
    app.start()
  File "/storage/courses/venv/lib/python3.11/site-packages/ipykernel/kernelapp.py", line 739, in start
  File "/storage/courses/venv/lib/python3.11/site-packages/tornado/platform/a

ImportError: numpy.core.multiarray failed to import (auto-generated because you didn't call 'numpy.import_array()' after cimporting numpy; use '<void>numpy._import_array' to disable if you are certain you don't need it).