In [1]:
!pip install surprise



In [3]:
import pandas as pd
import numpy as np
import random
from surprise import Dataset, Reader, SVD, SlopeOne, CoClustering, SVDpp
from surprise import accuracy
from tqdm.auto import tqdm
from sklearn.metrics import roc_auc_score
from collections import defaultdict

# --- Reproducibility -------------------------------------------------------
# One seed shared by Python's `random` module and NumPy's global RNG.
seed = 42
random.seed(seed)
np.random.seed(seed)

# --- Locations -------------------------------------------------------------
# Data, model and log folders under the SageMaker user's home directory.
data_dir, model_dir, log_dir = (
    f'/home/sagemaker-user/{folder}/' for folder in ('Data', 'Models', 'Logs')
)

In [4]:
# Load the train and test tables from the `clean_data` folder under data_dir.
# NOTE(review): unpickling can execute arbitrary code -- only read pickle files you trust.
train_df = pd.read_pickle(f'{data_dir}clean_data/train_set.pkl')
test_df = pd.read_pickle(f'{data_dir}clean_data/test_set.pkl')

In [5]:
# Number of training rows per user, sorted from the most to the least active user.
train_df['user_id'].value_counts()

user_id
AG73BVBKUOH22USSFJA5ZWL7AKXA    1426
AHPOHKN4PU4W3V5PGFL7AGTAD2AA    1198
AHEMJ62SUJPUYNWGROPI6MUAYQ5A     924
AEYVPPWR4CIKWX4BGYKCBCDL2CZQ     924
AH665SQ6SQF6DXAGYIQFCX76LALA     778
                                ... 
AFI6HA2MBKJJEMGAIBHQOGPO5TMQ       2
AFI6G2GDXADHCF5F5NOQ4YGOFLJQ       2
AFI6F65FPRQLQZCNY5QGHRHBCAOQ       2
AFI6F4Q7RL5DWL7OQIPM5Z6AE2TA       2
AFI6JBTMHUXK77EE2DHURX7PFXTQ       2
Name: count, Length: 290475, dtype: int64

In [8]:
# (rows, columns) of the training table.
train_df.shape

(1258132, 5)

In [6]:
# Preview the first rows to check the columns used below:
# user_id, parent_asin, label, rating, timestamp.
train_df.head()

Unnamed: 0,user_id,parent_asin,label,rating,timestamp
0,AE22236AFRRSMQIKGG7TPTB75QEA,B0002C7FHC,1,5,2009-09-19 19:42:10
1,AE22236AFRRSMQIKGG7TPTB75QEA,B00UFKQKLS,1,5,2014-03-07 15:31:31
2,AE22236AFRRSMQIKGG7TPTB75QEA,B01I6X61OQ,1,5,2014-03-07 17:06:29
3,AE22236AFRRSMQIKGG7TPTB75QEA,B0713WBZM7,0,0,2009-09-19 19:42:10
4,AE22236AFRRSMQIKGG7TPTB75QEA,B0BVM3J8GW,0,0,2014-03-07 15:31:31


In [None]:
import os
from surprise import Dataset, Reader, SVD, SVDpp, SlopeOne, CoClustering, accuracy
from surprise.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
import pandas as pd
import numpy as np

# Build the Surprise inputs for the explicit-feedback collaborative-filtering models.
#
# Rating scale: Surprise clips every estimate into `rating_scale`, so the scale must
# cover every rating value the models are trained on. The nominal range is 1-5, but
# train_df also holds rows with rating == 0 (see the train_df.head() preview), so the
# bounds are widened to the observed training range whenever it falls outside 1-5.
# For data that already lies in 1-5 this is exactly Reader(rating_scale=(1, 5)).
rating_floor = min(1, train_df['rating'].min())
rating_ceiling = max(5, train_df['rating'].max())
reader_explicit = Reader(rating_scale=(rating_floor, rating_ceiling))

# Training data: Surprise expects exactly three columns in the order
# (user id, item id, rating); here that is user_id, parent_asin, rating.
train_explicit = Dataset.load_from_df(train_df[['user_id', 'parent_asin', 'rating']], reader_explicit)
# Use every training row (no internal hold-out) -- evaluation is done on test_df.
trainset_explicit = train_explicit.build_full_trainset()

# Test data: a plain list of (user_id, parent_asin, rating) tuples, one per test_df row
# and in test_df's row order, which is the format `algo.test()` consumes.
test_set_explicit = list(zip(test_df['user_id'], test_df['parent_asin'], test_df['rating']))

# Candidate explicit-feedback algorithms, keyed by the display name used in logs and
# output file names. NMF is intentionally left out (removed as per the requirement).
# Every algorithm that accepts a random_state receives the notebook-wide seed.
models_explicit = {}
models_explicit['SVD'] = SVD(random_state=seed)  # matrix factorisation (Singular Value Decomposition)
models_explicit['SVD++'] = SVDpp(random_state=seed)  # SVD variant that also considers implicit feedback
models_explicit['SlopeOne'] = SlopeOne()  # simple and efficient collaborative filtering algorithm
models_explicit['CoClustering'] = CoClustering(random_state=seed)  # co-clustering approach

# Fit every candidate model on the full training set and score it on the test set.
model_predictions_explicit = {}  # model name -> list of Surprise predictions
model_metrics_explicit = {}  # model name -> {'RMSE': float, 'ROC_AUC': float}

# The binary ground truth does not depend on the model, so it is extracted once
# instead of on every loop iteration.
y_true = test_df['label']

for model_name, model in models_explicit.items():
    print(f"\nTraining model: {model_name}")
    model.fit(trainset_explicit)

    print("Making predictions on the test set...")
    # One prediction per test tuple; test_set_explicit was built row-by-row from
    # test_df, so predictions are expected to line up with y_true.
    predictions = model.test(test_set_explicit)

    # RMSE between the true and the estimated ratings (also printed by Surprise).
    rmse = accuracy.rmse(predictions, verbose=True)

    # Keep the raw predictions so they can be written to disk later.
    model_predictions_explicit[model_name] = predictions

    # ROC AUC: the estimated rating is used as the ranking score for the binary label.
    y_pred = [pred.est for pred in predictions]
    roc_auc = roc_auc_score(y_true, y_pred)

    model_metrics_explicit[model_name] = {
        'RMSE': rmse,
        'ROC_AUC': roc_auc
    }

# Output folder for this experiment. The path is relative, so it resolves against
# the kernel's current working directory; it is created if missing.
result_dir = os.path.join('../Results', 'CF_model')
os.makedirs(result_dir, exist_ok=True)

# Plain-text report: a two-line header followed by one RMSE / ROC AUC paragraph
# per model, with both metrics formatted to four decimals.
metrics_file_path = os.path.join(result_dir, 'metrics.txt')

with open(metrics_file_path, 'w', encoding='utf-8') as f:
    f.writelines([
        "Evaluation Metrics for Explicit Feedback Models\n",
        "=============================================\n\n",
    ])
    for model_name, metrics in model_metrics_explicit.items():
        f.write(
            f"Model: {model_name}\n"
            f"RMSE: {metrics['RMSE']:.4f}\n"
            f"ROC AUC: {metrics['ROC_AUC']:.4f}\n\n"
        )

print(f"Evaluation metrics have been saved to {metrics_file_path}")

# Dump each model's test-set predictions to its own text file inside result_dir.
for model_name, predictions in model_predictions_explicit.items():
    # File name is derived from the model's display name, e.g. "SVD_predictions.txt".
    predictions_file_path = os.path.join(result_dir, f"{model_name}_predictions.txt")

    with open(predictions_file_path, 'w', encoding='utf-8') as f:
        # Header, then one line per prediction: user, item, true and estimated rating.
        f.write(f"{model_name} Model Predictions\n")
        f.write("=============================================\n\n")
        f.writelines(
            f"User ID: {pred.uid}, Item ID: {pred.iid}, True Rating: {pred.r_ui}, Predicted Rating: {pred.est}\n"
            for pred in predictions
        )

    print(f"Predictions for the {model_name} model have been saved to {predictions_file_path}")



Training model: SVD
Making predictions on the test set...
