In [None]:
from pyspark.sql import SparkSession
import pandas as pd

# Initialize Spark session
spark = SparkSession.builder.appName('ReadCSV').getOrCreate()

# Read all Parquet files in the directory. Parquet files embed their own
# schema, so the CSV-oriented reader options `header` / `inferSchema` that
# used to be passed here are not needed and have been dropped.
parquet_directory_path = '../data/development/'
data = spark.read.parquet(parquet_directory_path)

# Convert the Spark DataFrame to pandas. This loads the entire dataset into
# the driver's memory, so it is only suitable when the data fits locally.
df = data.toPandas()

In [None]:
# Give every distinct item id a dense 0-based index, numbered in the order
# in which `unique()` returns the ids.
item_ids = df['item_id'].unique()
item_id_to_index = dict(zip(item_ids, range(len(item_ids))))
df['item_idx'] = df['item_id'].map(item_id_to_index)

# Cast every column of the frame to integer dtype, then display the frame.
df = df.astype('int')
df

In [None]:
# Setting the random seed
import random
import numpy as np
import torch
# A single seed shared by Python's `random`, NumPy and PyTorch so that fold
# shuffling and weight initialisation are repeatable across runs.
random_seed = 25
random.seed(random_seed)
np.random.seed(random_seed)
torch.manual_seed(random_seed)

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
import random
import pandas as pd
from typing import List
from sklearn.metrics import accuracy_score
from torch.nn.utils.rnn import pad_sequence
import torch.nn.functional as F

# Prefer the GPU when one is available; otherwise fall back to the CPU so the
# notebook still runs on machines without CUDA.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


class CustomDataset(Dataset):
    """Dataset in which each sample is one user's complete sequence of rows.

    Args:
        df: DataFrame with the columns `user_id`, `item_id`,
            `ratings_discretized`, `istrack`, `isalbum`, `isartist` and
            `isgenre`.
        item_id_mapping: dict mapping raw `item_id` values to embedding
            indices.
        label_mapping: dict mapping raw `ratings_discretized` values to class
            indices. Pass `None` to use the raw label values unchanged.

    Raises:
        ValueError: if an item id or a label found in `df` has no entry in
            the corresponding mapping.

    Each sample returned by `__getitem__` is the tuple
    `(item_ids, other_features, labels, length)` where `item_ids` and `labels`
    are `torch.long` tensors, `other_features` is a `torch.float` tensor with
    one row per interaction and four columns, and `length` is the number of
    interactions of that user.
    """

    FEATURE_COLUMNS = ['istrack', 'isalbum', 'isartist', 'isgenre']

    def __init__(self, df, item_id_mapping, label_mapping):
        self.user_ids = df['user_id'].unique()
        self.user_data = []

        # A single groupby pass replaces one boolean filter of the whole frame
        # per user. sort=False keeps users in order of first appearance (the
        # same order as `unique()`), and rows keep their original order
        # inside each group.
        for _, user_df in df.groupby('user_id', sort=False):
            item_ids = user_df['item_id'].map(item_id_mapping)
            if item_ids.isna().any():
                raise ValueError("item_id_mapping does not cover every item_id in df")

            labels = user_df['ratings_discretized']
            if label_mapping is not None:
                # Previously the mapping was accepted but never applied, so
                # raw rating values were fed to the loss as class indices.
                labels = labels.map(label_mapping)
                if labels.isna().any():
                    raise ValueError("label_mapping does not cover every label in df")

            other_features = user_df[self.FEATURE_COLUMNS].values.astype(float)
            self.user_data.append((torch.tensor(item_ids.values, dtype=torch.long),
                                   torch.tensor(other_features, dtype=torch.float),
                                   torch.tensor(labels.values, dtype=torch.long)))

    def __len__(self):
        """Number of users (samples) in the dataset."""
        return len(self.user_data)

    def __getitem__(self, idx):
        """Return `(item_ids, other_features, labels, length)` for sample `idx`."""
        item_ids, other_features, labels = self.user_data[idx]
        length = len(item_ids)
        return item_ids, other_features, labels, length


# Positional Encoding for Transformer
class PositionalEncoding(nn.Module):
    """Adds fixed sinusoidal positional encodings to a batch-first input.

    Args:
        d_model: size of the last dimension of the tensors passed to
            `forward`. Both even and odd sizes are supported.
        max_len: longest sequence length for which encodings are precomputed.

    The table is registered as a non-persistent buffer, so it follows the
    module through `.to(...)` / `.cuda()` instead of being pinned to a global
    device, and it is not added to `state_dict()`.
    """

    def __init__(self, d_model, max_len=5000):
        super(PositionalEncoding, self).__init__()
        pos = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-torch.log(torch.tensor(10000.0)) / d_model))
        angles = pos * div_term  # [max_len, ceil(d_model / 2)]

        encoding = torch.zeros(max_len, d_model)
        encoding[:, 0::2] = torch.sin(angles)
        # For odd d_model there is one cosine column fewer than sine columns.
        encoding[:, 1::2] = torch.cos(angles[:, : d_model // 2])

        # Shape [1, max_len, d_model] so it broadcasts over the batch dimension.
        self.register_buffer('encoding', encoding.unsqueeze(0), persistent=False)

    def forward(self, x):
        """Return `x + encoding[:, :seq_len]` where `seq_len = x.size(1)`."""
        seq_len = x.size(1)
        return x + self.encoding[:, :seq_len, :]


# Transformer-based Model
class TransformerModel(nn.Module):
    """Transformer encoder that emits class logits for every sequence position.

    Item ids are embedded and given positional encodings, concatenated with
    the per-position side features, passed through a Transformer encoder and
    projected to `num_classes` logits.

    Args:
        num_items: size of the item embedding table (index 0 is the padding
            index).
        hidden_size: item embedding dimension.
        num_layers: number of Transformer encoder layers.
        num_heads: number of attention heads; PyTorch requires it to divide
            `hidden_size + num_other_features`.
        dropout_rate: dropout used inside the encoder layers.
        num_classes: number of output classes.
        num_other_features: width of `other_features` in `forward`. The
            default of 4 reproduces the previously hard-coded model width of
            132 when `hidden_size` is 128.
    """

    def __init__(self, num_items, hidden_size, num_layers, num_heads, dropout_rate, num_classes,
                 num_other_features=4):
        super(TransformerModel, self).__init__()
        self.embedding = nn.Embedding(num_embeddings=num_items, embedding_dim=hidden_size, padding_idx=0)
        self.pos_encoder = PositionalEncoding(hidden_size)

        # The encoder sees the embedding concatenated with the side features,
        # so its width is derived from both instead of being fixed at 132.
        d_model = hidden_size + num_other_features

        # Transformer Encoder Layer
        encoder_layers = nn.TransformerEncoderLayer(d_model=d_model, nhead=num_heads, dropout=dropout_rate)
        self.transformer_encoder = nn.TransformerEncoder(encoder_layers, num_layers=num_layers)

        # Final linear layer for classification
        self.fc = nn.Linear(d_model, num_classes)

    def forward(self, item_ids, other_features, lengths):
        """Compute per-position class logits.

        Args:
            item_ids: long tensor `[batch_size, seq_len]` of item indices.
            other_features: float tensor `[batch_size, seq_len, num_other_features]`.
            lengths: number of real (non-padded) positions per sequence, as a
                tensor or sequence of length `batch_size`, or `None` to
                attend over every position.

        Returns:
            Tensor `[batch_size, seq_len, num_classes]` of logits.
        """
        # Embedding and positional encoding
        embeddings = self.embedding(item_ids)  # [batch_size, seq_len, embedding_dim]
        embeddings = self.pos_encoder(embeddings)
        transformer_input = torch.cat((embeddings, other_features), dim=2)

        # Mask positions at or beyond each sequence's length so that padding
        # is not attended to (True = ignore this key).
        padding_mask = None
        if lengths is not None:
            positions = torch.arange(item_ids.size(1), device=item_ids.device)
            lengths = torch.as_tensor(lengths, device=item_ids.device)
            padding_mask = positions.unsqueeze(0) >= lengths.unsqueeze(1)  # [batch_size, seq_len]

        # Transformer expects input of shape [seq_len, batch_size, d_model]
        transformer_input = transformer_input.permute(1, 0, 2)
        transformer_output = self.transformer_encoder(transformer_input, src_key_padding_mask=padding_mask)

        # Revert shape to [batch_size, seq_len, d_model]
        transformer_output = transformer_output.permute(1, 0, 2)
        outputs = self.fc(transformer_output)

        return outputs


# Training loop with early stopping on the validation loss
def _masked_loss_and_logits(model, batch, criterion, num_classes):
    """Run one batch through `model`; return the loss, logits and labels of positions whose label is not -1."""
    item_ids, features, labels, lengths = batch
    item_ids, features, labels, lengths = item_ids.to(device), features.to(device), labels.to(device), lengths.to(device)
    outputs = model(item_ids, features, lengths)
    labels_flat = labels.reshape(-1)
    outputs_flat = outputs.reshape(-1, num_classes)
    # Label -1 is excluded from the loss (presumably the padding value written
    # by the collate function - confirm against `collate_fn`).
    mask = labels_flat != -1
    kept_logits = outputs_flat[mask]
    kept_labels = labels_flat[mask]
    return criterion(kept_logits, kept_labels), kept_logits, kept_labels


def train_and_validate(model, train_loader, val_loader, criterion, optimizer, patience=3):
    """Train `model` for up to 100 epochs with early stopping on validation loss.

    Args:
        model: module exposing `fc.out_features` (the number of classes) and
            called as `model(item_ids, features, lengths)`.
        train_loader: iterable of `(item_ids, features, labels, lengths)` batches.
        val_loader: iterable of batches in the same format.
        criterion: loss taking `(logits, labels)`.
        optimizer: optimizer over `model`'s parameters.
        patience: number of consecutive epochs without a lower validation
            loss after which training stops.

    Returns:
        `(accuracy, (val_labels, val_preds, softmax_probs))` for the epoch
        with the lowest validation loss. The model is left holding that
        epoch's weights. If no epoch ever lowers the loss (e.g. the loss is
        NaN), the last epoch's results and weights are used instead.
    """
    best_val_loss = float('inf')
    best_model_state = None
    best_results = None
    epochs_no_improve = 0
    num_epochs = 100
    model.to(device)
    num_classes = model.fc.out_features

    for epoch in range(num_epochs):
        model.train()
        total_loss = 0
        for batch in train_loader:
            optimizer.zero_grad()
            loss, _, _ = _masked_loss_and_logits(model, batch, criterion, num_classes)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        print(f"Epoch {epoch + 1}, Training Loss: {total_loss / len(train_loader):.4f}")

        model.eval()
        val_loss = 0
        val_labels = []
        val_preds = []
        softmax_probs = []
        with torch.no_grad():
            for batch in val_loader:
                loss, logits, targets = _masked_loss_and_logits(model, batch, criterion, num_classes)
                val_loss += loss.item()
                softmax_probs.extend(F.softmax(logits, dim=1).cpu().tolist())
                _, preds = torch.max(logits, dim=1)
                val_labels.extend(targets.cpu().tolist())
                val_preds.extend(preds.cpu().tolist())

        val_loss /= len(val_loader)
        print(f"        Validation Loss: {val_loss:.4f}")
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            # state_dict() returns references to the live parameter tensors,
            # which later optimizer steps overwrite in place; clone them so
            # the snapshot really is the best epoch's weights.
            best_model_state = {key: value.detach().clone() for key, value in model.state_dict().items()}
            # Keep the predictions that belong to the snapshot being restored.
            best_results = (val_labels, val_preds, softmax_probs)
            epochs_no_improve = 0
        else:
            epochs_no_improve += 1
        if epochs_no_improve >= patience:
            print("Early stopping triggered!")
            break

    if best_model_state is not None:
        model.load_state_dict(best_model_state)
        val_labels, val_preds, softmax_probs = best_results
    accuracy = accuracy_score(val_labels, val_preds)
    return accuracy, (val_labels, val_preds, softmax_probs)


In [None]:
# Nested cross-validation function using Transformer model
def _fit_and_evaluate(train_df, val_df, item_id_mapping, label_mapping, num_items, num_classes, hyperparams):
    """Build datasets, loaders and a fresh TransformerModel for one split, then train and validate it."""
    train_dataset = CustomDataset(train_df, item_id_mapping, label_mapping)
    val_dataset = CustomDataset(val_df, item_id_mapping, label_mapping)

    train_loader = DataLoader(train_dataset, batch_size=128, shuffle=True, collate_fn=collate_fn)
    val_loader = DataLoader(val_dataset, batch_size=128, shuffle=False, collate_fn=collate_fn)

    model = TransformerModel(
        num_items=num_items,
        hidden_size=hyperparams['hidden_size'],
        num_layers=hyperparams['num_layers'],
        num_heads=hyperparams['num_heads'],
        dropout_rate=hyperparams['dropout_rate'],
        num_classes=num_classes
    )

    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=0.001)

    return train_and_validate(model, train_loader, val_loader, criterion, optimizer)


def nested_cross_validation(df: pd.DataFrame, n_outer: int, n_inner: int):
    """Run user-level nested cross-validation over the `params_to_tune` grid.

    For every outer fold, each hyperparameter combination is scored by its
    mean accuracy over `n_inner` inner folds of the outer training users; the
    best combination is then trained on all outer training users and
    evaluated on the held-out outer fold.

    Args:
        df: DataFrame with at least `user_id`, `item_id`,
            `ratings_discretized` and the feature columns read by
            `CustomDataset`.
        n_outer: number of outer folds.
        n_inner: number of inner folds used for tuning.

    Returns:
        One `(val_labels, val_preds, val_softmax_probs)` tuple per outer
        fold, as returned by `train_and_validate`.

    Note:
        Relies on the module-level names `params_to_tune`,
        `split_df_into_folds` and `collate_fn`. `collate_fn` is not defined
        in the visible part of this notebook and must exist before this
        function is called.
    """
    from itertools import product  # local import keeps this cell self-contained

    results_per_outer = []

    # Create item_id and label mappings
    unique_item_ids = df['item_id'].unique()
    item_id_mapping = {item_id: idx + 1 for idx, item_id in enumerate(unique_item_ids)}  # +1 ensures padding_idx=0
    num_items = len(unique_item_ids) + 1  # Padding index

    unique_labels = df['ratings_discretized'].unique()
    label_mapping = {label: idx for idx, label in enumerate(sorted(unique_labels))}
    num_classes = len(unique_labels)

    # Same iteration order as four nested loops with hidden_size outermost.
    grid_axes = [params_to_tune[name] for name in ('hidden_size', 'num_layers', 'num_heads', 'dropout_rate')]

    outer_folds = split_df_into_folds(df, n_outer)
    for outer_idx, outer_fold in enumerate(outer_folds):
        print(f"Outer Fold {outer_idx + 1}/{n_outer}")
        outer_train_data = df[~df['user_id'].isin(outer_fold['user_id'])]
        outer_val_data = outer_fold

        # Start below any attainable accuracy so that a combination is always
        # selected, even when every inner accuracy is 0.
        best_inner_accuracy = float('-inf')
        best_params = None

        # Split once per outer fold so that every hyperparameter combination
        # is scored on the same inner folds, not on a fresh random split.
        inner_folds = split_df_into_folds(outer_train_data, n_inner)

        # Inner loop for hyperparameter tuning
        for hidden_size, num_layers, num_heads, dropout_rate in product(*grid_axes):
            candidate = {
                'hidden_size': hidden_size,
                'num_layers': num_layers,
                'num_heads': num_heads,
                'dropout_rate': dropout_rate
            }
            inner_accuracies = []
            print(f"  Tuning: hidden_size={hidden_size}, num_layers={num_layers}, num_heads={num_heads}, dropout_rate={dropout_rate}")
            for inner_idx, inner_fold in enumerate(inner_folds):
                print(f"    Inner Fold {inner_idx + 1}/{n_inner}")
                inner_train_data = outer_train_data[~outer_train_data['user_id'].isin(inner_fold['user_id'])]

                accuracy, _ = _fit_and_evaluate(inner_train_data, inner_fold, item_id_mapping,
                                                label_mapping, num_items, num_classes, candidate)
                print(f"    Accuracy: {accuracy:.4f}")
                inner_accuracies.append(accuracy)

            avg_inner_accuracy = sum(inner_accuracies) / len(inner_accuracies)
            print(f"  Average Inner Accuracy for hyperparameters: {avg_inner_accuracy:.4f}")
            if avg_inner_accuracy > best_inner_accuracy:
                best_inner_accuracy = avg_inner_accuracy
                best_params = candidate

        print(f"Best hyperparameters for Outer Fold {outer_idx + 1}: {best_params}")

        # Train with the best hyperparameters on all outer training users and
        # evaluate on the held-out outer fold.
        outer_accuracy, (val_labels, val_preds, val_softmax_probs) = _fit_and_evaluate(
            outer_train_data, outer_val_data, item_id_mapping, label_mapping,
            num_items, num_classes, best_params)

        # Store true labels, predictions, and softmax probabilities for each outer fold
        results_per_outer.append((val_labels, val_preds, val_softmax_probs))
        print(f"  Outer Fold Accuracy: {outer_accuracy:.4f}")

    return results_per_outer


# Hyperparameter grid explored by the inner cross-validation loop: every
# combination of the listed values is tried once per outer fold.
params_to_tune = {
    'hidden_size': [128],
    'num_layers': [2, 4],
    'num_heads': [2, 4],
    'dropout_rate': [0.2],
}

# Split a DataFrame into user-disjoint folds
def split_df_into_folds(df: pd.DataFrame, n_fold: int) -> List[pd.DataFrame]:
    """Partition `df` into `n_fold` frames so that each user's rows land in exactly one fold.

    The distinct `user_id` values are shuffled with the global `random`
    module and dealt round-robin into `n_fold` groups; each returned frame
    holds the rows of `df` belonging to one group.
    """
    shuffled_users = df['user_id'].unique().tolist()
    random.shuffle(shuffled_users)
    return [
        df[df['user_id'].isin(shuffled_users[offset::n_fold])]
        for offset in range(n_fold)
    ]

# Run the full nested cross-validation: 20 outer folds, each tuned with
# 5 inner folds. `results` holds one tuple of outputs per outer fold.
results = nested_cross_validation(df=df, n_outer=20, n_inner=5)


In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc
import numpy as np

# One-vs-all ROC curves pooled over every outer fold
def plot_combined_roc_curve(results_per_outer, num_classes):
    """Plot one ROC curve per class from the pooled predictions of all folds.

    Args:
        results_per_outer: iterable of `(labels, preds, softmax_probs)`
            tuples; the predictions in the middle are ignored here.
        num_classes: number of classes; class `i` is scored with column `i`
            of the softmax probabilities.
    """
    # Pool the labels and probability rows of every fold.
    pooled_labels, pooled_probs = [], []
    for fold_labels, _, fold_probs in results_per_outer:
        pooled_labels.extend(fold_labels)
        pooled_probs.extend(fold_probs)

    # Compute every curve before any figure is created.
    roc_points = {}
    roc_auc = {}
    for i in range(num_classes):
        is_class_i = [1 if label == i else 0 for label in pooled_labels]
        class_i_scores = [prob[i] for prob in pooled_probs]
        false_pos, true_pos, _ = roc_curve(is_class_i, class_i_scores)
        roc_points[i] = (false_pos, true_pos)
        roc_auc[i] = auc(false_pos, true_pos)

    plt.figure(figsize=(10, 8))
    for i, (false_pos, true_pos) in roc_points.items():
        plt.plot(false_pos, true_pos, label=f'Rating Class {i} (area = {roc_auc[i]:.2f})')

    # Diagonal reference line for a random classifier.
    plt.plot([0, 1], [0, 1], 'k--', label="Random")
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('One-vs-All ROC Curve')
    plt.legend(loc="lower right")
    plt.show()

# The plot assumes three rating classes, indexed 0..2.
num_classes = 3
plot_combined_roc_curve(results_per_outer=results, num_classes=num_classes)


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix

# Function to plot confusion matrix
def plot_confusion_matrix(results_per_outer, num_classes, class_names=None):
    """Plot the confusion matrix of the pooled predictions of all folds.

    Args:
        results_per_outer: iterable of `(labels, preds, softmax_probs)`
            tuples; the probabilities are ignored here.
        num_classes: number of classes; matrix rows/columns cover the class
            indices `0 .. num_classes - 1`.
        class_names: optional tick labels, one per class. Defaults to
            `['Low', 'Medium', 'High']` when `num_classes` is 3 and to the
            class indices otherwise, so the labels always match the matrix.

    Raises:
        ValueError: if `class_names` is given but its length differs from
            `num_classes`.
    """
    if class_names is None:
        if num_classes == 3:
            class_names = ['Low', 'Medium', 'High']
        else:
            class_names = [str(class_idx) for class_idx in range(num_classes)]
    elif len(class_names) != num_classes:
        raise ValueError("class_names must contain exactly num_classes entries")

    all_labels = []
    all_preds = []

    # Collect all true labels and predictions across folds
    for val_labels, val_preds, _ in results_per_outer:
        all_labels.extend(val_labels)
        all_preds.extend(val_preds)

    # Compute the confusion matrix
    cm = confusion_matrix(all_labels, all_preds, labels=list(range(num_classes)))

    # Plotting the confusion matrix using seaborn
    plt.figure(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", xticklabels=class_names, yticklabels=class_names)
    plt.title("Confusion Matrix")
    plt.xlabel("Predicted")
    plt.ylabel("True")
    plt.show()

# The heatmap assumes three rating classes, indexed 0..2.
num_classes = 3
plot_confusion_matrix(results_per_outer=results, num_classes=num_classes)

In [None]:
import numpy as np
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Mean and variance of the per-fold accuracy, precision, recall and F1
def calculate_metrics_mean_and_variance(results_per_outer, average='macro'):
    """Summarise per-fold classification metrics.

    Args:
        results_per_outer: iterable of `(labels, preds, softmax_probs)`
            tuples, one per fold; the probabilities are ignored here.
        average: averaging mode forwarded to the precision, recall and F1
            scorers.

    Returns:
        Dict with `mean_<metric>` and `variance_<metric>` entries for
        accuracy, precision, recall and f1, where the variance is `np.var`
        of the per-fold scores.
    """
    # One (accuracy, precision, recall, f1) row per fold; undefined
    # precision/recall/F1 values are reported as 0 via zero_division=0.
    fold_scores = []
    for y_true, y_pred, _ in results_per_outer:
        fold_scores.append((
            accuracy_score(y_true, y_pred),
            precision_score(y_true, y_pred, average=average, zero_division=0),
            recall_score(y_true, y_pred, average=average, zero_division=0),
            f1_score(y_true, y_pred, average=average, zero_division=0),
        ))

    accuracies, precisions, recalls, f1s = (
        [row[column] for row in fold_scores] for column in range(4)
    )

    return {
        'mean_accuracy': np.mean(accuracies), 'variance_accuracy': np.var(accuracies),
        'mean_precision': np.mean(precisions), 'variance_precision': np.var(precisions),
        'mean_recall': np.mean(recalls), 'variance_recall': np.var(recalls),
        'mean_f1': np.mean(f1s), 'variance_f1': np.var(f1s)
    }

# Summarise the outer-fold results with macro-averaged metrics and print
# the mean and variance of each metric across folds.
metrics = calculate_metrics_mean_and_variance(results_per_outer=results, average='macro')
print(f"Mean Accuracy: {metrics['mean_accuracy']:.4f}, Variance (std^2): {metrics['variance_accuracy']:.4f}")
print(f"Mean Precision: {metrics['mean_precision']:.4f}, Variance (std^2): {metrics['variance_precision']:.4f}")
print(f"Mean Recall: {metrics['mean_recall']:.4f}, Variance (std^2): {metrics['variance_recall']:.4f}")
print(f"Mean F1: {metrics['mean_f1']:.4f}, Variance (std^2): {metrics['variance_f1']:.4f}")
