In [11]:
# Import necessary libraries
import numpy as np
import pandas as pd

# For modeling and evaluation
from sklearn.model_selection import train_test_split, RandomizedSearchCV, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score, confusion_matrix, classification_report
from imblearn.over_sampling import SMOTE
from sklearn.svm import OneClassSVM, SVC
from sklearn.multiclass import OneVsRestClassifier
from collections import Counter

# Ignore warnings
import warnings
warnings.filterwarnings('ignore')

# Set random seed for reproducibility
random_state = 42
np.random.seed(random_state)

# --------------------------------------------
# Step 1: Load and Preprocess Data
# --------------------------------------------

# Load data from the third and fourth batches
third_batch = np.load('third_batch_multi_labels.npz', allow_pickle=True)
fourth_batch = np.load('fourth_batch_multi_labels.npz', allow_pickle=True)

# Convert numpy arrays to pandas DataFrames
third_X_df = pd.DataFrame(third_batch['X'], columns=['user_id', 'item_id', 'rating'])
third_labels_df = pd.DataFrame(third_batch['yy'], columns=['user_id', 'label'])

fourth_X_df = pd.DataFrame(fourth_batch['X'], columns=['user_id', 'item_id', 'rating'])
fourth_labels_df = pd.DataFrame(fourth_batch['yy'], columns=['user_id', 'label'])

# Combine third and fourth batches for training
X_df = pd.concat([third_X_df, fourth_X_df], ignore_index=True)
labels_df = pd.concat([third_labels_df, fourth_labels_df], ignore_index=True)

# --------------------------------------------
# Step 2: Feature Engineering
# --------------------------------------------

def create_user_features(df):
    # Count of interactions
    interaction_counts = df.groupby('user_id')['item_id'].count().rename('total_interactions')
    
    # Statistics of ratings
    rating_stats = df.groupby('user_id')['rating'].agg(['mean', 'std', 'min', 'max']).rename(
        columns={'mean': 'rating_mean', 'std': 'rating_std', 'min': 'rating_min', 'max': 'rating_max'}
    )
    
    # Count of each rating type
    rating_counts = df.groupby(['user_id', 'rating']).size().unstack(fill_value=0)
    rating_counts.columns = [f'rating_{col}' for col in rating_counts.columns]
    
    # Ratio of each rating type
    rating_ratios = rating_counts.div(rating_counts.sum(axis=1), axis=0)
    rating_ratios.columns = [f'ratio_{col}' for col in rating_ratios.columns]
    
    # Interaction diversity (number of unique items)
    item_diversity = df.groupby('user_id')['item_id'].nunique().rename('unique_items')
    
    # Combine all features
    user_features = pd.concat([
        interaction_counts, rating_stats, rating_counts,
        rating_ratios, item_diversity
    ], axis=1)
    
    # Fill NaN values in rating_std
    user_features['rating_std'] = user_features['rating_std'].fillna(0)
    
    # Replace infinite values with NaN and then fill with 0
    user_features.replace([np.inf, -np.inf], np.nan, inplace=True)
    user_features.fillna(0, inplace=True)
    
    return user_features.reset_index()

# Create user-level features
user_features = create_user_features(X_df)

# Merge features with labels
user_data = user_features.merge(labels_df, on='user_id')

# --------------------------------------------
# Step 3: Prepare Training and Validation Sets
# --------------------------------------------

# Separate features and labels
X = user_data.drop(['user_id', 'label'], axis=1)
y = user_data['label']

# Split into training and validation sets before scaling
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=random_state
)

# Check class distribution
print('Original training set class distribution:', Counter(y_train))

# --------------------------------------------
# Step 4: Data Scaling
# --------------------------------------------

# Standardize features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_valid_scaled = scaler.transform(X_valid)

# --------------------------------------------
# Step 5: Anomaly Detection with One-Class SVM
# --------------------------------------------

# Fit One-Class SVM on scaled training data
oc_svm = OneClassSVM(kernel="rbf", gamma="scale", nu=0.1)  # Adjust parameters as needed
oc_svm.fit(X_train_scaled)

# Compute anomaly scores for training data
anomaly_scores_train = -oc_svm.decision_function(X_train_scaled)  # Higher scores indicate anomalies

# Compute anomaly scores for validation data
anomaly_scores_valid = -oc_svm.decision_function(X_valid_scaled)

# Add anomaly scores to the scaled features
X_train_scaled = np.hstack([X_train_scaled, anomaly_scores_train.reshape(-1, 1)])
X_valid_scaled = np.hstack([X_valid_scaled, anomaly_scores_valid.reshape(-1, 1)])

# --------------------------------------------
# Step 6: Handling Class Imbalance with SMOTE
# --------------------------------------------

# Initialize SMOTE
smote = SMOTE(random_state=random_state, sampling_strategy='auto')

# Apply SMOTE to training data
X_train_resampled, y_train_resampled = smote.fit_resample(X_train_scaled, y_train)

# Check new class distribution
print('Resampled training set class distribution:', Counter(y_train_resampled))

# --------------------------------------------
# Step 7: Model Training with One-vs-Rest SVM
# --------------------------------------------

# Initialize One-vs-Rest SVM
svm_clf = SVC(probability=True, random_state=random_state)

# Wrap in One-vs-Rest classifier for multi-class classification
ovr_svm = OneVsRestClassifier(svm_clf)

# Define parameter grid for hyperparameter tuning
param_grid_svm = {
    'estimator__C': [0.1, 1, 10],
    'estimator__kernel': ['linear', 'rbf'],
    'estimator__gamma': ['scale', 'auto']
}

# Initialize RandomizedSearchCV
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=random_state)
random_search_svm = RandomizedSearchCV(
    estimator=ovr_svm,
    param_distributions=param_grid_svm,
    n_iter=10,
    scoring='roc_auc_ovr',
    cv=skf,
    n_jobs=-1,
    verbose=1,
    random_state=random_state
)

# Fit RandomizedSearchCV
random_search_svm.fit(X_train_resampled, y_train_resampled)

# Best estimator
best_svm = random_search_svm.best_estimator_
print(f"Best SVM Parameters: {random_search_svm.best_params_}")

# --------------------------------------------
# Step 8: Model Evaluation
# --------------------------------------------

# Predict probabilities on validation set
svm_predictions = best_svm.predict_proba(X_valid_scaled)

# Compute AUC scores for each class
auc_scores = {}
for i in range(3):
    auc = roc_auc_score((y_valid == i).astype(int), svm_predictions[:, i])
    auc_scores[f"Class {i} AUC"] = auc

print("\nValidation Set AUC Scores:", auc_scores)

# Calculate average AUC
average_auc = np.mean(list(auc_scores.values()))
print(f"Average Validation AUC: {average_auc}")

# Predict classes without threshold adjustments
y_valid_pred = best_svm.predict(X_valid_scaled)

# Confusion Matrix
conf_matrix = confusion_matrix(y_valid, y_valid_pred)
print("\nConfusion Matrix on Validation Set:")
print(conf_matrix)

# Classification Report
class_report = classification_report(y_valid, y_valid_pred)
print("\nClassification Report on Validation Set:")
print(class_report)


Original training set class distribution: Counter({0: 1200, 1: 36, 2: 36})
Resampled training set class distribution: Counter({0: 1200, 1: 1200, 2: 1200})
Fitting 5 folds for each of 10 candidates, totalling 50 fits
Best SVM Parameters: {'estimator__kernel': 'rbf', 'estimator__gamma': 'auto', 'estimator__C': 10}

Validation Set AUC Scores: {'Class 0 AUC': 0.7172222222222223, 'Class 1 AUC': 0.5181589356346638, 'Class 2 AUC': 0.9622437971952535}
Average Validation AUC: 0.7325416516840466

Confusion Matrix on Validation Set:
[[246  48   6]
 [  5   2   2]
 [  3   0   6]]

Classification Report on Validation Set:
              precision    recall  f1-score   support

           0       0.97      0.82      0.89       300
           1       0.04      0.22      0.07         9
           2       0.43      0.67      0.52         9

    accuracy                           0.80       318
   macro avg       0.48      0.57      0.49       318
weighted avg       0.93      0.80      0.85       318

