In [30]:
# Import necessary libraries
import warnings
from collections import Counter

import numpy as np
import pandas as pd

# For modeling and evaluation
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline
from sklearn.metrics import roc_auc_score, confusion_matrix, classification_report
from sklearn.model_selection import RandomizedSearchCV, StratifiedKFold
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.svm import OneClassSVM, SVC

# Ignore warnings
warnings.filterwarnings('ignore')

# Seed NumPy's global RNG once; the same value is passed as `random_state`
# to the estimators further down
random_state = 42
np.random.seed(random_state)

# --------------------------------------------
# Step 1: Load and Preprocess Data
# --------------------------------------------

# Only the fourth and fifth batches are read here.
# NOTE(review): allow_pickle=True lets np.load unpickle object arrays, which
# can execute arbitrary code -- keep it only if these .npz files are trusted.

# Fourth batch -> training split. The 'X' array is labelled
# user_id/item_id/rating and the 'yy' array user_id/label.
fourth_batch = np.load('fourth_batch_multi_labels.npz', allow_pickle=True)
fourth_X_df = pd.DataFrame(data=fourth_batch['X'], columns=['user_id', 'item_id', 'rating'])
fourth_labels_df = pd.DataFrame(data=fourth_batch['yy'], columns=['user_id', 'label'])
X_train_df, y_train_df = fourth_X_df, fourth_labels_df

# Fifth batch -> held-out test split, same column layout
fifth_batch = np.load('fifth_batch_multi_labels.npz', allow_pickle=True)
fifth_X_df = pd.DataFrame(data=fifth_batch['X'], columns=['user_id', 'item_id', 'rating'])
fifth_labels_df = pd.DataFrame(data=fifth_batch['yy'], columns=['user_id', 'label'])
X_test_df, y_test_df = fifth_X_df, fifth_labels_df

# --------------------------------------------
# Step 2: Feature Engineering
# --------------------------------------------

def create_user_features(df, rating_values=None):
    """Aggregate an interaction table into one feature row per user.

    Parameters
    ----------
    df : pandas.DataFrame
        Interaction table with ``user_id``, ``item_id`` and ``rating`` columns.
    rating_values : iterable, optional
        Rating values that must each get a ``rating_<value>`` and a
        ``ratio_rating_<value>`` column, in the given order. Values absent
        from ``df`` produce all-zero columns; values present in ``df`` but
        not listed are left out. Pass the same list for every split so the
        resulting frames share one column layout. When omitted, the columns
        are derived from the ratings that occur in ``df``.

    Returns
    -------
    pandas.DataFrame
        One row per ``user_id`` with the columns ``user_id``,
        ``total_interactions``, ``rating_mean``, ``rating_std``,
        ``rating_min``, ``rating_max``, one ``rating_<value>`` count column
        and one ``ratio_rating_<value>`` share column per rating value, and
        ``unique_items``. Missing and infinite values are replaced by 0.
    """
    by_user = df.groupby('user_id')

    # Number of (non-null) item interactions per user
    interaction_counts = by_user['item_id'].count().rename('total_interactions')

    # Summary statistics of each user's ratings
    rating_stats = by_user['rating'].agg(['mean', 'std', 'min', 'max']).rename(
        columns={'mean': 'rating_mean', 'std': 'rating_std', 'min': 'rating_min', 'max': 'rating_max'}
    )

    # How often each user gave each rating value (users x rating values)
    rating_counts = df.groupby(['user_id', 'rating']).size().unstack(fill_value=0)

    # Take the denominator before any column selection so the ratios stay
    # "share of all of this user's ratings" even if rating_values is a subset
    ratings_per_user = rating_counts.sum(axis=1)

    if rating_values is not None:
        # Pin the column set so that different splits yield identical schemas
        rating_counts = rating_counts.reindex(columns=list(rating_values), fill_value=0)
    rating_counts.columns = [f'rating_{value}' for value in rating_counts.columns]

    # Share of each rating value among the user's ratings
    rating_ratios = rating_counts.div(ratings_per_user, axis=0)
    rating_ratios.columns = [f'ratio_{name}' for name in rating_ratios.columns]

    # Interaction diversity (number of distinct items per user)
    item_diversity = by_user['item_id'].nunique().rename('unique_items')

    user_features = pd.concat([
        interaction_counts, rating_stats, rating_counts,
        rating_ratios, item_diversity
    ], axis=1)

    # A single clean-up pass: infinities become NaN, then every NaN becomes 0.
    # This also covers rating_std, which is NaN for users with one rating.
    user_features = user_features.replace([np.inf, -np.inf], np.nan).fillna(0)

    return user_features.reset_index()

# Create user-level features for training and testing data
X_train_features = create_user_features(X_train_df)
X_test_features = create_user_features(X_test_df)

# Attach labels. merge() defaults to an inner join, so users that have no
# row in the label frame are dropped here.
train_data = X_train_features.merge(y_train_df, on='user_id')
test_data = X_test_features.merge(y_test_df, on='user_id')

# Separate features and labels for training and testing
X_train = train_data.drop(['user_id', 'label'], axis=1)
y_train = train_data['label']

# The per-rating count/ratio columns depend on which rating values occur in
# each batch. Force the test matrix onto the training layout (same columns,
# same order): columns missing from the test batch are filled with 0 (that
# rating never occurred) and columns unseen in training are dropped, so the
# scaler and models fitted on X_train always receive a matching matrix.
X_test = test_data.drop(['user_id', 'label'], axis=1).reindex(
    columns=X_train.columns, fill_value=0
)
y_test = test_data['label']

# Check class distribution in training data
print('Original training set class distribution:', Counter(y_train))

# --------------------------------------------
# Step 3: Data Scaling
# --------------------------------------------

# Fit the scaler on the training features only, then apply that same
# transform to both splits
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# --------------------------------------------
# Step 4: Anomaly Detection with One-Class SVM
# --------------------------------------------

# Unsupervised One-Class SVM fitted on the scaled training features
oc_svm = OneClassSVM(nu=0.2, kernel="rbf", gamma="scale").fit(X_train_scaled)

# Negate the decision function so that larger values mean "more anomalous".
# Both splits are scored before either matrix is widened below, so the
# detector always sees the original feature columns.
anomaly_scores_train = np.negative(oc_svm.decision_function(X_train_scaled))
anomaly_scores_test = np.negative(oc_svm.decision_function(X_test_scaled))

# Append the anomaly score as one extra trailing feature column
X_train_scaled = np.column_stack([X_train_scaled, anomaly_scores_train])
X_test_scaled = np.column_stack([X_test_scaled, anomaly_scores_test])

# --------------------------------------------
# Step 5: Handling Class Imbalance with SMOTE
# --------------------------------------------

# Initialize SMOTE
smote = SMOTE(random_state=random_state, sampling_strategy='auto')

# Oversample the full training set. These arrays are kept for the
# class-distribution report (and for any later code that uses them); the
# hyperparameter search below deliberately does NOT train on them.
X_train_resampled, y_train_resampled = smote.fit_resample(X_train_scaled, y_train)

# Check new class distribution
print('Resampled training set class distribution:', Counter(y_train_resampled))

# --------------------------------------------
# Step 6: Model Training with One-vs-Rest SVM
# --------------------------------------------

# Initialize One-vs-Rest SVM
svm_clf = SVC(probability=True, random_state=random_state)

# Wrap in One-vs-Rest classifier for multi-class classification
ovr_svm = OneVsRestClassifier(svm_clf)

# Cross-validating on data that was oversampled up front leaks information:
# synthetic points interpolated from a validation sample end up in the
# training folds and inflate the CV score. Putting SMOTE inside an imblearn
# Pipeline makes the search resample each training fold only; validation
# folds (and later predictions) see untouched data.
smote_svm_pipeline = Pipeline([
    ('smote', smote),
    ('classifier', ovr_svm),
])

# Define parameter grid for hyperparameter tuning (the 'classifier__' prefix
# routes each parameter to the One-vs-Rest step of the pipeline)
param_grid_svm = {
    'classifier__estimator__C': [0.1, 1, 10],
    'classifier__estimator__kernel': ['linear', 'rbf'],
    'classifier__estimator__gamma': ['scale', 'auto']
}

# Initialize RandomizedSearchCV
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=random_state)
random_search_svm = RandomizedSearchCV(
    estimator=smote_svm_pipeline,
    param_distributions=param_grid_svm,
    n_iter=10,
    scoring='roc_auc_ovr_weighted',
    cv=skf,
    n_jobs=-1,
    verbose=1,
    random_state=random_state
)

# Fit on the original (imbalanced) training data; resampling happens per fold
random_search_svm.fit(X_train_scaled, y_train)

# Best estimator: the refit pipeline oversamples the whole training set and
# trains the classifier on it; expose that fitted One-vs-Rest model directly
best_svm = random_search_svm.best_estimator_.named_steps['classifier']

# Report parameter names relative to the One-vs-Rest classifier
best_svm_params = {
    name.split('__', 1)[1]: value
    for name, value in random_search_svm.best_params_.items()
}
print(f"Best SVM Parameters: {best_svm_params}")

# --------------------------------------------
# Step 7: Model Evaluation
# --------------------------------------------

# Predict probabilities on test set (one column per entry of best_svm.classes_)
svm_predictions = best_svm.predict_proba(X_test_scaled)

# Compute a one-vs-rest AUC for each class. Iterate over the classifier's own
# classes_ so each probability column is paired with the label it belongs to,
# instead of assuming exactly three classes labelled 0, 1, 2.
auc_scores = {}
for column, class_label in enumerate(best_svm.classes_):
    auc = roc_auc_score((y_test == class_label).astype(int), svm_predictions[:, column])
    auc_scores[f"Class {class_label} AUC"] = auc

print("\nTest Set AUC Scores:", auc_scores)

# Calculate average AUC (unweighted mean over classes)
average_auc = np.mean(list(auc_scores.values()))
print(f"Average Test AUC: {average_auc}")

# Predict classes without threshold adjustments
y_test_pred = best_svm.predict(X_test_scaled)

# Confusion Matrix
conf_matrix = confusion_matrix(y_test, y_test_pred)
print("\nConfusion Matrix on Test Set:")
print(conf_matrix)

# Classification Report
class_report = classification_report(y_test, y_test_pred)
print("\nClassification Report on Test Set:")
print(class_report)


Original training set class distribution: Counter({0: 500, 1: 25, 2: 25})
Resampled training set class distribution: Counter({0: 500, 1: 500, 2: 500})
Fitting 10 folds for each of 10 candidates, totalling 100 fits
Best SVM Parameters: {'estimator__kernel': 'rbf', 'estimator__gamma': 'auto', 'estimator__C': 10}

Test Set AUC Scores: {'Class 0 AUC': 0.7860977777777778, 'Class 1 AUC': 0.6212994708994709, 'Class 2 AUC': 0.9751365079365079}
Average Test AUC: 0.7941779188712522

Confusion Matrix on Test Set:
[[1235  228   37]
 [  48   23    4]
 [   7    5   63]]

Classification Report on Test Set:
              precision    recall  f1-score   support

           0       0.96      0.82      0.89      1500
           1       0.09      0.31      0.14        75
           2       0.61      0.84      0.70        75

    accuracy                           0.80      1650
   macro avg       0.55      0.66      0.58      1650
weighted avg       0.90      0.80      0.84      1650

