In [43]:
from sklearn.cluster import KMeans
from sklearn.ensemble import IsolationForest
from sklearn.metrics import roc_auc_score, silhouette_score
from sklearn.preprocessing import LabelBinarizer, StandardScaler
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.svm import OneClassSVM
from sklearn.neighbors import KNeighborsClassifier
from scipy.stats import skew, kurtosis

import numpy as np
import pandas as pd

In [35]:
data = np.load('first_batch_multi_labels.npz')

X = data['X']
y = data['y']
yy = data['yy']

labels = yy[:, 1]
labels

df = pd.DataFrame(X, columns=['user', 'item', 'rating'])

df_y = pd.DataFrame(yy, columns=['user', 'label'])

In [44]:
# Step 1: Feature Engineering
df_user_features = df.groupby('user').agg(
    mean_rating=('rating', 'mean'),
    median_rating=('rating', 'median'),
    std_rating=('rating', 'std'),
    count_dislike=('rating', lambda x: (x == -10).sum()),
    count_neutral=('rating', lambda x: (x == 0).sum()),
    count_like=('rating', lambda x: (x == 10).sum()),
    count_watched=('rating', lambda x: (x == 1).sum()),
    total_interactions=('rating', 'count')
)

# Ratio features
df_user_features['like_ratio'] = df_user_features['count_like'] / df_user_features['total_interactions']
df_user_features['dislike_ratio'] = df_user_features['count_dislike'] / df_user_features['total_interactions']
df_user_features['neutral_ratio'] = df_user_features['count_neutral'] / df_user_features['total_interactions']
df_user_features['watched_ratio'] = df_user_features['count_watched'] / df_user_features['total_interactions']

# Interaction patterns
df_user_features['like_to_dislike_ratio'] = df_user_features['count_like'] / (df_user_features['count_dislike'] + 1)
df_user_features['rating_variance'] = df.groupby('user')['rating'].var()

# Distribution features
df_user_features['rating_skew'] = df.groupby('user')['rating'].apply(lambda x: skew(x))
df_user_features['rating_kurtosis'] = df.groupby('user')['rating'].apply(lambda x: kurtosis(x))

# User behavior patterns
df_user_features['rating_range'] = df.groupby('user')['rating'].apply(lambda x: x.max() - x.min())
df_user_features['unique_items_ratio'] = df.groupby('user')['item'].nunique() / df_user_features['total_interactions']

# Step 2: Merge user features with labels from yy
df_labels = pd.DataFrame(yy, columns=['user', 'label'])
df_merged = pd.merge(df_user_features.reset_index(), df_labels, on='user')

# Step 3: Split the data into train and test sets (train on class 0 only, evaluate on all)
train_df, test_df = train_test_split(df_merged, test_size=0.2, stratify=df_merged['label'], random_state=42)

# Train set for class 0 only
train_class_0 = train_df[train_df['label'] == 0]
X_train_class_0 = train_class_0.drop(columns=['user', 'label'])

# Test set for all classes (0, 1, and 2)
X_test = test_df.drop(columns=['user', 'label'])
y_test = test_df['label']

# Standardize the features
scaler = StandardScaler()
X_scaled_train_0 = scaler.fit_transform(X_train_class_0)
X_scaled_test = scaler.transform(X_test)

# Step 4: Train One-Class SVM on class 0 (train set)
oc_svm_class_0 = OneClassSVM(kernel='rbf', nu=0.1, gamma='scale')
oc_svm_class_0.fit(X_scaled_train_0)

# Step 5: Predict on the test set (class 0, 1, and 2)
test_predictions = oc_svm_class_0.predict(X_scaled_test)
# Map predictions to class labels (normal/inliers are class 0, anomalies are class 1 and 2)
test_predicted_labels = np.where(test_predictions == -1, 1, 0)

# Step 6: Compute AUC for each class on the test set

# AUC for class 0 vs (class 1 and class 2)
lb = LabelBinarizer()
binarized_labels_0 = lb.fit_transform(np.where(y_test == 0, 1, 0))  # Class 0 vs rest
auc_class_0 = roc_auc_score(binarized_labels_0, np.where(test_predicted_labels == 0, 1, 0))

# AUC for class 1 vs (class 0 and class 2)
binarized_labels_1 = lb.fit_transform(np.where(y_test == 1, 1, 0))  # Class 1 vs rest
auc_class_1 = roc_auc_score(binarized_labels_1, np.where(test_predicted_labels == 1, 1, 0))

# AUC for class 2 vs (class 0 and class 1)
binarized_labels_2 = lb.fit_transform(np.where(y_test == 2, 1, 0))  # Class 2 vs rest
auc_class_2 = roc_auc_score(binarized_labels_2, np.where(test_predicted_labels == 1, 1, 0))  # Anomalies as class 1

# Output the AUC scores for each class
print(f"AUC score for class 0 (test set): {auc_class_0}")
print(f"AUC score for class 1 (test set): {auc_class_1}")
print(f"AUC score for class 2 (test set): {auc_class_2}")

# Mean AUC on the test set
mean_auc_test = np.mean([auc_class_0, auc_class_1, auc_class_2])
print(f"Mean AUC (test set): {mean_auc_test}")

AUC score for class 0 (test set): 0.7200000000000001
AUC score for class 1 (test set): 0.6309523809523809
AUC score for class 2 (test set): 0.7880952380952381
Mean AUC (test set): 0.713015873015873
