In [1]:
# Import necessary libraries
import numpy as np
import pandas as pd

# For modeling and evaluation
from sklearn.model_selection import train_test_split, RandomizedSearchCV, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score, confusion_matrix, classification_report
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import IsolationForest
from xgboost import XGBClassifier
from collections import Counter

# Ignore warnings
import warnings
warnings.filterwarnings('ignore')

# Set random seed for reproducibility
random_state = 42
np.random.seed(random_state)

# --------------------------------------------
# Step 1: Load and Preprocess Data
# --------------------------------------------

# Load data from the third and fourth batches
third_batch = np.load('third_batch_multi_labels.npz', allow_pickle=True)
fourth_batch = np.load('fourth_batch_multi_labels.npz', allow_pickle=True)

# Convert numpy arrays to pandas DataFrames
third_X_df = pd.DataFrame(third_batch['X'], columns=['user_id', 'item_id', 'rating'])
third_labels_df = pd.DataFrame(third_batch['yy'], columns=['user_id', 'label'])

fourth_X_df = pd.DataFrame(fourth_batch['X'], columns=['user_id', 'item_id', 'rating'])
fourth_labels_df = pd.DataFrame(fourth_batch['yy'], columns=['user_id', 'label'])

# Combine third and fourth batches for training
X_df = pd.concat([third_X_df, fourth_X_df], ignore_index=True)
labels_df = pd.concat([third_labels_df, fourth_labels_df], ignore_index=True)

# --------------------------------------------
# Step 2: Feature Engineering
# --------------------------------------------

def create_user_features(df):
    # Count of interactions
    interaction_counts = df.groupby('user_id')['item_id'].count().rename('total_interactions')
    
    # Statistics of ratings
    rating_stats = df.groupby('user_id')['rating'].agg(['mean', 'std', 'min', 'max']).rename(
        columns={'mean': 'rating_mean', 'std': 'rating_std', 'min': 'rating_min', 'max': 'rating_max'}
    )
    
    # Count of each rating type
    rating_counts = df.groupby(['user_id', 'rating']).size().unstack(fill_value=0)
    rating_counts.columns = [f'rating_{col}' for col in rating_counts.columns]
    
    # Ratio of each rating type
    rating_ratios = rating_counts.div(rating_counts.sum(axis=1), axis=0)
    rating_ratios.columns = [f'ratio_{col}' for col in rating_ratios.columns]
    
    # Interaction diversity (number of unique items)
    item_diversity = df.groupby('user_id')['item_id'].nunique().rename('unique_items')
    
    # Combine all features
    user_features = pd.concat([
        interaction_counts, rating_stats, rating_counts,
        rating_ratios, item_diversity
    ], axis=1)
    
    # Fill NaN values in rating_std
    user_features['rating_std'] = user_features['rating_std'].fillna(0)
    
    # Replace infinite values with NaN and then fill with 0
    user_features.replace([np.inf, -np.inf], np.nan, inplace=True)
    user_features.fillna(0, inplace=True)
    
    return user_features.reset_index()

# Create user-level features
user_features = create_user_features(X_df)

# Merge features with labels
user_data = user_features.merge(labels_df, on='user_id')

# --------------------------------------------
# Step 3: Prepare Training and Validation Sets
# --------------------------------------------

# Separate features and labels
X = user_data.drop(['user_id', 'label'], axis=1)
y = user_data['label']

# Split into training and validation sets before scaling
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=random_state
)

# Check class distribution
print('Original training set class distribution:', Counter(y_train))

# --------------------------------------------
# Step 4: Data Scaling
# --------------------------------------------

# Standardize features (excluding anomaly_score, which will be added later)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_valid_scaled = scaler.transform(X_valid)

# --------------------------------------------
# Step 5: Anomaly Detection with Isolation Forest
# --------------------------------------------

# Fit Isolation Forest on scaled training data
iso_forest = IsolationForest(
    n_estimators=100,
    contamination='auto',
    random_state=random_state
)
iso_forest.fit(X_train_scaled)

# Compute anomaly scores for training data
anomaly_scores_train = iso_forest.decision_function(X_train_scaled)
anomaly_scores_train = -anomaly_scores_train  # Invert scores so higher scores indicate anomalies

# Compute anomaly scores for validation data
anomaly_scores_valid = iso_forest.decision_function(X_valid_scaled)
anomaly_scores_valid = -anomaly_scores_valid

# Add anomaly scores to the scaled features
X_train_scaled = np.hstack([X_train_scaled, anomaly_scores_train.reshape(-1, 1)])
X_valid_scaled = np.hstack([X_valid_scaled, anomaly_scores_valid.reshape(-1, 1)])

# --------------------------------------------
# Step 6: Handling Class Imbalance with SMOTE
# --------------------------------------------

# Initialize SMOTE
smote = SMOTE(random_state=random_state, sampling_strategy='auto')

# Apply SMOTE to training data
X_train_resampled, y_train_resampled = smote.fit_resample(X_train_scaled, y_train)

# Check new class distribution
print('Resampled training set class distribution:', Counter(y_train_resampled))

# --------------------------------------------
# Step 7: Model Training with XGBoost
# --------------------------------------------

# Initialize XGBoost Classifier
xgb_clf = XGBClassifier(
    objective='multi:softprob',
    num_class=3,
    eval_metric='mlogloss',
    use_label_encoder=False,
    random_state=random_state,
    n_jobs=-1
)

# Define parameter grid for hyperparameter tuning
param_grid_xgb = {
    'n_estimators': [500],
    'max_depth': [5, 10],
    'learning_rate': [0.05, 0.1],
    'subsample': [0.8],
    'colsample_bytree': [0.8],
    'gamma': [0],
    'reg_alpha': [0],
    'reg_lambda': [1],
}

# Initialize RandomizedSearchCV
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=random_state)
random_search_xgb = RandomizedSearchCV(
    estimator=xgb_clf,
    param_distributions=param_grid_xgb,
    n_iter=10,
    scoring='roc_auc_ovr_weighted',
    cv=skf,
    n_jobs=-1,
    verbose=1,
    random_state=random_state
)

# Fit RandomizedSearchCV
random_search_xgb.fit(X_train_resampled, y_train_resampled)

# Best estimator
best_xgb = random_search_xgb.best_estimator_
print(f"Best XGBoost Parameters: {random_search_xgb.best_params_}")

# --------------------------------------------
# Step 8: Model Evaluation
# --------------------------------------------

# Predict probabilities on validation set
xgb_predictions = best_xgb.predict_proba(X_valid_scaled)

# Compute AUC scores for each class
auc_scores = {}
for i in range(3):
    auc = roc_auc_score((y_valid == i).astype(int), xgb_predictions[:, i])
    auc_scores[f"Class {i} AUC"] = auc

print("\nValidation Set AUC Scores:", auc_scores)

# Calculate average AUC
average_auc = np.mean(list(auc_scores.values()))
print(f"Average Validation AUC: {average_auc}")

# Predict classes without threshold adjustments
y_valid_pred = best_xgb.predict(X_valid_scaled)

# Confusion Matrix
conf_matrix = confusion_matrix(y_valid, y_valid_pred)
print("\nConfusion Matrix on Validation Set:")
print(conf_matrix)

# Classification Report
class_report = classification_report(y_valid, y_valid_pred)
print("\nClassification Report on Validation Set:")
print(class_report)

# --------------------------------------------
# Step 9: Prepare Test Data (Fifth Batch)
# --------------------------------------------

# Load the fifth batch data which is the test set with unlabeled users
fifth_batch = np.load('fifth_batch_multi.npz', allow_pickle=True)
fifth_X = fifth_batch['X']
fifth_X_df = pd.DataFrame(fifth_X, columns=['user_id', 'item_id', 'rating'])

# Create user-level features for the fifth batch
fifth_user_features = create_user_features(fifth_X_df)

# Ensure all features are aligned
missing_cols = set(X_train.columns) - set(fifth_user_features.columns)
for col in missing_cols:
    fifth_user_features[col] = 0

# Reorder columns to match training data
fifth_user_features = fifth_user_features[X_train.columns]

# Scaling (excluding anomaly_score, which will be added next)
fifth_X_scaled = scaler.transform(fifth_user_features)

# Compute anomaly scores for the fifth batch
fifth_anomaly_scores = iso_forest.decision_function(fifth_X_scaled)
fifth_anomaly_scores = -fifth_anomaly_scores  # Invert scores so higher scores indicate anomalies

# Add anomaly scores to the scaled features
fifth_X_scaled = np.hstack([fifth_X_scaled, fifth_anomaly_scores.reshape(-1, 1)])

# --------------------------------------------
# Step 10: Predict on Test Data
# --------------------------------------------

# Predict probabilities on test data
fifth_xgb_predictions = best_xgb.predict_proba(fifth_X_scaled)

# Ensure the predicted probabilities sum to 1 for each user
assert np.allclose(fifth_xgb_predictions.sum(axis=1), 1, atol=1e-6), "Probabilities do not sum to 1."

# Save the predicted probabilities to .npz file
np.savez('fifth_batch_scores.npz', scores=fifth_xgb_predictions)

# Predict classes without threshold adjustments
fifth_y_pred = best_xgb.predict(fifth_X_scaled)

# Count the number of instances predicted for each class
class_counts = pd.Series(fifth_y_pred).value_counts().sort_index()
print("\nPredicted Class Counts on Test Set:")
for i, count in class_counts.items():
    print(f"Class {i}: {count}")

# Total anomalies
num_anomalies = class_counts.get(1, 0) + class_counts.get(2, 0)
print(f"\nTotal anomalies predicted (Class 1 and 2): {num_anomalies}")


Original training set class distribution: Counter({0: 1200, 1: 36, 2: 36})
Resampled training set class distribution: Counter({0: 1200, 1: 1200, 2: 1200})
Fitting 5 folds for each of 4 candidates, totalling 20 fits


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encode

Best XGBoost Parameters: {'subsample': 0.8, 'reg_lambda': 1, 'reg_alpha': 0, 'n_estimators': 500, 'max_depth': 5, 'learning_rate': 0.1, 'gamma': 0, 'colsample_bytree': 0.8}

Validation Set AUC Scores: {'Class 0 AUC': 0.7562962962962962, 'Class 1 AUC': 0.6425746134483998, 'Class 2 AUC': 0.95361380798274}
Average Validation AUC: 0.7841615725758121

Confusion Matrix on Validation Set:
[[277  20   3]
 [  7   1   1]
 [  6   0   3]]

Classification Report on Validation Set:
              precision    recall  f1-score   support

           0       0.96      0.92      0.94       300
           1       0.05      0.11      0.07         9
           2       0.43      0.33      0.38         9

    accuracy                           0.88       318
   macro avg       0.48      0.46      0.46       318
weighted avg       0.91      0.88      0.90       318


Predicted Class Counts on Test Set:
Class 0: 1429
Class 1: 143
Class 2: 78

Total anomalies predicted (Class 1 and 2): 221
