In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import IsolationForest
from sklearn.svm import OneClassSVM
from sklearn.neighbors import LocalOutlierFactor
from sklearn.preprocessing import StandardScaler

def _add_time_step_statistics(features: pd.DataFrame) -> pd.DataFrame:
    """Return *features* extended with per-row statistics across time steps.

    A "base" feature is any column-name prefix that occurs with a ``_1`` ..
    ``_4`` suffix. For every base whose four step columns are all present,
    three columns are added: ``<base>_std``, ``<base>_mean`` and
    ``<base>_range`` (row-wise max minus min).
    """
    step_suffixes = ('_1', '_2', '_3', '_4')

    # Sort the base names: iterating a raw set of strings can yield a different
    # order on every interpreter run (hash randomization), which would shuffle
    # the derived columns and make the seeded models non-reproducible.
    base_names = sorted(
        {col[:-2] for col in features.columns if col.endswith(step_suffixes)}
    )

    derived = {}
    for base in base_names:
        step_cols = [f"{base}_{i}" for i in range(1, 5)]
        # Only bases with a complete set of four steps get statistics.
        if not all(c in features.columns for c in step_cols):
            continue
        steps = features[step_cols]
        derived[f'{base}_std'] = steps.std(axis=1)
        derived[f'{base}_mean'] = steps.mean(axis=1)
        derived[f'{base}_range'] = steps.max(axis=1) - steps.min(axis=1)

    if not derived:
        return features

    derived_df = pd.DataFrame(derived, index=features.index)
    # A derived column replaces an input column of the same name, as plain
    # column assignment would; everything is attached in a single concat
    # rather than through many one-column inserts.
    clashing = [c for c in derived_df.columns if c in features.columns]
    return pd.concat([features.drop(columns=clashing), derived_df], axis=1)


def run_task5_pipeline(
    input_path: str = 'test.csv',
    output_path: str = 'task5_submission.csv',
    contamination_rate: float = 0.05,
) -> None:
    """Flag anomalous rows with a three-model majority vote and save the result.

    Steps: load the CSV, derive time-step statistics, fill missing values with
    0, standardize the numeric columns, fit Isolation Forest, One-Class SVM and
    Local Outlier Factor, then mark a row as anomalous when at least two of
    the three models flag it.

    Args:
        input_path: CSV file to score. An ``id`` column, if present, is used
            only to label the output rows and is excluded from the features.
        output_path: Destination CSV; written with columns ``id`` and
            ``is_anomaly`` (1 = anomaly, 0 = normal).
        contamination_rate: Expected fraction of anomalies. Passed as
            ``contamination`` to Isolation Forest and LOF and as ``nu`` to the
            One-Class SVM.

    Returns:
        None. If ``input_path`` does not exist, an error is printed and the
        function returns without writing anything.

    Raises:
        ValueError: If the input has no rows or no numeric feature columns.
    """
    print("--- Starting Task 5: Account Security Monitoring (Ensemble) ---")

    # ---------------------------------------------------------
    # 1. LOAD DATA
    # ---------------------------------------------------------
    try:
        df = pd.read_csv(input_path)
    except FileNotFoundError:
        # Report the path that was actually attempted.
        print(f"Error: '{input_path}' not found. Please ensure the file exists.")
        return
    print(f"Data Loaded. Shape: {df.shape}")

    # Keep the ids for the submission, but never feed them to the models.
    player_ids = df['id'] if 'id' in df.columns else df.index
    features = df.drop(columns=['id'], errors='ignore')

    # ---------------------------------------------------------
    # 2. FEATURE ENGINEERING
    # ---------------------------------------------------------
    print("Engineering features...")
    features = _add_time_step_statistics(features)
    features = features.fillna(0)

    # ---------------------------------------------------------
    # 3. SCALING
    # ---------------------------------------------------------
    print("Scaling features...")
    # Only numeric columns can be scaled; object-dtype columns are left out.
    numeric_features = features.select_dtypes(include=np.number)
    if numeric_features.empty:
        # Fail with a clear message instead of an opaque error from the scaler.
        raise ValueError(
            f"No numeric feature data found in '{input_path}'; "
            "cannot train the anomaly detectors."
        )

    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(numeric_features)

    # ---------------------------------------------------------
    # 4. ENSEMBLE MODELING
    # ---------------------------------------------------------
    print(f"Training Ensemble Models (Contamination: {contamination_rate})...")

    # Every model's fit_predict output uses -1 for anomaly and 1 for normal.

    # --- Model A: Isolation Forest ---
    print("1. Training Isolation Forest...")
    iso_forest = IsolationForest(
        n_estimators=200,
        contamination=contamination_rate,
        random_state=42,
        n_jobs=-1
    )
    pred_iso = iso_forest.fit_predict(X_scaled)

    # --- Model B: One-Class SVM ---
    # SVMs can be slow on massive data. If >100k rows, consider SGDOneClassSVM.
    print("2. Training One-Class SVM...")
    oc_svm = OneClassSVM(
        kernel='rbf',
        nu=contamination_rate,
        gamma='scale'
    )
    pred_svm = oc_svm.fit_predict(X_scaled)

    # --- Model C: Local Outlier Factor (LOF) ---
    print("3. Training Local Outlier Factor...")
    lof = LocalOutlierFactor(
        n_neighbors=20,
        contamination=contamination_rate,
        n_jobs=-1,
        novelty=False  # LOF is strictly outlier detection here
    )
    pred_lof = lof.fit_predict(X_scaled)

    # ---------------------------------------------------------
    # 5. VOTING (HARD VOTING)
    # ---------------------------------------------------------
    print("Aggregating votes...")

    # Recode -1 (anomaly) -> 1 and 1 (normal) -> 0 so votes can be summed.
    total_votes = sum(
        np.where(pred == -1, 1, 0) for pred in (pred_iso, pred_svm, pred_lof)
    )

    # Majority rule: flag a row when 2 or 3 of the models call it an anomaly.
    final_preds = np.where(total_votes > 1, 1, 0)

    # ---------------------------------------------------------
    # 6. SUBMISSION
    # ---------------------------------------------------------
    submission = pd.DataFrame({
        'id': player_ids,
        'is_anomaly': final_preds
    })

    vote_labels = {
        0: '0 Votes (Normal)',
        1: '1 Vote',
        2: '2 Votes',
        3: '3 Votes (Strong Anomaly)',
    }
    print("\n--- Ensemble Prediction Summary ---")
    print("Model Agreement Breakdown:")
    print(pd.Series(total_votes).value_counts().sort_index().rename(vote_labels))
    print("\nFinal Decision (Majority Vote):")
    print(submission['is_anomaly'].value_counts())

    submission.to_csv(output_path, index=False)
    print(f"\nSuccessfully saved to '{output_path}'")

# Run the pipeline only when this file is executed directly (not on import).
if __name__ == "__main__":
    run_task5_pipeline()

--- Starting Task 5: Account Security Monitoring (Ensemble) ---
Data Loaded. Shape: (25889, 124)
Engineering features...
Scaling features...
Training Ensemble Models (Contamination: 0.05)...
1. Training Isolation Forest...
2. Training One-Class SVM...
3. Training Local Outlier Factor...
Aggregating votes...

--- Ensemble Prediction Summary ---
Model Agreement Breakdown:
0 Votes (Normal)            23500
1 Vote                       1089
2 Votes                      1108
3 Votes (Strong Anomaly)      192
Name: count, dtype: int64

Final Decision (Majority Vote):
is_anomaly
0    24589
1     1300
Name: count, dtype: int64

Successfully saved to 'task5_submission.csv'
