In [1]:
# ============================
# SEMI-SUPERVISED XGBOOST (BoT + 3% ToN) - CORRECTED FOR KAGGLE GPU
# ============================

!pip install -q xgboost scipy

import numpy as np
import pandas as pd
from scipy.stats import ks_2samp
from sklearn.preprocessing import QuantileTransformer
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score,
    f1_score, roc_auc_score, classification_report, confusion_matrix
)
from imblearn.over_sampling import SMOTE
import xgboost as xgb
import warnings
warnings.filterwarnings("ignore")

# One seed drives NumPy's global RNG and every random_state below.
SEED = 42
np.random.seed(SEED)

# ============================
# 1. LOAD DATA
# ============================
BOT_PATH = "/kaggle/input/cicbotiot/CIC-BoT-IoT-V2.parquet"
TON_PATH = "/kaggle/input/cictoniot/CIC-ToN-IoT-V2.parquet"

# Candidate flow-level features read from both datasets.
FEATURES = [
    'Flow Duration',
    'Total Fwd Packets',
    'Total Backward Packets',
    'Fwd Packet Length Max',
    'Bwd Packet Length Max',
    'Flow Bytes/s',
    'Flow Packets/s',
    'Flow IAT Mean',
]

# Only the candidate features and the label column are read from disk.
load_columns = [*FEATURES, 'Label']

# BoT is down-sampled to a reproducible 5% of its rows; ToN is kept in full.
bot = pd.read_parquet(BOT_PATH, columns=load_columns)
bot = bot.sample(frac=0.05, random_state=SEED)
ton = pd.read_parquet(TON_PATH, columns=load_columns)

# Feature frames stay as DataFrames; labels become plain arrays.
X_bot_df, y_bot = bot[FEATURES], bot['Label'].values
X_ton_df, y_ton = ton[FEATURES], ton['Label'].values

# ============================
# 2. DRIFT-AWARE FEATURE SELECTION (TOP-K)
# ============================
# Number of lowest-drift features to keep.
TOP_K = 4

# Rank every candidate feature by the two-sample Kolmogorov-Smirnov statistic
# between its BoT and ToN distributions (smaller = more similar).
# NOTE(review): the statistic is computed on all ToN rows, including the rows
# that are held out for evaluation further down; labels are not used here.
drift = []
for f in FEATURES:
    # Drop missing values first: a NaN in either sample makes the KS statistic
    # NaN/undefined, which would silently push the feature to the end of the
    # ranking regardless of its real drift.
    bot_vals = X_bot_df[f].dropna()
    ton_vals = X_ton_df[f].dropna()
    if bot_vals.empty or ton_vals.empty:
        # Nothing to compare: treat as maximal drift (KS is bounded by 1).
        ks = 1.0
    else:
        ks = ks_2samp(bot_vals, ton_vals).statistic
    drift.append((f, ks))

# Ascending sort puts the most stable features first.
drift_df = pd.DataFrame(drift, columns=["Feature", "KS"]).sort_values("KS")
selected_features = drift_df.head(TOP_K)["Feature"].tolist()
print("Selected features:", selected_features)

# Downstream steps work on plain arrays restricted to the selected columns.
X_bot = X_bot_df[selected_features].values
X_ton = X_ton_df[selected_features].values

# ============================
# 3. SEMI-SUPERVISED SPLIT (3% ToN)
# ============================
# Stratified split of ToN: 3% is kept as labeled training data, the
# remainder is held out for evaluation.
ton_split = train_test_split(
    X_ton,
    y_ton,
    train_size=0.03,
    stratify=y_ton,
    random_state=SEED,
)
X_ton_train, X_ton_test, y_ton_train, y_ton_test = ton_split

print("Using ToN labeled samples:", X_ton_train.shape[0])
print("Evaluating on unseen ToN samples:", X_ton_test.shape[0])

# ============================
# 4. SMOTE (BoT ONLY)
# ============================
# Oversample the BoT rows only; the ToN rows are left untouched.
smote = SMOTE(random_state=SEED)
X_bot, y_bot = smote.fit_resample(X_bot, y_bot)

# ============================
# 5. COMBINE DATA
# ============================
# Training set = resampled BoT rows followed by the labeled ToN slice.
X_train_all = np.concatenate((X_bot, X_ton_train), axis=0)
y_train_all = np.concatenate((y_bot, y_ton_train))

# ============================
# 6. QUANTILE SCALING (rank-based, robust to outliers)
# ============================
# Map every feature onto a normal distribution via its empirical quantiles.
quantile_opts = dict(
    n_quantiles=300,
    output_distribution="normal",
    random_state=SEED,
)
scaler = QuantileTransformer(**quantile_opts)

# The quantiles are learned from the combined training matrix only; the
# held-out ToN rows are transformed with those already-fitted quantiles.
X_train_all = scaler.fit_transform(X_train_all)
X_ton_test = scaler.transform(X_ton_test)

# ============================
# 7. XGBOOST MODEL - CORRECTED FOR KAGGLE
# ============================
# Try GPU training first; fall back to CPU if XGBoost reports a failure.
def _make_xgb(n_estimators, **backend):
    """Build an XGBClassifier with the shared hyper-parameters.

    Parameters
    ----------
    n_estimators : int
        Number of boosting rounds.
    **backend
        Backend-specific options (e.g. ``tree_method``, ``device``) passed
        through unchanged to ``xgb.XGBClassifier``.

    Returns
    -------
    xgb.XGBClassifier
        An unfitted classifier.
    """
    return xgb.XGBClassifier(
        n_estimators=n_estimators,
        max_depth=6,
        learning_rate=0.05,
        subsample=0.8,
        colsample_bytree=0.8,
        eval_metric="logloss",
        random_state=SEED,
        verbosity=0,
        **backend,
    )


# XGBoost 2.0 deprecated tree_method="gpu_hist" in favour of
# device="cuda" + tree_method="hist"; keep the legacy spelling for 1.x.
if int(xgb.__version__.split(".")[0]) >= 2:
    gpu_backend = {"tree_method": "hist", "device": "cuda"}
else:
    gpu_backend = {"tree_method": "gpu_hist"}

# The two backends deliberately use different tree counts (700 on GPU,
# 500 on CPU), so metrics from a GPU run and a CPU run are not directly
# comparable.
try:
    model = _make_xgb(700, **gpu_backend)
    print("Attempting to use GPU acceleration...")
    model.fit(X_train_all, y_train_all)
    # NOTE(review): some XGBoost releases may warn and run on CPU instead of
    # raising when no GPU is visible; with verbosity=0 that warning is hidden,
    # so this message is not proof a GPU was used - confirm on the target image.
    print("✅ Model trained with GPU acceleration")

except xgb.core.XGBoostError as e:
    # Only XGBoost-level failures trigger the fallback; unrelated errors
    # (bad input shapes, bad labels, ...) surface immediately instead of
    # being retried on CPU.
    print(f"GPU training failed: {e}")
    print("Falling back to CPU training...")

    model = _make_xgb(500, tree_method="hist")  # fewer rounds on CPU
    model.fit(X_train_all, y_train_all)
    print("✅ Model trained with CPU")

# ============================
# 8. EVALUATION (UNSEEN ToN)
# ============================
# Positive-class probability for each held-out ToN row, cut at 0.5.
probs = model.predict_proba(X_ton_test)[:, 1]
preds = (probs > 0.5).astype(int)

print("\n=== Semi-Supervised Cross-Dataset (BoT + 3% ToN → 97% ToN) ===")

# Each metric is computed lazily, right before it is printed.
metric_rows = (
    ("Accuracy:", lambda: accuracy_score(y_ton_test, preds)),
    ("Precision:", lambda: precision_score(y_ton_test, preds, zero_division=0)),
    ("Recall:", lambda: recall_score(y_ton_test, preds, zero_division=0)),
    ("F1:", lambda: f1_score(y_ton_test, preds, zero_division=0)),
    ("AUC:", lambda: roc_auc_score(y_ton_test, probs)),  # AUC uses raw probabilities
)
for metric_label, compute_metric in metric_rows:
    print(metric_label, compute_metric())

# Text reports built from the thresholded predictions.
report_rows = (
    ("\nConfusion Matrix:", confusion_matrix),
    ("\nClassification Report:", classification_report),
)
for report_title, build_report in report_rows:
    print(report_title)
    print(build_report(y_ton_test, preds))

print("\n✅ Semi-supervised training completed")

Selected features: ['Total Backward Packets', 'Total Fwd Packets', 'Bwd Packet Length Max', 'Fwd Packet Length Max']
Using ToN labeled samples: 145424
Evaluating on unseen ToN samples: 4702075
Attempting to use GPU acceleration...
✅ Model trained with GPU acceleration

=== Semi-Supervised Cross-Dataset (BoT + 3% ToN → 97% ToN) ===
Accuracy: 0.8976985692486827
Precision: 0.9774232561715643
Recall: 0.8372640545299562
F1: 0.901930986818566
AUC: 0.9469987769988335

Confusion Matrix:
[[2009058   51093]
 [ 429936 2211988]]

Classification Report:
              precision    recall  f1-score   support

           0       0.82      0.98      0.89   2060151
           1       0.98      0.84      0.90   2641924

    accuracy                           0.90   4702075
   macro avg       0.90      0.91      0.90   4702075
weighted avg       0.91      0.90      0.90   4702075


✅ Semi-supervised training completed
