# Experiment 02: XGBoost Classification Pipeline (GPU Enabled)

## Overview
This notebook implements the XGBoost pipeline exactly as requested.
- **Core**: Feature Extraction (User Engine).
- **Preprocessing**: StandardScaler + SMOTE.
- **Model**: XGBClassifier (GPU/Hist) with RandomizedSearchCV.
- **Refinement**: Threshold Tuning.

> **Note**: This notebook requires a GPU environment (e.g. Kaggle T4, Colab). If running on CPU, change `device='cuda'` to `device='cpu'`.

In [None]:
# 1. Import & Config
# Load IPython's autoreload extension; mode 2 picks up edits to imported
# modules (such as src.data_processing) live, without a kernel restart.
%load_ext autoreload
%autoreload 2

import gc
import os
import time
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import f1_score, classification_report, confusion_matrix, precision_recall_curve
from sklearn.model_selection import train_test_split, RandomizedSearchCV, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from xgboost import XGBClassifier

# Custom module (Shared with SVM)
from src.data_processing import load_all_splits

# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Silences every Python warning for the rest of the session. Comment this out
# while debugging, because deprecation and numerical warnings are hidden too.
warnings.filterwarnings('ignore')

In [None]:
# 2. Feature Extraction
# Root folder of the raw inputs; the log CSVs are read from the same place.
BASE_PATH = 'data/raw'

# Build the per-object feature tables for each split via the shared project
# helper (presumably one row per object_id -- confirm in src.data_processing).
print("Loading Train features...")
train_lc_features = load_all_splits(BASE_PATH, mode='train')

print("Loading Test features...")
test_lc_features = load_all_splits(BASE_PATH, mode='test')

# Quick sanity check on the sizes of what was loaded.
train_shape, test_shape = train_lc_features.shape, test_lc_features.shape
print("Shape Train:", train_shape)
print("Shape Test:", test_shape)

In [None]:
# 3. Merge Metadata
# Per-object log tables live next to the raw light-curve data.
train_log_path = os.path.join(BASE_PATH, 'train_log.csv')
test_log_path = os.path.join(BASE_PATH, 'test_log.csv')
train_log = pd.read_csv(train_log_path)
test_log = pd.read_csv(test_log_path)

# Left-join the extracted features onto the logs so every logged object is
# kept, then replace every missing value (in any column) with 0.
full_train = (
    train_log
    .merge(train_lc_features, on='object_id', how='left')
    .fillna(0)
)
full_test = (
    test_log
    .merge(test_lc_features, on='object_id', how='left')
    .fillna(0)
)

display(full_train.head(3))

In [None]:
# 4. Preprocessing (Scale & SMOTE)
# Identifier, label and metadata columns that must not be fed to the model.
drop_cols = ['object_id', 'SpecType', 'English Translation', 'split', 'target', 'Z_err']
# Everything else is a feature; original column order is preserved.
feature_cols = full_train.columns[~full_train.columns.isin(drop_cols)].tolist()

print(f"Using {len(feature_cols)} features.")

# Design matrices and label vector.
y = full_train['target']
X = full_train[feature_cols]
X_test_final = full_test[feature_cols]

# Scaling (StandardScaler as requested): statistics are learned from the
# training table and re-used unchanged for the test table.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)
X_test_scaled = scaler.transform(X_test_final)

# SMOTE
# Oversamples the positive class on the *entire* scaled training matrix, so
# any split taken from X_resampled afterwards will contain synthetic rows.
print(f"Original TDE: {(y == 1).sum()}")
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_scaled, y)
print(f"After SMOTE TDE: {(y_resampled == 1).sum()}")

In [None]:
# 5. XGBoost Training
# Hold out 20% of the REAL (not oversampled) rows for validation, stratified
# so the rare positive class is represented in both parts. Splitting after
# SMOTE would put synthetic rows -- interpolated from training rows -- into
# the validation set, inflating F1 and biasing the tuned threshold.
# NOTE: X_scaled was standardised with statistics from all training rows,
# including the ones held out here; move the scaler into the pipeline below
# if that small leak matters.
X_train, X_val, y_train, y_val = train_test_split(
    X_scaled, y, test_size=0.2, stratify=y, random_state=42
)

# Config GPU XGBoost
xgb_gpu = XGBClassifier(
    tree_method='hist',
    device='cuda',        # Run on GPU (switch to device='cpu' without one)
    eval_metric='logloss',
    random_state=42,
    early_stopping_rounds=None  # no eval_set is passed during the search
    # use_label_encoder is deprecated and ignored by XGBoost versions that
    # accept `device=`, so it is no longer passed.
)

# Oversampling lives INSIDE the pipeline: during cross-validation SMOTE is
# fit on the training part of each fold only, and it is skipped entirely by
# predict / predict_proba, so scoring always happens on real rows.
# SMOTE needs more minority rows per training fold than its k_neighbors
# (default 5) -- assumed to hold for this dataset; confirm from the
# class counts printed by the preprocessing cell.
model_pipeline = Pipeline(steps=[
    ('smote', SMOTE(random_state=42)),
    ('clf', xgb_gpu),
])

# Param Grid
# SMOTE already balances the classes seen by the classifier, so the neutral
# weight is 1; the second value gives a mild extra push toward the positives.
ratio = 1.0
param_grid = {
    'clf__n_estimators': [500, 800, 1200],
    'clf__learning_rate': [0.01, 0.05, 0.1],
    'clf__max_depth': [6, 8, 10],
    'clf__subsample': [0.8, 0.9],
    'clf__colsample_bytree': [0.8, 0.9],
    'clf__scale_pos_weight': [ratio, ratio * 1.2]
}

# RandomizedSearch
kf = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
print("Running RandomizedSearchCV on GPU...")
start_time = time.time()

search = RandomizedSearchCV(
    estimator=model_pipeline,
    param_distributions=param_grid,
    n_iter=20,
    scoring='f1',
    cv=kf,
    verbose=1,
    random_state=42,
    n_jobs=1 # Required for GPU safety
)

search.fit(X_train, y_train)
print(f"Done in {(time.time() - start_time)/60:.2f} min.")
print("Best Params:", search.best_params_)

# Pipeline (SMOTE + XGBoost) refit on the whole training split with the best
# parameters; it exposes predict / predict_proba like a plain classifier.
best_model = search.best_estimator_

In [None]:
# 6. Evaluation
# Baseline score on the hold-out split using the model's own hard labels,
# i.e. before any custom decision threshold is applied.
y_pred_val = best_model.predict(X_val)
default_f1 = f1_score(y_val, y_pred_val)
print("Validation F1 (Default 0.5):", default_f1)

In [None]:
# 7. Threshold Tuning
# Positive-class probability for every validation row.
val_proba = best_model.predict_proba(X_val)
y_val_prob = val_proba[:, 1]

precisions, recalls, thresholds = precision_recall_curve(y_val, y_val_prob)

# F1 at every point of the PR curve; points where precision + recall == 0
# are assigned 0 instead of triggering a division by zero.
pr_sum = precisions + recalls
f1_scores = np.divide(
    2 * precisions * recalls,
    pr_sum,
    out=np.zeros_like(precisions),
    where=pr_sum != 0,
)

# Per the scikit-learn docs, precisions/recalls carry one trailing point
# (precision 1, recall 0) that has no matching threshold. Its F1 is 0 and
# argmax returns the first maximum, so best_idx always indexes `thresholds`.
best_idx = f1_scores.argmax()
best_f1 = f1_scores[best_idx]
best_threshold = thresholds[best_idx]

print(f"Best Threshold: {best_threshold:.4f}")
print(f"Best F1: {best_f1:.4f}")

In [None]:
# 8. Submission
# Score the scaled test matrix and binarise with the tuned threshold
# (probabilities at or above the threshold become class 1).
test_proba = best_model.predict_proba(X_test_scaled)
y_test_prob = test_proba[:, 1]
final_predictions = (y_test_prob >= best_threshold).astype(int)

# Two-column frame: the object identifier and its predicted label.
submission = full_test[['object_id']].assign(prediction=final_predictions)

# Show the predicted class balance before writing the file.
prediction_counts = submission['prediction'].value_counts()
print(prediction_counts)
submission.to_csv('submission_xgboost_gpu.csv', index=False)
print("Done.")