# Model: XGBoost (final)


In [1]:
# Train the final XGBoost model using the best hyperparameters provided by the user.
# Robust behavior:
#  - Prefer `X_train_fs`/`X_test_fs` and `y_train_model`/`y_test_model` if available in notebook globals.
#  - Otherwise load `../data/dataset_final_processed.csv`, prepare train/test (stratified split), clean and impute, run SelectKBest(k=8) if needed, then train.

import warnings
warnings.filterwarnings('ignore')

from xgboost import XGBClassifier
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.model_selection import train_test_split

# Best parameters provided by user
best_params = {'colsample_bytree': 1.0, 'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 200, 'subsample': 0.8}


def _first_global(*names):
    """Return ``(name, value)`` for the first listed notebook global that is not None.

    Returns ``(None, None)`` when none of the names holds a non-None value.
    """
    for name in names:
        value = globals().get(name)
        if value is not None:
            return name, value
    return None, None


def _as_frame(features):
    """Return `features` as a DataFrame so the column-based cleaning below works.

    DataFrames and None pass through unchanged. Anything else is treated as a
    2-D array and wrapped with generic string column names (f0, f1, ...).
    """
    if features is None or isinstance(features, pd.DataFrame):
        return features
    values = np.asarray(features)
    return pd.DataFrame(values, columns=[f'f{i}' for i in range(values.shape[1])])


# 1) Try to obtain data from notebook globals (most processed variant first)
train_source, X_tr = _first_global('X_train_fs', 'X_train_model', 'X_train_proc')
if train_source == 'X_train_fs':
    print('Using X_train_fs from notebook globals (selected features).')
elif train_source is not None:
    print(f'Using {train_source} from notebook globals.')

X_te = _first_global('X_test_fs', 'X_test_model', 'X_test_proc')[1]
y_tr = _first_global('y_train_model', 'y_train')[1]
y_te = _first_global('y_test_model', 'y_test')[1]

# 2) If missing, load dataset and prepare a minimal train/test
if X_tr is None or y_tr is None:
    print('Train globals not found — loading dataset and preparing train/test split from ../data/dataset_final_processed.csv')
    df = pd.read_csv('../data/dataset_final_processed.csv')

    # Drop any leakage column 'grav' (user requested)
    if 'grav' in df.columns:
        print("Dropping feature 'grav' from loaded dataset (user requested).")
        df = df.drop(columns=['grav'])

    # Ensure target exists
    if 'grave' not in df.columns:
        raise RuntimeError("The loaded dataset does not contain a 'grave' target column.")

    # Build X, y
    y = df['grave'].copy()
    X = df.drop(columns=['grave'])

    # Minimal cleaning: coerce object columns to numeric when possible.
    # Parentheses/whitespace are stripped and decimal commas become dots before parsing.
    non_numeric = X.select_dtypes(exclude=[np.number]).columns.tolist()
    dropped_object_cols = []
    for col in non_numeric:
        cleaned = (
            X[col].astype(str)
            .str.replace(r'[()\s]', '', regex=True)
            .str.replace(',', '.', regex=False)
        )
        conv = pd.to_numeric(cleaned, errors='coerce')
        # Keep the column only if at least 80% of its values parsed as numbers
        if conv.notna().mean() >= 0.8:
            X[col] = conv
        else:
            # drop unreliable object columns
            dropped_object_cols.append(col)
    if dropped_object_cols:
        print('Dropped non-convertible object columns:', dropped_object_cols)
        X.drop(columns=dropped_object_cols, inplace=True, errors='ignore')

    # Stratified split
    X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.20, stratify=y, random_state=42)
    print(f'Prepared train/test shapes: X_tr={X_tr.shape}, X_te={X_te.shape}')

# Globals may hold plain arrays (e.g. the output of a selector's transform);
# everything below relies on DataFrame columns.
X_tr = _as_frame(X_tr)
X_te = _as_frame(X_te)

# 3) Ensure features do not contain columns entirely NaN in train.
#    SimpleImputer discards such columns by default, which would break the
#    column-name alignment of the imputed frame built in step 4.
all_nan_cols = X_tr.columns[X_tr.isna().all().to_numpy()].tolist()
if all_nan_cols:
    print('Dropping columns entirely NaN in train:', all_nan_cols)
    X_tr = X_tr.drop(columns=all_nan_cols, errors='ignore')
    if X_te is not None:
        X_te = X_te.drop(columns=all_nan_cols, errors='ignore')

# 4) Impute remaining NaNs with median (fit on train)
numeric_cols = X_tr.select_dtypes(include=[np.number]).columns.tolist()
if len(numeric_cols) == 0:
    raise RuntimeError('No numeric features available after cleaning — cannot train XGBoost.')

imputer = SimpleImputer(strategy='median')
X_tr_imp = pd.DataFrame(imputer.fit_transform(X_tr[numeric_cols]), columns=numeric_cols, index=X_tr.index)
X_te_imp = None
if X_te is not None:
    # The imputer was fitted on exactly `numeric_cols`, so the test frame must
    # expose the same columns in the same order. Columns absent from the test
    # set are created as NaN and therefore receive the train median.
    missing_in_test = [c for c in numeric_cols if c not in X_te.columns]
    if missing_in_test:
        print('Warning: test set lacks train columns; filling them with train medians:', missing_in_test)
    X_te_aligned = X_te.reindex(columns=numeric_cols)
    X_te_imp = pd.DataFrame(imputer.transform(X_te_aligned), columns=numeric_cols, index=X_te.index)

# 5) Feature selection: use existing selected_features_kbest if available, else compute SelectKBest(k=8)
selected = []
kbest_global = globals().get('selected_features_kbest')
if kbest_global is not None and len(kbest_global) > 0:
    selected = [c for c in kbest_global if c in X_tr_imp.columns]
    if selected:
        print('Using selected_features_kbest from notebook globals:', selected)
    else:
        print('selected_features_kbest has no columns in common with the training data; recomputing SelectKBest.')

if not selected:
    k = min(8, X_tr_imp.shape[1])
    selector = SelectKBest(score_func=f_classif, k=k)
    selector.fit(X_tr_imp, y_tr)
    mask = selector.get_support()
    selected = list(X_tr_imp.columns[mask])
    print(f'SelectKBest selected {len(selected)} features (top {k}):', selected)

# Build final train/test feature matrices for modeling
X_train_final = X_tr_imp.loc[:, selected].copy()
X_test_final = X_te_imp.loc[:, selected].copy() if X_te_imp is not None else None

print('Final modeling shapes -> X_train_final:', X_train_final.shape, 'X_test_final:', None if X_test_final is None else X_test_final.shape)

# 6) Train XGBoost with the provided best params
# NOTE(review): `use_label_encoder` is deprecated in recent XGBoost releases — confirm
# against the installed version before removing it.
print('Training XGBoost with params:', best_params)
xgb_final = XGBClassifier(**best_params, use_label_encoder=False, eval_metric='logloss', random_state=42, n_jobs=-1, verbosity=1)

xgb_final.fit(X_train_final, y_tr)

# 7) Evaluate
if 'evaluate_model' in globals():
    print('\nEvaluating using notebook evaluate_model helper:')
    # When no test set exists the helper receives the training data twice,
    # so the "test" figures it reports are training metrics in that case.
    globals()['evaluate_model']('XGBoost (final)', xgb_final, X_train_final, X_test_final if X_test_final is not None else X_train_final, y_tr, y_te if y_te is not None else y_tr)
else:
    from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
    if X_test_final is None or y_te is None:
        print('No test set available; showing training metrics only.')
        y_pred_tr = xgb_final.predict(X_train_final)
        print('Train Accuracy:', accuracy_score(y_tr, y_pred_tr))
        print('Train F1:', f1_score(y_tr, y_pred_tr))
    else:
        y_pred = xgb_final.predict(X_test_final)
        print('Test Accuracy:', accuracy_score(y_te, y_pred))
        print('Test F1:', f1_score(y_te, y_pred))
        print('Precision:', precision_score(y_te, y_pred))
        print('Recall:', recall_score(y_te, y_pred))

# 8) Expose final model and metadata
globals().update({'xgb_final_model': xgb_final, 'xgb_final_params': best_params, 'xgb_final_features': selected})
print('\nSaved final model as `xgb_final_model` and selected features as `xgb_final_features`.')


Train globals not found — loading dataset and preparing train/test split from ../data/dataset_final_processed.csv
Prepared train/test shapes: X_tr=(44241, 30), X_te=(11061, 30)
Dropping columns entirely NaN in train: ['lat', 'long', 'v2', 'heure']
SelectKBest selected 8 features (top 8): ['agg', 'col', 'nb_usagers', 'nb_vehicules', 'v1', 'plan', 'situ', 'vma']
Final modeling shapes -> X_train_final: (44241, 8) X_test_final: (11061, 8)
Training XGBoost with params: {'colsample_bytree': 1.0, 'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 200, 'subsample': 0.8}
Test Accuracy: 0.8944941687008408
Test F1: 0.5471478463329453
Precision: 0.6904995102840352
Recall: 0.4530848329048843

Saved final model as `xgb_final_model` and selected features as `xgb_final_features`.


# Cross-validated recall optimization and final training

In [2]:
import warnings
warnings.filterwarnings('ignore')

from sklearn.model_selection import StratifiedKFold, GridSearchCV, cross_val_predict
from sklearn.metrics import recall_score, precision_score, f1_score, fbeta_score, accuracy_score, roc_auc_score
import numpy as np

# Require training matrices prepared by previous cell
if 'X_train_final' not in globals() or 'y_tr' not in globals():
    raise RuntimeError('X_train_final and y_tr must be available. Run the previous cell that prepares and trains the model first.')

X_train_cv = globals()['X_train_final']
y_train_cv = globals()['y_tr']
X_test_cv = globals().get('X_test_final')
y_test_cv = globals().get('y_te')

print(f"CV tuning on data: X_train_cv={X_train_cv.shape}, y_train_cv={len(y_train_cv)}")

# Base XGBoost with the user's best params as starting config
base_params = globals().get('xgb_final_params', None)
if base_params is None:
    raise RuntimeError('Base XGBoost params not found in globals (`xgb_final_params`).')

from xgboost import XGBClassifier
base_xgb = XGBClassifier(**base_params, use_label_encoder=False, eval_metric='logloss', random_state=42, n_jobs=-1, verbosity=0)

# Candidate grid – we only vary scale_pos_weight and subsample slightly to improve recall
# Compute a heuristic class ratio (negatives per positive, at least 1)
neg = int((y_train_cv == 0).sum())
pos = int((y_train_cv == 1).sum())
if pos == 0:
    raise RuntimeError('No positive samples (label 1) in the training target — cannot tune for recall.')
ratio = max(1, int(round(neg / pos)))
# dict.fromkeys drops duplicate candidates (e.g. when ratio == 1 or the base
# subsample is already 1.0) while keeping their order, avoiding redundant fits.
param_grid = {
    'scale_pos_weight': list(dict.fromkeys([1, ratio, max(1, ratio // 2), ratio * 2])),
    'subsample': list(dict.fromkeys([base_params.get('subsample', 0.8), 1.0])),
}

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# NOTE(review): scoring on recall alone tends to reward the largest
# scale_pos_weight; the F2 threshold search below is what trades some of that
# recall back for precision.
grid = GridSearchCV(base_xgb, param_grid, scoring='recall', cv=cv, n_jobs=-1, verbose=1)
print('Starting GridSearchCV (scoring=recall) over param grid:', param_grid)
grid.fit(X_train_cv, y_train_cv)

print('\nGridSearchCV complete.')
print('Best params (recall):', grid.best_params_)
print('Best CV recall:', grid.best_score_)

best_xgb = grid.best_estimator_

# Compute out-of-fold predicted probabilities for threshold search
print('\nComputing out-of-fold probabilities for threshold optimization (F2).')
oof_proba = cross_val_predict(best_xgb, X_train_cv, y_train_cv, cv=cv, method='predict_proba', n_jobs=-1)[:, 1]

# Search thresholds to maximize F2 score (beta=2 emphasizes recall).
# zero_division=0 scores a threshold that predicts no positives as 0 without
# needing a blanket try/except; argmax keeps the lowest threshold on ties.
thresholds = np.linspace(0.01, 0.99, 99)
f2_scores = [
    fbeta_score(y_train_cv, (oof_proba >= thr).astype(int), beta=2, zero_division=0)
    for thr in thresholds
]
best_idx = int(np.argmax(f2_scores))
best_thr = thresholds[best_idx]
best_f2 = f2_scores[best_idx]

print(f'Optimal threshold (F2 on OOF): {best_thr:.3f} with F2={best_f2:.4f}')

# Fit final model on full training data.
# GridSearchCV's default refit has already fitted best_estimator_ on the full
# training set; this explicit fit just makes the final step visible.
print('\nFitting best estimator on full training data...')
best_xgb.fit(X_train_cv, y_train_cv)


def _print_test_metrics(title, y_true, y_pred, roc_auc, threshold=None):
    """Print accuracy/recall/precision/F1 (and ROC-AUC when available) for one set of predictions."""
    print(f'\n{title}')
    if threshold is not None:
        print('Threshold:', threshold)
    print('Accuracy:', accuracy_score(y_true, y_pred))
    print('Recall:', recall_score(y_true, y_pred))
    print('Precision:', precision_score(y_true, y_pred))
    print('F1:', f1_score(y_true, y_pred))
    if roc_auc is not None:
        print('ROC-AUC:', roc_auc)


# Evaluate on test if available
if X_test_cv is not None and y_test_cv is not None:
    y_proba_test = best_xgb.predict_proba(X_test_cv)[:, 1]
    y_pred_test_default = best_xgb.predict(X_test_cv)
    y_pred_test_thr = (y_proba_test >= best_thr).astype(int)

    # ROC-AUC depends only on the probabilities, not on the decision threshold,
    # so it is computed once and reported in both sections.
    try:
        test_auc = roc_auc_score(y_test_cv, y_proba_test)
    except ValueError as exc:
        test_auc = None
        print('ROC-AUC unavailable:', exc)

    _print_test_metrics('Evaluation (default 0.5 threshold):', y_test_cv, y_pred_test_default, test_auc)
    _print_test_metrics('Evaluation (optimized threshold):', y_test_cv, y_pred_test_thr, test_auc, threshold=best_thr)
else:
    print('\nNo test set available for final evaluation. You can use the OOF F2/recall as an estimate.')
    # Show OOF metrics at default and optimized threshold
    oof_pred_def = (oof_proba >= 0.5).astype(int)
    oof_pred_thr = (oof_proba >= best_thr).astype(int)
    print('\nOOF (default 0.5) Recall:', recall_score(y_train_cv, oof_pred_def), 'F1:', f1_score(y_train_cv, oof_pred_def))
    print('OOF (opt thr) Recall:', recall_score(y_train_cv, oof_pred_thr), 'F2:', fbeta_score(y_train_cv, oof_pred_thr, beta=2))

# Expose final tuned model and threshold
globals().update({'xgb_cv_best': best_xgb, 'xgb_cv_best_params': grid.best_params_, 'xgb_cv_threshold': best_thr})
print('\nSaved tuned model as `xgb_cv_best` and threshold as `xgb_cv_threshold`.')

CV tuning on data: X_train_cv=(44241, 8), y_train_cv=44241
Starting GridSearchCV (scoring=recall) over param grid: {'scale_pos_weight': [1, 6, 3, 12], 'subsample': [0.8, 1.0]}
Fitting 5 folds for each of 8 candidates, totalling 40 fits

GridSearchCV complete.
Best params (recall): {'scale_pos_weight': 12, 'subsample': 1.0}
Best CV recall: 0.8753210914397137

Computing out-of-fold probabilities for threshold optimization (F2).
Optimal threshold (F2 on OOF): 0.670 with F2=0.7073

Fitting best estimator on full training data...

Evaluation (default 0.5 threshold):
Accuracy: 0.7799475635114366
Recall: 0.8753213367609255
Precision: 0.3781232648528595
F1: 0.5281116711903838
ROC-AUC: 0.8931362400252066

Evaluation (optimized threshold):
Threshold: 0.67
Accuracy: 0.8277732573908326
Recall: 0.8155526992287918
Precision: 0.4395566331832352
F1: 0.5712356515867657
ROC-AUC: 0.8931362400252066

Saved tuned model as `xgb_cv_best` and threshold as `xgb_cv_threshold`.
