# Retrain RedZone V2 on labeled dataset

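This notebook retrains the RedZone V2 model with AutoGluon on the 5,953 labeled rows. The data is split 70/15/15 into train/validation/test sets, grouped by `IBVStatusID` so that no group appears in more than one split, and ROC AUC is reported on the validation and test splits.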


In [None]:
from __future__ import annotations

import os
import pandas as pd
from autogluon.tabular import TabularPredictor
from sklearn.model_selection import GroupShuffleSplit, train_test_split
from sklearn.metrics import roc_auc_score

# --- Config ---
# NOTE(review): these are absolute, machine-specific paths; point them at your own
# copy of the data / model directory before re-running on another machine.
DATA_PATH = "/Users/starsrain/jan2026_concord/jan2026_loonie_customize/retrain_data/features_with_fpdaa.parquet"
OUTPUT_MODEL_DIR = "/Users/starsrain/jan2026_concord/jan2026_loonie_customize/retrain_data/autogluon_models_FPDAA_retrain_no_accountGuid"
LABEL_COL = "FPDAA"
GROUP_COL = "IBVStatusID"  # group by application/IBV to avoid leakage across accounts
RANDOM_SEED = 37
TRAIN_SIZE = 0.70
VAL_SIZE = 0.15
TEST_SIZE = 0.15
# Columns that must never be used as features: the label, the `FPDAA_matured`
# column (presumably a label-maturity flag -- TODO confirm), and the grouping key.
# The grouping key is referenced through GROUP_COL so the two cannot drift apart.
LEAKAGE_COLS = [LABEL_COL, "FPDAA_matured", GROUP_COL]

# The three fractions drive two chained splits below; they only make sense if they sum to 1.
assert abs(TRAIN_SIZE + VAL_SIZE + TEST_SIZE - 1.0) < 1e-9, "split fractions must sum to 1"

# --- Load data ---
df = pd.read_parquet(DATA_PATH)

# --- Split data (70/15/15) ---
if GROUP_COL in df.columns:
    # Group-aware split: every row sharing a GROUP_COL value lands in the same split.
    groups = df[GROUP_COL]
    gss = GroupShuffleSplit(n_splits=1, train_size=TRAIN_SIZE, random_state=RANDOM_SEED)
    train_idx, temp_idx = next(gss.split(df, df[LABEL_COL], groups))
    train_df = df.iloc[train_idx]
    temp_df = df.iloc[temp_idx]

    # Second split carves the held-out remainder into validation and test.
    # train_size here is the validation share *of the remainder*.
    gss_val = GroupShuffleSplit(
        n_splits=1,
        train_size=VAL_SIZE / (VAL_SIZE + TEST_SIZE),
        random_state=RANDOM_SEED,
    )
    val_idx, test_idx = next(gss_val.split(temp_df, temp_df[LABEL_COL], temp_df[GROUP_COL]))
    val_df = temp_df.iloc[val_idx]
    test_df = temp_df.iloc[test_idx]

    # Sanity check: no group may appear in more than one split.
    _train_groups = set(train_df[GROUP_COL])
    _val_groups = set(val_df[GROUP_COL])
    _test_groups = set(test_df[GROUP_COL])
    assert _train_groups.isdisjoint(_val_groups), "group leakage between train and val"
    assert _train_groups.isdisjoint(_test_groups), "group leakage between train and test"
    assert _val_groups.isdisjoint(_test_groups), "group leakage between val and test"

    print("unique groups:", df[GROUP_COL].nunique())
    print("train groups:", train_df[GROUP_COL].nunique())
    print("val groups:", val_df[GROUP_COL].nunique())
    print("test groups:", test_df[GROUP_COL].nunique())
else:
    # No grouping column available: fall back to a label-stratified row-level split.
    train_df, temp_df = train_test_split(
        df,
        test_size=(1 - TRAIN_SIZE),
        random_state=RANDOM_SEED,
        stratify=df[LABEL_COL],
    )
    val_df, test_df = train_test_split(
        temp_df,
        test_size=TEST_SIZE / (VAL_SIZE + TEST_SIZE),
        random_state=RANDOM_SEED,
        stratify=temp_df[LABEL_COL],
    )

# Every input row must end up in exactly one split.
assert len(train_df) + len(val_df) + len(test_df) == len(df), "splits do not cover the dataset"

ID_COLS = ["accountGuid"]
# Build the exclusion set once instead of re-concatenating the lists for every column.
_excluded_cols = set(LEAKAGE_COLS) | set(ID_COLS)
feature_cols = [c for c in df.columns if c not in _excluded_cols]
train_df_model = train_df[feature_cols + [LABEL_COL]]
val_df_model = val_df[feature_cols + [LABEL_COL]]
test_df_model = test_df[feature_cols + [LABEL_COL]]

print("train rows:", len(train_df_model))
print("val rows:", len(val_df_model))
print("test rows:", len(test_df_model))

# --- Train AutoGluon ---
# Only a notice: the predictor below is still created with this same path.
if os.path.exists(OUTPUT_MODEL_DIR):
    print("Note: model directory already exists:", OUTPUT_MODEL_DIR)

predictor = TabularPredictor(label=LABEL_COL, eval_metric="roc_auc", path=OUTPUT_MODEL_DIR)
predictor.fit(train_data=train_df_model, tuning_data=val_df_model)

# --- Evaluate AUC on val/test ---
# Pick the positive-class probability column by name rather than by position,
# so the score does not silently depend on the column order of predict_proba.
positive_class = predictor.positive_class
if positive_class is None:
    raise ValueError("AUC evaluation below expects a binary problem with a defined positive class")

# The validation split was also passed to fit() as tuning_data, so the validation AUC
# is not an independent estimate; the test AUC is the held-out number.
val_proba = predictor.predict_proba(val_df_model[feature_cols])
val_score = val_proba[positive_class]
val_auc = roc_auc_score(val_df_model[LABEL_COL].astype(int), val_score)

_test_proba = predictor.predict_proba(test_df_model[feature_cols])
test_score = _test_proba[positive_class]
test_auc = roc_auc_score(test_df_model[LABEL_COL].astype(int), test_score)

print("val auc:", val_auc)
print("test auc:", test_auc)


Verbosity: 2 (Standard Logging)
AutoGluon Version:  1.4.0
Python Version:     3.11.4
Operating System:   Darwin
Platform Machine:   x86_64
Platform Version:   Darwin Kernel Version 25.2.0: Tue Nov 18 21:08:48 PST 2025; root:xnu-12377.61.12~1/RELEASE_ARM64_T8132
CPU Count:          10
Memory Avail:       0.71 GB / 16.00 GB (4.4%)
Disk Space Avail:   467.43 GB / 926.35 GB (50.5%)
No presets specified! To achieve strong results with AutoGluon, it is recommended to use the available presets. Defaulting to `'medium'`...
	Recommended Presets (For more details refer to https://auto.gluon.ai/stable/tutorials/tabular/tabular-essentials.html#presets):
	presets='extreme' : New in v1.4: Massively better than 'best' on datasets <30000 samples by using new models meta-learned on https://tabarena.ai: TabPFNv2, TabICL, Mitra, and TabM. Absolute best accuracy. Requires a GPU. Recommended 64 GB CPU memory and 32+ GB GPU memory.
	presets='best'    : Maximize accuracy. Recommended for most users. Use in c

unique groups: 3557
train groups: 2489
val groups: 534
test groups: 534
train rows: 4180
val rows: 856
test rows: 917
Note: model directory already exists: /Users/starsrain/jan2026_concord/jan2026_loonie_customize/retrain_data/autogluon_models_FPDAA_retrain


	Stage 1 Generators:
		Fitting AsTypeFeatureGenerator...
			Note: Converting 1 features to boolean dtype as they only contain 2 unique values.
	Stage 2 Generators:
		Fitting FillNaFeatureGenerator...
	Stage 3 Generators:
		Fitting IdentityFeatureGenerator...
		Fitting CategoryFeatureGenerator...
			Fitting CategoryMemoryMinimizeFeatureGenerator...
	Stage 4 Generators:
		Fitting DropUniqueFeatureGenerator...
	Stage 5 Generators:
		Fitting DropDuplicatesFeatureGenerator...
	Unused Original Features (Count: 1): ['active_monthly_3']
		These features were not used to generate any of the output features. Add a feature generator compatible with these features to utilize them.
		Features can also be unused if they carry very little information, such as being categorical but having almost entirely unique values or being duplicates of other features.
		These features do not need to be present at inference time.
		('float', []) : 1 | ['active_monthly_3']
	Types of features in original data (raw d

val auc: 0.7301648770756322
test auc: 0.6281532805429864


rows: 5953
score corr: 0.2705970081118157
delta mean: 285.5020868820017
delta median: 285.0


Unnamed: 0,IBVStatusID,retrained_score,redZoneV2_score,score_delta
count,5953.0,5953.0,5953.0,5953.0
mean,55805.780615,500.892995,215.390908,285.502087
std,16804.783076,114.129425,78.209231,119.630788
min,1810.0,20.0,-20.0,-275.0
25%,46090.0,415.0,158.0,198.0
50%,56451.0,506.0,208.0,285.0
75%,66049.0,591.0,265.0,374.0
max,86361.0,785.0,589.000642,607.0


In [5]:
# --- Summarize retrained model ---
# Relies on `predictor` fitted in the training cell above.
leaderboard_retrain = predictor.leaderboard(silent=True)

# Take the predictor's own `model_best` when it exposes one; otherwise fall back
# to the model named in the first leaderboard row (if the leaderboard has rows).
best_model_retrain = getattr(predictor, "model_best", None)
if best_model_retrain is None:
    if not leaderboard_retrain.empty:
        best_model_retrain = leaderboard_retrain["model"].iloc[0]

print("best retrain model:", best_model_retrain)
display(leaderboard_retrain)


best retrain model: WeightedEnsemble_L2


Unnamed: 0,model,score_val,eval_metric,pred_time_val,fit_time,pred_time_val_marginal,fit_time_marginal,stack_level,can_infer,fit_order
0,WeightedEnsemble_L2,0.730165,roc_auc,0.033537,2.166797,0.000247,0.019372,2,True,7
1,CatBoost,0.729766,roc_auc,0.005209,1.80638,0.005209,1.80638,1,True,3
2,XGBoost,0.70007,roc_auc,0.00613,1.243282,0.00613,1.243282,1,True,6
3,ExtraTreesEntr,0.682216,roc_auc,0.028081,0.341045,0.028081,0.341045,1,True,5
4,RandomForestEntr,0.681799,roc_auc,0.025848,0.704539,0.025848,0.704539,1,True,2
5,ExtraTreesGini,0.670727,roc_auc,0.02722,0.375525,0.02722,0.375525,1,True,4
6,RandomForestGini,0.669025,roc_auc,0.025232,0.688536,0.025232,0.688536,1,True,1


In [4]:
# --- Inspect current production model (for reference) ---
from autogluon.tabular import TabularPredictor

CURRENT_MODEL_PATH = "/Users/starsrain/jan2026_concord/jan2026_loonie_customize/src/model/autogluon_models_FPDAA_20250904_010918"

# Load the saved predictor; the Python-version match requirement is switched off.
predictor_current = TabularPredictor.load(
    CURRENT_MODEL_PATH,
    require_py_version_match=False,
)
leaderboard = predictor_current.leaderboard(silent=True)

# Take the predictor's own `model_best` when it exposes one; otherwise fall back
# to the model named in the first leaderboard row (if the leaderboard has rows).
best_model = getattr(predictor_current, "model_best", None)
if best_model is None:
    if not leaderboard.empty:
        best_model = leaderboard["model"].iloc[0]

print("best model:", best_model)
display(leaderboard)

# Best-effort look at the underlying model object via the private trainer API.
# Any failure (e.g. a missing model file on disk) is reported instead of raised.
try:
    best_model_obj = predictor_current._trainer.load_model(best_model)
    print("best model class:", best_model_obj.__class__.__name__)
    print("best model params:", getattr(best_model_obj, "params", None))
except Exception as exc:
    print("could not load best model params:", exc)


Found 2 mismatches between original and current metadata:


best model: WeightedEnsemble_L2_FULL


Unnamed: 0,model,score_val,eval_metric,pred_time_val,fit_time,pred_time_val_marginal,fit_time_marginal,stack_level,can_infer,fit_order
0,WeightedEnsemble_L2,0.649597,roc_auc,2.544717,228.998852,0.003793,2.487705,2,True,10
1,CatBoost_r137_BAG_L1,0.645436,roc_auc,0.067784,33.011422,0.067784,33.011422,1,True,3
2,CatBoost_r5_BAG_L1,0.645227,roc_auc,0.07179,32.230016,0.07179,32.230016,1,True,6
3,CatBoost_r49_BAG_L1,0.645166,roc_auc,0.113794,29.82533,0.113794,29.82533,1,True,5
4,CatBoost_r69_BAG_L1,0.64376,roc_auc,0.061243,34.664336,0.061243,34.664336,1,True,4
5,CatBoost_r6_BAG_L1,0.643071,roc_auc,0.136425,34.451962,0.136425,34.451962,1,True,8
6,LightGBM_BAG_L1,0.637197,roc_auc,0.27239,17.13863,0.27239,17.13863,1,True,1
7,XGBoost_r95_BAG_L1,0.637176,roc_auc,0.230609,20.835208,0.230609,20.835208,1,True,9
8,XGBoost_r22_BAG_L1,0.634763,roc_auc,0.257926,21.157989,0.257926,21.157989,1,True,7
9,ExtraTreesGini_BAG_L1,0.594594,roc_auc,1.328964,3.196254,1.328964,3.196254,1,True,2


could not load best model params: [Errno 2] No such file or directory: '/Users/starsrain/jan2026_concord/jan2026_loonie_customize/src/model/autogluon_models_FPDAA_20250904_010918/models/WeightedEnsemble_L2_FULL/model.pkl'
