In [1]:
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [4]:
import zipfile
import os

# Correct file path (adjust this if needed based on Google Drive window)
zip_path = '/content/drive/MyDrive/dataset (1).zip'
extract_path = '/content/dataset'

# Unzip the file
with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall(extract_path)
print("Unzipped to", extract_path)

# Optional: List a few files to confirm extraction
for root, dirs, files in os.walk(extract_path):
    for name in files:
        print(os.path.join(root, name))

Unzipped to /content/dataset
/content/dataset/dataset/test/digital_behavior.csv
/content/dataset/dataset/test/asset_returns.csv
/content/dataset/dataset/test/current_products.csv
/content/dataset/dataset/test/wearables.csv
/content/dataset/dataset/test/users.csv
/content/dataset/dataset/test/claims_history_test_public.csv
/content/dataset/dataset/test/cost_structure.csv
/content/dataset/dataset/test/telematics.csv
/content/dataset/dataset/train/digital_behavior.csv
/content/dataset/dataset/train/asset_returns.csv
/content/dataset/dataset/train/current_products.csv
/content/dataset/dataset/train/wearables.csv
/content/dataset/dataset/train/users.csv
/content/dataset/dataset/train/cost_structure.csv
/content/dataset/dataset/train/claims_history.csv
/content/dataset/dataset/train/telematics.csv


In [5]:
import os
import pandas as pd

base_dir = '/content/dataset/dataset'
train_dir = os.path.join(base_dir, 'train')
test_dir = os.path.join(base_dir, 'test')

def load_all_csvs(folder):
    return {os.path.splitext(f)[0]: pd.read_csv(os.path.join(folder, f))
            for f in os.listdir(folder) if f.endswith('.csv')}

train_dfs = load_all_csvs(train_dir)
test_dfs = load_all_csvs(test_dir)
print('Train tables:', list(train_dfs.keys()))
print('Test tables:', list(test_dfs.keys()))


Train tables: ['digital_behavior', 'asset_returns', 'current_products', 'wearables', 'users', 'cost_structure', 'claims_history', 'telematics']
Test tables: ['digital_behavior', 'asset_returns', 'current_products', 'wearables', 'users', 'claims_history_test_public', 'cost_structure', 'telematics']


In [6]:
for name, df in train_dfs.items():
    print(f"\n{name} shape:", df.shape)
    print(df.info())
    print("Missing data:\n", df.isnull().sum().sort_values(ascending=False).head())
    print(df.describe(include='all').T.head())



digital_behavior shape: (1050000, 7)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1050000 entries, 0 to 1049999
Data columns (total 7 columns):
 #   Column                  Non-Null Count    Dtype  
---  ------                  --------------    -----  
 0   user_id                 1050000 non-null  int64  
 1   week_timestamp          1050000 non-null  object 
 2   daily_screen_time_avg   1050000 non-null  float64
 3   social_media_time_avg   1050000 non-null  float64
 4   app_usage_vector        1050000 non-null  object 
 5   social_sentiment_score  1050000 non-null  float64
 6   ecommerce_spend_vector  1050000 non-null  object 
dtypes: float64(3), int64(1), object(3)
memory usage: 56.1+ MB
None
Missing data:
 user_id                  0
week_timestamp           0
daily_screen_time_avg    0
social_media_time_avg    0
app_usage_vector         0
dtype: int64
                           count   unique  \
user_id                1050000.0      NaN   
week_timestamp           1050000  

In [7]:
# ============================
# PRE-FLIGHT: dataset sanity, targets, alignment, quick leakage scan
# (safe to run multiple times)
# ============================
import os, numpy as np, pandas as pd

BASE_DIR = "/content/dataset/dataset"
TRAIN_DIR = os.path.join(BASE_DIR, "train")
TEST_DIR  = os.path.join(BASE_DIR, "test")

def lread(path):
    return pd.read_csv(path, encoding="latin-1")

# ---- 1) load minimal tables we need here ----
users_tr   = lread(os.path.join(TRAIN_DIR, "users.csv"))
claims_tr  = lread(os.path.join(TRAIN_DIR, "claims_history.csv"))
wear_tr    = lread(os.path.join(TRAIN_DIR, "wearables.csv"))
dig_tr     = lread(os.path.join(TRAIN_DIR, "digital_behavior.csv"))
tel_tr     = lread(os.path.join(TRAIN_DIR, "telematics.csv"))
cur_tr     = lread(os.path.join(TRAIN_DIR, "current_products.csv"))

users_te   = lread(os.path.join(TEST_DIR, "users.csv"))
wear_te    = lread(os.path.join(TEST_DIR, "wearables.csv"))
dig_te     = lread(os.path.join(TEST_DIR, "digital_behavior.csv"))
tel_te     = lread(os.path.join(TEST_DIR, "telematics.csv"))
cur_te     = lread(os.path.join(TEST_DIR, "current_products.csv"))

# ---- 2) quick shapes & key checks ----
def brief(df, name, key="user_id"):
    print(f"{name:20s} shape={df.shape} | {key} unique={df[key].nunique() if key in df.columns else 'NA'}")

print("\n== TRAIN shapes ==")
for df,name in [(users_tr,"users"),(claims_tr,"claims_history"),(wear_tr,"wearables"),
                (dig_tr,"digital_behavior"),(tel_tr,"telematics"),(cur_tr,"current_products")]:
    brief(df,name)

print("\n== TEST shapes ==")
for df,name in [(users_te,"users"),(wear_te,"wearables"),
                (dig_te,"digital_behavior"),(tel_te,"telematics"),(cur_te,"current_products")]:
    brief(df,name)

# required columns
required_users = {"user_id"}
assert required_users.issubset(users_tr.columns), "users.csv missing user_id"
assert "user_id" in users_te.columns, "test users.csv missing user_id"

# ---- 3) target preview (frequency per policy) ----
claims_tr["policy_type"] = claims_tr["policy_type"].str.lower()
freq = (
    claims_tr.groupby(["user_id","policy_type"])["claim_id"].count()
    .unstack("policy_type").fillna(0)
)
freq_bin = (freq > 0).astype(int)
base_rates = freq_bin.mean().sort_values(ascending=False)
print("\n== frequency base rates (train) ==")
print(base_rates.rename("positive_rate"))

# ---- 4) build tiny feature master (just to see join cardinality) ----
def agg_minimal(df, name):
    num = df.select_dtypes(include="number").columns.tolist()
    if "user_id" in num: num.remove("user_id")
    if not num:
        return pd.DataFrame({"user_id": df["user_id"].unique()})
    g = df.groupby("user_id")[num].agg(["mean","std"]).reset_index()
    g.columns = ["user_id"] + [f"{name}__{a}_{b}" for a,b in g.columns.tolist()[1:]]
    return g

feat = users_tr.merge(agg_minimal(wear_tr,"wear"), on="user_id", how="left") \
               .merge(agg_minimal(dig_tr,"dig"),  on="user_id", how="left") \
               .merge(agg_minimal(tel_tr,"tel"),  on="user_id", how="left") \
               .merge(cur_tr.drop(columns=[c for c in ["portfolio_asset_allocation"] if c in cur_tr.columns]),
                      on="user_id", how="left")

print(f"\n== feature master preview ==")
print("feature_master shape:", feat.shape)
print("null % (top 10):")
print((feat.isnull().mean().sort_values(ascending=False).head(10) * 100).round(2))

# ---- 5) quick leakage probe (single-policy) ----
probe_y = freq_bin.get("pet", pd.Series(0, index=freq_bin.index))  # choose one policy
tmp = feat.set_index("user_id").loc[probe_y.index].copy()
# kill any column that looks obviously leaky by name
bad_like = [c for c in tmp.columns if any(x in c.lower() for x in ["claim","freq","sev","target","label","policy_type"])]
tmp = tmp.drop(columns=bad_like, errors="ignore")

# simple 1D correlation scan on numeric cols
num_cols = tmp.select_dtypes(include="number").columns
corrs = []
for c in num_cols:
    try:
        r = np.corrcoef(tmp[c].fillna(tmp[c].median()), probe_y)[0,1]
        if abs(r) >= 0.98:
            corrs.append((c, float(r)))
    except Exception:
        pass

if corrs:
    print("\nüö® high correlation columns (possible leakage):")
    print(corrs[:10])
else:
    print("\n‚úÖ no obvious leakage by simple correlation probe.")

# ---- 6) train/test alignment sanity ----
feat_te = users_te.merge(agg_minimal(wear_te,"wear"), on="user_id", how="left") \
                  .merge(agg_minimal(dig_te,"dig"),   on="user_id", how="left") \
                  .merge(agg_minimal(tel_te,"tel"),   on="user_id", how="left") \
                  .merge(cur_te.drop(columns=[c for c in ["portfolio_asset_allocation"] if c in cur_te.columns]),
                         on="user_id", how="left")

train_cols = set(feat.columns) - {"user_id"}
test_cols  = set(feat_te.columns) - {"user_id"}
missing_in_test = sorted(list(train_cols - test_cols))
extra_in_test   = sorted(list(test_cols - train_cols))

print("\n== alignment check ==")
print("missing_in_test (should be empty or very small):", missing_in_test[:20])
print("extra_in_test   (ok):", extra_in_test[:20])

if len(missing_in_test) == 0:
    print("\n‚úÖ alignment looks good. You can start training.")
else:
    print("\n‚ö†Ô∏è some train features not present in test. The training code will auto reindex, but review if many.")



== TRAIN shapes ==
users                shape=(10000, 19) | user_id unique=10000
claims_history       shape=(3137, 6) | user_id unique=2341
wearables            shape=(3655000, 6) | user_id unique=5000
digital_behavior     shape=(1050000, 7) | user_id unique=10000
telematics           shape=(1769092, 9) | user_id unique=4000
current_products     shape=(10000, 8) | user_id unique=10000

== TEST shapes ==
users                shape=(4000, 19) | user_id unique=4000
wearables            shape=(1462000, 6) | user_id unique=2000
digital_behavior     shape=(420000, 7) | user_id unique=4000
telematics           shape=(719420, 9) | user_id unique=1600
current_products     shape=(4000, 8) | user_id unique=4000

== frequency base rates (train) ==
policy_type
pet        0.762067
auto       0.213157
renters    0.082016
Name: positive_rate, dtype: float64

== feature master preview ==
feature_master shape: (10000, 47)
null % (top 10):
tel__trip_id_mean                 60.0
tel__distance_miles_std  

In [8]:
!pip -q install catboost ngboost


  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m99.2/99.2 MB[0m [31m8.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m349.3/349.3 kB[0m [31m25.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m117.3/117.3 kB[0m [31m9.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for autograd-gamma (setup.py) ... [?25l[?25hdone


In [11]:
# ============================================
# ‚≠ê FREQUENCY‚ÄìSEVERITY SUPER ENSEMBLE (CPU, leak-proof)
# - Robust preprocessing (winsor + rare-bucket + OHE, version-proof)
# - OOF CV bagging across seeds (LGBM, XGB, CatBoost*, NGBoost*)
# - Isotonic calibration, blend + meta-learner (LogReg) for P*
# - Stacked regressors for SEV* (LGBM Tweedie + Linear, XGB Pseudo-Huber, Cat*, NGB*)
# - Sensible thresholds (Youden + cost-aware)
# - Safe fallbacks everywhere (no silent breaks)
# * optional: auto-skipped if not installed
# ============================================

import os, gc, ast, json, warnings, inspect
import numpy as np, pandas as pd
warnings.filterwarnings("ignore")

from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.metrics import roc_auc_score, roc_curve, mean_squared_error
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.isotonic import IsotonicRegression
from sklearn.linear_model import Ridge, LogisticRegression
from sklearn.base import BaseEstimator, TransformerMixin

import lightgbm as lgb
import xgboost as xgb
def _try(name):
    try: return __import__(name)
    except Exception: return None
catboost = _try("catboost")
ngboost  = _try("ngboost")

import joblib

# ---------------------------
# CONFIG
# ---------------------------
BASE_DIR   = "/content/dataset/dataset"
TRAIN_DIR  = os.path.join(BASE_DIR, "train")
TEST_DIR   = os.path.join(BASE_DIR, "test")

N_FOLDS      = 5
BAG_SEEDS    = [42, 1337, 2025]
RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)
FAST_MODE    = False   # True = quick dev; False = higher accuracy
N_JOBS       = -1

COST_FP = 1.0
COST_FN = 5.0

LEAKY_FRAGS = [
    'claim', 'policy_type', 'fraud', 'settle', 'payout', 'accident', 'incident', 'loss',
    'freq_', 'sev_', 'sev_log_', 'target', 'label'
]

# ---------------------------
# UTILS
# ---------------------------
def load_csv(p): return pd.read_csv(p, encoding="latin-1")
def rmse(y, yhat):
    try: return float(mean_squared_error(y, yhat, squared=False))
    except TypeError: return float(mean_squared_error(y, yhat)**0.5)

def youden_thr(y, p):
    fpr, tpr, thr = roc_curve(y, p)
    return float(thr[np.argmax(tpr - fpr)])

def best_cost_thr(y, p, steps=400):
    y = (np.asarray(y) > 0.5).astype(int)
    grid = np.linspace(0,1,steps)
    best_t, best_cost = 0.5, 1e18
    for t in grid:
        pred = (p >= t).astype(int)
        fp = ((pred==1)&(y==0)).sum()
        fn = ((pred==0)&(y==1)).sum()
        cost = COST_FP*fp + COST_FN*fn
        if cost < best_cost: best_cost, best_t = cost, float(t)
    return best_t, best_cost

# ---------------------------
# TRANSFORMERS
# ---------------------------
class Winsorizer(BaseEstimator, TransformerMixin):
    def __init__(self, q=(0.01,0.99)): self.q=q; self.b={}
    def fit(self, X, y=None):
        X=pd.DataFrame(X);
        for c in X.columns:
            s = pd.to_numeric(X[c], errors="coerce")
            self.b[c]=(np.nanquantile(s,self.q[0]), np.nanquantile(s,self.q[1]))
        return self
    def transform(self, X):
        X=pd.DataFrame(X);
        for c,(lo,hi) in self.b.items():
            X[c]=pd.to_numeric(X[c], errors="coerce").clip(lo,hi)
        return X.values

class RareBucket(BaseEstimator, TransformerMixin):
    def __init__(self, min_count=20): self.min=min_count; self.keep={}
    def fit(self, X, y=None):
        X=pd.DataFrame(X).astype("object")
        for c in X.columns:
            vc=X[c].value_counts(dropna=False)
            self.keep[c]=set(vc[vc>=self.min].index)
        return self
    def transform(self, X):
        X=pd.DataFrame(X).astype("object")
        for c,keep in self.keep.items():
            X[c]=X[c].apply(lambda v: v if v in keep else "Other")
        return X.values

# ---------------------------
# FEATURE ENGINEERING (leak-proof)
# ---------------------------
def _safe_get(df, col, default=0): return df[col] if col in df.columns else default

def extract_json(df, col, prefix):
    if col not in df.columns: return df
    def safe_eval(x):
        if pd.isna(x): return {}
        if isinstance(x, dict): return x
        if isinstance(x,str):
            try: return ast.literal_eval(x)
            except Exception: return {}
        return {}
    s=df[col].apply(safe_eval)
    def sumv(d):
        try: return float(sum(v for v in d.values() if pd.notna(v)))
        except: return 0.0
    df[f"{prefix}_vector_sum"]=s.apply(sumv)
    keys=['US Stocks','International Stocks','Bonds'] if prefix=='alloc' else []
    for k in keys: df[f"{prefix}_{k.lower().replace(' ','_')}"]=s.apply(lambda d:d.get(k,0))
    return df.drop(columns=[col], errors="ignore")

def aggregate(path, name):
    df=load_csv(path)
    if 'user_id' not in df.columns: raise ValueError(f"user_id missing in {path}")
    num=df.select_dtypes(include='number').columns.tolist()
    if 'user_id'in num: num.remove('user_id')
    if not num: return pd.DataFrame({'user_id':df['user_id'].unique()})
    g=df.groupby('user_id')[num].agg(['mean','std','min','max']).reset_index()
    g.columns=['user_id']+[f"{name}_{a}_{b}" for a,b in g.columns.tolist()[1:]]
    if name=='telematics':
        df['hard_braking_events']=_safe_get(df,'hard_braking_events',0)
        df['speeding_events']=_safe_get(df,'speeding_events',0)
        df['total_events']=df['hard_braking_events']+df['speeding_events']
        ev=df.groupby('user_id')['total_events'].mean().reset_index()
        ev.columns=['user_id','telematics_events_per_trip_mean']
        g=g.merge(ev,on='user_id',how='left')
    return g

def create_feature_master(folder):
    print(f"Loading data from {folder}...")
    users=load_csv(os.path.join(folder,'users.csv')).drop_duplicates('user_id',keep='first')
    wear =aggregate(os.path.join(folder,'wearables.csv'),'wearable')
    digi =aggregate(os.path.join(folder,'digital_behavior.csv'),'digital')
    tele=aggregate(os.path.join(folder,'telematics.csv'),'telematics')
    cur  =load_csv(os.path.join(folder,'current_products.csv'))
    cur  =extract_json(cur,'portfolio_asset_allocation','alloc')

    feat=users.merge(wear,on='user_id',how='left')\
              .merge(digi,on='user_id',how='left')\
              .merge(tele,on='user_id',how='left')\
              .merge(cur,on='user_id',how='left')

    for c in ['annual_income','student_loan_debt','credit_card_debt',
              'risk_tolerance_score','financial_literacy_score']:
        if c not in feat.columns: feat[c]=np.nan
    feat['financial_health']=feat['annual_income'].fillna(0)-feat['student_loan_debt'].fillna(0)-feat['credit_card_debt'].fillna(0)
    feat['risk_propensity']=feat['risk_tolerance_score'].fillna(0)*feat['financial_literacy_score'].fillna(0)

    bad=[c for c in feat.columns if any(t in c.lower() for t in LEAKY_FRAGS)]
    if bad:
        print(f"‚ö†Ô∏è Removing potential leak columns: {bad[:12]}{' ...' if len(bad)>12 else ''}")
        feat=feat.drop(columns=bad, errors='ignore')
    return feat

def build_targets(feature_master, claims_history):
    ch=claims_history.copy()
    if 'policy_type'in ch.columns: ch['policy_type']=ch['policy_type'].str.lower()
    agg=ch.groupby(['user_id','policy_type']).agg(freq=('claim_id','count'),
                                                  sev_base=('claim_amount','mean')).reset_index()
    pv=agg.pivot_table(index='user_id', columns='policy_type',
                       values=['freq','sev_base']).reset_index()
    cols=list(pv.columns)
    if isinstance(cols[0],tuple) and cols[0][0]=='user_id': pv.rename(columns={cols[0]:'user_id'}, inplace=True)
    new=['user_id']+[f"{a}_{str(b).lower()}" if isinstance((a,b),tuple) else a for a,b in pv.columns[1:]]
    pv.columns=new
    tgt=feature_master[['user_id']].merge(pv,on='user_id',how='left')
    for p in ['auto','renters','pet']:
        if f'freq_{p}' not in tgt.columns: tgt[f'freq_{p}']=0
        tgt[f'freq_{p}']=tgt[f'freq_{p}'].fillna(0).gt(0).astype(float)
        col=f'sev_base_{p}'
        if col not in tgt.columns: tgt[col]=np.nan
        tgt[f'sev_log_{p}']=np.log1p(np.clip(tgt[col].astype(float),0,None))
        tgt.drop(columns=[col], inplace=True, errors='ignore')
    return tgt

def build_Xy(feature_master, targets):
    feat_cols=[c for c in feature_master.columns if c!='user_id' and not any(k in c.lower() for k in ['freq_','sev_','sev_log_'])]
    X=feature_master[feat_cols].copy()
    df=targets.copy()
    y_pack={
        'auto':    df.get('freq_auto',    pd.Series(0,index=df.index)).astype(float).values,
        'renters': df.get('freq_renters', pd.Series(0,index=df.index)).astype(float).values,
        'pet':     df.get('freq_pet',     pd.Series(0,index=df.index)).astype(float).values,
        'sev_log_auto':    df.get('sev_log_auto',    pd.Series(np.nan,index=df.index)).astype(float).values,
        'sev_log_renters': df.get('sev_log_renters', pd.Series(np.nan,index=df.index)).astype(float).values,
        'sev_log_pet':     df.get('sev_log_pet',     pd.Series(np.nan,index=df.index)).astype(float).values,
    }
    return X, y_pack

# ---------------------------
# PREPROCESSOR (version-proof)
# ---------------------------
def _make_ohe(min_freq=20):
    sig=inspect.signature(OneHotEncoder.__init__)
    kw={}
    if 'min_frequency'in sig.parameters:
        kw['handle_unknown']='infrequent_if_exist'; kw['min_frequency']=min_freq
    else:
        kw['handle_unknown']='ignore'
    if 'sparse_output'in sig.parameters: kw['sparse_output']=True
    else: kw['sparse']=True
    return OneHotEncoder(**kw)

def get_preprocessor(X):
    X=X.copy()
    for c in X.select_dtypes(include='bool').columns: X[c]=X[c].astype(int)
    num_cols=X.select_dtypes(include=[np.number]).columns.tolist()
    cat_cols=X.select_dtypes(include=['object']).columns.tolist()

    num=Pipeline([('imputer',SimpleImputer(strategy='median')),
                  ('winsor',Winsorizer((0.01,0.99))),
                  ('scaler',StandardScaler())])

    sig=inspect.signature(OneHotEncoder.__init__)
    cat_steps=[('imputer',SimpleImputer(strategy='most_frequent'))]
    if 'min_frequency' not in sig.parameters:
        cat_steps.append(('rare',RareBucket(min_count=20)))
    cat_steps.append(('onehot',_make_ohe(20)))
    cat=Pipeline(cat_steps)

    return ColumnTransformer([('num',num,num_cols),('cat',cat,cat_cols)],
                             remainder='drop', sparse_threshold=0.3)

# ---------------------------
# MODELS
# ---------------------------
def _n_est(): return 1200 if FAST_MODE else 3000
def _lr():    return 0.03  if FAST_MODE else 0.015

def class_weights(y):
    y=np.asarray(y).astype(float)
    pos=y.sum(); neg=len(y)-pos
    spw=neg/max(pos,1.0); cw=None if (pos==0 or neg==0) else {0:1.0,1:neg/max(pos,1.0)}
    return spw, cw

# classifiers
def lgbm_clf(cw):
    return lgb.LGBMClassifier(n_estimators=_n_est(), learning_rate=_lr(),
        max_depth=7, num_leaves=48, min_child_samples=80,
        subsample=0.8, colsample_bytree=0.7, reg_alpha=0.3, reg_lambda=1.5,
        objective='binary', metric='auc', class_weight=cw, n_jobs=N_JOBS,
        random_state=RANDOM_STATE, verbose=-1)

def xgb_clf(spw):
    return xgb.XGBClassifier(n_estimators=_n_est(), learning_rate=_lr(),
        max_depth=6, min_child_weight=6, subsample=0.8, colsample_bytree=0.7,
        reg_lambda=2.0, gamma=0.15, objective='binary:logistic', eval_metric='auc',
        tree_method='hist', predictor='auto', n_jobs=N_JOBS, random_state=RANDOM_STATE,
        scale_pos_weight=spw)

def cat_clf():
    if catboost is None: return None
    return catboost.CatBoostClassifier(iterations=_n_est(), learning_rate=_lr(),
        depth=7, l2_leaf_reg=5.0, loss_function='Logloss', eval_metric='AUC',
        random_seed=RANDOM_STATE, thread_count=max(1,os.cpu_count()-1), verbose=False)

def ngb_clf():
    if ngboost is None: return None
    from ngboost.distns import Bernoulli
    from ngboost.scores import LogScore
    return ngboost.NGBClassifier(Dist=Bernoulli, Score=LogScore,
        n_estimators=max(400,_n_est()//6), learning_rate=min(0.05,_lr()*2.0),
        natural_gradient=True, verbose=False, random_state=RANDOM_STATE)

# regressors
def lgbm_reg_tweedie():
    return lgb.LGBMRegressor(n_estimators=_n_est(), learning_rate=_lr(),
        max_depth=8, num_leaves=96, min_child_samples=50,
        subsample=0.8, colsample_bytree=0.7, reg_alpha=0.3, reg_lambda=1.5,
        objective='tweedie', tweedie_variance_power=1.3, metric='rmse',
        n_jobs=N_JOBS, random_state=RANDOM_STATE, verbose=-1)

def lgbm_reg_linear():
    return lgb.LGBMRegressor(n_estimators=_n_est(), learning_rate=_lr(),
        max_depth=8, num_leaves=96, min_child_samples=50,
        subsample=0.8, colsample_bytree=0.7, reg_alpha=0.3, reg_lambda=1.5,
        objective='regression', metric='rmse', n_jobs=N_JOBS,
        random_state=RANDOM_STATE, verbose=-1)

def xgb_reg_phuber():
    return xgb.XGBRegressor(n_estimators=_n_est(), learning_rate=_lr(),
        max_depth=7, min_child_weight=6, subsample=0.8, colsample_bytree=0.7,
        reg_lambda=2.5, gamma=0.1, objective='reg:pseudohubererror', eval_metric='rmse',
        tree_method='hist', predictor='auto', n_jobs=N_JOBS, random_state=RANDOM_STATE)

def cat_reg():
    if catboost is None: return None
    return catboost.CatBoostRegressor(iterations=_n_est(), learning_rate=_lr(),
        depth=8, l2_leaf_reg=6.0, loss_function='RMSE', random_seed=RANDOM_STATE,
        thread_count=max(1,os.cpu_count()-1), verbose=False)

def ngb_reg():
    if ngboost is None: return None
    from ngboost.distns import Normal
    from ngboost.scores import LogScore
    return ngboost.NGBRegressor(Dist=Normal, Score=LogScore,
        n_estimators=max(400,_n_est()//6), learning_rate=min(0.05,_lr()*2.0),
        natural_gradient=True, verbose=False, random_state=RANDOM_STATE)

# early stopping helpers
def fit_lgb(model, Xtr, ytr, Xva, yva):
    if isinstance(model, lgb.LGBMModel):
        model.fit(Xtr,ytr, eval_set=[(Xva,yva)],
                  callbacks=[lgb.log_evaluation(0),
                             lgb.early_stopping(stopping_rounds=200, verbose=False)])
        return model
    model.fit(Xtr,ytr); return model

def fit_xgb(model, Xtr, ytr, Xva, yva, maximize=True, metric='auc'):
    sig=inspect.signature(model.fit)
    if 'early_stopping_rounds' in sig.parameters:
        model.fit(Xtr,ytr, eval_set=[(Xva,yva)], verbose=False, early_stopping_rounds=200)
    else:
        try:
            from xgboost.callback import EarlyStopping
            cb=EarlyStopping(rounds=200, save_best=True, maximize=maximize,
                             data_name='validation_0', metric_name=metric)
            model.fit(Xtr,ytr, eval_set=[(Xva,yva)], verbose=False, callbacks=[cb])
        except Exception:
            model.fit(Xtr,ytr, eval_set=[(Xva,yva)], verbose=False)
    return model

# ---------------------------
# LEAKAGE SCAN (quick)
# ---------------------------
def leakage_scan_drop(X, y_bin):
    bad=[]
    y=np.asarray(y_bin).astype(int)
    for col in X.columns:
        try:
            xx=X[[col]]
            num=xx.select_dtypes(include=[np.number]).columns.tolist()
            cat=xx.select_dtypes(include=['object']).columns.tolist()
            steps=[]
            if num: steps.append(('num',Pipeline([('imp',SimpleImputer(strategy='median')),('sc',StandardScaler())]),num))
            if cat: steps.append(('cat',Pipeline([('imp',SimpleImputer(strategy='most_frequent')),('ohe',_make_ohe(5))]),cat))
            if not steps: continue
            pp=ColumnTransformer(steps, remainder='drop')
            xp=pp.fit_transform(xx)
            clf=LogisticRegression(max_iter=200)
            clf.fit(xp,y)
            p=clf.predict_proba(xp)[:,1]
            if roc_auc_score(y,p)>=0.98: bad.append(col)
        except Exception:
            pass
    if bad:
        print(f"üö® Potential leakage-like columns (AUC‚â•0.98): {bad}")
        X=X.drop(columns=bad, errors='ignore')
    return X, bad

# ---------------------------
# OOF SUPERBAG ‚Äî FREQUENCY
# ---------------------------
def oof_freq_superbag(prep, X, y, tag):
    Xp=prep.transform(X)
    y=np.asarray(y).astype(int)           # <-- FIX for NGBoost & scoring
    spw,cw=class_weights(y)

    builders=[('lgbm', lambda:lgbm_clf(cw)),
              ('xgb',  lambda:xgb_clf(spw))]
    if catboost is not None: builders.append(('cat', cat_clf))
    if ngboost  is not None: builders.append(('ngb', ngb_clf))

    oof_by_model={n:np.zeros(len(y)) for n,_ in [(n,b) for n,b in builders]}
    models_by_seed=[]; calibs_by_seed=[]; seed_oofs=[]; seed_thr_y=[]; seed_thr_c=[]

    for seed in BAG_SEEDS:
        skf=StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=seed)
        fold_models=[]; fold_calibs=[]; oof_seed=np.zeros(len(y))
        for fold,(tr,va) in enumerate(skf.split(Xp,y),1):
            Xtr,Xva=Xp[tr],Xp[va]; ytr,yva=y[tr],y[va]
            per=[]; pack={}; cals={}
            for name,mk in [(n,b) for n,b in builders]:
                m=mk()
                if name=='xgb': m=fit_xgb(m,Xtr,ytr,Xva,yva,True,'auc')
                elif name=='lgbm': m=fit_lgb(m,Xtr,ytr,Xva,yva)
                else: m.fit(Xtr,ytr)
                if hasattr(m,'predict_proba'): p=m.predict_proba(Xva)[:,1]
                else:
                    try: p=m.predict_proba(Xva)[:,1]
                    except: p=1/(1+np.exp(-m.predict(Xva)))
                iso=IsotonicRegression(out_of_bounds='clip'); iso.fit(p,yva)
                pcal=iso.transform(p)
                auc=roc_auc_score(yva,pcal)
                print(f"[{tag}:{name}] seed={seed} fold={fold} AUC={auc:.4f}")
                if auc>=0.99: raise RuntimeError(f"üõë ABORT: [{tag}:{name}] leak AUC={auc:.4f}")
                oof_by_model[name][va]=pcal; per.append(pcal)
                pack[name]=m; cals[name]=iso
            blend=np.mean(np.vstack(per),axis=0)
            oof_seed[va]=blend
            fold_models.append(pack); fold_calibs.append(cals)
        models_by_seed.append(fold_models); calibs_by_seed.append(fold_calibs)
        auc=roc_auc_score(y,oof_seed); ty=youden_thr(y,oof_seed); tc,_=best_cost_thr(y,oof_seed)
        print(f"[{tag}] seed={seed} OOF AUC={auc:.4f} | thr_y={ty:.4f} | thr_c={tc:.4f}")
        seed_oofs.append(oof_seed); seed_thr_y.append(ty); seed_thr_c.append(tc)

    base_names=[n for n,_ in [(n,b) for n,b in builders]]
    X_meta=np.vstack([oof_by_model[n] for n in base_names]).T
    meta=LogisticRegression(max_iter=1000); meta.fit(X_meta,y)
    oof_meta=meta.predict_proba(X_meta)[:,1]; auc_meta=roc_auc_score(y,oof_meta)

    oof_blend=np.mean(np.vstack(seed_oofs),axis=0); auc_blend=roc_auc_score(y,oof_blend)
    print(f"[{tag}] META AUC={auc_meta:.4f} | BAG BLEND AUC={auc_blend:.4f}")

    pd.DataFrame({**{f'oof_{n}':oof_by_model[n] for n in base_names},
                  'oof_blend':oof_blend,'oof_meta':oof_meta,'y':y}).to_csv(f"oof_{tag.replace('*_','')}_freq.csv",index=False)

    return {'builders':base_names,'models_by_seed':models_by_seed,'calibs_by_seed':calibs_by_seed,
            'meta':meta,'auc_oof_meta':float(auc_meta),'auc_oof_blend':float(auc_blend),
            'thr_youden':float(np.mean(seed_thr_y)),'thr_cost':float(np.mean(seed_thr_c))}

def predict_freq(bundle, prep, X):
    Xp=prep.transform(X)
    names=bundle['builders']; seed_preds=[]
    for si,fold_models in enumerate(bundle['models_by_seed']):
        fold_pred=np.zeros(Xp.shape[0]); nfold=0
        for fold_pack, fold_cals in zip(fold_models, bundle['calibs_by_seed'][si]):
            per=[]
            for n in names:
                m=fold_pack[n]
                if hasattr(m,'predict_proba'): p=m.predict_proba(Xp)[:,1]
                else:
                    try: p=m.predict_proba(Xp)[:,1]
                    except: p=1/(1+np.exp(-m.predict(Xp)))
                per.append(fold_cals[n].transform(p))
            fold_pred += np.mean(np.vstack(per),axis=0); nfold+=1
        seed_preds.append(fold_pred/max(1,nfold))
    blend=np.mean(np.vstack(seed_preds),axis=0)

    # meta features from first seed+fold for stability
    try:
        pack0=bundle['models_by_seed'][0][0]; cal0=bundle['calibs_by_seed'][0][0]
        feats=[]
        for n in names:
            m=pack0[n]
            if hasattr(m,'predict_proba'): p=m.predict_proba(Xp)[:,1]
            else:
                try: p=m.predict_proba(Xp)[:,1]
                except: p=1/(1+np.exp(-m.predict(Xp)))
            feats.append(cal0[n].transform(p))
        Xmeta=np.vstack(feats).T
        pmeta=bundle['meta'].predict_proba(Xmeta)[:,1]
        prob=0.5*blend+0.5*pmeta
    except Exception:
        prob=blend
    return np.clip(prob,0.0,1.0)

# ---------------------------
# OOF SUPERBAG ‚Äî SEVERITY
# ---------------------------
def oof_sev_superbag(prep, X, ylog, tag):
    Xp=prep.transform(X)
    builders=[('lgb_tweedie', lgbm_reg_tweedie),
              ('xgb_phuber',  xgb_reg_phuber),
              ('lgb_linear',  lgbm_reg_linear)]
    if catboost is not None: builders.append(('cat', cat_reg))
    if ngboost  is not None: builders.append(('ngb', ngb_reg))

    seed_packs=[]; all_oofs=[]
    for seed in BAG_SEEDS:
        kf=KFold(n_splits=N_FOLDS, shuffle=True, random_state=seed)
        oof_stack=np.zeros((len(ylog), len(builders)))
        fold_models=[]
        for fold,(tr,va) in enumerate(kf.split(Xp),1):
            Xtr,Xva=Xp[tr],Xp[va]; ytr,yva=ylog[tr],ylog[va]
            models=[]
            for j,(name,mk) in enumerate(builders):
                m=mk()
                if isinstance(m, xgb.XGBRegressor): m=fit_xgb(m,Xtr,ytr,Xva,yva,False,'rmse')
                elif isinstance(m, lgb.LGBMRegressor): m=fit_lgb(m,Xtr,ytr,Xva,yva)
                else: m.fit(Xtr,ytr)
                oof_stack[va,j]=m.predict(Xva)
                models.append(m)
            fold_models.append(models)
        meta=Ridge(alpha=1.0, random_state=RANDOM_STATE); meta.fit(oof_stack,ylog)
        oof_meta=meta.predict(oof_stack); print(f"[{tag}] seed={seed} OOF RMSE(log)={rmse(ylog,oof_meta):.4f}")
        seed_packs.append({'models_by_fold':fold_models,'meta':meta}); all_oofs.append(oof_meta)

    bag_rmse=float(np.mean([rmse(ylog,o) for o in all_oofs])); print(f"[{tag}] BAG OOF RMSE(log)={bag_rmse:.4f}")
    pd.DataFrame({'oof_meta_bag':np.mean(np.vstack(all_oofs),axis=0),'y_log':ylog}).to_csv(f"oof_{tag.replace('*_','')}_sev.csv",index=False)
    return {'builders':[n for n,_ in builders],'seeds':seed_packs,'rmse_oof_log':bag_rmse}

def predict_sev(bundle, prep, X):
    Xp=prep.transform(X); names=bundle['builders']; out=np.zeros(Xp.shape[0])
    for seed_pack in bundle['seeds']:
        fold_models=seed_pack['models_by_fold']; meta=seed_pack['meta']
        base=np.zeros((Xp.shape[0], len(names)))
        for models in fold_models:
            for j,m in enumerate(models): base[:,j]+=m.predict(Xp)
        base/=max(1,len(fold_models)); out+=meta.predict(base)
    out/=max(1,len(bundle['seeds']))
    sev=np.expm1(out); return np.maximum(1.0, sev)

def stabilize(sev, shrink=0.15, cap_q=0.995):
    med=np.nanmedian(sev); s=(1-shrink)*sev+shrink*med; hi=np.nanquantile(s,cap_q)
    return np.clip(s,1.0,hi)

# ---------------------------
# MAIN
# ---------------------------
if __name__ == "__main__":
    print("="*60); print("=== PHASE 1: LEAK-PROOF DATA PREP (FREQUENCY-SEVERITY) ==="); print("="*60)
    claims = load_csv(os.path.join(TRAIN_DIR,'claims_history.csv'))
    feat_tr = create_feature_master(TRAIN_DIR)
    targets = build_targets(feat_tr, claims)
    X_tr, y = build_Xy(feat_tr, targets)

    # quick leakage probe using 'pet' as target
    X_tr, dropped = leakage_scan_drop(X_tr, y['pet'])
    if dropped: print(f"‚úÖ Dropped suspicious columns: {dropped}")

    pre = get_preprocessor(X_tr); pre.fit(X_tr)
    del claims, targets; gc.collect()

    print("\n"+"="*60); print("=== PHASE 2: SUPER ENSEMBLE OOF TRAINING (CPU) ==="); print("="*60)
    bundles={}; metrics={}
    for disp,key in [('Auto','auto'),('Renters','renters'),('Pet','pet')]:
        print(f"\n--- {disp} ---")
        y_freq = y[key].astype(float); print(f"  -> P* positives: {int(y_freq.sum())}/{len(y_freq)}")
        freq_b = oof_freq_superbag(pre, X_tr, y_freq, f"P*_{disp}")
        bundles[f'p_{key}']=freq_b
        metrics[f'auc_oof_p_{key}_blend']=freq_b['auc_oof_blend']
        metrics[f'auc_oof_p_{key}_meta']=freq_b['auc_oof_meta']
        metrics[f'thr_youden_p_{key}']=freq_b['thr_youden']
        metrics[f'thr_cost_p_{key}']=freq_b['thr_cost']

        mask=(y_freq==1.0); print(f"  -> SEV* training size: {int(mask.sum())}")
        if mask.sum()>N_FOLDS:
            sev_b=oof_sev_superbag(pre, X_tr.loc[mask], y[f'sev_log_{key}'][mask], f"SEV*_{disp}")
        else:
            class Dummy:
                def predict(self, X): return np.zeros(X.shape[0])
            sev_b={'builders':[], 'seeds':[{'models_by_fold':[], 'meta':Dummy()}], 'rmse_oof_log':0.0}
        bundles[f'sev_{key}']=sev_b
        metrics[f'rmse_oof_log_sev_{key}']=sev_b['rmse_oof_log']

    joblib.dump(bundles,'super_ensemble_models.joblib')
    joblib.dump(pre,'super_preprocessor.joblib')
    with open('super_metrics.json','w') as f: json.dump(metrics,f,indent=2)
    print("\n‚úÖ Saved super ensemble models, preprocessor, and OOF metrics.")
    print("OOF metrics:", json.dumps(metrics, indent=2))

    print("\n"+"="*60); print("=== PHASE 3: PREDICT & SUBMIT ==="); print("="*60)
    feat_te=create_feature_master(TEST_DIR)
    if 'user_id' not in feat_te.columns: raise ValueError("user_id missing in test data")
    X_te = feat_te.drop(columns=['user_id'], errors='ignore').reindex(columns=X_tr.columns, fill_value=np.nan)

    sub=pd.DataFrame({'user_id':feat_te['user_id']})
    for key in ['auto','renters','pet']:
        p = predict_freq(bundles[f'p_{key}'], pre, X_te)
        sub[f'p_{key}']=np.clip(p,0.001,0.999)
        sraw = predict_sev(bundles[f'sev_{key}'], pre, X_te)
        sub[f'sev_{key}']=stabilize(sraw, shrink=0.15, cap_q=0.995)

    cols=['user_id','p_auto','p_renters','p_pet','sev_auto','sev_renters','sev_pet']
    sub[cols].to_csv('submission_predictions_super.csv', index=False)
    print("\n‚úÖ Created 'submission_predictions_super.csv'. Head:\n", sub[cols].head())
    print("\nüéØ Done. Leak-proof, calibrated, bagged, stacked ‚Äî with safe fallbacks.")


=== PHASE 1: LEAK-PROOF DATA PREP (FREQUENCY-SEVERITY) ===
Loading data from /content/dataset/dataset/train...

=== PHASE 2: SUPER ENSEMBLE OOF TRAINING (CPU) ===

--- Auto ---
  -> P* positives: 499/10000
[P*_Auto:lgbm] seed=42 fold=1 AUC=0.8673
[P*_Auto:xgb] seed=42 fold=1 AUC=0.8460
[P*_Auto:cat] seed=42 fold=1 AUC=0.8508
[P*_Auto:ngb] seed=42 fold=1 AUC=0.8599
[P*_Auto:lgbm] seed=42 fold=2 AUC=0.8770
[P*_Auto:xgb] seed=42 fold=2 AUC=0.8329
[P*_Auto:cat] seed=42 fold=2 AUC=0.8420
[P*_Auto:ngb] seed=42 fold=2 AUC=0.8597
[P*_Auto:lgbm] seed=42 fold=3 AUC=0.8565
[P*_Auto:xgb] seed=42 fold=3 AUC=0.8548
[P*_Auto:cat] seed=42 fold=3 AUC=0.8614
[P*_Auto:ngb] seed=42 fold=3 AUC=0.8603
[P*_Auto:lgbm] seed=42 fold=4 AUC=0.8530
[P*_Auto:xgb] seed=42 fold=4 AUC=0.8351
[P*_Auto:cat] seed=42 fold=4 AUC=0.8399
[P*_Auto:ngb] seed=42 fold=4 AUC=0.8428
[P*_Auto:lgbm] seed=42 fold=5 AUC=0.8992
[P*_Auto:xgb] seed=42 fold=5 AUC=0.8766
[P*_Auto:cat] seed=42 fold=5 AUC=0.8685
[P*_Auto:ngb] seed=42 fold=5 

KeyboardInterrupt: 

In [None]:
# ==== RESUME WHAT'S MISSING ‚Üí SAVE ‚Üí PREDICT ====
import os, json, joblib, numpy as np, pandas as pd

# 0) Try to load partial artifacts if not in RAM
if 'bundles' not in globals():
    if os.path.exists('super_ensemble_models_PARTIAL.joblib'):
        bundles = joblib.load('super_ensemble_models_PARTIAL.joblib')
        print("‚ÑπÔ∏è Loaded bundles from PARTIAL checkpoint.")
    elif os.path.exists('super_ensemble_models.joblib'):
        bundles = joblib.load('super_ensemble_models.joblib')
        print("‚ÑπÔ∏è Loaded bundles from full checkpoint.")
    else:
        bundles = {}

if 'preprocessor' not in globals():
    if os.path.exists('super_preprocessor_PARTIAL.joblib'):
        preprocessor = joblib.load('super_preprocessor_PARTIAL.joblib')
        print("‚ÑπÔ∏è Loaded preprocessor from PARTIAL checkpoint.")
    elif os.path.exists('super_preprocessor.joblib'):
        preprocessor = joblib.load('super_preprocessor.joblib')
        print("‚ÑπÔ∏è Loaded preprocessor from full checkpoint.")

if 'metrics' not in globals():
    metrics = {}
    for p in ['super_metrics_PARTIAL.json','super_metrics.json']:
        if os.path.exists(p):
            try:
                metrics.update(json.load(open(p)))
                print(f"‚ÑπÔ∏è Loaded metrics from {p}")
                break
            except Exception:
                pass

# 1) Ensure train matrices exist (quick rebuild if needed)
need_data = any(v not in globals() for v in ['X_train_full','y_pack'])
if need_data:
    claims_history = pd.read_csv(os.path.join(TRAIN_DIR, 'claims_history.csv'), encoding='latin-1')
    feat_master    = create_feature_master(TRAIN_DIR)
    targets        = build_targets_from_claims(feat_master, claims_history)
    X_train_full, y_pack = build_Xy(feat_master, targets)
    if 'preprocessor' not in globals():
        preprocessor = get_preprocessor(X_train_full)
        preprocessor.fit(X_train_full)
    print("‚úÖ Rebuilt train features and targets.")

# 2) Train ONLY what is missing
def _has(key): return key in bundles and bundles[key]

todo = []
for k in ['p_auto','sev_auto','p_renters','sev_renters','p_pet','sev_pet']:
    if not _has(k): todo.append(k)
print("‚ñ∂Ô∏è To train:", todo if todo else "nothing (all done)")

if todo:
    # FREQUENCY
    if 'p_auto' in todo:
        y = y_pack['auto'].astype(float)
        bundles['p_auto'] = oof_frequency_superbag(preprocessor, X_train_full, y, "P*_Auto")
        metrics['auc_oof_p_auto_blend'] = bundles['p_auto']['auc_oof_blend']
        metrics['auc_oof_p_auto_meta']  = bundles['p_auto']['auc_oof_meta']
    if 'p_renters' in todo:
        y = y_pack['renters'].astype(float)
        bundles['p_renters'] = oof_frequency_superbag(preprocessor, X_train_full, y, "P*_Renters")
        metrics['auc_oof_p_renters_blend'] = bundles['p_renters']['auc_oof_blend']
        metrics['auc_oof_p_renters_meta']  = bundles['p_renters']['auc_oof_meta']
    if 'p_pet' in todo:
        y = y_pack['pet'].astype(float)
        bundles['p_pet'] = oof_frequency_superbag(preprocessor, X_train_full, y, "P*_Pet")
        metrics['auc_oof_p_pet_blend'] = bundles['p_pet']['auc_oof_blend']
        metrics['auc_oof_p_pet_meta']  = bundles['p_pet']['auc_oof_meta']

    # SEVERITY (train only on claimers)
    if 'sev_auto' in todo:
        yfreq = y_pack['auto'].astype(float)
        mask = (yfreq == 1.0)
        ylog = y_pack['sev_log_auto'][mask]
        bundles['sev_auto'] = oof_severity_superbag(preprocessor, X_train_full.loc[mask], ylog, "SEV*_Auto")
        metrics['rmse_oof_log_sev_auto'] = bundles['sev_auto']['rmse_oof_log']
    if 'sev_renters' in todo:
        yfreq = y_pack['renters'].astype(float)
        mask = (yfreq == 1.0)
        ylog = y_pack['sev_log_renters'][mask]
        bundles['sev_renters'] = oof_severity_superbag(preprocessor, X_train_full.loc[mask], ylog, "SEV*_Renters")
        metrics['rmse_oof_log_sev_renters'] = bundles['sev_renters']['rmse_oof_log']
    if 'sev_pet' in todo:
        yfreq = y_pack['pet'].astype(float)
        mask = (yfreq == 1.0)
        ylog = y_pack['sev_log_pet'][mask]
        bundles['sev_pet'] = oof_severity_superbag(preprocessor, X_train_full.loc[mask], ylog, "SEV*_Pet")
        metrics['rmse_oof_log_sev_pet'] = bundles['sev_pet']['rmse_oof_log']

    # Save full artifacts
    joblib.dump(bundles, 'super_ensemble_models.joblib')
    joblib.dump(preprocessor, 'super_preprocessor.joblib')
    json.dump(metrics, open('super_metrics.json','w'), indent=2)
    # Mirror to Drive if mounted
    try:
        os.makedirs('/content/drive/MyDrive/model_backups/FINAL', exist_ok=True)
        joblib.dump(bundles, '/content/drive/MyDrive/model_backups/FINAL/super_ensemble_models.joblib')
        joblib.dump(preprocessor, '/content/drive/MyDrive/model_backups/FINAL/super_preprocessor.joblib')
        json.dump(metrics, open('/content/drive/MyDrive/model_backups/FINAL/super_metrics.json','w'), indent=2)
        print("üíæ Saved locally and to Drive/FINAL.")
    except Exception as e:
        print("Saved locally. Drive mirror skipped:", e)
else:
    print("‚úÖ Nothing missing; using existing artifacts.")

# 3) Predict & submit (Phase 3)
feat_test = create_feature_master(TEST_DIR)
X_test = feat_test.drop(columns=['user_id'], errors='ignore').reindex(
    columns=X_train_full.columns, fill_value=np.nan
)

sub = pd.DataFrame({'user_id': feat_test['user_id']})
for key in ['auto','renters','pet']:
    prob = predict_frequency_superbag(bundles[f'p_{key}'], preprocessor, X_test)
    sub[f'p_{key}'] = np.clip(prob, 0.001, 0.999)

    sev_raw = predict_severity_superbag(bundles[f'sev_{key}'], preprocessor, X_test)
    sub[f'sev_{key}'] = stabilize_severity(sev_raw, shrink=0.15, cap_quantile=0.995)

cols = ['user_id','p_auto','p_renters','p_pet','sev_auto','sev_renters','sev_pet']
sub[cols].to_csv('submission_predictions_super.csv', index=False)
print("‚úÖ Created 'submission_predictions_super.csv'. Preview:")
print(sub[cols].head())


‚ñ∂Ô∏è To train: ['sev_pet']
