# Model training and evaluation

In [None]:
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from nomination_predictor.config import MODELS_DIR, PROCESSED_DATA_DIR

# Filesystem locations derived from the project configuration:
# the processed CSV used as model input and the models location.
MODEL_PATH = Path(MODELS_DIR)
DATA_PATH = Path(PROCESSED_DATA_DIR / "processed.csv")

# Global seaborn theme applied to any plots drawn later in the notebook.
sns.set_theme(style="whitegrid")

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from xgboost import XGBRegressor

# Choose features

In [None]:

## (disabled draft) pick the target; drop it and the other target columns from the features
#y = df["days_vac_to_nom"]
#X = df.drop(columns=["days_vac_to_nom",
#                     "days_nom_to_latest_action",     # other targets saved for later tasks
#                     "days_vac_to_conf"])
#
## --------  column groups ------------
#num_cols = [
#    "age_at_nom_days", "days_into_pres_term",
#    "days_to_next_pres_elec", "days_to_next_mid_elec",
#    "congress_num", "years_private_practice",
#    "highest_degree_level"
#] + [c for c in df.columns if c.startswith("years_")]
#
#bool_cols = [c for c in df.columns if c.startswith("has_")] + ["partisan_mismatch"]
#
#cat_cols = [
#    "pres_term_idx", "congress_session",
#    "seat_level", "vacancy_reason_cat",
#    "pres_party", "party_of_appointing_president"
#]
#

In [None]:
# Column predicted in this notebook.
TARGET = "days_vac_to_nom"

# Other prediction targets (kept for later tasks). They must never be used
# as features here: doing so would leak outcome information into the model.
OTHER_TARGETS = {"days_nom_to_latest_action", "days_vac_to_conf"}

# Load the processed dataset; no earlier cell defines `df`.
df = pd.read_csv(DATA_PATH)

# Features are picked by dtype: object columns are treated as categoricals,
# numeric columns are used as-is, minus every target column.
non_feature_cols = {TARGET, *OTHER_TARGETS}
cat_cols = df.select_dtypes("object").columns.tolist()
num_cols = [
    c for c in df.select_dtypes("number").columns
    if c not in non_feature_cols
]

# Rows without a target value cannot be used for supervised training.
df_model = df[df[TARGET].notna()].copy()
X = df_model[cat_cols + num_cols]
y = df_model[TARGET]

# 80/20 hold-out split, stratified on seat_level and seeded for reproducibility.
# NOTE(review): stratification requires seat_level to be non-missing and every
# level to occur at least twice in df_model — confirm against the data.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=42, stratify=df_model["seat_level"]
)

#  Preprocessing & model

In [None]:

# Categorical columns are one-hot encoded into a sparse matrix; categories
# not seen during fit are ignored rather than raising. Numeric columns are
# forwarded unchanged.
encoder = OneHotEncoder(handle_unknown="ignore", sparse_output=True)
pre = ColumnTransformer(
    transformers=[
        ("cat", encoder, cat_cols),
        ("num", "passthrough", num_cols),
    ]
)

# Gradient-boosted regressor settings.
xgb_params = {
    # Kept low for a quick first run; can be doubled to 600 if better
    # performance is worth the longer training time.
    "n_estimators": 300,
    "learning_rate": 0.05,
    "max_depth": 6,
    "subsample": 0.8,
    "colsample_bytree": 0.8,
    "objective": "reg:squarederror",
    "n_jobs": -1,
    "random_state": 42,
}
xgb = XGBRegressor(**xgb_params)

# Chain preprocessing and the regressor, then fit on the training split.
pipe = Pipeline(steps=[("prep", pre), ("model", xgb)])
pipe.fit(X_train, y_train)

# Prediction with trained model

In [None]:

# Score the fitted pipeline on the held-out split.
pred = pipe.predict(X_test)

# Use the built-in round() rather than the .round() method: recent
# scikit-learn releases return a plain Python float from
# mean_absolute_error, and float has no .round() attribute. round() works
# for both NumPy scalars and built-in floats.
mae = mean_absolute_error(y_test, pred)
print("MAE on hold‑out:", round(mae, 2))