# Model training and evaluation

In [None]:
import time
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from loguru import logger
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from tqdm.notebook import tqdm
from xgboost import XGBRegressor

from nomination_predictor.config import MODELS_DIR, PROCESSED_DATA_DIR

sns.set_theme(style="whitegrid")

In [None]:
df = pd.read_csv(PROCESSED_DATA_DIR/"processed.csv")

# Choose features

## target variable

In [None]:
TARGET = "days_nom_to_conf"

# pick target and drop Y label targets from features
y = df[TARGET]
X = df.drop(columns=[TARGET, "days_nom_to_latest_action"])  # other target saved for later tries at modeling

## numeric features

In [None]:
NUMERIC_FEATURES = [
    "actions_count",
    "age_at_nom_days",
    "birth_year", 
    "committees_count",
    "congress_num", 
    "days_into_pres_term",
    "days_nom_to_deceased",
    "days_to_next_midterm_election",
    "days_to_next_pres_election",
    "death_year", 
    "degree_year", 
    "education_sequence", 
    "fed_service_sequence", 
    "highest_degree_level",
    "professional_career_sequence",
    "record_vote_number",   
    "service_as_chief_judge,_begin", 
    "service_as_chief_judge,_end",
]

# boolean features

In [None]:
BOOLEAN_FEATURES = [
    "pres_term_is_latter_term", 
    "statute_authorized_new_seat_bool",
]

# categorical features

In [None]:
CATEGORICAL_FEATURES  = [
    "aba_rating", 
    "appointing_president",
    "congress_session",
    "court_type",
    "seat_level", 
    "birth_state",
    "latestaction_is_div_opp_house",
    "latestaction_is_div_opp_senate",
    "latestaction_is_fully_div",
    "latestaction_is_unified",
    "nomination_vacancy_reason",
    "nomination_of_or_from_location",
    "nomination_to_position_title",
    "nomination_to_court_name",
    "nomination_term_info",
    "nomination_predecessor_name",
    "nomination_is_list_nomination",
    "nomination_parsing_confidence",
    "nomination_multiple_nominees_count",
    "nominees_0_organization",
    "nominees_0_state",
    "party_of_appointing_president",
    "race_or_ethnicity",
    "school",
    "seat_id_letters_only",
    "senate_vote_type",
    
]

In [23]:
df["nomination_term_info"].dropna().head(50)

42          expiring fifteen years after he takes office
58          expiring fifteen years after he takes office
59          expiring fifteen years after he takes office
134    fifteen years after she takes office. (Reappoi...
138                  fifteen years after he takes office
168                                        fifteen years
178                                        fifteen years
202                 fifteen years after she takes office
235    expiring fifteen years after he takes office. ...
283         expiring fifteen years after he takes office
292         expiring fifteen years after he takes office
297                                        fifteen years
309            ten years vice David V. O'Brien, deceased
373                  fifteen years, vice Sarah L. Wilson
374                                        fifteen years
375                       fifteen years. (Reappointment)
377                                        fifteen years
378                            

In [None]:
from nomination_predictor.modeling.train import validate_feature_lists

are_features_unique = validate_feature_lists(NUMERIC_FEATURES, BOOLEAN_FEATURES, CATEGORICAL_FEATURES)

if not are_features_unique:
    raise ValueError("Feature lists contain duplicates. Please fix before continuing.")

✅ All features are unique across feature type lists
ℹ️ Total unique features: 38


In [None]:
if len([col for col in NUMERIC_FEATURES if col not in df.columns]) > 0:
    logger.warning("The following columns in NUMERIC_FEATURES are absent from the df: {}".format(
        [col for col in NUMERIC_FEATURES if col not in df.columns],
    ))
if len([col for col in BOOLEAN_FEATURES if col not in df.columns]) > 0:
    logger.warning("The following columns in BOOLEAN_FEATURES are absent from the df: {}".format(
        [col for col in BOOLEAN_FEATURES if col not in df.columns],
    ))
if len([col for col in CATEGORICAL_FEATURES if col not in df.columns]) > 0:
    logger.warning("The following columns in CATEGORICAL_FEATURES are absent from the df: {}".format(
        [col for col in CATEGORICAL_FEATURES if col not in df.columns],
    ))



In [None]:
cat_cols = df.select_dtypes("object").columns.tolist()
num_cols = [
    c for c in df.select_dtypes("number").columns
    if c not in {TARGET}
]

df_model = df[df[TARGET].notna()].copy()
X = df_model[BOOLEAN_FEATURES + CATEGORICAL_FEATURES + NUMERIC_FEATURES]
y = df_model[TARGET]

# splitting training and testing data

In [None]:

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=42, stratify=df_model["seat_level"]
)

# Model Selection, Training, and Evaluation

##  Preprocessing pipeline setup

In [None]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), NUMERIC_FEATURES),
        ('cat', OneHotEncoder(handle_unknown='ignore'), CATEGORICAL_FEATURES)
    ])

In [None]:
class ProgressXGBRegressor(XGBRegressor):
    """Wrapper to be able to get a progress bar"""
    def fit(self, X, y, *args, **kwargs):
        # Print start message
        logger.info(f"Starting XGBoost training with {self.n_estimators} trees...")
        start_time = time.time()
        
        # Fit the model
        result = super().fit(X, y, *args, **kwargs)
        
        # Print completion message with timing
        elapsed = time.time() - start_time
        logger.info(f"XGBoost training completed in {elapsed:.2f} seconds")
        return result

In [None]:
NUM_ESTIMATORS = 1000
LEARNING_RATE = 0.1
MAX_DEPTH = 6

# Create the pipeline with preprocessing and model
pipeline_speedy = Pipeline([
    ('preprocessor', preprocessor),
    ('model', ProgressXGBRegressor(
        n_estimators=NUM_ESTIMATORS,
        learning_rate=LEARNING_RATE,
        max_depth=MAX_DEPTH,
        verbosity=1,  # This provides some built-in progress logging
        # Add other XGBoost parameters as needed
    ))
])

In [None]:
# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

## Training

In [None]:
from nomination_predictor.modeling.train import train_model

pipeline = train_model(pipeline_speedy, X_train, y_train)

[32m2025-07-17 10:22:19.784[0m | [1mINFO    [0m | [36mnomination_predictor.modeling.train[0m:[36mtrain_model[0m:[36m84[0m - [1mTraining model on 1129 samples, 38 features[0m


Training Pipeline:   0%|          | 0/1 [00:00<?, ?it/s]

[32m2025-07-17 10:22:19.811[0m | [1mINFO    [0m | [36m__main__[0m:[36mfit[0m:[36m5[0m - [1mStarting XGBoost training with 300 trees...[0m


Training Pipeline: 100%|██████████| 1/1 [00:01<00:00,  1.11s/it]

[32m2025-07-17 10:22:20.893[0m | [1mINFO    [0m | [36m__main__[0m:[36mfit[0m:[36m13[0m - [1mXGBoost training completed in 1.08 seconds[0m
[32m2025-07-17 10:22:20.900[0m | [1mINFO    [0m | [36mnomination_predictor.modeling.train[0m:[36mtrain_model[0m:[36m88[0m - [1mModel training completed[0m





## Prediction

In [None]:
from nomination_predictor.modeling.predict import predict_model

y_pred = predict_model(pipeline, X_test)

[32m2025-07-17 10:22:20.911[0m | [1mINFO    [0m | [36mnomination_predictor.modeling.predict[0m:[36mpredict_model[0m:[36m18[0m - [1mPredicting using model on 283 samples, 38 features[0m


Predicting: 100%|██████████| 1/1 [00:00<00:00, 52.84it/s]

[32m2025-07-17 10:22:20.934[0m | [1mINFO    [0m | [36mnomination_predictor.modeling.predict[0m:[36mpredict_model[0m:[36m22[0m - [1mPrediction completed[0m





## Evaluation

### Choice of metric: MAE

Mean Squared Error (MSE) and Root Mean Squared Error (RMSE) are both sensitive to outliers, and MSE also doesn't use the same units as our target variable, making it less intuitive.

After all our data cleaning, we have few-enough rows of data, with outliers occurring often-enough, that I'm selecting Mean Absolute Error (MAE) as our metric.

In [None]:
from sklearn.metrics import mean_absolute_error, r2_score

from nomination_predictor.modeling.predict import interpret_results

mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
logger.info(f"Model evaluation - MAE: {mae:.2f}, R²: {r2:.4f}")

interpret_results(mae, r2, y_train)

[32m2025-07-17 10:22:20.945[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m7[0m - [1mModel evaluation - MAE: 40.12, R²: 0.4474[0m

===== Mean Absolute Error (MAE): 40.12 =====
📊 GOOD: The model's predictions are typically within 60 days of the actual confirmation time.
🔍 TAKEAWAY: The model provides valuable insights but has moderate error margins.

===== R² Score: 0.4474 =====
📊 FAIR: The model explains between 30-50% of the variance in confirmation times.
🔍 TAKEAWAY: The model identifies some patterns but misses many important factors.

===== Interpretation in Context =====
• The average nomination takes 111 days to confirm
• With a standard deviation of 85 days
• Our model's error (MAE) is 40 days, which is 47% of the standard deviation
• This means our model outperforms a baseline model that always predicts the average

===== Recommended Next Steps =====
1. Tune hyperparameters to optimize model performance
2. Explore feature importance to understand key driv

# Saving the trained model

In [None]:
from nomination_predictor.modeling.train import save_model_with_metadata

model_file = save_model_with_metadata(
    pipeline, 
    "xgboost_regression",
    metadata={
        'description': 'XGBoost regression model for nomination confirmation time prediction',
        'parameters': {
            'n_estimators': NUM_ESTIMATORS,
            'learning_rate': LEARNING_RATE,
            'max_depth': MAX_DEPTH
        }
    },
    X_train=X_train,
    y_train=y_train,
    mae=mae,
    r2=r2
)

[32m2025-07-17 10:22:20.968[0m | [1mINFO    [0m | [36mnomination_predictor.modeling.train[0m:[36msave_model_with_metadata[0m:[36m120[0m - [1mModel saved to /home/wsl2ubuntuuser/nomination_predictor/models/xgboost_regression_2025-07-17_102220.pkl with metadata[0m


# Model tuning via randomized hyper-parameter search

In [None]:
import pathlib
import pickle
import time
from datetime import datetime

from scipy.stats import loguniform, randint
from sklearn.model_selection import RandomizedSearchCV
from xgboost import XGBRegressor

# --- reuse the `preprocessor`, `cat_cols`, `num_cols`, X_train, y_train
#     created in earlier notebook cells ---------------------------------

RANDOMIZED_SEARCH_N_ITER = 120

base_xgb = XGBRegressor(
    objective="reg:squarederror",
    tree_method="hist",   # leave algorithm choice to 'hist'
    device="cuda",        # run on GPU
    n_jobs=-1,
    random_state=42,
)

pipeline = Pipeline([("preprocessor", preprocessor), ("model", base_xgb)])

param_dist = {
    "model__n_estimators"     : randint(1500, 6000),
    "model__learning_rate"    : loguniform(0.01, 0.05),
    "model__max_depth"        : randint(4, 10),
    "model__min_child_weight" : randint(1, 10),
    "model__subsample"        : loguniform(0.7, 1.0),
    "model__colsample_bytree" : loguniform(0.7, 1.0),
    "model__gamma"            : loguniform(1e-3, 0.3),
    "model__reg_alpha"        : loguniform(1e-3, 1.0),
    "model__reg_lambda"       : loguniform(0.5, 5.0),
}

search = RandomizedSearchCV(
    pipeline,
    param_distributions=param_dist,
    n_iter=RANDOMIZED_SEARCH_N_ITER,                 # was told n_iter=80 would take ≈5–6 h on 8‑core CPU
    cv=3,
    scoring="neg_mean_absolute_error",
    verbose=2,
    n_jobs=1,                  # let each trial use all cores
    random_state=42,
)

start = time.time()
search.fit(X_train, y_train)   # ← no eval_set passed through pipeline
logger.info(f"Search finished in {(time.time()-start)/3600:.2f} hours")

print("Best MAE (CV):", -search.best_score_)
print("Best params  :", search.best_params_)

# save the tuned pipeline
best_model = search.best_estimator_
pathlib.Path("../models").mkdir(exist_ok=True)
timestamp = datetime.now().strftime("%Y-%m-%d_%H%M%S")
with open(f"../models/xgb_randomsearch_best_{timestamp}.pkl", "wb") as f:
    pickle.dump(best_model, f)

logger.info(f"Saved model → ../models/xgb_randomsearch_best_{timestamp}.pkl")

Fitting 3 folds for each of 120 candidates, totalling 360 fits


Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.


  return func(**kwargs)


[CV] END model__colsample_bytree=0.7265115238644736, model__gamma=0.22648248189516848, model__learning_rate=0.07259248719561363, model__max_depth=8, model__min_child_weight=5, model__n_estimators=2638, model__reg_alpha=0.02175195311877764, model__reg_lambda=0.6294263501482736, model__subsample=0.7586387864371679; total time=  15.4s
[CV] END model__colsample_bytree=0.7265115238644736, model__gamma=0.22648248189516848, model__learning_rate=0.07259248719561363, model__max_depth=8, model__min_child_weight=5, model__n_estimators=2638, model__reg_alpha=0.02175195311877764, model__reg_lambda=0.6294263501482736, model__subsample=0.7586387864371679; total time=  13.8s
[CV] END model__colsample_bytree=0.7265115238644736, model__gamma=0.22648248189516848, model__learning_rate=0.07259248719561363, model__max_depth=8, model__min_child_weight=5, model__n_estimators=2638, model__reg_alpha=0.02175195311877764, model__reg_lambda=0.6294263501482736, model__subsample=0.7586387864371679; total time=  29.0

In [None]:
y_pred = predict_model(pipeline, X_test)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
interpret_results(mae, r2, y_train)

model_file = save_model_with_metadata(
    pipeline, 
    "xgboost_regression_randomsearch",
    metadata={
        'description': 'XGBoost regression model for nomination confirmation time prediction, tuned with random search',
        'parameters': search.best_params_,
        },
    X_train=X_train,
    y_train=y_train,
    mae=mae,
    r2=r2
)

[32m2025-07-17 11:52:30.218[0m | [1mINFO    [0m | [36mnomination_predictor.modeling.predict[0m:[36mpredict_model[0m:[36m18[0m - [1mPredicting using model on 283 samples, 38 features[0m


Predicting:   0%|          | 0/1 [00:00<?, ?it/s]


NotFittedError: Pipeline is not fitted yet.