# Model training and evaluation

In [None]:
import time
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from loguru import logger
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from tqdm.notebook import tqdm
from xgboost import XGBRegressor

from nomination_predictor.config import MODELS_DIR, PROCESSED_DATA_DIR

# Apply seaborn's "whitegrid" theme; it affects plots drawn after this call.
sns.set_theme(style="whitegrid")

In [None]:
# Load the processed dataset that every later cell reads from.
processed_csv_path = PROCESSED_DATA_DIR / "processed.csv"
df = pd.read_csv(processed_csv_path)

# Choose features

## target variable

In [None]:
TARGET = "days_nom_to_conf"

# Separate the label column from the candidate feature columns.
# "days_nom_to_latest_action" is an alternative target saved for later modeling attempts,
# so it is removed from the features together with TARGET.
# (X and y are rebuilt from the explicit feature lists in a later cell.)
non_feature_columns = [TARGET, "days_nom_to_latest_action"]
X = df.drop(columns=non_feature_columns)
y = df[TARGET]

## numeric features

In [None]:
# Columns handed to the numeric branch of the preprocessing step.
# NOTE(review): some names (e.g. "actions_count", "record_vote_number", "death_year",
# "days_nom_to_deceased") look like values that may only be known after confirmation —
# confirm they do not leak the target.
NUMERIC_FEATURES = [
    "actions_count",
    "age_at_nom_days",
    "birth_year",
    "committees_count",
    "congress_num",
    "days_into_pres_term",
    "days_nom_to_deceased",
    "days_to_next_midterm_election",
    "days_to_next_pres_election",
    "death_year",
    "degree_year",
    "education_sequence",
    "fed_service_sequence",
    "highest_degree_level",
    "professional_career_sequence",
    "record_vote_number",
    "service_as_chief_judge,_begin",  # the comma is part of the column name
    "service_as_chief_judge,_end",
]

## boolean features

In [None]:
# True/False flag columns; they are selected into X together with the other feature lists.
BOOLEAN_FEATURES = [
    "pres_term_is_latter_term",
    "statute_authorized_new_seat_bool",
]

## categorical features

In [None]:
# Columns handed to the one-hot-encoding branch of the preprocessing step.
CATEGORICAL_FEATURES = [
    "aba_rating",
    "appointing_president",
    "congress_session",
    "court_type",
    "birth_state",
    "latestaction_is_div_opp_house",
    "latestaction_is_div_opp_senate",
    "latestaction_is_fully_div",
    "latestaction_is_unified",
    "nomination_vacancy_reason",
    "nomination_of_or_from_location",
    "nomination_to_position_title",
    "nomination_to_court_name",
    "nominees_0_organization",
    "nominees_0_state",
    "nomination_term_years",  # sounds numeric but has only a few options, including lifetime
    "party_of_appointing_president",
    "race_or_ethnicity",
    "received_in_senate_political_era",
    "school",
    "seat_level_cong_recategorized",
    "seat_id_letters_only",
    "senate_vote_type",
]

In [None]:
from nomination_predictor.modeling.train import validate_feature_lists

# Ask the project helper whether the three feature lists are free of duplicates.
# NOTE(review): judging by the printed output, the check covers uniqueness across all
# three lists — confirm against validate_feature_lists.
are_features_unique = validate_feature_lists(NUMERIC_FEATURES, BOOLEAN_FEATURES, CATEGORICAL_FEATURES)

# Stop the notebook here rather than train on an ambiguous feature configuration.
if not are_features_unique:
    raise ValueError("Feature lists contain duplicates. Please fix before continuing.")

✅ All features are unique across feature type lists
ℹ️ Total unique features: 43


In [None]:
# Warn (without stopping) about any configured feature that the loaded DataFrame lacks.
# One loop replaces three copy-pasted checks, and each missing-column list is now built
# once instead of twice; the emitted warning text is unchanged.
feature_lists_by_name = {
    "NUMERIC_FEATURES": NUMERIC_FEATURES,
    "BOOLEAN_FEATURES": BOOLEAN_FEATURES,
    "CATEGORICAL_FEATURES": CATEGORICAL_FEATURES,
}
for list_name, feature_list in feature_lists_by_name.items():
    missing_columns = [col for col in feature_list if col not in df.columns]
    if missing_columns:
        logger.warning(
            "The following columns in {} are absent from the df: {}".format(
                list_name, missing_columns
            )
        )

In [None]:
# Dtype-based column inventories; the target is kept out of the numeric one.
cat_cols = list(df.select_dtypes("object").columns)
num_cols = [c for c in df.select_dtypes("number").columns if c != TARGET]

# Keep only the rows whose target is known, then select the configured features.
df_model = df.dropna(subset=[TARGET]).copy()
feature_columns = BOOLEAN_FEATURES + CATEGORICAL_FEATURES + NUMERIC_FEATURES
X = df_model[feature_columns]
y = df_model[TARGET]

# splitting training and testing data

In [None]:
from sklearn.model_selection import train_test_split

# Build one stratification label per row by joining seat level and political era.
seat_level = df_model["seat_level_cong_recategorized"].astype(str)
political_era = df_model["received_in_senate_political_era"].astype(str)
strata = seat_level + "__" + political_era

# Fold labels that occur fewer than `min_count` times into a single "Other" bucket,
# because very small strata make the stratified split raise a ValueError.
min_count = 3          # tweak as needed
strata_sizes = strata.value_counts()
rare_mask = strata.map(strata_sizes) < min_count
strata = strata.mask(rare_mask, "Other")

# Train–test split: 80/20, seeded, stratified on the composite labels.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=strata,
    test_size=0.20,
    random_state=42,
)

# Model Selection, Training, and Evaluation

##  Preprocessing pipeline setup

In [None]:
# Column-wise preprocessing applied ahead of the model:
#   * numeric features are standardized,
#   * categorical features are one-hot encoded (categories unseen during fit are ignored),
#   * boolean features are passed through unchanged.
# ColumnTransformer drops every column that is not listed in a transformer by default, so
# without the explicit 'bool' entry the BOOLEAN_FEATURES selected into X never reach the model.
# The 'bool' entry comes last so the positions of the existing output columns are unchanged.
# NOTE(review): assumes the boolean columns hold True/False (or NaN) values that convert
# cleanly to numbers — confirm against the processed CSV.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), NUMERIC_FEATURES),
        ('cat', OneHotEncoder(handle_unknown='ignore'), CATEGORICAL_FEATURES),
        ('bool', 'passthrough', BOOLEAN_FEATURES),
    ])

In [None]:
class ProgressXGBRegressor(XGBRegressor):
    """XGBRegressor that logs when training starts and how long it took.

    Despite the name, no progress bar is drawn: ``fit`` only writes one log line
    before and one after delegating to ``XGBRegressor.fit``.
    """

    def fit(self, X, y, *args, **kwargs):
        """Fit the regressor, logging the start and the elapsed training time.

        Args:
            X: Training features, forwarded unchanged to ``XGBRegressor.fit``.
            y: Training targets, forwarded unchanged to ``XGBRegressor.fit``.
            *args: Extra positional arguments forwarded to ``XGBRegressor.fit``.
            **kwargs: Extra keyword arguments forwarded to ``XGBRegressor.fit``.

        Returns:
            Whatever ``XGBRegressor.fit`` returns.
        """
        logger.info(f"Starting XGBoost training with {self.n_estimators} trees...")
        # perf_counter() is monotonic, so the reported duration cannot go negative when
        # the system clock is adjusted mid-run (time.time() offers no such guarantee).
        start_time = time.perf_counter()

        result = super().fit(X, y, *args, **kwargs)

        elapsed = time.perf_counter() - start_time
        logger.info(f"XGBoost training completed in {elapsed:.2f} seconds")
        return result

In [None]:
# Baseline hyperparameters, kept as named constants so they can be reused elsewhere.
NUM_ESTIMATORS = 1000
LEARNING_RATE = 0.1
MAX_DEPTH = 6

# Baseline pipeline: the shared preprocessing step followed by the logging XGBoost regressor.
pipeline_speedy = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        (
            "model",
            ProgressXGBRegressor(
                max_depth=MAX_DEPTH,
                learning_rate=LEARNING_RATE,
                n_estimators=NUM_ESTIMATORS,
                verbosity=1,  # XGBoost message level: 1 prints warnings only
                # Add other XGBoost parameters as needed
            ),
        ),
    ]
)

In [None]:
# Split data into train and test sets.
# Stratify on the composite `strata` labels built in the earlier splitting cell: a plain
# random split here would silently replace that stratified split with an unstratified one.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=strata
)

## Training

In [None]:
from nomination_predictor.modeling.train import train_model

# Fit the baseline pipeline on the training split via the project helper.
# NOTE(review): presumably train_model returns the fitted pipeline — confirm in
# nomination_predictor.modeling.train.
pipeline = train_model(pipeline_speedy, X_train, y_train)

[32m2025-07-18 02:51:18.782[0m | [1mINFO    [0m | [36mnomination_predictor.modeling.train[0m:[36mtrain_model[0m:[36m84[0m - [1mTraining model on 1424 samples, 43 features[0m


Training Pipeline:   0%|          | 0/1 [00:00<?, ?it/s]

[32m2025-07-18 02:51:18.889[0m | [1mINFO    [0m | [36m__main__[0m:[36mfit[0m:[36m5[0m - [1mStarting XGBoost training with 1000 trees...[0m


Training Pipeline: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1/1 [01:00<00:00, 60.01s/it]

[32m2025-07-18 02:52:18.794[0m | [1mINFO    [0m | [36m__main__[0m:[36mfit[0m:[36m13[0m - [1mXGBoost training completed in 59.90 seconds[0m
[32m2025-07-18 02:52:18.802[0m | [1mINFO    [0m | [36mnomination_predictor.modeling.train[0m:[36mtrain_model[0m:[36m88[0m - [1mModel training completed[0m





## Prediction

In [None]:
from nomination_predictor.modeling.predict import predict_model

# Predict on the held-out test features with the trained baseline pipeline.
y_pred = predict_model(pipeline, X_test)

[32m2025-07-18 02:52:18.839[0m | [1mINFO    [0m | [36mnomination_predictor.modeling.predict[0m:[36mpredict_model[0m:[36m18[0m - [1mPredicting using model on 357 samples, 43 features[0m


Predicting: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1/1 [00:00<00:00, 16.41it/s]

[32m2025-07-18 02:52:18.904[0m | [1mINFO    [0m | [36mnomination_predictor.modeling.predict[0m:[36mpredict_model[0m:[36m22[0m - [1mPrediction completed[0m





## Evaluation

### Choice of metric: MAE

Mean Squared Error (MSE) and Root Mean Squared Error (RMSE) are both sensitive to outliers, and MSE also doesn't use the same units as our target variable, making it less intuitive.

After data cleaning, the dataset is small and outliers are relatively frequent, so I'm selecting Mean Absolute Error (MAE) as our metric: it is less sensitive to outliers than MSE/RMSE and is expressed in the same units (days) as the target.

In [None]:
from sklearn.metrics import mean_absolute_error, r2_score

from nomination_predictor.modeling.predict import (interpret_results,
                                                   summarize_model_complexity)

# Score the baseline predictions against the held-out test targets.
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
# The superscript in "R²" was mis-encoded in the log message; it is restored here.
logger.info(f"Model evaluation - MAE: {mae:.2f}, R²: {r2:.4f}")

# Project helpers; per the cell output they print a plain-language reading of the
# metrics (y_train supplies the mean/std context) and recommended next steps.
interpret_results(mae, r2, y_train)
summarize_model_complexity(pipeline)

[32m2025-07-18 02:52:18.927[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m7[0m - [1mModel evaluation - MAE: 42.53, R¬≤: 0.5109[0m

===== Mean Absolute Error (MAE): 42.53 =====
📊 GOOD: The model's predictions are typically within 60 days of the actual confirmation time.
🔍 TAKEAWAY: The model provides valuable insights but has moderate error margins.

===== R² Score: 0.5109 =====
📊 MODERATE: The model explains between 50-70% of the variance in confirmation times.
🔍 TAKEAWAY: The model captures significant patterns but misses some factors.

===== Interpretation in Context =====
• The average nomination takes 128 days to confirm
• With a standard deviation of 85 days
• Our model's error (MAE) is 43 days, which is 50% of the standard deviation
• This means our model outperforms a baseline model that always predicts the average

===== Recommended Next Steps =====
1. Tune hyperparameters to optimize model performance
2. Explore feature importance t

# Saving the trained model

In [None]:
from nomination_predictor.modeling.train import save_model_with_metadata

# Describe the baseline run, then persist the fitted pipeline together with that
# description, the training data and the test-set metrics.
baseline_metadata = {
    'description': 'XGBoost regression model for nomination confirmation time prediction',
    'parameters': {
        'n_estimators': NUM_ESTIMATORS,
        'learning_rate': LEARNING_RATE,
        'max_depth': MAX_DEPTH,
    },
}

model_file = save_model_with_metadata(
    pipeline,
    "xgboost_regression",
    metadata=baseline_metadata,
    X_train=X_train,
    y_train=y_train,
    mae=mae,
    r2=r2,
)

[32m2025-07-18 02:52:18.976[0m | [1mINFO    [0m | [36mnomination_predictor.modeling.train[0m:[36msave_model_with_metadata[0m:[36m120[0m - [1mModel saved to /home/wsl2ubuntuuser/nomination_predictor/models/xgboost_regression_2025-07-18_025218.pkl with metadata[0m


# Model tuning via randomized hyper-parameter search

In [None]:
import pathlib
import pickle
import time
from datetime import datetime

from scipy.stats import loguniform, randint
from sklearn.model_selection import RandomizedSearchCV
from xgboost import XGBRegressor

# --- reuse the `preprocessor`, X_train and y_train created in earlier notebook cells ---

RANDOMIZED_SEARCH_N_ITER = 120

base_xgb = XGBRegressor(
    objective="reg:squarederror",
    tree_method="hist",   # histogram-based tree construction
    device="cuda",        # run on GPU
    n_jobs=-1,
    random_state=42,
)

# A distinct name keeps the fitted baseline `pipeline` from the training cell intact;
# reusing `pipeline` here would replace it with this unfitted template.
search_pipeline = Pipeline([("preprocessor", preprocessor), ("model", base_xgb)])

# Sampling distributions for the randomized search (keys address the "model" step).
param_dist = {
    "model__n_estimators"     : randint(1500, 6000),
    "model__learning_rate"    : loguniform(0.01, 0.05),
    "model__max_depth"        : randint(4, 10),
    "model__min_child_weight" : randint(1, 10),
    "model__subsample"        : loguniform(0.7, 1.0),
    "model__colsample_bytree" : loguniform(0.7, 1.0),
    "model__gamma"            : loguniform(1e-3, 0.3),
    "model__reg_alpha"        : loguniform(1e-3, 1.0),
    "model__reg_lambda"       : loguniform(0.5, 5.0),
}

search = RandomizedSearchCV(
    search_pipeline,
    param_distributions=param_dist,
    n_iter=RANDOMIZED_SEARCH_N_ITER,  # author's note: was told n_iter=80 would take ≈5–6 h on 8-core CPU
    cv=3,
    scoring="neg_mean_absolute_error",
    verbose=2,
    n_jobs=1,                  # one candidate at a time; parallelism is left to XGBoost
    random_state=42,
)

# perf_counter() is monotonic, so the reported duration is immune to system clock adjustments.
start = time.perf_counter()
search.fit(X_train, y_train)   # no eval_set is passed through the pipeline
logger.info(f"Search finished in {(time.perf_counter()-start)/3600:.2f} hours")

print("Best MAE (CV):", -search.best_score_)
print("Best params  :", search.best_params_)

# Save the tuned pipeline under the project's configured models directory rather than a
# path relative to the current working directory.
# NOTE(review): assumes MODELS_DIR is a pathlib.Path, like PROCESSED_DATA_DIR — confirm in config.
best_model = search.best_estimator_
MODELS_DIR.mkdir(parents=True, exist_ok=True)
timestamp = datetime.now().strftime("%Y-%m-%d_%H%M%S")
best_model_path = MODELS_DIR / f"xgb_randomsearch_best_{timestamp}.pkl"
with open(best_model_path, "wb") as f:
    pickle.dump(best_model, f)

logger.info(f"Saved model → {best_model_path}")

Fitting 3 folds for each of 120 candidates, totalling 360 fits


Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.


  return func(**kwargs)


[CV] END model__colsample_bytree=0.8000461479455026, model__gamma=0.22648248189516848, model__learning_rate=0.032481928697702896, model__max_depth=8, model__min_child_weight=5, model__n_estimators=1966, model__reg_alpha=0.0019949166150633937, model__reg_lambda=1.4395239548966472, model__subsample=0.7884790488951874; total time=  -8.4s
[CV] END model__colsample_bytree=0.8000461479455026, model__gamma=0.22648248189516848, model__learning_rate=0.032481928697702896, model__max_depth=8, model__min_child_weight=5, model__n_estimators=1966, model__reg_alpha=0.0019949166150633937, model__reg_lambda=1.4395239548966472, model__subsample=0.7884790488951874; total time=  21.8s
[CV] END model__colsample_bytree=0.8000461479455026, model__gamma=0.22648248189516848, model__learning_rate=0.032481928697702896, model__max_depth=8, model__min_child_weight=5, model__n_estimators=1966, model__reg_alpha=0.0019949166150633937, model__reg_lambda=1.4395239548966472, model__subsample=0.7884790488951874; total ti

In [None]:
# Evaluate the tuned model on the held-out test set; predicting through the search
# object uses the best pipeline it found.
y_pred = search.predict(X_test)
r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)

tuned_pipeline = search.best_estimator_  # fitted pipeline with the winning parameters
interpret_results(mae, r2, y_train)
summarize_model_complexity(tuned_pipeline)

# Persist the tuned pipeline with its winning parameters and test-set metrics.
tuned_metadata = {
    "description": "XGBoost regression model for nomination confirmation time prediction, tuned with random search",
    "parameters": search.best_params_,
}
model_file = save_model_with_metadata(
    tuned_pipeline,
    "xgboost_regression_randomsearch",
    metadata=tuned_metadata,
    X_train=X_train,
    y_train=y_train,
    mae=mae,
    r2=r2,
)


===== Mean Absolute Error (MAE): 38.92 =====
📊 GOOD: The model's predictions are typically within 60 days of the actual confirmation time.
🔍 TAKEAWAY: The model provides valuable insights but has moderate error margins.

===== R² Score: 0.5795 =====
📊 MODERATE: The model explains between 50-70% of the variance in confirmation times.
🔍 TAKEAWAY: The model captures significant patterns but misses some factors.

===== Interpretation in Context =====
• The average nomination takes 128 days to confirm
• With a standard deviation of 85 days
• Our model's error (MAE) is 39 days, which is 46% of the standard deviation
• This means our model outperforms a baseline model that always predicts the average

===== Recommended Next Steps =====
1. Tune hyperparameters to optimize model performance
2. Explore feature importance to understand key drivers
3. Consider ensemble methods to improve predictions
[32m2025-07-18 07:24:21.470[0m | [1mINFO    [0m | [36mnomination_predict