# Model training and evaluation

In [1]:
import time
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from loguru import logger
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from tqdm.notebook import tqdm
from xgboost import XGBRegressor

from nomination_predictor.config import MODELS_DIR, PROCESSED_DATA_DIR

sns.set_theme(style="whitegrid")

[32m2025-07-16 14:34:18.238[0m | [1mINFO    [0m | [36mnomination_predictor.config[0m:[36m<module>[0m:[36m103[0m - [1mProject root: /home/wsl2ubuntuuser/nomination_predictor[0m
[32m2025-07-16 14:34:18.240[0m | [1mINFO    [0m | [36mnomination_predictor.config[0m:[36m<module>[0m:[36m127[0m - [1mConfiguration loaded[0m


In [None]:
df = pd.read_csv(PROCESSED_DATA_DIR/"processed.csv")

# Choose features

## target variable

In [None]:
TARGET = "days_nom_to_conf"

# pick target and drop Y label targets from features
y = df[TARGET]
X = df.drop(columns=[TARGET, "days_nom_to_latest_action"])  # other target saved for later tries at modeling

## numeric features

In [None]:
numeric_features = [
    "actions_count",
    "age_at_nom_days",
    "birth_year", 
    "committees_count",
    "congress_num", 
    "days_into_pres_term",
    "days_nom_to_deceased",
    "days_to_next_midterm_election",
    "days_to_next_pres_election",
    "death_year", 
    "degree_year", 
    "education_sequence", 
    "fed_service_sequence", 
    "highest_degree_level",
    "professional_career_sequence",
    "record_vote_number",   
    "service_as_chief_judge,_begin", 
    "service_as_chief_judge,_end",
]

# boolean features

In [None]:
boolean_features = [
    "pres_term_is_latter_term", 
    "statute_authorized_new_seat_bool",
]

# categorical features

In [None]:
categorical_features  = [
    "aba_rating", 
    "appointing_president",
    "congress_session",
    "court_type",
    "seat_level", 
    "birth_state",
    "latestaction_is_div_opp_house",
    "latestaction_is_div_opp_senate",
    "latestaction_is_fully_div",
    "latestaction_is_unified",
    "nominees_0_organization",
    "nominees_0_state",
    "party_of_appointing_president",
    "race_or_ethnicity",
    "school",
    "seat_id_letters_only",
    "senate_vote_type",
    "vacancy_reason",
]

In [None]:
from nomination_predictor.modeling.train import validate_feature_lists

are_features_unique = validate_feature_lists(numeric_features, boolean_features, categorical_features)

if not are_features_unique:
    raise ValueError("Feature lists contain duplicates. Please fix before continuing.")

✅ All features are unique across feature type lists
ℹ️ Total unique features: 38


In [None]:
cat_cols = df.select_dtypes("object").columns.tolist()
num_cols = [
    c for c in df.select_dtypes("number").columns
    if c not in {TARGET}
]

df_model = df[df[TARGET].notna()].copy()
X = df_model[boolean_features + categorical_features + numeric_features]
y = df_model[TARGET]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=42, stratify=df_model["seat_level"]
)

# Model Selection, Training, and Evaluation

##  Preprocessing pipeline setup

In [None]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ])

In [None]:
class ProgressXGBRegressor(XGBRegressor):
    """Wrapper to be able to get a progress bar"""
    def fit(self, X, y, *args, **kwargs):
        # Print start message
        logger.info(f"Starting XGBoost training with {self.n_estimators} trees...")
        start_time = time.time()
        
        # Fit the model
        result = super().fit(X, y, *args, **kwargs)
        
        # Print completion message with timing
        elapsed = time.time() - start_time
        logger.info(f"XGBoost training completed in {elapsed:.2f} seconds")
        return result

In [None]:
NUM_ESTIMATORS = 300
LEARNING_RATE = 0.1
MAX_DEPTH = 6

In [None]:

# Create the pipeline with preprocessing and model
pipeline_speedy = Pipeline([
    ('preprocessor', preprocessor),
    ('model', ProgressXGBRegressor(
        n_estimators=NUM_ESTIMATORS,
        learning_rate=LEARNING_RATE,
        max_depth=MAX_DEPTH,
        verbosity=1,  # This provides some built-in progress logging
        # Add other XGBoost parameters as needed
    ))
])

In [None]:
# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

## Training

In [None]:
from nomination_predictor.modeling.train import train_model

pipeline = train_model(pipeline_speedy, X_train, y_train)

[32m2025-07-16 14:34:19.465[0m | [1mINFO    [0m | [36mnomination_predictor.modeling.train[0m:[36mtrain_model[0m:[36m84[0m - [1mTraining model on 1129 samples, 38 features[0m


Training Pipeline:   0%|          | 0/1 [00:00<?, ?it/s]

[32m2025-07-16 14:34:19.509[0m | [1mINFO    [0m | [36m__main__[0m:[36mfit[0m:[36m5[0m - [1mStarting XGBoost training with 300 trees...[0m


Training Pipeline: 100%|██████████| 1/1 [00:14<00:00, 14.46s/it]

[32m2025-07-16 14:34:33.924[0m | [1mINFO    [0m | [36m__main__[0m:[36mfit[0m:[36m13[0m - [1mXGBoost training completed in 14.41 seconds[0m
[32m2025-07-16 14:34:33.932[0m | [1mINFO    [0m | [36mnomination_predictor.modeling.train[0m:[36mtrain_model[0m:[36m88[0m - [1mModel training completed[0m





## Prediction

In [None]:
from nomination_predictor.modeling.predict import predict_model

y_pred = predict_model(pipeline, X_test)

[32m2025-07-16 14:34:34.006[0m | [1mINFO    [0m | [36mnomination_predictor.modeling.predict[0m:[36mpredict_model[0m:[36m18[0m - [1mPredicting using model on 283 samples, 38 features[0m


Predicting: 100%|██████████| 1/1 [00:00<00:00, 32.60it/s]

[32m2025-07-16 14:34:34.042[0m | [1mINFO    [0m | [36mnomination_predictor.modeling.predict[0m:[36mpredict_model[0m:[36m22[0m - [1mPrediction completed[0m





## Evaluation

In [None]:
from sklearn.metrics import mean_absolute_error, r2_score

from nomination_predictor.modeling.predict import interpret_results

mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
logger.info(f"Model evaluation - MAE: {mae:.2f}, R²: {r2:.4f}")

interpret_results(mae, r2, y_train)

[32m2025-07-16 14:34:34.208[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m7[0m - [1mModel evaluation - MAE: 40.12, R²: 0.4474[0m

===== Mean Absolute Error (MAE): 40.12 =====
📊 GOOD: The model's predictions are typically within 60 days of the actual confirmation time.
🔍 TAKEAWAY: The model provides valuable insights but has moderate error margins.

===== R² Score: 0.4474 =====
📊 FAIR: The model explains between 30-50% of the variance in confirmation times.
🔍 TAKEAWAY: The model identifies some patterns but misses many important factors.

===== Interpretation in Context =====
• The average nomination takes 111 days to confirm
• With a standard deviation of 85 days
• Our model's error (MAE) is 40 days, which is 47% of the standard deviation
• This means our model outperforms a baseline model that always predicts the average

===== Recommended Next Steps =====
1. Tune hyperparameters to optimize model performance
2. Explore feature importance to understand key driv

# Saving the trained model

In [None]:
from nomination_predictor.modeling.train import save_model_with_metadata

model_file = save_model_with_metadata(
    pipeline, 
    "xgboost_regression",
    metadata={
        'description': 'XGBoost regression model for nomination confirmation time prediction',
        'parameters': {
            'n_estimators': NUM_ESTIMATORS,
            'learning_rate': LEARNING_RATE,
            'max_depth': MAX_DEPTH
        }
    },
    X_train=X_train,
    y_train=y_train,
    mae=mae,
    r2=r2
)

[32m2025-07-16 14:34:34.354[0m | [1mINFO    [0m | [36mnomination_predictor.modeling.train[0m:[36msave_model_with_metadata[0m:[36m120[0m - [1mModel saved to /home/wsl2ubuntuuser/nomination_predictor/models/xgboost_regression_2025-07-16_143434.pkl with metadata[0m


# Again, but slower

In [None]:
NUM_ESTIMATORS_LONGER = NUM_ESTIMATORS *2

pipeline_longer = Pipeline([
    ('preprocessor', preprocessor),
    ('model', ProgressXGBRegressor(
        n_estimators=NUM_ESTIMATORS_LONGER,
        learning_rate=LEARNING_RATE,
        max_depth=MAX_DEPTH,
        verbosity=1,  # This provides some built-in progress logging
        # Add other XGBoost parameters as needed
    ))
])

In [None]:
pipeline = train_model(pipeline_longer, X_train, y_train)
y_pred = predict_model(pipeline, X_test)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
logger.info(f"Model evaluation - MAE: {mae:.2f}, R²: {r2:.4f}")

interpret_results(mae, r2, y_train)

model_file = save_model_with_metadata(
    pipeline, 
    "xgboost_regression",
    metadata={
        'description': 'XGBoost regression model for nomination confirmation time prediction',
        'parameters': {
            'n_estimators': NUM_ESTIMATORS_LONGER,
            'learning_rate': LEARNING_RATE,
            'max_depth': MAX_DEPTH
        }
    },
    X_train=X_train,
    y_train=y_train,
    mae=mae,
    r2=r2
)

[32m2025-07-16 14:34:34.447[0m | [1mINFO    [0m | [36mnomination_predictor.modeling.train[0m:[36mtrain_model[0m:[36m84[0m - [1mTraining model on 1129 samples, 38 features[0m


Training Pipeline:   0%|          | 0/1 [00:00<?, ?it/s]

[32m2025-07-16 14:34:34.494[0m | [1mINFO    [0m | [36m__main__[0m:[36mfit[0m:[36m5[0m - [1mStarting XGBoost training with 600 trees...[0m


Training Pipeline: 100%|██████████| 1/1 [00:22<00:00, 22.57s/it]


[32m2025-07-16 14:34:57.016[0m | [1mINFO    [0m | [36m__main__[0m:[36mfit[0m:[36m13[0m - [1mXGBoost training completed in 22.51 seconds[0m
[32m2025-07-16 14:34:57.023[0m | [1mINFO    [0m | [36mnomination_predictor.modeling.train[0m:[36mtrain_model[0m:[36m88[0m - [1mModel training completed[0m
[32m2025-07-16 14:34:57.025[0m | [1mINFO    [0m | [36mnomination_predictor.modeling.predict[0m:[36mpredict_model[0m:[36m18[0m - [1mPredicting using model on 283 samples, 38 features[0m


Predicting: 100%|██████████| 1/1 [00:00<00:00, 38.72it/s]

[32m2025-07-16 14:34:57.056[0m | [1mINFO    [0m | [36mnomination_predictor.modeling.predict[0m:[36mpredict_model[0m:[36m22[0m - [1mPrediction completed[0m
[32m2025-07-16 14:34:57.059[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m5[0m - [1mModel evaluation - MAE: 40.56, R²: 0.4474[0m

===== Mean Absolute Error (MAE): 40.56 =====
📊 GOOD: The model's predictions are typically within 60 days of the actual confirmation time.
🔍 TAKEAWAY: The model provides valuable insights but has moderate error margins.

===== R² Score: 0.4474 =====
📊 FAIR: The model explains between 30-50% of the variance in confirmation times.
🔍 TAKEAWAY: The model identifies some patterns but misses many important factors.

===== Interpretation in Context =====
• The average nomination takes 111 days to confirm
• With a standard deviation of 85 days
• Our model's error (MAE) is 41 days, which is 48% of the standard deviation
• This means our model outperforms a baseline model that alw




# Again, with more patience

In [None]:
NUM_ESTIMATORS_LONGEST = NUM_ESTIMATORS * 4
pipeline_longest = Pipeline([
    ('preprocessor', preprocessor),
    ('model', ProgressXGBRegressor(
        n_estimators=NUM_ESTIMATORS_LONGEST,
        learning_rate=LEARNING_RATE,
        max_depth=MAX_DEPTH,
        verbosity=1,  # This provides some built-in progress logging
        # Add other XGBoost parameters as needed
    ))
])

In [None]:
pipeline = train_model(pipeline_longer, X_train, y_train)
y_pred = predict_model(pipeline, X_test)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
logger.info(f"Model evaluation - MAE: {mae:.2f}, R²: {r2:.4f}")

interpret_results(mae, r2, y_train)

model_file = save_model_with_metadata(
    pipeline, 
    "xgboost_regression",
    metadata={
        'description': 'XGBoost regression model for nomination confirmation time prediction',
        'parameters': {
            'n_estimators': NUM_ESTIMATORS_LONGER,
            'learning_rate': LEARNING_RATE,
            'max_depth': MAX_DEPTH
        }
    },
    X_train=X_train,
    y_train=y_train,
    mae=mae,
    r2=r2
)

[32m2025-07-16 14:34:57.134[0m | [1mINFO    [0m | [36mnomination_predictor.modeling.train[0m:[36mtrain_model[0m:[36m84[0m - [1mTraining model on 1129 samples, 38 features[0m


Training Pipeline:   0%|          | 0/1 [00:00<?, ?it/s]

[32m2025-07-16 14:34:57.176[0m | [1mINFO    [0m | [36m__main__[0m:[36mfit[0m:[36m5[0m - [1mStarting XGBoost training with 600 trees...[0m


Training Pipeline: 100%|██████████| 1/1 [00:24<00:00, 24.60s/it]


[32m2025-07-16 14:35:21.729[0m | [1mINFO    [0m | [36m__main__[0m:[36mfit[0m:[36m13[0m - [1mXGBoost training completed in 24.55 seconds[0m
[32m2025-07-16 14:35:21.735[0m | [1mINFO    [0m | [36mnomination_predictor.modeling.train[0m:[36mtrain_model[0m:[36m88[0m - [1mModel training completed[0m
[32m2025-07-16 14:35:21.736[0m | [1mINFO    [0m | [36mnomination_predictor.modeling.predict[0m:[36mpredict_model[0m:[36m18[0m - [1mPredicting using model on 283 samples, 38 features[0m


Predicting: 100%|██████████| 1/1 [00:00<00:00, 25.97it/s]

[32m2025-07-16 14:35:21.784[0m | [1mINFO    [0m | [36mnomination_predictor.modeling.predict[0m:[36mpredict_model[0m:[36m22[0m - [1mPrediction completed[0m
[32m2025-07-16 14:35:21.788[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m5[0m - [1mModel evaluation - MAE: 40.56, R²: 0.4474[0m

===== Mean Absolute Error (MAE): 40.56 =====
📊 GOOD: The model's predictions are typically within 60 days of the actual confirmation time.
🔍 TAKEAWAY: The model provides valuable insights but has moderate error margins.

===== R² Score: 0.4474 =====
📊 FAIR: The model explains between 30-50% of the variance in confirmation times.
🔍 TAKEAWAY: The model identifies some patterns but misses many important factors.

===== Interpretation in Context =====
• The average nomination takes 111 days to confirm
• With a standard deviation of 85 days
• Our model's error (MAE) is 41 days, which is 48% of the standard deviation
• This means our model outperforms a baseline model that alw


