# Model training and evaluation

In [None]:
import time
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from loguru import logger
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from tqdm.notebook import tqdm
from xgboost import XGBRegressor

from nomination_predictor.config import MODELS_DIR, PROCESSED_DATA_DIR

sns.set_theme(style="whitegrid")

In [None]:
df = pd.read_csv(PROCESSED_DATA_DIR/"processed.csv")

# Choose features

## target variable

In [None]:
TARGET = "days_nom_to_conf"

# pick target and drop Y label targets from features
y = df[TARGET]
X = df.drop(columns=[TARGET, "days_nom_to_latest_action"])  # other target saved for later tries at modeling

## numeric features

In [None]:
numeric_features = [
    "actions_count",
    "age_at_nom_days",
    "birth_year", 
    "committees_count",
    "congress_num", 
    "days_into_pres_term",
    "days_nom_to_deceased",
    "days_to_next_midterm_election",
    "days_to_next_pres_election",
    "death_year", 
    "degree_year", 
    "education_sequence", 
    "fed_service_sequence", 
    "highest_degree_level",
    "professional_career_sequence",
    "record_vote_number",   
    "service_as_chief_judge,_begin", 
    "service_as_chief_judge,_end",
]

# boolean features

In [None]:
boolean_features = [
    "pres_term_is_latter_term", 
    "statute_authorized_new_seat_bool",
]

# categorical features

In [None]:
categorical_features  = [
    "aba_rating", 
    "appointing_president",
    "congress_session",
    "court_type",
    "seat_level", 
    "birth_state",
    "latestaction_is_div_opp_house",
    "latestaction_is_div_opp_senate",
    "latestaction_is_fully_div",
    "latestaction_is_unified",
    "nominees_0_organization",
    "nominees_0_state",
    "party_of_appointing_president",
    "race_or_ethnicity",
    "school",
    "seat_id_letters_only",
    "senate_vote_type",
    "vacancy_reason",
]

In [None]:
def validate_feature_lists(numeric_features, boolean_features, categorical_features):
    """
    Validates that there are no duplicate features across the different feature type lists.
    
    Args:
        numeric_features (list): List of numeric feature names
        boolean_features (list): List of boolean feature names
        categorical_features (list): List of categorical feature names
        
    Returns:
        bool: True if there are no duplicates, False otherwise
    """
    import itertools
    from collections import Counter

    # Combine all features
    all_features = list(itertools.chain(numeric_features, boolean_features, categorical_features))
    
    # Count occurrences of each feature
    feature_counts = Counter(all_features)
    
    # Find duplicates
    duplicates = [feature for feature, count in feature_counts.items() if count > 1]
    
    if duplicates:
        print("⚠️ DUPLICATE FEATURES DETECTED:")
        for dup in duplicates:
            print(f"  - '{dup}' appears in multiple feature lists:")
            if dup in numeric_features:
                print("    • numeric_features")
            if dup in boolean_features:
                print("    • boolean_features")
            if dup in categorical_features:
                print("    • categorical_features")
        return False
    else:
        print("✅ All features are unique across feature type lists")
        
        # Additionally, check total coverage
        all_features_set = set(all_features)
        if not all_features_set:
            print("⚠️ WARNING: No features specified in any list")
        else:
            print(f"ℹ️ Total unique features: {len(all_features_set)}")
            
        return True

# Usage:
are_features_valid = validate_feature_lists(numeric_features, boolean_features, categorical_features)

if not are_features_valid:
    raise ValueError("Feature lists contain duplicates. Please fix before continuing.")

✅ All features are unique across feature type lists
ℹ️ Total unique features: 38


In [None]:
cat_cols = df.select_dtypes("object").columns.tolist()
num_cols = [
    c for c in df.select_dtypes("number").columns
    if c not in {TARGET}
]

df_model = df[df[TARGET].notna()].copy()
X = df_model[boolean_features + categorical_features + numeric_features]
y = df_model[TARGET]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=42, stratify=df_model["seat_level"]
)

# Model Selection, Training, and Evaluation

##  Preprocessing pipeline setup

In [None]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ])

In [None]:
class ProgressXGBRegressor(XGBRegressor):
    """Wrapper to be able to get a progress bar"""
    def fit(self, X, y, *args, **kwargs):
        # Print start message
        logger.info(f"Starting XGBoost training with {self.n_estimators} trees...")
        start_time = time.time()
        
        # Fit the model
        result = super().fit(X, y, *args, **kwargs)
        
        # Print completion message with timing
        elapsed = time.time() - start_time
        logger.info(f"XGBoost training completed in {elapsed:.2f} seconds")
        return result

In [None]:
# Create the pipeline with preprocessing and model
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model', ProgressXGBRegressor(
        n_estimators=300,
        learning_rate=0.1,
        max_depth=6,
        verbosity=1,  # This provides some built-in progress logging
        # Add other XGBoost parameters as needed
    ))
])

In [None]:
# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

## Training

In [None]:
# Train the model with a simple progress indicator
logger.info(f"Training model on {X_train.shape[0]} samples, {X_train.shape[1]} features")
with tqdm(total=1, desc="Training Pipeline") as pbar:
    pipeline.fit(X_train, y_train)
    pbar.update(1)
logger.info("Model training completed")

[32m2025-07-16 13:57:22.831[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m2[0m - [1mTraining model on 1129 samples, 38 features[0m


Training Pipeline:   0%|          | 0/1 [00:00<?, ?it/s]

[32m2025-07-16 13:57:22.898[0m | [1mINFO    [0m | [36m__main__[0m:[36mfit[0m:[36m5[0m - [1mStarting XGBoost training with 300 trees...[0m
[32m2025-07-16 13:57:33.209[0m | [1mINFO    [0m | [36m__main__[0m:[36mfit[0m:[36m13[0m - [1mXGBoost training completed in 10.31 seconds[0m
[32m2025-07-16 13:57:33.212[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m6[0m - [1mModel training completed[0m


## Prediction & Evaluation

In [None]:
y_pred = pipeline.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
logger.info(f"Model evaluation - MAE: {mae:.2f}, R²: {r2:.4f}")

[32m2025-07-16 13:57:33.272[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m4[0m - [1mModel evaluation - MAE: 40.12, R²: 0.4474[0m


In [None]:
# Calculate metrics
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Basic log of results
logger.info(f"Model evaluation - MAE: {mae:.2f}, R²: {r2:.4f}")

# Detailed interpretation of MAE
print(f"\n===== Mean Absolute Error (MAE): {mae:.2f} =====")
if mae < 30:
    print("📊 EXCELLENT: The model's predictions are typically within 30 days of the actual confirmation time.")
    print("🔍 TAKEAWAY: The model has high practical utility for predicting confirmation timelines.")
elif mae < 60:
    print("📊 GOOD: The model's predictions are typically within 60 days of the actual confirmation time.")
    print("🔍 TAKEAWAY: The model provides valuable insights but has moderate error margins.")
elif mae < 90:
    print("📊 FAIR: The model's predictions are typically within 90 days of the actual confirmation time.")
    print("🔍 TAKEAWAY: The model offers directional guidance but with substantial uncertainty.")
else:
    print("📊 NEEDS IMPROVEMENT: The model's predictions have large error margins exceeding 90 days.")
    print("🔍 TAKEAWAY: Consider feature engineering, hyperparameter tuning, or alternative algorithms.")

# Detailed interpretation of R²
print(f"\n===== R² Score: {r2:.4f} =====")
if r2 > 0.7:
    print("📊 STRONG: The model explains more than 70% of the variance in confirmation times.")
    print("🔍 TAKEAWAY: The model captures most of the systematic patterns in the data.")
elif r2 > 0.5:
    print("📊 MODERATE: The model explains between 50-70% of the variance in confirmation times.")
    print("🔍 TAKEAWAY: The model captures significant patterns but misses some factors.")
elif r2 > 0.3:
    print("📊 FAIR: The model explains between 30-50% of the variance in confirmation times.")
    print("🔍 TAKEAWAY: The model identifies some patterns but misses many important factors.")
else:
    print("📊 WEAK: The model explains less than 30% of the variance in confirmation times.")
    print("🔍 TAKEAWAY: The model has limited predictive power, consider revisiting features or methodology.")

# Context relative to problem domain
print("\n===== Interpretation in Context =====")
print(f"• The average nomination takes {y_train.mean():.0f} days to confirm")
print(f"• With a standard deviation of {y_train.std():.0f} days")
print(f"• Our model's error (MAE) is {mae:.0f} days, which is {(mae/y_train.std()*100):.0f}% of the standard deviation")
print(f"• This means our model {('outperforms' if r2 > 0 else 'underperforms')} a baseline model that always predicts the average")

# Actionable next steps
print("\n===== Recommended Next Steps =====")
if r2 < 0.3 or mae > 90:
    print("1. Consider feature engineering to identify more predictive variables")
    print("2. Try different algorithms (Random Forest, Neural Networks)")
    print("3. Collect additional data or domain-specific features")
elif r2 < 0.6:
    print("1. Tune hyperparameters to optimize model performance")
    print("2. Explore feature importance to understand key drivers")
    print("3. Consider ensemble methods to improve predictions")
else:
    print("1. Focus on model interpretability to understand key drivers")
    print("2. Validate on additional test data to ensure generalizability")
    print("3. Consider deploying the model for practical use")

[32m2025-07-16 14:00:41.513[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m6[0m - [1mModel evaluation - MAE: 40.12, R²: 0.4474[0m

===== Mean Absolute Error (MAE): 40.12 =====
📊 GOOD: The model's predictions are typically within 60 days of the actual confirmation time.
🔍 TAKEAWAY: The model provides valuable insights but has moderate error margins.

===== R² Score: 0.4474 =====
📊 FAIR: The model explains between 30-50% of the variance in confirmation times.
🔍 TAKEAWAY: The model identifies some patterns but misses many important factors.

===== Interpretation in Context =====
• The average nomination takes 111 days to confirm
• With a standard deviation of 85 days
• Our model's error (MAE) is 40 days, which is 47% of the standard deviation
• This means our model outperforms a baseline model that always predicts the average

===== Recommended Next Steps =====
1. Tune hyperparameters to optimize model performance
2. Explore feature importance to understand key driv

# Saving the trained model

In [58]:
def save_model_with_metadata(model, file_prefix, metadata=None):
    """Save model with timestamp and metadata"""
    import os
    import pickle
    from datetime import datetime

    # Create models directory if it doesn't exist
    os.makedirs("models", exist_ok=True)
    
    # Create a sanitized timestamp
    timestamp = datetime.now().strftime("%Y-%m-%d_%H%M%S")
    
    # Generate filename
    filename = MODELS_DIR/f"{file_prefix}_{timestamp}.pkl"
    
    # Default metadata if none provided
    if metadata is None:
        metadata = {}
    
    # Add standard metadata
    metadata.update({
        'timestamp': timestamp,
        'features': X_train.columns.tolist(),
        'n_features': X_train.shape[1],
        'metrics': {
            'mae': float(mae),
            'r2': float(r2)
        }
    })
    
    # Save model and metadata
    with open(filename, 'wb') as f:
        pickle.dump({'model': pipeline, 'metadata': metadata}, f)
    
    logger.info(f"Model saved to {filename} with metadata")
    return filename

In [None]:
model_file = save_model_with_metadata(
    pipeline, 
    "xgboost_regression",
    metadata={
        'description': 'XGBoost regression model for nomination confirmation time prediction',
        'parameters': {
            'n_estimators': 300,
            'learning_rate': 0.1,
            'max_depth': 6
        }
    }
)

[32m2025-07-16 14:06:19.455[0m | [1mINFO    [0m | [36m__main__[0m:[36msave_model_with_metadata[0m:[36m35[0m - [1mModel saved to /home/wsl2ubuntuuser/nomination_predictor/models/xgboost_regression_2025-07-16_140619.pkl with metadata[0m
