# Model training and evaluation

In [None]:
import sys
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import xgboost as xgb
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from xgboost import XGBRegressor

from nomination_predictor.config import MODELS_DIR, PROCESSED_DATA_DIR

# Use seaborn's white-grid look for every figure in this notebook.
sns.set_theme(style="whitegrid")

# Filesystem locations: the processed CSV read below and the directory
# that trained models are written to.
DATA_PATH = Path(PROCESSED_DATA_DIR / "processed.csv")
MODEL_PATH = Path(MODELS_DIR)

In [None]:
# Read the processed dataset into one DataFrame; every later cell derives from it.
df = pd.read_csv(filepath_or_buffer=DATA_PATH)

# Choose features

## target variable

In [None]:
# Name of the label column this notebook models.
TARGET = "days_nom_to_conf"

# Split labels from features. Both label columns are removed from X: the
# modeled target and the alternate target kept for later modeling attempts.
y = df.loc[:, TARGET]
X = df.drop([TARGET, "days_nom_to_latest_action"], axis=1)

## numeric features

In [None]:
# Hand-picked numeric feature columns, one per line so additions and
# removals show up as single-line diffs.
# NOTE(review): "service_as_chief_judge,_end" contains a comma while its
# "_begin" sibling does not -- confirm both match the CSV header exactly.
num_cols = [
    "age_at_nom_days",
    "days_into_pres_term",
    "days_to_next_pres_election",
    "days_to_next_midterm_election",
    "congress_num",
    "service_as_chief_judge_begin",
    "service_as_chief_judge,_end",
    "actions_count",
    "birth_year",
    "degree_year",
    "death_year",
    "record_vote_number",
    "committees_count",
    "fed_service_sequence",
    "professional_career_sequence",
    "education_sequence",
    "highest_degree_level",
]

## boolean features

In [None]:
# Hand-picked flag-style feature columns, one per line.
bool_cols = [
    "pres_term_is_latter_term",
    "congress_session",
    "statute_authorized_new_seat_bool",
]

## categorical features

In [None]:
# Hand-picked categorical feature columns.
# Every entry needs its own trailing comma: Python silently concatenates
# adjacent string literals, which would merge two column names into one
# name that does not exist in the data.
# NOTE(review): "congress_session" and "pres_term_is_latter_term" are also
# listed in bool_cols -- confirm which list each one belongs in.
cat_cols = [
    "aba_rating",
    "appointing_president",
    "congress_session",
    "court_type",
    "seat_level",
    "nominee_birth_state",
    "latestaction_is_div_opp_house",
    "latestaction_is_div_opp_senate",
    "latestaction_is_fully_div",
    "latestaction_is_unified",
    "nominees_0_organization",
    "nominees_0_state",
    "pres_term_is_latter_term",
    "race_or_ethnicity",
    "school",
    "seat_id_letters_only",
    "senate_vote_type",
    "party_of_appointing_president",
    "vacancy_reason",
]

In [None]:
# Label columns that must never be used as features.
# "days_nom_to_latest_action" is the alternate target; leaving it among the
# numeric features would leak label information into the model.
LABEL_COLS = {TARGET, "days_nom_to_latest_action"}

# NOTE: this replaces the hand-picked cat_cols / num_cols lists with a
# dtype-based selection over the whole frame.
cat_cols = [c for c in df.select_dtypes("object").columns if c not in LABEL_COLS]
num_cols = [c for c in df.select_dtypes("number").columns if c not in LABEL_COLS]

# A column named in bool_cols can also be picked up by the dtype-based lists;
# de-duplicate (keeping first occurrence order) so X has no repeated columns.
feature_cols = list(dict.fromkeys(bool_cols + cat_cols + num_cols))

# Only rows with a known target can be used for supervised training.
df_model = df[df[TARGET].notna()].copy()
X = df_model[feature_cols]
y = df_model[TARGET]

# 80/20 hold-out split with a fixed seed, stratified on seat_level so the
# class proportions of that column match between train and test.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=42, stratify=df_model["seat_level"]
)

#  Preprocessing & model

# training using XGBoost's API

Training through XGBoost's native API, with a callback driving the progress bar, gives a less error-prone progress bar than my earlier attempt at using tqdm on its own.

In [None]:
# Import tqdm for progress bars
from tqdm.notebook import tqdm


# Create a custom callback that works with the current XGBoost API
class TqdmProgressCallback(xgb.callback.TrainingCallback):
    """XGBoost training callback that drives a tqdm progress bar.

    The bar advances by one after each boosting iteration. Every tenth
    iteration the latest value of each recorded evaluation metric is written
    above the bar. The bar is closed when training finishes.

    Parameters
    ----------
    n_estimators : int
        Total number of boosting rounds; used as the progress bar's total.
    """

    def __init__(self, n_estimators):
        super().__init__()
        self.pbar = tqdm(total=n_estimators, desc="XGBoost Training")
        # Number of iterations this callback has seen so far.
        self.curr_iteration = 0

    def after_iteration(self, model, epoch, evals_log):
        """Advance the bar; return False so training continues."""
        self.curr_iteration += 1
        self.pbar.update(1)

        # Report the most recent value of every metric every 10 iterations.
        if epoch % 10 == 0 and evals_log:
            message = f"[{epoch}] "
            for eval_name, metric_dict in evals_log.items():
                for metric_name, metric_values in metric_dict.items():
                    message += f"{eval_name}-{metric_name}:{metric_values[-1]:.4f} "
            # tqdm.write prints without corrupting the bar.
            tqdm.write(message)

        # Returning True would request early stopping.
        return False

    def after_training(self, model):
        """Close the bar as soon as training ends and hand the model back."""
        self.pbar.close()
        return model

    def __del__(self):
        # Best-effort cleanup if training never reached after_training.
        # pbar may be missing when __init__ failed part-way through.
        pbar = getattr(self, "pbar", None)
        if pbar is None:
            return
        try:
            pbar.close()
        except Exception:
            # Never raise from a finalizer.
            pass

# Progress-bar callback sized to the number of boosting rounds.
# NOTE(review): `params`, `dtrain` and `n_estimators` are not defined in any
# cell shown above -- confirm they are created earlier so this cell survives
# Restart Kernel -> Run All.
tqdm_callback = TqdmProgressCallback(n_estimators)

# Fit the booster through XGBoost's native training entry point, reporting
# progress via the callback.
model = xgb.train(
    params=params,
    dtrain=dtrain,
    num_boost_round=n_estimators,
    callbacks=[tqdm_callback],
)

XGBoost Training:   0%|          | 0/300 [00:00<?, ?it/s]

# Prediction with trained model

Because I chose XGBoost's native API (for the sake of the progress bar) rather than scikit-learn's wrapper for XGBoost, prediction cannot take the DataFrames as-is: the features must first be wrapped in a `DMatrix`.

In [None]:
# Apply the same preprocessing that was used during training.
# NOTE(review): `prep_step` is not defined in any cell shown above -- confirm
# it is the fitted preprocessing object created before training.
X_test_processed = prep_step.transform(X_test)

# The native XGBoost API predicts from a DMatrix, not from a DataFrame.
dtest = xgb.DMatrix(X_test_processed)

# Predict on the hold-out set and report the mean absolute error.
pred = model.predict(dtest)

# Depending on the scikit-learn version the metric comes back as a NumPy
# scalar or a plain Python float; only the former has a .round() method, so
# convert to float and use the built-in round().
mae_score = float(mean_absolute_error(y_test, pred))
print("MAE on hold‑out:", round(mae_score, 2))

ValueError: DataFrame.dtypes for data must be int, float, bool or category. When categorical type is supplied, the experimental DMatrix parameter`enable_categorical` must be set to `True`.  Invalid columns:latestaction_actiondate: object, nominees_0_organization: object, receiveddate: object, nominees_0_firstname: object, nominees_0_lastname: object, nominees_0_middlename: object, nominees_0_state: object, nominees_0_suffix: object, vacancy_reason: object, full_name_from_description: object, location_of_origin_from_description: object, last_name: object, first_name: object, middle_name: object, suffix: object, birth_city: object, birth_state: object, death_city: object, death_state: object, gender: object, race_or_ethnicity: object, fjc_biography_url: object, degree: object, school: object, professional_career: object, judge_name: object, court_type: object, court_name: object, appointment_title: object, appointing_president: object, party_of_appointing_president: object, aba_rating: object, seat_id: object, statute_authorizing_new_seat: object, nomination_date: object, committee_referral_date: object, hearing_date: object, judiciary_committee_action: object, committee_action_date: object, senate_vote_type: object, confirmation_date: object, commission_date: object, senior_status_date: object, termination: object, termination_date: object, latest_action_taken: object, was_unanimous_decision: object, seat_id_letters_only: object, nominee_parsed_positiontitle: object, birth_date_approx_dt: object, death_date_exact: object, seat_level: object

# Saving the trained model

In [None]:
import json
import pickle
from datetime import datetime
from pathlib import Path


def save_model(model, model_name="nomination_predictor", metrics=None):
    """Pickle a trained model and write a JSON metadata file next to it.

    A new directory named ``<model_name>_<YYYY-MM-DD_HHMMSS>`` is created
    under ``MODELS_DIR``. It receives ``model.pkl`` (the pickled model) and
    ``metadata.json`` (timestamp, model type, parameters, metrics, Python
    version).

    Parameters
    ----------
    model : object
        Any picklable model. If it exposes ``get_params()``, the returned
        mapping is stored as the model's parameters; otherwise ``{}``.
    model_name : str
        Prefix of the version directory name.
    metrics : dict or None
        Evaluation metrics to record; ``None`` is stored as ``{}``.

    Returns
    -------
    pathlib.Path
        The version directory that was written.
    """

    def _to_jsonable(value):
        # Fallback for values json cannot encode natively: unwrap scalar
        # objects exposing .item() (e.g. NumPy scalars), else store the
        # string form rather than failing after the model is already on disk.
        item = getattr(value, "item", None)
        if callable(item):
            try:
                return item()
            except (TypeError, ValueError):
                pass
        return str(value)

    # Create formatted timestamp (YYYY-MM-DD_HHMMSS)
    timestamp = datetime.now().strftime("%Y-%m-%d_%H%M%S")

    # Create directory for this model version
    version_dir = Path(MODELS_DIR) / f"{model_name}_{timestamp}"
    version_dir.mkdir(exist_ok=True, parents=True)

    # Save the model
    model_path = version_dir / "model.pkl"
    with open(model_path, "wb") as f:
        pickle.dump(model, f)

    # Save metadata (parameters, performance metrics)
    metadata = {
        "timestamp": timestamp,
        "model_type": type(model).__name__,
        "parameters": model.get_params() if hasattr(model, "get_params") else {},
        "metrics": metrics or {},
        "python_version": sys.version,
        # Add other relevant info: feature names, etc.
    }

    with open(version_dir / "metadata.json", "w", encoding="utf-8") as f:
        json.dump(metadata, f, indent=2, default=_to_jsonable)

    print(f"Model saved to {version_dir}")
    return version_dir

# Usage: compute hold-out metrics from the predictions made above, then
# persist the trained booster together with those metrics.
y_true_arr = np.asarray(y_test, dtype=float)
residuals = y_true_arr - np.asarray(pred, dtype=float)

rmse_score = float(np.sqrt(np.mean(residuals ** 2)))
mae_score = float(np.mean(np.abs(residuals)))

# R^2 = 1 - SS_res / SS_tot; undefined when the hold-out target is constant.
ss_res = float(np.sum(residuals ** 2))
ss_tot = float(np.sum((y_true_arr - y_true_arr.mean()) ** 2))
r2_value = 1.0 - ss_res / ss_tot if ss_tot > 0 else float("nan")

metrics = {
    "rmse": rmse_score,
    "r2": r2_value,
    "mae": mae_score,
}

# Save the trained model object (`model`), not the `xgb` module, which
# cannot be pickled.
save_model(model, metrics=metrics)