In [None]:
import pandas as pd
import xgboost as xgb
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder

# ----------------------------------------------
# Load the feature-enriched table from the interim data directory.
FEATURES_CSV = "data/interim/feature_enriched.csv"
df = pd.read_csv(FEATURES_CSV)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Box plot: vacancy-to-nomination days grouped by presidential term index.
fig_term, ax_term = plt.subplots()
sns.boxplot(data=df, x="pres_term_idx", y="days_vac_to_nom", ax=ax_term)
ax_term.set_title("Vacancy → Nomination days by presidential term")
plt.show()

# Scatter plot: vacancy-to-nomination days against days into the presidential
# term; the low alpha keeps overlapping points readable.
fig_lag, ax_lag = plt.subplots()
sns.scatterplot(
    data=df,
    x="days_into_pres_term",
    y="days_vac_to_nom",
    alpha=0.3,
    ax=ax_lag,
)
ax_lag.set_title("Lag vs. days into presidential term")
plt.show()

In [None]:

# Pick the target and drop rows lacking it.
# The rows are filtered into a new frame (df itself is left untouched, so this
# cell stays idempotent) because a missing target cannot be used as a
# regression label for training or for the test-set error.
target_col = "days_vac_to_nom"
df_labelled = df.dropna(subset=[target_col])

y = df_labelled[target_col]
X = df_labelled.drop(columns=[target_col,
                              "days_nom_to_conf",     # other targets saved for later tasks
                              "days_vac_to_conf"])

# --------  column groups ------------
# Numeric features: a fixed list plus every "years_*" column found in df.
# "years_private_practice" is named explicitly AND matches the "years_" prefix,
# so the combined list is de-duplicated (first occurrence wins, order kept);
# otherwise the same column would be passed to the ColumnTransformer twice.
explicit_num_cols = [
    "age_at_nom", "days_into_pres_term",
    "days_to_next_pres_elec", "days_to_next_mid_elec",
    "congress_num", "years_private_practice",
    "highest_degree_level"
]
years_cols = [c for c in df.columns if c.startswith("years_")]
num_cols = list(dict.fromkeys(explicit_num_cols + years_cols))

# Flag-style features: every "has_*" column plus "partisan_mismatch".
bool_cols = [c for c in df.columns if c.startswith("has_")] + ["partisan_mismatch"]

# Categorical features that get one-hot encoded downstream.
cat_cols = [
    "pres_term_idx", "congress_session",
    "seat_level", "vacancy_reason_cat",
    "pres_party", "party_of_appointing_president"
]

# --------  preprocessors -------------
# Numeric and flag columns share one median imputer; categorical columns are
# one-hot encoded with handle_unknown="ignore" so categories not seen during
# fit do not raise at transform time. Any column not listed is dropped.
numeric_step = ("num", SimpleImputer(strategy="median"), num_cols + bool_cols)
categorical_step = ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols)

preprocessor = ColumnTransformer(
    [numeric_step, categorical_step],
    remainder="drop",
)

# --------  XGBoost regressor ----------
# Hyperparameters are gathered in one mapping so they can be read (and tuned)
# at a glance; random_state is fixed for repeatable fits.
xgb_params = {
    "n_estimators": 600,
    "learning_rate": 0.05,
    "max_depth": 6,
    "subsample": 0.8,
    "colsample_bytree": 0.8,
    "objective": "reg:squarederror",
    "random_state": 42,
}
xgb_reg = xgb.XGBRegressor(**xgb_params)

# Chain preprocessing and the regressor into a single estimator so that
# fit/predict run both stages in order.
pipeline_steps = [("prep", preprocessor), ("xgb", xgb_reg)]
model = Pipeline(pipeline_steps)

# --------  train / validate -----------
from sklearn.model_selection import cross_val_score, train_test_split

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit the whole pipeline (preprocessing + regressor) on the training split only.
model.fit(X_train, y_train)

# Mean absolute error on the held-out split.
test_pred = model.predict(X_test)
test_mae = (y_test - test_pred).abs().mean()
print("Test MAE:", test_mae)

In [None]:
# Bar chart of how many rows fall into each value of "seat_level".
print("\nVisualizing seat level distribution:")
import matplotlib.pyplot as plt

seat_level_counts = df["seat_level"].value_counts()
seat_ax = seat_level_counts.plot.bar(
    figsize=(10, 6),
    title="Distribution of Judicial Seat Levels",
)

# Slant and right-align the category labels so long names do not overlap.
plt.setp(seat_ax.get_xticklabels(), rotation=45, ha="right")
seat_ax.set_ylabel("Number of Nominations")
seat_ax.figure.tight_layout()
plt.show()