In [2]:
# load packages
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split




In [29]:
# Data cleaning and imputation for the three main predictors: credit score,
# delinquency rate, and income. Each table gets a string FIPS key so it can be
# merged with the education data on FIPS code.
def _load_county_metric(csv_path, source_col, target_col, drop_cols):
    """Read one county-level metric file and prepare it for merging on FIPS.

    Parameters
    ----------
    csv_path : str
        Path of the CSV file to read.
    source_col : str
        Name of the metric column as stored in the file.
    target_col : str
        Name the metric column is renamed to.
    drop_cols : list of str
        Columns removed once the FIPS key has been derived.

    Returns
    -------
    pandas.DataFrame
        The file's remaining columns plus ``FIPS``; missing values of
        ``target_col`` are replaced by that column's mean.
    """
    table = pd.read_csv(csv_path)
    # The FIPS key is everything after the first three characters of `cty`.
    table['FIPS'] = table['cty'].str[3:]
    table = table.drop(columns=drop_cols).rename(columns={source_col: target_col})
    # NOTE(review): the fill value is the mean over all counties in the file,
    # i.e. it is computed before any train/test split.
    table[target_col] = table[target_col].fillna(table[target_col].mean())
    return table


# Only `cty` is dropped for the credit-score table; the other two also drop `Name`.
creditScore = _load_county_metric(
    'Pre_cleaned_data/cty_cred_score_rP_gP_pall.csv',
    source_col='Average_credit_score_rP_gP_pall',
    target_col='avg_credit_score',
    drop_cols=['cty'],
)

Delinq = _load_county_metric(
    'Pre_cleaned_data/cty_deliq_rate_rP_gP_p25.csv',
    source_col='Debt_Delinquency_rP_gP_p25',
    target_col='avg_delinquency_rate',
    drop_cols=['cty', 'Name'],
)

Income = _load_county_metric(
    'Pre_cleaned_data/cty_kir_staycz_rP_gP_p25.csv',
    source_col='Individual_Income_Stayed_in_Commuting_Zone_rP_gP_p25',
    target_col='avg_income',
    drop_cols=['cty', 'Name'],
)

# Education funding data: FIPS is converted to text and left-padded with zeros to
# five characters so it matches the keys derived above.
Edu_Data = pd.read_csv('Pre_cleaned_data/EduFund_1996_2003c.csv')
Edu_Data['FIPS'] = Edu_Data['FIPS'].astype(str).str.zfill(5)


# Left-join every table onto the credit-score frame by FIPS, then keep only
# rows without missing values and discard columns that are not analysed.
left_joined_data = creditScore.merge(Delinq, how='left', on='FIPS')

columns_not_needed = [
    'SchoolYear',
    'SupportServicesTotal_adj',
    'LocalCharter_adj',
    'StateTransport_adj',
]

final_data = (
    left_joined_data
    .merge(Income, how='left', on='FIPS')
    .merge(Edu_Data, how='left', on='FIPS')
    .dropna()
    .drop(columns=columns_not_needed)
)


# FIPS <-> county/state name lookup table.
# Take an explicit copy: adding columns to a bare column-subset of final_data
# raises pandas' SettingWithCopyWarning.
crosswalk = final_data[['Name', 'FIPS']].copy()

# Split "<county>, <state>" once and reuse the pieces for both name columns.
name_parts = crosswalk['Name'].str.split(',')

crosswalk = crosswalk.assign(
    FIPS_STATE=crosswalk['FIPS'].str[:2],    # first two characters of the FIPS key
    FIPS_COUNTY=crosswalk['FIPS'].str[2:],   # remaining characters of the FIPS key
    COUNTY_NAME=name_parts.str[0],
    STATE_NAME=name_parts.str[1].str.strip(),
).drop(columns=['Name'])


# Collect every column whose name contains 'PCT' ...
cols_to_drop = list(filter(lambda column_name: 'PCT' in column_name, final_data.columns))

# ... and remove those columns from the analysis frame.
final_data = final_data.drop(columns=cols_to_drop)



In [None]:
# Build the feature matrix and target for the predictive model and hold out a
# test set (no model is fitted in this cell).
outcome_columns = ["avg_credit_score", "avg_delinquency_rate"]

# Keep numeric predictors only. final_data still carries the string identifier
# columns `Name` and `FIPS`; they are row labels, not features, and non-numeric
# columns cannot be used by the numeric routines applied to X later on.
X = final_data.drop(columns=outcome_columns).select_dtypes(include="number")
y = final_data["avg_delinquency_rate"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Compact table of the outcome, the six funding variables and FIPS, extended
# with the county/state columns from the crosswalk.
selected_columns = [
    "avg_delinquency_rate",
    "FederalNutrition_adj",
    "FederalComp_adj",
    "FederalOther_adj",
    "StateFormula_adj",
    "StateOther_adj",
    "PupilSupport_adj",
    "FIPS",
]
data = final_data[selected_columns].copy()
data = data.merge(crosswalk, on="FIPS", how="left", suffixes=("", "_crosswalk"))

In [15]:
import statsmodels.formula.api as smf
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.stats.diagnostic import het_breuschpagan

# --- Multicollinearity check (VIF) ---
# Build the design matrix explicitly from the regressors of the model fitted
# below (plus a constant) rather than reusing whichever `X` happens to be in the
# kernel: the result then no longer depends on cell execution order.
vif_features = [
    "FederalComp_adj",
    "FederalNutrition_adj",
    "FederalOther_adj",
    "StateFormula_adj",
    "StateOther_adj",
    "PupilSupport_adj",
]
vif_design = sm.add_constant(final_data[vif_features].astype(float))

vif_df = pd.DataFrame({
    "feature": vif_design.columns,
    "VIF": [
        variance_inflation_factor(vif_design.values, i)
        for i in range(vif_design.shape[1])
    ],
})
print(vif_df)

# --- Main-effects OLS with HC3 heteroscedasticity-robust standard errors ---
ols_model = smf.ols(formula="avg_delinquency_rate ~ FederalNutrition_adj + FederalComp_adj + FederalOther_adj + StateFormula_adj + StateOther_adj + PupilSupport_adj", data=final_data).fit(cov_type='HC3')
print(ols_model.summary())

# --- Heteroscedasticity check: Breusch-Pagan test on the OLS residuals ---
# het_breuschpagan returns a tuple; elements 0 and 1 are the LM statistic and
# its p-value.
bp_test = het_breuschpagan(ols_model.resid, ols_model.model.exog)
print("Breusch-Pagan test statistic:", bp_test[0])
print("Breusch-Pagan test p-value:", bp_test[1])

# --- Credit-score model with funding x income interactions ---
# In the formula syntax `a * b` expands to a + b + a:b.
formula = "avg_credit_score ~ FederalComp_adj * avg_income + FederalNutrition_adj * avg_income + StateFormula_adj * avg_income"

model_interact = smf.ols(formula=formula, data=final_data).fit(cov_type='HC3')
print(model_interact.summary())




                feature        VIF
0                 const  14.349025
1       FederalComp_adj   1.376581
2  FederalNutrition_adj   1.373790
3      FederalOther_adj   1.121925
4      StateFormula_adj   1.053638
5        StateOther_adj   1.130257
6      PupilSupport_adj   1.058981
                             OLS Regression Results                             
Dep. Variable:     avg_delinquency_rate   R-squared:                       0.332
Model:                              OLS   Adj. R-squared:                  0.331
Method:                   Least Squares   F-statistic:                     250.5
Date:                  Sun, 22 Feb 2026   Prob (F-statistic):          9.03e-262
Time:                          04:00:50   Log-Likelihood:                 2703.3
No. Observations:                  3113   AIC:                            -5393.
Df Residuals:                      3106   BIC:                            -5350.
Df Model:                             6                                 

In [33]:
# Configuration for the county-level ROI analysis.
OUTCOME_COL = "avg_delinquency_rate"   # dependent variable
ID_COL = "FIPS"                        # county identifier

# Spending categories whose effects are estimated.
PREDICTOR_COLS = [
    "FederalComp_adj",
    "FederalNutrition_adj",
    "FederalOther_adj",
    "StateFormula_adj",
    "StateOther_adj",
    "PupilSupport_adj"
]
MODERATOR = "avg_income"               # variable interacted with every predictor

ROBUST_SE_TYPE = "HC3"                 # covariance estimator for robust standard errors


# Fail early with a clear message if any column this analysis reads is absent.
# The moderator is included because it is selected together with the others below.
required_cols = [ID_COL, OUTCOME_COL, MODERATOR] + PREDICTOR_COLS
missing = [c for c in required_cols if c not in final_data.columns]
if missing:
    raise ValueError(f"Missing columns in df: {missing}")


n_counties = final_data[ID_COL].nunique()
print("Unique counties:", n_counties)


# Reshape to LONG format: one row per (county, spending category).
id_columns = [ID_COL, OUTCOME_COL, MODERATOR]
base = final_data[id_columns + PREDICTOR_COLS].copy()

county_long = pd.melt(
    base,
    id_vars=id_columns,
    value_vars=PREDICTOR_COLS,
    var_name="predictor",
    value_name="spend_dollars",
)

# Spending expressed per 1,000 (used for the ROI interpretation).
county_long = county_long.assign(
    spend_k=county_long["spend_dollars"].astype(float) / 1000.0
)

print("Long rows (should be approx unique_counties * 6 if 1 row/county):", len(county_long))


# Fit the multivariate OLS (main effects + income interactions) on COMPLETE CASES.
# The moderator is part of the design matrix, so it belongs in the
# complete-case filter together with the outcome and the predictors.
df_model = base.dropna(subset=[OUTCOME_COL, MODERATOR] + PREDICTOR_COLS).copy()

# Design matrix in per-1,000 units: spending categories ...
X = df_model[PREDICTOR_COLS].astype(float) / 1000.0

# ... the moderator on the same scale ...
X_mod = df_model[MODERATOR].astype(float) / 1000.0
X['avg_income_k'] = X_mod

# ... and one manual interaction term per spending category, so each
# category's slope is allowed to vary with income.
for col in PREDICTOR_COLS:
    X[f'{col}_x_income'] = X[col] * X_mod

y = df_model[OUTCOME_COL].astype(float)
X = sm.add_constant(X)

# `ols` keeps the default covariance; `ols_r` re-estimates the covariance with
# the robust estimator chosen in ROBUST_SE_TYPE.
ols = sm.OLS(y, X).fit()
ols_r = ols.get_robustcov_results(cov_type=ROBUST_SE_TYPE)

# Label the coefficient vector with the design-matrix column names so that
# coefficients can be looked up by name.
betas = pd.Series(ols_r.params, index=ols_r.model.exog_names)

# Per-category marginal effects ("ROI"), cost figures and contributions derived
# from the fitted interaction model.

# County income on the per-1,000 scale used in the design matrix. county_long
# already carries the moderator column from the melt, so no lookup by ID is needed.
county_long['avg_income_k'] = county_long[MODERATOR].astype(float) / 1000.0

# Baseline = model prediction at the column means of the FULL design matrix
# (constant, spending terms, income and interaction terms). It is computed once:
# a second formula using only the constant and the spending main effects would
# drop the income and interaction terms that this model contains.
# NOTE(review): the column is called "baseline_pred_credit" although OUTCOME_COL
# is the delinquency rate; the name is kept unchanged for downstream users.
X_mean = X.mean().to_frame().T
baseline = float(np.asarray(ols_r.predict(X_mean))[0])
county_long["baseline_pred_credit"] = baseline


def get_dynamic_roi(row):
    """Return the income-dependent slope for one row of ``county_long``.

    The slope is the predictor's main-effect coefficient plus its interaction
    coefficient multiplied by the row's moderator value divided by 1,000.
    Coefficients that are not present in ``betas`` count as 0.
    """
    p = row['predictor']
    inc_k = row[MODERATOR] / 1000.0
    # ROI = Base Beta + (Interaction Beta * County Income)
    return betas.get(p, 0) + (betas.get(f"{p}_x_income", 0) * inc_k)


# Change in the outcome per one unit of `spend_k`, evaluated at each county's income.
county_long["roi_points_per_1000"] = county_long.apply(get_dynamic_roi, axis=1)

# 1000 / |ROI|, reported only where the ROI is negative and finite;
# every other row is left as NaN.
county_long["dollars_for_1_pt_reduction"] = (
    1000.0 * (1.0 / county_long["roi_points_per_1000"].abs())
).where((county_long["roi_points_per_1000"] < 0) & np.isfinite(county_long["roi_points_per_1000"]))

# Contributions vs the "average county" (continuous):
# contribution = ROI * (x_i - mean(x)), with x in per-1,000 units and the mean
# taken over the rows used to fit the model.
means_k = (df_model[PREDICTOR_COLS].astype(float) / 1000.0).mean()
county_long["mean_spend_k_model"] = county_long["predictor"].map(means_k)

county_long["contribution_points_vs_baseline"] = (
    county_long["roi_points_per_1000"] * (county_long["spend_k"] - county_long["mean_spend_k_model"])
)


def calculate_dollars_dynamic(roi_series):
    """Return ``1000 * 100 / roi`` element-wise for usable ROI values.

    An entry is usable when it is finite and strictly positive; every other
    entry (zero, negative, infinite or NaN) yields NaN in the result.
    """
    usable = np.isfinite(roi_series) & (roi_series > 0)
    per_hundred = (100.0 / roi_series).where(usable)
    return 1000.0 * per_hundred

# NOTE: calculate_dollars_dynamic (defined above) is not applied to county_long in this cell.

# Attach the model's fitted value for each county to every long-format row.
df_model["_pred_credit"] = ols.predict(X)

pred_lookup = df_model[[ID_COL, "_pred_credit"]].rename(
    columns={"_pred_credit": "pred_credit"}
)
county_long = pd.merge(county_long, pred_lookup, how="left", on=ID_COL)


# Final sanity checks on the shape of the long table.
print("county_long shape:", county_long.shape)
print("Rows per county (should be 6 if 1 row/county):")
rows_per_county = county_long.groupby(ID_COL).size()
print(rows_per_county.value_counts().head())




Unique counties: 3113
Long rows (should be approx unique_counties * 6 if 1 row/county): 18678
county_long shape: (18678, 13)
Rows per county (should be 6 if 1 row/county):
6    3113
Name: count, dtype: int64


In [None]:
from auto_sklearn2 import AutoSklearnRegressor
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

# Run the AutoML regressor on the training split and score it on the hold-out split.
automl_settings = {"time_limit": 100, "random_state": 42}
automl = AutoSklearnRegressor(**automl_settings)

automl.fit(X_train, y_train)

# Predictions and R^2 on the test split.
y_pred = automl.predict(X_test)
r2 = r2_score(y_true=y_test, y_pred=y_pred)



In [None]:
# Hold-out evaluation metrics.
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
mae = mean_absolute_error(y_test, y_pred)
print(f"R^2 Score: {r2}")
print(f"RMSE: {rmse}")
print(f"MAE: {mae}")
print(f"\nBest model: {automl.best_params}")
for model_name, score in automl.get_models_performance().items():
    print(f"{model_name}: {score:.4f}")


# Inspect the best model found by the search.
best_model = automl.best_model
print(best_model)

# Use the last pipeline step when the best model exposes `steps`; otherwise
# treat the best model itself as the final estimator.
final_estimator = best_model.steps[-1][1] if hasattr(best_model, "steps") else best_model
print(type(final_estimator))

# Not every estimator provides `feature_importances_`, and preprocessing inside
# the model may change the number of features, so only build the table when the
# importances line up one-to-one with the training columns.
importances = getattr(final_estimator, "feature_importances_", None)

if importances is not None and len(importances) == len(X_train.columns):
    importance_df = pd.DataFrame({
        "Feature": X_train.columns,
        "Importance": importances
    }).sort_values(by="Importance", ascending=False)
else:
    importance_df = pd.DataFrame(columns=["Feature", "Importance"])
    print("Feature importances are not available for this estimator.")


# RMSE relative to the mean of the training target, and the training target's spread.
print(f"Percentage RMSE: {(rmse / y_train.mean()) * 100}")
print(f"Standard Deviation of y_train: {y_train.std()}")