# College ROI Baseline (Group 3)

Goal: estimate which school characteristics best predict median earnings 10 years after entry.

This notebook uses:
- outcome: `MD_EARN_WNE_P10` (modeled in logs as `ln_earn`)
- controls: `MEDIAN_HH_INC`, `PCTPELL`, `REGION`, `LOCALE`
- key predictors: `stem_share`, `CONTROL`, `completion_rate`


In [None]:
from pathlib import Path

import numpy as np
import pandas as pd
import statsmodels.formula.api as smf


In [None]:
# Resolve project root whether notebook is run from repo root or notebooks/
project_root = Path.cwd()
if not (project_root / "README.md").exists() and (project_root.parent / "README.md").exists():
    project_root = project_root.parent

# Look for the raw extract inside the repo first, then in the current user's
# Downloads folder. The fallback is built from Path.home() instead of a
# hard-coded /Users/<name>/ path so the notebook runs on any machine.
raw_filename = "Most-Recent-Cohorts-Institution.csv"
raw_candidates = [
    project_root / "data/raw" / raw_filename,
    Path.home() / "Downloads/College_Scorecard_Raw_Data_10032025" / raw_filename,
]

# First existing candidate wins; if none exists, default to the in-repo
# location so any later error points at where the file is expected to live.
raw_file = next((path for path in raw_candidates if path.exists()), raw_candidates[0])

print(f"Using file: {raw_file}")
if not raw_file.exists():
    # Report every location searched so a missing file is easy to diagnose.
    print("WARNING: raw data file not found. Looked in:")
    for path in raw_candidates:
        print(f"  - {path}")


In [None]:
# Columns to read from the raw extract, grouped by how they are used.
id_cols = ["UNITID", "INSTNM", "STABBR"]
category_cols = ["CONTROL", "REGION", "LOCALE"]            # categorical model terms
outcome_and_control_cols = ["MD_EARN_WNE_P10", "MEDIAN_HH_INC", "PCTPELL"]
completion_cols = ["C150_4", "C150_L4"]                    # combined into completion_rate
pcip_cols = ["PCIP11", "PCIP14", "PCIP15", "PCIP26", "PCIP27", "PCIP40", "PCIP41"]

cols = [
    *id_cols,
    *category_cols,
    *outcome_and_control_cols,
    *completion_cols,
    *pcip_cols,
]

# Read only the listed columns.
df = pd.read_csv(raw_file, usecols=cols, low_memory=False)

# Quick sanity check: dimensions, then the first few rows.
print(df.shape)
df.head(3)


In [None]:
# Columns that must be numeric before any arithmetic. Values that cannot be
# parsed as numbers are coerced to NaN.
num_cols = [
    "MD_EARN_WNE_P10", "MEDIAN_HH_INC", "PCTPELL", "C150_4", "C150_L4",
    "PCIP11", "PCIP14", "PCIP15", "PCIP26", "PCIP27", "PCIP40", "PCIP41",
]

for col in num_cols:
    df[col] = pd.to_numeric(df[col], errors="coerce")

# stem_share = sum of the selected PCIP columns, clipped to [0, 1].
# min_count=1 leaves the share missing when *every* PCIP column is missing,
# so institutions with no program-mix data are dropped by the dropna below
# instead of being modeled as "0% STEM". Rows with only some PCIP values
# missing still treat the missing ones as 0 (sum skips NaN).
stem_cols = ["PCIP11", "PCIP14", "PCIP15", "PCIP26", "PCIP27", "PCIP40", "PCIP41"]
df["stem_share"] = df[stem_cols].sum(axis=1, min_count=1).clip(0, 1)

# Prefer C150_4; fall back to C150_L4 where C150_4 is missing.
df["completion_rate"] = df["C150_4"].fillna(df["C150_L4"])

# Non-positive earnings cannot be logged; treat them as missing.
df.loc[df["MD_EARN_WNE_P10"] <= 0, "MD_EARN_WNE_P10"] = np.nan

# Controls used in the model
_df = df.copy()
# NOTE: despite the `_k` suffix the divisor is 10,000, so one unit of
# med_hh_inc_k is 10,000 of MEDIAN_HH_INC's native units (presumably dollars).
# The name is kept because the model formulas reference it.
_df["med_hh_inc_k"] = _df["MEDIAN_HH_INC"] / 10000
_df["pctpell"] = _df["PCTPELL"]
_df["ln_earn"] = np.log(_df["MD_EARN_WNE_P10"])

# Keep only rows with every model variable present, so the baseline and full
# models are estimated on the same sample.
model_df = _df.dropna(subset=[
    "ln_earn", "med_hh_inc_k", "pctpell", "stem_share",
    "completion_rate", "REGION", "LOCALE", "CONTROL"
]).copy()

print(model_df.shape)
model_df[["MD_EARN_WNE_P10", "stem_share", "MEDIAN_HH_INC", "PCTPELL", "completion_rate"]].describe()


## Baseline Model

Regress `ln(earnings)` on the family-income controls (`med_hh_inc_k`, `pctpell`) plus region and locale indicators, with HC3 robust standard errors.


In [None]:
# Baseline specification: log earnings on the two income controls plus
# categorical REGION and LOCALE terms.
baseline_formula = "ln_earn ~ med_hh_inc_k + pctpell + C(REGION) + C(LOCALE)"
baseline_spec = smf.ols(formula=baseline_formula, data=model_df)

# Fit with the HC3 heteroskedasticity-robust covariance estimator.
baseline = baseline_spec.fit(cov_type="HC3")

print(baseline.summary())


## Full Model

Adds STEM share, school type (`CONTROL`), and completion rate.


In [None]:
# Full specification: the baseline terms plus stem_share, completion_rate
# and a categorical CONTROL term.
full_formula = "ln_earn ~ med_hh_inc_k + pctpell + stem_share + completion_rate + C(CONTROL) + C(REGION) + C(LOCALE)"
full_spec = smf.ols(formula=full_formula, data=model_df)

# Same HC3 robust covariance as the baseline so the two fits are comparable.
full = full_spec.fit(cov_type="HC3")

print(full.summary())


In [None]:
# Fit statistics for the two specifications, one row per model.
compare = pd.DataFrame({
    "model": ["baseline", "full"],
    "n_obs": [int(baseline.nobs), int(full.nobs)],
    "r_squared": [baseline.rsquared, full.rsquared],
    "adj_r_squared": [baseline.rsquared_adj, full.rsquared_adj],
})

# Coefficients and p-values for the continuous terms of interest.
# reindex() yields NaN for terms a model does not contain, so stem_share and
# completion_rate show as NaN in the baseline columns.
key_terms = ["med_hh_inc_k", "pctpell", "stem_share", "completion_rate"]
coef = pd.DataFrame({
    "baseline_coef": baseline.params.reindex(key_terms),
    "baseline_p": baseline.pvalues.reindex(key_terms),
    "full_coef": full.params.reindex(key_terms),
    "full_p": full.pvalues.reindex(key_terms),
})

print("Model fit")
display(compare)
# The leading "\n" puts a blank line between the two tables. It must stay an
# escape sequence: a raw line break inside the quotes is a syntax error.
print("\nKey coefficients")
display(coef)


In [None]:
# Persist the modeling sample so downstream work can reuse it.
out_file = project_root / "data/processed/model_data_latest.csv"

# to_csv does not create missing directories; make sure data/processed/
# exists so the save works on a fresh checkout.
out_file.parent.mkdir(parents=True, exist_ok=True)

model_df.to_csv(out_file, index=False)
print(f"Saved: {out_file}")
