# Group 3: College ROI (Simple Baseline)

Goal: test which school characteristics are linked to median earnings 10 years after entry.


In [None]:
import pandas as pd
import statsmodels.formula.api as smf


In [None]:
# Load the raw institution-level CSV that lives in this repo.
from pathlib import Path

# The relative path depends on the directory the notebook was opened from,
# so try both layouts instead of editing the path by hand.
candidate_paths = [
    "../data/raw/Most-Recent-Cohorts-Institution.csv",  # opened one level below the repo root
    "data/raw/Most-Recent-Cohorts-Institution.csv",  # opened from the repo root
]

# If neither candidate exists, keep the first one so read_csv still raises
# FileNotFoundError naming the originally expected location.
file_path = next(
    (path for path in candidate_paths if Path(path).is_file()),
    candidate_paths[0],
)

# low_memory=False parses the file in one pass rather than in chunks.
df = pd.read_csv(file_path, low_memory=False)
print(df.shape)


In [None]:
# Identifier and categorical columns: kept exactly as loaded.
id_cols = ["INSTNM", "STABBR", "CONTROL", "REGION", "LOCALE"]

# Columns that have to be numeric for feature building and modelling.
num_cols = [
    "MD_EARN_WNE_P10", "MEDIAN_HH_INC", "PCTPELL",
    "C150_4", "C150_L4",
    "PCIP11", "PCIP14", "PCIP15", "PCIP26", "PCIP27", "PCIP40", "PCIP41",
]

# Every variable the analysis needs: identifiers first, then the numerics.
cols = id_cols + num_cols

# Restrict to those variables and work on an independent copy.
df = df.loc[:, cols].copy()

# Coerce each numeric column; values that cannot be parsed become NaN.
df = df.assign(
    **{name: pd.to_numeric(df[name], errors="coerce") for name in num_cols}
)


In [None]:
# Build features
# Program-share columns that are added together to form the STEM share.
stem_cols = ["PCIP11", "PCIP14", "PCIP15", "PCIP26", "PCIP27", "PCIP40", "PCIP41"]

# sum() skips NaN, so a partially missing row still treats the missing shares
# as zero. min_count=1 keeps stem_share as NaN when *every* share is missing,
# so those schools are dropped below instead of being recorded as 0% STEM.
df["stem_share"] = df[stem_cols].sum(axis=1, min_count=1)

# Use C150_4 where available and fall back to C150_L4 otherwise.
df["completion_rate"] = df["C150_4"].fillna(df["C150_L4"])

# Keep rows we can model: every variable used by the regressions must be present.
model_df = df.dropna(subset=[
    "MD_EARN_WNE_P10", "MEDIAN_HH_INC", "PCTPELL",
    "stem_share", "completion_rate", "CONTROL", "REGION", "LOCALE"
]).copy()

print(model_df.shape)
# Summary statistics for the outcome and the continuous predictors.
model_df[["MD_EARN_WNE_P10", "stem_share", "MEDIAN_HH_INC", "PCTPELL", "completion_rate"]].describe()


## Baseline Model
Regress median earnings (MD_EARN_WNE_P10) on the income controls (MEDIAN_HH_INC, PCTPELL) plus location (REGION, LOCALE).


In [None]:
# Baseline specification: income controls plus REGION and LOCALE, both
# wrapped in C() so they enter the model as categorical terms.
baseline_formula = "MD_EARN_WNE_P10 ~ MEDIAN_HH_INC + PCTPELL + C(REGION) + C(LOCALE)"

# Set up the OLS model on the modelling sample, then estimate it.
baseline_model = smf.ols(formula=baseline_formula, data=model_df)
baseline = baseline_model.fit()

print(baseline.summary())


## Full Model
Adds STEM share, school type (CONTROL), and completion rate to the baseline specification.


In [None]:
# Full specification: the baseline terms plus STEM share, completion rate
# and school type (CONTROL, entered as a categorical term via C()).
full_formula = (
    "MD_EARN_WNE_P10 ~ MEDIAN_HH_INC + PCTPELL + stem_share + completion_rate"
    " + C(CONTROL) + C(REGION) + C(LOCALE)"
)

# Set up the OLS model on the modelling sample, then estimate it.
full_model = smf.ols(formula=full_formula, data=model_df)
full = full_model.fit()

print(full.summary())


In [None]:
# Quick side-by-side checks: compare overall fit of the two models, then
# print the full-model coefficients on the continuous predictors.
print("Baseline R^2:", round(baseline.rsquared, 4))
print("Full R^2:", round(full.rsquared, 4))

# The leading "\n" prints a blank line before the coefficient table; it must
# stay an escape sequence, because a raw line break inside the quotes is a
# syntax error.
print("\nKey full-model coefficients:")
print(full.params[["MEDIAN_HH_INC", "PCTPELL", "stem_share", "completion_rate"]])
