# Group 3: College ROI (Simple Baseline)

Goal: test which school characteristics are linked to median earnings 10 years after entry.


In [1]:
import pandas as pd
from davis_stats import reg, scatter

# Show floats in displayed tables with thousands separators and two decimals.
pd.options.display.float_format = '{:,.2f}'.format
import matplotlib.pyplot as plt


In [2]:
# Load the raw institution-level CSV. low_memory=False asks pandas not to parse
# the file in chunks, which avoids mixed-dtype inference on wide files.
file_path = "../data/raw/Most-Recent-Cohorts-Institution.csv"
df = pd.read_csv(filepath_or_buffer=file_path, low_memory=False)

# (rows, columns) of the raw frame
print(df.shape)


(6429, 3306)


In [3]:
# Columns kept for the analysis: identifier / categorical-code columns first,
# followed by the measures that must be numeric.
id_cols = ["INSTNM", "STABBR", "CONTROL", "REGION", "LOCALE"]
num_cols = [
    "MD_EARN_WNE_P10", "MEDIAN_HH_INC", "PCTPELL", "C150_4", "C150_L4",
    "PCIP11", "PCIP14", "PCIP15", "PCIP26", "PCIP27", "PCIP40", "PCIP41",
]
cols = id_cols + num_cols

# Work on an independent copy restricted to the needed columns.
df = df[cols].copy()

# Coerce every measure to a numeric dtype in one pass; any value that cannot be
# parsed as a number becomes NaN (errors="coerce").
df[num_cols] = df[num_cols].apply(pd.to_numeric, errors="coerce")


In [4]:
# build features
stem_cols = ["PCIP11", "PCIP14", "PCIP15", "PCIP26", "PCIP27", "PCIP40", "PCIP41"]

# stem_share is the row-wise sum of the program-share columns above.
# min_count=1 leaves the sum as NaN when *every* one of those columns is
# missing for a school, so the dropna below can exclude it. (Filling NaN with 0
# before summing would silently turn "no program data" into "0% STEM" and make
# the stem_share entry in the dropna subset a no-op.) Rows with at least one
# non-missing share are summed exactly as before, with missing entries skipped.
df["stem_share"] = df[stem_cols].sum(axis=1, min_count=1)

# Use C150_4 where available, otherwise fall back to C150_L4.
df["completion_rate"] = df["C150_4"].fillna(df["C150_L4"])

# rows for modeling: keep only schools with every model variable present
model_vars = [
    "MD_EARN_WNE_P10", "MEDIAN_HH_INC", "PCTPELL",
    "stem_share", "completion_rate", "CONTROL", "REGION", "LOCALE",
]
model_df = df.dropna(subset=model_vars).copy()

print(model_df.shape)
model_df[["MD_EARN_WNE_P10", "stem_share", "MEDIAN_HH_INC", "PCTPELL", "completion_rate"]].describe()


(4212, 19)


Unnamed: 0,MD_EARN_WNE_P10,stem_share,MEDIAN_HH_INC,PCTPELL,completion_rate
count,4212.0,4212.0,4212.0,4212.0,4212.0
mean,43588.03,0.1,58026.25,0.42,0.53
std,16284.92,0.13,12885.85,0.2,0.21
min,9656.0,0.0,15790.53,0.0,0.0
25%,32568.0,0.0,49701.65,0.26,0.37
50%,41000.0,0.06,57419.93,0.38,0.53
75%,52164.75,0.15,66211.08,0.55,0.69
max,143372.0,1.0,100870.75,1.0,1.0


## Baseline Model
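Regress median earnings 10 years after entry (`MD_EARN_WNE_P10`) on the income controls (`MEDIAN_HH_INC`, `PCTPELL`) plus location dummies (`REGION`, `LOCALE`).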


In [5]:
# Baseline specification: the two income measures as regressors, with REGION
# and LOCALE passed through the `dummies` argument.
baseline_x = ['MEDIAN_HH_INC', 'PCTPELL']
baseline_dummies = ['REGION', 'LOCALE']

baseline = reg(model_df, 'MD_EARN_WNE_P10', baseline_x, dummies=baseline_dummies, silent=True)

# Full regression table for the baseline fit.
print(baseline.summary())


                            OLS Regression Results                            
Dep. Variable:        MD_EARN_WNE_P10   R-squared:                       0.426
Model:                            OLS   Adj. R-squared:                  0.423
Method:                 Least Squares   F-statistic:                     135.3
Date:                Tue, 17 Feb 2026   Prob (F-statistic):               0.00
Time:                        15:24:48   Log-Likelihood:                -45654.
No. Observations:                4212   AIC:                         9.136e+04
Df Residuals:                    4188   BIC:                         9.151e+04
Df Model:                          23                                         
Covariance Type:            nonrobust                                         
                    coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------
const          3.246e+04   1.77e+04      1.835

## Full Model
Adds STEM share (`stem_share`), school type (`CONTROL`, as dummies), and completion rate (`completion_rate`) to the baseline specification.


In [6]:
# Full specification: the baseline regressors plus stem_share and
# completion_rate, with CONTROL added to the variables passed as dummies.
full_x = ['MEDIAN_HH_INC', 'PCTPELL', 'stem_share', 'completion_rate']
full_dummies = ['CONTROL', 'REGION', 'LOCALE']

full = reg(model_df, 'MD_EARN_WNE_P10', full_x, dummies=full_dummies, silent=True)

# Full regression table for the extended fit.
print(full.summary())


                            OLS Regression Results                            
Dep. Variable:        MD_EARN_WNE_P10   R-squared:                       0.603
Model:                            OLS   Adj. R-squared:                  0.600
Method:                 Least Squares   F-statistic:                     234.9
Date:                Tue, 17 Feb 2026   Prob (F-statistic):               0.00
Time:                        15:24:48   Log-Likelihood:                -44881.
No. Observations:                4212   AIC:                         8.982e+04
Df Residuals:                    4184   BIC:                         9.000e+04
Df Model:                          27                                         
Covariance Type:            nonrobust                                         
                      coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------
const            2.327e+04   1.48e+04     

In [7]:
# compare models: R^2 of each fit, rounded to four decimals
for label, fitted in [('Baseline R^2:', baseline), ('Full R^2:', full)]:
    print(label, round(fitted.rsquared, 4))

print()
print('Key full-model coefficients:')
# Only the continuous regressors; dummy coefficients are omitted here.
key_vars = ['MEDIAN_HH_INC', 'PCTPELL', 'stem_share', 'completion_rate']
print(full.params[key_vars])


Baseline R^2: 0.4264
Full R^2: 0.6025

Key full-model coefficients:
MEDIAN_HH_INC          0.38
PCTPELL           -9,928.23
stem_share        34,484.53
completion_rate    9,046.31
dtype: float64


In [8]:
# coefficient chart
# Raw coefficients are "dollars of earnings per one unit of the predictor", and
# the predictors live on very different scales: MEDIAN_HH_INC is in dollars
# while the other three range from 0 to 1. Plotted raw, the MEDIAN_HH_INC bar
# is invisible and bar lengths are not comparable. Scaling each coefficient by
# its predictor's standard deviation in model_df puts every bar in the same
# unit: the change in earnings associated with a one-SD increase.
key_vars = ['MEDIAN_HH_INC', 'PCTPELL', 'stem_share', 'completion_rate']
coef = full.params[key_vars]  # raw coefficients, kept for reference
coef_per_sd = coef * model_df[key_vars].std()  # aligned on variable name

fig, ax = plt.subplots(figsize=(8, 5))
coef_per_sd.plot(kind='barh', ax=ax)
ax.axvline(0, color='gray', linestyle='--')  # zero reference line
ax.set_title('Full Model: Key Coefficients (per 1-SD increase)')
ax.set_xlabel('Change in MD_EARN_WNE_P10 per 1-SD increase in predictor')
ax.set_ylabel('')
fig.tight_layout()
plt.show()


  plt.show()


In [9]:
# R^2 chart: one bar per model, with the y-axis fixed to the [0, 1] range
labels = ['Baseline', 'Full']
values = [baseline.rsquared, full.rsquared]

fig, ax = plt.subplots(figsize=(6, 5))
ax.bar(labels, values)
ax.set_ylim(0, 1)
ax.set_title('Model Fit Comparison (R^2)')
ax.set_ylabel('R^2')
fig.tight_layout()
plt.show()


  plt.show()


In [10]:
# scatter plot of two model_df columns with a fitted line requested
# NOTE(review): which column lands on which axis is decided by
# davis_stats.scatter — confirm against its documentation.
scatter(
    model_df,
    'MD_EARN_WNE_P10',
    'stem_share',
    fit_line=True,
)


  plt.show()
