In [2]:
import pandas as pd
from pathlib import Path

In [None]:
# copy of outcome_measures_by_state but it uses the specific graduation rates data on IPEDS instead of the outcome measures data

In [3]:
ROOT = Path.cwd()
DATA_DIR = ROOT / "datasets"
IPEDS_DIR = DATA_DIR / "ipeds"
GRAD_DIR = IPEDS_DIR / "graduation"
PROCESSED_DIR = IPEDS_DIR / "processed"

year = 2016

In [25]:
# grad rate data should be the 4 YEAR GRADUATION RATE in the database SIX YEARS AFTER the class entered, due to how the data is structured. Ex. 2022 grad rate data describes the incoming class of 2016 (for 4-year), graduating in 2020. 

grad = pd.read_csv(GRAD_DIR / f"gr{year+6}_rv.csv")

# bachelors or equivalent seeking cohort size (denominator)
totalCohort = grad[grad["GRTYPE"] == 8].reset_index(drop=True)
totalCohort = pd.concat([totalCohort["UNITID"], totalCohort["GRTOTLT"]], axis = 1)

# Completers of bachelors in <= 4 yrs (numerator)
graduatedCohort = grad[grad["GRTYPE"] == 13].reset_index(drop = True)
graduatedCohort = pd.concat([graduatedCohort["UNITID"], graduatedCohort["GRTOTLT"]], axis = 1)

mergedCohort = pd.merge(graduatedCohort, totalCohort, on="UNITID", how="inner")
mergedCohort["GRTOTLT_x"] = mergedCohort["GRTOTLT_x"] / mergedCohort["GRTOTLT_y"]
mergedCohort = mergedCohort.iloc[:, 0:2]
mergedCohort.columns = ["UNITID", "GRADRATE"]
mergedCohort

Unnamed: 0,UNITID,GRADRATE
0,100654,0.126781
1,100663,0.444502
2,100690,0.500000
3,100706,0.389774
4,100724,0.136483
...,...,...
1873,494685,0.357143
1874,496627,0.720000
1875,497408,1.000000
1876,498562,0.404128


In [26]:
# grab state makeups data
migration_csv = PROCESSED_DIR / f"{year}_crushed_migration.csv"
stateMakeup = pd.read_csv(migration_csv)
stateMakeup

Unnamed: 0,UNITID,Alabama,Alaska,Arizona,Arkansas,California,Colorado,Connecticut,Delaware,District of Columbia,...,Utah,Vermont,Virginia,Washington,West Virginia,Wisconsin,Wyoming,Unknown,US Total,GRAND TOTAL
0,100654,884.0,0.0,1.0,3.0,33.0,1.0,0.0,3.0,3.0,...,0.0,0.0,2.0,0.0,0.0,3.0,0.0,0.0,1428.0,1448
1,100663,1712.0,1.0,2.0,1.0,1.0,1.0,0.0,0.0,1.0,...,0.0,0.0,3.0,0.0,1.0,3.0,0.0,0.0,1974.0,2021
2,100690,19.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,23.0,23
3,100706,861.0,0.0,0.0,1.0,3.0,1.0,1.0,0.0,0.0,...,0.0,0.0,7.0,0.0,0.0,3.0,0.0,0.0,1184.0,1213
4,100724,662.0,0.0,1.0,0.0,20.0,5.0,1.0,0.0,4.0,...,0.0,0.0,6.0,2.0,0.0,11.0,0.0,0.0,1148.0,1163
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6191,489830,0.0,0.0,0.0,21.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,21.0,21
6192,489900,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,5
6193,489937,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,28.0,29
6194,490009,0.0,0.0,0.0,0.0,2.0,4.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8.0,8


In [28]:
# reduce data to fewest rows, so we only evaluate schools with both migration and outcome data
stateAndGradRate = pd.merge(stateMakeup, mergedCohort, on="UNITID", how = "inner")

stateAndGradRate


Unnamed: 0,UNITID,Alabama,Alaska,Arizona,Arkansas,California,Colorado,Connecticut,Delaware,District of Columbia,...,Vermont,Virginia,Washington,West Virginia,Wisconsin,Wyoming,Unknown,US Total,GRAND TOTAL,GRADRATE
0,100654,884.0,0.0,1.0,3.0,33.0,1.0,0.0,3.0,3.0,...,0.0,2.0,0.0,0.0,3.0,0.0,0.0,1428.0,1448,0.126781
1,100663,1712.0,1.0,2.0,1.0,1.0,1.0,0.0,0.0,1.0,...,0.0,3.0,0.0,1.0,3.0,0.0,0.0,1974.0,2021,0.444502
2,100690,19.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,23.0,23,0.500000
3,100706,861.0,0.0,0.0,1.0,3.0,1.0,1.0,0.0,0.0,...,0.0,7.0,0.0,0.0,3.0,0.0,0.0,1184.0,1213,0.389774
4,100724,662.0,0.0,1.0,0.0,20.0,5.0,1.0,0.0,4.0,...,0.0,6.0,2.0,0.0,11.0,0.0,0.0,1148.0,1163,0.136483
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1845,488350,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,17.0,19,0.263158
1846,488846,0.0,28.0,20.0,0.0,171.0,32.0,9.0,5.0,7.0,...,3.0,39.0,52.0,2.0,8.0,2.0,7.0,1198.0,2553,0.103093
1847,489344,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1,1.000000
1848,489779,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,21.0,21,0.100000


In [52]:
# scaling school population uniformly by school grad rate
scaled = pd.DataFrame(stateAndGradRate.iloc[:, 1:-1].apply(pd.to_numeric).values * (stateAndGradRate.iloc[: , -1:].apply(pd.to_numeric, axis = 1).values))
scaled.columns = stateAndGradRate.columns[1:-1]
scaled

Unnamed: 0,Alabama,Alaska,Arizona,Arkansas,California,Colorado,Connecticut,Delaware,District of Columbia,Florida,...,Utah,Vermont,Virginia,Washington,West Virginia,Wisconsin,Wyoming,Unknown,US Total,GRAND TOTAL
0,112.074074,0.000000,0.126781,0.380342,4.183761,0.126781,0.000000,0.380342,0.380342,4.437322,...,0.000000,0.000000,0.253561,0.000000,0.000000,0.380342,0.000000,0.000000,181.042735,183.578348
1,760.987552,0.444502,0.889004,0.444502,0.444502,0.444502,0.000000,0.000000,0.444502,12.446058,...,0.000000,0.000000,1.333506,0.000000,0.444502,1.333506,0.000000,0.000000,877.447095,898.338693
2,9.500000,0.000000,0.000000,0.000000,0.000000,0.500000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.500000,0.000000,0.000000,11.500000,11.500000
3,335.595138,0.000000,0.000000,0.389774,1.169321,0.389774,0.389774,0.000000,0.000000,5.846605,...,0.000000,0.000000,2.728416,0.000000,0.000000,1.169321,0.000000,0.000000,461.492037,472.795474
4,90.351706,0.000000,0.136483,0.000000,2.729659,0.682415,0.136483,0.000000,0.545932,8.461942,...,0.000000,0.000000,0.818898,0.272966,0.000000,1.501312,0.000000,0.000000,156.682415,158.729659
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1845,0.000000,0.000000,0.000000,0.000000,0.263158,0.000000,0.000000,0.000000,0.000000,1.052632,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,4.473684,5.000000
1846,0.000000,2.886598,2.061856,0.000000,17.628866,3.298969,0.927835,0.515464,0.721649,10.309278,...,1.134021,0.309278,4.020619,5.360825,0.206186,0.824742,0.206186,0.721649,123.505155,263.195876
1847,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,1.000000,1.000000
1848,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,2.100000,2.100000


In [56]:
# divide the total graduating from a given state by the total incoming class size (4 years earlier) to get the average graduation rate from the state. 
graduatingMakeup = scaled.sum()

totalMakeup = stateMakeup.drop(columns = "UNITID").sum()

gradPercentByState = graduatingMakeup / totalMakeup
gradPercentByState

Alabama                 0.200323
Alaska                  0.196919
Arizona                 0.188544
Arkansas                0.224057
California              0.198382
Colorado                0.272941
Connecticut             0.389530
Delaware                0.303005
District of Columbia    0.339796
Florida                 0.212714
Georgia                 0.235693
Hawaii                  0.255178
Idaho                   0.217639
Illinois                0.307019
Indiana                 0.331100
Iowa                    0.264669
Kansas                  0.221123
Kentucky                0.266538
Louisiana               0.200088
Maine                   0.302258
Maryland                0.292265
Massachusetts           0.406827
Michigan                0.255809
Minnesota               0.329370
Mississippi             0.132865
Missouri                0.246747
Montana                 0.241091
Nebraska                0.262726
Nevada                  0.175251
New Hampshire           0.388377
New Jersey

In [None]:
gradPercentByState.to_csv(PROCESSED_DIR / f"class_{year}_4yr_gradrate_by_state.csv", header = ["GRADRATE"])