In [1]:
import pandas as pd
from pathlib import Path

In [52]:
# Approach: weight each college's per-state enrollment counts by that college's outcome measure, then sum (or average) down each state column to get one measure per state.
# Note: indexing both tables by UNITID would probably make this simpler. It could be worth doing, although it would not significantly affect performance.

In [None]:
# Directory layout, resolved relative to the directory the notebook is run from.
ROOT = Path.cwd()
DATA_DIR = ROOT.joinpath("datasets")
IPEDS_DIR = DATA_DIR.joinpath("ipeds")
OUTCOMES_DIR = IPEDS_DIR.joinpath("outcomes")    # the outcome-measures CSV is read from here
PROCESSED_DIR = IPEDS_DIR.joinpath("processed")  # derived CSVs are read from and written to here

In [56]:
# Which outcomes file to use: per the original author's note, the 4-year graduation rate
# for an entering class is found in the outcome-measures file published EIGHT years after
# that class entered (e.g. the 2018 file's 4-year figure describes the class that entered
# in 2010 and graduated in 2014).
outcomes = pd.read_csv(OUTCOMES_DIR / "om2018_rv.csv")

# Number of distinct schools (UNITIDs) present in the raw file.
print(outcomes["UNITID"].nunique(dropna=False))

# Keep only the rows coded OMCHRT == 10, which the original author identifies as the
# first-time, full-time cohort; renumber the rows from zero afterwards.
firstFullOutcomes = outcomes.loc[outcomes["OMCHRT"].eq(10)].reset_index(drop=True)

unitIds = firstFullOutcomes["UNITID"].reset_index(drop=True)

# Optional sanity check on the overall rate: firstFullOutcomes["OMAWDP4"].mean()

# Two-column table: school id and its OMAWDP4 value (used downstream as a percentage).
gradPercent = firstFullOutcomes[["UNITID", "OMAWDP4"]]
gradPercent

3818


Unnamed: 0,UNITID,OMAWDP4
0,100654,7.0
1,100663,30.0
2,100690,100.0
3,100706,15.0
4,100724,7.0
...,...,...
3667,491640,0.0
3668,491710,8.0
3669,492069,17.0
3670,492801,9.0


In [5]:
# Load the state-makeup table: one row per school (UNITID) with a count column per
# state of origin, plus the Unknown / US Total / GRAND TOTAL columns.
migration_csv = PROCESSED_DIR.joinpath("2010_crushed_migration.csv")
stateMakeup = pd.read_csv(filepath_or_buffer=migration_csv)
stateMakeup

Unnamed: 0,UNITID,Alabama,Alaska,Arizona,Arkansas,California,Colorado,Connecticut,Delaware,District of Columbia,...,Utah,Vermont,Virginia,Washington,West Virginia,Wisconsin,Wyoming,Unknown,US Total,GRAND TOTAL
0,100654,720.0,2.0,2.0,1.0,7.0,0.0,0.0,0.0,0.0,...,0.0,0.0,2.0,0.0,0.0,3.0,0.0,2.0,1123.0,1123
1,100663,1407.0,0.0,2.0,2.0,7.0,0.0,0.0,0.0,0.0,...,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1556.0,1571
2,100690,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,4
3,100706,503.0,2.0,0.0,0.0,1.0,2.0,0.0,0.0,0.0,...,0.0,1.0,2.0,0.0,0.0,0.0,0.0,1.0,607.0,622
4,100724,692.0,0.0,0.0,2.0,15.0,1.0,1.0,0.0,0.0,...,0.0,0.0,2.0,2.0,0.0,8.0,0.0,6.0,1100.0,1101
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6733,460631,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,97.0,97
6734,460640,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,24.0,24
6735,460659,0.0,0.0,0.0,0.0,20.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,20.0,20
6736,460668,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,15.0,15


In [57]:
# Restrict both tables to the schools (UNITIDs) that occur in each of them, so that
# every school kept has both migration counts and an outcome figure.
inOutcomes = stateMakeup["UNITID"].isin(gradPercent["UNITID"])
inMigration = gradPercent["UNITID"].isin(stateMakeup["UNITID"])
reducedMakeup = stateMakeup.loc[inOutcomes]
reducedGradRate = gradPercent.loc[inMigration]

# The two row counts printed here are expected to be equal.
for frame in (reducedMakeup, reducedGradRate):
    print(frame.shape)

(3458, 55)
(3458, 2)


In [59]:
# Scale every school's per-state enrollment counts uniformly by that school's graduation rate.
#
# The two tables are matched on UNITID rather than by row position, so a school's counts
# can never be multiplied by a different school's rate if the inputs are ordered differently.
makeupBySchool = reducedMakeup.set_index("UNITID").apply(pd.to_numeric)
gradFraction = pd.to_numeric(reducedGradRate.set_index("UNITID")["OMAWDP4"]) / 100  # percent -> fraction

# Look up each school's rate in the row order of the makeup table. reindex raises if a
# UNITID occurs more than once in the grad-rate table, since the match would be ambiguous.
gradFractionAligned = gradFraction.reindex(makeupBySchool.index)

# Column labels of the scaled table: every makeup column except UNITID.
COL_LABELS = makeupBySchool.columns.tolist()

# Row-wise multiplication (one rate per school); the result is renumbered from zero,
# keeping the same row order as reducedMakeup.
scaled = makeupBySchool.mul(gradFractionAligned.to_numpy(), axis=0).reset_index(drop=True)
print(scaled.shape)

scaled

(3458, 54)


Unnamed: 0,Alabama,Alaska,Arizona,Arkansas,California,Colorado,Connecticut,Delaware,District of Columbia,Florida,...,Utah,Vermont,Virginia,Washington,West Virginia,Wisconsin,Wyoming,Unknown,US Total,GRAND TOTAL
0,50.40,0.14,0.14,0.07,0.49,0.00,0.00,0.0,0.0,0.77,...,0.0,0.00,0.14,0.00,0.0,0.21,0.0,0.14,78.61,78.61
1,422.10,0.00,0.60,0.60,2.10,0.00,0.00,0.0,0.0,3.30,...,0.3,0.00,0.30,0.00,0.0,0.30,0.0,0.00,466.80,471.30
2,2.00,0.00,0.00,0.00,0.00,0.00,0.00,0.0,0.0,1.00,...,0.0,0.00,0.00,0.00,0.0,0.00,0.0,0.00,4.00,4.00
3,75.45,0.30,0.00,0.00,0.15,0.30,0.00,0.0,0.0,0.45,...,0.0,0.15,0.30,0.00,0.0,0.00,0.0,0.15,91.05,93.30
4,48.44,0.00,0.00,0.14,1.05,0.07,0.07,0.0,0.0,3.57,...,0.0,0.00,0.14,0.14,0.0,0.56,0.0,0.42,77.00,77.07
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3453,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.0,0.0,79.12,...,0.0,0.00,0.00,0.00,0.0,0.00,0.0,0.00,80.96,80.96
3454,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.0,0.0,0.00,...,0.0,0.00,0.00,0.00,0.0,0.00,0.0,0.00,111.15,111.15
3455,0.00,0.00,0.00,0.00,344.82,0.00,0.00,0.0,0.0,0.00,...,0.0,0.00,0.00,0.00,0.0,0.00,0.0,5.46,350.28,351.96
3456,0.00,0.00,0.00,0.00,659.40,0.00,0.00,0.0,0.0,0.00,...,0.0,0.00,0.00,0.00,0.0,0.00,0.0,8.40,667.80,676.20


In [73]:
# Average graduation rate per state of origin: the number of students from a state who
# graduate divided by the size of that state's incoming class (4 years earlier).
graduatingMakeup = scaled.sum()  # sum() skips NaN cells, e.g. schools with no usable rate

# `scaled` was built row-for-row and column-for-column from `reducedMakeup` (minus UNITID),
# so a positional mask lines up. Enrollment is counted only where a scaled value exists;
# otherwise schools with a missing graduation rate would add to the denominator but not to
# the numerator and pull every state's rate down.
hasGradFigure = scaled.notna().to_numpy()
totalMakeup = (
    reducedMakeup.drop(columns="UNITID")
    .apply(pd.to_numeric)
    .where(hasGradFigure)
    .sum()
)

# Fraction (0-1) per column; the Unknown / US Total / GRAND TOTAL columns are included too.
gradPercentByState = graduatingMakeup / totalMakeup
gradPercentByState

Alabama                 0.234839
Alaska                  0.273732
Arizona                 0.328288
Arkansas                0.277166
California              0.380189
Colorado                0.322917
Connecticut             0.414364
Delaware                0.445272
District of Columbia    0.363088
Florida                 0.406851
Georgia                 0.283646
Hawaii                  0.302711
Idaho                   0.247630
Illinois                0.371750
Indiana                 0.347546
Iowa                    0.356038
Kansas                  0.327759
Kentucky                0.292672
Louisiana               0.256330
Maine                   0.379295
Maryland                0.344998
Massachusetts           0.436640
Michigan                0.277449
Minnesota               0.378329
Mississippi             0.267724
Missouri                0.329383
Montana                 0.275596
Nebraska                0.327949
Nevada                  0.217098
New Hampshire           0.458668
New Jersey

In [77]:
# Write the per-state rates to the processed-data folder as a header-less CSV
# (one line per entry: index label, value).
output_csv = PROCESSED_DIR / "class 2010 4yr grad rate by state.csv"
gradPercentByState.to_csv(output_csv, header=False)