In [1]:
import os
import sys

import numpy as np
import pandas as pd
import psycopg2 as pg2  # Preferred cursor connection
import yaml
from sqlalchemy import create_engine  # preferred for pushing back to DB

# Location of the YAML secrets file. Set the SECRETS_PATH environment variable to
# point at your own copy; the default is the original hard-coded location.
secrets_path = os.environ.get('SECRETS_PATH', '/data/users/dschnelb/secrets.yaml')
with open(secrets_path, 'r') as f:
    # loads contents of secrets.yaml into a python dictionary
    secret_config = yaml.safe_load(f)

# Set database connection to `conn` using the parameters under the `db` key
db_params = secret_config['db']
conn = pg2.connect(host=db_params['host'],
                   port=db_params['port'],
                   dbname=db_params['dbname'],
                   user=db_params['user'],
                   password=db_params['password'])

# Connect cursor with psycopg2 database connection
cur = conn.cursor()

In [2]:
# Pull the ninth-grade GPA columns from sketch.grade_9_gpa.
qry = ''' select student_lookup, school, gpa_9, gpa_9_missing, school_gpa_9_rank, school_gpa_9_decile
from sketch.grade_9_gpa; '''

cur.execute(qry)
rows = cur.fetchall()

# Build the frame from the fetched tuples: the first field of every row is coerced
# to int and the column labels are taken from the cursor metadata.
gpa = pd.DataFrame(
    [[int(row[0]), *row[1:]] for row in rows],
    columns=[col[0] for col in cur.description],
)

# Make sure student_id is an int, then turn None values into NaN
gpa = gpa.astype({'student_lookup': 'int'}).replace([None], np.nan)
gpa

Unnamed: 0,student_lookup,school,gpa_9,gpa_9_missing,school_gpa_9_rank,school_gpa_9_decile
0,57112,COSHOCTON CITY HIGH SCHOOL,,1,83,
1,57156,COSHOCTON CITY HIGH SCHOOL,,1,83,
2,57153,COSHOCTON CITY HIGH SCHOOL,,1,83,
3,57151,COSHOCTON CITY HIGH SCHOOL,,1,83,
4,57148,COSHOCTON CITY HIGH SCHOOL,,1,83,
...,...,...,...,...,...,...
19306,22495,Zanesville High School,,1,1,
19307,22494,Zanesville High School,,1,1,
19308,22493,Zanesville High School,,1,1,
19309,22445,Zanesville High School,,1,1,


In [7]:
# Distinct (student_lookup, school_year, school) rows for grade 10, 2007-2013.
qry = ''' select distinct student_lookup, school_year, COALESCE(school_name,school_code) as school
from clean.all_snapshots
where grade=10
and school_year BETWEEN 2007 AND 2013; '''

cur.execute(qry)
rows = cur.fetchall()

# Column labels come from the cursor metadata; the first field of each row is cast to int.
column_names = [col[0] for col in cur.description]
records = [[int(student), *others] for student, *others in rows]
df = pd.DataFrame(records, columns=column_names)

# Make sure student_id is an int
df = df.astype({'student_lookup': 'int'})

In [8]:
# Display the grade-10 frame (the rendered output is truncated by pandas).
df

Unnamed: 0,student_lookup,school_year,school
0,35993,2011,Philo High School
1,39939,2010,RVHS
2,52418,2011,River View High School
3,9028,2013,Ridgewood High School
4,829,2013,Coshocton High School
...,...,...,...
13288,41836,2012,New Lexington High School
13289,13710,2013,Philo High School
13290,4992,2012,Philo High School
13291,16793,2013,West Muskingum High School


In [9]:
# Left-join the ninth-grade GPA columns onto the grade-10 rows by student_lookup.
# Both frames carry a `school` column, so pandas suffixes them as school_x / school_y.
# NOTE(review): a left merge adds rows if `gpa` repeats a student_lookup -
# confirm len(data) == len(df).
data = df.merge(gpa, how='left', on='student_lookup')
data.head()

Unnamed: 0,student_lookup,school_year,school_x,school_y,gpa_9,gpa_9_missing,school_gpa_9_rank,school_gpa_9_decile
0,35993,2011,Philo High School,FPHS,,1.0,153.0,
1,39939,2010,RVHS,,,,,
2,52418,2011,River View High School,RVHS,1.222222222222222,0.0,126.0,10.0
3,9028,2013,Ridgewood High School,Ridgewood High School,,1.0,89.0,
4,829,2013,Coshocton High School,Coshocton High School,3.15625,0.0,36.0,3.0


In [10]:
# school_x came from the grade-10 frame and school_y from the ninth-grade GPA frame.
data = data.rename(columns={'school_x': 'school_10', 'school_y': 'school_9'})
data.head()

Unnamed: 0,student_lookup,school_year,school_10,school_9,gpa_9,gpa_9_missing,school_gpa_9_rank,school_gpa_9_decile
0,35993,2011,Philo High School,FPHS,,1.0,153.0,
1,39939,2010,RVHS,,,,,
2,52418,2011,River View High School,RVHS,1.222222222222222,0.0,126.0,10.0
3,9028,2013,Ridgewood High School,Ridgewood High School,,1.0,89.0,
4,829,2013,Coshocton High School,Coshocton High School,3.15625,0.0,36.0,3.0


In [11]:
# Share of merged rows whose gpa_9 is null (one minus the non-null fraction).
1 - data['gpa_9'].notnull().sum() / len(data)

0.3303994583615437

## 33% of ALL students entering grade 10 in the years 2007-2013 are missing ninth-grade GPA data. Note that, unlike our load_data function, this does not drop the transfers, etc.

In [12]:
# Cast gpa_9 to float so numeric aggregations (e.g. the median) can be computed on it.
data = data.astype({'gpa_9': 'float'})

In [13]:
# Per school year: row count, median ninth-grade GPA, and sum of the gpa_9_missing flag.
yearly_aggs = {
    'student_lookup': 'count',
    'gpa_9': 'median',
    'gpa_9_missing': 'sum',
}
missing_by_yr = data.groupby(by='school_year').aggregate(yearly_aggs)

In [14]:
# Fraction of each year's rows that are flagged as missing a ninth-grade GPA.
missing_by_yr['missing%'] = missing_by_yr['gpa_9_missing'].div(missing_by_yr['student_lookup'])
missing_by_yr

Unnamed: 0_level_0,student_lookup,gpa_9,gpa_9_missing,missing%
school_year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2007,1360,2.817391,144.0,0.105882
2008,1328,2.728095,117.0,0.088102
2009,1279,2.73913,136.0,0.106333
2010,1543,2.72,150.0,0.097213
2011,2420,2.798437,194.0,0.080165
2012,2658,2.750806,1099.0,0.413469
2013,2705,2.916667,460.0,0.170055


## Median GPA imputation results in a pretty high replacement GPA overall (the yearly medians above are roughly 2.7-2.9)

In [18]:
# Per (school year, grade-10 school): row count and sum of the gpa_9_missing flag.
school_aggs = {
    'student_lookup': 'count',
    'gpa_9_missing': 'sum',
}
by_school = data.groupby(by=['school_year', 'school_10']).aggregate(school_aggs)

In [19]:
# Fraction of each (year, school) group that is flagged as missing a ninth-grade GPA.
by_school['missing%'] = by_school['gpa_9_missing'].div(by_school['student_lookup'])
by_school

Unnamed: 0_level_0,Unnamed: 1_level_0,student_lookup,gpa_9_missing,missing%
school_year,school_10,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2007,COSHOCTON CITY HIGH SCHOOL,171,70.0,0.409357
2007,Maysville High School,198,22.0,0.111111
2007,PHILO HIGH SCHOOL,217,13.0,0.059908
2007,RIDGEWOOD HIGH SCHOOL,131,2.0,0.015267
2007,RIVER VIEW HIGH SCHOOL,197,3.0,0.015228
...,...,...,...,...
2013,River View High School,189,19.0,0.100529
2013,Sheridan High School,185,4.0,0.021622
2013,Tri-Valley High School,283,47.0,0.166078
2013,West Muskingum High School,138,11.0,0.079710


In [20]:
# Pivot schools into columns (one row per school year) and keep only the missing% values.
by_school.unstack(level='school_10')['missing%']

school_10,COHI,COSHOCTON CITY HIGH SCHOOL,Coshocton High School,Crooksville High School,EMHS,FPHS,John Glenn High School,Logan High School,MAHS,Maysville High School,...,RVHS,Ridgewood High School,River View High School,Sheridan High School,TVHS,Tri-Valley High School,WMHS,West Muskingum High School,Zanesville CCFD,Zanesville High School
school_year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2007,,0.409357,,,,,,,,0.111111,...,,,,,,0.03125,,0.158228,,
2008,0.401099,,,,,0.030303,,,0.026455,,...,0.014706,,,,0.031359,,0.074324,,,
2009,0.371257,,,,,0.093407,,,0.065574,,...,0.049751,,,,0.01773,,0.106383,,,
2010,0.431507,,,0.01087,1.0,0.110497,0.016854,,0.058296,,...,0.09596,,,,0.026119,,0.035461,,,
2011,,,0.351351,0.180952,,,0.196721,0.0,,0.075581,...,,0.113402,0.108247,0.0,,0.023904,,0.092308,0.333333,0.059211
2012,,,0.053333,0.15748,,,0.164835,0.925,,0.096939,...,,0.09901,0.058824,0.965,,0.200787,,0.086957,0.333333,0.921311
2013,,,0.058442,0.125,,,0.094737,0.014409,,0.065,...,,0.1,0.100529,0.021622,,0.166078,,0.07971,,0.942857


## Zanesville has effectively no GPAs available in the later years;
## Missingness comes in these 3-year spurts, and it must be related in some way to linking ninth-grade data onto the student_lookup when that student is in the 10th grade...

In [23]:
# Re-index the merged frame by student so individual students can be looked up with .loc.
test = data.set_index(keys='student_lookup', drop=True)
test.head(n=5)

Unnamed: 0_level_0,school_year,school_10,school_9,gpa_9,gpa_9_missing,school_gpa_9_rank,school_gpa_9_decile
student_lookup,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
35993,2011,Philo High School,FPHS,,1.0,153.0,
39939,2010,RVHS,,,,,
52418,2011,River View High School,RVHS,1.222222,0.0,126.0,10.0
9028,2013,Ridgewood High School,Ridgewood High School,,1.0,89.0,
829,2013,Coshocton High School,Coshocton High School,3.15625,0.0,36.0,3.0


In [28]:
# Positional slice: every column from position 3 onward. In the output shown these are
# gpa_9, gpa_9_missing, school_gpa_9_rank and school_gpa_9_decile; the selection
# depends on the column order of `test`.
test.iloc[:,3:]

Unnamed: 0_level_0,gpa_9,gpa_9_missing,school_gpa_9_rank,school_gpa_9_decile
student_lookup,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
35993,,1.0,153.0,
39939,,,,
52418,1.222222,0.0,126.0,10.0
9028,,1.0,89.0,
829,3.156250,0.0,36.0,3.0
...,...,...,...,...
41836,,1.0,2.0,
13710,3.876667,0.0,10.0,1.0
4992,2.013333,0.0,123.0,9.0
16793,1.300000,0.0,88.0,10.0


## Running through feature_ranker to check again - it looks like it works, but the result will be highly contingent on how we impute GPA. A GPA = NaN is put at the TOP when ranking (i.e., above a 4.0 GPA). Imputing gives a pretty high replacement GPA, which we have reason to suspect is inaccurate. For example, some of the NULL values might be because a student failed the 9th grade and needed to repeat it, which could make their "first" 9th grade year GPA null even though their "second" 9th grade year actually has one.

In [30]:
# Convert the columns from position 3 onward to a float array and keep the first of
# them (gpa_9 in the frame shown above); missing values become NaN.
# The builtin `float` is used as the dtype because the `np.float` alias was deprecated
# in NumPy 1.20 and removed in 1.24, where it raises AttributeError.
x = test.iloc[:, 3:].to_numpy(dtype=float)[:, 0]
x[:10]

array([       nan,        nan, 1.22222222,        nan, 3.15625   ,
       3.3       ,        nan, 2.68709677, 2.87118644,        nan])

In [31]:
from scipy import stats

# Ranking configuration:
#   method   - tie-breaking rule handed to stats.rankdata
#   subtract - offset taken off each rank before it is scaled by the number of rows
method, subtract = "max", 0

In [32]:
# Rank the GPA values (rank 1 = smallest value) using the configured tie-breaking method.
# NOTE(review): how NaN entries are ranked depends on the SciPy version (newer releases
# add a `nan_policy` argument) - confirm against the installed version.
ranks = stats.rankdata(x, method=method)
ranks[:10]

array([13293,  9510,   457,  9519,  5847,  6399,  9536,  4020,  4752,
        9526])

In [33]:
# Scale each (offset) rank by the number of observations so values fall in (0, 1].
n_obs = x.shape[0]
ranks = [(r - subtract) / n_obs for r in ranks]
ranks[:10]

[1.0,
 0.7154141277364027,
 0.03437899646430452,
 0.7160911758068156,
 0.4398555630783119,
 0.4813811780636425,
 0.7173700443842624,
 0.302414804784473,
 0.35748138117806366,
 0.7166177687504701]

In [34]:
# With 'max' ranking, flip the scale so the largest value maps to 0 and small values approach 1.
ranks = [1 - r for r in ranks] if method == 'max' else ranks
ranks[:10]

[0.0,
 0.28458587226359733,
 0.9656210035356955,
 0.2839088241931844,
 0.5601444369216881,
 0.5186188219363574,
 0.28262995561573756,
 0.697585195215527,
 0.6425186188219363,
 0.2833822312495299]

In [35]:
# Materialise the list of scaled ranks as a NumPy array of risk scores.
risks = np.asarray(ranks)
risks


array([0.        , 0.28458587, 0.965621  , ..., 0.86255924, 0.95870007,
       0.99759272])

In [36]:
# Complement of each risk score; the subtraction already allocates a new array.
converse_risks = 1 - risks
converse_risks

array([1.        , 0.71541413, 0.034379  , ..., 0.13744076, 0.04129993,
       0.00240728])

In [37]:
# Two-column array: column 0 holds the complement, column 1 the risk score.
raw_risks = np.stack((converse_risks, risks), axis=-1)
raw_risks[:, 1]

array([0.        , 0.28458587, 0.965621  , ..., 0.86255924, 0.95870007,
       0.99759272])

In [38]:
# Label the risk column with the student index and show the first ten rows.
pd.DataFrame({"Risk": raw_risks[:, 1]}, index=test.index).iloc[:10]

Unnamed: 0_level_0,Risk
student_lookup,Unnamed: 1_level_1
35993,0.0
39939,0.284586
52418,0.965621
9028,0.283909
829,0.560144
37939,0.518619
36721,0.28263
34638,0.697585
33383,0.642519
38110,0.283382


In [39]:
# Inspect the full record for student 52418 (risk 0.965621 in the table above).
test.loc[52418]

school_year                              2011
school_10              River View High School
school_9                                 RVHS
gpa_9                                 1.22222
gpa_9_missing                               0
school_gpa_9_rank                         126
school_gpa_9_decile                        10
Name: 52418, dtype: object

In [108]:
# Inspect the full record for student 14467.
# NOTE(review): this cell was executed out of order (In [108]) and its output shows a
# single `school` column rather than school_10 / school_9, so it appears to come from a
# different version of `test` - re-run to confirm.
test.loc[14467]

school_year                          2013
school                  Logan High School
gpa_9                  2.9729729729729730
gpa_9_missing                           0
school_gpa_9_rank                     182
school_gpa_9_decile                     6
Name: 14467, dtype: object