In [24]:
%matplotlib inline
import pandas as pd
import DataHelper
import numpy as np
import statsmodels.api as sm
from sklearn.model_selection import train_test_split
import xgboost as xgb
import sklearn.metrics as metrics

In [2]:
e = DataHelper.eclass_data()

In [3]:
def return_race(rcode):
    """
    Returns the racial identifier from IPEDS definitions
    
    All 'unknown' columns are reduced to 'Not Reported'
    """
    race_codes = {'10000000':'American Indian or Alaska Native'
                  , '01000000':'Asian'
                  , '00100000':'Black'
                  , '00010000':'Hispanic/Latino'
                  , '00001000':'Native Hawaiian or other Pacific Islander'
                  , '00000100':'White'
                 }
    try:
        return race_codes[rcode]
    except KeyError:
        return 'Not Reported'

races = e.post.set_index('anon_student_id')[[ 'Q52_1', 'Q52_2', 'Q52_3', 'Q52_4', 'Q52_5', 'Q52_6', 'Q52_7', 'race_unknown']].fillna(0).astype(int).astype(str)

races = (races[ 'Q52_1'] + races['Q52_2'] + races['Q52_3'] + races['Q52_4'] + races['Q52_5'] + races['Q52_6'] + races['Q52_7'] + races['race_unknown'])    

races = races.apply(return_race)
races = pd.DataFrame({'Race':races})


In [4]:
gender = e.post[['anon_student_id','Q54']].drop_duplicates().set_index('anon_student_id')#
gender.columns = ['Gender']

In [5]:
buffy_student = pd.DataFrame(index=pd.concat([e.get_buffy_pre(), e.get_buffy_post()]).anon_student_id.unique())
buffy_student['BUFFY'] = 1

In [6]:
future_plans = e.post[['anon_student_id', 'Q53_1', 'Q53_2', 'Q53_3', 'Q53_4', 'Q53_5', 'Q53_6', 'Q53_7', 'Q53_8']].set_index('anon_student_id')

In [7]:
index = np.arange(0, pd.concat([e.pre.anon_student_id, e.post.anon_student_id]).max()+1, 1)
students = pd.DataFrame(index=index)

students = students.join(races, how='left').join(gender, how='left').join(buffy_student, how='left').join(future_plans).reset_index()
students = students.drop_duplicates(subset=['index'], keep='first').set_index('index').dropna(how='all')
students['BUFFY'] = students['BUFFY'].fillna(0)
students

Unnamed: 0_level_0,Race,Gender,BUFFY,Q53_1,Q53_2,Q53_3,Q53_4,Q53_5,Q53_6,Q53_7,Q53_8
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
5,Hispanic/Latino,2.0,0.0,2.0,2.0,1.0,2.0,2.0,2.0,2.0,1.0
6,White,2.0,0.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0
7,White,2.0,0.0,2.0,2.0,1.0,2.0,1.0,2.0,2.0,2.0
8,Black,1.0,0.0,2.0,1.0,2.0,2.0,2.0,2.0,2.0,2.0
10,White,2.0,0.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0
...,...,...,...,...,...,...,...,...,...,...,...
43076,Black,2.0,0.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0
43077,Black,2.0,0.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0
43078,Black,2.0,0.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0
43079,White,2.0,0.0,2.0,1.0,2.0,2.0,2.0,2.0,1.0,2.0


In [8]:
students

Unnamed: 0_level_0,Race,Gender,BUFFY,Q53_1,Q53_2,Q53_3,Q53_4,Q53_5,Q53_6,Q53_7,Q53_8
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
5,Hispanic/Latino,2.0,0.0,2.0,2.0,1.0,2.0,2.0,2.0,2.0,1.0
6,White,2.0,0.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0
7,White,2.0,0.0,2.0,2.0,1.0,2.0,1.0,2.0,2.0,2.0
8,Black,1.0,0.0,2.0,1.0,2.0,2.0,2.0,2.0,2.0,2.0
10,White,2.0,0.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0
...,...,...,...,...,...,...,...,...,...,...,...
43076,Black,2.0,0.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0
43077,Black,2.0,0.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0
43078,Black,2.0,0.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0
43079,White,2.0,0.0,2.0,1.0,2.0,2.0,2.0,2.0,1.0,2.0


In [9]:
e.cis[['Q15', 'anon_instructor_id', 'anon_university_id', 'ResponseId']]

Unnamed: 0,Q15,anon_instructor_id,anon_university_id,ResponseId
0,,0,0,R_5vvkiNRDrvutGdX
1,4 year college,1,1,R_5mXbzVvsHehQ07L
2,,1,1,R_1g28mGAPrOOZ4Mh
3,4 year college,2,1,R_1pL1oZ2Un6TQfkp
4,4 year college,3,2,R_2ritbcOI1sEN5mn
...,...,...,...,...
594,4 year college,203,132,R_3qJOKOP2r5VyMrj
595,PhD granting institution,69,32,R_3nu6XUP5nAi4lZe
596,,69,32,R_qITeC301YGuhQNr
597,,69,32,R_A72jufifWSmfboZ


In [10]:
alldata = pd.concat([e.pre, e.post])
cis_connector = alldata[['anon_student_id', 'ResponseId']].drop_duplicates().drop_duplicates(subset='anon_student_id').set_index('anon_student_id')
students = students.join(cis_connector).merge(e.cis[['Q15', 'anon_instructor_id', 'anon_university_id', 'ResponseId']])

In [11]:
# data = pd.get_dummies(students)
# data.dropna(inplace=True)

In [19]:
cols = ['Race', 'Gender', 'BUFFY', 'Q53_1', 'Q53_2','Q53_3','Q53_4','Q53_5','Q53_6','Q53_7','Q53_8','Q15','anon_instructor_id','anon_university_id']

pd.get_dummies(students[cols]).columns

Index(['Gender', 'BUFFY', 'Q53_1', 'Q53_2', 'Q53_3', 'Q53_4', 'Q53_5', 'Q53_6',
       'Q53_7', 'Q53_8', 'anon_instructor_id', 'anon_university_id',
       'Race_American Indian or Alaska Native', 'Race_Asian', 'Race_Black',
       'Race_Hispanic/Latino',
       'Race_Native Hawaiian or other Pacific Islander', 'Race_Not Reported',
       'Race_White', 'Q15_2 year college', 'Q15_4 year college',
       'Q15_Master's granting institution', 'Q15_PhD granting institution'],
      dtype='object')

In [22]:
xcols = ['Gender', 'Q53_1', 'Q53_2', 'Q53_3', 'Q53_4', 'Q53_5', 'Q53_6',
       'Q53_7', 'Q53_8', 'anon_instructor_id', 'anon_university_id',
       'Race_American Indian or Alaska Native', 'Race_Asian', 'Race_Black',
       'Race_Hispanic/Latino',
       'Race_Native Hawaiian or other Pacific Islander', 'Race_Not Reported',
       'Race_White', 'Q15_2 year college', 'Q15_4 year college',
       "Q15_Master's granting institution", 'Q15_PhD granting institution']
ycol = 'BUFFY'

data = pd.get_dummies(students[cols])
data.dropna(inplace=True)

X_train, X_test, y_train, y_test = train_test_split(data[xcols], data[ycol])

In [23]:
logit_mod = sm.Logit(exog=X_train, endog=y_train)
logit_res = logit_mod.fit_regularized()
logit_res.summary()

  return 1/(1+np.exp(-X))
  return np.sum(np.log(self.cdf(q*np.dot(X,params))))


Optimization terminated successfully    (Exit mode 0)
            Current function value: 0.34134465035992706
            Iterations: 180
            Function evaluations: 189
            Gradient evaluations: 180


0,1,2,3
Dep. Variable:,BUFFY,No. Observations:,20655.0
Model:,Logit,Df Residuals:,20633.0
Method:,MLE,Df Model:,21.0
Date:,"Tue, 01 Sep 2020",Pseudo R-squ.:,0.1465
Time:,14:22:57,Log-Likelihood:,-7050.5
converged:,True,LL-Null:,-8260.9
Covariance Type:,nonrobust,LLR p-value:,0.0

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Gender,0.2364,0.045,5.250,0.000,0.148,0.325
Q53_1,-2.0087,0.056,-36.035,0.000,-2.118,-1.899
Q53_2,0.1353,0.047,2.894,0.004,0.044,0.227
Q53_3,-0.2717,0.049,-5.568,0.000,-0.367,-0.176
Q53_4,0.5178,0.065,7.995,0.000,0.391,0.645
Q53_5,0.1948,0.063,3.101,0.002,0.072,0.318
Q53_6,-0.3968,0.091,-4.381,0.000,-0.574,-0.219
Q53_7,-0.0802,0.071,-1.136,0.256,-0.218,0.058
Q53_8,0.0340,0.072,0.469,0.639,-0.108,0.176


In [48]:
preds = logit_res.predict(X_test)
metrics.roc_auc_score(y_true=y_test, y_score=preds)

0.7369378955187001

In [25]:
X_train.join(y_train)

Unnamed: 0,Gender,Q53_1,Q53_2,Q53_3,Q53_4,Q53_5,Q53_6,Q53_7,Q53_8,anon_instructor_id,...,Race_Black,Race_Hispanic/Latino,Race_Native Hawaiian or other Pacific Islander,Race_Not Reported,Race_White,Q15_2 year college,Q15_4 year college,Q15_Master's granting institution,Q15_PhD granting institution,BUFFY
8698,2.0,1.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,121,...,0,0,0,0,1,0,0,0,1,1.0
24884,1.0,2.0,2.0,2.0,2.0,2.0,1.0,2.0,2.0,30,...,0,0,0,0,1,0,0,0,1,1.0
23425,2.0,2.0,2.0,1.0,2.0,2.0,1.0,1.0,2.0,108,...,0,0,0,0,0,0,0,0,1,1.0
17216,1.0,2.0,2.0,2.0,2.0,1.0,2.0,2.0,2.0,76,...,0,0,0,0,1,0,0,0,0,0.0
289,2.0,1.0,1.0,1.0,1.0,2.0,2.0,2.0,2.0,189,...,0,0,0,0,1,0,0,0,0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9080,1.0,2.0,1.0,2.0,2.0,1.0,2.0,2.0,2.0,110,...,0,0,0,0,1,0,0,0,0,0.0
23275,2.0,1.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,165,...,0,0,0,1,0,0,0,0,0,1.0
20624,2.0,2.0,2.0,2.0,1.0,2.0,2.0,2.0,2.0,114,...,0,0,0,1,0,0,0,0,0,0.0
2250,2.0,2.0,1.0,2.0,2.0,1.0,2.0,1.0,2.0,161,...,0,0,0,0,1,0,0,0,0,0.0


In [35]:
dtrain = xgb.DMatrix(X_train, label=y_train, missing=-999.0)
dtest = xgb.DMatrix(X_test, label=y_test, missing=-999.0)

param = {'max_depth': 2, 'eta': 1, 'objective': 'binary:logistic'}
param['nthread'] = 4
param['eval_metric'] = 'auc'

num_round = 10000
bst = xgb.train(param, dtrain, num_round)

In [47]:
classify = lambda x: 0 if x < 0.5 else 1
preds = [classify(y) for y in bst.predict(dtest)]
metrics.roc_auc_score(y_true=y_test, y_score=preds)

0.902495254145502

In [38]:
dtest

<xgboost.core.DMatrix at 0x7f717ab44208>