In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
# Read the combined CHIS table from its CSV file into a DataFrame.
chis_csv_path = "data/combined_chis.csv"
combinedchis = pd.read_csv(chis_csv_path)

## Separate input and output variables

In [4]:
# Predictors: every column except the last one (the response), with the
# PUF1Y_ID column removed (presumably a record identifier -- TODO confirm).
X = combinedchis.iloc[:, :-1].drop(columns=["PUF1Y_ID"])
X

Unnamed: 0,AA5C,AB1,AB100,AB112,AB113,AB115,AB117,AB118,AB119,AB127,...,AH141,SREDUC,AJ153V2_13,AJ154BV2_8,AJ154BV2_9,INS64_S,AK20_P1,AJ174_8,AJ174_9,AJ194_18
0,-1.0,1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,2.0,...,,,,,,,,,,
1,-1.0,5.0,-1.0,-1.0,-1.0,2.0,2.0,2.0,-1.0,1.0,...,,,,,,,,,,
2,-1.0,3.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,2.0,...,,,,,,,,,,
3,-1.0,4.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,1.0,...,,,,,,,,,,
4,-1.0,2.0,1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,1.0,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
189118,-1.0,2.0,,-1.0,,,,,,,...,-1.0,3.0,-1.0,-1.0,-1.0,5.0,4.0,-1.0,-1.0,-1.0
189119,-1.0,1.0,,-1.0,,,,,,,...,-1.0,4.0,-1.0,2.0,2.0,-1.0,-1.0,-1.0,-1.0,-1.0
189120,-1.0,2.0,,-1.0,,,,,,,...,-1.0,4.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
189121,-1.0,3.0,,-1.0,,,,,,,...,-1.0,3.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0


In [5]:
# Response variable: the final column of the table, selected by position.
target_pos = -1
y = combinedchis.iloc[:, target_pos]
y

0         0.0
1         0.0
2         0.0
3         0.0
4         0.0
         ... 
189118    0.0
189119    0.0
189120    0.0
189121    0.0
189122    0.0
Name: T2D, Length: 189123, dtype: float64

## Split training and test data

In [6]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows as the test set; the fixed seed makes the
# shuffle, and therefore the split, reproducible across runs.
holdout_fraction = 0.30
split_seed = 0
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=holdout_fraction,
    random_state=split_seed,
)

In [7]:
# Re-attach the training response to the training predictors (column-wise,
# aligned on the row index) so correlations with the response can be computed.
train_parts = (X_train, y_train)
combinedchis_train = pd.concat(train_parts, axis="columns")
combinedchis_train

Unnamed: 0,AA5C,AB1,AB100,AB112,AB113,AB115,AB117,AB118,AB119,AB127,...,SREDUC,AJ153V2_13,AJ154BV2_8,AJ154BV2_9,INS64_S,AK20_P1,AJ174_8,AJ174_9,AJ194_18,T2D
82968,-1.0,3.0,,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,,...,,,,,,,,,,0.0
157554,-1.0,3.0,,-1.0,,,,,,,...,,,,,,,,,,0.0
92170,-1.0,2.0,,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,,...,,,,,,,,,,0.0
162413,-1.0,1.0,,-1.0,,,,,,,...,,,,,,,,,,0.0
4890,-1.0,2.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,2.0,...,,,,,,,,,,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
152315,-1.0,1.0,,-1.0,,,,,,,...,,,,,,,,,,0.0
176963,-1.0,3.0,,-1.0,,,,,,,...,4.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,0.0
117952,-1.0,4.0,,-1.0,,,,-1.0,,,...,,,,,,,,,,0.0
173685,2.0,2.0,,-1.0,,,,,,,...,4.0,-1.0,-1.0,-1.0,5.0,4.0,2.0,2.0,-1.0,0.0


## Feature Selection

In [8]:
# Pairwise Pearson correlation between all columns of the training frame
# (predictors and response); computed on the training rows only.
corrmat = combinedchis_train.corr(method="pearson")
corrmat

Unnamed: 0,AA5C,AB1,AB100,AB112,AB113,AB115,AB117,AB118,AB119,AB127,...,SREDUC,AJ153V2_13,AJ154BV2_8,AJ154BV2_9,INS64_S,AK20_P1,AJ174_8,AJ174_9,AJ194_18,T2D
AA5C,1.000000,0.030440,-0.009180,0.015379,0.027197,0.013202,0.013828,0.010740,0.009461,-0.051835,...,-0.010306,-0.008609,-0.000861,-0.001448,-0.005344,-0.005503,0.005462,0.005534,0.017513,0.014852
AB1,0.030440,1.000000,0.092736,0.269661,0.226821,0.212191,0.214386,0.221132,0.176672,-0.213191,...,-0.237184,0.073445,-0.087104,-0.086598,-0.138717,-0.119883,-0.074235,-0.074257,0.024776,0.252359
AB100,-0.009180,0.092736,1.000000,0.143765,0.129436,0.134269,0.133775,0.131283,0.123835,-0.115432,...,,,,,,,,,,0.137436
AB112,0.015379,0.269661,0.143765,1.000000,0.760535,0.144834,0.143600,0.153994,0.108208,-0.088466,...,-0.113882,0.022400,-0.088557,-0.088368,-0.075304,-0.072934,-0.035753,-0.035662,-0.006446,0.909806
AB113,0.027197,0.226821,0.129436,0.760535,1.000000,0.138682,0.137230,0.134643,0.130687,-0.089152,...,,,,,,,,,,0.809775
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
AK20_P1,-0.005503,-0.119883,,-0.072934,,,,,,,...,0.112183,0.018042,0.139364,0.138708,0.707926,1.000000,0.063559,0.064180,-0.006100,-0.075178
AJ174_8,0.005462,-0.074235,,-0.035753,,,,,,,...,0.044488,-0.017363,-0.106754,-0.106900,0.179277,0.063559,1.000000,0.998702,-0.037543,-0.038045
AJ174_9,0.005534,-0.074257,,-0.035662,,,,,,,...,0.044450,-0.017626,-0.106687,-0.106834,0.179074,0.064180,0.998702,1.000000,-0.037962,-0.038018
AJ194_18,0.017513,0.024776,,-0.006446,,,,,,,...,0.043685,0.040472,-0.023965,-0.024190,-0.009742,-0.006100,-0.037543,-0.037962,1.000000,-0.005326


In [18]:
# The 11 largest (signed) correlations with T2D; the list includes T2D's
# correlation with itself.
corrmat.loc[:, "T2D"].nlargest(n=11)

T2D         1.000000
AB51_P1     0.974814
AB111       0.931536
AB109       0.930741
AB24        0.921461
AB23_P1     0.915996
AB112       0.909806
AB25        0.891608
AB114_P1    0.887768
DIAMED      0.885319
AB27_P      0.883609
Name: T2D, dtype: float64

Top 10 features most positively correlated with the response variable (T2D) <br>
AB51_P1 = TYPE 1 OR TYPE 2 DIABETES (PUF 1 YR RECODE) <br>
AB111 = ADMITTED TO HOSPITAL OVERNIGHT OR LONGER FOR DIABETES PAST 12 MOS <br>
AB109 = VISITED ER FOR DIABETES IN PAST 12 MOS <br>
AB24 = CURRENTLY TAKING INSULIN <br>
AB23_P1 = AGE FIRST TOLD HAVE DIABETES (PUF 1 YR RECODE) <br>
AB112 = MEDICAL PROVIDERS DEVELOP DIABETES CARE PLAN <br>
AB25 = CURRENTLY TAKING DIABETIC PILLS TO LOWER BLOOD SUGAR <br>
AB114_P1 = CONFIDENCE TO CONTROL AND MANAGE DIABETES (PUF 1 YR RECODE) <br>
DIAMED = TAKING INSULIN OR PILLS <br>
AB27_P = # OF TIMES DOC CHECKED HEMOGLOBIN A1C LAST YR <br>

In [17]:
# Pick the 10 predictors most positively correlated with the response.
# The T2D entry is removed by label before ranking instead of being sliced
# off with [1:]: nlargest keeps ties in their original order and T2D is the
# last column, so any predictor whose correlation is exactly 1.0 would be
# ranked ahead of T2D, and a positional slice would then drop that predictor
# and keep T2D itself as a "feature".
# NOTE(review): per the descriptions in the markdown above, the top-ranked
# columns are diabetes follow-up items (insulin use, age first told, ...)
# and look like target leakage -- confirm before modelling with them.
features = corrmat["T2D"].drop("T2D").nlargest(10).index
features

Index(['AB51_P1', 'AB111', 'AB109', 'AB24', 'AB23_P1', 'AB112', 'AB25',
       'AB114_P1', 'DIAMED', 'AB27_P'],
      dtype='object')