In [76]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [77]:
combinedchis = pd.read_csv("data/combined_chis.csv")

In [78]:
# fill in null values with the mode of the feature
for column in combinedchis:
    combinedchis.fillna(combinedchis[column].mode()[0], inplace = True)

## Separate input and output variables

In [79]:
X = combinedchis.iloc[:,:-1]
X = X.drop(["PUF1Y_ID"], axis=1)
X

Unnamed: 0,AA5C,AB1,AB100,AB112,AB113,AB115,AB117,AB118,AB119,AB127,...,AH141,SREDUC,AJ153V2_13,AJ154BV2_8,AJ154BV2_9,INS64_S,AK20_P1,AJ174_8,AJ174_9,AJ194_18
0,-1.0,1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,2.0,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
1,-1.0,5.0,-1.0,-1.0,-1.0,2.0,2.0,2.0,-1.0,1.0,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
2,-1.0,3.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,2.0,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
3,-1.0,4.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,1.0,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
4,-1.0,2.0,1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,1.0,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
189118,-1.0,2.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,...,-1.0,3.0,-1.0,-1.0,-1.0,5.0,4.0,-1.0,-1.0,-1.0
189119,-1.0,1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,...,-1.0,4.0,-1.0,2.0,2.0,-1.0,-1.0,-1.0,-1.0,-1.0
189120,-1.0,2.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,...,-1.0,4.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
189121,-1.0,3.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,...,-1.0,3.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0


In [80]:
y = combinedchis.iloc[:, -1]
y

0         0.0
1         0.0
2         0.0
3         0.0
4         0.0
         ... 
189118    0.0
189119    0.0
189120    0.0
189121    0.0
189122    0.0
Name: T2D, Length: 189123, dtype: float64

## Split training and test data

In [81]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=0)

## Feature Selection

In [82]:
chis = pd.concat([X_train, y_train], axis=1)
corrmat = chis.corr()
corrmat

Unnamed: 0,AA5C,AB1,AB100,AB112,AB113,AB115,AB117,AB118,AB119,AB127,...,SREDUC,AJ153V2_13,AJ154BV2_8,AJ154BV2_9,INS64_S,AK20_P1,AJ174_8,AJ174_9,AJ194_18,T2D
AA5C,1.000000,0.030440,0.004185,0.015379,0.020015,0.011976,0.012351,0.013186,0.009225,0.005668,...,-0.020096,-0.009716,-0.005838,-0.006049,-0.013058,-0.012165,-0.004116,-0.004086,-0.003591,0.014852
AB1,0.030440,1.000000,0.040617,0.269661,0.181505,0.171186,0.172809,0.211381,0.143880,0.007826,...,-0.089722,-0.004856,-0.047034,-0.046870,-0.079004,-0.071600,-0.045366,-0.045361,-0.030516,0.252359
AB100,0.004185,0.040617,1.000000,0.032730,0.077103,0.081612,0.081181,0.049280,0.073633,0.627929,...,-0.082091,-0.028064,-0.023448,-0.023428,-0.048361,-0.044049,-0.025934,-0.025920,-0.040246,0.026275
AB112,0.015379,0.269661,0.032730,1.000000,0.561184,0.107994,0.107085,0.137257,0.081455,-0.010648,...,-0.019247,0.002583,-0.030644,-0.030585,-0.026811,-0.026349,-0.014709,-0.014679,-0.007774,0.909806
AB113,0.020015,0.181505,0.077103,0.561184,1.000000,0.180160,0.178825,0.130744,0.166755,0.045015,...,-0.081792,-0.027962,-0.023363,-0.023342,-0.048185,-0.043889,-0.025839,-0.025825,-0.040100,0.586741
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
AK20_P1,-0.012165,-0.071600,-0.044049,-0.026349,-0.043889,-0.047322,-0.047377,-0.056634,-0.040769,-0.064475,...,0.534850,0.190959,0.261226,0.260571,0.795412,1.000000,0.214490,0.214906,0.248578,-0.026669
AJ174_8,-0.004116,-0.045366,-0.025934,-0.014709,-0.025839,-0.027861,-0.027893,-0.033343,-0.024003,-0.037960,...,0.312198,0.088366,-0.010607,-0.010824,0.318187,0.214490,1.000000,0.998827,0.117731,-0.015226
AJ174_9,-0.004086,-0.045361,-0.025920,-0.014679,-0.025825,-0.027846,-0.027878,-0.033325,-0.023990,-0.037939,...,0.312025,0.088072,-0.010599,-0.010816,0.317940,0.214906,0.998827,1.000000,0.117299,-0.015217
AJ194_18,-0.003591,-0.030516,-0.040246,-0.007774,-0.040100,-0.043237,-0.043287,-0.051745,-0.037249,-0.058909,...,0.479179,0.194698,0.114574,0.114266,0.270937,0.248578,0.117731,0.117299,1.000000,-0.006887


In [83]:
corrmat["T2D"].nlargest(11)

T2D         1.000000
AB51_P1     0.974814
AB24        0.921461
AB23_P1     0.915996
AB112       0.909806
AB25        0.891608
AB63        0.877318
AB114_P1    0.775773
AB28_P1     0.696493
AB111       0.602414
AB109       0.601955
Name: T2D, dtype: float64

Top 10 features with the highest correlation to the response variable <br>
AB51_P1 = TYPE 1 OR TYPE 2 DIABETES (PUF 1 YR RECODE) <br>
AB111 = ADMITTED TO HOSPITAL OVERNIGHT OR LONGER FOR DIABETES PAST 12 MOS <br>
AB109 = VISITED ER FOR DIABETES IN PAST 12 MOS <br>
AB24 = CURRENTLY TAKING INSULIN <br>
AB23_P1 = AGE FIRST TOLD HAVE DIABETES (PUF 1 YR RECODE) <br>
AB112 = MEDICAL PROVIDERS DEVELOP DIABETES CARE PLAN <br>
AB25 = CURRENTLY TAKING DIABETIC PILLS TO LOWER BLOOD SUGAR <br>
AB114_P1 = CONFIDENCE TO CONTROL AND MANAGE DIABETES (PUF 1 YR RECODE) <br>
DIAMED = TAKING INSULIN OR PILLS <br>
AB27_P = # OF TIMES DOC CHECKED HEMOGLOBIN AIC LAST YR <br>

In [84]:
features = corrmat["T2D"].nlargest(11)[1:].index
features

Index(['AB51_P1', 'AB24', 'AB23_P1', 'AB112', 'AB25', 'AB63', 'AB114_P1',
       'AB28_P1', 'AB111', 'AB109'],
      dtype='object')

In [85]:
# checks if there are any NaN values in any of the features
X_train[features].isnull().sum()

AB51_P1     0
AB24        0
AB23_P1     0
AB112       0
AB25        0
AB63        0
AB114_P1    0
AB28_P1     0
AB111       0
AB109       0
dtype: int64

In [86]:
# see if number of features are the same for training and testing 
X_train = X_train[features]
X_test = X_test[features]
print(X_train.shape)
print(X_test.shape)

(132386, 10)
(56737, 10)


In [87]:
# implement SMOTE to oversample the minority class
from imblearn.over_sampling import SMOTE
os = SMOTE(random_state=0)
os_X, os_y = os.fit_resample(X_train, y_train)
print(os_X.shape, os_y.shape)

(236866, 10) (236866,)


In [88]:
# split the oversample data (keeping code for now in case we need it later)
# from sklearn.model_selection import train_test_split
# os_X_train, os_X_test, os_y_train, os_y_test = train_test_split(os_X, os_y, test_size=0.30, random_state=0)
# print(os_X_train.shape, os_y_train.shape)

In [89]:
from sklearn.metrics import confusion_matrix, accuracy_score, recall_score

# Custom function to print the metrics of the model
def print_metrics(y_test, y_pred):
    print(f'Accuracy Score: {accuracy_score(y_test, y_pred)}')
    print(confusion_matrix(y_test, y_pred))
    print(f'Recall: {recall_score(y_test, y_pred)}')

## KNN

In [90]:
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier()
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)
print_metrics(y_test, y_pred)

Accuracy Score: 0.996897967816416
[[50527   168]
 [    8  6034]]
Recall: 0.9986759351208209


#### Model using oversampled data from SMOTE

In [91]:
knn_os = KNeighborsClassifier()
knn_os.fit(os_X, os_y)
y_pred = knn_os.predict(X_test)
print_metrics(y_test, y_pred)

Accuracy Score: 0.9966512152563582
[[50508   187]
 [    3  6039]]
Recall: 0.9995034756703078


## Stochastic Gradient Descent

In [92]:
from sklearn.linear_model import SGDClassifier
sgd_clf = SGDClassifier(random_state=0)
sgd_clf.fit(X_train, y_train)
y_pred = sgd_clf.predict(X_test)
print_metrics(y_test, y_pred)

Accuracy Score: 0.999806122988526
[[50690     5]
 [    6  6036]]
Recall: 0.9990069513406157


#### Model using oversampled data from SMOTE

In [93]:
sgd_os = SGDClassifier(random_state=0)
sgd_os.fit(os_X, os_y)
y_pred_os = sgd_os.predict(X_test)
print_metrics(y_test, y_pred_os)

Accuracy Score: 0.9999823748171387
[[50694     1]
 [    0  6042]]
Recall: 1.0


## Logistic Regression

In [94]:
# check p-values of features
import statsmodels.api as sm
logit_model = sm.Logit(y_train, X_train)
result = logit_model.fit()
print(result.summary2())

         Current function value: 0.001915
         Iterations: 35




                          Results: Logit
Model:                Logit             Pseudo R-squared:  0.994   
Dependent Variable:   T2D               AIC:               527.0891
Date:                 2022-05-08 19:29  BIC:               625.0239
No. Observations:     132386            Log-Likelihood:    -253.54 
Df Model:             9                 LL-Null:           -44585. 
Df Residuals:         132376            LLR p-value:       0.0000  
Converged:            0.0000            Scale:             1.0000  
No. Iterations:       35.0000                                      
-------------------------------------------------------------------
          Coef.    Std.Err.     z     P>|z|     [0.025     0.975]  
-------------------------------------------------------------------
AB51_P1   90.4794 21353.9507   0.0042 0.9966 -41762.4949 41943.4538
AB24     -21.0783   220.2378  -0.0957 0.9238   -452.7364   410.5798
AB23_P1   -2.0430     0.1252 -16.3240 0.0000     -2.2883    -1.7977
AB112  

The feature with p-values less than 0.05 are the significant ones. Consider getting rid of insignificant features and then fit it into the logistic regression model

In [95]:
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression(random_state=0)
logreg.fit(X_train, y_train)
y_pred = logreg.predict(X_test)
print_metrics(y_test, y_pred)

Accuracy Score: 1.0
[[50695     0]
 [    0  6042]]
Recall: 1.0


#### Model using oversampled data from SMOTE

In [96]:
logreg_os = LogisticRegression(random_state=0)
logreg_os.fit(os_X, os_y)
y_pred = logreg_os.predict(X_test)
print_metrics(y_test, y_pred)

Accuracy Score: 1.0
[[50695     0]
 [    0  6042]]
Recall: 1.0
