In [12]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [13]:
X_train = pd.read_csv("data/X_train.csv")
X_test = pd.read_csv("data/X_test.csv")
y_train = pd.read_csv("data/y_train.csv").values.ravel()
y_test = pd.read_csv("data/y_test.csv").values.ravel()

## SMOTE

In [14]:
# implement SMOTE to oversample the minority class
from imblearn.over_sampling import SMOTE
os = SMOTE(random_state=0)
os_X, os_y = os.fit_resample(X_train, y_train)
print(os_X.shape, os_y.shape)

(236866, 768) (236866,)


In [15]:
# split the oversample data (keeping code for now in case we need it later)
# from sklearn.model_selection import train_test_split
# os_X_train, os_X_test, os_y_train, os_y_test = train_test_split(os_X, os_y, test_size=0.30, random_state=0)
# print(os_X_train.shape, os_y_train.shape)

In [16]:
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score

# Custom function to print the metrics of the model
def print_metrics(y_test, y_pred):
    print(f'Accuracy Score: {accuracy_score(y_test, y_pred)}')
    print(confusion_matrix(y_test, y_pred))
    print(f'Precision: {precision_score(y_test, y_pred)}')
    print(f'Recall: {recall_score(y_test, y_pred)}')
    print(f'F1 Score: {f1_score(y_test, y_pred)}')

## Logistic Regression

In [None]:
# # check p-values of features
# import statsmodels.api as sm
# logit_model = sm.Logit(y_train, X_train)
# result = logit_model.fit()
# print(result.summary2())

The feature with p-values less than 0.05 are the significant ones. Consider getting rid of insignificant features and then fit it into the logistic regression model

In [None]:
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression(random_state=0, max_iter=200)
logreg.fit(X_train, y_train)
y_pred = logreg.predict(X_test)
print_metrics(y_test, y_pred)

Accuracy Score: 0.9130020973967605
[[50469   226]
 [ 4710  1332]]
Precision: 0.8549422336328626
Recall: 0.22045680238331677
F1 Score: 0.35052631578947363


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [None]:
# from sklearn.model_selection import cross_val_score
# cross_val_score(logreg, X_train, y_train, cv=10, scoring="accuracy")

In [None]:
# from sklearn.model_selection import RepeatedStratifiedKFold

# # define model
# model = LogisticRegression(class_weight='balanced')

# # define cross-validation
# cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)

# # evaluate model
# scores = cross_val_score(model, X_train, y_train, scoring='roc_auc', cv=cv, n_jobs=-1)

# # summarize performance
# print('Mean AUROC: %.3f' % np.mean(scores))

#### Logistic Regression using oversampled data from SMOTE

In [None]:
logreg_os = LogisticRegression(random_state=0, max_iter=200)
logreg_os.fit(os_X, os_y)
y_pred = logreg_os.predict(X_test)
print_metrics(y_test, y_pred)

Accuracy Score: 0.8094365229039251
[[41749  8946]
 [ 1866  4176]]
Precision: 0.31824417009602196
Recall: 0.6911618669314796
F1 Score: 0.43581715716969316


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [None]:
# # Use variance inflation factor to identify any significant multi-collinearity
# from statsmodels.stats.outliers_influence import variance_inflation_factor
# X_constant = sm.add_constant(X_train, prepend=False)
# vif = pd.DataFrame()
# vif["Features"] = X_constant.columns
# vif["VIF"] = [variance_inflation_factor(X_constant.values, i) for i in range(X_constant.shape[1])]
# vif

In [None]:
# sns.set(font_scale=1.25)
# chis_train = pd.concat([X_train, y_train], axis=1)
# chis_train = chis_train.rename(columns = {'T2D': 'Type 2 Diabetes', 'AB24': 'Taking Insulin', 'AB23_P1': 'Age Told To Have Diabetes', 'AB112': 'Diabetes Care Plan', 
# 'AB25': 'Diabetic Pills', 'AB63': 'Eye Exam Dilated Pupils', 'AB114_P1': 'Confidence to Control Diabetes', 'AB28_P1': 'Doctor Checked Feet for Sores', 'AB109': 'Visited ER', 
# 'AB113': 'Copy of Diabetes Care Plan', 'AB111': "Hospital Overnight for Diabetes"})

# corrMatrix = chis_train.corr()
# plt.subplots(figsize=(12, 9))
# g = sns.heatmap(corrMatrix, annot=True, cmap="Accent", vmin=-1, vmax=1)
# plt.title('Correlation Heatmap', size = 25)
# plt.show()



In [None]:
# from statsmodels.genmod.generalized_linear_model import GLM
# from statsmodels.genmod import families

# # Setup logistic regression model (using GLM method so that we can retrieve residuals)
# logit_model = GLM(y_train, X_constant, family=families.Binomial())
# logit_results = logit_model.fit()
# print(logit_results.summary())

In [None]:
# import scipy
# # Generate residual series plot
# fig = plt.figure(figsize=(15,9))
# ax = fig.add_subplot(111, title="Residual Series Plot",
#                     xlabel="Index Number", ylabel="Deviance Residuals")

# # ax.plot(X.index.tolist(), stats.zscore(logit_results.resid_pearson))
# ax.plot(X_constant.index.tolist(), scipy.stats.zscore(logit_results.resid_deviance))
# plt.axhline(y=0, ls="--", color='red');