In [29]:
# Import the modules
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.metrics import balanced_accuracy_score, confusion_matrix, classification_report
from sklearn.datasets import make_multilabel_classification
from sklearn.multioutput import MultiOutputClassifier
from sklearn.linear_model import LogisticRegression
from collections import Counter
from sklearn.datasets import make_classification
from imblearn.over_sampling import SMOTE

In [30]:
# Read the CSV file from the Resources folder into a Pandas DataFrame
df = pd.read_csv(
    Path('Resources/CleanedDiabetes.csv')
)

# Review the DataFrame
df.head()

Unnamed: 0,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,Fruits,Veggies,...,some_college_technical_school,some_hs,"$10,000_<$15,000","$15,000_<$20,000","$20,000_<$25,000","$25,000_<$35,000","$35,000_<$50,000","$50,000_<$75,000","<$10,000",">$75,000"
0,1.0,1.0,1.0,40.0,1.0,0.0,0.0,0.0,0.0,1.0,...,0,0,0,1,0,0,0,0,0,0
1,0.0,0.0,0.0,25.0,1.0,0.0,0.0,1.0,0.0,0.0,...,0,0,0,0,0,0,0,0,1,0
2,1.0,1.0,1.0,28.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0,0,0,0,0,0,0,0,0,1
3,1.0,0.0,1.0,27.0,0.0,0.0,0.0,1.0,1.0,1.0,...,0,1,0,0,0,0,1,0,0,0
4,1.0,1.0,1.0,24.0,0.0,0.0,0.0,1.0,1.0,1.0,...,1,0,0,0,1,0,0,0,0,0


In [31]:
df.columns

Index(['HighBP', 'HighChol', 'CholCheck', 'BMI', 'Smoker', 'Stroke',
       'HeartDiseaseorAttack', 'PhysActivity', 'Fruits', 'Veggies',
       'HvyAlcoholConsump', 'AnyHealthcare', 'NoDocbcCost', 'GenHlth',
       'MentHlth', 'PhysHlth', 'DiffWalk', 'Sex', 'diabetes', 'no_diabetes',
       'prediabetes', '18_24', '25_29', '30_34', '35_39', '40_44', '45_49',
       '50_54', '55_59', '60_64', '65_69', '70_74', '75_79', '80+',
       'college_graduate', 'elementary', 'hs_graduate', 'no_school_pre_k',
       'some_college_technical_school', 'some_hs', '$10,000_<$15,000',
       '$15,000_<$20,000', '$20,000_<$25,000', '$25,000_<$35,000',
       '$35,000_<$50,000', '$50,000_<$75,000', '<$10,000', '>$75,000'],
      dtype='object')

In [32]:

# Separate the y variable, the labels
y=df[["no_diabetes", "prediabetes","diabetes"]].values.reshape(-1,3)

# Separate the X variable, the features
X=df.drop(columns=["diabetes", "no_diabetes", "prediabetes"],axis=1)

In [33]:
# Import the train_test_learn module
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Split the data using train_test_split
X_train, X_test, y_train, y_test=train_test_split(X,y, random_state=48, stratify=y)

In [34]:
# Create a StandardScaler instances
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [35]:
# Create Log. Regression model

X, y = make_multilabel_classification(n_classes=3, random_state=0)
model = LogisticRegression(solver='liblinear', random_state=1)


In [36]:
# fit model to training data
wrapper = MultiOutputClassifier(model).fit(X_train_scaled, y_train)



In [37]:
# print accuracy score
wrapper.score(X_test_scaled, y_test)

0.8418006937874487

In [38]:
# Predict
predictions = wrapper.predict(X_test_scaled)

In [39]:
# Create and save the testing classification report
testing_report = classification_report(y_test, predictions)

# Print the testing classification report
print(testing_report)

              precision    recall  f1-score   support

           0       0.86      0.97      0.92     53426
           1       0.00      0.00      0.00      1158
           2       0.55      0.15      0.24      8836

   micro avg       0.85      0.84      0.85     63420
   macro avg       0.47      0.38      0.39     63420
weighted avg       0.80      0.84      0.81     63420
 samples avg       0.84      0.84      0.84     63420



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [40]:
X, y = make_classification(random_state=10)
print('Original dataset shape %s' % Counter(y))


Original dataset shape Counter({0: 51, 1: 49})


In [41]:
# initate model
sm_model = SMOTE(random_state=0)

# Fit the original training data
X_resampled, y_resampled = sm_model.fit_resample(X_test_scaled, y_test)


In [42]:
# # define model
resampled_model = LogisticRegression(solver='liblinear', random_state=1)

# fit resampled data to logistic regression model
resampled_wrapper = MultiOutputClassifier(resampled_model).fit(X_resampled, y_resampled)

In [43]:
# print accuracy score
resampled_wrapper.score(X_test_scaled, y_test)

0.49515925575528225

In [44]:
# predict
resampled_predictions = resampled_wrapper.predict(X_test_scaled)

In [45]:
# Create and save the testing classification report
resampled_testing_report = classification_report(y_test, resampled_predictions)

# Print the testing classification report
print(resampled_testing_report)

              precision    recall  f1-score   support

           0       0.97      0.53      0.68     53426
           1       0.05      0.07      0.06      1158
           2       0.45      0.39      0.41      8836

   micro avg       0.83      0.50      0.62     63420
   macro avg       0.49      0.33      0.39     63420
weighted avg       0.88      0.50      0.63     63420
 samples avg       0.50      0.50      0.50     63420



  _warn_prf(average, modifier, msg_start, len(result))
