In [2]:
# ----IMPORTING LIBRARIES-----------
# Importing necessary libraries
from pandas import read_csv, get_dummies, Series
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from imblearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.linear_model import SGDClassifier
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)
from sklearn import metrics

# ------DATA EXPLORATION-----------
# Importing dataset and examining it
dataset = pd.read_csv("diabetes_012.csv")
pd.set_option('display.max_columns', None)  # to make sure you can see all the columns in output window
print(dataset.head())
print(dataset.shape)
print(dataset.info())
print(dataset.describe())

#--------DATA PREPARATION----------
# Convert target variable to binary
dataset['Diabetes_012'] = dataset['Diabetes_012'].map({0: 0, 1: 0, 2: 1})

# Dividing dataset into label and feature sets
X = dataset.drop(['AnyHealthcare', 'CholCheck', 'NoDocbcCost', 'Stroke', 'HvyAlcoholConsump', 'Diabetes_012'], axis=1)  # Features drop
Y = dataset['Diabetes_012']  # Labels
print(type(X))
print(type(Y))
print(X.shape)
print(X.info())
print(Y.shape)

# Scaling features
X_scaled = StandardScaler().fit_transform(X)

# Splitting dataset into training and testing sets
X_train, X_test, Y_train, Y_test = train_test_split(X_scaled, Y, test_size=0.7, random_state=120)

# Balancing the training dataset using SMOTE
X_train, Y_train = SMOTE(random_state=120).fit_resample(X_train, Y_train)

print(Y.shape)
print(X.shape)

   Diabetes_012  HighBP  HighChol  CholCheck   BMI  Smoker  Stroke  \
0           0.0     1.0       1.0        1.0  40.0     1.0     0.0   
1           0.0     0.0       0.0        0.0  25.0     1.0     0.0   
2           0.0     1.0       1.0        1.0  28.0     0.0     0.0   
3           0.0     1.0       0.0        1.0  27.0     0.0     0.0   
4           0.0     1.0       1.0        1.0  24.0     0.0     0.0   

   HeartDiseaseorAttack  PhysActivity  Fruits  Veggies  HvyAlcoholConsump  \
0                   0.0           0.0     0.0      1.0                0.0   
1                   0.0           1.0     0.0      0.0                0.0   
2                   0.0           0.0     1.0      0.0                0.0   
3                   0.0           1.0     1.0      1.0                0.0   
4                   0.0           1.0     1.0      1.0                0.0   

   AnyHealthcare  NoDocbcCost  GenHlth  MentHlth  PhysHlth  DiffWalk  Sex  \
0            1.0          0.0      5.0 

In [5]:
# ---------------------------------  ALGORITHM 1: AdaBoost for Best Features -----------------------------------------
# Implementing AdaBoost
# Tuning the AdaBoost parameter 'n_estimators' and implementing cross-validation using Grid Search
model = Pipeline([
        ('balancing', SMOTE(random_state = 101)),
        ('classification', AdaBoostClassifier(random_state=1))
    ])
grid_param = {'classification__n_estimators': [2,3,4,5,10,20,30,40,50,100]}

gd_sr = GridSearchCV(estimator=model, param_grid=grid_param, scoring='recall', cv=5)

"""
In the above GridSearchCV(), scoring parameter should be set as follows:
scoring = 'accuracy' when you want to maximize prediction accuracy
scoring = 'recall' when you want to minimize false negatives
scoring = 'precision' when you want to minimize false positives
scoring = 'f1' when you want to balance false positives and false negatives (place equal emphasis on minimizing both)
"""

gd_sr.fit(X_scaled, Y)

best_parameters = gd_sr.best_params_
print(best_parameters)

best_result = gd_sr.best_score_ # Mean cross-validated score of the best_estimator
print(best_result)

featimp = pd.Series(gd_sr.best_estimator_.named_steps["classification"].feature_importances_, index=list(X)).sort_values(ascending=False) # Getting feature importances list for the best model
print(featimp)


{'classification__n_estimators': 5}
0.7554746134436009
GenHlth                 0.4
HighBP                  0.2
BMI                     0.2
Age                     0.2
HighChol                0.0
Smoker                  0.0
HeartDiseaseorAttack    0.0
PhysActivity            0.0
Fruits                  0.0
Veggies                 0.0
MentHlth                0.0
PhysHlth                0.0
DiffWalk                0.0
Sex                     0.0
Education               0.0
Income                  0.0
dtype: float64


In [6]:
# ------------------------  ALGORITHM 1: Adaboost with Precision -----------------------------------------
# Implementing AdaBoost
# Tuning the AdaBoost parameter 'n_estimators' and implementing cross-validation using Grid Search
model = Pipeline([
        ('balancing', SMOTE(random_state = 101)),
        ('classification', AdaBoostClassifier(random_state=1))
    ])
grid_param = {'classification__n_estimators': [2,3,4,5,10,20,30,40,50,100]}

gd_sr = GridSearchCV(estimator=model, param_grid=grid_param, scoring='precision', cv=5)

"""
In the above GridSearchCV(), scoring parameter should be set as follows:
scoring = 'accuracy' when you want to maximize prediction accuracy
scoring = 'recall' when you want to minimize false negatives
scoring = 'precision' when you want to minimize false positives
scoring = 'f1' when you want to balance false positives and false negatives (place equal emphasis on minimizing both)
"""

gd_sr.fit(X_scaled, Y)

best_parameters = gd_sr.best_params_
print(best_parameters)

best_result = gd_sr.best_score_ # Mean cross-validated score of the best_estimator
print(best_result)

{'classification__n_estimators': 100}
0.37505550510438307


In [2]:
# ------------------------  ALGORITHM 2: SUPPORT VECTOR MACHINE(SVM)  -----------------------------------------
# Support Vector Machine
SV_classifier1 = SVC(kernel='linear', random_state=1)
SV_classifier1.fit(X_train, Y_train)  # Training
Y_pred1 = SV_classifier1.predict(X_test)  # Testing

# Accuracy and confusion matrix
accuracy = metrics.accuracy_score(Y_test, Y_pred1)  # Calculating accuracy
print("Accuracy: ", accuracy)  # Is this a good metric??
con_matrix = metrics.confusion_matrix(Y_test, Y_pred1)
print(con_matrix)
recall = metrics.recall_score(Y_test, Y_pred1)
print(recall)
precision = metrics.precision_score(Y_test, Y_pred1)
print(precision)
f1 = metrics.f1_score(Y_test, Y_pred1)
print(f1)

Accuracy:  0.7155753029688696
[[107777  45059]
 [  5448  19292]]
0.7797898140662894
0.2997933210051126
0.433085272361967


In [2]:
# ------------------------  ALGORITHM 3: Logistic Regression(LR with Recall)  -----------------------------------------

import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from imblearn.over_sampling import SMOTE  # imblearn library can be installed using pip install imblearn
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from imblearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.linear_model import SGDClassifier
import warnings
# Implementing Logistic Regression
# Tuning eta0, max_iter, alpha, and l1_ratio parameters and implementing cross-validation using Grid Search
model = Pipeline([
        ('balancing', SMOTE(random_state = 101)),   # Synthetic Minority Oversampling Technique
        ('classification', SGDClassifier(loss = 'log', penalty = 'elasticnet', random_state = 7))
    ])
grid_param = {'classification__eta0': [.001,.01,.1,1,10,100], 'classification__max_iter' : [100,500,1000], 'classification__alpha': [.001, .01,.1, 1,10,100], 'classification__l1_ratio': [0,0.3,0.5,0.7,1]}

gd_sr = GridSearchCV(estimator=model, param_grid=grid_param, scoring='recall', cv=11)

"""
In the above GridSearchCV(), scoring parameter should be set as follows:
scoring = 'accuracy' when you want to maximize prediction accuracy
scoring = 'recall' when you want to minimize false negatives
scoring = 'precision' when you want to minimize false positives
scoring = 'f1' when you want to balance false positives and false negatives (place equal emphasis on minimizing both)
"""

gd_sr.fit(X_scaled, Y)

best_parameters = gd_sr.best_params_
print("Best parameters: ", best_parameters)

best_result = gd_sr.best_score_ # Mean cross-validated score of the best_estimator
print("Best result: ", best_result)

Best parameters:  {'classification__alpha': 0.01, 'classification__eta0': 0.001, 'classification__l1_ratio': 0.5, 'classification__max_iter': 100}
Best result:  0.7612169782271178


In [3]:
# ------------------------  ALGORITHM 3: Logistic Regression(LR with Precision)  -----------------------------------------

import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from imblearn.over_sampling import SMOTE  # imblearn library can be installed using pip install imblearn
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from imblearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.linear_model import SGDClassifier
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

# Implementing Logistic Regression
# Tuning eta0, max_iter, alpha, and l1_ratio parameters and implementing cross-validation using Grid Search
model = Pipeline([
        ('balancing', SMOTE(random_state = 101)),   # Synthetic Minority Oversampling Technique
        ('classification', SGDClassifier(loss = 'log', penalty = 'elasticnet', random_state = 7))
    ])
grid_param = {'classification__eta0': [.001,.01,.1,1,10,100], 'classification__max_iter' : [100,500,1000], 'classification__alpha': [.001, .01,.1, 1,10,100], 'classification__l1_ratio': [0,0.3,0.5,0.7,1]}

gd_sr = GridSearchCV(estimator=model, param_grid=grid_param, scoring='precision', cv=11)

"""
In the above GridSearchCV(), scoring parameter should be set as follows:
scoring = 'accuracy' when you want to maximize prediction accuracy
scoring = 'recall' when you want to minimize false negatives
scoring = 'precision' when you want to minimize false positives
scoring = 'f1' when you want to balance false positives and false negatives (place equal emphasis on minimizing both)
"""

gd_sr.fit(X_scaled, Y)

best_parameters = gd_sr.best_params_
print("Best parameters: ", best_parameters)

best_result = gd_sr.best_score_ # Mean cross-validated score of the best_estimator
print("Best result: ", best_result)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_pr

Best parameters:  {'classification__alpha': 10, 'classification__eta0': 0.001, 'classification__l1_ratio': 0, 'classification__max_iter': 100}
Best result:  0.6337383390065124


In [None]:
# ------------------------  ALGORITHM 4: Decision Tree Classifier with Precision  ---------------------------------------------

#implementing Decision Tree classifier
#2- GridSearchCV (max_depth identification for decision tree)
from sklearn import tree
from sklearn.model_selection import GridSearchCV
DT_classifier2 = tree.DecisionTreeClassifier(criterion = 'entropy') # building
depth = {'max_depth': [2,3,4,5,10,15,20,25,30,35]}
grid_search1 = GridSearchCV(estimator=DT_classifier2, param_grid=depth, scoring='precision', cv=5)# building
grid_search1.fit(X_scaled, Y )# training, testing , evaluation, ranking.
best_depth = grid_search1.best_params_
print(best_depth)
best_result = grid_search1.best_score_
print(best_result)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


{'max_depth': 5}
0.5605655634646018


In [None]:
# ------------------------  ALGORITHM 4: Decision Tree Classifier with max_depth  -------------------------

# 1- Regular method ( method#1)
from sklearn import tree
DT_classifier =tree.DecisionTreeClassifier(criterion = 'entropy', max_depth = 5) #builders
DT_classifier.fit(X_train, Y_train) # training
y_pred = DT_classifier.predict(X_test) # testing

# Evaluation
# Acuracy and confusion matrix
from sklearn import metrics
Accuracy=metrics.accuracy_score(Y_test, y_pred)  # calculating accuaracy
print("Accuracy: ", Accuracy) # Is this a good metric??
con_matrix = metrics.confusion_matrix(Y_test, y_pred)
print (con_matrix)
recall = metrics.recall_score(Y_test, y_pred)
print ("recall :",recall)
percision=metrics.precision_score(Y_test, y_pred)
print("percesion :",percision)

Accuracy:  0.6866299499932423
[[102943  49893]
 [  5754  18986]]
recall : 0.7674211802748585
percesion : 0.2756427938849286


In [None]:
# ------------------------  ALGORITHM 4: Decision Tree Classifier with max_depth -------------------------
DT_classifier3 = tree.DecisionTreeClassifier(criterion = 'entropy', max_depth = 5)# classifier building
DT_classifier3.fit(X_train, Y_train) # training
y_pred2 = DT_classifier3.predict(X_test) # testing
imp_features = Series(DT_classifier3.feature_importances_, index=list(X)).sort_values(ascending=False) # what are the important features?
print(imp_features)

GenHlth                 0.689366
HighBP                  0.191982
Age                     0.071353
BMI                     0.047298
HighChol                0.000000
Smoker                  0.000000
HeartDiseaseorAttack    0.000000
PhysActivity            0.000000
Fruits                  0.000000
Veggies                 0.000000
MentHlth                0.000000
PhysHlth                0.000000
DiffWalk                0.000000
Sex                     0.000000
Education               0.000000
Income                  0.000000
dtype: float64


In [None]:
# ------------------------  ALGORITHM 4: Decision Tree Classifier with max_depth and important features  -------------------

#GridSearchCV with best features
X2 = dataset[['GenHlth','HighBP', 'Age', 'BMI']] # Features#
X_scaled = StandardScaler().fit_transform(X2) # scaling
DT_classifier4 = tree.DecisionTreeClassifier(criterion = 'entropy') # building classfier
depth = {'max_depth': [2,3,4,5,10,15]}
grid_search2 = GridSearchCV(estimator=DT_classifier4, param_grid=depth, scoring='recall', cv=6)
grid_search2.fit(X_scaled, Y) # Training, testing , evaluation, ranking.
best_depth = grid_search2.best_params_
print(best_depth)
best_result = grid_search2.best_score_
print(best_result)

{'max_depth': 4}
0.1758897753635489


In [39]:
# ------------------------  ALGORITHM 5:Random Forest Classifier ---------------------------------
# Implementing Random Forest Classifier
# Tuning the random forest parameter 'n_estimators' and implementing cross-validation using Grid Search

model = Pipeline([('balancing', SMOTE(random_state = 101)),('classification', RandomForestClassifier(criterion='entropy', max_features='auto', random_state=1) )])
grid_param = {'classification__n_estimators': [50,100,150,200,250,300]}

gd_sr = GridSearchCV(estimator=model, param_grid=grid_param, scoring='recall', cv=5)

gd_sr.fit(X_scaled, Y)

best_parameters = gd_sr.best_params_
print(best_parameters)

  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(


{'classification__n_estimators': 200}


In [45]:
# ------------------------  ALGORITHM 4: Decision Tree Classifier with best number of trees -------------------

# Random Forest Classifier with best number of tree (method 1)
from sklearn.ensemble import RandomForestClassifier
from pandas import read_csv, get_dummies,Series
RF_classifier3 = RandomForestClassifier(n_estimators=250, criterion='entropy', max_features='auto', random_state=1)# building model
RF_classifier3.fit(X_train,Y_train) #training
Y_pred3=RF_classifier3.predict(X_test)# testing
imp_features = Series(RF_classifier3.feature_importances_, index=list(X)).sort_values(ascending=False)
print(imp_features)

  warn(


BMI                     0.185269
Age                     0.167981
GenHlth                 0.161065
Income                  0.105674
PhysHlth                0.076305
HighBP                  0.064217
Education               0.053998
MentHlth                0.043978
HighChol                0.028922
Smoker                  0.019267
Fruits                  0.018893
Sex                     0.018070
PhysActivity            0.015959
Veggies                 0.014521
DiffWalk                0.014076
HeartDiseaseorAttack    0.011804
dtype: float64


In [46]:
# ------------------------  ALGORITHM 4: Random Forest Classifier with important features  -------------------
# Using important features only (method #2)
X2 = dataset[['BMI', 'GenHlth', 'Age', 'Income',]]
X_scaled = StandardScaler().fit_transform(X2) # scaling
# X_train, X_test, Y_train, Y_test = train_test_split( X_scaled, Y, test_size = 0.3, random_state = 100)# splitting
# X_train,Y_train =SMOTE (random_state = 100).fit_resample(X_train,Y_train)# balancing

RF_classifier4 = RandomForestClassifier(criterion='entropy', max_features='auto', random_state=1) # building classifier
no_trees = {'n_estimators': [200, 250, 300, 350, 400, 450]}
grid_search2 = GridSearchCV(estimator=RF_classifier4, param_grid=no_trees, scoring='recall', cv=5)
grid_search2.fit(X_scaled, Y) # training, testing , evaluation, ranking.

best_parameters = grid_search2.best_params_
print(best_parameters)
best_result = grid_search2.best_score_
print(best_result)

  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(


{'n_estimators': 350}
0.14072273245957256


In [47]:
# Evaluation
#from sklearn.ensemble import RandomForestClassifier
from sklearn import ensemble
#RF_classifier1 =ensemble.RandomForestClassifier()
RF_classifier1 =ensemble.RandomForestClassifier(n_estimators=350, criterion='entropy', max_features='auto', random_state=1)  # building model
RF_classifier1.fit(X_train,Y_train)#training
Y_pred1=RF_classifier1.predict(X_test)# testing
# imp_features = Series(RF_classifier1.feature_importances_, index=list(X)).sort_values(ascending=False)
# print(imp_features)
# Acuracy and confusion matrix

from sklearn import metrics
Accuracy=metrics.accuracy_score(Y_test, Y_pred1) # calculating accuaracy
print("Accuracy: ", Accuracy) # Is this a good metric??
con_matrix = metrics.confusion_matrix(Y_test, Y_pred1)
print (con_matrix)
recall = metrics.recall_score(Y_test, Y_pred1)
print (recall)
percision=metrics.precision_score(Y_test, Y_pred1)
print(percision)

  warn(


Accuracy:  0.8506780195521917
[[61931  3376]
 [ 7988  2809]]
0.26016486060942856
0.45416329830234436
