# Machine Learning

### Logistic Regression

In [1]:
import pandas as pd

In [2]:
# Load the data
file_path = 'Resources/diabetes_data.csv'
df = pd.read_csv(file_path)
df.head()

Unnamed: 0,Age,Sex,HighChol,CholCheck,BMI,Smoker,HeartDiseaseorAttack,PhysActivity,Fruits,Veggies,HvyAlcoholConsump,GenHlth,MentHlth,PhysHlth,DiffWalk,Stroke,HighBP,Diabetes
0,4.0,1.0,0.0,1.0,26.0,0.0,0.0,1.0,0.0,1.0,0.0,3.0,5.0,30.0,0.0,0.0,1.0,0.0
1,12.0,1.0,1.0,1.0,26.0,1.0,0.0,0.0,1.0,0.0,0.0,3.0,0.0,0.0,0.0,1.0,1.0,0.0
2,13.0,1.0,0.0,1.0,26.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,10.0,0.0,0.0,0.0,0.0
3,11.0,1.0,1.0,1.0,28.0,1.0,0.0,1.0,1.0,1.0,0.0,3.0,0.0,3.0,0.0,0.0,1.0,0.0
4,8.0,0.0,0.0,1.0,29.0,1.0,0.0,1.0,1.0,1.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0


In [3]:
# Scaling the data
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

X = df.drop('Diabetes', axis=1)
y = df['Diabetes']
X_train, X_test, y_train, y_test = train_test_split(X, y)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [4]:
# extracting important features
# Would leaving in PhysHealth, and HvyAlcoholConsump help our model train?
# Because if we leave it in there it would tell the model something along the lines of
# "if it has this feature, they probably don't."
unimportant_features_df = df[['Fruits','MentHlth','PhysActivity','Veggies','PhysHlth','HvyAlcoholConsump']]
important_features_df = df[['GenHlth', 'BMI', 'Age', 'HighBP', 'HighChol', 'CholCheck', 'Sex', 'HeartDiseaseorAttack', 'DiffWalk', 'Stroke']]

In [5]:
from sklearn.linear_model import LogisticRegression


# Train the Logistic Regression model using the orignal dataframe
model = LogisticRegression(solver='lbfgs', random_state=1)
model.fit(X_train_scaled, y_train)

LogisticRegression(random_state=1)

In [6]:
# Calculated the balanced accuracy score of the orignal dataframe
from sklearn.metrics import balanced_accuracy_score
y_pred = model.predict(X_test_scaled)
balanced_accuracy_score(y_test, y_pred)

0.7422862376595556

In [7]:
# Display the confusion matrix of the orignal dataframe
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test, y_pred)

array([[6384, 2437],
       [2117, 6735]], dtype=int64)

In [8]:
# Print the imbalanced classification report of the orignal dataframe
from imblearn.metrics import classification_report_imbalanced
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

        0.0       0.75      0.72      0.76      0.74      0.74      0.55      8821
        1.0       0.73      0.76      0.72      0.75      0.74      0.55      8852

avg / total       0.74      0.74      0.74      0.74      0.74      0.55     17673



### Logistic Regression with Top 6 Features

In [9]:
# extracting top 6 important features
top6_features_df = df[['GenHlth', 'BMI', 'Age', 'HighBP', 'HighChol', 'CholCheck']]

#Scaling the Data with the important features used above
X_important = top6_features_df
y_important = df['Diabetes']
X_important_train, X_important_test, y_important_train, y_important_test = train_test_split(X_important, y_important)

scaler = StandardScaler()
X_important_train_scaled = scaler.fit_transform(X_important_train)
X_important_test_scaled = scaler.transform(X_important_test)

In [10]:
# Train the Logistic Regression model using the important dataframe
model = LogisticRegression(solver='lbfgs', random_state=1)
model.fit(X_important_train_scaled, y_important_train)

LogisticRegression(random_state=1)

In [11]:
# Calculated the balanced accuracy score of the important dataframe
y_important_pred = model.predict(X_important_test_scaled)
balanced_accuracy_score(y_important_test, y_important_pred)

0.7473620928736128

In [12]:
# Display the confusion matrix of the important dataframe
confusion_matrix(y_important_test, y_important_pred)

array([[6345, 2476],
       [1988, 6864]], dtype=int64)

In [13]:
# Print the imbalanced classification report of the important dataframe
print(classification_report_imbalanced(y_important_test, y_important_pred))

                   pre       rec       spe        f1       geo       iba       sup

        0.0       0.76      0.72      0.78      0.74      0.75      0.55      8821
        1.0       0.73      0.78      0.72      0.75      0.75      0.56      8852

avg / total       0.75      0.75      0.75      0.75      0.75      0.56     17673



Extracting important features barely effected results of the accuracy of the logicsitc regression model.

The actual machine learning model will likely have a bigger impact on predictions than the columns included or not.

In [14]:
# extracting top 6 important features
top3_features_df = df[['GenHlth', 'BMI', 'Age']]

#Scaling the Data with the important features used above
X3 = top3_features_df
y3 = df['Diabetes']
X3_train, X3_test, y3_train, y3_test = train_test_split(X3, y3)

scaler = StandardScaler()
X3_train_scaled = scaler.fit_transform(X3_train)
X3_test_scaled = scaler.transform(X3_test)

In [15]:
# Resample the training data with the BalancedRandomForestClassifier
from imblearn.ensemble import BalancedRandomForestClassifier
rf_model = BalancedRandomForestClassifier(n_estimators=100, random_state=1) 
rf_model = rf_model.fit(X3_train_scaled, y3_train)
y3_pred = rf_model.predict(X3_test_scaled)

In [16]:
# Calculated the balanced accuracy score
balanced_accuracy_score(y3_test, y3_pred)

0.7215264745097574

In [17]:
# Display the confusion matrix
confusion_matrix(y3_test, y3_pred)

array([[5989, 2905],
       [2022, 6757]], dtype=int64)

In [18]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y3_test, y3_pred))

                   pre       rec       spe        f1       geo       iba       sup

        0.0       0.75      0.67      0.77      0.71      0.72      0.51      8894
        1.0       0.70      0.77      0.67      0.73      0.72      0.52      8779

avg / total       0.72      0.72      0.72      0.72      0.72      0.52     17673



In [19]:
# List the features sorted in descending order by feature importance
importances = rf_model.feature_importances_
sorted(zip(rf_model.feature_importances_, X.columns), reverse=True)

[(0.41123804744119596, 'Age'),
 (0.32871441735201024, 'Sex'),
 (0.2600475352067938, 'HighChol')]

### Easy Ensemble Classifier

In [20]:
# Train the EasyEnsembleClassifier
from imblearn.ensemble import EasyEnsembleClassifier
e_model = EasyEnsembleClassifier(n_estimators=100, random_state=1) 
e_model = e_model.fit(X3_train, y3_train)
y3_pred = e_model.predict(X3_test)

In [21]:
# Calculated the balanced accuracy score
balanced_accuracy_score(y3_test, y3_pred)

0.726506032382559

In [22]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y3_test, y3_pred))

                   pre       rec       spe        f1       geo       iba       sup

        0.0       0.75      0.68      0.77      0.71      0.72      0.52      8894
        1.0       0.70      0.77      0.68      0.74      0.72      0.53      8779

avg / total       0.73      0.73      0.73      0.73      0.72      0.53     17673



### XGBoost

In [23]:
pip install xgboost

Collecting xgboost
  Downloading xgboost-1.7.3-py3-none-win_amd64.whl (89.1 MB)
Installing collected packages: xgboost
Successfully installed xgboost-1.7.3
Note: you may need to restart the kernel to use updated packages.


In [24]:
from numpy import loadtxt
from xgboost import XGBClassifier

In [25]:
# split data into X and y
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
X = df.drop('Diabetes', axis=1)
y = df['Diabetes']
X_train, X_test, y_train, y_test = train_test_split(X, y)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [26]:
# fit model no training data
model = XGBClassifier()
model.fit(X_train, y_train)

XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=None, gpu_id=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=None, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=None, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              n_estimators=100, n_jobs=None, num_parallel_tree=None,
              predictor=None, random_state=None, ...)

In [27]:
print(model)

XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=None, gpu_id=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=None, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=None, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              n_estimators=100, n_jobs=None, num_parallel_tree=None,
              predictor=None, random_state=None, ...)


In [28]:
y_pred = model.predict(X_test)
predictions = [round(value) for value in y_pred]

In [29]:
# evaluate predictions
accuracy = accuracy_score(y_test, predictions)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

Accuracy: 75.24%


In [30]:
# Scaling the data
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

X = df.drop('Diabetes', axis=1)
y = df['Diabetes']
X_train, X_test, y_train, y_test = train_test_split(X, y)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [31]:
import numpy as np
from collections import Counter

In [32]:
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced

In [33]:
y.value_counts()

0.0    35346
1.0    35346
Name: Diabetes, dtype: int64

In [34]:
X.describe()

Unnamed: 0,Age,Sex,HighChol,CholCheck,BMI,Smoker,HeartDiseaseorAttack,PhysActivity,Fruits,Veggies,HvyAlcoholConsump,GenHlth,MentHlth,PhysHlth,DiffWalk,Stroke,HighBP
count,70692.0,70692.0,70692.0,70692.0,70692.0,70692.0,70692.0,70692.0,70692.0,70692.0,70692.0,70692.0,70692.0,70692.0,70692.0,70692.0,70692.0
mean,8.584055,0.456997,0.525703,0.975259,29.856985,0.475273,0.14781,0.703036,0.611795,0.788774,0.042721,2.837082,3.752037,5.810417,0.25273,0.062171,0.563458
std,2.852153,0.498151,0.499342,0.155336,7.113954,0.499392,0.354914,0.456924,0.487345,0.408181,0.202228,1.113565,8.155627,10.062261,0.434581,0.241468,0.49596
min,1.0,0.0,0.0,0.0,12.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
25%,7.0,0.0,0.0,1.0,25.0,0.0,0.0,0.0,0.0,1.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0
50%,9.0,0.0,1.0,1.0,29.0,0.0,0.0,1.0,1.0,1.0,0.0,3.0,0.0,0.0,0.0,0.0,1.0
75%,11.0,1.0,1.0,1.0,33.0,1.0,0.0,1.0,1.0,1.0,0.0,4.0,2.0,6.0,1.0,0.0,1.0
max,13.0,1.0,1.0,1.0,98.0,1.0,1.0,1.0,1.0,1.0,1.0,5.0,30.0,30.0,1.0,1.0,1.0


# Balanced Random Forest Classifier

In [35]:
# Resample the training data with the BalancedRandomForestClassifier
from imblearn.ensemble import BalancedRandomForestClassifier
brf = BalancedRandomForestClassifier(n_estimators=100, random_state=1)
brf.fit(X_train, y_train)

BalancedRandomForestClassifier(random_state=1)

In [36]:
# Calculated the balanced accuracy score
y_pred = brf.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

0.7245017185176048

In [37]:
confusion_matrix(y_test, y_pred)

array([[6082, 2742],
       [2126, 6723]], dtype=int64)

In [38]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

        0.0       0.74      0.69      0.76      0.71      0.72      0.52      8824
        1.0       0.71      0.76      0.69      0.73      0.72      0.53      8849

avg / total       0.73      0.72      0.72      0.72      0.72      0.52     17673



In [39]:
# List the features sorted in descending order by feature importance
importances = brf.feature_importances_
cols = X.columns

feature_importances_df = pd.DataFrame({'feature':cols, 'importance': importances})
feature_importances_df.head()

Unnamed: 0,feature,importance
0,Age,0.152108
1,Sex,0.030463
2,HighChol,0.038169
3,CholCheck,0.00711
4,BMI,0.214667


In [40]:
feature_importances_df.sort_values('importance', ascending=False)

Unnamed: 0,feature,importance
4,BMI,0.214667
0,Age,0.152108
11,GenHlth,0.118222
16,HighBP,0.090755
13,PhysHlth,0.087782
12,MentHlth,0.067894
2,HighChol,0.038169
8,Fruits,0.033646
5,Smoker,0.03344
1,Sex,0.030463


# Easy Ensemble Adaboost

In [41]:
from imblearn.ensemble import EasyEnsembleClassifier
eec = EasyEnsembleClassifier(n_estimators=100, random_state=1)
eec.fit(X_train, y_train)

EasyEnsembleClassifier(n_estimators=100, random_state=1)

In [42]:
# Calculated the balanced accuracy score
y_pred = eec.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

0.7506515070467572

In [43]:
# Display the confusion matrix
confusion_matrix(y_test, y_pred)

array([[6364, 2460],
       [1946, 6903]], dtype=int64)

In [44]:
confusion_matrix = pd.crosstab(y_test, y_pred, rownames=['Actual'], colnames=['Prediction'])
plt.figure(figsize=(5,5))
sb.heatmap(confusion_matrix, annot=True, vmin=0, vmax=int(len(y_test)/2))
plt.title('EE Adaboost Matrix')
plt.show()

NameError: name 'plt' is not defined

In [None]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred))