# Part III: Machine Learning model training

In [1]:
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import confusion_matrix, classification_report

In [3]:
# load the cleaned dataset, then separate the label from the predictors
df = pd.read_csv("../Dataset/diabetes_data_clean.csv")

y = df['class']                 # dependent variable (target)
X = df.drop(columns='class')    # independent variables: every remaining column

In [5]:
# split data into train (80%) and test (20%); stratify keeps the class ratio
# of y the same in both parts
# random_state pins the shuffle so every model below is scored on the same
# rows each time the notebook is re-run from a fresh kernel
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2,
                                                   stratify = y,
                                                   random_state = 42)

In [7]:
# begin model training
# start with DummyClassifier to establish a baseline that ignores the features
# strategy is spelled out because the library default differs between
# scikit-learn versions ("stratified" in older releases, "prior" in newer ones);
# random_state makes the random guesses repeatable across runs
dummy = DummyClassifier(strategy="stratified", random_state=42)
dummy.fit(X_train, y_train)
dummy_pred = dummy.predict(X_test)

In [8]:
# assess the DummyClassifier baseline
# rows are the true classes, columns are the predicted classes:
# [TN, FP,
#  FN, TP]
confusion_matrix(y_true=y_test, y_pred=dummy_pred)

array([[14, 26],
       [18, 46]], dtype=int64)

In [9]:
# per-class precision, recall and F1 of the baseline on the test set
print(classification_report(y_true=y_test, y_pred=dummy_pred))

              precision    recall  f1-score   support

           0       0.44      0.35      0.39        40
           1       0.64      0.72      0.68        64

    accuracy                           0.58       104
   macro avg       0.54      0.53      0.53       104
weighted avg       0.56      0.58      0.57       104



In [11]:
# first real model: LogisticRegression
# max_iter is raised to 10000 iterations for the solver
logr = LogisticRegression(max_iter=10000).fit(X_train, y_train)  # fit() returns the estimator
logr_pred = logr.predict(X_test)

In [13]:
# confusion matrix of the logistic regression on the test set
confusion_matrix(y_true=y_test, y_pred=logr_pred)

array([[38,  2],
       [ 3, 61]], dtype=int64)

In [14]:
# per-class precision, recall and F1 of the logistic regression
print(classification_report(y_true=y_test, y_pred=logr_pred))

              precision    recall  f1-score   support

           0       0.93      0.95      0.94        40
           1       0.97      0.95      0.96        64

    accuracy                           0.95       104
   macro avg       0.95      0.95      0.95       104
weighted avg       0.95      0.95      0.95       104



In [15]:
# DecisionTree
# random_state fixes the tie-breaking between equally good splits, so the
# fitted tree and its predictions are the same on every run
tree = DecisionTreeClassifier(random_state=42)
tree.fit(X_train, y_train)
tree_pred = tree.predict(X_test)

In [16]:
# confusion matrix of the decision tree on the test set
confusion_matrix(y_true=y_test, y_pred=tree_pred)

array([[38,  2],
       [ 2, 62]], dtype=int64)

In [17]:
# per-class precision, recall and F1 of the decision tree
print(classification_report(y_true=y_test, y_pred=tree_pred))

              precision    recall  f1-score   support

           0       0.95      0.95      0.95        40
           1       0.97      0.97      0.97        64

    accuracy                           0.96       104
   macro avg       0.96      0.96      0.96       104
weighted avg       0.96      0.96      0.96       104



In [18]:
# RandomForest
# random_state fixes the bootstrap samples and per-split feature subsets, so
# the predictions and the feature importances inspected later are repeatable
forest = RandomForestClassifier(random_state=42)
forest.fit(X_train, y_train)
forest_pred = forest.predict(X_test)

In [19]:
# confusion matrix of the random forest on the test set
confusion_matrix(y_true=y_test, y_pred=forest_pred)

array([[39,  1],
       [ 0, 64]], dtype=int64)

In [20]:
# per-class precision, recall and F1 of the random forest
print(classification_report(y_true=y_test, y_pred=forest_pred))

              precision    recall  f1-score   support

           0       1.00      0.97      0.99        40
           1       0.98      1.00      0.99        64

    accuracy                           0.99       104
   macro avg       0.99      0.99      0.99       104
weighted avg       0.99      0.99      0.99       104



In [21]:
# importance scores of the fitted forest: one value per column of X
forest.feature_importances_

array([0.09265281, 0.09042782, 0.17768817, 0.2554904 , 0.05969295,
       0.02219977, 0.02512731, 0.02194217, 0.0280501 , 0.03397979,
       0.0362069 , 0.03726192, 0.03926335, 0.02958957, 0.03416952,
       0.01625744])

In [22]:
# feature names, to pair with the importance scores
X.columns

Index(['age', 'ismale', 'polyuria', 'polydipsia', 'sudden weight loss',
       'weakness', 'polyphagia', 'genital thrush', 'visual blurring',
       'itching', 'irritability', 'delayed healing', 'partial paresis',
       'muscle stiffness', 'alopecia', 'obesity'],
      dtype='object')

In [25]:
# pair each feature name with its importance score, most important first
(
    pd.DataFrame({'feature': X.columns,
                  'importance': forest.feature_importances_})
    .sort_values(by='importance', ascending=False)
)

Unnamed: 0,feature,importance
3,polydipsia,0.25549
2,polyuria,0.177688
0,age,0.092653
1,ismale,0.090428
4,sudden weight loss,0.059693
12,partial paresis,0.039263
11,delayed healing,0.037262
10,irritability,0.036207
14,alopecia,0.03417
9,itching,0.03398


# Summary
1. Trained a baseline model (`DummyClassifier`)
2. Trained three different models: logistic regression, decision tree, and random forest
3. Identified the most important features in the best-performing model (random forest)