## Part 3: Machine Learning model training

In [1]:
#import libraries
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import confusion_matrix, classification_report

In [4]:
#prepare independent and dependent variables
df = pd.read_csv("diabetes_data_clean.csv")

X = df.drop('class', axis = 1)
y = df['class']

In [7]:
#split data into train and test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, stratify = y)

In [8]:
#begin model training
#start with DummyClassifier to establish baseline
dummy = DummyClassifier()
dummy.fit(X_train, y_train)
dummy_pred = dummy.predict(X_test)



In [9]:
#assess DummyClassifier model
confusion_matrix(y_test, dummy_pred)

array([[16, 24],
       [25, 39]], dtype=int64)

In [10]:
#use classification report
print(classification_report(y_test, dummy_pred))

              precision    recall  f1-score   support

           0       0.39      0.40      0.40        40
           1       0.62      0.61      0.61        64

    accuracy                           0.53       104
   macro avg       0.50      0.50      0.50       104
weighted avg       0.53      0.53      0.53       104



## Logistic Regression

In [11]:
#start with logistic regression
logr = LogisticRegression(max_iter = 10000)
logr.fit(X_train, y_train)
logr_pred = logr.predict(X_test)

In [13]:
confusion_matrix(y_test, logr_pred)

array([[33,  7],
       [ 6, 58]], dtype=int64)

In [14]:
print(classification_report(y_test, logr_pred))

              precision    recall  f1-score   support

           0       0.85      0.82      0.84        40
           1       0.89      0.91      0.90        64

    accuracy                           0.88       104
   macro avg       0.87      0.87      0.87       104
weighted avg       0.87      0.88      0.87       104



## Decision Tree

In [15]:
tree = DecisionTreeClassifier()
tree.fit(X_train, y_train)
tree_pred = tree.predict(X_test)

In [16]:
confusion_matrix(y_test, tree_pred)

array([[38,  2],
       [ 2, 62]], dtype=int64)

In [17]:
print(classification_report(y_test, tree_pred))

              precision    recall  f1-score   support

           0       0.95      0.95      0.95        40
           1       0.97      0.97      0.97        64

    accuracy                           0.96       104
   macro avg       0.96      0.96      0.96       104
weighted avg       0.96      0.96      0.96       104



In [25]:
tree.feature_importances_

array([0.0802463 , 0.16627467, 0.11284418, 0.41586064, 0.02851563,
       0.        , 0.03385417, 0.02920511, 0.00585193, 0.        ,
       0.03390256, 0.01866066, 0.        , 0.00677083, 0.05005853,
       0.0179548 ])

In [26]:
X.columns

Index(['age', 'ismale', 'polyuria', 'polydipsia', 'sudden weight loss',
       'weakness', 'polyphagia', 'genital thrush', 'visual blurring',
       'itching', 'irritability', 'delayed healing', 'partial paresis',
       'muscle stiffness', 'alopecia', 'obesity'],
      dtype='object')

In [27]:
pd.DataFrame({'feature': X.columns, 'importance': tree.feature_importances_}).sort_values('importance', ascending = False)

Unnamed: 0,feature,importance
3,polydipsia,0.415861
1,ismale,0.166275
2,polyuria,0.112844
0,age,0.080246
14,alopecia,0.050059
10,irritability,0.033903
6,polyphagia,0.033854
7,genital thrush,0.029205
4,sudden weight loss,0.028516
11,delayed healing,0.018661


## Random Forest

In [18]:
forest = RandomForestClassifier()
forest.fit(X_train, y_train)
forest_pred = forest.predict(X_test)

In [19]:
confusion_matrix(y_test, forest_pred)

array([[35,  5],
       [ 3, 61]], dtype=int64)

In [20]:
print(classification_report(y_test, forest_pred))

              precision    recall  f1-score   support

           0       0.92      0.88      0.90        40
           1       0.92      0.95      0.94        64

    accuracy                           0.92       104
   macro avg       0.92      0.91      0.92       104
weighted avg       0.92      0.92      0.92       104



In [21]:
forest.feature_importances_

array([0.09013302, 0.13670031, 0.20871814, 0.18780523, 0.04566563,
       0.02059535, 0.02923477, 0.02167836, 0.01617542, 0.02962482,
       0.04204476, 0.03488803, 0.05878119, 0.01869921, 0.04131939,
       0.01793637])

In [22]:
X.columns

Index(['age', 'ismale', 'polyuria', 'polydipsia', 'sudden weight loss',
       'weakness', 'polyphagia', 'genital thrush', 'visual blurring',
       'itching', 'irritability', 'delayed healing', 'partial paresis',
       'muscle stiffness', 'alopecia', 'obesity'],
      dtype='object')

In [24]:
pd.DataFrame({'feature': X.columns, 'importance': forest.feature_importances_}).sort_values('importance', ascending = False)

Unnamed: 0,feature,importance
2,polyuria,0.208718
3,polydipsia,0.187805
1,ismale,0.1367
0,age,0.090133
12,partial paresis,0.058781
4,sudden weight loss,0.045666
10,irritability,0.042045
14,alopecia,0.041319
11,delayed healing,0.034888
9,itching,0.029625


Summary:
1. Trained a baseline model
2. Trained three different models -logistic regression, decision tree, random forest
3. Identified the importance features in the best performing model