In [26]:
# import libraries
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import confusion_matrix , classification_report

In [32]:
# prepare our independent and dependent variables
df = pd.read_csv('diabetes_data_clean.csv')

x = df.drop('class', axis=1)
y = df['class']

0      1
1      1
2      1
3      1
4      1
      ..
515    1
516    1
517    1
518    0
519    0
Name: class, Length: 520, dtype: int64

In [33]:
# split data into train and test
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, stratify = y)

In [36]:
# begin our model trining
# start with DummyClassifier to establish the baseline
dummy = DummyClassifier()
dummy.fit(x_train, y_train)
dummy_pred = dummy.predict(x_test)


In [37]:
# asses DummyClassifier model by
# 1.confussion_matrix
confusion_matrix(y_test, dummy_pred)

array([[ 0, 40],
       [ 0, 64]], dtype=int64)

In [38]:
# 2.use a classification_report
print(classification_report(y_test, dummy_pred))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00        40
           1       0.62      1.00      0.76        64

    accuracy                           0.62       104
   macro avg       0.31      0.50      0.38       104
weighted avg       0.38      0.62      0.47       104



In [41]:
# start with LogisticsRegression
logr = LogisticRegression(max_iter=10000)
logr.fit(x_train, y_train)
logr_pred = logr.predict(x_test)

In [42]:
confusion_matrix(y_test, logr_pred)

array([[36,  4],
       [ 5, 59]], dtype=int64)

In [44]:
print(classification_report(y_test, logr_pred))

              precision    recall  f1-score   support

           0       0.88      0.90      0.89        40
           1       0.94      0.92      0.93        64

    accuracy                           0.91       104
   macro avg       0.91      0.91      0.91       104
weighted avg       0.91      0.91      0.91       104



In [49]:
# try DecisionTree
tree = DecisionTreeClassifier()
tree.fit(x_train, y_train)
tree_pred = tree.predict(x_test)


In [51]:
confusion_matrix(y_test, tree_pred)

array([[40,  0],
       [ 1, 63]], dtype=int64)

In [52]:
print(classification_report(y_test, tree_pred))

              precision    recall  f1-score   support

           0       0.98      1.00      0.99        40
           1       1.00      0.98      0.99        64

    accuracy                           0.99       104
   macro avg       0.99      0.99      0.99       104
weighted avg       0.99      0.99      0.99       104



In [56]:
# try RandomForest
forest = RandomForestClassifier()
forest.fit(x_train, y_train)
forest_pred = forest.predict(x_test)

In [57]:
confusion_matrix(y_test, forest_pred)

array([[40,  0],
       [ 1, 63]], dtype=int64)

In [58]:
print(classification_report(y_test, forest_pred))

              precision    recall  f1-score   support

           0       0.98      1.00      0.99        40
           1       1.00      0.98      0.99        64

    accuracy                           0.99       104
   macro avg       0.99      0.99      0.99       104
weighted avg       0.99      0.99      0.99       104



Among the four models, RandomForestClassifier and DecisionTreeClassifier are the best models.

In [71]:
#features that went into training the forest model
forest.feature_importances_

array([0.09866038, 0.09555059, 0.20579772, 0.19953996, 0.03891051,
       0.0225456 , 0.02519118, 0.01960805, 0.03113374, 0.03081944,
       0.04693333, 0.0392385 , 0.06534541, 0.02844314, 0.03378658,
       0.01849588])

In [62]:
x.columns

Index(['age', 'ismale', 'polyuria', 'polydipsia', 'sudden weight loss',
       'weakness', 'polyphagia', 'genital thrush', 'visual blurring',
       'itching', 'irritability', 'delayed healing', 'partial paresis',
       'muscle stiffness', 'alopecia', 'obesity'],
      dtype='object')

In [70]:
pd.DataFrame({'feature': x.columns,
            'importance': forest.feature_importances_}).sort_values('importance', ascending=False)

Unnamed: 0,feature,importance
2,polyuria,0.205798
3,polydipsia,0.19954
0,age,0.09866
1,ismale,0.095551
12,partial paresis,0.065345
10,irritability,0.046933
11,delayed healing,0.039239
4,sudden weight loss,0.038911
14,alopecia,0.033787
8,visual blurring,0.031134


In [74]:
# features and importance of the tree model.
tree.feature_importances_

array([0.08649881, 0.11034941, 0.41698542, 0.11747776, 0.04417969,
       0.        , 0.00677083, 0.01701389, 0.02539063, 0.01625   ,
       0.02990787, 0.03740885, 0.00865178, 0.03920433, 0.03644289,
       0.00746783])

In [75]:
x.columns

Index(['age', 'ismale', 'polyuria', 'polydipsia', 'sudden weight loss',
       'weakness', 'polyphagia', 'genital thrush', 'visual blurring',
       'itching', 'irritability', 'delayed healing', 'partial paresis',
       'muscle stiffness', 'alopecia', 'obesity'],
      dtype='object')

In [76]:
pd.DataFrame({'features': x.columns,
             'importance': tree.feature_importances_}).sort_values('importance', ascending = False)

Unnamed: 0,features,importance
2,polyuria,0.416985
3,polydipsia,0.117478
1,ismale,0.110349
0,age,0.086499
4,sudden weight loss,0.04418
13,muscle stiffness,0.039204
11,delayed healing,0.037409
14,alopecia,0.036443
10,irritability,0.029908
8,visual blurring,0.025391


### Summary
1. Trained a baseline model.
2. Trained three different models - logistic regresssion, decision tree and random forest.
3. Identified the important features in the best perfoming models.