In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix,classification_report

In [2]:
df = pd.read_csv("filtered_diabetes_data.csv")
df

Unnamed: 0,age,ismale,polyuria,polydipsia,sudden weight loss,weakness,polyphagia,genital thrush,visual blurring,itching,irritability,delayed healing,partial paresis,muscle stiffness,alopecia,obesity,class
0,40,1,0,1,0,1,0,0,0,1,0,1,0,1,1,1,1
1,58,1,0,0,0,1,0,0,1,0,0,0,1,0,1,0,1
2,41,1,1,0,0,1,1,0,0,1,0,1,0,1,1,0,1
3,45,1,0,0,1,1,1,1,0,1,0,1,0,0,0,0,1
4,60,1,1,1,1,1,1,0,1,1,1,1,1,1,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
515,39,0,1,1,1,0,1,0,0,1,0,1,1,0,0,0,1
516,48,0,1,1,1,1,1,0,0,1,1,1,1,0,0,0,1
517,58,0,1,1,1,1,1,0,1,0,0,0,1,1,0,1,1
518,32,0,0,0,0,1,0,0,1,1,0,1,0,0,1,0,0


In [3]:
X = df.drop('class', axis =1)
y = df['class']

In [12]:
X_train,X_test,y_train,y_test = train_test_split(X, y, test_size = 0.2, stratify=y)

In [5]:
dummy = DummyClassifier()
dummy.fit(X_train,y_train)
dummy_pred = dummy.predict(X_test)

In [6]:
confusion_matrix(y_test,dummy_pred)

array([[ 0, 40],
       [ 0, 64]], dtype=int64)

In [7]:
print(classification_report(y_test,dummy_pred))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00        40
           1       0.62      1.00      0.76        64

    accuracy                           0.62       104
   macro avg       0.31      0.50      0.38       104
weighted avg       0.38      0.62      0.47       104



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [14]:
logr = LogisticRegression(max_iter=10000)
logr.fit(X_train,y_train)
logr_pred = logr.predict(X_test)

In [15]:
confusion_matrix(y_test,logr_pred)

array([[38,  2],
       [ 3, 61]], dtype=int64)

In [16]:
print(classification_report(y_test,logr_pred))

              precision    recall  f1-score   support

           0       0.93      0.95      0.94        40
           1       0.97      0.95      0.96        64

    accuracy                           0.95       104
   macro avg       0.95      0.95      0.95       104
weighted avg       0.95      0.95      0.95       104



In [17]:
tree = DecisionTreeClassifier()
tree.fit(X_train,y_train)
tree_pred = tree.predict(X_test)

In [18]:
confusion_matrix(y_test,tree_pred)

array([[38,  2],
       [ 1, 63]], dtype=int64)

In [19]:
print(classification_report(y_test,tree_pred))

              precision    recall  f1-score   support

           0       0.97      0.95      0.96        40
           1       0.97      0.98      0.98        64

    accuracy                           0.97       104
   macro avg       0.97      0.97      0.97       104
weighted avg       0.97      0.97      0.97       104



In [20]:
forest = RandomForestClassifier()
forest.fit(X_train,y_train)
forest_pred = forest.predict(X_test)

In [21]:
confusion_matrix(y_test,forest_pred)

array([[39,  1],
       [ 0, 64]], dtype=int64)

In [22]:
print(classification_report(y_test,forest_pred))

              precision    recall  f1-score   support

           0       1.00      0.97      0.99        40
           1       0.98      1.00      0.99        64

    accuracy                           0.99       104
   macro avg       0.99      0.99      0.99       104
weighted avg       0.99      0.99      0.99       104



In [23]:
forest.feature_importances_

array([0.09712364, 0.08848056, 0.2065404 , 0.22028858, 0.06622147,
       0.02191361, 0.03292962, 0.02623239, 0.0268994 , 0.02957679,
       0.03454929, 0.03202175, 0.04938666, 0.02110633, 0.03143266,
       0.01529685])

In [24]:
X.columns

Index(['age', 'ismale', 'polyuria', 'polydipsia', 'sudden weight loss',
       'weakness', 'polyphagia', 'genital thrush', 'visual blurring',
       'itching', 'irritability', 'delayed healing', 'partial paresis',
       'muscle stiffness', 'alopecia', 'obesity'],
      dtype='object')

In [28]:
pd.DataFrame({'feature': X.columns,
             'importance': forest.feature_importances_}).sort_values('importance', ascending = False)

Unnamed: 0,feature,importance
3,polydipsia,0.220289
2,polyuria,0.20654
0,age,0.097124
1,ismale,0.088481
4,sudden weight loss,0.066221
12,partial paresis,0.049387
10,irritability,0.034549
6,polyphagia,0.03293
11,delayed healing,0.032022
14,alopecia,0.031433
