In [1]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt

In [2]:
df = pd.read_csv('diabetes.csv')
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [3]:
df.shape

(768, 9)

In [5]:
df.isnull().sum()

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64

In [6]:
df.dtypes

Pregnancies                   int64
Glucose                       int64
BloodPressure                 int64
SkinThickness                 int64
Insulin                       int64
BMI                         float64
DiabetesPedigreeFunction    float64
Age                           int64
Outcome                       int64
dtype: object

In [7]:
x = df.iloc[:,:-1]     # df.drop('Outcome', axis=1)
y = df.iloc[:,-1]      # df['Outcome']
print(x.shape)
print(y.shape)
print(type(x))
print(type(y))

(768, 8)
(768,)
<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.series.Series'>


In [9]:
x.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
0,6,148,72,35,0,33.6,0.627,50
1,1,85,66,29,0,26.6,0.351,31
2,8,183,64,0,0,23.3,0.672,32
3,1,89,66,23,94,28.1,0.167,21
4,0,137,40,35,168,43.1,2.288,33


In [10]:
y.head()

0    1
1    0
2    1
3    0
4    1
Name: Outcome, dtype: int64

In [11]:
from sklearn.model_selection import train_test_split

In [45]:
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=0.25)    # 0.25 * 768 = 192 is size of test data
print(x_train.shape)                                                       # 0.75 * 768 = 576 is remaining
print(x_test.shape)
print(y_train.shape)
print(y_test.shape)

(576, 8)
(192, 8)
(576,)
(192,)


In [46]:
from sklearn.metrics import confusion_matrix,classification_report,accuracy_score

In [47]:
def gen_metrics(ytest,ypred):
    cm = confusion_matrix(ytest,ypred)
    print('Confusion Matrix = \n',cm)
    print('Classification Report = \n',classification_report(ytest,ypred))
    print('Acc Score = ',accuracy_score(ytest,ypred))

## Build Models

### 1) Log Reg

In [48]:
from sklearn.linear_model import LogisticRegression

In [49]:
m1 = LogisticRegression(max_iter=1000)
m1.fit(x_train,y_train)

LogisticRegression(max_iter=1000)

In [50]:
print('Training Score = ', m1.score(x_train,y_train))
print('Testing Score = ', m1.score(x_test,y_test))

Training Score =  0.7708333333333334
Testing Score =  0.8020833333333334


In [51]:
ypred_m1 = m1.predict(x_test)
print(ypred_m1)

[0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 0 0 1 1 0 0 0 1 0 0 0 0 0 1 1 0 1 0 1
 1 0 0 0 0 0 0 1 0 0 0 0 1 1 0 1 0 0 0 0 0 1 1 0 0 0 0 1 1 0 0 0 0 0 0 0 0
 0 0 1 1 1 0 0 0 0 1 0 1 0 0 1 0 1 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 1 0 0
 0 0 0 0 1 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 1 1 0 0 0 0
 0 1 1 0 0 0 0 1 1 0 0 1 0 0 1 0 1 0 0 0 1 0 0 0 0 0 0 1 1 0 1 1 0 1 0 0 1
 1 0 0 0 0 1 0]


In [52]:
gen_metrics(y_test,ypred_m1)

Confusion Matrix = 
 [[116  14]
 [ 24  38]]
Classification Report = 
               precision    recall  f1-score   support

           0       0.83      0.89      0.86       130
           1       0.73      0.61      0.67        62

    accuracy                           0.80       192
   macro avg       0.78      0.75      0.76       192
weighted avg       0.80      0.80      0.80       192

Acc Score =  0.8020833333333334


### 2) KNN

In [53]:
from sklearn.neighbors import KNeighborsClassifier

In [121]:
m2 = KNeighborsClassifier(n_neighbors=11)
m2.fit(x_train,y_train)

KNeighborsClassifier(n_neighbors=11)

In [122]:
print('Training Score = ', m2.score(x_train,y_train))
print('Testing Score = ', m2.score(x_test,y_test))

Training Score =  0.78125
Testing Score =  0.75


In [123]:
ypred_m2 = m2.predict(x_test)
print(ypred_m2)

[0 0 0 1 0 0 0 1 0 0 0 0 0 1 0 1 0 1 0 0 1 1 0 1 0 1 0 0 0 0 1 1 1 1 1 0 1
 1 0 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 1 0 0 1 0 0 0 0 0 1 1 1 0 0 0 0 0 0 0
 0 0 1 0 0 0 1 1 1 0 0 1 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 1 0 0
 0 0 1 0 1 0 0 0 0 1 1 1 0 0 0 0 0 0 1 0 0 0 0 0 0 1 1 0 1 0 1 1 0 0 0 0 0
 1 0 1 0 0 0 0 0 1 0 0 1 0 0 1 1 1 0 1 0 0 0 1 0 0 0 0 1 1 1 0 1 1 0 0 0 1
 1 0 0 0 0 1 1]


In [124]:
gen_metrics(y_test,ypred_m2)

Confusion Matrix = 
 [[106  24]
 [ 24  38]]
Classification Report = 
               precision    recall  f1-score   support

           0       0.82      0.82      0.82       130
           1       0.61      0.61      0.61        62

    accuracy                           0.75       192
   macro avg       0.71      0.71      0.71       192
weighted avg       0.75      0.75      0.75       192

Acc Score =  0.75


### 3) DT

In [62]:
from sklearn.tree import DecisionTreeClassifier

In [117]:
m3 = DecisionTreeClassifier(criterion='gini',max_depth=5,min_samples_split=10)
m3.fit(x_train,y_train)

DecisionTreeClassifier(max_depth=5, min_samples_split=10)

In [118]:
print('Training Score = ', m3.score(x_train,y_train))
print('Testing Score = ', m3.score(x_test,y_test))

Training Score =  0.8194444444444444
Testing Score =  0.7447916666666666


In [119]:
ypred_m3 = m3.predict(x_test)
print(ypred_m3)

[1 0 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0 1 0 1 1 1 0 0 1 0 0 0 0 1 1 1 1 0 1 0 1
 0 0 0 0 1 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 1 1 0 0 0 0 0 1 1 0 0 0 0 0 1 0
 0 0 1 1 0 0 1 0 1 1 0 1 1 0 1 0 0 0 0 0 0 0 0 0 1 0 0 1 0 1 0 0 0 1 1 0 0
 0 0 0 0 1 0 1 0 0 0 1 0 0 0 0 0 1 0 1 0 1 0 0 0 0 1 1 0 1 0 1 1 0 0 0 0 0
 1 0 1 0 0 0 0 0 1 0 0 1 1 1 1 0 0 0 0 0 1 0 0 0 0 0 0 1 1 0 1 1 0 0 0 1 1
 1 0 1 0 0 0 1]


In [120]:
gen_metrics(y_test,ypred_m3)

Confusion Matrix = 
 [[105  25]
 [ 24  38]]
Classification Report = 
               precision    recall  f1-score   support

           0       0.81      0.81      0.81       130
           1       0.60      0.61      0.61        62

    accuracy                           0.74       192
   macro avg       0.71      0.71      0.71       192
weighted avg       0.75      0.74      0.75       192

Acc Score =  0.7447916666666666


### 4) Random Forest

In [72]:
from sklearn.ensemble import RandomForestClassifier

In [109]:
m4 = RandomForestClassifier(n_estimators=50,criterion='entropy',max_depth=6,min_samples_split=12)
m4.fit(x_train,y_train)

RandomForestClassifier(criterion='entropy', max_depth=6, min_samples_split=12,
                       n_estimators=50)

In [110]:
print('Training Score = ', m4.score(x_train,y_train))
print('Testing Score = ', m4.score(x_test,y_test))

Training Score =  0.8541666666666666
Testing Score =  0.7916666666666666


In [111]:
ypred_m4 = m4.predict(x_test)
print(ypred_m4)

[0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 0 0 1 1 0 0 1 1 0 0 0 0 1 1 0 1 1 0 1
 0 0 0 0 0 0 0 1 0 0 0 0 1 1 1 1 0 1 0 0 0 1 0 0 0 0 0 1 1 0 0 0 0 0 0 1 0
 0 0 1 1 0 0 1 1 1 1 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0
 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 1 1 0 0 0 1 0 0 0 0 0
 0 1 1 0 0 0 0 1 1 0 0 1 1 1 1 0 1 0 0 0 1 0 0 0 0 0 0 1 1 0 1 1 0 0 0 0 1
 1 0 0 0 0 1 1]


In [112]:
gen_metrics(y_test,ypred_m4)

Confusion Matrix = 
 [[113  17]
 [ 23  39]]
Classification Report = 
               precision    recall  f1-score   support

           0       0.83      0.87      0.85       130
           1       0.70      0.63      0.66        62

    accuracy                           0.79       192
   macro avg       0.76      0.75      0.76       192
weighted avg       0.79      0.79      0.79       192

Acc Score =  0.7916666666666666


### 5) SVM

In [104]:
from sklearn.svm import SVC

In [105]:
m5 = SVC(kernel='linear',C=1)
m5.fit(x_train,y_train)

SVC(C=1, kernel='linear')

In [106]:
print('Training Score = ', m5.score(x_train,y_train))
print('Testing Score = ', m5.score(x_test,y_test))

Training Score =  0.7638888888888888
Testing Score =  0.7864583333333334


In [107]:
ypred_m5 = m5.predict(x_test)
print(ypred_m5)

[0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 0 0 1 1 0 0 0 1 0 0 0 0 1 1 1 0 1 0 1
 1 0 0 0 0 0 0 1 0 0 0 0 1 1 0 1 0 0 0 0 0 1 1 0 0 0 0 1 1 1 0 0 0 0 0 0 0
 0 0 1 1 1 0 0 0 0 1 0 1 0 0 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 1 0 0
 0 0 0 0 1 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 1 1 0 0 0 0
 0 1 1 0 0 0 0 1 1 0 0 1 0 0 1 0 1 0 0 0 1 0 0 0 0 0 0 1 1 1 1 1 0 0 0 0 1
 1 0 0 0 0 1 0]


In [108]:
gen_metrics(y_test,ypred_m5)

Confusion Matrix = 
 [[114  16]
 [ 25  37]]
Classification Report = 
               precision    recall  f1-score   support

           0       0.82      0.88      0.85       130
           1       0.70      0.60      0.64        62

    accuracy                           0.79       192
   macro avg       0.76      0.74      0.75       192
weighted avg       0.78      0.79      0.78       192

Acc Score =  0.7864583333333334


## Conclusion
#### 1) Log_Reg is the best performing model in terms of accuracy.<br>
#### 2) Random Forest is the best performing model in terms of recall.<br>