In [45]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn

In [46]:
train = pd.read_csv('diabetes.csv')

In [47]:
train.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [48]:
train.shape

(768, 9)

In [49]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               768 non-null    int64  
 1   Glucose                   768 non-null    int64  
 2   BloodPressure             768 non-null    int64  
 3   SkinThickness             768 non-null    int64  
 4   Insulin                   768 non-null    int64  
 5   BMI                       768 non-null    float64
 6   DiabetesPedigreeFunction  768 non-null    float64
 7   Age                       768 non-null    int64  
 8   Outcome                   768 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 54.1 KB


In [50]:
train.isnull().sum()

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64

In [51]:
categorical_val = []
continous_val = []

for column in train.columns:
    if len(train[column].unique()) <= 10:
        categorical_val.append(column)
    else:
        continous_val.append(column)
    

In [52]:
train.columns

Index(['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
       'BMI', 'DiabetesPedigreeFunction', 'Age', 'Outcome'],
      dtype='object')

In [53]:
feature_columns = [
    'Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
    'BMI', 'DiabetesPedigreeFunction', 'Age',
]

for column in feature_columns:
    print(f"{column} ==> Missing Value : {len(train.loc[train[column] == 0])}")

Pregnancies ==> Missing Value : 111
Glucose ==> Missing Value : 5
BloodPressure ==> Missing Value : 35
SkinThickness ==> Missing Value : 227
Insulin ==> Missing Value : 374
BMI ==> Missing Value : 11
DiabetesPedigreeFunction ==> Missing Value : 0
Age ==> Missing Value : 0


In [54]:
from sklearn.impute import SimpleImputer

fill_values = SimpleImputer(missing_values=0, strategy="mean", copy = False)

train[feature_columns] = fill_values.fit_transform(train[feature_columns])

for column in feature_columns:
    print(f"{column} ==> Missing Value : {len(train.loc[train[column] == 0])}")

Pregnancies ==> Missing Value : 0
Glucose ==> Missing Value : 0
BloodPressure ==> Missing Value : 0
SkinThickness ==> Missing Value : 0
Insulin ==> Missing Value : 0
BMI ==> Missing Value : 0
DiabetesPedigreeFunction ==> Missing Value : 0
Age ==> Missing Value : 0


In [55]:
train.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6.0,148.0,72.0,35.0,155.548223,33.6,0.627,50.0,1
1,1.0,85.0,66.0,29.0,155.548223,26.6,0.351,31.0,0
2,8.0,183.0,64.0,29.15342,155.548223,23.3,0.672,32.0,1
3,1.0,89.0,66.0,23.0,94.0,28.1,0.167,21.0,0
4,4.494673,137.0,40.0,35.0,168.0,43.1,2.288,33.0,1


In [56]:
from sklearn.model_selection import train_test_split

X = train[feature_columns]
y = train.Outcome

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [57]:
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

def evaluate(model, X_test, y_test):
    y_test_pred = model.predict(X_test)
    # y_test_pred = model.predict(X_train)

    print(f"Confusion Matrix : \n{confusion_matrix(y_test, y_test_pred)}")
    print(f"Accuracy score : \n{accuracy_score(y_test, y_test_pred)}")
    clf_report = pd.DataFrame(classification_report(y_test, y_test_pred, output_dict = True))
    clf_report            
    print(f"classification_report:\n {clf_report}") 

In [58]:
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier

tree = DecisionTreeClassifier()
bagging_clf = BaggingClassifier(estimator = tree, n_estimators= 1500)
bagging_clf.fit(X_train, y_train)

evaluate(bagging_clf, X_test, y_test)

Confusion Matrix : 
[[120  31]
 [ 23  57]]
Accuracy score : 
0.7662337662337663
classification_report:
                     0          1  accuracy   macro avg  weighted avg
precision    0.839161   0.647727  0.766234    0.743444      0.772864
recall       0.794702   0.712500  0.766234    0.753601      0.766234
f1-score     0.816327   0.678571  0.766234    0.747449      0.768619
support    151.000000  80.000000  0.766234  231.000000    231.000000


In [60]:
from sklearn.ensemble import RandomForestClassifier

rf_clf = RandomForestClassifier(n_estimators= 1500)
rf_clf.fit(X_train, y_train)

evaluate(rf_clf, X_test, y_test)

Confusion Matrix : 
[[122  29]
 [ 28  52]]
Accuracy score : 
0.7532467532467533
classification_report:
                     0          1  accuracy   macro avg  weighted avg
precision    0.813333   0.641975  0.753247    0.727654      0.753989
recall       0.807947   0.650000  0.753247    0.728974      0.753247
f1-score     0.810631   0.645963  0.753247    0.728297      0.753603
support    151.000000  80.000000  0.753247  231.000000    231.000000


In [62]:
from sklearn.ensemble import AdaBoostClassifier

adab_clf = AdaBoostClassifier(n_estimators= 30)
adab_clf.fit(X_train, y_train)

evaluate(adab_clf, X_test, y_test)

Confusion Matrix : 
[[123  28]
 [ 27  53]]
Accuracy score : 
0.7619047619047619
classification_report:
                     0          1  accuracy   macro avg  weighted avg
precision    0.820000   0.654321  0.761905    0.737160      0.762622
recall       0.814570   0.662500  0.761905    0.738535      0.761905
f1-score     0.817276   0.658385  0.761905    0.737830      0.762249
support    151.000000  80.000000  0.761905  231.000000    231.000000




In [65]:
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

estimators = []

log_reg = LogisticRegression(solver = 'liblinear')
estimators.append(('Logistic', log_reg))

svm_clf = SVC(gamma='scale')
estimators.append(('SVM', svm_clf))

voting = VotingClassifier(estimators=estimators)
voting.fit(X_train, y_train)

evaluate(voting, X_test, y_test)

Confusion Matrix : 
[[134  17]
 [ 42  38]]
Accuracy score : 
0.7445887445887446
classification_report:
                     0          1  accuracy   macro avg  weighted avg
precision    0.761364   0.690909  0.744589    0.726136      0.736964
recall       0.887417   0.475000  0.744589    0.681209      0.744589
f1-score     0.819572   0.562963  0.744589    0.691267      0.730703
support    151.000000  80.000000  0.744589  231.000000    231.000000
