In [128]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier , GradientBoostingClassifier
from sklearn.metrics import recall_score,precision_score,f1_score,confusion_matrix , classification_report , accuracy_score
from sklearn.linear_model import LogisticRegression
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler , KBinsDiscretizer , RobustScaler ,StandardScaler
from scipy.stats import chi2_contingency
import pickle

In [91]:
df = pd.read_csv("diabetes.csv")

In [92]:
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [93]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               768 non-null    int64  
 1   Glucose                   768 non-null    int64  
 2   BloodPressure             768 non-null    int64  
 3   SkinThickness             768 non-null    int64  
 4   Insulin                   768 non-null    int64  
 5   BMI                       768 non-null    float64
 6   DiabetesPedigreeFunction  768 non-null    float64
 7   Age                       768 non-null    int64  
 8   Outcome                   768 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 54.1 KB


In [94]:
NewGlucose = pd.Series(["Low", "Normal", "Overweight", "Secret", "High"], dtype = "category")
df["NewGlucose"] = NewGlucose
df.loc[df["Glucose"] <= 70, "NewGlucose"] = NewGlucose[0]
df.loc[(df["Glucose"] > 70) & (df["Glucose"] <= 99), "NewGlucose"] = NewGlucose[1]
df.loc[(df["Glucose"] > 99) & (df["Glucose"] <= 126), "NewGlucose"] = NewGlucose[2]
df.loc[df["Glucose"] > 126 ,"NewGlucose"] = NewGlucose[3]

In [95]:
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome,NewGlucose
0,6,148,72,35,0,33.6,0.627,50,1,Secret
1,1,85,66,29,0,26.6,0.351,31,0,Normal
2,8,183,64,0,0,23.3,0.672,32,1,Secret
3,1,89,66,23,94,28.1,0.167,21,0,Normal
4,0,137,40,35,168,43.1,2.288,33,1,Secret


In [96]:
NewBloodPressure = pd.Series([ "Normal", "Stage1", "Stage2"], dtype = "category")
df["NewBloodPressure"] = NewBloodPressure
df.loc[df["BloodPressure"] < 80, "NewBloodPressure"] = NewBloodPressure[0]
df.loc[(df["BloodPressure"] > 80) & (df["BloodPressure"] <= 89), "NewBloodPressure"] = NewBloodPressure[1]
df.loc[(df["BloodPressure"] >= 90), "NewBloodPressure"] = NewBloodPressure[2]

In [97]:
def set_insuline(row):
    if row["Insulin"]>=16 and row["Insulin"]<=166:
        return "Normal"
    else:
        return "Abnormal"

In [98]:
df = df.assign(NewInsulinScore=df.apply(set_insuline, axis=1))

In [99]:
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome,NewGlucose,NewBloodPressure,NewInsulinScore
0,6,148,72,35,0,33.6,0.627,50,1,Secret,Normal,Abnormal
1,1,85,66,29,0,26.6,0.351,31,0,Normal,Normal,Abnormal
2,8,183,64,0,0,23.3,0.672,32,1,Secret,Normal,Abnormal
3,1,89,66,23,94,28.1,0.167,21,0,Normal,Normal,Normal
4,0,137,40,35,168,43.1,2.288,33,1,Secret,Normal,Abnormal


In [100]:
NewBMI = pd.Series(["Underweight","Normal", "Overweight","Obesity 1", "Obesity 2", "Obesity 3"], dtype = "category")
df['NewBMI'] = NewBMI
df.loc[df["BMI"]<18.5, "NewBMI"] = NewBMI[0]
df.loc[(df["BMI"]>18.5) & df["BMI"]<=24.9, "NewBMI"] = NewBMI[1]
df.loc[(df["BMI"]>24.9) & df["BMI"]<=29.9, "NewBMI"] = NewBMI[2]
df.loc[(df["BMI"]>29.9) & df["BMI"]<=34.9, "NewBMI"] = NewBMI[3]
df.loc[(df["BMI"]>34.9) & df["BMI"]<=39.9, "NewBMI"] = NewBMI[4]
df.loc[df["BMI"]>39.9, "NewBMI"] = NewBMI[5]

In [101]:
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome,NewGlucose,NewBloodPressure,NewInsulinScore,NewBMI
0,6,148,72,35,0,33.6,0.627,50,1,Secret,Normal,Abnormal,Obesity 2
1,1,85,66,29,0,26.6,0.351,31,0,Normal,Normal,Abnormal,Obesity 2
2,8,183,64,0,0,23.3,0.672,32,1,Secret,Normal,Abnormal,Obesity 2
3,1,89,66,23,94,28.1,0.167,21,0,Normal,Normal,Normal,Obesity 2
4,0,137,40,35,168,43.1,2.288,33,1,Secret,Normal,Abnormal,Obesity 3


In [102]:
df = pd.get_dummies(df, columns = ["NewGlucose", "NewBloodPressure", "NewInsulinScore" , "NewBMI" ], drop_first=True , dtype = int )

In [105]:
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome,NewGlucose_Low,...,NewGlucose_Overweight,NewGlucose_Secret,NewBloodPressure_Stage1,NewBloodPressure_Stage2,NewInsulinScore_Normal,NewBMI_Obesity 1,NewBMI_Obesity 2,NewBMI_Obesity 3,NewBMI_Overweight,NewBMI_Underweight
0,6,148,72,35,0,33.6,0.627,50,1,0,...,0,1,0,0,0,0,1,0,0,0
1,1,85,66,29,0,26.6,0.351,31,0,0,...,0,0,0,0,0,0,1,0,0,0
2,8,183,64,0,0,23.3,0.672,32,1,0,...,0,1,0,0,0,0,1,0,0,0
3,1,89,66,23,94,28.1,0.167,21,0,0,...,0,0,0,0,1,0,1,0,0,0
4,0,137,40,35,168,43.1,2.288,33,1,0,...,0,1,0,0,0,0,0,1,0,0


In [106]:
catagorical_df = df.iloc[: , 9:]

In [107]:
catagorical_df.head()

Unnamed: 0,NewGlucose_Low,NewGlucose_Normal,NewGlucose_Overweight,NewGlucose_Secret,NewBloodPressure_Stage1,NewBloodPressure_Stage2,NewInsulinScore_Normal,NewBMI_Obesity 1,NewBMI_Obesity 2,NewBMI_Obesity 3,NewBMI_Overweight,NewBMI_Underweight
0,0,0,0,1,0,0,0,0,1,0,0,0
1,0,1,0,0,0,0,0,0,1,0,0,0
2,0,0,0,1,0,0,0,0,1,0,0,0
3,0,1,0,0,0,0,1,0,1,0,0,0
4,0,0,0,1,0,0,0,0,0,1,0,0


In [108]:
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome,NewGlucose_Low,...,NewGlucose_Overweight,NewGlucose_Secret,NewBloodPressure_Stage1,NewBloodPressure_Stage2,NewInsulinScore_Normal,NewBMI_Obesity 1,NewBMI_Obesity 2,NewBMI_Obesity 3,NewBMI_Overweight,NewBMI_Underweight
0,6,148,72,35,0,33.6,0.627,50,1,0,...,0,1,0,0,0,0,1,0,0,0
1,1,85,66,29,0,26.6,0.351,31,0,0,...,0,0,0,0,0,0,1,0,0,0
2,8,183,64,0,0,23.3,0.672,32,1,0,...,0,1,0,0,0,0,1,0,0,0
3,1,89,66,23,94,28.1,0.167,21,0,0,...,0,0,0,0,1,0,1,0,0,0
4,0,137,40,35,168,43.1,2.288,33,1,0,...,0,1,0,0,0,0,0,1,0,0


In [109]:
y =  df["Outcome"]

In [110]:
x = df.iloc[: , :8]

In [111]:
x.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
0,6,148,72,35,0,33.6,0.627,50
1,1,85,66,29,0,26.6,0.351,31
2,8,183,64,0,0,23.3,0.672,32
3,1,89,66,23,94,28.1,0.167,21
4,0,137,40,35,168,43.1,2.288,33


In [112]:
cols = x.columns
index = x.index

In [113]:
transformer = RobustScaler().fit(x)
x=transformer.transform(x)
x=pd.DataFrame(x, columns = cols, index = index)

In [114]:
x.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
0,0.6,0.751515,0.0,0.375,-0.239686,0.172043,0.665359,1.235294
1,-0.4,-0.775758,-0.333333,0.1875,-0.239686,-0.580645,-0.056209,0.117647
2,1.0,1.6,-0.444444,-0.71875,-0.239686,-0.935484,0.783007,0.176471
3,-0.4,-0.678788,-0.333333,0.0,0.499018,-0.419355,-0.537255,-0.470588
4,-0.6,0.484848,-1.777778,0.375,1.08055,1.193548,5.007843,0.235294


In [115]:
x = pd.concat([x, catagorical_df], axis=1)

In [73]:
x.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,NewGlucose_Low,NewGlucose_Normal,...,NewInsulinScore_Normal,NewBMI_Obesity 1,NewBMI_Obesity 2,NewBMI_Obesity 3,NewBMI_Overweight,NewBMI_Underweight,NewAge_1,NewAge_2,NewAge_3,NewAge_4
0,0.6,0.751515,0.0,0.375,-0.239686,0.172043,0.665359,1.235294,0,0,...,0,0,1,0,0,0,0,1,0,0
1,-0.4,-0.775758,-0.333333,0.1875,-0.239686,-0.580645,-0.056209,0.117647,0,1,...,0,0,1,0,0,0,0,0,0,0
2,1.0,1.6,-0.444444,-0.71875,-0.239686,-0.935484,0.783007,0.176471,0,0,...,0,0,1,0,0,0,1,0,0,0
3,-0.4,-0.678788,-0.333333,0.0,0.499018,-0.419355,-0.537255,-0.470588,0,1,...,1,0,1,0,0,0,0,0,0,0
4,-0.6,0.484848,-1.777778,0.375,1.08055,1.193548,5.007843,0.235294,0,0,...,0,0,0,1,0,0,1,0,0,0


In [74]:
x_train , x_test , y_train , y_test = train_test_split(x , y ,test_size = 0.2 , random_state = 0)

In [116]:
lr = LogisticRegression()
lr.fit(x_train,y_train)

In [117]:
y_pred = lr.predict(x_test)

In [118]:
accuracy_score(y_test , y_pred)

0.7922077922077922

In [119]:
confusion_matrix(y_test, y_pred)

array([[91, 16],
       [16, 31]], dtype=int64)

In [120]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.85      0.85      0.85       107
           1       0.66      0.66      0.66        47

    accuracy                           0.79       154
   macro avg       0.76      0.76      0.76       154
weighted avg       0.79      0.79      0.79       154



In [121]:
knn = KNeighborsClassifier(n_neighbors = 10)
knn.fit(x_train, y_train)
y_pred = knn.predict(x_test)
print(accuracy_score(y_train, knn.predict(x_train)))
knn_acc = accuracy_score(y_test, knn.predict(x_test))
print(accuracy_score(y_test, knn.predict(x_test)))
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

0.7964169381107492
0.7467532467532467
[[90 17]
 [22 25]]
              precision    recall  f1-score   support

           0       0.80      0.84      0.82       107
           1       0.60      0.53      0.56        47

    accuracy                           0.75       154
   macro avg       0.70      0.69      0.69       154
weighted avg       0.74      0.75      0.74       154



In [122]:
svc = SVC(C=10, gamma = 0.01, probability=True)
svc.fit(x_train, y_train)
y_pred = svc.predict(x_test)
print(accuracy_score(y_train, svc.predict(x_train)))
svc_acc = accuracy_score(y_test, svc.predict(x_test))
print(accuracy_score(y_test, svc.predict(x_test)))
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

0.8485342019543974
0.8051948051948052
[[95 12]
 [18 29]]
              precision    recall  f1-score   support

           0       0.84      0.89      0.86       107
           1       0.71      0.62      0.66        47

    accuracy                           0.81       154
   macro avg       0.77      0.75      0.76       154
weighted avg       0.80      0.81      0.80       154



In [123]:
DT = DecisionTreeClassifier()
DT.fit(x_train, y_train)
y_pred = DT.predict(x_test)
print(accuracy_score(y_train, DT.predict(x_train)))

print(accuracy_score(y_test, DT.predict(x_test)))
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

1.0
0.7402597402597403
[[86 21]
 [19 28]]
              precision    recall  f1-score   support

           0       0.82      0.80      0.81       107
           1       0.57      0.60      0.58        47

    accuracy                           0.74       154
   macro avg       0.70      0.70      0.70       154
weighted avg       0.74      0.74      0.74       154



In [124]:
rand_clf = RandomForestClassifier(
    max_features = 0.75,
    n_estimators = 1500,
    max_samples = 0.5,
    random_state = 42,
)
rand_clf.fit(x_train, y_train)
y_pred = rand_clf.predict(x_test)
print(accuracy_score(y_train, rand_clf.predict(x_train)))
rand_acc = accuracy_score(y_test, rand_clf.predict(x_test))
print(accuracy_score(y_test, rand_clf.predict(x_test)))
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

0.9641693811074918
0.8116883116883117
[[94 13]
 [16 31]]
              precision    recall  f1-score   support

           0       0.85      0.88      0.87       107
           1       0.70      0.66      0.68        47

    accuracy                           0.81       154
   macro avg       0.78      0.77      0.77       154
weighted avg       0.81      0.81      0.81       154



In [125]:
gbc = GradientBoostingClassifier(learning_rate = 0.01, loss = 'exponential', n_estimators = 1500)
gbc.fit(x_train, y_train)
y_pred = gbc.predict(x_test)
print(accuracy_score(y_train, gbc.predict(x_train)))
print(accuracy_score(y_test, gbc.predict(x_test)))
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

0.9478827361563518
0.7922077922077922
[[91 16]
 [16 31]]
              precision    recall  f1-score   support

           0       0.85      0.85      0.85       107
           1       0.66      0.66      0.66        47

    accuracy                           0.79       154
   macro avg       0.76      0.76      0.76       154
weighted avg       0.79      0.79      0.79       154



In [126]:
from xgboost import XGBClassifier 
xgb = XGBClassifier(objective = 'binary:logistic', learning_rate = 0.01, max_depth = 10, n_estimators = 1000)
xgb.fit(x_train, y_train)
y_pred = xgb.predict(x_test)
print(accuracy_score(y_train, xgb.predict(x_train)))
print(accuracy_score(y_test, xgb.predict(x_test)))
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

1.0
0.7987012987012987
[[90 17]
 [14 33]]
              precision    recall  f1-score   support

           0       0.87      0.84      0.85       107
           1       0.66      0.70      0.68        47

    accuracy                           0.80       154
   macro avg       0.76      0.77      0.77       154
weighted avg       0.80      0.80      0.80       154



In [129]:
with open("diabetes_pipeline.pkl", "wb") as f:
    pickle.dump({"scaler": transformer, "model": rand_clf}, f)