In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
df = pd.read_csv("diabetes.csv")
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [3]:
df.corr()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
Pregnancies,1.0,0.129459,0.141282,-0.081672,-0.073535,0.017683,-0.033523,0.544341,0.221898
Glucose,0.129459,1.0,0.15259,0.057328,0.331357,0.221071,0.137337,0.263514,0.466581
BloodPressure,0.141282,0.15259,1.0,0.207371,0.088933,0.281805,0.041265,0.239528,0.065068
SkinThickness,-0.081672,0.057328,0.207371,1.0,0.436783,0.392573,0.183928,-0.11397,0.074752
Insulin,-0.073535,0.331357,0.088933,0.436783,1.0,0.197859,0.185071,-0.042163,0.130548
BMI,0.017683,0.221071,0.281805,0.392573,0.197859,1.0,0.140647,0.036242,0.292695
DiabetesPedigreeFunction,-0.033523,0.137337,0.041265,0.183928,0.185071,0.140647,1.0,0.033561,0.173844
Age,0.544341,0.263514,0.239528,-0.11397,-0.042163,0.036242,0.033561,1.0,0.238356
Outcome,0.221898,0.466581,0.065068,0.074752,0.130548,0.292695,0.173844,0.238356,1.0


In [4]:
# bloodpressure and skinthickness has less contribution in outcome so less remove that
df.drop(["SkinThickness", "BloodPressure"], axis=1, inplace=True)

In [5]:
df

Unnamed: 0,Pregnancies,Glucose,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,0,33.6,0.627,50,1
1,1,85,0,26.6,0.351,31,0
2,8,183,0,23.3,0.672,32,1
3,1,89,94,28.1,0.167,21,0
4,0,137,168,43.1,2.288,33,1
...,...,...,...,...,...,...,...
763,10,101,180,32.9,0.171,63,0
764,2,122,0,36.8,0.340,27,0
765,5,121,112,26.2,0.245,30,0
766,1,126,0,30.1,0.349,47,1


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 7 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               768 non-null    int64  
 1   Glucose                   768 non-null    int64  
 2   Insulin                   768 non-null    int64  
 3   BMI                       768 non-null    float64
 4   DiabetesPedigreeFunction  768 non-null    float64
 5   Age                       768 non-null    int64  
 6   Outcome                   768 non-null    int64  
dtypes: float64(2), int64(5)
memory usage: 42.1 KB


In [7]:
df.describe()

Unnamed: 0,Pregnancies,Glucose,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,3.845052,120.894531,79.799479,31.992578,0.471876,33.240885,0.348958
std,3.369578,31.972618,115.244002,7.88416,0.331329,11.760232,0.476951
min,0.0,0.0,0.0,0.0,0.078,21.0,0.0
25%,1.0,99.0,0.0,27.3,0.24375,24.0,0.0
50%,3.0,117.0,30.5,32.0,0.3725,29.0,0.0
75%,6.0,140.25,127.25,36.6,0.62625,41.0,1.0
max,17.0,199.0,846.0,67.1,2.42,81.0,1.0


In [8]:
df["Outcome"].value_counts()

Outcome
0    500
1    268
Name: count, dtype: int64

In [10]:
x = df.drop(["Outcome"], axis=1)
y = df["Outcome"]

In [9]:
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split

In [11]:
x_train, x_test, y_train, y_test = train_test_split(x, y, train_size=0.8)

In [12]:
smote = SMOTE(random_state=42)
x_train_smote, y_train_smote = smote.fit_resample(x_train, y_train)

In [13]:
print("Original dataset shape:", y_train.value_counts())
print("SMOTE dataset shape:", y_train_smote.value_counts())

Original dataset shape: Outcome
0    395
1    219
Name: count, dtype: int64
SMOTE dataset shape: Outcome
1    395
0    395
Name: count, dtype: int64


In [15]:
from sklearn.linear_model import LogisticRegression
model = LogisticRegression(max_iter=1000)
model.fit(x_train_smote, y_train_smote)

In [16]:
y_pred = model.predict(x_test)
y_pred

array([0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0,
       1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1,
       0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1,
       0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1,
       0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0,
       1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0],
      dtype=int64)

In [17]:
from sklearn.metrics import accuracy_score, precision_score, f1_score, classification_report, roc_auc_score
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.84      0.71      0.77       105
           1       0.54      0.71      0.61        49

    accuracy                           0.71       154
   macro avg       0.69      0.71      0.69       154
weighted avg       0.75      0.71      0.72       154



In [18]:
roc_auc = roc_auc_score(y_test, model.predict_proba(x_test)[:, 1])
print("ROC-AUC Score:", roc_auc)

ROC-AUC Score: 0.8141885325558795


In [19]:
accuracy = accuracy_score(y_test, y_pred)
accuracy

0.7142857142857143

In [20]:
precision = precision_score(y_test, y_pred)
precision

0.5384615384615384

In [21]:
f1 = f1_score(y_test, y_pred)
f1

0.6140350877192983

In [22]:
import xgboost as xgb

In [23]:
xgbmodel = xgb.XGBClassifier(random_state=42)

In [24]:
xgbmodel.fit(x_train_smote, y_train_smote)

In [25]:
y_pred_new = xgbmodel.predict(x_test)

In [26]:
print(classification_report(y_test, y_pred_new))

              precision    recall  f1-score   support

           0       0.85      0.73      0.79       105
           1       0.56      0.71      0.62        49

    accuracy                           0.73       154
   macro avg       0.70      0.72      0.71       154
weighted avg       0.75      0.73      0.73       154



In [27]:
import pickle

In [28]:
with open("xgboost-model.pkl", "wb") as file:
    pickle.dump(xgbmodel, file)
print("model saved as xgboost-model.pkl")

model saved as xgboost-model.pkl
