In [18]:
import pandas as pd
import numpy as np
from sklearn.model_selection import cross_val_score, KFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import StandardScaler

data = pd.read_csv('diabetesML.csv')

data.head()


Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [7]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               768 non-null    int64  
 1   Glucose                   768 non-null    int64  
 2   BloodPressure             768 non-null    int64  
 3   SkinThickness             768 non-null    int64  
 4   Insulin                   768 non-null    int64  
 5   BMI                       768 non-null    float64
 6   DiabetesPedigreeFunction  768 non-null    float64
 7   Age                       768 non-null    int64  
 8   Outcome                   768 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 54.1 KB


In [8]:
columns_to_check = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']

zero_counts = data[columns_to_check].apply(lambda x: (x == 0).sum())

zero_counts


Glucose            5
BloodPressure     35
SkinThickness    227
Insulin          374
BMI               11
dtype: int64

In [9]:
columns_to_check = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']
zero_counts = data[columns_to_check].apply(lambda x: (x == 0).sum())
print(zero_counts)

Glucose            5
BloodPressure     35
SkinThickness    227
Insulin          374
BMI               11
dtype: int64


In [12]:
data[columns_to_check] = data[columns_to_check].replace(0, np.nan)
data.fillna(data.median(), inplace=True)

In [14]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
data[columns_to_check] = scaler.fit_transform(data[columns_to_check])


In [17]:
data.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,0.866045,-0.03199,0.670643,-0.181541,0.166619,0.627,50,1
1,1,-1.205066,-0.528319,-0.012301,-0.181541,-0.8522,0.351,31,0
2,8,2.016662,-0.693761,-0.012301,-0.181541,-1.3325,0.672,32,1
3,1,-1.073567,-0.528319,-0.695245,-0.540642,-0.633881,0.167,21,0
4,0,0.504422,-2.679076,0.670643,0.316566,1.549303,2.288,33,1


In [19]:
zero_counts_after = data[columns_to_check].apply(lambda x: (x == 0).sum())
nan_counts_after = data[columns_to_check].isna().sum()
print("Zero value counts after replacement:\n", zero_counts_after)
print("NaN counts after imputation:\n", nan_counts_after)

scaler = StandardScaler()
data[columns_to_check] = scaler.fit_transform(data[columns_to_check])

X = data.drop('Outcome', axis=1)
y = data['Outcome']

kf = KFold(n_splits=10, shuffle=True, random_state=42)

knn = KNeighborsClassifier()
knn_scores = cross_val_score(knn, X, y, cv=kf)
print("KNN Cross-Validation Scores:\n", knn_scores)
print("KNN Average Score:", np.mean(knn_scores))

dt = DecisionTreeClassifier(random_state=42)
dt_scores = cross_val_score(dt, X, y, cv=kf)
print("Decision Tree Cross-Validation Scores:\n", dt_scores)
print("Decision Tree Average Score:", np.mean(dt_scores))

Zero value counts after replacement:
 Glucose            5
BloodPressure     35
SkinThickness    227
Insulin          374
BMI               11
dtype: int64
NaN counts after imputation:
 Glucose          0
BloodPressure    0
SkinThickness    0
Insulin          0
BMI              0
dtype: int64
KNN Cross-Validation Scores:
 [0.5974026  0.79220779 0.7012987  0.75324675 0.72727273 0.63636364
 0.76623377 0.7012987  0.67105263 0.72368421]
KNN Average Score: 0.7070061517429939
Decision Tree Cross-Validation Scores:
 [0.79220779 0.66233766 0.66233766 0.72727273 0.79220779 0.5974026
 0.72727273 0.71428571 0.63157895 0.80263158]
Decision Tree Average Score: 0.7109535201640464
