In [26]:
import pandas as pd

# Read the dataset from the CSV file (relative path, resolved against the
# current working directory).
data = pd.read_csv(filepath_or_buffer="diabetes.csv")

# Preview five randomly drawn rows; the fixed seed keeps the preview stable
# from one run to the next.
data.sample(n=5, random_state=42)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
668,6,98,58,33,190,34.0,0.43,43,0
324,2,112,75,32,0,35.7,0.148,21,0
624,2,108,64,0,0,30.8,0.158,21,0
690,8,107,80,0,0,24.6,0.856,34,0
473,7,136,90,0,0,29.9,0.21,50,0


In [27]:
# Per-column summary statistics (count, mean, std, min, quartiles, max).
summary_stats = data.describe()
summary_stats

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,3.845052,120.894531,69.105469,20.536458,79.799479,31.992578,0.471876,33.240885,0.348958
std,3.369578,31.972618,19.355807,15.952218,115.244002,7.88416,0.331329,11.760232,0.476951
min,0.0,0.0,0.0,0.0,0.0,0.0,0.078,21.0,0.0
25%,1.0,99.0,62.0,0.0,0.0,27.3,0.24375,24.0,0.0
50%,3.0,117.0,72.0,23.0,30.5,32.0,0.3725,29.0,0.0
75%,6.0,140.25,80.0,32.0,127.25,36.6,0.62625,41.0,1.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0


In [37]:
# Share of rows (in percent, one decimal) whose Insulin / SkinThickness value
# is exactly zero.
total_rows = len(data)

insulin_is_zero = data['Insulin'] == 0
skin_is_zero = data['SkinThickness'] == 0

# Summing a boolean mask counts the rows where the condition holds.
zero_insulin = round(insulin_is_zero.sum() / total_rows * 100, 1)
zero_skin_thickness = round(skin_is_zero.sum() / total_rows * 100, 1)

print(f'Instances with Insulin level of zero: {zero_insulin} %')
print(f'Instances with Skin Thickness of zero: {zero_skin_thickness} %')

Instances with Insulin level of zero: 48.7 %
Instances with Skin Thickness of zero: 29.6 %


In [39]:
# Share of rows (in percent, one decimal) where Insulin and SkinThickness are
# both exactly zero.
both_zero_mask = data['Insulin'].eq(0) & data['SkinThickness'].eq(0)
zero_both = round(both_zero_mask.sum() / len(data) * 100, 1)

print(f'Instances with both Insulin and Skin Thickness of zero: {zero_both} %')

Instances with both Insulin and Skin Thickness of zero: 29.6 %


In [46]:
# Keep only the rows that carry a non-zero Insulin reading.
# NOTE(review): only Insulin is filtered here; describe() also reports a
# minimum of 0 for Glucose, BloodPressure and BMI - confirm whether zeros in
# those columns should be handled the same way.
has_insulin_reading = data['Insulin'].ne(0)
fixed_data = data.loc[has_insulin_reading]

print(f'Data shrinked from {len(data)} instances to {len(fixed_data)}.')

Data shrinked from 768 instances to 394.


In [52]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the filtered rows for testing; the fixed seed makes the
# partition repeatable.
train_set, test_set = train_test_split(
    fixed_data,
    test_size=0.2,
    random_state=42,
)

# Split each partition into its feature columns and the 'Outcome' target.
train, train_label = train_set.drop('Outcome', axis=1), train_set['Outcome']
test, test_label = test_set.drop('Outcome', axis=1), test_set['Outcome']

In [59]:
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression

# Standardise the features, then fit an L2-penalised logistic regression.
logistic_reg = make_pipeline(
    StandardScaler(),
    LogisticRegression(penalty="l2", random_state=42),
)

# fit() returns the fitted pipeline, so training and prediction can be chained.
logistic_reg_pred = logistic_reg.fit(train, train_label).predict(test)

# Accuracy = fraction of test rows whose prediction equals the true label,
# reported in percent with one decimal.
logistic_reg_hits = logistic_reg_pred == test_label
logistic_reg_accuracy = round(logistic_reg_hits.mean() * 100, 1)
print(f'Accuracy of Logistic Regression model: {logistic_reg_accuracy} %')

Accuracy of Logistic Regression model: 78.5 %


In [60]:
from sklearn.svm import SVC

# Two scaled SVC pipelines that differ only in the kernel they use.
svc_poly, svc_rbf = (
    make_pipeline(StandardScaler(), SVC(kernel=kernel_name, random_state=42))
    for kernel_name in ("poly", "rbf")
)

# fit() returns the fitted pipeline, so training and prediction can be chained.
svc_poly_pred = svc_poly.fit(train, train_label).predict(test)
svc_rbf_pred = svc_rbf.fit(train, train_label).predict(test)

# Accuracy = fraction of test rows predicted correctly, in percent with one
# decimal.
svc_poly_hits = svc_poly_pred == test_label
svc_rbf_hits = svc_rbf_pred == test_label
svc_poly_accuracy = round(svc_poly_hits.mean() * 100, 1)
svc_rbf_accuracy = round(svc_rbf_hits.mean() * 100, 1)

print(f'Accuracy of SVC model with Poly kernel: {svc_poly_accuracy} %')
print(f'Accuracy of SVC model with RBF kernel: {svc_rbf_accuracy} %')

Accuracy of SVC model with Poly kernel: 75.9 %
Accuracy of SVC model with RBF kernel: 77.2 %
