In [1]:
import pandas as pd
import numpy as np

from sklearn.impute import SimpleImputer

In [2]:
# Read the pre-processed diabetes table from disk and preview the first ten rows.
diabetes = pd.read_csv(filepath_or_buffer='../datasets/diabetes_processed.csv')

diabetes.head(n=10)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6.0,148.0,72.0,35.0,219.028414,33.6,0.627,50.0,1
1,1.0,85.0,66.0,29.0,70.34155,26.6,0.351,31.0,0
2,8.0,183.0,64.0,32.0,270.573172,23.3,0.672,32.0,1
3,1.0,89.0,66.0,23.0,94.0,28.1,0.167,21.0,0
4,0.0,137.0,40.0,35.0,168.0,43.1,2.288,33.0,1
5,5.0,116.0,74.0,32.0,127.840221,25.6,0.201,30.0,0
6,3.0,78.0,50.0,32.0,88.0,31.0,0.248,26.0,1
7,10.0,115.0,72.405184,32.0,136.570245,35.3,0.134,29.0,0
8,2.0,197.0,70.0,45.0,543.0,30.5,0.158,53.0,1
9,8.0,125.0,96.0,32.0,155.57148,32.0,0.232,54.0,1


In [3]:
# Separate the predictors from the target; the label is kept as a
# one-column DataFrame rather than a Series.
diabetes_label = diabetes.loc[:, ['Outcome']]
diabetes_features = diabetes.drop(columns=['Outcome'])

diabetes_features.head(n=5)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
0,6.0,148.0,72.0,35.0,219.028414,33.6,0.627,50.0
1,1.0,85.0,66.0,29.0,70.34155,26.6,0.351,31.0
2,8.0,183.0,64.0,32.0,270.573172,23.3,0.672,32.0
3,1.0,89.0,66.0,23.0,94.0,28.1,0.167,21.0
4,0.0,137.0,40.0,35.0,168.0,43.1,2.288,33.0


In [4]:
# Introduce missing values: build a boolean mask, shaped like the feature
# table, that flags roughly 1% of the cells (an integer drawn uniformly from
# [0, 100) equals 0 one time in a hundred).
# A dedicated, seeded generator keeps the flagged cells identical on every
# Restart & Run All; the original unseeded draw changed on each run.
MASK_SEED = 42
mask_rng = np.random.default_rng(MASK_SEED)

mask = mask_rng.integers(0, 100, size=diabetes_features.shape) == 0

In [5]:
# Blank out every flagged cell with NaN, then eyeball a random sample of rows.
diabetes_features = diabetes_features.mask(mask, other=np.nan)

diabetes_features.sample(n=15)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
592,3.0,132.0,80.0,32.0,180.524041,34.4,0.402,44.0
80,3.0,113.0,44.0,13.0,124.620053,22.4,0.14,22.0
607,1.0,92.0,62.0,25.0,41.0,19.5,0.482,25.0
652,5.0,123.0,74.0,40.0,77.0,34.1,0.269,28.0
408,8.0,197.0,74.0,32.0,302.761018,25.9,1.191,39.0
324,2.0,112.0,75.0,32.0,131.194603,35.7,0.148,21.0
16,0.0,118.0,84.0,47.0,230.0,45.8,0.551,31.0
186,8.0,181.0,68.0,36.0,495.0,30.1,0.615,60.0
516,9.0,145.0,88.0,34.0,165.0,30.3,0.771,53.0
484,0.0,145.0,72.405184,32.0,221.795204,44.2,0.63,31.0


In [6]:
from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

In [10]:
# Hold out 20% of the rows for evaluation. Fixing random_state makes the
# split -- and therefore the scores reported below -- repeatable across runs.
x_train, x_test, y_train, y_test = train_test_split(
    diabetes_features,
    diabetes_label,
    test_size=0.2,
    random_state=42,
)

In [11]:
# Mean-impute the missing values in all eight feature columns (positions 0-7).
mean_imputer = SimpleImputer(strategy='mean')
feature_positions = list(range(8))

transformer = ColumnTransformer(
    transformers=[('features', mean_imputer, feature_positions)]
)

In [12]:
# Fit the imputer on the training features and display the filled-in array.
transformer.fit_transform(X=x_train)

array([[0.00000000e+00, 1.17000000e+02, 8.00000000e+01, ...,
        4.52000000e+01, 8.90000000e-02, 2.40000000e+01],
       [8.00000000e+00, 1.20000000e+02, 7.80000000e+01, ...,
        2.50000000e+01, 4.09000000e-01, 6.40000000e+01],
       [6.00000000e+00, 1.05000000e+02, 7.00000000e+01, ...,
        3.08000000e+01, 1.22000000e-01, 3.70000000e+01],
       ...,
       [3.00000000e+00, 9.90000000e+01, 6.20000000e+01, ...,
        2.18000000e+01, 2.79000000e-01, 2.60000000e+01],
       [4.00000000e+00, 9.50000000e+01, 6.00000000e+01, ...,
        3.54000000e+01, 2.84000000e-01, 2.80000000e+01],
       [8.00000000e+00, 1.67000000e+02, 7.23896956e+01, ...,
        3.76000000e+01, 1.65000000e-01, 4.30000000e+01]], shape=(614, 8))

In [13]:
# Chain the imputation step with a depth-limited decision tree.
tree = DecisionTreeClassifier(max_depth=4)
clf = make_pipeline(transformer, tree)

In [14]:
# Train the pipeline (fit returns the pipeline itself, so no rebinding is
# needed) and report accuracy on the training data.
clf.fit(x_train, y_train)

clf.score(X=x_train, y=y_train)

0.8045602605863192

In [15]:
# Predict the outcome for each held-out row.
y_pred = clf.predict(X=x_test)

In [16]:
from sklearn.metrics import accuracy_score

# accuracy_score's signature is (y_true, y_pred): pass the ground truth first.
# Accuracy is symmetric so the number is unchanged, but keeping the documented
# order avoids silently wrong results if this is later swapped for an
# asymmetric metric such as precision or recall.
accuracy_score(y_true=y_test, y_pred=y_pred)

0.7012987012987013