In [21]:
from pathlib import Path

import dill
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.metrics import precision_recall_curve
from sklearn.model_selection import train_test_split

#normalizing data
from sklearn.preprocessing import StandardScaler, OneHotEncoder

# gradient boosting
from sklearn.ensemble import GradientBoostingClassifier

#pipeline
from sklearn.pipeline import Pipeline, FeatureUnion

In [22]:
# Read the semicolon-separated cardio dataset and preview its first three rows.
raw_path = 'data/cardio.csv'
df = pd.read_csv(raw_path, sep=';')
df.head(n=3)

Unnamed: 0,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
0,0,18393,2,168,62.0,110,80,1,1,0,0,1,0
1,1,20228,1,156,85.0,140,90,3,1,0,0,1,1
2,2,18857,1,165,64.0,130,70,3,1,0,0,0,1


In [34]:
# Hold out 13% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(df, df['cardio'], test_size=0.13, random_state=42)

# Drop the row identifier and the target column from both feature frames.
X_train = X_train.drop(columns=['id', 'cardio'])
X_test = X_test.drop(columns=['id', 'cardio'])

# Create the output folder first so that to_csv does not fail when it is missing
# (e.g. on a fresh checkout followed by Restart & Run All).
Path("data_split").mkdir(parents=True, exist_ok=True)

#save test
X_test.to_csv("data_split/X_test.csv", index=False)
y_test.to_csv("data_split/y_test.csv", index=False)

#save train
X_train.to_csv("data_split/X_train.csv", index=False)
y_train.to_csv("data_split/y_train.csv", index=False)

In [24]:
class ColumnSelector(BaseEstimator, TransformerMixin):
    """
    Stateless transformer that returns ``X[key]`` from its input.

    With a DataFrame input and a single column label as ``key`` this is
    typically a one-dimensional Series, which is the form handed to the
    next step of a pipeline.
    """
    def __init__(self, key):
        self.key = key

    def fit(self, X, y=None):
        # Nothing is learned from the data; present only to satisfy the estimator API.
        return self

    def transform(self, X):
        selected = X[self.key]
        return selected

class NumberSelector(BaseEstimator, TransformerMixin):
    """
    Stateless transformer that returns ``X[[key]]`` from its input.

    The key is wrapped in a list, so for a DataFrame input the result is a
    one-column DataFrame (two-dimensional) rather than a Series.
    Intended for the numeric columns of the data.
    """
    def __init__(self, key):
        self.key = key

    def fit(self, X, y=None):
        # Nothing is learned from the data; present only to satisfy the estimator API.
        return self

    def transform(self, X):
        wanted = [self.key]
        return X[wanted]

class OHEEncoder(BaseEstimator, TransformerMixin):
    """
    One-hot encode one categorical column into a fixed, ordered set of dummy columns.

    Parameters
    ----------
    key : str
        Name of the column to encode; also used as the prefix of the output
        columns, which are named ``f'{key}_{value}'``.
    values : iterable, optional
        Category values that define the output columns and their order.
        When omitted, the values seen during ``fit`` are used, in order of
        first appearance.

    The input of ``fit``/``transform`` may be either a DataFrame that contains
    ``key`` or the already-selected column as a Series.
    """
    def __init__(self, key, values=None):
        self.key = key
        # Output column names; filled in by fit().
        self.columns = []
        self.values = values

    def _column(self, X):
        """Return the data to encode: X itself if it is a Series, otherwise X[key]."""
        if isinstance(X, pd.Series):
            return X
        return X[self.key]

    def fit(self, X, y=None):
        """Determine the output column names and return self."""
        if self.values is not None:
            # Explicit category list supplied by the caller.
            levels = self.values
        else:
            # No list given: derive the categories from the data. Going through
            # _column() keeps this working when X is a Series, which plain
            # X[self.key] would not.
            levels = self._column(X).unique()
        self.columns = [f'{self.key}_{value}' for value in levels]
        return self

    def transform(self, X):
        """Return a DataFrame holding exactly ``self.columns``, in that order."""
        dummies = pd.get_dummies(self._column(X), prefix=self.key)
        # Categories absent from X become all-zero columns; categories that were
        # not registered in fit() are dropped.
        return dummies.reindex(columns=self.columns, fill_value=0)

In [25]:
# Name of the label column, and names of the input columns.
target = 'cardio'
features = [
    'age', 'gender', 'height', 'weight',
    'ap_hi', 'ap_lo',
    'cholesterol', 'gluc',
    'smoke', 'alco', 'active',
]

In [26]:
# Pipelines

def _scaled_feature(key):
    """Sub-pipeline that selects column `key` as a 2-D frame and standardizes it."""
    return Pipeline([
        ('selector', NumberSelector(key=key)),
        ('standard', StandardScaler())
    ])


def _one_hot_feature(key, values):
    """Sub-pipeline that selects column `key` and one-hot encodes it over `values`."""
    return Pipeline([
        ('selector', ColumnSelector(key=key)),
        (f'enc_{key}', OHEEncoder(key=key, values=values))
    ])


def _raw_feature(key):
    """Sub-pipeline that selects column `key` as a 2-D frame and passes it through unchanged."""
    return Pipeline([
        ('selector', NumberSelector(key=key))
    ])


# Standardized columns.
age = _scaled_feature('age')
height = _scaled_feature('height')
weight = _scaled_feature('weight')
ap_hi = _scaled_feature('ap_hi')
ap_lo = _scaled_feature('ap_lo')

# One-hot encoded columns with an explicit, fixed category list.
gender = _one_hot_feature('gender', values=[1, 2])
cholesterol = _one_hot_feature('cholesterol', values=[1, 2, 3])

# Columns passed to the model as they are.
gluc = _raw_feature('gluc')
smoke = _raw_feature('smoke')
alco = _raw_feature('alco')
active = _raw_feature('active')

# The order of this list is the order of the columns in the combined feature matrix.
feats = FeatureUnion([('age', age),
                      ('gender', gender),
                      ('height', height),
                      ('weight', weight),
                      ('ap_hi', ap_hi),
                      ('ap_lo', ap_lo),
                      ('cholesterol', cholesterol),
                      ('gluc', gluc),
                      ('smoke', smoke),
                      ('alco', alco),
                      ('active', active)])

In [27]:
%%time

pipeline = Pipeline([
    ('features', feats),
    ('classifier', GradientBoostingClassifier()),
])

pipeline.fit(X_train, y_train)

CPU times: total: 4.7 s
Wall time: 4.74 s


In [28]:
# Show the (name, estimator) pairs that make up the fitted pipeline.
pipeline.steps

[('features',
  FeatureUnion(transformer_list=[('age',
                                  Pipeline(steps=[('selector',
                                                   NumberSelector(key='age')),
                                                  ('standard',
                                                   StandardScaler())])),
                                 ('gender',
                                  Pipeline(steps=[('selector',
                                                   ColumnSelector(key='gender')),
                                                  ('enc_gender',
                                                   OHEEncoder(key='gender',
                                                              values=[1, 2]))])),
                                 ('height',
                                  Pipeline(steps=[('selector',
                                                   NumberSelector(key='height')),
                                                  ('standard',
   

In [29]:
# Create the target folder first so that open() does not fail when it is missing.
Path("model").mkdir(parents=True, exist_ok=True)

# Serialize the fitted pipeline to disk with dill.
with open("model/cardio.dill", "wb") as f:
    dill.dump(pipeline, f)

In [30]:
# gradient boosting

model_gb = Pipeline([
    ('features', feats),
    ('classifier', GradientBoostingClassifier(random_state=42)),
])

model_gb.fit(X_train, y_train)

# Predicted probability of the second class (label 1 for this 0/1 target).
preds = model_gb.predict_proba(X_test)[:, 1]

precision, recall, thresholds = precision_recall_curve(y_test, preds)

# F-score at every operating point. Where precision + recall == 0 the score is
# set to 0 instead of NaN, because np.argmax would otherwise return the NaN position.
pr_sum = precision + recall
fscore = np.divide(2 * precision * recall, pr_sum, out=np.zeros_like(pr_sum), where=pr_sum > 0)

# precision/recall have one more entry than thresholds (a final point with no
# threshold attached), so that last entry is excluded to keep `ix` valid for
# indexing `thresholds`.
# locate the index of the largest f score
ix = np.argmax(fscore[:-1])
print(f'Best Threshold={thresholds[ix]}, F-Score={fscore[ix]:.3f}, Precision={precision[ix]:.3f}, Recall={recall[ix]:.3f}')

Best Threshold=0.38154779797343324, F-Score=0.750, Precision=0.702, Recall=0.804


In [33]:
# Write the installed package versions to requirements.txt (overwrites the file).
# NOTE(review): `!pip` runs the pip found on the shell PATH, which may differ from the kernel's environment — confirm.
!pip freeze > requirements.txt

