In [1]:
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction import DictVectorizer
from sklearn.impute import KNNImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import recall_score
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.tree import DecisionTreeClassifier

from processing.preprocessors import (BinaryEncoder, CleanStrings,
                                      ColumnDropperTransformer)


In [2]:
# Importing the data
data = pd.read_csv("healthcare-dataset-stroke-data.csv")

In [3]:
# Cleaning the columns
columns = [cols.lower().replace(" ","_") for cols in data]
data.columns = columns

In [4]:
# Storing the column definitions so can run them through the pipeline
id = 'id'
target = 'stroke'


categorical_columns = ['gender', 'hypertension', 'heart_disease', 'ever_married',
       'work_type', 'residence_type', 'smoking_status']

numerical_columns = ['avg_glucose_level', 'bmi']

missing_data_num_columns = ['bmi']

binary_encoder_column = "ever_married"

categorical_binary_classifier_column = ['heart_disease']

In [5]:
# Split the data into train and test
train, test = train_test_split(data, test_size=0.1, random_state=1)

train.shape, test.shape




((4599, 12), (511, 12))

In [6]:
# Defining the Pipeline Objects
preprocess_pipeline = Pipeline(
    [
        ("dropping_id_column",
        
            ColumnDropperTransformer(
                column_list=[id]
            )
        ),
        ("binary_encoder",
            BinaryEncoder(
                column_name=binary_encoder_column
            )
        ),
        ("cleaning_strings",
            CleanStrings(
                column_list=categorical_columns
            )
        )
    ])

transform_pipeline = Pipeline(
    [   
        ("dict_vectorizer",
            DictVectorizer(sparse=False)
        ),
        ("scaling_data",
            MinMaxScaler()
        ),
        ("multiple_numeric_values_imputation",
            KNNImputer(add_indicator=True)
        )
    ]
)

In [7]:
# Preprocessed data
preprocessed_train = preprocess_pipeline.fit_transform(train.drop([target],1))
preprocessed_test = preprocess_pipeline.transform(test.drop([target],1))

  preprocessed_train = preprocess_pipeline.fit_transform(train.drop([target],1))
  preprocessed_test = preprocess_pipeline.transform(test.drop([target],1))


In [8]:
# Converting dataframes to dict objects
train_dict = preprocessed_train.to_dict(orient='records')
test_dict = preprocessed_test.to_dict(orient='records')

In [9]:
# Transforming the data and getting ready for training the pipeline
X_train = transform_pipeline.fit_transform(train_dict)
X_test = transform_pipeline.transform(test_dict)

y_train = train[target]
y_test = test[target]

In [17]:
# Defining the models

d_param_grid = {
    'max_features': [None, 'sqrt', 'log2'],
    'max_depth' : [4,5,6,7,8,10,20],
    'min_samples_leaf' : [1,3,5,10,20],
    'criterion' : ['gini', 'entropy'],
    'random_state' : [1], 
    'class_weight' : ['balanced']
}
d_clf = DecisionTreeClassifier(random_state=1, class_weight='balanced')


lr_param_grid = {
    "C":np.logspace(-3,3,7), 
    "max_iter": [1000,2000,5000,10000]
    } # l1 lasso l2 ridge
    
lr_clf = LogisticRegression()


In [18]:
# Training the models
%time

d_clf_cv = GridSearchCV(estimator=d_clf, param_grid=d_param_grid, cv=5, scoring='recall')
d_clf_cv.fit(X_train, y_train)

lr_clf_cv = GridSearchCV(estimator=lr_clf, param_grid=lr_param_grid, cv=5, scoring='recall')
lr_clf_cv.fit(X_train, y_train)

lr_best_params = lr_clf_cv.best_params_
d_best_params = d_clf_cv.best_params_

CPU times: user 2 µs, sys: 1 µs, total: 3 µs
Wall time: 7.15 µs


In [26]:
# Training the best models
lr_best_clf = LogisticRegression(**lr_best_params)
d_best_clf = DecisionTreeClassifier(**d_best_params)

lr_best_clf.fit(X_train, y_train)
d_best_clf.fit(X_train, y_train)

In [27]:
# Evaluate the models
def evaluate(model, X_val, y_val):
    """Evaluation function to return AUC"""

    predictions = model.predict(X_val)
    recall = recall_score(y_val, predictions)
    return recall


d_recall = evaluate(d_best_clf, X_val=X_test, y_val=y_test)
lr_recall = evaluate(lr_best_clf, X_val=X_test, y_val=y_test)

d_recall, lr_recall


(0.7058823529411765, 0.058823529411764705)

In [None]:
# Xgboost framework