In [1]:
import pandas as pd
import numpy as np
import os
from matplotlib import pyplot as plt
import seaborn as sns
from Models import probitModel, logisticModel
from util import *
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import *
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import *
from sklearn.metrics import make_scorer, f1_score
from imblearn.over_sampling import SMOTE
from sklearn.pipeline import Pipeline
from joblib import Memory
from shutil import rmtree

from sklearn.metrics import get_scorer

In [None]:
# Seed for the stochastic parts of the notebook; it is passed as `random_state`
# to the cross-validation helper used by `probit_cv` further down.
SEED = 1

# The training table is read from a path relative to the notebook's working directory.
data_folder = 'data'
df = pd.read_csv(os.path.join(data_folder,'mimiciv_traindata.csv'))
# Preview the first rows as a quick sanity check of the load.
df.head()

In [3]:
# Features: every column except the target. Target: the `mortality` column as an array.
x = df.drop(columns='mortality')
y = df['mortality'].values


In [None]:
# Class balance of the target, as fractions: printed as text, then drawn as a bar chart.
print(df['mortality'].value_counts(normalize=True))
df['mortality'].value_counts(normalize=True).plot.bar()
# Alternative left disabled: sns.countplot(x='mortality', data=df)

# GridSearchCV for the weight `w` of the weighted loss

### Probit

In [5]:
# Ordered (name, estimator) steps for the probit classifier: power-transform scaling,
# then KNN imputation of NaNs, then the model. The step name 'model' is what the
# grid-search key `model__w` refers to.
# Commented-out entries are alternative scalers / imputers / oversampling left disabled.
probit_pipeline = [
    #('scaler', RobustScaler(unit_variance=True)),
    #('scaler', QuantileTransformer(output_distribution='normal', ignore_implicit_zeros=False)),
    ('scaler', PowerTransformer(method='yeo-johnson', standardize=True)),
    #('imputer', SimpleImputer(missing_values=np.nan, strategy='mean')),
    ('imputer', KNNImputer(missing_values=np.nan, n_neighbors=10)),
   # ('smote', SMOTE(random_state = SEED)),
    ('model',probitModel()),
]

In [None]:
# Grid-search the probit model's `w` (0.1 up to, but excluding, 1.0 in steps of 0.05)
# with 5-fold cross-validation, scored by F1, using all available cores.
pipe = Pipeline(steps=probit_pipeline)
param_grid = {'model__w': np.arange(0.1, 1.0, 0.05)}
search = GridSearchCV(
    estimator=pipe,
    param_grid=param_grid,
    scoring=make_scorer(f1_score),
    cv=5,
    n_jobs=-1,
    verbose=3,
)
search.fit(x, y)

In [None]:
# Parameter setting with the best cross-validated score from the most recently fitted `search`.
search.best_params_

### Logistic Model

In [None]:
# Ordered (name, estimator) steps for the logistic classifier: power-transform scaling,
# then KNN imputation of NaNs, then the model. The step name 'model' is what the
# grid-search key `model__w` refers to.
# Commented-out entries are alternative scalers / imputers / oversampling left disabled.
logistic_pipeline = [
    #('scaler', RobustScaler(unit_variance=True)),
    #('scaler', QuantileTransformer(output_distribution='normal', ignore_implicit_zeros=False)),
    ('scaler', PowerTransformer(method='yeo-johnson', standardize=True)),
    #('imputer', SimpleImputer(missing_values=np.nan, strategy='mean')),
    ('imputer', KNNImputer(missing_values=np.nan, n_neighbors=10)),
   # ('smote', SMOTE(random_state = SEED)),
    ('model', logisticModel()),
]

In [None]:
# Grid-search the logistic model's `w` (0.1 up to, but excluding, 1.0 in steps of 0.05)
# with 5-fold cross-validation, scored by F1, using all available cores.
# The pipeline must be built from `logistic_pipeline`; building it from
# `probit_pipeline` would silently repeat the probit search in this section.
pipe = Pipeline(logistic_pipeline)
param_grid = {
    'model__w': np.arange(0.1,1.0,0.05),
}
search = GridSearchCV(pipe, param_grid, cv=5, scoring=make_scorer(f1_score), n_jobs=-1, verbose=3)
search.fit(x,y)

In [None]:
# Parameter setting with the best cross-validated score from the most recently fitted `search`.
search.best_params_

# Bayesian Optimization

In [6]:
from bayes_opt import BayesianOptimization

In [7]:
def probit_cv(l1, l2):
    """Objective function for the Bayesian search over the probit model's ``l1`` / ``l2``.

    Builds a fresh scaler -> imputer -> probit pipeline and scores it with the
    project's ``cv`` helper on the notebook-level ``x`` / ``y``.

    Parameters
    ----------
    l1, l2 : float
        Forwarded unchanged to ``probitModel`` (presumably regularisation
        strengths -- confirm against ``Models.probitModel``).

    Returns
    -------
    The value returned by ``cv`` for ``f1_score`` (presumably the mean F1 over
    5 folds -- confirm against ``util.cv``).
    """
    # `w` is held fixed at 0.75 here; only l1 and l2 vary between calls.
    steps = [
        ('scaler', PowerTransformer(method='yeo-johnson', standardize=True)),
        ('imputer', KNNImputer(missing_values=np.nan, n_neighbors=10)),
        ('model', probitModel(l1=l1, l2=l2, w=0.75)),
    ]
    # Features are passed as a plain array (`x.values`); SEED keeps the helper's randomness fixed.
    return cv(steps, x.values, y, f1_score, 5, random_state=SEED)

In [8]:
# Bayesian optimiser that maximises `probit_cv` over l1 and l2, each bounded to [0, 5].
# NOTE: despite the `xgb_` prefix, this object tunes the probit pipeline, not an
# XGBoost model; the name is kept because later cells refer to it.
xgb_bo = BayesianOptimization(
    probit_cv,
    pbounds={
        'l1': (0, 5),
        'l2': (0, 5),
    },
    random_state=SEED,  # fix the optimiser's sampling so reruns explore the same points
    verbose=2,
)

In [None]:
from datetime import datetime

# Wall-clock timestamp taken immediately before the search starts.
start = datetime.now()

# 3 initial points followed by 100 optimisation iterations; every evaluation
# calls `probit_cv`, so this cell is the expensive one.
xgb_bo.maximize(init_points=3, n_iter=100)

# Elapsed wall-clock time as a `timedelta`.
time_elapsed = datetime.now() - start

print('Time elapsed:',time_elapsed)