In [None]:
import pandas as pd
import numpy as np
import os
from matplotlib import pyplot as plt
import seaborn as sns
from Models import probitModel, logisticModel
from util import *
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import *
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import *
from sklearn.metrics import make_scorer, f1_score
from imblearn.over_sampling import SMOTE
from sklearn.pipeline import Pipeline
from joblib import Memory
from shutil import rmtree

from sklearn.metrics import get_scorer
from datetime import datetime

from lightgbm import LGBMClassifier
from bayes_opt import BayesianOptimization

import warnings
warnings.filterwarnings('ignore', category=UserWarning, module='lightgbm')

In [None]:
# Seed handed to the cross-validation helper calls further down.
SEED = 1

# The training table is read from the local `data` directory.
data_folder = 'data'
train_csv = os.path.join(data_folder, 'mimiciv_traindata.csv')
df = pd.read_csv(train_csv)
df.head()

In [None]:
# Separate the predictors from the `mortality` target column.
target_col = 'mortality'
x = df.drop(columns=target_col)
y = df[target_col].values


In [None]:
# Relative frequency of each target class: printed, then drawn as a bar chart.
mortality_share = df['mortality'].value_counts(normalize=True)
print(mortality_share)
mortality_share.plot(kind='bar')
# Alternative view: sns.countplot(x='mortality', data=df)

# Grid search (GridSearchCV) for the weight `w` of the weighted loss

### Probit

In [None]:
# Pipeline steps for the probit model: quantile-transform to a normal
# distribution, KNN-impute the remaining NaNs, then fit the model.
probit_pipeline = [
    (
        'scaler',
        QuantileTransformer(ignore_implicit_zeros=False, output_distribution='normal'),
    ),
    (
        'imputer',
        KNNImputer(n_neighbors=10, missing_values=np.nan),
    ),
    ('model', probitModel()),
]

In [None]:
# Exhaustive search over the model's `w` parameter, scored by F1 with 5-fold CV.
pipe = Pipeline(steps=probit_pipeline)
param_grid = {'model__w': np.arange(0.1, 1.0, 0.05)}
search = GridSearchCV(
    estimator=pipe,
    param_grid=param_grid,
    scoring=make_scorer(f1_score),
    cv=5,
    n_jobs=-1,
    verbose=3,
)
search.fit(x, y)

In [None]:
# Best `model__w` found by the most recent `search.fit` call.
search.best_params_

### Logistic Model

In [None]:
# Pipeline steps for the logistic model: quantile-transform to a normal
# distribution, KNN-impute the remaining NaNs, then fit the model.
logistic_pipeline = [
    (
        'scaler',
        QuantileTransformer(ignore_implicit_zeros=False, output_distribution='normal'),
    ),
    (
        'imputer',
        KNNImputer(n_neighbors=10, missing_values=np.nan),
    ),
    ('model', logisticModel()),
]

In [None]:
# Grid search over the loss weight `w` for the LOGISTIC model.
# The pipeline must wrap `logistic_pipeline`; wrapping `probit_pipeline` here
# would silently repeat the probit search and report its result as logistic.
pipe = Pipeline(logistic_pipeline)
param_grid = {
    'model__w': np.arange(0.1, 1.0, 0.05),
}
# F1 on the positive class, 5-fold CV, all available cores.
search = GridSearchCV(pipe, param_grid, cv=5, scoring=make_scorer(f1_score), n_jobs=-1, verbose=3)
search.fit(x, y)

In [None]:
# Best `model__w` found by the most recent `search.fit` call.
search.best_params_

# Bayesian optimization of model hyperparameters

## Probit

In [None]:
def probit_cv(l1, l2):
    """Objective for the Bayesian search over the probit model's `l1`/`l2`.

    Builds a fresh scaler -> imputer -> probit step list (with ``w`` fixed at
    0.75) and passes it to ``cv`` together with ``x.values``, ``y``,
    ``f1_score``, ``5`` and ``random_state=SEED``.

    NOTE(review): ``cv`` is not defined in this notebook (presumably it comes
    from ``from util import *``); its result is assumed to be the mean F1
    over 5 folds -- confirm against util.
    """
    steps = [
        ('scaler', QuantileTransformer(ignore_implicit_zeros=False, output_distribution='normal')),
        ('imputer', KNNImputer(n_neighbors=10, missing_values=np.nan)),
        ('model', probitModel(l1=l1, l2=l2, w=0.75)),
    ]
    return cv(steps, x.values, y, f1_score, 5, random_state=SEED)

In [None]:
# Bayesian optimiser over the `l1` / `l2` arguments of `probit_cv`, each
# searched in [0, 5].
# random_state=SEED seeds the optimiser's own sampling so the sequence of
# probed points does not change between kernel restarts.
bo = BayesianOptimization(
    probit_cv,
    pbounds={
        'l1': (0, 5),
        'l2': (0, 5),
    },
    random_state=SEED,
    verbose=2,
)

In [None]:


# Wall-clock timing of the whole optimisation run
# (3 initial random probes followed by 100 optimisation steps).
start = datetime.now()
bo.maximize(n_iter=100, init_points=3)
time_elapsed = datetime.now() - start
print('Time elapsed:', time_elapsed)

In [None]:
# Best target value seen by the optimiser and the parameters that produced it.
print(bo.max)

## Logistic

In [None]:
def logistic_cv(l1, l2):
    """Objective for the Bayesian search over the logistic model's `l1`/`l2`.

    Builds a fresh scaler -> imputer -> logistic step list (with ``w`` fixed
    at 0.75) and passes it to ``cv`` together with ``x.values``, ``y``,
    ``f1_score``, ``5`` and ``random_state=SEED``.

    NOTE(review): ``cv`` is not defined in this notebook (presumably it comes
    from ``from util import *``); its result is assumed to be the mean F1
    over 5 folds -- confirm against util.
    """
    steps = [
        ('scaler', QuantileTransformer(ignore_implicit_zeros=False, output_distribution='normal')),
        ('imputer', KNNImputer(n_neighbors=10, missing_values=np.nan)),
        ('model', logisticModel(l1=l1, l2=l2, w=0.75)),
    ]
    return cv(steps, x.values, y, f1_score, 5, random_state=SEED)

In [None]:
# Bayesian optimiser over the `l1` / `l2` arguments of `logistic_cv`, each
# searched in [0, 5].
# random_state=SEED seeds the optimiser's own sampling so the sequence of
# probed points does not change between kernel restarts.
bo = BayesianOptimization(
    logistic_cv,
    pbounds={
        'l1': (0, 5),
        'l2': (0, 5),
    },
    random_state=SEED,
    verbose=2,
)

In [None]:
# Wall-clock timing of the whole optimisation run
# (3 initial random probes followed by 100 optimisation steps).
start = datetime.now()
bo.maximize(n_iter=100, init_points=3)
time_elapsed = datetime.now() - start
print('Time elapsed:', time_elapsed)

In [None]:
# Best target value seen by the optimiser and the parameters that produced it.
print(bo.max)

## LightGBM

In [None]:


def lightgbm_cv(
        max_depth, num_leaves,
        min_data_in_leaf, bagging_fraction,
        feature_fraction, lambda_l1, lambda_l2,
        min_split_gain, max_bin, drop_rate, max_drop
    ):
    """Objective for the Bayesian search over LightGBM (DART) hyperparameters.

    The optimiser supplies every argument as a float, so the parameters that
    LightGBM requires to be integers (``max_depth``, ``num_leaves``,
    ``min_data_in_leaf``, ``max_bin``, ``max_drop``) are truncated with
    ``int`` before use. The remaining arguments are forwarded unchanged.

    Returns the result of ``cv`` called with the scaler -> LGBMClassifier
    step list, ``x.values``, ``y``, ``f1_score``, ``5`` and
    ``random_state=SEED``.

    NOTE(review): ``cv`` is not defined in this notebook (presumably it comes
    from ``from util import *``); its result is assumed to be the mean F1
    over 5 folds -- confirm against util.
    """
    params = {
        # --- fixed settings ---
        "objective": "binary",
        "metric": None,
        "verbosity": -1,
        "boosting_type": "dart",  # alternative: "gbdt"
        "is_unbalance": True,
        "num_boost_round": 1000,
        "learning_rate": 0.03,
        # LightGBM only applies bagging_fraction when bagging_freq is non-zero
        # (its default is 0 = bagging disabled). Without this line the
        # optimiser would tune a parameter that has no effect on the model.
        "bagging_freq": 1,

        # --- tuned settings ---
        "max_depth": int(max_depth),
        "num_leaves": int(num_leaves),
        "min_data_in_leaf": int(min_data_in_leaf),
        "bagging_fraction": bagging_fraction,
        "feature_fraction": feature_fraction,
        "lambda_l1": lambda_l1,
        "lambda_l2": lambda_l2,
        "min_split_gain": min_split_gain,
        "max_bin": int(max_bin),
        # DART-specific parameters
        "drop_rate": drop_rate,
        "max_drop": int(max_drop),
    }

    lightgbm_pipeline = [
        ('scaler', QuantileTransformer(output_distribution='normal', ignore_implicit_zeros=False)),
        # No imputer step here (the KNNImputer used for the other models is
        # intentionally left out of this pipeline).
        ('model', LGBMClassifier(**params)),
    ]

    mean_score = cv(lightgbm_pipeline, x.values, y, f1_score, 5, random_state=SEED)

    return mean_score

In [None]:
# Bayesian optimiser over the arguments of `lightgbm_cv`.
# Bounds are continuous; `lightgbm_cv` truncates the integer-valued ones.
# random_state=SEED seeds the optimiser's own sampling so the sequence of
# probed points does not change between kernel restarts.
bo = BayesianOptimization(
    lightgbm_cv,
    pbounds={
        "max_depth": (3, 10),
        "num_leaves": (20, 100),
        "min_data_in_leaf": (1, 50),
        "bagging_fraction": (0.5, 1.0),
        "feature_fraction": (0.5, 1.0),
        "lambda_l1": (0, 10),
        "lambda_l2": (0, 10),
        "min_split_gain": (0, 0.1),
        "max_bin": (50, 255),
        # DART-specific parameters
        "drop_rate": (0, 0.5),
        "max_drop": (10, 50),
    },
    random_state=SEED,
    verbose=2,
)

In [None]:
# Wall-clock timing of the whole optimisation run
# (3 initial random probes followed by 100 optimisation steps).
start = datetime.now()
bo.maximize(n_iter=100, init_points=3)
time_elapsed = datetime.now() - start
print('Time elapsed:', time_elapsed)

In [None]:
# Best target value seen by the optimiser and the parameters that produced it.
print(bo.max)

In [None]:
# Rebuild the logistic step list with a default-constructed model and show it.
logistic_pipeline = [
    (
        'scaler',
        QuantileTransformer(ignore_implicit_zeros=False, output_distribution='normal'),
    ),
    (
        'imputer',
        KNNImputer(n_neighbors=10, missing_values=np.nan),
    ),
    ('model', logisticModel()),
]
logistic_pipeline