In [None]:
import utils
import dotenv
import os
import numpy as np
import pandas as pd
import xgboost as xgb
import catboost as cb
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score

In [None]:
# Load variables from a .env file into the process environment.
# NOTE(review): presumably this is where ROOT_DIR (read via os.getenv when the
# CSV is loaded) gets defined - confirm against the project's .env file.
dotenv.load_dotenv()

In [None]:
# Load the refined dataset from <ROOT_DIR>/data/finalrefined.csv.
# The path is assembled with os.path.join so it is not tied to Windows-style
# backslash separators, and a missing ROOT_DIR fails loudly instead of silently
# turning into the literal path "None\data\finalrefined.csv".
root_dir = os.getenv("ROOT_DIR")
if root_dir is None:
    raise RuntimeError(
        "ROOT_DIR is not set; define it in the environment or in the .env file"
    )
df = pd.read_csv(os.path.join(root_dir, "data", "finalrefined.csv"))

In [None]:
# Display the freshly loaded frame for a quick visual check.
df

In [None]:
# Build the mortality table through the project helper in `utils`.
# NOTE(review): the helper's output schema is not visible from this notebook.
# The merge further down needs a `subject_id` column in it, and presumably it
# also supplies the `mr_lods` column used to derive the target - confirm.
df_subject_mr = utils.aggregate_score_to_mortality()
df_subject_mr

In [None]:
# Keep a single mortality row per subject, then left-join it onto the main
# frame so every original row is preserved.
# NOTE(review): re-running this cell merges a second time and produces
# suffixed duplicate columns; re-run from the CSV load cell instead.
mortality_per_subject = df_subject_mr.drop_duplicates(subset='subject_id')
df = pd.merge(df, mortality_per_subject, on='subject_id', how='left')
df

In [None]:
# Derive the target column: each `mr_lods` value is passed through the project
# helper that maps a mortality rate to an ICU level.
icu_levels = df['mr_lods'].apply(utils.map_mortality_rate_to_icu_level)
df['icu_level'] = icu_levels
df['icu_level']

In [None]:
# Summary statistics of the derived target column.
df["icu_level"].describe()

In [None]:
# Number of rows per ICU level (class balance of the target).
df.groupby("icu_level")["icu_level"].count()

In [None]:
# Identifier columns are dropped from the frame before X and y are built.
columns_to_remove = [
    "subject_id",
    "hadm_id",
    "icustay_id",
]
df = df.drop(columns=columns_to_remove)

In [None]:
# Target: the ICU level derived from `mr_lods` in an earlier cell.
y = df['icu_level']

# Features: drop the target and also `mr_lods`. `icu_level` is computed
# directly from `mr_lods`, so leaving that column in X would leak the label
# into the features and make every downstream accuracy figure meaningless.
# NOTE(review): any other mortality-score columns brought in by the merge with
# df_subject_mr may leak in a similar way - confirm and drop them here if so.
X = df.drop(columns=['icu_level', 'mr_lods'])

In [None]:
# Hold out 20% of the rows for evaluation; the seed is fixed for reproducibility.
# NOTE(review): the split is not stratified on y - consider `stratify=y` if the
# ICU-level classes are imbalanced.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Base classifiers. Every estimator that accepts a seed is given the same one
# so repeated runs are comparable.
SEED = 42

# Tree ensembles and gradient boosting.
random_forest_model = RandomForestClassifier(random_state=SEED)
gbm_model = GradientBoostingClassifier(random_state=SEED)
xgb_model = xgb.XGBClassifier(random_state=SEED)
catboost_model = cb.CatBoostClassifier(random_state=SEED, verbose=0)

# Distance-, margin- and probability-based models.
knn_model = KNeighborsClassifier()
# probability=True makes predict_proba available on the SVM.
svm_model = SVC(probability=True, random_state=SEED)
logistic_regression_model = LogisticRegression(random_state=SEED)
naive_bayes_model = GaussianNB()

In [None]:
# (name, estimator) pairs in the form VotingClassifier expects. Building them
# from a dict keeps the display names unique; insertion order is preserved.
named_estimators = {
    'Random Forest': random_forest_model,
    'GBM': gbm_model,
    'KNN': knn_model,
    'SVM': svm_model,
    'Logistic Regression': logistic_regression_model,
    'Naive Bayes': naive_bayes_model,
    'XGBoost': xgb_model,
    'CatBoost': catboost_model,
}
ensemble_models = list(named_estimators.items())

In [None]:
# Hard-voting ensemble over all base classifiers.
hard_voting_ensemble = VotingClassifier(
    estimators=ensemble_models,
    voting='hard',
)

In [None]:
# Soft-voting ensemble over the same base classifiers.
soft_voting_ensemble = VotingClassifier(
    estimators=ensemble_models,
    voting='soft',
)

In [None]:
# Everything to be trained and compared: the eight base estimators, in the
# order they appear in `ensemble_models`, followed by the two voting ensembles.
base_estimators = [estimator for _, estimator in ensemble_models]
models = base_estimators + [hard_voting_ensemble, soft_voting_ensemble]