In [None]:
import os

import numpy as np
import pandas as pd
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn.svm import SVC
from sklearn.metrics import classification_report, RocCurveDisplay
from sklearn.metrics import make_scorer, roc_auc_score, recall_score, accuracy_score, confusion_matrix

# Loading and cleaning Dataset

In [None]:
# Notebook configuration: both paths are left blank and must be filled in before running.
# NOTE(review): database_file is presumably the CSV consumed by load_clean_dataset — confirm.
database_file = ""
# Directory that the result CSV files are written into (joined with file names via os.path.join).
output_location = ""

In [None]:
def load_clean_dataset(csv_file, na_perc_limit, columns_to_delete=None):
    """Load a CSV file and strip sparse columns and incomplete rows.

    Steps, in order: read the CSV, drop ``columns_to_delete``, drop every
    remaining column whose fraction of missing values is strictly greater than
    ``na_perc_limit``, then drop every row that still contains a missing value.
    Progress is reported with ``print``.

    Parameters
    ----------
    csv_file : str, path object or file-like
        Anything accepted by ``pandas.read_csv``.
    na_perc_limit : float
        Maximum tolerated fraction (0-1) of missing values per column.
    columns_to_delete : str or list of str, optional
        Column label(s) removed unconditionally before the missing-value check.
        ``None`` (the default) removes nothing.

    Returns
    -------
    pandas.DataFrame
        The cleaned table, free of missing values.

    Raises
    ------
    KeyError
        If a label in ``columns_to_delete`` is not a column of the CSV.
    """
    # A None default avoids sharing one mutable list object between calls.
    if columns_to_delete is None:
        columns_to_delete = []

    loaded_df = pd.read_csv(csv_file)
    print(loaded_df.columns)
    print(loaded_df.shape)
    loaded_df = loaded_df.drop(columns=columns_to_delete)

    print("Cleaning dataset... \n")
    # isna().mean() is the per-column fraction of missing values. On a table
    # with no rows it yields NaN (never above the limit) instead of dividing by
    # zero, and it avoids the rounding error of computing 1 - present/total.
    na_fractions = loaded_df.isna().mean()
    sparse_columns = []
    for col, na_per in na_fractions.items():
        if na_per > na_perc_limit:
            print(f"Column {col} --> %NaN = {na_per}. Removed")
            sparse_columns.append(col)
    loaded_df = loaded_df.drop(columns=sparse_columns)

    print("\n Dropping rows with NaNs...")
    loaded_df = loaded_df.dropna()
    print(f"\nFinal columns: {loaded_df.columns}")
    print(loaded_df.shape)
    return loaded_df

# Columns removed before the missing-value cleaning.
# NOTE(review): presumably identifiers, dates and outcome-related fields — confirm with the data owner.
columns_to_delete = ["Unnamed: 0", "person_id", "fecha_ingreso_urgencias", "shock_septico", "foco", "sintoma_nan", "fecha_nacimiento", "codigo_postal"]
# Read from the configured dataset path; columns with more than 25% missing values are dropped.
processed_df = load_clean_dataset(
    csv_file = database_file,
    na_perc_limit = 0.25,
    columns_to_delete = columns_to_delete
)

In [None]:
# List every column that survived cleaning, one name per line.
for column_name in processed_df.columns:
    print(column_name)

In [None]:
# Report the (rows, columns) shape of the cleaned table that feeds the train/test split.
print(f"Training table size: {processed_df.shape}")

### Splitting data into train/test

In [None]:
# Separate the "sepsis" target column from the feature columns.
X = processed_df.drop("sepsis", axis=1)
y = processed_df["sepsis"]

# A fixed seed keeps the 70/30 split reproducible across kernel restarts
# (same seed as the split performed inside run_testing).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=99)

### Defining models and hyperparameters for grid search

In [None]:
# Exhaustive (grid) search spaces. Each entry maps a model name to the estimator
# instance ("model") and the list of candidate values per hyperparameter ("params").
log_dict = {
    "logistic_regression": {
        "model": LogisticRegression(),
        "params": {
            "solver": ["liblinear", "saga"],
            "C": [0.1, 1, 10, 100, 250, 500]
        }
    }
}
randfor_dict = {
    "random_forest": {
        "model": RandomForestClassifier(),
        "params": {
            "criterion": ["gini", "entropy", "log_loss"],
            "n_estimators": [10, 50, 100, 250, 500],
            "max_depth": [None, 10, 20, 30],
            "min_samples_split": [5, 10, 20, 30, 50]
        }
    }
}
bayes_dict = {
    "bernoulli_bayes": {
        "model": BernoulliNB(),
        "params": {
            "alpha": [0.1, 0.5, 1.0, 1.5, 2.0],
            "fit_prior": [True, False],
            "binarize": [0.0, 0.5, 1.0, 1.5, 2.0]
        }
    }
}
grad_boost_dict = {
    "gradient_boosting": {
        "model": GradientBoostingClassifier(),
        "params": {
            "n_estimators": [100, 200, 300],
            "learning_rate": [0.01, 0.05, 0.1],
            "max_depth": [3, 4, 5],
            "subsample": [0.8, 0.9, 1.0],
            "min_samples_split": [2, 5, 10]
        }
    }
}
# Disabled: support-vector classifier search space (not part of grid_search_list).
# svc_dict = {
#     "svc": {
#         "model": SVC(),
#         "params": {
#             "C": [0.1, 1, 10, 100],
#             "kernel": ["linear", "rbf", "poly", "sigmoid"],
#             "gamma": ["scale", "auto"]
#         }
#     }
# }
grid_search_list = (log_dict, randfor_dict, bayes_dict, grad_boost_dict)
# Flatten the per-model dicts into a single {model_name: {"model", "params"}} mapping.
all_grid_models = {k:v for model_dict in grid_search_list for k,v in model_dict.items()}

In [None]:
from scipy.stats import uniform, loguniform, randint

# Randomized search spaces. Same layout as the grid dictionaries, but continuous
# and integer hyperparameters are given as scipy.stats distributions to sample from.
# Reminder on scipy conventions: uniform(loc, scale) samples from [loc, loc + scale]
# and randint(low, high) excludes `high`.
log_dict = {
    "logistic_regression": {
        "model": LogisticRegression(),
        "params": {
            "solver": ["liblinear", "saga"],
            "C": loguniform(0.1, 500)
        }
    }
}

randfor_dict = {
    "random_forest": {
        "model": RandomForestClassifier(),
        "params": {
            "criterion": ["gini", "entropy", "log_loss"],
            "n_estimators": randint(10, 501),
            "max_depth": [None, 10, 20, 30],
            "min_samples_split": randint(5, 51)
        }
    }
}

bayes_dict = {
    "bernoulli_bayes": {
        "model": BernoulliNB(),
        "params": {
            # Samples from [0.1, 2.1], i.e. slightly wider than the grid's 0.1-2.0 range.
            "alpha": uniform(0.1, 2.0),
            "fit_prior": [True, False],
            "binarize": uniform(0.0, 2.0)
        }
    }
}

grad_boost_dict = {
    "gradient_boosting": {
        "model": GradientBoostingClassifier(),
        "params": {
            "n_estimators": randint(100, 301),
            "learning_rate": loguniform(0.01, 0.1),
            "max_depth": randint(3, 6),
            # Samples from [0.8, 1.0].
            "subsample": uniform(0.8, 0.2),
            "min_samples_split": randint(2, 11)
        }
    }
}

random_search_list = (log_dict, randfor_dict, bayes_dict, grad_boost_dict)
# Flatten the per-model dicts into a single {model_name: {"model", "params"}} mapping.
all_random_models = {k:v for model_dict in random_search_list for k,v in model_dict.items()}

### Defining search method arguments

The F1 score is widely used with clinical data because it balances precision and recall; it is therefore the metric used to select (refit) the best model in the searches below.

In [None]:

# Search configurations consumed by cv_search.
# "search_model" holds the search CLASS (not an instance): cv_search instantiates it
# once per model with that model's estimator and parameter space.
# "search_params" are the keyword arguments forwarded to that constructor.
# All four metrics are recorded in cv_results_; "refit" selects f1 as the one that
# determines best_score_ / best_params_.
random_search_args = {
    "search_model": RandomizedSearchCV,
    "search_params": {
        "cv":10, "n_iter":1000, "n_jobs":4, "scoring":{
            "f1": "f1",
            "AUC": "roc_auc",
            "Accuracy": "accuracy",
            "Recall": "recall"
        },
        "refit": "f1"
    }
}

grid_search_args = {
    "search_model": GridSearchCV,
    "search_params": {
        "cv":5, "scoring":{
            "f1": "f1",
            "AUC": "roc_auc",
            "Accuracy": "accuracy",
            "Recall": "recall"
        },
        "refit": "f1"
    }
}



### Training

In [None]:
def cv_search(models_params, search_args, X_fit=None, y_fit=None):
    """Run one hyperparameter search per model and collect the outcomes.

    Parameters
    ----------
    models_params : dict
        ``{model_name: {"model": estimator, "params": search_space}}``.
    search_args : dict
        ``{"search_model": search_class, "search_params": kwargs}``. The class
        is called as ``search_class(estimator, search_space, **kwargs)`` and the
        resulting object must provide ``fit`` plus the ``best_score_``,
        ``best_params_`` and ``cv_results_`` attributes.
    X_fit, y_fit : optional
        Training features and target. When omitted, the notebook-level
        ``X_train`` / ``y_train`` are used, as in earlier versions of this
        function.

    Returns
    -------
    list of dict
        One entry per model with the keys ``"model"``, ``"best_score"``,
        ``"best_params"`` and ``"cv_results"``.
    """
    print("Given search args: ", search_args)
    if X_fit is None:
        X_fit = X_train
    if y_fit is None:
        y_fit = y_train

    search_engine = search_args["search_model"]
    # Only the constructor keyword arguments are forwarded, not the whole config dict.
    search_params = search_args["search_params"]

    scores_list = []
    for model_name, mp in models_params.items():
        cv_classifier = search_engine(mp["model"], mp["params"], **search_params)
        cv_classifier.fit(X_fit, y_fit)
        print(f"{model_name} classifier results:\n", cv_classifier.best_score_, cv_classifier.best_params_)
        scores_list.append({
            "model": model_name,
            "best_score": cv_classifier.best_score_,
            "best_params": cv_classifier.best_params_,
            "cv_results": cv_classifier.cv_results_
        })
    return scores_list


def _feature_weights(model):
    """Return the per-feature weights a fitted model exposes, or None if it has none."""
    if hasattr(model, "feature_log_prob_"):
        # First row of the per-class log-probability matrix.
        return model.feature_log_prob_[0]
    if hasattr(model, "feature_importances_"):
        return model.feature_importances_
    return None


def run_testing(processed_df, models_dict, search_args):
    """Search hyperparameters, refit the best model and score it on a held-out set.

    The table is split 70/30 with a fixed seed, every model in ``models_dict``
    is searched on the training part only, and the model with the highest
    ``best_score`` is refitted with its best parameters and evaluated on the
    test part. Result tables are written as CSV files into the notebook-level
    ``output_location``.

    Parameters
    ----------
    processed_df : pandas.DataFrame
        Cleaned table containing the target column ``"sepsis"``.
    models_dict : dict
        ``{model_name: {"model": estimator, "params": search_space}}``. The
        estimator of the winning model is updated and fitted in place.
    search_args : dict
        Search configuration forwarded to ``cv_search``.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``results_df`` (one row per model, including its raw ``cv_results``)
        and ``class_report_df`` (classification report plus a ``roc_auc`` column).

    Raises
    ------
    ValueError
        If ``models_dict`` is empty.
    """
    if not models_dict:
        raise ValueError("models_dict must contain at least one model")

    X = processed_df.drop("sepsis", axis=1)
    y = processed_df["sepsis"]

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=99)

    # Search on this function's own training split so that no test row is ever
    # seen during cross-validation.
    scores = cv_search(models_params=models_dict, search_args=search_args, X_fit=X_train, y_fit=y_train)

    # Convert the results to a DataFrame
    results_df = pd.DataFrame(scores, columns=["model", "best_score", "best_params", "cv_results"])

    # The summary CSV omits the nested cv_results; those go to their own file,
    # tagged with the model they belong to.
    results_df.drop(columns="cv_results").to_csv(os.path.join(output_location, "results_df.csv"))
    cv_results_df = pd.concat(
        [pd.DataFrame(entry["cv_results"]).assign(model=entry["model"]) for entry in scores],
        ignore_index=True,
    )
    cv_results_df.to_csv(os.path.join(output_location, "cv_results.csv"))

    # Display the results
    print(results_df)

    # Select the best model and evaluate it on the test set
    best_row = results_df.loc[results_df["best_score"].idxmax()]
    best_model_name = best_row["model"]
    best_model_params = best_row["best_params"]

    best_model = models_dict[best_model_name]["model"]
    best_model.set_params(**best_model_params)
    best_model.fit(X_train, y_train)

    y_pred = best_model.predict(X_test)
    y_pred_prob = best_model.predict_proba(X_test)[:, 1]

    print(f"Found best model: {best_model_name} with parameters: {best_model_params}")
    print("Scores table: \n", classification_report(y_test, y_pred))

    weights = _feature_weights(best_model)
    if weights is not None:
        # One row per feature, largest weight first.
        importance_df = pd.DataFrame(
            {"Column": X_train.columns, "Weight": weights}
        ).sort_values("Weight", ascending=False)
        importance_df.to_csv(os.path.join(output_location, f"{best_model_name}_importance_df.csv"))
    else:
        print(f"Could not extract importance values for best model {best_model_name}:{best_model_params}")
    class_report_df = pd.DataFrame(classification_report(y_test, y_pred, output_dict=True)).transpose()
    class_report_df["roc_auc"] = roc_auc_score(y_test, y_pred_prob)
    class_report_df.to_csv(os.path.join(output_location, "class_report.csv"))
    return results_df, class_report_df

# Evaluate the random-forest search space with the randomized-search configuration.
models_dict = randfor_dict

results_df, class_report_df = run_testing(processed_df, models_dict, random_search_args)


In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# results_df has one row per searched model; take the cross-validation results of the
# first row (positional access, since results_df[0] would be a column lookup).
cv_results_df = pd.DataFrame(results_df["cv_results"].iloc[0])
fig, ax = plt.subplots(figsize=(12,6))
# One density curve per metric, showing how the mean CV score is distributed
# over the sampled hyperparameter candidates.
for col in ["mean_test_AUC", "mean_test_f1", "mean_test_Accuracy", "mean_test_Recall"]:
    sns.kdeplot(data=cv_results_df, x=col, label=col, ax=ax)
ax.set(xlabel="Mean cross-validated score", title="Distribution of CV scores across candidates")
ax.legend()

plt.tight_layout()
plt.show()

In [None]:
# Sanity check: a fitted naive-Bayes model should expose one weight per training column.
# `best_model` is created inside run_testing and is not returned, so it only exists at
# notebook scope if it was assigned there separately; skip the check instead of raising
# NameError on a fresh kernel.
fitted_model = globals().get("best_model")
if fitted_model is not None and hasattr(fitted_model, "feature_log_prob_"):
    print(len(fitted_model.feature_log_prob_[0]), len(X_train.columns))
else:
    print("best_model with feature_log_prob_ is not available at notebook scope; skipping check")

# Deep Learning approach

In [None]:
import tensorflow