# Optimization using Optuna

In [None]:
import os
import joblib
import optuna
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.svm import SVC
from sklearn.model_selection import (train_test_split, cross_val_score)
from sklearn.preprocessing import MinMaxScaler
import matplotlib.pyplot as plt
from sklearn.metrics import (
    f1_score,
    recall_score,
    accuracy_score,
    precision_score,
    classification_report,
    roc_auc_score,
    log_loss,
    confusion_matrix,
    precision_recall_curve,
    auc,
    roc_curve
)
import seaborn as sns

In [None]:
def load_clean_dataset(loaded_df, na_perc_limit, columns_to_delete=None):
    """Drop unwanted and sparse columns, then drop every row that still has a NaN.

    Args:
        loaded_df: Input DataFrame. It is not modified; a new frame is returned.
        na_perc_limit: Maximum tolerated fraction of missing values per column.
            Columns whose missing fraction is strictly greater are removed.
        columns_to_delete: Column labels removed up front. ``None`` (the
            default) means "no columns". Labels that are not present raise
            ``KeyError`` (pandas' default ``drop`` behaviour).

    Returns:
        A new DataFrame without the removed columns and without any row
        containing a missing value.
    """
    # A list literal as default would be shared between calls.
    if columns_to_delete is None:
        columns_to_delete = []

    loaded_df = loaded_df.drop(columns=columns_to_delete)
    tot = loaded_df.shape[0]

    # With zero rows there is no missing fraction to compute (and the
    # division would be 0 / 0), so the sparse-column filter is skipped.
    if tot:
        # Same arithmetic as 1 - non_null / total, evaluated for all columns at once.
        na_fraction = 1 - loaded_df.count() / tot
        sparse_columns = na_fraction.index[na_fraction > na_perc_limit]
        loaded_df = loaded_df.drop(columns=sparse_columns)

    return loaded_df.dropna()

def combine_columns(df_to_clean, column_list, new_col_name):
    """Replace several columns by a single column holding their row-wise sum.

    Args:
        df_to_clean: Input DataFrame. It is left untouched.
        column_list: Labels of the columns to add up and then remove.
        new_col_name: Label of the column that receives the integer sum.

    Returns:
        A new DataFrame without ``column_list`` and with ``new_col_name``
        set to the row-wise sum cast to ``int``.
    """
    combined = df_to_clean[column_list].sum(axis=1).astype(int)
    # Write the sum into the frame returned by drop() instead of into the
    # argument, so the caller's DataFrame does not silently gain a column.
    clean_df = df_to_clean.drop(columns=column_list)
    clean_df[new_col_name] = combined
    return clean_df

def preprocess_data(database_file):
    """Read the CSV at ``database_file`` and return the cleaned DataFrame.

    Steps, in order:
      1. Read the CSV with pandas.
      2. Collapse every column whose name contains "hepatopatia" into
         "enf_hepaticas" and every column whose name contains "cancer" into
         "tumores" (via ``combine_columns``).
      3. Hand the result to ``load_clean_dataset`` with a 0.1 missing-value
         limit and a fixed list of columns to delete.
    """
    raw_df = pd.read_csv(database_file)

    # Both membership lists are taken from the columns as read from disk,
    # before any merging changes the frame.
    source_columns = raw_df.columns
    merge_plan = [
        ("enf_hepaticas", [name for name in source_columns if "hepatopatia" in name]),
        ("tumores", [name for name in source_columns if "cancer" in name]),
    ]

    merged_df = raw_df
    for target_name, member_columns in merge_plan:
        merged_df = combine_columns(merged_df, member_columns, target_name)

    return load_clean_dataset(
        loaded_df=merged_df,
        na_perc_limit=0.1,
        columns_to_delete=[
            "Unnamed: 0",
            "person_id",
            "fecha_ingreso_urgencias",
            "shock_septico",
            "foco",
            "sintoma_nan",
            "fecha_nacimiento",
            "codigo_postal",
            "center",
            "dag",
        ],
    )

In [None]:
def objective_random_forest(trial, X_train, y_train, X_val, y_val):
    """Optuna objective: fit a random forest and score it on the validation data.

    Hyperparameters are drawn from ``trial``; the forest is trained on
    ``(X_train, y_train)`` and evaluated on ``(X_val, y_val)``.

    Returns:
        Tuple ``(f1, logloss)`` computed on the validation data.
    """
    search_space = dict(
        n_estimators=trial.suggest_int('n_estimators', 50, 500),
        max_depth=trial.suggest_int('max_depth', 3, 20),
        min_samples_split=trial.suggest_int('min_samples_split', 2, 10),
        min_samples_leaf=trial.suggest_int('min_samples_leaf', 1, 10),
        bootstrap=trial.suggest_categorical('bootstrap', [True, False]),
    )

    forest = RandomForestClassifier(**search_space)
    forest.fit(X_train, y_train)

    # Hard labels feed F1; class probabilities feed the log-loss.
    val_f1 = f1_score(y_val, forest.predict(X_val))
    val_logloss = log_loss(y_val, forest.predict_proba(X_val))
    return val_f1, val_logloss

def objective_xgboost(trial, X_train, y_train, X_val, y_val):
    """Optuna objective: fit an XGBoost classifier and score it on the validation data.

    Three settings are fixed (silent output, binary logistic objective, AUC
    as evaluation metric); the remaining hyperparameters are drawn from
    ``trial``. The validation data is also passed to ``fit`` as ``eval_set``.

    Returns:
        Tuple ``(f1, logloss)`` computed on the validation data.
    """
    fixed_settings = {
        'verbosity': 0,
        'objective': 'binary:logistic',
        'eval_metric': 'auc',
    }
    sampled_settings = {
        'n_estimators': trial.suggest_int('n_estimators', 50, 400),
        'max_depth': trial.suggest_int('max_depth', 3, 15),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.2, log=True),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
        'gamma': trial.suggest_float('gamma', 0, 0.5),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'reg_alpha': trial.suggest_float('reg_alpha', 0.01, 10.0, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 0.01, 10.0, log=True),
        'scale_pos_weight': trial.suggest_int('scale_pos_weight', 1, 10),
    }

    booster = XGBClassifier(**fixed_settings, **sampled_settings)
    booster.fit(X_train, y_train, eval_set=[(X_val, y_val)], verbose=False)

    # Hard labels feed F1; class probabilities feed the log-loss.
    val_f1 = f1_score(y_val, booster.predict(X_val))
    val_logloss = log_loss(y_val, booster.predict_proba(X_val))
    return val_f1, val_logloss

def objective_svm(trial, X_train, y_train, X_val, y_val):
    """Optuna objective: fit an SVC and score it on the validation data.

    ``C``, ``kernel`` and ``gamma`` are drawn from ``trial``. The classifier
    is built with ``probability=True`` so that ``predict_proba`` is available
    for the log-loss.

    Returns:
        Tuple ``(f1, logloss)`` computed on the validation data.
    """
    penalty = trial.suggest_float('C', 0.1, 10.0)
    kernel_name = trial.suggest_categorical('kernel', ['linear', 'rbf', 'poly'])
    kernel_width = trial.suggest_float('gamma', 0.001, 1.0)

    classifier = SVC(C=penalty, kernel=kernel_name, gamma=kernel_width, probability=True)
    classifier.fit(X_train, y_train)

    val_f1 = f1_score(y_val, classifier.predict(X_val))
    val_logloss = log_loss(y_val, classifier.predict_proba(X_val))
    return val_f1, val_logloss

def evaluate_models(models, X_test, y_test):
    """Score every fitted model on the given data and tabulate the metrics.

    For each entry of ``models`` (display name -> fitted estimator) the
    function computes F1, recall, accuracy and precision from ``predict``,
    and ROC AUC from the second ``predict_proba`` column when the estimator
    exposes ``predict_proba`` (the string 'N/A' otherwise). A classification
    report is printed per model.

    Returns:
        DataFrame with one row per model and the columns
        Model, F1, Recall, Accuracy, Precision, ROC AUC.
    """
    rows = []
    for label, estimator in models.items():
        predicted = estimator.predict(X_test)

        positive_scores = None
        if hasattr(estimator, 'predict_proba'):
            positive_scores = estimator.predict_proba(X_test)[:, 1]

        row = {
            'Model': label,
            'F1': f1_score(y_test, predicted),
            'Recall': recall_score(y_test, predicted),
            'Accuracy': accuracy_score(y_test, predicted),
            'Precision': precision_score(y_test, predicted),
            'ROC AUC': 'N/A' if positive_scores is None else roc_auc_score(y_test, positive_scores),
        }

        print(f"\nClassification Report for {label}:\n")
        print(classification_report(y_test, predicted))
        rows.append(row)

    return pd.DataFrame(rows)

def plot_optuna_metrics(rf_study, xgb_study, svm_study):
    """Plot how both objective values evolved over the trials of each study.

    Two figures are shown, one per objective: ``values[0]`` is plotted as the
    F1-score and ``values[1]`` as the log-loss. Each figure has one line per
    study, built only from trials in the COMPLETE state and numbered from 1
    in the order they appear in ``study.trials``.
    """
    named_studies = (
        ("Random Forest", rf_study),
        ("XGBoost", xgb_study),
        ("SVM", svm_study),
    )
    finished = optuna.trial.TrialState.COMPLETE

    # (index into trial.values, legend suffix, figure title, y-axis label)
    panels = (
        (0, "F1", "F1-Score Evolution Across Trials", "F1-Score"),
        (1, "LogLoss", "LogLoss Evolution Across Trials", "LogLoss"),
    )

    for objective_index, legend_suffix, heading, y_label in panels:
        plt.figure(figsize=(12, 8))
        for display_name, study in named_studies:
            history = [
                done.values[objective_index]
                for done in study.trials
                if done.state == finished
            ]
            plt.plot(
                range(1, len(history) + 1),
                history,
                label=f"{display_name} - {legend_suffix}",
            )

        plt.title(heading)
        plt.xlabel("Trial")
        plt.ylabel(y_label)
        plt.legend()
        plt.grid(True)
        plt.show()
    
def plot_metrics(models, X_test, y_test):
    """Plot ROC curves, precision-recall curves and confusion matrices.

    Args:
        models: Mapping of display name -> fitted estimator.
        X_test: Feature matrix the estimators are evaluated on.
        y_test: True labels matching ``X_test``.

    The first figure holds two side-by-side panels (ROC and precision-recall)
    with one curve per estimator that exposes ``predict_proba``. After that,
    one confusion-matrix heatmap figure is shown per estimator.
    """
    # Score each model once and reuse the result in both curve panels;
    # the second predict_proba call per model was pure repeated work.
    positive_scores = {
        model_name: model.predict_proba(X_test)[:, 1]
        for model_name, model in models.items()
        if hasattr(model, 'predict_proba')
    }

    plt.figure(figsize=(15, 5))

    # ROC Curve -- only two panels are drawn, so the grid is 1x2 rather than
    # 1x3 (which left a blank third of the figure).
    plt.subplot(1, 2, 1)
    for model_name, scores in positive_scores.items():
        fpr, tpr, _ = roc_curve(y_test, scores)
        roc_auc = auc(fpr, tpr)
        plt.plot(fpr, tpr, label=f"{model_name} (AUC = {roc_auc:.2f})")
    plt.plot([0, 1], [0, 1], 'k--', lw=2)  # chance-level diagonal
    plt.title("ROC Curve")
    plt.xlabel("False Positive Rate")
    plt.ylabel("True Positive Rate")
    if positive_scores:
        # A legend without labelled curves only triggers a matplotlib warning.
        plt.legend()

    # Precision-Recall Curve
    plt.subplot(1, 2, 2)
    for model_name, scores in positive_scores.items():
        precision, recall, _ = precision_recall_curve(y_test, scores)
        plt.plot(recall, precision, label=f"{model_name}")
    plt.title("Precision-Recall Curve")
    plt.xlabel("Recall")
    plt.ylabel("Precision")
    if positive_scores:
        plt.legend()

    plt.tight_layout()
    plt.show()

    # Confusion Matrix: one separate figure per model.
    class_names = ["No Sepsis", "Sepsis"]
    for model_name, model in models.items():
        y_pred = model.predict(X_test)
        cm = confusion_matrix(y_test, y_pred)
        plt.figure(figsize=(5, 5))
        sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", xticklabels=class_names, yticklabels=class_names)
        plt.title(f"Confusion Matrix for {model_name}")
        plt.xlabel("Predicted")
        plt.ylabel("Actual")
        plt.show()

In [None]:
if __name__ == "__main__":
    # End-to-end run: preprocess the data, tune three classifiers with Optuna
    # (maximising F1, minimising log-loss), refit each with the selected
    # hyperparameters, then report metrics and plots on a held-out test set.
    database_file = "/home/sergio/git/dev/mepram_testing/bd-tools/data/df_sin_antecedentes_v1.csv"
    output_location = "/home/sergio/git/dev/mepram_testing/bd-tools/output_testing/"
    # results.to_csv() below fails if the directory does not exist yet.
    os.makedirs(output_location, exist_ok=True)

    processed_df = preprocess_data(database_file)
    processed_df = processed_df.drop(columns=['mujer_gestante'])
    X = processed_df.drop("sepsis", axis=1)
    y = processed_df["sepsis"]

    # Split BEFORE scaling: fitting the scaler on the full dataset would let
    # test-set min/max values leak into the training features.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=99)

    scaler = MinMaxScaler()
    # Keep the original row index so X and y stay aligned by label as well as by position.
    X_train = pd.DataFrame(scaler.fit_transform(X_train), columns=X.columns, index=X_train.index)
    X_test = pd.DataFrame(scaler.transform(X_test), columns=X.columns, index=X_test.index)

    # Hyperparameters are tuned on a validation subset carved out of the
    # training data. Tuning directly on the test set would make the reported
    # test metrics optimistically biased.
    X_fit, X_val, y_fit, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=99)

    # NOTE(review): with two objectives Optuna returns a Pareto front;
    # best_trials[0] is simply the first entry of that front, not a
    # deliberately chosen trade-off -- confirm this is the intended pick.
    rf_study = optuna.create_study(directions=["maximize", "minimize"])
    rf_study.optimize(lambda trial: objective_random_forest(trial, X_fit, y_fit, X_val, y_val), n_trials=250)
    best_rf_params = rf_study.best_trials[0].params
    # Final models are refit on the complete training portion (fit + validation rows).
    best_rf_model = RandomForestClassifier(**best_rf_params).fit(X_train, y_train)

    xgb_study = optuna.create_study(directions=["maximize", "minimize"])
    xgb_study.optimize(lambda trial: objective_xgboost(trial, X_fit, y_fit, X_val, y_val), n_trials=250)
    best_xgb_params = xgb_study.best_trials[0].params
    best_xgb_model = XGBClassifier(**best_xgb_params).fit(X_train, y_train)

    svm_study = optuna.create_study(directions=["maximize", "minimize"])
    svm_study.optimize(lambda trial: objective_svm(trial, X_fit, y_fit, X_val, y_val), n_trials=250)
    best_svm_params = svm_study.best_trials[0].params
    best_svm_model = SVC(**best_svm_params, probability=True).fit(X_train, y_train)

    models = {
        "Random Forest": best_rf_model,
        "XGBoost": best_xgb_model,
        "SVM": best_svm_model
    }
    # The test set is used only here and in the plots below, never during tuning.
    results = evaluate_models(models, X_test, y_test)

    print("Model Comparison:\n", results)
    results.to_csv(os.path.join(output_location, "model_comparison.csv"), index=False)

    # Plot metrics
    plot_optuna_metrics(rf_study, xgb_study, svm_study)
    
    plot_metrics(models, X_test, y_test)

# Impute missing data

In [None]:
import pandas as pd
from sklearn.impute import KNNImputer
from sklearn.impute import SimpleImputer

def load_clean_dataset_impute(loaded_df, na_perc_limit, columns_to_delete=None):
    """Drop unwanted and sparse columns, then impute the remaining missing values.

    Args:
        loaded_df: Input DataFrame. It is not modified; a new frame is returned.
        na_perc_limit: Maximum tolerated fraction of missing values per column.
            Columns whose missing fraction is strictly greater are removed.
        columns_to_delete: Column labels removed up front; labels that are not
            present are ignored. ``None`` (the default) means "no columns".

    Returns:
        A new DataFrame in which:
          * 'mujer_gestante' (if present) is encoded as 0/1, missing -> 0;
          * numeric columns holding only 0/1 are filled with their most
            frequent value and cast to int;
          * numeric columns with fewer than 15 distinct values are filled
            with their most frequent value and cast to int;
          * the other numeric columns are filled by distance-weighted KNN
            (5 neighbours) and rounded to one decimal;
          * object/category columns are filled with their most frequent value
            and cast to int.

    The initial and final shapes, and every column removed for sparsity, are
    printed.
    """
    print(f"Shape inicial: {loaded_df.shape}")

    # A list literal as default would be shared between calls.
    if columns_to_delete is None:
        columns_to_delete = []
    loaded_df = loaded_df.drop(columns=columns_to_delete, errors="ignore")

    tot = loaded_df.shape[0]

    sparse_cols = []
    for col in loaded_df.columns:
        # A zero-row frame has no missing fraction; avoid dividing by zero.
        na_per = (1 - len(loaded_df[col].dropna()) / tot) if tot else 0.0
        if na_per > na_perc_limit:
            print(f"Column {col} --> %NaN = {na_per}. deleted")
            sparse_cols.append(col)
    loaded_df = loaded_df.drop(columns=sparse_cols)

    if 'mujer_gestante' in loaded_df.columns:
        # read_csv parses True/False cells as booleans, which the string keys
        # below would never match (every row would end up as 0). Going through
        # str first makes real booleans and their string forms map the same way.
        loaded_df['mujer_gestante'] = (
            loaded_df['mujer_gestante']
            .astype(str)
            .map({'False': 0, 'True': 1})
            .fillna(0)
        )

    numeric_cols = loaded_df.select_dtypes(include=["int", "float"]).columns
    categorical_cols = loaded_df.select_dtypes(include=["object", "category"]).columns
    binary_cols = [col for col in numeric_cols if set(loaded_df[col].dropna().unique()) <= {0, 1}]
    continuous_cols = [col for col in numeric_cols if col not in binary_cols]
    # Non-binary numeric columns with few distinct values are treated as categorical codes.
    categorical_numeric_cols = [
        col for col in continuous_cols if loaded_df[col].nunique() < 15
    ]
    continuous_cols = [col for col in continuous_cols if col not in categorical_numeric_cols]

    # Each imputer is fitted only when its column group actually has missing
    # values; the casts/rounding that follow are applied either way.
    if binary_cols:
        if loaded_df[binary_cols].isna().any().any():
            binary_imputer = SimpleImputer(strategy="most_frequent")
            loaded_df[binary_cols] = binary_imputer.fit_transform(loaded_df[binary_cols])
        loaded_df[binary_cols] = loaded_df[binary_cols].astype(int)

    if continuous_cols:
        if loaded_df[continuous_cols].isna().any().any():
            numeric_imputer = KNNImputer(n_neighbors=5, weights="distance")
            loaded_df[continuous_cols] = numeric_imputer.fit_transform(loaded_df[continuous_cols])
        loaded_df[continuous_cols] = loaded_df[continuous_cols].round(1)

    if len(categorical_cols) > 0:
        if loaded_df[categorical_cols].isna().any().any():
            categorical_imputer = SimpleImputer(strategy="most_frequent")
            loaded_df[categorical_cols] = categorical_imputer.fit_transform(loaded_df[categorical_cols])
        # NOTE(review): this cast raises for object columns holding
        # non-numeric text -- presumably none remain at this point; confirm.
        loaded_df[categorical_cols] = loaded_df[categorical_cols].astype(int)

    if categorical_numeric_cols:
        if loaded_df[categorical_numeric_cols].isna().any().any():
            categorical_numeric_imputer = SimpleImputer(strategy="most_frequent")
            loaded_df[categorical_numeric_cols] = categorical_numeric_imputer.fit_transform(loaded_df[categorical_numeric_cols])
        loaded_df[categorical_numeric_cols] = loaded_df[categorical_numeric_cols].astype(int)

    print(f"Shape final: {loaded_df.shape}")
    return loaded_df


def combine_columns(df_to_clean, column_list, new_col_name):
    """Replace several columns by a single column holding their row-wise sum.

    Args:
        df_to_clean: Input DataFrame. It is left untouched.
        column_list: Labels of the columns to add up and then remove.
        new_col_name: Label of the column that receives the integer sum.

    Returns:
        A new DataFrame without ``column_list`` and with ``new_col_name``
        set to the row-wise sum cast to ``int``.
    """
    combined = df_to_clean[column_list].sum(axis=1).astype(int)
    # Write the sum into the frame returned by drop() instead of into the
    # argument, so the caller's DataFrame does not silently gain a column.
    clean_df = df_to_clean.drop(columns=column_list)
    clean_df[new_col_name] = combined
    return clean_df


# Load the raw CSV, merge the liver-disease and tumour indicator columns,
# then clean and impute the result.
database_file = "/home/sergio/git/dev/mepram_testing/bd-tools/data/df_sin_antecedentes_v1.csv"
df_to_clean = pd.read_csv(database_file)

# Column groups are selected by a keyword contained in the column name.
hepatic_cols = [label for label in df_to_clean.columns if "hepatopatia" in label]
tumor_cols = [label for label in df_to_clean.columns if "cancer" in label]
for new_name, col_list in (("enf_hepaticas", hepatic_cols), ("tumores", tumor_cols)):
    df_to_clean = combine_columns(df_to_clean, col_list, new_name)

columns_to_delete = [
    "Unnamed: 0",
    "person_id",
    "fecha_ingreso_urgencias",
    "shock_septico",
    "foco",
    "sintoma_nan",
    "fecha_nacimiento",
    "codigo_postal",
    "center",
    "dag",
]
processed_df = load_clean_dataset_impute(
    loaded_df=df_to_clean,
    columns_to_delete=columns_to_delete,
    na_perc_limit=0.1,
)