# Dimensionality reduction.

- Imputation of missing values by KNN for continuous numeric columns.
- Imputation of missing values by the most frequent value (mode) for categorical, binary and low-cardinality numeric columns.
- Elimination of columns with >10% missing values.

In [None]:
import pandas as pd
from sklearn.impute import KNNImputer
from sklearn.impute import SimpleImputer

def load_clean_dataset(loaded_df, na_perc_limit, columns_to_delete=None):
    """Drop unwanted/sparse columns and impute the remaining missing values.

    Steps, in order:

    1. Drop ``columns_to_delete`` (names that are not present are ignored).
    2. Drop every column whose fraction of missing values is strictly greater
       than ``na_perc_limit``.
    3. If present, encode ``mujer_gestante`` as 0/1; missing or unrecognised
       values become 0.
    4. Impute what is left:
       - numeric columns whose non-missing values are all in {0, 1}: mode,
         then cast to int;
       - other numeric columns with fewer than 15 distinct values: mode,
         then cast to int;
       - remaining numeric columns: distance-weighted KNN (k=5), rounded to
         one decimal;
       - object/category columns: mode, then cast to int.

    Args:
        loaded_df: DataFrame to clean.
        na_perc_limit: Maximum tolerated fraction of missing values per column.
        columns_to_delete: Optional column names to drop before anything else.

    Returns:
        The cleaned DataFrame (a new object produced by ``DataFrame.drop``).
        A frame with zero rows is returned right after step 1, since there is
        nothing to measure or impute.

    Raises:
        ValueError: If an object/category column holds values that cannot be
            cast to ``int`` after imputation.
    """
    print(f"Shape inicial: {loaded_df.shape}")

    # ``None`` instead of a shared mutable ``[]`` default.
    if columns_to_delete is None:
        columns_to_delete = []
    loaded_df = loaded_df.drop(columns=columns_to_delete, errors="ignore")

    # With no rows the missing-value ratio is undefined (the former
    # ``len(...) / tot`` raised ZeroDivisionError) and the imputers have
    # nothing to fit on.
    if loaded_df.shape[0] == 0:
        print(f"Shape final: {loaded_df.shape}")
        return loaded_df

    # Fraction of missing values per column, computed in one pass.
    na_fraction = loaded_df.isna().mean()
    sparse_cols = na_fraction[na_fraction > na_perc_limit]
    for col, na_per in sparse_cols.items():
        print(f"Column {col} --> %NaN = {na_per}. deleted")
    loaded_df = loaded_df.drop(columns=sparse_cols.index)

    if 'mujer_gestante' in loaded_df.columns:
        # Compare on a normalised text form so real booleans (what
        # ``pd.read_csv`` yields for True/False cells), the strings
        # 'True'/'False' and 0/1 numbers are all recognised. Mapping only the
        # string keys turned every genuine boolean into NaN and then into 0.
        as_text = loaded_df['mujer_gestante'].astype(str).str.strip().str.lower()
        loaded_df['mujer_gestante'] = (
            as_text
            .map({'false': 0, 'true': 1, '0': 0, '1': 1, '0.0': 0, '1.0': 1})
            .fillna(0)
        )

    numeric_cols = loaded_df.select_dtypes(include=["int", "float"]).columns
    categorical_cols = loaded_df.select_dtypes(include=["object", "category"]).columns
    # Numeric columns whose observed values are a subset of {0, 1}.
    binary_cols = [col for col in numeric_cols if set(loaded_df[col].dropna().unique()) <= {0, 1}]
    non_binary_cols = [col for col in numeric_cols if col not in binary_cols]
    # Numeric columns with few distinct values are handled like categories.
    categorical_numeric_cols = [
        col for col in non_binary_cols if loaded_df[col].nunique() < 15
    ]
    continuous_cols = [col for col in non_binary_cols if col not in categorical_numeric_cols]

    if binary_cols:
        binary_imputer = SimpleImputer(strategy="most_frequent")
        loaded_df[binary_cols] = binary_imputer.fit_transform(loaded_df[binary_cols]).astype(int)

    if continuous_cols:
        numeric_imputer = KNNImputer(n_neighbors=5, weights="distance")
        loaded_df[continuous_cols] = numeric_imputer.fit_transform(loaded_df[continuous_cols])
        loaded_df[continuous_cols] = loaded_df[continuous_cols].round(1)

    if len(categorical_cols) > 0:
        categorical_imputer = SimpleImputer(strategy="most_frequent")
        loaded_df[categorical_cols] = categorical_imputer.fit_transform(loaded_df[categorical_cols])
        # NOTE(review): this cast only succeeds when the categories are
        # integer-like (e.g. booleans stored as objects) - confirm for new data.
        loaded_df[categorical_cols] = loaded_df[categorical_cols].astype(int)

    if categorical_numeric_cols:
        categorical_numeric_imputer = SimpleImputer(strategy="most_frequent")
        loaded_df[categorical_numeric_cols] = categorical_numeric_imputer.fit_transform(loaded_df[categorical_numeric_cols])
        loaded_df[categorical_numeric_cols] = loaded_df[categorical_numeric_cols].astype(int)

    print(f"Shape final: {loaded_df.shape}")
    return loaded_df


def combine_columns(df_to_clean, column_list, new_col_name):
    """Replace several columns by a single column holding their row-wise sum.

    Args:
        df_to_clean: Source DataFrame. It is left untouched.
        column_list: Names of the columns to add up and then remove.
        new_col_name: Name of the column that receives the integer sum.

    Returns:
        A new DataFrame without ``column_list`` and with ``new_col_name``
        appended as the last column.
    """
    # Compute the sum first, from the untouched input.
    combined = df_to_clean[column_list].sum(axis=1).astype(int)
    # Drop before assigning so the caller's frame is never mutated and the new
    # column survives even if its name also appears in ``column_list``.
    clean_df = df_to_clean.drop(columns=column_list)
    clean_df[new_col_name] = combined
    return clean_df


# Read the raw table from disk.
database_file = "/home/sergio/git/dev/mepram_testing/bd-tools/data/df_sin_antecedentes_v1.csv"
df_to_clean = pd.read_csv(database_file)

# Both column groups are picked on the freshly loaded frame, before any of
# them is merged into a single column.
hepatic_cols = [name for name in df_to_clean.columns if "hepatopatia" in name]
tumor_cols = [name for name in df_to_clean.columns if "cancer" in name]
groups_to_merge = (
    ("enf_hepaticas", hepatic_cols),
    ("tumores", tumor_cols),
)
for new_name, col_list in groups_to_merge:
    df_to_clean = combine_columns(df_to_clean, col_list, new_name)

# Columns dropped up front by the cleaning step.
columns_to_delete = [
    "Unnamed: 0",
    "person_id",
    "fecha_ingreso_urgencias",
    "shock_septico",
    "foco",
    "sintoma_nan",
    "fecha_nacimiento",
    "codigo_postal",
    "center",
    "dag",
]
# Columns with more than 10% missing values are discarded, the rest imputed.
processed_df = load_clean_dataset(df_to_clean, 0.1, columns_to_delete)

print("Pre-processing completed.")

- Elimination of variables with zero variance (constant columns).
- Elimination of highly correlated variables.

In [None]:
# Flag constant (zero-variance) columns. ``numeric_only=True`` keeps the call
# valid when a non-numeric column is present, and the reindex gives the mask
# one entry per column (non-numeric columns are never flagged).
varianza_cero = (
    processed_df.var(numeric_only=True)
    .eq(0)
    .reindex(processed_df.columns, fill_value=False)
    .astype(bool)
)
preprocessed_df = processed_df.loc[:, ~varianza_cero]

print(f"Deleted columns: {processed_df.columns[varianza_cero].tolist()}")

# Every later cell reads ``processed_df``; without this rebinding the filtered
# frame was computed and then never used.
# NOTE(review): a constant "sepsis" target would be dropped here too - confirm
# that cannot happen for this dataset.
processed_df = preprocessed_df

In [None]:
# Imported here because this cell uses these names before any other visible
# cell brings them into scope.
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler

# NOTE: this is an alias of ``processed_df``, not a copy; nothing below
# writes into it.
processed_df_copy = processed_df

target_copy = processed_df_copy["sepsis"]
features_only = processed_df_copy.drop(columns=["sepsis"])

scaler = MinMaxScaler()
# Keep the original column names and row index: the heatmap then shows feature
# names instead of integer positions, and the target assigned below lines up
# with the right rows whatever the index looks like.
X_preprocessed = pd.DataFrame(
    scaler.fit_transform(features_only),
    columns=features_only.columns,
    index=features_only.index,
)
X_preprocessed["sepsis"] = target_copy

target_palette = {0: "blue", 1: "red"}
X_preprocessed = X_preprocessed.dropna()
# Built after ``dropna`` so there is exactly one colour per plotted row.
row_colors = X_preprocessed["sepsis"].map(target_palette)
sns.clustermap(
    X_preprocessed.drop(columns=["sepsis"]),
    cmap="coolwarm",
    row_colors=row_colors,
    figsize=(30, 60),
    annot=False,
    col_cluster=False  # rows are clustered; columns keep their order
)

plt.title("Heatmap con Clustering y Anotación por Target", pad=100)
plt.show()

# PCA

- Scaled data using MinMaxScaler()

Plot PCA 2D and 3D

In [None]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler
import matplotlib.pyplot as plt

# Separate the label from the feature matrix.
target = processed_df["sepsis"]
data = processed_df.drop(columns=["sepsis"])

# Rescale the features with MinMaxScaler before projecting.
scaler = MinMaxScaler()
scaled_data = scaler.fit_transform(data)

# Two-component projection.
pca = PCA(n_components=2)
pca_data = pca.fit_transform(scaled_data)

explained_variance = pca.explained_variance_ratio_
print(f"Varianza explicada por cada componente: {explained_variance}")
print(f"Varianza total explicada: {sum(explained_variance)}")

# Scatter of the first two components, coloured by the target.
first_pc = pca_data[:, 0]
second_pc = pca_data[:, 1]
_, scatter_ax = plt.subplots(figsize=(8, 6))
scatter_ax.scatter(first_pc, second_pc, c=target, cmap="viridis", alpha=0.7)
scatter_ax.set_title("PCA 2D")
scatter_ax.set_xlabel("PC1")
scatter_ax.set_ylabel("PC2")
scatter_ax.grid()
plt.show()


In [None]:
import plotly.express as px 

# Three-component projection of the already scaled feature matrix.
pca_3d = PCA(n_components=3)
pca_data_3d = pca_3d.fit_transform(scaled_data)

pca_df = pd.DataFrame(pca_data_3d, columns=["PC1", "PC2", "PC3"])
# Assign by position: ``pca_df`` has a fresh 0..n-1 index, so assigning the
# Series itself would align on index labels and insert NaN whenever the source
# frame's index is not 0..n-1.
pca_df["sepsis"] = target.to_numpy()

fig = px.scatter_3d(
    pca_df,
    x="PC1",
    y="PC2",
    z="PC3",
    color="sepsis",
    title="PCA - 3D Visualization",
    labels={"sepsis": "Sepsis (0=No, 1= Yes)"},
    color_continuous_scale="Viridis",
    opacity=0.7
)
fig.update_traces(marker=dict(size=5))
fig.update_layout(scene=dict(
    xaxis_title="PC 1",
    yaxis_title="PC 2",
    zaxis_title="PC 3"
))
fig.show()

# Recursive Feature Elimination

- Random forest
- SVM
- Logistic Regression
- Decision Tree

In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc, f1_score, precision_recall_curve, average_precision_score
from sklearn.feature_selection import RFECV
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.preprocessing import MinMaxScaler
import pandas as pd
import numpy as np

X = processed_df.drop(columns=["sepsis"])
y = processed_df["sepsis"].astype(int)

# Split BEFORE scaling. Fitting the scaler on the full table lets the test
# rows influence the min/max used to transform the training rows (data
# leakage). The same ``random_state`` yields the same row partition as before.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=99
)

scaler = MinMaxScaler()
# The scaler learns its range from the training rows only; the test rows are
# transformed with that range and may therefore fall slightly outside [0, 1].
X_train = pd.DataFrame(scaler.fit_transform(X_train_raw), columns=X.columns)
X_test = pd.DataFrame(scaler.transform(X_test_raw), columns=X.columns)
y_train = y_train.reset_index(drop=True)
y_test = y_test.reset_index(drop=True)

# Kept so any code that still reads ``X_scaled`` continues to run; it is now
# produced with the training-fitted scaler.
X_scaled = scaler.transform(X)

# Candidate estimators; all share the same seed.
models = {
    "Random Forest": RandomForestClassifier(random_state=99),
    "SVM": SVC(kernel="linear", probability=True, random_state=99),
    "Logistic Regression": LogisticRegression(max_iter=1000, random_state=99),
    "Decision Tree": DecisionTreeClassifier(random_state=99),
}

results_all_vars = {}    # test metrics per model, trained on every feature
results = {}             # test metrics per model, trained on RFECV-selected features
important_features = {}  # RFECV-selected column names per model

def _evaluate_on_test(fitted_model, X_eval, y_true):
    """Return ROC / precision-recall curves and summary scores for a fitted model.

    The result is a dict with the keys ``fpr_test``, ``tpr_test``,
    ``precision``, ``recall``, ``auc_test``, ``avg_precision`` and ``f1_test``.
    Curves and AUC / average precision use the positive-class probability;
    F1 uses the hard predictions.
    """
    positive_proba = fitted_model.predict_proba(X_eval)[:, 1]
    hard_pred = fitted_model.predict(X_eval)

    fpr, tpr, _ = roc_curve(y_true, positive_proba)
    precision, recall, _ = precision_recall_curve(y_true, positive_proba)
    return {
        "fpr_test": fpr,
        "tpr_test": tpr,
        "precision": precision,
        "recall": recall,
        "auc_test": auc(fpr, tpr),
        "avg_precision": average_precision_score(y_true, positive_proba),
        "f1_test": f1_score(y_true, hard_pred),
    }


for name, model in models.items():

    # --- Baseline: train on every feature -------------------------------
    print(f"\nModelo: {name}")
    model.fit(X_train, y_train)

    baseline = _evaluate_on_test(model, X_test, y_test)
    results_all_vars[name] = baseline

    print(f"AUC (all features): {baseline['auc_test']:.3f}, Avg Precision: {baseline['avg_precision']:.3f}, F1: {baseline['f1_test']:.3f}")

    # --- Recursive feature elimination with cross-validation ------------
    # "\n" here; the previous "\T" was an invalid escape sequence.
    print(f"\nTraining {name} with RFECV ...")

    # One feature is removed per step; the feature count is chosen by
    # 10-fold stratified cross-validated F1 on the training split.
    rfe = RFECV(estimator=model,
                step=1,
                cv=StratifiedKFold(n_splits=10),
                scoring="f1")

    rfe.fit(X_train, y_train)

    print(f"Optimum number of characteristics for {name}: {rfe.n_features_}")

    final_features = X_train.columns[rfe.support_]
    important_features[name] = final_features

    X_train_rfe = X_train[final_features]
    X_test_rfe = X_test[final_features]

    # Refit the same estimator on the selected columns only. This overwrites
    # the all-features fit, whose metrics are already stored above.
    model.fit(X_train_rfe, y_train)

    selected = _evaluate_on_test(model, X_test_rfe, y_test)
    selected["selected_features"] = final_features
    results[name] = selected

    print(f"{name} - AUC Test: {selected['auc_test']:.3f}, F1 Test: {selected['f1_test']:.3f}")
    print(f"Características seleccionadas por {name}: {final_features}")

# ROC curves: left panel = models trained on all features,
# right panel = models trained on the RFECV-selected features.
roc_panels = (
    (results_all_vars, "Comparison of models by ROC curves (all features)"),
    (results, "Comparison of models by ROC curves"),
)

plt.figure(figsize=(12, 6))
for position, (panel_results, panel_title) in enumerate(roc_panels, start=1):
    plt.subplot(1, 2, position)
    for model_name, metrics in panel_results.items():
        curve_label = f"{model_name} (AUC: {metrics['auc_test']:.3f})"
        plt.plot(metrics["fpr_test"], metrics["tpr_test"], label=curve_label)
    # Diagonal reference line.
    plt.plot([0, 1], [0, 1], "k--", label="Random (AUC: 0.500)")
    plt.title(panel_title)
    plt.xlabel("1 - Specificity")
    plt.ylabel("Sensitivity")
    plt.legend()
    plt.grid()

plt.tight_layout()
plt.show()

# Precision-recall curves: left panel = all features,
# right panel = RFECV-selected features.
pr_panels = (
    (results_all_vars, "Precision-Recall - All features"),
    (results, "Precision-Recall - Selected features"),
)

plt.figure(figsize=(12, 6))
for position, (panel_results, panel_title) in enumerate(pr_panels, start=1):
    plt.subplot(1, 2, position)
    for model_name, metrics in panel_results.items():
        curve_label = f"{model_name} (Avg Precision: {metrics['avg_precision']:.3f})"
        plt.plot(metrics["recall"], metrics["precision"], label=curve_label)
    # Fixed horizontal reference line at precision 0.5.
    plt.axhline(y=0.5, color="r", linestyle="--", label="Precisión = 0.500")
    plt.title(panel_title)
    plt.xlabel("Recall")
    plt.ylabel("Precision")
    plt.legend()
    plt.grid()

plt.tight_layout()
plt.show()


# One horizontal bar chart per model, listing the features RFECV kept for it.
# NOTE(review): each bar's length is the feature's position in the selected
# list (0, 1, 2, ...), not an importance score taken from the estimator.
panel_count = len(important_features)
fig, axes = plt.subplots(1, panel_count, figsize=(20, 8), sharey=True)
for panel, model_name in zip(axes, important_features):
    kept = important_features[model_name]
    positions = range(len(kept))
    panel.barh(kept, positions)
    panel.set(title=model_name, xlabel="Importance ranking")
plt.tight_layout()
plt.show()

# Features kept by every model: the intersection of the per-model selections.
# An empty ``important_features`` yields an empty set instead of an IndexError.
per_model_sets = [set(features) for features in important_features.values()]
consensus_features = set.intersection(*per_model_sets) if per_model_sets else set()

# Sets have no stable iteration order; sort so the printed list is the same
# on every run.
print("\nSelected features: ", sorted(consensus_features, key=str))

Selected features (kept by all four models):
- estado_mental_alterado
- proteina_c_reactiva
- respiracion
- frec_cardiaca
- creatinina
- edad
- vasopresores
- cardiovascular