# 4.3 Correlaciones ampliadas

En este notebook se exploran las correlaciones entre la actividad clínica (**SLEDAI**) y un conjunto de variables clínicas y metabólicas de interés: **IMC (BMI)**, **Triglicéridos (TG)**, **Proteína C reactiva (CRP)** y **Vitamina D**.  
Se calcularán correlaciones de Pearson y Spearman, generando una matriz resumen y figuras para visualizar patrones y asociaciones relevantes.


In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# --- 1) Cargar dataset final ---
df = pd.read_csv("outputs/dataset_ready.csv")

# --- 2) Renombrar variables de interés (ajustar si tu dataset tiene nombres distintos) ---
df = df.rename(columns={
    "BMI (kg/m2)": "BMI",
    "Triglycerides (mg/dL)": "Trigliceridos",
    "C-reactive protein": "CRP",
    "Vitamin D (ng/mL)": "VitD"
})

# --- 3) Selección de variables ---
vars_corr = ["SLEDAI", "BMI", "Trigliceridos", "CRP", "VitD"]
data_corr = df[vars_corr].dropna()

data_corr.head()


Unnamed: 0,SLEDAI,BMI,Trigliceridos,CRP,VitD
0,4.0,36.9,306.0,8.51,26.91
1,0.0,30.175,106.68,15.92,14.16
2,0.0,30.175,119.04,3.59,27.27
3,4.0,27.0,137.16,3.02,33.27
4,0.0,30.175,66.0,3.19,25.13


In [1]:
import os
import numpy as np
import pandas as pd
import scipy.stats as st
import seaborn as sns
import matplotlib.pyplot as plt

# Paths
DATA = "outputs/dataset_ready.csv"
TAB_DIR = "outputs/tablas"
FIG_DIR = "outputs/figuras"
os.makedirs(TAB_DIR, exist_ok=True)
os.makedirs(FIG_DIR, exist_ok=True)

# Cargar dataset final
df = pd.read_csv(DATA)

# Detectar columna de país
for cand in ["Pais", "País", "Country", "country"]:
    if cand in df.columns:
        COL_PAIS = cand
        break
else:
    raise KeyError("No encuentro la columna de país (Pais/País/Country).")

# Renombrar variables de interés (ajusta si tus nombres difieren)
rename_map = {
    "BMI (kg/m2)": "BMI",
    "Triglycerides (mg/dL)": "Trigliceridos",
    "C-reactive protein": "CRP",
    "Vitamin D (ng/mL)": "VitD",
}
df = df.rename(columns=rename_map)

vars_corr = ["SLEDAI", "BMI", "Trigliceridos", "CRP", "VitD"]

# Mantener solo columnas existentes
vars_presentes = [v for v in vars_corr if v in df.columns]
faltan = [v for v in vars_corr if v not in df.columns]
if faltan:
    print("Aviso: faltan columnas en el dataset:", faltan)

# Filtrar casos completos para las variables presentes
data = df[[COL_PAIS] + vars_presentes].copy()
data_clean = data.dropna(subset=vars_presentes)

# Normalizar nombres de país
map_pais = {"Brazil":"Brasil", "BR":"Brasil", "Mexico":"México", "MX":"México", "Spain":"España", "ES":"España"}
data_clean[COL_PAIS] = data_clean[COL_PAIS].replace(map_pais)

print("N global (completo en variables seleccionadas):", len(data_clean))
print("Casos por país:")
print(data_clean[COL_PAIS].value_counts())


N global (completo en variables seleccionadas): 213
Casos por país:
Country
Brasil    143
México     70
Name: count, dtype: int64


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_clean[COL_PAIS] = data_clean[COL_PAIS].replace(map_pais)


### Nota metodológica
Se calcularon **correlaciones de Pearson y Spearman** entre SLEDAI y cada variable (BMI, TG, CRP, VitD).  
Se reportan coeficientes **r** y **p-valores** con corrección por comparaciones múltiples mediante **FDR (Benjamini–Hochberg)**.  
Dado que **España** no dispone de SLEDAI válido, el análisis global y estratificado se limita a **Brasil y México**.


In [2]:
from statsmodels.stats.multitest import multipletests

def corr_with_sledai(df_sub, metodo="pearson"):
    res = []
    for var in [v for v in vars_presentes if v != "SLEDAI"]:
        x = df_sub["SLEDAI"].values
        y = df_sub[var].values
        if metodo == "pearson":
            r, p = st.pearsonr(x, y)
        else:
            r, p = st.spearmanr(x, y, nan_policy='omit')
        res.append((var, r, p))
    out = pd.DataFrame(res, columns=["Variable", "r", "p"])
    # FDR
    out["p_FDR"] = multipletests(out["p"].values, method="fdr_bh")[1]
    out["|r|"] = out["r"].abs()
    return out.sort_values("|r|", ascending=False)[["Variable","r","p","p_FDR"]]

# GLOBAL (solo países con SLEDAI)
global_mask = data_clean[COL_PAIS].isin(["Brasil","México"])
dg = data_clean[global_mask].dropna(subset=["SLEDAI"])
pearson_global = corr_with_sledai(dg, "pearson")
spearman_global = corr_with_sledai(dg, "spearman")

# Estratificado por país (Brasil, México)
res_pais = {}
for p in ["Brasil","México"]:
    dp = data_clean[(data_clean[COL_PAIS]==p)].dropna(subset=["SLEDAI"])
    if len(dp) >= 10:  # umbral mínimo para estabilidad
        res_pais[(p,"pearson")]  = corr_with_sledai(dp, "pearson")
        res_pais[(p,"spearman")] = corr_with_sledai(dp, "spearman")

# Mostrar
print("=== Pearson (Global Brasil+México) ===")
display(pearson_global)
print("=== Spearman (Global Brasil+México) ===")
display(spearman_global)

for (p,m) in res_pais:
    print(f"=== {m.title()} ({p}) ===")
    display(res_pais[(p,m)])

# Exportar a Excel
with pd.ExcelWriter(os.path.join(TAB_DIR, "tabla_correlaciones_ampliadas.xlsx")) as writer:
    pearson_global.to_excel(writer, sheet_name="Pearson_global", index=False)
    spearman_global.to_excel(writer, sheet_name="Spearman_global", index=False)
    for (p,m), df_out in res_pais.items():
        df_out.to_excel(writer, sheet_name=f"{m}_{p}", index=False)
print("Guardado:", os.path.join(TAB_DIR, "tabla_correlaciones_ampliadas.xlsx"))


=== Pearson (Global Brasil+México) ===


Unnamed: 0,Variable,r,p,p_FDR
2,CRP,0.103144,0.133488,0.271063
1,Trigliceridos,0.102608,0.135532,0.271063
3,VitD,-0.065055,0.344729,0.459639
0,BMI,-0.009338,0.89223,0.89223


=== Spearman (Global Brasil+México) ===


Unnamed: 0,Variable,r,p,p_FDR
1,Trigliceridos,0.091228,0.184722,0.305177
0,BMI,0.08577,0.212504,0.305177
2,CRP,0.08279,0.228883,0.305177
3,VitD,-0.045474,0.509182,0.509182


=== Pearson (Brasil) ===


Unnamed: 0,Variable,r,p,p_FDR
1,Trigliceridos,0.15137,0.071129,0.284516
0,BMI,0.110958,0.187071,0.374141
3,VitD,-0.067834,0.420827,0.561102
2,CRP,0.00017,0.998396,0.998396


=== Spearman (Brasil) ===


Unnamed: 0,Variable,r,p,p_FDR
0,BMI,0.157277,0.060663,0.24265
1,Trigliceridos,0.057573,0.494607,0.919233
3,VitD,0.00947,0.910623,0.919233
2,CRP,0.008554,0.919233,0.919233


=== Pearson (México) ===


Unnamed: 0,Variable,r,p,p_FDR
0,BMI,-0.296343,0.012739,0.050958
2,CRP,0.255064,0.033091,0.066182
1,Trigliceridos,0.123647,0.307826,0.410435
3,VitD,0.028482,0.814941,0.814941


=== Spearman (México) ===


Unnamed: 0,Variable,r,p,p_FDR
0,BMI,-0.200117,0.096713,0.381296
2,CRP,0.158019,0.191383,0.381296
1,Trigliceridos,0.123725,0.307518,0.381296
3,VitD,0.10626,0.381296,0.381296


Guardado: outputs/tablas\tabla_correlaciones_ampliadas.xlsx


In [3]:
# Matriz de correlaciones (Pearson) solo de las variables numéricas presentes
mat_vars = [v for v in vars_presentes]  # incluye SLEDAI
mat = dg[mat_vars].corr(method="pearson")

plt.figure(figsize=(6.5,5.5))
ax = sns.heatmap(mat, vmin=-1, vmax=1, center=0, annot=True, fmt=".2f",
                 cmap="coolwarm", linewidths=.5, cbar_kws={"shrink": .8})
plt.title("Matriz de correlaciones (Pearson) – Global (Brasil+México)")
plt.tight_layout()
fig_path = os.path.join(FIG_DIR, "heatmap_correlaciones_ampliadas.png")
plt.savefig(fig_path, dpi=300, bbox_inches="tight")
plt.close()
print("Guardado:", fig_path)


Guardado: outputs/figuras\heatmap_correlaciones_ampliadas.png


### Interpretación (plantilla)
- **Criterios**: |r| < 0.2 (muy débil), 0.2–0.39 (débil), 0.40–0.59 (moderada), ≥0.60 (fuerte).
- Reportar **significativas tras FDR** (p_FDR < 0.05).
- Notar consistencia **Pearson vs Spearman** (linealidad vs. monótona).
- Señalar si los patrones se mantienen por país (Brasil/México) o son específicos.
