# AllBiomarkers — Informe visual profesional
*Generado: 2025-10-12 12:36*

Este notebook crea visualizaciones profesionales (solo **matplotlib**, sin seaborn) para datos de biomarcadores.
Es **robusto**: detecta automáticamente columnas numéricas (biomarcadores) y columnas de agrupación comunes
(p. ej., `DX`, `Diagnosis`, `VISCODE`, `Group`). Si hay múltiples ficheros, intentará unirlos por las claves comunes.

> **Uso rápido**: ajusta la celda de configuración, ejecuta todo y revisa las figuras en cada sección.


In [1]:

DATA_FILES = [
    "./data/adni/demographics/PTDEMOG.csv",
    "./data/adni/demographics/UPENNBIOMK_ROCHE_ELECSYS_11Oct2025.csv",
    "./data/adni/demographics/All_Subjects_UCSFFSX7_11Oct2025.csv",
    "./data/adni/demographics/All_Subjects_UCBERKELEY_AMY_6MM_11Oct2025.csv",
]

CANDIDATE_KEYS = ["RID", "PTID", "Subject", "SubjectID", "ID", "USUBJID"]

GROUP_COLS = ["DX", "Diagnosis", "diagnosis", "Group", "VISCODE", "PHASE"]

DROP_DUPLICATES = True
COERCE_NUMERIC = True   # Intentar convertir a numérico ignorando errores
WINSORIZE_PCT = 0.0     # 0.0 = sin winsorización; ej., 0.01 para recortar 1% colas

MIN_NONNA_RATIO = 0.6

TOP_K = 24


In [2]:
import os, math, itertools, warnings, io
from pathlib import Path
from typing import List, Dict, Tuple
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

warnings.filterwarnings("ignore")

def read_any(path: str) -> pd.DataFrame:
    p = str(path).lower()
    if p.endswith(".csv"):
        return pd.read_csv(path)
    if p.endswith(".tsv"):
        return pd.read_csv(path, sep="\t")
    if p.endswith(".parquet"):
        return pd.read_parquet(path)
    if p.endswith(".xlsx") or p.endswith(".xls"):
        return pd.read_excel(path)
    raise ValueError(f"Formato no soportado: {path}")

def common_keys(dfs: List[pd.DataFrame], candidates: List[str]) -> List[str]:
    present = [set(df.columns) for df in dfs]
    inter = set.intersection(*present) if present else set()
    keys = [k for k in candidates if k in inter]
    return keys

def safe_merge(dfs: List[pd.DataFrame], keys: List[str]) -> pd.DataFrame:
    if not dfs:
        return pd.DataFrame()
    if not keys:
        base = dfs[0].copy()
        for i, df in enumerate(dfs[1:], start=2):
            base = pd.concat([base.reset_index(drop=True), df.reset_index(drop=True)], axis=1)
        return base
    base = dfs[0].copy()
    for df in dfs[1:]:
        base = pd.merge(base, df, on=keys, how="outer", suffixes=("", "_dup"))
        dup_cols = [c for c in base.columns if c.endswith("_dup")]
        for c in dup_cols:
            orig = c[:-4]
            if orig in base.columns:
                base[orig] = base[orig].fillna(base[c])
                base.drop(columns=[c], inplace=True)
    return base

def detect_group_cols(df: pd.DataFrame, candidates: List[str]) -> List[str]:
    return [c for c in candidates if c in df.columns]

def numeric_biomarkers(df: pd.DataFrame, min_non_na_ratio: float) -> List[str]:
    n = len(df)
    num_cols = []
    for c in df.columns:
        if pd.api.types.is_numeric_dtype(df[c]):
            non_na = df[c].notna().sum() if n else 0
            if n == 0 or (non_na / max(1,n)) >= min_non_na_ratio:
                num_cols.append(c)
    return num_cols

def winsorize_series(s: pd.Series, p: float) -> pd.Series:
    if p <= 0 or p >= 0.5 or not pd.api.types.is_numeric_dtype(s):
        return s
    lo = s.quantile(p)
    hi = s.quantile(1-p)
    return s.clip(lower=lo, upper=hi)

def cols_by_missingness(df: pd.DataFrame, cols: List[str]) -> pd.DataFrame:
    total = len(df)
    data = []
    for c in cols:
        miss = df[c].isna().sum()
        data.append((c, miss, total - miss, (miss/total if total else np.nan)))
    out = pd.DataFrame(data, columns=["column","missing","non_missing","missing_ratio"]).sort_values("missing_ratio", ascending=False)
    return out

def top_variance_cols(df: pd.DataFrame, cols: List[str], k: int) -> List[str]:
    v = pd.Series({c: df[c].var(skipna=True) for c in cols})
    v = v.replace([np.inf, -np.inf], np.nan).dropna().sort_values(ascending=False)
    return list(v.head(k).index)

def ncols_nrows(n_plots: int, max_cols: int = 3) -> Tuple[int,int]:
    cols = min(max_cols, n_plots)
    rows = math.ceil(n_plots / cols) if n_plots else 1
    return rows, cols


## Carga y unión de ficheros

In [3]:
dfs = []
existing = []
for p in DATA_FILES:
    if Path(p).exists():
        try:
            df = read_any(p)
            dfs.append(df)
            existing.append(p)
        except Exception as e:
            print(f"Error leyendo {p}: {e}")
    else:
        print(f"No existe: {p}")
        
print(f"Leídos {len(dfs)} ficheros:", *existing, sep="\n - ")

if not dfs:
    raise SystemExit("No se encontraron ficheros. Edita DATA_FILES y vuelve a ejecutar.")

dfs = [df.rename(columns=lambda c: str(c).strip()) for df in dfs]

keys = common_keys(dfs, CANDIDATE_KEYS)
print("Claves comunes detectadas:", keys if keys else "(ninguna)")
data = safe_merge(dfs, keys)

if DROP_DUPLICATES:
    data = data.drop_duplicates()

if COERCE_NUMERIC:
    for c in data.columns:
        if data[c].dtype == object:
            if any(k in c.lower() for k in ["date", "exam", "visit", "acq", "scan"]):
                try:
                    data[c] = pd.to_datetime(data[c], errors="ignore")
                except:
                    pass
            try:
                _tmp = pd.to_numeric(data[c], errors="coerce")
                if _tmp.notna().sum() >= 0.2 * len(_tmp):
                    data[c] = _tmp
            except:
                pass

if WINSORIZE_PCT > 0:
    for c in data.columns:
        if pd.api.types.is_numeric_dtype(data[c]):
            data[c] = winsorize_series(data[c], WINSORIZE_PCT)

print(f"Dimensiones del dataset combinado: {data.shape}")
display(data.head(10))


Leídos 4 ficheros:
 - ./data/adni/demographics/PTDEMOG.csv
 - ./data/adni/demographics/UPENNBIOMK_ROCHE_ELECSYS_11Oct2025.csv
 - ./data/adni/demographics/All_Subjects_UCSFFSX7_11Oct2025.csv
 - ./data/adni/demographics/All_Subjects_UCBERKELEY_AMY_6MM_11Oct2025.csv
Claves comunes detectadas: ['RID', 'PTID']
Dimensiones del dataset combinado: (136814, 769)


Unnamed: 0,PHASE,PTID,RID,VISCODE,VISCODE2,VISDATE,PTSOURCE,PTGENDER,PTDOB,PTDOBYY,...,RIGHT_PALLIDUM_SUVR,RIGHT_PALLIDUM_VOLUME,RIGHT_PUTAMEN_SUVR,RIGHT_PUTAMEN_VOLUME,RIGHT_THALAMUS_PROPER_SUVR,RIGHT_THALAMUS_PROPER_VOLUME,RIGHT_VENTRALDC_SUVR,RIGHT_VENTRALDC_VOLUME,RIGHT_VESSEL_SUVR,RIGHT_VESSEL_VOLUME
0,ADNI1,022_S_0001,1,f,f,1124323200000000000,1.0,2.0,12/1944,1944.0,...,,,,,,,,,,
1,ADNI1,011_S_0002,2,sc,sc,1124236800000000000,1.0,1.0,04/1931,1931.0,...,,,,,,,,,,
2,ADNIGO,011_S_0002,2,sc,sc,1285113600000000000,1.0,1.0,04/1931,1931.0,...,,,,,,,,,,
3,ADNI2,011_S_0002,2,v06,m72,1316390400000000000,1.0,1.0,04/1931,1931.0,...,,,,,,,,,,
4,ADNI1,011_S_0003,3,sc,sc,1124323200000000000,1.0,1.0,05/1924,1924.0,...,,,,,,,,,,
5,ADNI1,011_S_0003,3,sc,sc,1124323200000000000,1.0,1.0,05/1924,1924.0,...,,,,,,,,,,
6,ADNI1,011_S_0003,3,sc,sc,1124323200000000000,1.0,1.0,05/1924,1924.0,...,,,,,,,,,,
7,ADNI1,011_S_0003,3,sc,sc,1124323200000000000,1.0,1.0,05/1924,1924.0,...,,,,,,,,,,
8,ADNI1,011_S_0003,3,sc,sc,1124323200000000000,1.0,1.0,05/1924,1924.0,...,,,,,,,,,,
9,ADNI1,011_S_0003,3,sc,sc,1124323200000000000,1.0,1.0,05/1924,1924.0,...,,,,,,,,,,


## Missingness y calidad de datos

In [4]:
bio_cols = numeric_biomarkers(data, MIN_NONNA_RATIO)
miss_df = cols_by_missingness(data, bio_cols)
from caas_jupyter_tools import display_dataframe_to_user
display_dataframe_to_user("Missingness por biomarcador", miss_df)

top = miss_df.head(30)
plt.figure()
plt.barh(top["column"], top["missing_ratio"])
plt.xlabel("Proporción de valores perdidos")
plt.ylabel("Biomarcador")
plt.title("Missingness (Top 30)")
plt.gca().invert_yaxis()
plt.tight_layout()
plt.show()


ModuleNotFoundError: No module named 'caas_jupyter_tools'

## Distribuciones univariantes

In [None]:
bio_cols = numeric_biomarkers(data, MIN_NONNA_RATIO)
bio_cols = bio_cols[:TOP_K]

rows, cols = ncols_nrows(len(bio_cols), max_cols=3)
fig, axes = plt.subplots(rows, cols, figsize=(5*cols, 3.5*rows))
axes = np.array(axes).reshape(rows, cols)

idx = 0
for r in range(rows):
    for c in range(cols):
        ax = axes[r, c]
        if idx < len(bio_cols):
            col = bio_cols[idx]
            x = data[col].dropna().values
            ax.hist(x, bins=30)
            ax.set_title(f"{col} — Histograma")
            ax.set_xlabel(col)
            ax.set_ylabel("Frecuencia")
        else:
            ax.axis("off")
        idx += 1

plt.tight_layout()
plt.show()


## Boxplots por grupo (DX/Diagnosis/VISCODE si existe)

In [None]:
group_cols = detect_group_cols(data, GROUP_COLS)
print("Columnas de grupo detectadas:", group_cols)

if group_cols:
    gcol = group_cols[0]
    bio_cols = numeric_biomarkers(data, MIN_NONNA_RATIO)[:TOP_K]
    n = len(bio_cols)
    rows, cols = ncols_nrows(n, max_cols=3)
    fig, axes = plt.subplots(rows, cols, figsize=(5*cols, 3.8*rows))
    axes = np.array(axes).reshape(rows, cols)
    idx = 0
    for r in range(rows):
        for c in range(cols):
            ax = axes[r, c]
            if idx < n:
                col = bio_cols[idx]
                groups = []
                labels = []
                for g, dfg in data[[gcol, col]].dropna().groupby(gcol):
                    groups.append(dfg[col].values)
                    labels.append(str(g))
                ax.boxplot(groups, labels=labels, showfliers=False)
                ax.set_title(f"{col} por {gcol}")
                ax.set_xlabel(gcol)
                ax.set_ylabel(col)
            else:
                ax.axis("off")
            idx += 1
    plt.tight_layout()
    plt.show()
else:
    print("No se detectó una columna de grupo. Omite esta sección o añade una en GROUP_COLS.")


## Correlaciones entre biomarcadores

In [None]:
bio_cols = numeric_biomarkers(data, MIN_NONNA_RATIO)[:TOP_K]
corr = data[bio_cols].corr(method="pearson", min_periods=30)

plt.figure(figsize=(max(6, 0.28*len(bio_cols)), max(4, 0.28*len(bio_cols))))
plt.imshow(corr, aspect="auto")
plt.xticks(range(len(bio_cols)), bio_cols, rotation=90)
plt.yticks(range(len(bio_cols)), bio_cols)
plt.colorbar(label="Correlación de Pearson")
plt.title("Matriz de correlación de biomarcadores")
plt.tight_layout()
plt.show()


## Dispersogramas de pares (top varianza)

In [None]:
bio_cols = numeric_biomarkers(data, MIN_NONNA_RATIO)
top_cols = top_variance_cols(data, bio_cols, k=min(6, len(bio_cols)))

pairs = list(itertools.combinations(top_cols, 2))[:9]
rows, cols = ncols_nrows(len(pairs), max_cols=3)
fig, axes = plt.subplots(rows, cols, figsize=(5*cols, 3.8*rows))
axes = np.array(axes).reshape(rows, cols)

idx = 0
for r in range(rows):
    for c in range(cols):
        ax = axes[r, c]
        if idx < len(pairs):
            xcol, ycol = pairs[idx]
            dfp = data[[xcol, ycol]].dropna()
            ax.scatter(dfp[xcol], dfp[ycol], s=8, alpha=0.7)
            ax.set_xlabel(xcol)
            ax.set_ylabel(ycol)
            ax.set_title(f"{ycol} vs {xcol}")
        else:
            ax.axis("off")
        idx += 1

plt.tight_layout()
plt.show()


## Series temporales por sujeto (si hay fecha/visita e ID)

In [None]:
date_cols = [c for c in data.columns if pd.api.types.is_datetime64_any_dtype(data[c])]
visit_cols = [c for c in data.columns if c.upper() in {"VISCODE","VISIT","VISITCODE"}]
id_cols = [c for c in data.columns if c.upper() in {"RID","PTID","SUBJECT","SUBJECTID","USUBJID","ID"}]

if (date_cols or visit_cols) and id_cols:
    time_col = date_cols[0] if date_cols else visit_cols[0]
    sid = id_cols[0]
    bio_cols = numeric_biomarkers(data, MIN_NONNA_RATIO)[:6]  # sólo unos pocos para legibilidad
    dd = data[[sid, time_col] + bio_cols].dropna(subset=[sid, time_col]).copy()
    dd = dd.sort_values([sid, time_col])
    counts = dd[sid].value_counts()
    multi = counts[counts >= 3].index.tolist()[:4]  # hasta 4 sujetos
    plt_count = len(multi) * len(bio_cols)
    rows, cols = ncols_nrows(plt_count, max_cols=3)

    fig, axes = plt.subplots(rows, cols, figsize=(5*cols, 3.8*rows))
    axes = np.array(axes).reshape(rows, cols)

    idx = 0
    for s in multi:
        dds = dd[dd[sid] == s]
        for col in bio_cols:
            ax = axes[idx // cols, idx % cols]
            ax.plot(dds[time_col], dds[col], marker="o", linewidth=1)
            ax.set_title(f"{col} — Sujeto {s}")
            ax.set_xlabel(str(time_col))
            ax.set_ylabel(col)
            idx += 1

    while idx < rows*cols:
        ax = axes[idx // cols, idx % cols]
        ax.axis("off")
        idx += 1

    plt.tight_layout()
    plt.show()
else:
    print("No se detectaron columnas de fecha/visita + ID. Omite series temporales.")


## Exportación de subconjunto limpio

In [None]:
bio_cols = numeric_biomarkers(data, MIN_NONNA_RATIO)
out_path = Path("/mnt/data/biomarkers_clean_export.csv")
data[bio_cols].to_csv(out_path, index=False)
print("Exportado:", out_path)
