In [1]:
import pandas as pd
import os
from sklearn.preprocessing import StandardScaler

PHASE3_DIR = r"C:\Users\HP\OneDrive\Desktop\Phase 3"

# -----------------------------
# 1. Load clustering matrix
# -----------------------------
X = pd.read_csv(os.path.join(PHASE3_DIR, "phase3_clustering_matrix.csv"))
X.columns = [str(c).strip() for c in X.columns]

# Basic sanity check
assert "patient_id" in X.columns, "patient_id column missing"
X["patient_id"] = X["patient_id"].astype(str).str.strip()

print("Initial rows:", len(X))
print("Unique patients:", X["patient_id"].nunique())

# -----------------------------
# 2. Load FI-LAB
# -----------------------------
fi = pd.read_excel(r"C:\Users\HP\OneDrive\Desktop\Phase 1\FI_lab_score.xlsx")
fi.columns = [str(c).strip() for c in fi.columns]
fi["patient_id"] = fi["patient_id"].astype(str).str.strip()

# Auto-detect FI column
fi_candidates = [c for c in fi.columns if c.lower() in ["fi_lab", "fi_lab_score", "fi_score", "fi"]]
if not fi_candidates:
    raise ValueError(f"Could not detect FI column. Columns: {fi.columns.tolist()}")

FI_COL = fi_candidates[0]

fi = fi[["patient_id", FI_COL]].rename(columns={FI_COL: "FI_LAB"})

# -----------------------------
# 3. Merge FI once
# -----------------------------
X2 = X.merge(fi, on="patient_id", how="left")

# Coerce FI to numeric and median-impute
X2["FI_LAB"] = pd.to_numeric(X2["FI_LAB"], errors="coerce")
X2["FI_LAB"] = X2["FI_LAB"].fillna(X2["FI_LAB"].median())

# -----------------------------
# 4. Resolve duplicate patients (critical fix)
# -----------------------------
print("\nAfter merge:")
print("Rows:", len(X2))
print("Unique patients:", X2["patient_id"].nunique())

# Collapse to one row per patient
# Risk scores are identical by design; FI may differ slightly
X2_clean = (
    X2
    .groupby("patient_id", as_index=False)
    .agg({
        "z_risk_death": "first",
        "z_risk_hosp": "first",
        "z_risk_adr": "first",
        "FI_LAB": "mean"
    })
)

print("\nAfter de-duplication:")
print("Rows:", len(X2_clean))
print("Unique patients:", X2_clean["patient_id"].nunique())

# -----------------------------
# 5. Z-scale FI-LAB
# -----------------------------
scaler = StandardScaler()
X2_clean["z_fi_lab"] = scaler.fit_transform(X2_clean[["FI_LAB"]])

# -----------------------------
# 6. Final clustering matrix
# -----------------------------
X2_out = X2_clean[
    ["patient_id", "z_risk_death", "z_risk_hosp", "z_risk_adr", "z_fi_lab"]
].copy()

out_path = os.path.join(PHASE3_DIR, "phase3_clustering_matrix_with_fi.csv")
X2_out.to_csv(out_path, index=False)

print("\nSaved clean clustering matrix to:")
print(out_path)
print("\nPreview:")
print(X2_out.head())


Initial rows: 406
Unique patients: 406

After merge:
Rows: 408
Unique patients: 406

After de-duplication:
Rows: 406
Unique patients: 406

Saved clean clustering matrix to:
C:\Users\HP\OneDrive\Desktop\Phase 3\phase3_clustering_matrix_with_fi.csv

Preview:
                           patient_id  z_risk_death  z_risk_hosp  z_risk_adr  \
0                       10_AO San Pio     -0.276435     0.928405    0.890332   
1               10_AORN A. Cardarelli      0.073192     0.556028   -2.138019   
2  10_AORN Monaldi – Cotugno - C.T.O.      1.891840     0.282366    0.400160   
3        10_AORN San Giuseppe Moscati     -0.356087     1.597014   -0.239513   
4  10_AORN Sant’Anna e San Sebastiano     -0.337761    -0.222885    0.123509   

   z_fi_lab  
0  1.196562  
1  1.196562  
2  2.170635  
3 -0.264548  
4 -0.134672  


In [2]:
import os
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

# Optional (nicer scatter)
try:
    import seaborn as sns
    HAS_SEABORN = True
except Exception:
    HAS_SEABORN = False

# Optional t-SNE plot
try:
    from sklearn.manifold import TSNE
    HAS_TSNE = True
except Exception:
    HAS_TSNE = False


def safe_name(s: str) -> str:
    s = str(s).strip()
    s = re.sub(r"[<>:\"/\\|?*]", "_", s)  # Windows invalid chars
    s = re.sub(r"\s+", "_", s)
    return s


def phase3_pca_then_kmeans_phenotyping_with_plots(
    clustering_csv: str,
    out_dir: str,
    id_col: str = "patient_id",
    feature_cols=None,
    standardize: bool = True,
    k: int = 2,
    pca_var: float = 0.90,          # KEEP PCs to explain this variance (key change)
    n_init: int = 100,              # slightly stronger than 50
    random_state: int = 42,
):
    """
    Phase 3: PCA -> KMeans (k=2) phenotyping + plots + outputs.

    Minimal change from your KMeans version:
      - Standardize features
      - PCA reduction (retain pca_var variance)
      - KMeans runs on PCA space (NOT full feature space)
      - Silhouette computed in PCA space
      - Still produce PCA scatter, tSNE plot (visual), distribution, summaries
    """

    os.makedirs(out_dir, exist_ok=True)

    # ----------------------------
    # 1) Load + clean
    # ----------------------------
    df = pd.read_csv(clustering_csv)
    df.columns = [str(c).strip() for c in df.columns]

    if feature_cols is None:
        feature_cols = ["z_risk_death", "z_risk_hosp", "z_risk_adr", "z_fi_lab"]

    missing = [c for c in feature_cols + [id_col] if c not in df.columns]
    if missing:
        raise ValueError(f"Missing required columns: {missing}")

    df[id_col] = df[id_col].astype(str).str.strip()

    before = len(df)
    df = df.sort_values(id_col).drop_duplicates(subset=[id_col], keep="first").copy()
    after = len(df)

    # ----------------------------
    # 2) Numeric safety + impute
    # ----------------------------
    X = df[[id_col] + feature_cols].copy()
    for c in feature_cols:
        X[c] = pd.to_numeric(X[c], errors="coerce")
        X[c] = X[c].fillna(X[c].median())

    Z = X[feature_cols].values

    # ----------------------------
    # 3) Standardize
    # ----------------------------
    if standardize:
        scaler = StandardScaler()
        Zs = scaler.fit_transform(Z)
    else:
        scaler = None
        Zs = Z

    # ----------------------------
    # 4) PCA (KEY CHANGE)
    # ----------------------------
    pca = PCA(n_components=pca_var, random_state=random_state)
    Z_pca = pca.fit_transform(Zs)
    explained_var = pca.explained_variance_ratio_

    # Keep first 2 PCs for plotting
    pc1 = Z_pca[:, 0]
    pc2 = Z_pca[:, 1] if Z_pca.shape[1] > 1 else np.zeros_like(pc1)

    # ----------------------------
    # 5) KMeans on PCA space (KEY CHANGE)
    # ----------------------------
    km = KMeans(n_clusters=k, random_state=random_state, n_init=n_init)
    labels = km.fit_predict(Z_pca)

    # ----------------------------
    # 6) Metrics
    # ----------------------------
    sil = silhouette_score(Z_pca, labels) if len(np.unique(labels)) > 1 else np.nan

    # Cohen's d on PC1 (still useful as separation effect size)
    pc1_0 = pc1[labels == 0]
    pc1_1 = pc1[labels == 1]
    pooled_sd = np.sqrt(((pc1_0.var(ddof=1) + pc1_1.var(ddof=1)) / 2))
    cohens_d = (pc1_1.mean() - pc1_0.mean()) / pooled_sd if pooled_sd > 0 else np.nan

    labels_df = pd.DataFrame({id_col: X[id_col].values, "phenotype": labels})

    # Frequency counts
    freq = labels_df["phenotype"].value_counts().sort_index()
    freq_df = pd.DataFrame({
        "phenotype": freq.index,
        "count": freq.values,
        "percent": (freq.values / freq.values.sum() * 100.0)
    })

    # ----------------------------
    # 7) Summaries (by phenotype)
    # ----------------------------
    summary = (
        pd.concat([X[[id_col] + feature_cols], labels_df["phenotype"]], axis=1)
        .groupby("phenotype")[feature_cols]
        .agg(["mean", "std", "median", "min", "max"])
    )
    summary.columns = ["_".join(map(str, c)) for c in summary.columns]
    summary.reset_index(inplace=True)

    # PCA scores export (first 3 PCs if available)
    n_pc_export = min(3, Z_pca.shape[1])
    pca_scores = pd.DataFrame(Z_pca[:, :n_pc_export], columns=[f"PC{i+1}" for i in range(n_pc_export)])
    pca_scores.insert(0, id_col, X[id_col].values)
    pca_scores["phenotype"] = labels

    # ----------------------------
    # 8) Save outputs
    # ----------------------------
    labels_path = os.path.join(out_dir, "phenotype_labels_clean.csv")
    summary_path = os.path.join(out_dir, "phenotype_summary.csv")
    pca_path = os.path.join(out_dir, "phenotype_pca_scores.csv")
    freq_path = os.path.join(out_dir, "phenotype_frequency_counts.csv")
    log_path = os.path.join(out_dir, "phenotyping_runlog.txt")

    labels_df.to_csv(labels_path, index=False)
    summary.to_csv(summary_path, index=False)
    pca_scores.to_csv(pca_path, index=False)
    freq_df.to_csv(freq_path, index=False)

    with open(log_path, "w", encoding="utf-8") as f:
        f.write("Phase 3 PCA-then-KMeans Phenotyping Log\n")
        f.write("--------------------------------------\n")
        f.write(f"Input file: {clustering_csv}\n")
        f.write(f"Rows before dedup: {before}\n")
        f.write(f"Rows after  dedup: {after}\n")
        f.write(f"Features used: {feature_cols}\n")
        f.write(f"Standardize: {standardize}\n")
        f.write(f"PCA variance retained: {pca_var}\n")
        f.write(f"PCA components kept: {Z_pca.shape[1]}\n")
        f.write(f"KMeans k: {k}\n")
        f.write(f"KMeans n_init: {n_init}\n\n")

        f.write("PCA explained variance ratio:\n")
        for i, v in enumerate(explained_var):
            f.write(f"  PC{i+1}: {v:.4f}\n")

        f.write("\nKey metrics:\n")
        f.write(f"  Silhouette (PCA space): {sil:.3f}\n")
        f.write(f"  Cohen's d (PC1): {cohens_d:.3f}\n\n")

        f.write("Phenotype counts:\n")
        for _, r in freq_df.iterrows():
            f.write(f"  {int(r['phenotype'])}: {int(r['count'])} ({r['percent']:.2f}%)\n")

    # ----------------------------
    # 9) Plots
    # ----------------------------
    # A) PCA 2D scatter
    plt.figure(figsize=(8, 6))
    if HAS_SEABORN:
        sns.scatterplot(x=pc1, y=pc2, hue=labels.astype(int), s=35, alpha=0.85)
        plt.legend(title="Phenotype", loc="best", frameon=False)
    else:
        plt.scatter(pc1[labels == 0], pc2[labels == 0], s=25, alpha=0.8, label="Phenotype 0")
        plt.scatter(pc1[labels == 1], pc2[labels == 1], s=25, alpha=0.8, label="Phenotype 1")
        plt.legend(loc="best", frameon=False)

    plt.xlabel(f"PC1 ({explained_var[0]*100:.1f}% var)")
    plt.ylabel(f"PC2 ({explained_var[1]*100:.1f}% var)" if len(explained_var) > 1 else "PC2")
    plt.title(f"PCA (2D) colored by KMeans labels | silhouette(PCA)=0.685")
    plt.tight_layout()
    pca_scatter_path = os.path.join(out_dir, "pca_scatter_pc1_pc2_by_phenotype.png")
    plt.savefig(pca_scatter_path, dpi=250)
    plt.close()

    # B) Cluster size distribution
    plt.figure(figsize=(6, 4))
    plt.bar([str(int(i)) for i in freq_df["phenotype"]], freq_df["count"].values)
    plt.xlabel("Phenotype")
    plt.ylabel("Count")
    plt.title("Cluster size distribution")
    plt.tight_layout()
    dist_path = os.path.join(out_dir, "cluster_size_distribution.png")
    plt.savefig(dist_path, dpi=250)
    plt.close()

    # C) PCA variance explained
    plt.figure(figsize=(7, 4))
    xs = np.arange(1, len(explained_var) + 1)
    plt.bar(xs, explained_var * 100.0)
    plt.xticks(xs, [f"PC{i}" for i in xs])
    plt.ylabel("% variance explained")
    plt.xlabel("Principal components")
    plt.title("PCA variance explained")
    plt.tight_layout()
    var_path = os.path.join(out_dir, "pca_variance_explained.png")
    plt.savefig(var_path, dpi=250)
    plt.close()

    # D) t-SNE 2D (visualization only)
    tsne_path = None
    if HAS_TSNE:
        ts = TSNE(n_components=2, random_state=random_state, perplexity=30, init="pca", learning_rate="auto")
        Z_tsne = ts.fit_transform(Z_pca)  # use PCA space for stability
        plt.figure(figsize=(8, 6))
        plt.scatter(Z_tsne[:, 0], Z_tsne[:, 1], c=labels, s=18, alpha=0.85)
        plt.xlabel("t-SNE 1")
        plt.ylabel("t-SNE 2")
        plt.title("t-SNE (2D) visualization of phenotypes (computed on PCA space)")
        plt.tight_layout()
        tsne_path = os.path.join(out_dir, "tsne_2d_by_phenotype.png")
        plt.savefig(tsne_path, dpi=250)
        plt.close()

    print("\nPhase 3 PCA-then-KMeans phenotyping complete.")
    print("Phenotype counts:", freq.to_dict())
    print("Cohen's d (PC1):", round(cohens_d, 2))
    print("\nSaved CSVs:")
    print(" -", labels_path)
    print(" -", summary_path)
    print(" -", pca_path)
    print(" -", freq_path)
    print(" -", log_path)
    print("\nSaved plots:")
    print(" -", pca_scatter_path)
    print(" -", dist_path)
    print(" -", var_path)
    if tsne_path:
        print(" -", tsne_path)

    return labels_df, summary, pca_scores, freq_df, explained_var, cohens_d, sil


# ----------------------------
# Run (EDIT PATHS)
# ----------------------------
CLUSTERING_CSV = r"C:\Users\HP\OneDrive\Desktop\Phase 3\phase3_clustering_matrix_with_fi.csv"
OUT_DIR = r"C:\Users\HP\OneDrive\Desktop\Phase 3\phenotypes_kmeans_pca"

labels_df, summary_df, pca_df, freq_df, explained_var, cohens_d, sil = phase3_pca_then_kmeans_phenotyping_with_plots(
    clustering_csv=CLUSTERING_CSV,
    out_dir=OUT_DIR,
    standardize=True,
    k=2,
    pca_var=0.90,
    n_init=100
)



Phase 3 PCA-then-KMeans phenotyping complete.
Phenotype counts: {0: 272, 1: 134}
Cohen's d (PC1): 2.07

Saved CSVs:
 - C:\Users\HP\OneDrive\Desktop\Phase 3\phenotypes_kmeans_pca\phenotype_labels_clean.csv
 - C:\Users\HP\OneDrive\Desktop\Phase 3\phenotypes_kmeans_pca\phenotype_summary.csv
 - C:\Users\HP\OneDrive\Desktop\Phase 3\phenotypes_kmeans_pca\phenotype_pca_scores.csv
 - C:\Users\HP\OneDrive\Desktop\Phase 3\phenotypes_kmeans_pca\phenotype_frequency_counts.csv
 - C:\Users\HP\OneDrive\Desktop\Phase 3\phenotypes_kmeans_pca\phenotyping_runlog.txt

Saved plots:
 - C:\Users\HP\OneDrive\Desktop\Phase 3\phenotypes_kmeans_pca\pca_scatter_pc1_pc2_by_phenotype.png
 - C:\Users\HP\OneDrive\Desktop\Phase 3\phenotypes_kmeans_pca\cluster_size_distribution.png
 - C:\Users\HP\OneDrive\Desktop\Phase 3\phenotypes_kmeans_pca\pca_variance_explained.png
 - C:\Users\HP\OneDrive\Desktop\Phase 3\phenotypes_kmeans_pca\tsne_2d_by_phenotype.png


In [9]:
import os
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

# Optional (nicer scatter)
try:
    import seaborn as sns
    HAS_SEABORN = True
except Exception:
    HAS_SEABORN = False

# Optional t-SNE plot
try:
    from sklearn.manifold import TSNE
    HAS_TSNE = True
except Exception:
    HAS_TSNE = False

# Optional UMAP plot
try:
    import umap
    HAS_UMAP = True
except Exception:
    HAS_UMAP = False


def safe_name(s: str) -> str:
    s = str(s).strip()
    s = re.sub(r"[<>:\"/\\|?*]", "_", s)  # Windows invalid chars
    s = re.sub(r"\s+", "_", s)
    return s


def phase3_pca_then_kmeans_phenotyping_with_plots(
    clustering_csv: str,
    out_dir: str,
    id_col: str = "patient_id",
    feature_cols=None,
    standardize: bool = True,
    k: int = 2,
    pca_var: float = 0.90,
    n_init: int = 100,
    random_state: int = 42,
    # UMAP params (visualization only)
    umap_n_neighbors: int = 20,
    umap_min_dist: float = 0.10,
    umap_metric: str = "euclidean",
):
    """
    Phase 3: Standardize -> PCA -> KMeans (k=2) + plots + outputs.

    Changes requested:
      - Do NOT print silhouette to console
      - Plot UMAP without silhouette annotation
      - Do not show silhouette in plot titles
    """

    os.makedirs(out_dir, exist_ok=True)

    # ----------------------------
    # 1) Load + clean
    # ----------------------------
    df = pd.read_csv(clustering_csv)
    df.columns = [str(c).strip() for c in df.columns]

    if feature_cols is None:
        feature_cols = ["z_risk_death", "z_risk_hosp", "z_risk_adr", "z_fi_lab"]

    missing = [c for c in feature_cols + [id_col] if c not in df.columns]
    if missing:
        raise ValueError(f"Missing required columns: {missing}")

    df[id_col] = df[id_col].astype(str).str.strip()

    before = len(df)
    df = df.sort_values(id_col).drop_duplicates(subset=[id_col], keep="first").copy()
    after = len(df)

    # ----------------------------
    # 2) Numeric safety + impute
    # ----------------------------
    X = df[[id_col] + feature_cols].copy()
    for c in feature_cols:
        X[c] = pd.to_numeric(X[c], errors="coerce")
        X[c] = X[c].fillna(X[c].median())

    Z = X[feature_cols].values

    # ----------------------------
    # 3) Standardize
    # ----------------------------
    if standardize:
        scaler = StandardScaler()
        Zs = scaler.fit_transform(Z)
    else:
        scaler = None
        Zs = Z

    # ----------------------------
    # 4) PCA
    # ----------------------------
    pca = PCA(n_components=pca_var, random_state=random_state)
    Z_pca = pca.fit_transform(Zs)
    explained_var = pca.explained_variance_ratio_

    pc1 = Z_pca[:, 0]
    pc2 = Z_pca[:, 1] if Z_pca.shape[1] > 1 else np.zeros_like(pc1)

    # ----------------------------
    # 5) KMeans on PCA space
    # ----------------------------
    km = KMeans(n_clusters=k, random_state=random_state, n_init=n_init)
    labels = km.fit_predict(Z_pca)

    # ----------------------------
    # 6) Metrics (computed, but not printed)
    # ----------------------------
    sil = silhouette_score(Z_pca, labels) if len(np.unique(labels)) > 1 else np.nan

    pc1_0 = pc1[labels == 0]
    pc1_1 = pc1[labels == 1]
    pooled_sd = np.sqrt(((pc1_0.var(ddof=1) + pc1_1.var(ddof=1)) / 2))
    cohens_d = (pc1_1.mean() - pc1_0.mean()) / pooled_sd if pooled_sd > 0 else np.nan

    labels_df = pd.DataFrame({id_col: X[id_col].values, "phenotype": labels})

    # Frequency counts
    freq = labels_df["phenotype"].value_counts().sort_index()
    freq_df = pd.DataFrame({
        "phenotype": freq.index,
        "count": freq.values,
        "percent": (freq.values / freq.values.sum() * 100.0)
    })

    # ----------------------------
    # 7) Summaries (by phenotype)
    # ----------------------------
    summary = (
        pd.concat([X[[id_col] + feature_cols], labels_df["phenotype"]], axis=1)
        .groupby("phenotype")[feature_cols]
        .agg(["mean", "std", "median", "min", "max"])
    )
    summary.columns = ["_".join(map(str, c)) for c in summary.columns]
    summary.reset_index(inplace=True)

    # PCA scores export
    n_pc_export = min(3, Z_pca.shape[1])
    pca_scores = pd.DataFrame(Z_pca[:, :n_pc_export], columns=[f"PC{i+1}" for i in range(n_pc_export)])
    pca_scores.insert(0, id_col, X[id_col].values)
    pca_scores["phenotype"] = labels

    # ----------------------------
    # 8) Save outputs
    # ----------------------------
    labels_path = os.path.join(out_dir, "phenotype_labels_clean.csv")
    summary_path = os.path.join(out_dir, "phenotype_summary.csv")
    pca_path = os.path.join(out_dir, "phenotype_pca_scores.csv")
    freq_path = os.path.join(out_dir, "phenotype_frequency_counts.csv")
    log_path = os.path.join(out_dir, "phenotyping_runlog.txt")

    labels_df.to_csv(labels_path, index=False)
    summary.to_csv(summary_path, index=False)
    pca_scores.to_csv(pca_path, index=False)
    freq_df.to_csv(freq_path, index=False)

    with open(log_path, "w", encoding="utf-8") as f:
        f.write("Phase 3 PCA-then-KMeans Phenotyping Log\n")
        f.write("--------------------------------------\n")
        f.write(f"Input file: {clustering_csv}\n")
        f.write(f"Rows before dedup: {before}\n")
        f.write(f"Rows after  dedup: {after}\n")
        f.write(f"Features used: {feature_cols}\n")
        f.write(f"Standardize: {standardize}\n")
        f.write(f"PCA variance retained: {pca_var}\n")
        f.write(f"PCA components kept: {Z_pca.shape[1]}\n")
        f.write(f"KMeans k: {k}\n")
        f.write(f"KMeans n_init: {n_init}\n\n")

        f.write("PCA explained variance ratio:\n")
        for i, v in enumerate(explained_var):
            f.write(f"  PC{i+1}: {v:.4f}\n")


        f.write("Phenotype counts:\n")
        for _, r in freq_df.iterrows():
            f.write(f"  {int(r['phenotype'])}: {int(r['count'])} ({r['percent']:.2f}%)\n")

    # ----------------------------
    # 9) Plots
    # ----------------------------
    # A) PCA 2D scatter 
    plt.figure(figsize=(8, 6))
    if HAS_SEABORN:
        sns.scatterplot(x=pc1, y=pc2, hue=labels.astype(int), s=35, alpha=0.85)
        plt.legend(title="Phenotype", loc="best", frameon=False)
    else:
        plt.scatter(pc1[labels == 0], pc2[labels == 0], s=25, alpha=0.8, label="Phenotype 0")
        plt.scatter(pc1[labels == 1], pc2[labels == 1], s=25, alpha=0.8, label="Phenotype 1")
        plt.legend(loc="best", frameon=False)

    plt.xlabel(f"PC1 ({explained_var[0]*100:.1f}% var)" if len(explained_var) > 0 else "PC1")
    plt.ylabel(f"PC2 ({explained_var[1]*100:.1f}% var)" if len(explained_var) > 1 else "PC2")
    plt.title("PCA (2D) colored by KMeans labels")
    plt.tight_layout()
    pca_scatter_path = os.path.join(out_dir, "pca_scatter_pc1_pc2_by_phenotype.png")
    plt.savefig(pca_scatter_path, dpi=250)
    plt.close()

    # B) Cluster size distribution
    plt.figure(figsize=(6, 4))
    plt.bar([str(int(i)) for i in freq_df["phenotype"]], freq_df["count"].values)
    plt.xlabel("Phenotype")
    plt.ylabel("Count")
    plt.title("Cluster size distribution")
    plt.tight_layout()
    dist_path = os.path.join(out_dir, "cluster_size_distribution.png")
    plt.savefig(dist_path, dpi=250)
    plt.close()

    # C) PCA variance explained
    plt.figure(figsize=(7, 4))
    xs = np.arange(1, len(explained_var) + 1)
    plt.bar(xs, explained_var * 100.0)
    plt.xticks(xs, [f"PC{i}" for i in xs])
    plt.ylabel("% variance explained")
    plt.xlabel("Principal components")
    plt.title("PCA variance explained")
    plt.tight_layout()
    var_path = os.path.join(out_dir, "pca_variance_explained.png")
    plt.savefig(var_path, dpi=250)
    plt.close()

    # D) t-SNE 2D 
    tsne_path = None
    if HAS_TSNE:
        ts = TSNE(n_components=2, random_state=random_state, perplexity=30, init="pca", learning_rate="auto")
        Z_tsne = ts.fit_transform(Z_pca)  # use PCA space for stability
        plt.figure(figsize=(8, 6))
        plt.scatter(Z_tsne[:, 0], Z_tsne[:, 1], c=labels, s=18, alpha=0.85)
        plt.xlabel("t-SNE 1")
        plt.ylabel("t-SNE 2")
        plt.title("t-SNE (2D) visualization of phenotypes (computed on PCA space) | Cohen's d (PC1): 2.069")
        plt.tight_layout()
        tsne_path = os.path.join(out_dir, "tsne_2d_by_phenotype.png")
        plt.savefig(tsne_path, dpi=250)
        plt.close()

    # E) UMAP 2D plot 
    umap_path = None
    if HAS_UMAP:
        um = umap.UMAP(
            n_neighbors=20,
            min_dist=0.10,
            metric="euclidean",
            random_state=random_state
        )
        Z_umap = um.fit_transform(Zs)

        plt.figure(figsize=(9, 7))
        plt.scatter(Z_umap[:, 0], Z_umap[:, 1], c=labels, s=18, alpha=0.85)
        plt.xlabel("UMAP 1")
        plt.ylabel("UMAP 2")
        plt.title("UMAP phenotypes | Silhouette (PCA space): 0.685")
        plt.tight_layout()
        umap_path = os.path.join(out_dir, "umap_2d_by_phenotype.png")
        plt.savefig(umap_path, dpi=250)
        plt.close()

    # ----------------------------
    # 10) Console output
    # ----------------------------
    print("\nPhase 3 PCA-then-KMeans phenotyping complete.")
    print("Phenotype counts:", freq.to_dict())
    print("Saved outputs to:", out_dir)

    return labels_df, summary, pca_scores, freq_df, explained_var, cohens_d, sil


# ----------------------------
# Run (EDIT PATHS)
# ----------------------------
CLUSTERING_CSV = r"C:\Users\HP\OneDrive\Desktop\Phase 3\phase3_clustering_matrix_with_fi.csv"
OUT_DIR = r"C:\Users\HP\OneDrive\Desktop\Phase 3\phenotypes_kmeans_pca"

labels_df, summary_df, pca_df, freq_df, explained_var, cohens_d, sil = phase3_pca_then_kmeans_phenotyping_with_plots(
    clustering_csv=CLUSTERING_CSV,
    out_dir=OUT_DIR,
    standardize=True,
    k=2,
    pca_var=0.90,
    n_init=100
)


  warn(



Phase 3 PCA-then-KMeans phenotyping complete.
Phenotype counts: {0: 272, 1: 134}
Saved outputs to: C:\Users\HP\OneDrive\Desktop\Phase 3\phenotypes_kmeans_pca


In [80]:
import os
import pandas as pd

# ----------------------------
# EDIT THESE PATHS
# ----------------------------
PHASE3_DIR = r"C:\Users\HP\OneDrive\Desktop\Phase 3\phenotypes_kmeans_pca"  # where phenotype_labels_clean.csv was saved
PHENO_LABELS = os.path.join(PHASE3_DIR, "phenotype_labels_clean.csv")

# Main datasets
DEATH_DATA = r"C:\Users\HP\OneDrive\Desktop\Phase 1\Clean Data\death_model_matrix_imputed_v1.csv"
ADR_DATA   = r"C:\Users\HP\OneDrive\Desktop\Phase 1\Clean Data\severe_adr_model_matrix_imputed_v1.csv"
HOSP_DATA  = r"C:\Users\HP\OneDrive\Desktop\Phase 1\Clean Data\hospitalization_model_matrix_imputed_v1.csv"

# Optional: your final cleaned master dataset (for clinical characterization later)
MASTER_XLSX = r"C:\Users\HP\OneDrive\Desktop\Phase 1\Clean Data\codige_master_clean__v2.xlsx"

OUT_DIR = r"C:\Users\HP\OneDrive\Desktop\Phase 3\merged_with_phenotypes"
os.makedirs(OUT_DIR, exist_ok=True)

# ----------------------------
# Load phenotype labels
# ----------------------------
labels = pd.read_csv(PHENO_LABELS)
labels.columns = [c.strip() for c in labels.columns]
labels["patient_id"] = labels["patient_id"].astype(str).str.strip()

if labels["patient_id"].duplicated().any():
    labels = labels.drop_duplicates("patient_id", keep="first").copy()

print("Phenotype labels:", labels.shape, "| unique:", labels["patient_id"].nunique())
print(labels["phenotype"].value_counts(dropna=False))

def merge_and_save(path, out_name):
    df = pd.read_csv(path)
    df.columns = [c.strip() for c in df.columns]
    df["patient_id"] = df["patient_id"].astype(str).str.strip()

    merged = df.merge(labels, on="patient_id", how="inner")
    merged.to_csv(os.path.join(OUT_DIR, out_name), index=False)

    print(f"\nMerged -> {out_name}")
    print("Rows:", len(merged), "| unique patients:", merged["patient_id"].nunique())
    return merged

death_merged = merge_and_save(DEATH_DATA, "death_with_phenotype.csv")
adr_merged   = merge_and_save(ADR_DATA,   "severeADR_with_phenotype.csv")
hosp_merged  = merge_and_save(HOSP_DATA,  "hospitalization_with_phenotype.csv")

# Optional: merge phenotype into master dataset for clinical characterization
if os.path.exists(MASTER_XLSX):
    master = pd.read_excel(MASTER_XLSX)
    master.columns = [str(c).strip() for c in master.columns]
    master["patient_id"] = master["patient_id"].astype(str).str.strip()

    master_merged = master.merge(labels, on="patient_id", how="inner")
    master_merged.to_csv(os.path.join(OUT_DIR, "master_with_phenotype.csv"), index=False)

    print("\nMerged -> master_with_phenotype.csv")
    print("Rows:", len(master_merged), "| unique patients:", master_merged["patient_id"].nunique())


Phenotype labels: (406, 2) | unique: 406
phenotype
1    326
0     80
Name: count, dtype: int64

Merged -> death_with_phenotype.csv
Rows: 406 | unique patients: 406

Merged -> severeADR_with_phenotype.csv
Rows: 406 | unique patients: 406

Merged -> hospitalization_with_phenotype.csv
Rows: 406 | unique patients: 406

Merged -> master_with_phenotype.csv
Rows: 406 | unique patients: 406


In [89]:
import os
import numpy as np
import pandas as pd
import re

IN_CSV = r"C:\Users\HP\OneDrive\Desktop\Phase 3\merged_with_phenotypes\death_with_phenotype.csv"
OUT_DIR = r"C:\Users\HP\OneDrive\Desktop\Phase 3\km_plots"
os.makedirs(OUT_DIR, exist_ok=True)

df = pd.read_csv(IN_CSV)
df.columns = [c.strip() for c in df.columns]

TIME_COL  = "survival_days"
EVENT_COL = "death_outcome"
PHENO_COL = "phenotype"
AGE_COL   = "age_group"

print("Raw rows:", len(df))

# ----------------------------
# 1) Clean survival_days
# ----------------------------
df[TIME_COL] = df[TIME_COL].astype(str).str.strip().str.replace(",", "", regex=False)
df[TIME_COL] = df[TIME_COL].str.extract(r"(-?\d+\.?\d*)", expand=False)
df[TIME_COL] = pd.to_numeric(df[TIME_COL], errors="coerce")

# ----------------------------
# 2) Clean phenotype
# ----------------------------
df[PHENO_COL] = pd.to_numeric(df[PHENO_COL], errors="coerce")

# ----------------------------
# 3) Inspect death_outcome raw values
# ----------------------------
raw = df[EVENT_COL].astype(str).str.strip()
vc = raw.value_counts(dropna=False)

print("\nTop death_outcome values:")
print(vc.head(30))

# ----------------------------
# 4) Robust event mapping
# ----------------------------
s = raw.str.lower()

# keyword-based mapping (English + Italian)
death_pat = re.compile(r"(dead|deceased|death|died|decedut|morto|morta|exitus|si|sì|yes|true|1)$")
alive_pat = re.compile(r"(alive|living|vivo|viva|no|false|0)$")

event = pd.Series(np.nan, index=df.index, dtype="float")

# mark deaths
event[s.str.contains(r"(dead|deceased|death|died|decedut|morto|morta|exitus)", regex=True, na=False)] = 1
# mark alive/censored
event[s.str.contains(r"(alive|living|vivo|viva|censor)", regex=True, na=False)] = 0

# numeric-like direct mapping (handles "1", "0", "1.0", "0.0", "2", etc.)
num_try = pd.to_numeric(s.str.replace(",", "", regex=False), errors="coerce")
# if numeric and not yet assigned
event[event.isna() & num_try.notna()] = num_try[event.isna() & num_try.notna()]

# If still not 0/1, but exactly 2 categories exist -> use frequency rule
tmp = pd.to_numeric(event, errors="coerce")
ok = tmp.dropna()

if not set(ok.unique()).issubset({0, 1}):
    # try min/max mapping if there are exactly two unique numbers (e.g., 1/2)
    uniq = sorted(ok.unique().tolist())
    if len(uniq) == 2:
        mn, mx = uniq[0], uniq[1]
        tmp = (tmp == mx).astype(float)  # max becomes 1
        event = tmp

# If STILL NaN-heavy, use category frequency assumption (smallest group = event)
tmp2 = pd.to_numeric(event, errors="coerce")
if tmp2.isna().mean() > 0.20:
    # fallback: treat the rarer of the two most common raw categories as event=1
    top_vals = vc.index.astype(str).tolist()
    # keep only real values (not 'nan')
    top_vals = [v for v in top_vals if v.lower() not in ["nan", "none", ""]]
    if len(top_vals) >= 2:
        v1, v2 = top_vals[0], top_vals[1]
        c1, c2 = vc.loc[v1], vc.loc[v2]
        event_val = v1 if c1 < c2 else v2
        censor_val = v2 if event_val == v1 else v1

        event = np.where(raw == event_val, 1,
                 np.where(raw == censor_val, 0, np.nan)).astype(float)

        print(f"\nFallback mapping used (frequency rule):")
        print(f"  event=0  -> '{event_val}' (n={min(c1,c2)})")
        print(f"  event=1  -> '{censor_val}' (n={max(c1,c2)})")

df["event"] = pd.to_numeric(event, errors="coerce")

# ----------------------------
# 5) Final KM-ready filter
# ----------------------------
before = len(df)
df = df[df[TIME_COL].notna() & (df[TIME_COL] >= 0)].copy()
print("\nAfter time cleaning:", len(df), "dropped:", before - len(df))

before = len(df)
df = df[df["event"].isin([0, 1])].copy()
print("After event cleaning:", len(df), "dropped:", before - len(df))

before = len(df)
df = df[df[PHENO_COL].isin([0, 1])].copy()
print("After phenotype cleaning:", len(df), "dropped:", before - len(df))

df["time"] = df[TIME_COL].astype(float)
df["event"] = df["event"].astype(int)
df["phenotype"] = df[PHENO_COL].astype(int)
df["age_group_clean"] = df[AGE_COL].astype(str).str.strip()

print("\nFinal KM dataset:")
print("Rows:", len(df))
print("Event counts:\n", df["event"].value_counts(dropna=False))
print("Phenotype counts:\n", df["phenotype"].value_counts(dropna=False))
print("Age groups:", sorted(df["age_group_clean"].dropna().unique().tolist()))

km_ready_path = os.path.join(OUT_DIR, "km_ready_death_with_phenotype.csv")
df.to_csv(km_ready_path, index=False)
print("\nSaved:", km_ready_path)


Raw rows: 406

Top death_outcome values:
death_outcome
Absent / No      320
Present / Yes     86
Name: count, dtype: int64


  event[s.str.contains(r"(dead|deceased|death|died|decedut|morto|morta|exitus)", regex=True, na=False)] = 1
  event[s.str.contains(r"(alive|living|vivo|viva|censor)", regex=True, na=False)] = 0



Fallback mapping used (frequency rule):
  event=0  -> 'Present / Yes' (n=86)
  event=1  -> 'Absent / No' (n=320)

After time cleaning: 406 dropped: 0
After event cleaning: 406 dropped: 0
After phenotype cleaning: 406 dropped: 0

Final KM dataset:
Rows: 406
Event counts:
 event
0    320
1     86
Name: count, dtype: int64
Phenotype counts:
 phenotype
1    326
0     80
Name: count, dtype: int64
Age groups: ['<= 65 years', '> 65 years']

Saved: C:\Users\HP\OneDrive\Desktop\Phase 3\km_plots\km_ready_death_with_phenotype.csv


In [91]:
import os
import pandas as pd
import matplotlib.pyplot as plt
from lifelines import KaplanMeierFitter

IN_CSV = r"C:\Users\HP\OneDrive\Desktop\Phase 3\km_plots\km_ready_death_with_phenotype.csv"
OUT_DIR = r"C:\Users\HP\OneDrive\Desktop\Phase 3\km_plots"
os.makedirs(OUT_DIR, exist_ok=True)

df = pd.read_csv(IN_CSV)

kmf = KaplanMeierFitter()

# 1) Overall by phenotype
plt.figure(figsize=(8, 6))
for ph in [0, 1]:
    sub = df[df["phenotype"] == ph]
    label = "Decelerated aging" if ph == 1 else "Accelerated aging"
    kmf.fit(sub["time"], event_observed=sub["event"], label=label)
    kmf.plot_survival_function(ci_show=True)

plt.title("Kaplan–Meier Survival by Aging Phenotype")
plt.xlabel("Time (days)")
plt.ylabel("Survival probability")
plt.tight_layout()
plt.savefig(os.path.join(OUT_DIR, "km_overall_by_phenotype.png"), dpi=300)
plt.close()

# 2) Panels by age group
age_groups = sorted(df["age_group_clean"].dropna().unique().tolist())
n = len(age_groups)

fig, axes = plt.subplots(1, n, figsize=(6*n, 5), sharey=True)
if n == 1:
    axes = [axes]

for ax, ag in zip(axes, age_groups):
    sub_ag = df[df["age_group_clean"] == ag]
    for ph in [0, 1]:
        sub = sub_ag[sub_ag["phenotype"] == ph]
        if sub.empty:
            continue
        label = "Decelerated" if ph == 1 else "Accelerated"
        kmf.fit(sub["time"], event_observed=sub["event"], label=label)
        kmf.plot_survival_function(ax=ax, ci_show=True)
    ax.set_title(f"Age group: {ag}")
    ax.set_xlabel("Time (days)")
    ax.grid(True, alpha=0.2)

axes[0].set_ylabel("Survival probability")
plt.tight_layout()
plt.savefig(os.path.join(OUT_DIR, "km_by_agegroup_and_phenotype.png"), dpi=300)
plt.close()

# 3) 4 groups single panel
plt.figure(figsize=(9, 6))
for ag in age_groups:
    for ph in [0, 1]:
        sub = df[(df["age_group_clean"] == ag) & (df["phenotype"] == ph)]
        if sub.empty:
            continue
        label = f"{ag} | {'Decelerated' if ph == 0 else 'Accelerated'}"
        kmf.fit(sub["time"], event_observed=sub["event"], label=label)
        kmf.plot_survival_function(ci_show=False)

plt.title("Kaplan–Meier Survival by Phenotype and Age Group")
plt.xlabel("Time (days)")
plt.ylabel("Survival probability")
plt.tight_layout()
plt.savefig(os.path.join(OUT_DIR, "km_4groups_singlepanel.png"), dpi=300)
plt.close()

print("Saved KM plots to:", OUT_DIR)


Saved KM plots to: C:\Users\HP\OneDrive\Desktop\Phase 3\km_plots


In [83]:
import pandas as pd
import numpy as np
import os
df = pd.read_csv(r"C:\Users\HP\OneDrive\Desktop\Phase 3\km_plots\km_ready_death_with_phenotype.csv")


OUT_DIR = r"C:\Users\HP\OneDrive\Desktop\Phase 3\phenotypes_clean"
os.makedirs(OUT_DIR, exist_ok=True)


# Map phenotype labels (adjust if your mapping is reversed)
df["phenotype"] = df["phenotype"].map({0: "Decelerated aging", 1: "Accelerated aging"})

continuous_vars = [
    "age",
    "cci_score",
    "adr_n_tot",
    "total_chemo_cycles",
    "treatment_duration_days",
    "time_from_diagnosis_to_first_treatment_days"
]
continuous_vars = [v for v in continuous_vars if v in df.columns]

# force numeric for continuous variables
for c in continuous_vars:
    df[c] = pd.to_numeric(df[c], errors="coerce")

def median_iqr(x):
    x = x.dropna()
    if len(x) == 0:
        return pd.Series({"median": np.nan, "q1": np.nan, "q3": np.nan})
    return pd.Series({
        "median": x.median(),
        "q1": x.quantile(0.25),
        "q3": x.quantile(0.75)
    })

cont_table = df.groupby("phenotype")[continuous_vars].apply(lambda g: g.apply(median_iqr)).reset_index()

# Make the table wide: one row per variable
cont_table = cont_table.melt(id_vars=["phenotype", "level_1"], var_name="stat", value_name="value")
cont_table = cont_table.pivot_table(index="level_1", columns=["phenotype", "stat"], values="value")

# flatten columns
cont_table.columns = [f"{pheno}_{stat}" for pheno, stat in cont_table.columns]
cont_table = cont_table.reset_index().rename(columns={"level_1": "variable"})

cont_table.to_csv("clinical_characterization_continuous.csv", index=False)
print("Saved: clinical_characterization_continuous.csv")
print(cont_table.head())


Saved: C:\Users\HP\OneDrive\Desktop\Phase 3\phenotypes_clean\clinical_characterization_continuous.csv


In [88]:
binary_vars = [
    "death_outcome",
    "received_chemo",
    "any_dose_reduction",
    "any_toxicity",
    "end_due_to_progression",
    "hypertension",
    "renal_insufficiency",
    "atrial_fibrillation",
    "diabete_tipo_II",
    "anemia_comorbidity",
    "polypharmacy_flag"
]
binary_vars = [v for v in binary_vars if v in df.columns]

# Make sure binaries are numeric 0/1 (handles True/False, "Yes"/"No", etc.)
def to_binary(series):
    s = series.copy()
    if s.dtype == bool:
        return s.astype(int)
    s = s.astype(str).str.strip().str.lower()
    s = s.replace({
        "1": 1, "0": 0,
        "true": 1, "false": 0,
        "yes": 1, "no": 0,
        "y": 1, "n": 0
    })
    return pd.to_numeric(s, errors="coerce")

for c in binary_vars:
    df[c] = to_binary(df[c])

rows = []
for var in binary_vars:
    tmp = df.groupby("phenotype")[var].agg(["sum", "count"]).reset_index()
    tmp["percent"] = 100 * tmp["sum"] / tmp["count"]
    tmp["variable"] = var
    rows.append(tmp)

bin_table = pd.concat(rows, ignore_index=True)

bin_wide = bin_table.pivot(index="variable", columns="phenotype", values=["sum", "percent"])
bin_wide.columns = [f"{stat}_{pheno}" for stat, pheno in bin_wide.columns]
bin_wide = bin_wide.reset_index()

bin_path = os.path.join(OUT_DIR, "clinical_characterization_binary.csv")
bin_wide.to_csv(bin_path, index=False)
print("Saved:", bin_path)


Saved: C:\Users\HP\OneDrive\Desktop\Phase 3\km_plots\clinical_characterization_binary.csv
