In [8]:
import pandas as pd

# Load all three model matrices
death_df = pd.read_csv(r"C:\Users\HP\OneDrive\Desktop\Phase 1\Clean Data\death_model_matrix_imputed_v1.csv")
hosp_df = pd.read_csv(r"C:\Users\HP\OneDrive\Desktop\Phase 1\Clean Data\hospitalization_model_matrix_imputed_v1.csv")
adr_df = pd.read_csv(r"C:\Users\HP\OneDrive\Desktop\Phase 1\Clean Data\severe_adr_model_matrix_imputed_v1.csv")

# 1) Start from death_df as the canonical analytic dataset
integrated = death_df.copy()

# 2) Define ONLY the extra columns you want from each additional matrix
#    (everything else will come from death_df to avoid triplication)

hosp_outcome_cols = [
    "hospitalization_flag",
]

adr_outcome_cols = [
    "severe_adr_flag",
]

# 3) Filter these lists to:
#    - keep only columns that actually exist in the respective df
#    - avoid any column names that are already present in `integrated`

hosp_outcome_cols = [
    c for c in hosp_outcome_cols
    if c in hosp_df.columns and c not in integrated.columns
]

adr_outcome_cols = [
    c for c in adr_outcome_cols
    if c in adr_df.columns and c not in integrated.columns
]

# 4) Merge ONLY patient_id + these new outcome columns

if hosp_outcome_cols:
    integrated = integrated.merge(
        hosp_df[["patient_id"] + hosp_outcome_cols],
        on="patient_id",
        how="left"
    )

if adr_outcome_cols:
    integrated = integrated.merge(
        adr_df[["patient_id"] + adr_outcome_cols],
        on="patient_id",
        how="left"
    )

# 5) Final sanity check for any duplicated columns
dup_cols = integrated.columns[integrated.columns.duplicated()].tolist()
print("Duplicated columns:", dup_cols)

# If this prints an empty list, you are safe.
# If anything shows up, you can decide which version to keep.
# But with the filters above it should normally be [].

print(integrated.shape)
integrated.to_csv("phase3_integrated_base.csv", index=False)


Duplicated columns: []
(406, 77)


In [None]:
# Example if you ever see suffixed columns
cols_to_drop = [c for c in integrated.columns if c.endswith("_y")]
integrated = integrated.drop(columns=cols_to_drop)

# And optionally rename *_x columns back to base names:
integrated.columns = [
    c[:-2] if c.endswith("_x") else c
    for c in integrated.columns
]

In [None]:
import pandas as pd

# Death univariate (with effect size)
uni_death = pd.read_excel(r"C:/Users\HP/OneDrive/Desktop/Results/binary_models/univariate_screening_death_death_outcome_with_effect_size.xlsx")

# Normalise column names
uni_death_norm = uni_death.rename(columns={
    "Test_Used": "Test Used",
    "p_value": "P-Value",
    "Effect_Size": "Effect Size",
    "p_value_fdr_bh": "Corrected P-Value"
})
# Make sure Outcome is consistently labelled
uni_death_norm["Outcome"] = "Death"

# Hospitalization
uni_hosp = pd.read_excel("C:/Users/HP/OneDrive/Desktop/Results/binary_models/univariate_screening_hospitalization_with_effects.xlsx")
uni_hosp_norm = uni_hosp.copy()  # already has Outcome / Feature / etc.
uni_hosp_norm["Outcome"] = "Hospitalization"

# Severe ADR
uni_sev = pd.read_excel("C:/Users/HP/OneDrive/Desktop/Results/binary_models/univariate_screening_severeADR_with_effects.xlsx")
uni_sev_norm = uni_sev.copy()
uni_sev_norm["Outcome"] = "Severe_ADR"

# Stack them into one long table
uni_all = pd.concat(
    [uni_death_norm, uni_hosp_norm, uni_sev_norm],
    ignore_index=True
)

# Keep only the columns we need
uni_all = uni_all[["Outcome", "Feature", "Test Used", "P-Value", "Effect Size", "Corrected P-Value"]]

uni_all.to_csv("phase3_univariate_all_outcomes.csv", index=False)


In [4]:
# Death: use lambda_ (time scaling) coefficients as the main effect
aft_coef = pd.read_excel("C:/Users/HP/OneDrive/Desktop/Results/Death Outcome Models/weibull_aft_death_model_summary_covset.xlsx")
coef_death = aft_coef[aft_coef["param"] == "lambda_"].copy()

coef_death_norm = coef_death.rename(columns={
    "covariate": "feature",
    "Time_Ratio": "OR"   
})
coef_death_norm["Outcome"] = "Death"

# Hospitalization logistic model
coef_hosp = pd.read_excel("C:/Users/HP/OneDrive/Desktop/Results/binary_models/logreg_multivariable_Hospitalization_coefficients.xlsx")
coef_hosp_norm = coef_hosp.copy()
coef_hosp_norm["Outcome"] = "Hospitalization"

# Severe ADR logistic model
coef_sev = pd.read_excel("C:/Users/HP/OneDrive/Desktop/Results/binary_models/logreg_multivariable_Severe_ADR_coefficients.xlsx")
coef_sev_norm = coef_sev.copy()
coef_sev_norm["Outcome"] = "Severe_ADR"

# Align column names and stack
coef_all = pd.concat([
    coef_death_norm[["feature", "coef", "OR", "Outcome"]],
    coef_hosp_norm[["feature", "coef", "OR", "Outcome"]],
    coef_sev_norm[["feature", "coef", "OR", "Outcome"]],
], ignore_index=True)

coef_all.to_csv("phase3_coefficients_all_outcomes.csv", index=False)


FileNotFoundError: [Errno 2] No such file or directory: 'C:/Users/HP/OneDrive/Desktop/Results/binary_models/logreg_multivariable_Hospitalization_coefficients.xlsx'

In [None]:
ri_death = pd.read_excel("C:/Users/HP/OneDrive/Desktop/Results/survival_models/RI_Significance_Table_robust_death.xlsx")
# We will mostly need: variable, Combined_RI, Mean_log_effect
ri_death_small = ri_death[["variable", "Combined_RI", "Mean_log_effect"]].copy()
ri_death_small.to_csv("phase3_ri_death_simple.csv", index=False)


In [5]:
import numpy as np
import pandas as pd

# 1. Load integrated base dataset
integrated = pd.read_csv("C:/Users/HP/OneDrive/Desktop/phase3_phenotypes/phase3_integrated_base.csv")

# 2. Define which columns are *not* candidate features
non_features = {
    "patient_id",
    "death_outcome",
    "survival_days",
    "hospitalization_flag",
    "severe_adr_flag",
}

candidate_features = [c for c in integrated.columns if c not in non_features]

# 3. Load normalised univariate and coefficient tables, and RI
uni_all = pd.read_csv("C:/Users/HP/OneDrive/Desktop/phase3_phenotypes/phase3_univariate_all_outcomes.csv")
coef_all = pd.read_csv("C:/Users/HP/OneDrive/Desktop/phase3_phenotypes/phase3_coefficients_all_outcomes.csv")
ri_death_small = pd.read_csv("C:/Users/HP/OneDrive/Desktop/phase3_phenotypes/phase3_ri_death_simple.csv")

# 4. Build an empty meta table over candidate features
meta = pd.DataFrame({"feature": candidate_features})

# Example: link RI by exact match between feature name and 'variable'
meta = meta.merge(
    ri_death_small.rename(columns={"variable": "feature"}),
    on="feature",
    how="left"
)

# 5. Aggregate univariate stats per feature across outcomes
uni_agg = (
    uni_all.groupby("Feature")
           .agg(
               min_p=("Corrected P-Value", "min"),
               max_abs_effect=("Effect Size", lambda x: np.nanmax(np.abs(x)))
           )
           .reset_index()
           .rename(columns={"Feature": "feature"})
)

meta = meta.merge(uni_agg, on="feature", how="left")

# 6. Aggregate coefficients per feature across outcomes
coef_agg = (
    coef_all.groupby("feature")
            .agg(
                max_abs_coef=("coef", lambda x: np.nanmax(np.abs(x))),
                max_or=("OR", "max")
            )
            .reset_index()
)

meta = meta.merge(coef_agg, on="feature", how="left")

# 7. Normalise each metric to [0, 1] for scoring
def to_0_1(series):
    s = series.copy()
    s = (s - s.min()) / (s.max() - s.min()) if s.notna().any() and s.min() != s.max() else s * 0.0
    return s

meta["score_RI"] = to_0_1(meta["Combined_RI"])
meta["score_effect"] = to_0_1(meta["max_abs_effect"])
meta["score_coef"] = to_0_1(meta["max_abs_coef"])

# For p-values, lower is better, so invert
meta["score_p"] = to_0_1(-np.log10(meta["min_p"]))

# 8. Combine scores (you can adjust weights to match your old notebook)
meta["feature_score"] = (
    0.35 * meta["score_RI"] +
    0.25 * meta["score_effect"] +
    0.25 * meta["score_coef"] +
    0.15 * meta["score_p"]
)

# 9. Sort and inspect top features
meta_sorted = meta.sort_values("feature_score", ascending=False)
meta_sorted.to_csv("phase3_feature_meta_scores.csv", index=False)


In [6]:
TOP_N = 45
always_include = ["age_group", "FI_LAB"]  # expand this with your must-have variables

top_features = meta_sorted["feature"].head(TOP_N).tolist()
for col in always_include:
    if col in candidate_features and col not in top_features:
        top_features.append(col)

len(top_features), top_features[:10]

(45,
 ['surgical_intervention',
  'end_reason_progression_any_line',
  'genotipo_DPYD_type',
  'end_due_to_progression',
  'adr_description',
  'received_targeted_therapy',
  'dyslipidemia',
  'cardiovascular_disorders',
  'cci_score',
  'total_unique_active_drugs'])

In [7]:
phenotype_cols = ["patient_id", "death_outcome", "survival_days",
                  "hospitalization_flag", "severe_adr_flag"] + top_features

phenotype_df = integrated[phenotype_cols].copy()
phenotype_df.to_csv("phase3_phenotyping_dataset.csv", index=False)

In [9]:
import pandas as pd
import numpy as np
import os
import re

from sklearn.preprocessing import StandardScaler

# Paths (edit as needed)
BASE_DIR = r"C:\Users\HP\OneDrive\Desktop\Phase 3"
os.makedirs(BASE_DIR, exist_ok=True)

death_df = pd.read_csv(r"C:\Users\HP\OneDrive\Desktop\Phase 1\Clean Data\death_model_matrix_imputed_v1.csv")
hosp_df  = pd.read_csv(r"C:\Users\HP\OneDrive\Desktop\Phase 1\Clean Data\hospitalization_model_matrix_imputed_v1.csv")
adr_df   = pd.read_csv(r"C:\Users\HP\OneDrive\Desktop\Phase 1\Clean Data\severe_adr_model_matrix_imputed_v1.csv")

cox_coef = pd.read_excel(r"C:\Users\HP\OneDrive\Desktop\Phase 2\Death Outcome Model\survival_models\coxph_death_model_summary_penalized.xlsx")
hosp_coef = pd.read_excel(r"C:\Users\HP\OneDrive\Desktop\Phase 2\Binary Models\logreg_multivariable_Hospitalization_coefficients.xlsx")
adr_coef  = pd.read_excel(r"C:\Users\HP\OneDrive\Desktop\Phase 2\Binary Models\logreg_multivariable_Severe_ADR_coefficients.xlsx")


In [10]:
def feature_to_series(df: pd.DataFrame, feat: str):
    feat = str(feat)

    # direct numeric/binary column
    if feat in df.columns:
        return pd.to_numeric(df[feat], errors="coerce").fillna(0)

    # Cox-style dummy: var=value
    if "=" in feat:
        base, level = feat.split("=", 1)
        base = base.strip()
        level = level.strip()
        if base in df.columns:
            return (df[base].astype(str).str.strip() == level).astype(int)

    # Logistic-style dummy: var_level (but var itself may contain underscores)
    candidates = [c for c in df.columns if isinstance(c, str) and feat.startswith(c + "_")]
    if candidates:
        base = max(candidates, key=len)  # longest matching prefix
        level = feat[len(base) + 1:].strip()
        return (df[base].astype(str).str.strip() == level).astype(int)

    return None


In [11]:
def compute_score(df, feat_col, coef_col, id_col="patient_id"):
    score = np.zeros(len(df), dtype=float)
    missing = []

    for feat, coefv in zip(feat_col.astype(str), coef_col.astype(float)):
        s = feature_to_series(df, feat)
        if s is None:
            missing.append(feat)
            continue
        score += coefv * s.values

    out = pd.DataFrame({id_col: df[id_col].values, "score": score})
    return out, missing

death_score, miss_death = compute_score(death_df, cox_coef["covariate"], cox_coef["coef"])
hosp_score,  miss_hosp  = compute_score(hosp_df, hosp_coef["feature"], hosp_coef["coef"])
adr_score,   miss_adr   = compute_score(adr_df,  adr_coef["feature"],  adr_coef["coef"])

print("Missing death features:", len(miss_death))
print("Missing hosp features:", len(miss_hosp))
print("Missing adr features:", len(miss_adr))


Missing death features: 0
Missing hosp features: 0
Missing adr features: 0


In [12]:
# Pick baseline columns that help interpretation later (safe baseline anchors)
base_cols = ["patient_id", "age", "age_group", "gender", "tumor_type", "cci_score"]

phase3_base = death_df[base_cols].copy()

phase3_base = phase3_base.merge(death_score, on="patient_id", how="left").rename(columns={"score":"risk_death"})
phase3_base = phase3_base.merge(hosp_score,  on="patient_id", how="left").rename(columns={"score":"risk_hosp"})
phase3_base = phase3_base.merge(adr_score,   on="patient_id", how="left").rename(columns={"score":"risk_adr"})


In [13]:
scaler = StandardScaler()
phase3_base[["z_risk_death","z_risk_hosp","z_risk_adr"]] = scaler.fit_transform(
    phase3_base[["risk_death","risk_hosp","risk_adr"]]
)

phase3_base.to_csv(os.path.join(BASE_DIR, "phase3_integrated_base_clean.csv"), index=False)
print("Saved:", os.path.join(BASE_DIR, "phase3_integrated_base_clean.csv"))


Saved: C:\Users\HP\OneDrive\Desktop\Phase 3\phase3_integrated_base_clean.csv


In [14]:
cluster_features = ["z_risk_death","z_risk_hosp","z_risk_adr"]
X_cluster = phase3_base[["patient_id"] + cluster_features].copy()
X_cluster.to_csv(os.path.join(BASE_DIR, "phase3_clustering_matrix.csv"), index=False)

print("Saved:", os.path.join(BASE_DIR, "phase3_clustering_matrix.csv"))


Saved: C:\Users\HP\OneDrive\Desktop\Phase 3\phase3_clustering_matrix.csv


In [15]:
LEAK_PAT = re.compile(r"(outcome|survival|death_outcome|hospitalization_flag|severe_adr_flag|adr_n_|grado|count|event|time_)", re.I)

bad_cols = [c for c in phase3_base.columns if LEAK_PAT.search(c)]
print("Potential leakage columns in phase3_base:", bad_cols)


Potential leakage columns in phase3_base: []


In [16]:
eval_df = death_df[["patient_id","death_outcome","survival_days"]].copy() \
    .merge(hosp_df[["patient_id","hospitalization_flag"]], on="patient_id", how="left") \
    .merge(adr_df[["patient_id","severe_adr_flag"]], on="patient_id", how="left")

eval_df.to_csv(os.path.join(BASE_DIR, "phase3_outcomes_for_validation_only.csv"), index=False)
print("Saved:", os.path.join(BASE_DIR, "phase3_outcomes_for_validation_only.csv"))


Saved: C:\Users\HP\OneDrive\Desktop\Phase 3\phase3_outcomes_for_validation_only.csv
