In [1]:
# ==============================================
# 02_label_patients.ipynb
# Purpose: Build a unified patient-level label file for the entire dataset.
# ==============================================

import os
import pandas as pd
from pathlib import Path


In [2]:
# ------------------------------------------------------------
# Step 1: Define paths
# ------------------------------------------------------------
# Project root. Set the CR_COAD_PROJECT_ROOT environment variable to run this
# notebook from another machine or checkout; when it is unset, the original
# hard-coded location is used, so existing setups behave exactly as before.
PROJECT_ROOT = Path(
    os.environ.get(
        "CR_COAD_PROJECT_ROOT",
        r"C:\Users\Negar\Desktop\paper_results\Myself\cr_coad_project",
    )
)
# Every path below is relative, so it is resolved against this directory.
os.chdir(PROJECT_ROOT)

# Input: clinical manifest read by the load step of this notebook.
RAW_TCGA = Path("data/raw/tcga/tcga_clinical_manifest.csv")

# Output: folder (created on demand, parents included) and target CSV.
OUT_DIR = Path("data/processed/clinical")
OUT_DIR.mkdir(parents=True, exist_ok=True)
OUT_FILE = OUT_DIR / "clinical_features_with_id.csv"

In [3]:
# ------------------------------------------------------------
# Step 2: Load manifest (extracted from Step 01)
# ------------------------------------------------------------
if RAW_TCGA.exists():
    # dtype=str reads every column as text, so no value is coerced on load.
    df = pd.read_csv(RAW_TCGA, dtype=str)
    print(f"‚úÖ Loaded {len(df)} patient records from TCGA clinical manifest")
else:
    # Stop here with a pointer to the notebook that produces the manifest.
    raise FileNotFoundError(f"‚ùå {RAW_TCGA} not found. Run 01_extract_clinical.ipynb first.")

‚úÖ Loaded 633 patient records from TCGA clinical manifest


In [4]:
# ------------------------------------------------------------
# Step 3: Clean and normalize column names
# ------------------------------------------------------------
# Trim surrounding whitespace and lower-case every header so the column
# lookups below do not depend on how the manifest was capitalised.
df.columns = df.columns.str.strip().str.lower()

# Columns the rest of this notebook expects to find in the manifest.
expected_cols = [
    "case_id",
    "submitter_id",
    "primary_diagnosis",
    "gender",
    "age_at_diagnosis",
    "tumor_stage",
    "metastasis_status",
    "vital_status",
    "days_to_death",
]

# Warn (without stopping) about any expected column that is absent.
missing_cols = [name for name in expected_cols if name not in df.columns]
if len(missing_cols) > 0:
    print(f"‚ö†Ô∏è Missing expected columns: {missing_cols}")

In [5]:
# ------------------------------------------------------------
# Step 4: Create consistent patient_id
# ------------------------------------------------------------
# patient_id is the submitter_id (e.g., TCGA-XX-XXXX format) wherever one is
# present; rows with a missing submitter_id take their case_id instead.
df["patient_id"] = df["submitter_id"].where(df["submitter_id"].notna(), df["case_id"])

# Count rows whose patient_id already appeared earlier in the frame.
dupes = df["patient_id"].duplicated(keep="first").sum()
if dupes > 0:
    # Keep only the first row seen for each patient_id.
    print(f"‚ö†Ô∏è Found {dupes} duplicate patient_ids; keeping first occurrence only.")
    df = df.drop_duplicates(subset=["patient_id"], keep="first")

In [6]:
# ------------------------------------------------------------
# Step 5: Clean metastasis_status values
# ------------------------------------------------------------
# Translate textual metastasis flags into "1"/"0" and mark missing values as NA.
# Values are stringified, trimmed and lower-cased first so that variants such as
# "TRUE", "Yes" or " no " are recognised instead of silently ending up unlabeled.
# The numeric conversion itself happens in the next step (safe_num).
df["metastasis_status"] = (
    df["metastasis_status"]
    .astype(str)
    .str.strip()
    .str.lower()
    .replace(
        {
            "true": "1",
            "false": "0",
            "yes": "1",
            "no": "0",
            "metastatic": "1",
            "non-metastatic": "0",
        }
    )
    # astype(str) renders missing values as "nan" (NaN) or "<NA>" (pd.NA);
    # turn both back into a real missing marker.
    .replace(["nan", "<na>"], pd.NA)
)

# Convert numerics safely
def safe_num(x):
    """Convert ``x`` to an int via float, or return ``pd.NA`` if it cannot be converted.

    Parameters
    ----------
    x : object
        Value to convert, e.g. ``"1"``, ``"0.0"``, ``1`` or a missing marker.

    Returns
    -------
    int or pd.NA
        ``int(float(x))`` when the conversion succeeds; ``pd.NA`` when ``x`` is
        not numeric (``ValueError``), not a convertible type such as ``None`` or
        ``pd.NA`` (``TypeError``), or infinite (``OverflowError``).
    """
    try:
        return int(float(x))
    # Only conversion failures are treated as "missing"; KeyboardInterrupt,
    # SystemExit and unrelated errors are allowed to propagate.
    except (TypeError, ValueError, OverflowError):
        return pd.NA

# Run every cell of the column through safe_num; anything it cannot convert
# comes back as pd.NA.
df["metastasis_status"] = df["metastasis_status"].map(safe_num)

In [7]:
# ------------------------------------------------------------
# Step 6: Diagnostics
# ------------------------------------------------------------
# Headline counts for the cleaned label column.
label_col = df["metastasis_status"]
total = df.shape[0]
with_labels = label_col.notna().sum()
metastatic = label_col.eq(1).sum()
non_metastatic = label_col.eq(0).sum()

# One print call, one line per figure.
print(
    f"üìä Total patients: {total}",
    f"üìä Patients with metastasis labels: {with_labels}",
    f"   ‚û§ Metastatic: {metastatic}",
    f"   ‚û§ Non-metastatic: {non_metastatic}",
    sep="\n",
)

üìä Total patients: 633
üìä Patients with metastasis labels: 522
   ‚û§ Metastatic: 66
   ‚û§ Non-metastatic: 456


In [8]:
# ------------------------------------------------------------
# Step 7: Save clean output (overwrite existing)
# ------------------------------------------------------------
# Delete any earlier export so the file on disk comes only from this run.
if OUT_FILE.exists():
    os.remove(OUT_FILE)

# Write the labeled table without the DataFrame index column.
df.to_csv(OUT_FILE, index=False)
print(f"‚úÖ Saved clean labeled file: {OUT_FILE}")

‚úÖ Saved clean labeled file: data\processed\clinical\clinical_features_with_id.csv


In [9]:
# ------------------------------------------------------------
# Step 8: Verify key columns exist
# ------------------------------------------------------------
# Columns that must be present in the in-memory frame.
expected_minimum = ["patient_id", "case_id", "submitter_id", "metastasis_status"]

# Collect absent columns in the same order as listed above.
missing = [name for name in expected_minimum if name not in df.columns]
if not missing:
    print("‚úÖ All required key columns present: patient_id, case_id, submitter_id, metastasis_status")
else:
    print(f"‚ö†Ô∏è Missing key columns after export: {missing}")



‚úÖ All required key columns present: patient_id, case_id, submitter_id, metastasis_status


In [10]:
# ------------------------------------------------------------
# Step 9: Quick sanity preview
# ------------------------------------------------------------
# Show a heading followed by the first ten rows as plain text.
print("\nSample rows:", df.head(10), sep="\n")


Sample rows:
                                case_id  submitter_id  \
0  0011a67b-1ba9-4a32-a6b8-7850759a38cf  TCGA-DC-6158   
1  01240896-3f3f-4bf9-9799-55c87bfacf36  TCGA-F4-6854   
2  016c9c14-4c88-49f5-a11a-dd4bc282f11e  TCGA-DC-5337   
3  01ad5016-f691-4bca-82a0-910429d8d25b  TCGA-AA-3561   
4  01f493d4-229d-47a6-baa8-32a342c65d01  TCGA-AA-A00O   
5  022f39e9-57ee-4b2b-8b3a-8929e3d69a37  TCGA-DM-A28F   
6  02f9668c-71e6-485f-88b1-b37dc8bdd2ab  TCGA-AA-3866   
7  03a9dd9d-62ae-4acd-9272-389274858f3d  TCGA-AF-3913   
8  03efbc94-a43d-4db0-9377-e397348430a6  TCGA-AA-3524   
9  04178a4f-14b0-45ba-aa3e-f21638e23765  TCGA-AG-3578   

         primary_diagnosis  gender age_at_diagnosis tumor_stage  \
0      Adenocarcinoma, NOS    male            25842     Stage I   
1      Adenocarcinoma, NOS  female            28272   Stage IIA   
2      Adenocarcinoma, NOS    male            25202     Stage I   
3      Adenocarcinoma, NOS    male            26420   Stage IIA   
4      Adenocarcinoma, 