In [None]:
# src/clinical/make_tcga_manifest.py
import pandas as pd
import glob
import os

raw_clinical_dir = "data/raw/clinical"
out_dir = "data/raw/tcga"
os.makedirs(out_dir, exist_ok=True)

# find all TSVs (COAD + READ)
files = glob.glob(os.path.join(raw_clinical_dir, "*.tsv"))
print("Found clinical files:", files)

# helper to load a tsv into a flat dataframe (keep nested column names like diagnoses.*)
def load_tsv(path):
    df = pd.read_csv(path, sep='\t', dtype=str)
    # ensure no duplicate column names; keep as-is
    return df

# combine all clinical-style TSVs that include 'cases.case_id' or 'cases.submitter_id'
dfs = []
for f in files:
    df = load_tsv(f)
    # standardize common identifiers if present
    if 'cases.submitter_id' in df.columns:
        df = df.rename(columns={'cases.submitter_id':'submitter_id', 'cases.case_id':'case_id'})
    dfs.append(df)

if not dfs:
    raise SystemExit("No clinical TSVs found in " + raw_clinical_dir)

# concatenate (outer join on columns; many columns will be NaN)
big = pd.concat(dfs, axis=0, ignore_index=True, sort=False)

# keep one row per case_id / submitter_id (some files may have repeated rows) — prefer non-null values
# first ensure we have case_id or submitter_id
if 'case_id' not in big.columns and 'submitter_id' not in big.columns:
    # try fallback columns
    if 'cases.case_id' in big.columns:
        big = big.rename(columns={'cases.case_id':'case_id'})
    if 'cases.submitter_id' in big.columns:
        big = big.rename(columns={'cases.submitter_id':'submitter_id'})

# prefer 'case_id' if available; else use submitter_id
id_col = 'case_id' if 'case_id' in big.columns else 'submitter_id'
if id_col not in big.columns:
    raise SystemExit("No case_id or submitter_id column found. Check TSVs.")

# collapse to one row per id: take first non-null for each column
big = big.groupby(id_col).agg(lambda s: s.dropna().iloc[0] if s.dropna().shape[0]>0 else pd.NA).reset_index()

# Helper: derive metastasis_status
def derive_m_status(row):
    # 1 = metastatic (M1); 0 = non-metastatic (M0); NA if unknown
    # try pathologic M
    for col in ['diagnoses.ajcc_pathologic_m','diagnoses.ajcc_clinical_m','diagnoses.ajcc_m_pathologic','diagnoses.ajcc_m']:
        v = row.get(col, None)
        if pd.notna(v):
            v = str(v).strip().upper()
            if v.startswith('M1') or v == '1' or v == 'M1.0' or 'M1' in v:
                return 1
            if v.startswith('M0') or v == '0' or 'M0' in v:
                return 0
    # try pathologic stage (Stage IV -> M1)
    for col in ['diagnoses.ajcc_pathologic_stage','diagnoses.ajcc_clinical_stage','diagnoses.uicc_pathologic_stage']:
        v = row.get(col, None)
        if pd.notna(v):
            vs = str(v).upper()
            if 'IV' in vs or 'STAGE 4' in vs or 'STAGE IV' in vs or '4' == vs.strip():
                return 1
            # stages I-III -> 0
            if any(x in vs for x in ['I','II','III','1','2','3']):
                # need to be careful: '3' alone could be ambiguous; but accept for now
                return 0
    # try metastasis_at_diagnosis field
    if pd.notna(row.get('diagnoses.metastasis_at_diagnosis', pd.NA)):
        v = str(row.get('diagnoses.metastasis_at_diagnosis')).lower()
        if 'yes' in v or 'present' in v or '1' in v:
            return 1
        if 'no' in v or 'absent' in v:
            return 0
    return pd.NA

# apply derive function for each row
big['metastasis_status'] = big.apply(derive_m_status, axis=1)

# create manifest columns
manifest_cols = {
    'submitter_id': big.columns[big.columns.str.contains('submitter_id', regex=False)][0] if any(big.columns.str.contains('submitter_id', regex=False)) else (id_col),
    'case_id': id_col,
    'primary_diagnosis': 'diagnoses.primary_diagnosis' if 'diagnoses.primary_diagnosis' in big.columns else ('diagnoses.primary_disease' if 'diagnoses.primary_disease' in big.columns else pd.NA),
    'gender': 'demographic.gender' if 'demographic.gender' in big.columns else ('demographic.sex' if 'demographic.sex' in big.columns else pd.NA),
    'age_at_diagnosis': 'diagnoses.age_at_diagnosis' if 'diagnoses.age_at_diagnosis' in big.columns else ('demographic.age_at_index' if 'demographic.age_at_index' in big.columns else pd.NA),
    'tumor_stage': 'diagnoses.ajcc_pathologic_stage' if 'diagnoses.ajcc_pathologic_stage' in big.columns else ('diagnoses.ajcc_clinical_stage' if 'diagnoses.ajcc_clinical_stage' in big.columns else pd.NA),
    'metastasis_status': 'metastasis_status',
    'vital_status': 'demographic.vital_status' if 'demographic.vital_status' in big.columns else ('demographic.vital_status' if 'demographic.vital_status' in big.columns else pd.NA),
    'days_to_death': 'demographic.days_to_death' if 'demographic.days_to_death' in big.columns else ('diagnoses.days_to_death' if 'diagnoses.days_to_death' in big.columns else pd.NA)
}

# build manifest df
manifest_df = pd.DataFrame()
for out_col, in_col in manifest_cols.items():
    if isinstance(in_col, str) and in_col in big.columns:
        manifest_df[out_col] = big[in_col]
    else:
        manifest_df[out_col] = pd.NA

# ensure id column present
manifest_df['case_id'] = big[id_col]

# Save files
full_csv = os.path.join(out_dir, "tcga_coad_read_clinical.csv")
manifest_csv = os.path.join(out_dir, "tcga_clinical_manifest.csv")

big.to_csv(full_csv, index=False)
manifest_df.to_csv(manifest_csv, index=False)
print("Saved:", full_csv)
print("Saved:", manifest_csv)


In [5]:
import pandas as pd
m = pd.read_csv("C:/Users/Negar/Desktop/paper_results/Myself/cr_coad_project/data/raw/tcga/tcga_clinical_manifest.csv")
print(m["metastasis_status"].value_counts(dropna=False))
m.head()

metastasis_status
0.0    456
NaN    111
1.0     66
Name: count, dtype: int64


Unnamed: 0,submitter_id,case_id,primary_diagnosis,gender,age_at_diagnosis,tumor_stage,metastasis_status,vital_status,days_to_death
0,TCGA-DC-6158,0011a67b-1ba9-4a32-a6b8-7850759a38cf,"Adenocarcinoma, NOS",male,25842,Stage I,0.0,Dead,334
1,TCGA-F4-6854,01240896-3f3f-4bf9-9799-55c87bfacf36,"Adenocarcinoma, NOS",female,28272,Stage IIA,0.0,Alive,'--
2,TCGA-DC-5337,016c9c14-4c88-49f5-a11a-dd4bc282f11e,"Adenocarcinoma, NOS",male,25202,Stage I,0.0,Alive,'--
3,TCGA-AA-3561,01ad5016-f691-4bca-82a0-910429d8d25b,"Adenocarcinoma, NOS",male,26420,Stage IIA,0.0,Alive,'--
4,TCGA-AA-A00O,01f493d4-229d-47a6-baa8-32a342c65d01,"Adenocarcinoma, NOS",female,30316,Stage IIIC,0.0,Alive,'--


In [2]:
import os, glob
print("cwd:", os.getcwd())
print("found:", glob.glob("**/tcga_clinical_manifest.csv", recursive=True))


cwd: c:\Users\Negar\Desktop\paper_results\Myself\cr_coad_project\notebooks
found: []
