In [1]:
import os
import numpy as np
import pandas as pd

In [None]:
# Clinical columns carried forward into the label table.
selected_column_list = [
    "cases.case_id",
    "cases.submitter_id",
    "demographic.age_at_index",
    "demographic.ethnicity",
    "demographic.gender",
    "demographic.race",
    "diagnoses.primary_diagnosis",
]

# TCGA-LUAD clinical table (tab-separated). low_memory=False makes pandas infer
# each column's dtype from the whole file rather than chunk by chunk.
df_clinical = pd.read_csv("clinical/tcga-luad_clinical.tsv", sep="\t", low_memory=False)

# Drop rows whose resection/biopsy site is "Not Reported", then keep only the
# selected columns. Selecting with the explicit list (instead of .filter) raises
# a KeyError when an expected column is absent rather than silently omitting it.
site_is_reported = df_clinical["diagnoses.site_of_resection_or_biopsy"] != "Not Reported"
df_clinical = df_clinical.loc[site_is_reported, selected_column_list]
df_clinical.head()

Unnamed: 0,cases.case_id,cases.submitter_id,demographic.age_at_index,demographic.ethnicity,demographic.gender,demographic.race,diagnoses.primary_diagnosis
0,0075437e-ba1a-46be-86d6-9773209a2b5e,TCGA-62-A471,64,not hispanic or latino,male,white,"Adenocarcinoma, NOS"
1,0075437e-ba1a-46be-86d6-9773209a2b5e,TCGA-62-A471,64,not hispanic or latino,male,white,"Adenocarcinoma, NOS"
2,0075437e-ba1a-46be-86d6-9773209a2b5e,TCGA-62-A471,64,not hispanic or latino,male,white,"Adenocarcinoma, NOS"
3,0075437e-ba1a-46be-86d6-9773209a2b5e,TCGA-62-A471,64,not hispanic or latino,male,white,"Adenocarcinoma, NOS"
4,009be09b-f9f6-43b7-8f45-4a648f8123ce,TCGA-67-3773,84,not hispanic or latino,female,white,"Adenocarcinoma, NOS"


In [3]:
# Display the column labels currently present on df_clinical.
df_clinical.columns

Index(['cases.case_id', 'cases.submitter_id', 'demographic.age_at_index',
       'demographic.ethnicity', 'demographic.gender', 'demographic.race',
       'diagnoses.primary_diagnosis'],
      dtype='object')

In [4]:
# Short column names replacing the dotted "<table>.<field>" source names.
short_column_names = {
    "cases.case_id": "case_id",
    "cases.submitter_id": "submitter_id",
    "demographic.age_at_index": "age",
    "demographic.ethnicity": "ethnicity",
    "demographic.gender": "gender",
    "demographic.race": "race",
    "diagnoses.primary_diagnosis": "primary_diagnosis",
}

# Rename, then collapse rows that are identical across all selected columns.
df_clinical_select = df_clinical.rename(columns=short_column_names).drop_duplicates()
df_clinical_select.head()

Unnamed: 0,case_id,submitter_id,age,ethnicity,gender,race,primary_diagnosis
0,0075437e-ba1a-46be-86d6-9773209a2b5e,TCGA-62-A471,64,not hispanic or latino,male,white,"Adenocarcinoma, NOS"
4,009be09b-f9f6-43b7-8f45-4a648f8123ce,TCGA-67-3773,84,not hispanic or latino,female,white,"Adenocarcinoma, NOS"
5,01e9888d-b5b9-48f1-8ba6-8a89af108a04,TCGA-NJ-A7XG,49,not hispanic or latino,male,black or african american,"Adenocarcinoma, NOS"
9,028e99e9-5b9a-4954-bb6e-6d4709a3cea8,TCGA-55-6986,74,not reported,female,white,"Bronchiolo-alveolar carcinoma, non-mucinous"
11,035c0b2b-c722-443b-8962-db4ee92c7532,TCGA-86-6851,73,not hispanic or latino,female,white,Adenocarcinoma with mixed subtypes


In [5]:
# Row count, followed by the number of distinct values in each id column;
# equal numbers mean every remaining row has its own submitter_id and case_id.
print(len(df_clinical_select))
for id_column in ("submitter_id", "case_id"):
    print(len(df_clinical_select[id_column].unique()))

478
478
478


In [6]:
# Load the whitespace-delimited slide manifest.
# sep=r"\s+" is the documented replacement for the deprecated delim_whitespace=True.
df_manifest = pd.read_csv("../tcga_download/check_results/tcga-luad.txt", sep=r"\s+")

# Strip only a trailing ".svs" extension; removesuffix leaves any other
# occurrence of that substring inside the name untouched.
df_manifest["filename"] = df_manifest["filename"].str.removesuffix(".svs")
df_manifest.head()

  df_manifest = pd.read_csv("../tcga_download/check_results/tcga-luad.txt", delim_whitespace=True)


Unnamed: 0,id,filename,md5,size,state
0,f28b9a47-68b8-4549-abfd-06e3038dad7b,TCGA-38-4631-01Z-00-DX1.5e0c873a-9c4c-4e0b-bf2...,02ad15c5e1fcb45175f9b9e3cf4b7d31,230713723,released
1,a8db7877-6855-4b11-bfee-e73e429a5b47,TCGA-49-6742-01Z-00-DX2.2c6b4df0-867d-40c5-8be...,e09587214aa14c44b2d7a1f9f53b3953,1501773089,released
2,284ce9f3-c904-498e-8b33-778505364e5a,TCGA-49-6742-01Z-00-DX4.a11201e1-9eeb-40ea-9c8...,3f538029129992f007a799509fcf332c,1590526103,released
3,0edfc16d-25cc-403d-8862-38ec9c90060a,TCGA-50-6597-01Z-00-DX1.ec7fc0b2-78a1-4384-bdd...,14d5b3a89973ad3657085ee54d6147dc,673154131,released
4,6cec8838-5a03-4fca-a9b6-bfc5e4ef4fe6,TCGA-49-6742-01Z-00-DX5.74539ee6-0ac6-4663-89f...,25e1989b2109a1e55dadec41737a6f11,1264278319,released


In [7]:
# The leading "TCGA-<part>-<part>" prefix of each filename is captured as the
# submitter id; filenames that do not start with that pattern yield NaN.
submitter_id_pattern = r'^(TCGA-[A-Z0-9]+-[A-Z0-9]+)'
df_manifest["submitter_id"] = df_manifest["filename"].str.extract(
    submitter_id_pattern, expand=False
)

df_manifest.head()

Unnamed: 0,id,filename,md5,size,state,submitter_id
0,f28b9a47-68b8-4549-abfd-06e3038dad7b,TCGA-38-4631-01Z-00-DX1.5e0c873a-9c4c-4e0b-bf2...,02ad15c5e1fcb45175f9b9e3cf4b7d31,230713723,released,TCGA-38-4631
1,a8db7877-6855-4b11-bfee-e73e429a5b47,TCGA-49-6742-01Z-00-DX2.2c6b4df0-867d-40c5-8be...,e09587214aa14c44b2d7a1f9f53b3953,1501773089,released,TCGA-49-6742
2,284ce9f3-c904-498e-8b33-778505364e5a,TCGA-49-6742-01Z-00-DX4.a11201e1-9eeb-40ea-9c8...,3f538029129992f007a799509fcf332c,1590526103,released,TCGA-49-6742
3,0edfc16d-25cc-403d-8862-38ec9c90060a,TCGA-50-6597-01Z-00-DX1.ec7fc0b2-78a1-4384-bdd...,14d5b3a89973ad3657085ee54d6147dc,673154131,released,TCGA-50-6597
4,6cec8838-5a03-4fca-a9b6-bfc5e4ef4fe6,TCGA-49-6742-01Z-00-DX5.74539ee6-0ac6-4663-89f...,25e1989b2109a1e55dadec41737a6f11,1264278319,released,TCGA-49-6742


In [8]:
# Number of rows in the manifest table.
print(len(df_manifest))

541


In [9]:
# 1. Merge: attach the clinical attributes to every manifest row via 'submitter_id'.
#    how="right" keeps all rows from df_manifest, even those with no match in
#    df_clinical_select (their clinical fields become NaN).
#    validate="one_to_many" raises pandas.errors.MergeError if a submitter_id
#    occurs more than once on the clinical side, which would otherwise silently
#    duplicate manifest rows in the result.
df_merged = pd.merge(
    df_clinical_select,
    df_manifest[["submitter_id", "filename"]],
    on="submitter_id",
    how="right",
    validate="one_to_many",
)

# 2. Reorder columns: 'filename' first, all other columns in their existing order.
cols = ["filename"] + [col for col in df_merged.columns if col != "filename"]
df_merged = df_merged[cols]

# Show the first few rows of the merged DataFrame
df_merged.head()

Unnamed: 0,filename,case_id,submitter_id,age,ethnicity,gender,race,primary_diagnosis
0,TCGA-38-4631-01Z-00-DX1.5e0c873a-9c4c-4e0b-bf2...,2483621a-4db3-41ab-aa33-b9427ea8a0af,TCGA-38-4631,72,not hispanic or latino,female,white,"Adenocarcinoma, NOS"
1,TCGA-49-6742-01Z-00-DX2.2c6b4df0-867d-40c5-8be...,21fb46f9-4bbb-441c-af19-a687e9138344,TCGA-49-6742,70,not hispanic or latino,male,white,Mucinous adenocarcinoma
2,TCGA-49-6742-01Z-00-DX4.a11201e1-9eeb-40ea-9c8...,21fb46f9-4bbb-441c-af19-a687e9138344,TCGA-49-6742,70,not hispanic or latino,male,white,Mucinous adenocarcinoma
3,TCGA-50-6597-01Z-00-DX1.ec7fc0b2-78a1-4384-bdd...,0d66bf6c-eed0-4726-bd5b-3bf6d610b4e0,TCGA-50-6597,79,not hispanic or latino,female,white,"Adenocarcinoma, NOS"
4,TCGA-49-6742-01Z-00-DX5.74539ee6-0ac6-4663-89f...,21fb46f9-4bbb-441c-af19-a687e9138344,TCGA-49-6742,70,not hispanic or latino,male,white,Mucinous adenocarcinoma


In [11]:
# Total merged rows, then how many rows fall under each primary diagnosis.
diagnosis_counts = df_merged["primary_diagnosis"].value_counts()
print(len(df_merged))
print(diagnosis_counts)

541
primary_diagnosis
Adenocarcinoma, NOS                            333
Adenocarcinoma with mixed subtypes             103
Papillary adenocarcinoma, NOS                   29
Acinar cell carcinoma                           21
Mucinous adenocarcinoma                         17
Bronchiolo-alveolar carcinoma, non-mucinous     16
Solid carcinoma, NOS                             5
Bronchio-alveolar carcinoma, mucinous            5
Clear cell adenocarcinoma, NOS                   5
Bronchiolo-alveolar adenocarcinoma, NOS          3
Invasive micropapillary carcinoma                3
Signet ring cell carcinoma                       1
Name: count, dtype: int64


In [12]:
# Every row in this table receives the same constant label.
cohort_label = "LUAD"
df_merged["label"] = cohort_label
df_merged.head()

Unnamed: 0,filename,case_id,submitter_id,age,ethnicity,gender,race,primary_diagnosis,label
0,TCGA-38-4631-01Z-00-DX1.5e0c873a-9c4c-4e0b-bf2...,2483621a-4db3-41ab-aa33-b9427ea8a0af,TCGA-38-4631,72,not hispanic or latino,female,white,"Adenocarcinoma, NOS",LUAD
1,TCGA-49-6742-01Z-00-DX2.2c6b4df0-867d-40c5-8be...,21fb46f9-4bbb-441c-af19-a687e9138344,TCGA-49-6742,70,not hispanic or latino,male,white,Mucinous adenocarcinoma,LUAD
2,TCGA-49-6742-01Z-00-DX4.a11201e1-9eeb-40ea-9c8...,21fb46f9-4bbb-441c-af19-a687e9138344,TCGA-49-6742,70,not hispanic or latino,male,white,Mucinous adenocarcinoma,LUAD
3,TCGA-50-6597-01Z-00-DX1.ec7fc0b2-78a1-4384-bdd...,0d66bf6c-eed0-4726-bd5b-3bf6d610b4e0,TCGA-50-6597,79,not hispanic or latino,female,white,"Adenocarcinoma, NOS",LUAD
4,TCGA-49-6742-01Z-00-DX5.74539ee6-0ac6-4663-89f...,21fb46f9-4bbb-441c-af19-a687e9138344,TCGA-49-6742,70,not hispanic or latino,male,white,Mucinous adenocarcinoma,LUAD


In [13]:
# Rows per label value, followed by the total row count for comparison.
label_counts = df_merged["label"].value_counts()
print(label_counts)
print(len(df_merged))

label
LUAD    541
Name: count, dtype: int64
541


In [14]:
# Write the labelled table to CSV; index=False leaves the DataFrame index out of the file.
df_merged.to_csv("clinical/tcga-luad_label.csv", index=False)