In [1]:
import os
import numpy as np
import pandas as pd

In [2]:
# Clinical fields carried forward; every other column of the TSV is discarded.
selected_column_list = [
    "cases.case_id",
    "cases.submitter_id",
    "demographic.age_at_index",
    "demographic.ethnicity",
    "demographic.gender",
    "demographic.race",
    "diagnoses.primary_diagnosis",
]

# TCGA-LUSC clinical table (tab-separated).
# low_memory=False parses the file in one pass so column dtypes are inferred consistently.
df_clinical = pd.read_csv("clinical/tcga-lusc_clinical.tsv", sep="\t", low_memory=False)

# Drop rows whose resection/biopsy site is "Not Reported" and keep only the selected columns.
# Selecting through .loc raises KeyError when an expected column is absent, whereas
# filter(items=...) would silently omit it and the problem would surface later.
df_clinical = df_clinical.loc[
    df_clinical["diagnoses.site_of_resection_or_biopsy"] != "Not Reported",
    selected_column_list,
]
df_clinical.head()

Unnamed: 0,cases.case_id,cases.submitter_id,demographic.age_at_index,demographic.ethnicity,demographic.gender,demographic.race,diagnoses.primary_diagnosis
0,005669e5-1a31-45fb-ae97-9d450e74e7cb,TCGA-77-A5GF,70,not reported,male,not reported,"Squamous cell carcinoma, NOS"
1,005669e5-1a31-45fb-ae97-9d450e74e7cb,TCGA-77-A5GF,70,not reported,male,not reported,"Squamous cell carcinoma, NOS"
5,00fd9306-4a68-49ab-a768-e5fed126a765,TCGA-NC-A5HJ,59,not reported,male,white,"Squamous cell carcinoma, NOS"
6,00fd9306-4a68-49ab-a768-e5fed126a765,TCGA-NC-A5HJ,59,not reported,male,white,"Squamous cell carcinoma, NOS"
7,00fd9306-4a68-49ab-a768-e5fed126a765,TCGA-NC-A5HJ,59,not reported,male,white,"Squamous cell carcinoma, NOS"


In [3]:
# Show the column labels of the filtered clinical table to confirm only the selected fields remain.
df_clinical.columns

Index(['cases.case_id', 'cases.submitter_id', 'demographic.age_at_index',
       'demographic.ethnicity', 'demographic.gender', 'demographic.race',
       'diagnoses.primary_diagnosis'],
      dtype='object')

In [4]:
# Shorten the dotted GDC column names, then collapse rows that are identical
# across every remaining column (the clinical table repeats the same case on several rows).
df_clinical_select = (
    df_clinical
    .rename(
        columns={
            "cases.case_id": "case_id",
            "cases.submitter_id": "submitter_id",
            "demographic.age_at_index": "age",
            "demographic.ethnicity": "ethnicity",
            "demographic.gender": "gender",
            "demographic.race": "race",
            "diagnoses.primary_diagnosis": "primary_diagnosis",
        }
    )
    .drop_duplicates()
)
df_clinical_select.head()

Unnamed: 0,case_id,submitter_id,age,ethnicity,gender,race,primary_diagnosis
0,005669e5-1a31-45fb-ae97-9d450e74e7cb,TCGA-77-A5GF,70,not reported,male,not reported,"Squamous cell carcinoma, NOS"
5,00fd9306-4a68-49ab-a768-e5fed126a765,TCGA-NC-A5HJ,59,not reported,male,white,"Squamous cell carcinoma, NOS"
10,01417822-b608-4934-8fe0-594315212be5,TCGA-85-7843,50,not hispanic or latino,male,white,"Squamous cell carcinoma, NOS"
13,02ba8600-9d3d-489d-9f0b-f1c59085ecd6,TCGA-34-8455,67,not hispanic or latino,male,white,"Squamous cell carcinoma, NOS"
15,02eb85cc-a597-4eaa-9614-c94c0f035929,TCGA-39-5037,65,not hispanic or latino,male,white,"Squamous cell carcinoma, NOS"


In [5]:
# Row count followed by the number of distinct IDs; three equal numbers mean one row per case.
# dropna=False counts a missing ID as its own value, matching len(Series.unique()).
print(df_clinical_select.shape[0])
print(df_clinical_select["submitter_id"].nunique(dropna=False))
print(df_clinical_select["case_id"].nunique(dropna=False))

478
478
478


In [6]:
# Manifest file with whitespace-separated columns (id, filename, md5, size, state).
# sep=r"\s+" is the supported spelling of the deprecated delim_whitespace=True,
# which emits a FutureWarning on current pandas.
df_manifest = pd.read_csv("../tcga_download/check_results/tcga-lusc.txt", sep=r"\s+")

# Strip only a trailing ".svs" extension; a plain substring replace would also
# remove ".svs" if it ever occurred in the middle of a file name.
df_manifest["filename"] = df_manifest["filename"].str.removesuffix(".svs")
df_manifest.head()

  df_manifest = pd.read_csv("../tcga_download/check_results/tcga-lusc.txt", delim_whitespace=True)


Unnamed: 0,id,filename,md5,size,state
0,c863b3b5-ebd2-43b6-99d6-d5741e872ff3,TCGA-56-7221-01Z-00-DX1.f897f1ee-2796-4183-931...,b7210021bb05b94574a60a357e11750c,89189949,released
1,420eca55-8832-46ec-bd6f-28add7b8b6b7,TCGA-21-A5DI-01Z-00-DX1.E9123261-ADE7-468C-9E9...,a29777226bb88b63331cf438699acff3,554422463,released
2,01a0a249-0ff2-492f-bab7-20121922906b,TCGA-77-8138-01Z-00-DX1.fac6dcf9-7367-4345-a76...,ef9300795780514be4493bef3bf9e01c,1425469791,released
3,e0bdb382-5337-4611-93b3-cfb1540016ba,TCGA-43-7657-01Z-00-DX1.d8a5d257-c5ca-4192-b6a...,b81545bfb79fa789e04c2a8842600359,177973501,released
4,b6d379dd-6213-48c3-b96c-577aca5520cb,TCGA-60-2695-01Z-00-DX1.4cc6c566-d60d-4ff9-9ea...,95c64d3940b5177638c21ac61c52ec9a,1073234257,released


In [7]:
# The submitter ID is the leading "TCGA-<field>-<field>" part of the slide name
# (e.g. "TCGA-56-7221"); names that do not start with that pattern yield NaN.
df_manifest["submitter_id"] = df_manifest["filename"].str.extract(
    pat=r'^(TCGA-[A-Z0-9]+-[A-Z0-9]+)',
    expand=False,  # one capture group -> return a Series instead of a one-column frame
)

df_manifest.head()

Unnamed: 0,id,filename,md5,size,state,submitter_id
0,c863b3b5-ebd2-43b6-99d6-d5741e872ff3,TCGA-56-7221-01Z-00-DX1.f897f1ee-2796-4183-931...,b7210021bb05b94574a60a357e11750c,89189949,released,TCGA-56-7221
1,420eca55-8832-46ec-bd6f-28add7b8b6b7,TCGA-21-A5DI-01Z-00-DX1.E9123261-ADE7-468C-9E9...,a29777226bb88b63331cf438699acff3,554422463,released,TCGA-21-A5DI
2,01a0a249-0ff2-492f-bab7-20121922906b,TCGA-77-8138-01Z-00-DX1.fac6dcf9-7367-4345-a76...,ef9300795780514be4493bef3bf9e01c,1425469791,released,TCGA-77-8138
3,e0bdb382-5337-4611-93b3-cfb1540016ba,TCGA-43-7657-01Z-00-DX1.d8a5d257-c5ca-4192-b6a...,b81545bfb79fa789e04c2a8842600359,177973501,released,TCGA-43-7657
4,b6d379dd-6213-48c3-b96c-577aca5520cb,TCGA-60-2695-01Z-00-DX1.4cc6c566-d60d-4ff9-9ea...,95c64d3940b5177638c21ac61c52ec9a,1073234257,released,TCGA-60-2695


In [8]:
# Number of slide entries in the manifest
print(df_manifest.shape[0])

512


In [9]:
# 1. Merge: attach the clinical fields to every slide listed in df_manifest, keyed on 'submitter_id'.
#    how="right" keeps all manifest rows; a slide without a clinical match gets NaN in the clinical columns.
#    validate="one_to_many" raises pandas.errors.MergeError if a submitter_id occurs more than once in
#    df_clinical_select, which would otherwise silently duplicate slide rows.
df_merged = pd.merge(
    df_clinical_select,
    df_manifest[["submitter_id", "filename"]],
    on="submitter_id",
    how="right",
    validate="one_to_many",
)

# 2. Reorder columns: 'filename' first, all other columns in their existing order
cols = ["filename", *df_merged.columns.drop("filename")]
df_merged = df_merged[cols]

# Show the first few rows of the merged DataFrame
df_merged.head()

Unnamed: 0,filename,case_id,submitter_id,age,ethnicity,gender,race,primary_diagnosis
0,TCGA-56-7221-01Z-00-DX1.f897f1ee-2796-4183-931...,1dba835e-cbe9-41e0-883d-50bf86bc7821,TCGA-56-7221,79,not reported,male,white,"Squamous cell carcinoma, NOS"
1,TCGA-21-A5DI-01Z-00-DX1.E9123261-ADE7-468C-9E9...,a7cd7b0f-ab76-4f3f-a2d5-5ec04fe6c4f3,TCGA-21-A5DI,77,not hispanic or latino,male,white,"Squamous cell carcinoma, NOS"
2,TCGA-77-8138-01Z-00-DX1.fac6dcf9-7367-4345-a76...,283af87a-f962-47ab-8a7b-4fcd7e45cdf6,TCGA-77-8138,74,not reported,male,not reported,"Squamous cell carcinoma, NOS"
3,TCGA-43-7657-01Z-00-DX1.d8a5d257-c5ca-4192-b6a...,dbece124-c042-4adc-8136-90e7940ee6ad,TCGA-43-7657,68,not hispanic or latino,female,white,"Squamous cell carcinoma, NOS"
4,TCGA-60-2695-01Z-00-DX1.4cc6c566-d60d-4ff9-9ea...,acae4e48-57de-40c5-9ae5-4386144ebaea,TCGA-60-2695,74,not hispanic or latino,female,white,Basaloid squamous cell carcinoma


In [10]:
# Slide count after the merge, then the per-diagnosis breakdown.
# value_counts() skips NaN by default, so counts that sum to less than the
# row total would indicate slides without matching clinical data.
print(df_merged.shape[0])
print(df_merged.loc[:, "primary_diagnosis"].value_counts())

512
primary_diagnosis
Squamous cell carcinoma, NOS                                 479
Basaloid squamous cell carcinoma                              13
Squamous cell carcinoma, keratinizing, NOS                    13
Papillary squamous cell carcinoma                              4
Squamous cell carcinoma, large cell, nonkeratinizing, NOS      3
Name: count, dtype: int64


In [11]:
# Every row receives the same constant class label.
df_merged = df_merged.assign(label="LUSC")
df_merged.head()

Unnamed: 0,filename,case_id,submitter_id,age,ethnicity,gender,race,primary_diagnosis,label
0,TCGA-56-7221-01Z-00-DX1.f897f1ee-2796-4183-931...,1dba835e-cbe9-41e0-883d-50bf86bc7821,TCGA-56-7221,79,not reported,male,white,"Squamous cell carcinoma, NOS",LUSC
1,TCGA-21-A5DI-01Z-00-DX1.E9123261-ADE7-468C-9E9...,a7cd7b0f-ab76-4f3f-a2d5-5ec04fe6c4f3,TCGA-21-A5DI,77,not hispanic or latino,male,white,"Squamous cell carcinoma, NOS",LUSC
2,TCGA-77-8138-01Z-00-DX1.fac6dcf9-7367-4345-a76...,283af87a-f962-47ab-8a7b-4fcd7e45cdf6,TCGA-77-8138,74,not reported,male,not reported,"Squamous cell carcinoma, NOS",LUSC
3,TCGA-43-7657-01Z-00-DX1.d8a5d257-c5ca-4192-b6a...,dbece124-c042-4adc-8136-90e7940ee6ad,TCGA-43-7657,68,not hispanic or latino,female,white,"Squamous cell carcinoma, NOS",LUSC
4,TCGA-60-2695-01Z-00-DX1.4cc6c566-d60d-4ff9-9ea...,acae4e48-57de-40c5-9ae5-4386144ebaea,TCGA-60-2695,74,not hispanic or latino,female,white,Basaloid squamous cell carcinoma,LUSC


In [12]:
# Sanity check: label distribution, then the total row count it should add up to
print(df_merged.loc[:, "label"].value_counts())
print(df_merged.shape[0])

label
LUSC    512
Name: count, dtype: int64
512


In [13]:
# Write the labelled slide table to disk; index=False keeps the positional row index out of the CSV.
df_merged.to_csv(path_or_buf="clinical/tcga-lusc_label.csv", index=False)