In [1]:
import os
import numpy as np
import pandas as pd

In [2]:
# Clinical columns carried forward into the label table.
selected_column_list = [
    "cases.case_id",
    "cases.submitter_id",
    "demographic.age_at_index",
    "demographic.ethnicity",
    "demographic.gender",
    "demographic.race",
    "diagnoses.primary_diagnosis",
]

# TCGA-BRCA as example: read the clinical TSV, discard rows whose
# resection/biopsy site is "Not Reported", then keep only the selected columns
# (`.filter(items=...)` keeps the listed labels that are present in the frame).
df_clinical = (
    pd.read_csv("clinical/tcga-brca_clinical.tsv", sep="\t", low_memory=False)
    .loc[lambda frame: frame["diagnoses.site_of_resection_or_biopsy"] != "Not Reported"]
    .filter(items=selected_column_list)
)
df_clinical.head()

Unnamed: 0,cases.case_id,cases.submitter_id,demographic.age_at_index,demographic.ethnicity,demographic.gender,demographic.race,diagnoses.primary_diagnosis
0,001cef41-ff86-4d3f-a140-a647ac4b10a1,TCGA-E2-A1IU,60,not hispanic or latino,female,white,"Infiltrating duct carcinoma, NOS"
1,001cef41-ff86-4d3f-a140-a647ac4b10a1,TCGA-E2-A1IU,60,not hispanic or latino,female,white,"Infiltrating duct carcinoma, NOS"
2,001cef41-ff86-4d3f-a140-a647ac4b10a1,TCGA-E2-A1IU,60,not hispanic or latino,female,white,"Infiltrating duct carcinoma, NOS"
3,0045349c-69d9-4306-a403-c9c1fa836644,TCGA-A1-A0SB,70,not hispanic or latino,female,white,Adenoid cystic carcinoma
4,00807dae-9f4a-4fd1-aac2-82eb11bf2afb,TCGA-A2-A04W,50,not hispanic or latino,female,white,Apocrine adenocarcinoma


In [3]:
# Show the column labels that remain in df_clinical after the column selection.
df_clinical.columns

Index(['cases.case_id', 'cases.submitter_id', 'demographic.age_at_index',
       'demographic.ethnicity', 'demographic.gender', 'demographic.race',
       'diagnoses.primary_diagnosis'],
      dtype='object')

In [5]:
# Drop the "cases." / "demographic." / "diagnoses." prefixes (and shorten
# "age_at_index" to "age"), then remove fully duplicated rows so each distinct
# clinical record appears once.
df_clinical_select = (
    df_clinical
    .rename(
        columns={
            "cases.case_id": "case_id",
            "cases.submitter_id": "submitter_id",
            "demographic.age_at_index": "age",
            "demographic.ethnicity": "ethnicity",
            "demographic.gender": "gender",
            "demographic.race": "race",
            "diagnoses.primary_diagnosis": "primary_diagnosis",
        }
    )
    .drop_duplicates()
)
df_clinical_select.head()

Unnamed: 0,case_id,submitter_id,age,ethnicity,gender,race,primary_diagnosis
0,001cef41-ff86-4d3f-a140-a647ac4b10a1,TCGA-E2-A1IU,60,not hispanic or latino,female,white,"Infiltrating duct carcinoma, NOS"
3,0045349c-69d9-4306-a403-c9c1fa836644,TCGA-A1-A0SB,70,not hispanic or latino,female,white,Adenoid cystic carcinoma
4,00807dae-9f4a-4fd1-aac2-82eb11bf2afb,TCGA-A2-A04W,50,not hispanic or latino,female,white,Apocrine adenocarcinoma
11,00a2d166-78c9-4687-a195-3d6315c27574,TCGA-AN-A0AM,56,not hispanic or latino,female,white,"Infiltrating duct carcinoma, NOS"
17,00b11ca8-8540-4a3d-b602-ec754b00230b,TCGA-LL-A440,61,not hispanic or latino,female,white,"Lobular carcinoma, NOS"


In [6]:
# Print the row count, then the number of distinct values in each identifier
# column. Equal numbers mean every row has its own submitter_id and case_id.
print(len(df_clinical_select))
for id_column in ("submitter_id", "case_id"):
    # dropna=False counts a missing value as its own entry, like len(.unique()).
    print(df_clinical_select[id_column].nunique(dropna=False))

1098
1098
1098


In [11]:
# Load the whitespace-delimited download manifest. `sep=r"\s+"` is the
# documented replacement for `delim_whitespace=True`, which is deprecated and
# triggers a warning on every run.
df_manifest = pd.read_csv("../tcga_download/check_results/tcga-brca.txt", sep=r"\s+")
# Remove the literal ".svs" text so `filename` holds the bare slide name.
df_manifest["filename"] = df_manifest["filename"].str.replace('.svs', '', regex=False)
df_manifest.head()

  df_manifest = pd.read_csv("../tcga_download/check_results/tcga-brca.txt", delim_whitespace=True)


Unnamed: 0,id,filename,md5,size,state
0,decbdda7-e62a-4436-b233-28c5353d0f61,TCGA-A2-A25D-01Z-00-DX1.41DADDB8-3E3F-4F8F-8BE...,b31591d202ef59947fdc1f9e4f3c11e0,1703054197,released
1,eccd20fd-f1f1-4d8f-9104-6e28cedb00f2,TCGA-AR-A5QP-01Z-00-DX1.256FDB13-1F81-42DA-AF6...,742556e7202089f53c57bf426f1262e9,1227414145,released
2,c2c93798-a4df-47ff-a281-8960ae8c5c41,TCGA-C8-A12P-01Z-00-DX1.670B5DE8-07B0-4E4C-93F...,151f025bee789c6b68f4f15eb3159f39,300148309,released
3,a0c8d155-7c15-447f-be5f-813348d8a3eb,TCGA-E9-A1N8-01Z-00-DX1.1243AB1C-75A3-4A5E-967...,87e69a0aee1ee0075f34bf3c9a84b58c,780016529,released
4,ea0ddd49-2f89-4044-a12e-c7e51b172a74,TCGA-A2-A0YF-01Z-00-DX1.6166E995-0669-43D6-B9C...,8dcbfc943955f8de2a61d2567d4e9e56,1310284033,released


In [12]:
# Derive the submitter ID from the leading "TCGA-XX-XXXX" part of each
# filename. With a single capture group, expand=False yields a Series that is
# assigned directly as the new column; filenames that do not match give NaN.
df_manifest["submitter_id"] = df_manifest["filename"].str.extract(
    r'^(TCGA-[A-Z0-9]+-[A-Z0-9]+)',
    expand=False,
)

df_manifest.head()

Unnamed: 0,id,filename,md5,size,state,submitter_id
0,decbdda7-e62a-4436-b233-28c5353d0f61,TCGA-A2-A25D-01Z-00-DX1.41DADDB8-3E3F-4F8F-8BE...,b31591d202ef59947fdc1f9e4f3c11e0,1703054197,released,TCGA-A2-A25D
1,eccd20fd-f1f1-4d8f-9104-6e28cedb00f2,TCGA-AR-A5QP-01Z-00-DX1.256FDB13-1F81-42DA-AF6...,742556e7202089f53c57bf426f1262e9,1227414145,released,TCGA-AR-A5QP
2,c2c93798-a4df-47ff-a281-8960ae8c5c41,TCGA-C8-A12P-01Z-00-DX1.670B5DE8-07B0-4E4C-93F...,151f025bee789c6b68f4f15eb3159f39,300148309,released,TCGA-C8-A12P
3,a0c8d155-7c15-447f-be5f-813348d8a3eb,TCGA-E9-A1N8-01Z-00-DX1.1243AB1C-75A3-4A5E-967...,87e69a0aee1ee0075f34bf3c9a84b58c,780016529,released,TCGA-E9-A1N8
4,ea0ddd49-2f89-4044-a12e-c7e51b172a74,TCGA-A2-A0YF-01Z-00-DX1.6166E995-0669-43D6-B9C...,8dcbfc943955f8de2a61d2567d4e9e56,1310284033,released,TCGA-A2-A0YF


In [13]:
# Number of manifest rows; the merged table below is expected to have the same length.
print(len(df_manifest))

1133


In [14]:
# Attach the clinical attributes to every manifest entry through the shared
# 'submitter_id' key. The right join keeps each manifest row; rows with no
# clinical match get NaN in the clinical columns.
df_merged = df_clinical_select.merge(
    df_manifest[["submitter_id", "filename"]],
    how="right",
    on="submitter_id",
)

# Put 'filename' first and leave the remaining columns in their existing order.
cols = ["filename"] + [name for name in df_merged.columns if name != "filename"]
df_merged = df_merged[cols]

# Preview the merged table.
df_merged.head()

Unnamed: 0,filename,case_id,submitter_id,age,ethnicity,gender,race,primary_diagnosis
0,TCGA-A2-A25D-01Z-00-DX1.41DADDB8-3E3F-4F8F-8BE...,3b963d72-ba5c-467b-83c9-fbdb462510a3,TCGA-A2-A25D,89,not hispanic or latino,female,white,"Lobular carcinoma, NOS"
1,TCGA-AR-A5QP-01Z-00-DX1.256FDB13-1F81-42DA-AF6...,3c275152-d04b-440c-9621-2fc05ea977b6,TCGA-AR-A5QP,54,not hispanic or latino,female,white,"Lobular carcinoma, NOS"
2,TCGA-C8-A12P-01Z-00-DX1.670B5DE8-07B0-4E4C-93F...,abdc76db-f85e-4337-a57e-6d098789da03,TCGA-C8-A12P,55,not hispanic or latino,female,asian,"Infiltrating duct carcinoma, NOS"
3,TCGA-E9-A1N8-01Z-00-DX1.1243AB1C-75A3-4A5E-967...,ac075bc0-1b59-4557-beea-541694faee03,TCGA-E9-A1N8,48,not hispanic or latino,female,white,"Infiltrating duct carcinoma, NOS"
4,TCGA-A2-A0YF-01Z-00-DX1.6166E995-0669-43D6-B9C...,ae8c77fe-e6c8-44d5-8265-4a38c637bbef,TCGA-A2-A0YF,67,not hispanic or latino,female,black or african american,"Infiltrating duct carcinoma, NOS"


In [15]:
# Row count of the merged table, followed by how many rows carry each primary
# diagnosis (value_counts skips rows whose diagnosis is missing).
print(len(df_merged))
print(df_merged["primary_diagnosis"].value_counts())

1133
primary_diagnosis
Infiltrating duct carcinoma, NOS                            805
Lobular carcinoma, NOS                                      204
Infiltrating duct and lobular carcinoma                      27
Infiltrating duct mixed with other types of carcinoma        20
Mucinous adenocarcinoma                                      20
Metaplastic carcinoma, NOS                                   13
Intraductal papillary adenocarcinoma with invasion            7
Medullary carcinoma, NOS                                      7
Infiltrating lobular mixed with other types of carcinoma      7
Invasive micropapillary carcinoma                             4
Pleomorphic carcinoma                                         3
Paget disease and infiltrating duct carcinoma of breast       3
Papillary carcinoma, NOS                                      2
Phyllodes tumor, malignant                                    2
Tubular adenocarcinoma                                        2
Secretory carcino

In [16]:
# Map primary diagnosis strings to OncoTree codes.

# The triple-quoted block below is a bare string expression: it is never
# assigned or used, so it only preserves a broader draft mapping as dead text.
# NOTE(review): the draft codes were not validated here — confirm each against
# OncoTree before turning this into the active mapping.
'''
diagnosis_to_oncotree = {
    "Infiltrating duct carcinoma, NOS": "IDC",
    "Lobular carcinoma, NOS": "ILC",
    "Infiltrating duct and lobular carcinoma": "MDLC",
    "Infiltrating duct mixed with other types of carcinoma": "BRCNOS",
    "Mucinous adenocarcinoma": "BRSRCC",
    "Metaplastic carcinoma, NOS": "SPC",
    "Medullary carcinoma, NOS": "IMMC",
    "Intraductal papillary adenocarcinoma with invasion": "IDC",
    "Infiltrating lobular mixed with other types of carcinoma": "BRCNOS",
    "Invasive micropapillary carcinoma": "IDC",
    "Pleomorphic carcinoma": "SPC",
    "Paget disease and infiltrating duct carcinoma of breast": "IDC",
    "Phyllodes tumor, malignant": None,
    "Papillary carcinoma, NOS": "IDC",
    "Tubular adenocarcinoma": "IDC",
    "Carcinoma, NOS": "BRCNOS",
    "Adenoid cystic carcinoma": "ACBC",
    "Apocrine adenocarcinoma": "BRCANOS",
    "Secretory carcinoma of breast": "BRCANOS",
    "Cribriform carcinoma, NOS": "IDC",
    "Basal cell carcinoma, NOS": None,
    "Large cell neuroendocrine carcinoma": "BRCNOS",
}
'''

# Active mapping: only these two diagnoses are assigned a label.
diagnosis_to_oncotree = {
    "Infiltrating duct carcinoma, NOS": "IDC",
    "Lobular carcinoma, NOS": "ILC",
}

In [17]:
# Add a 'label' column by looking up each primary diagnosis in
# diagnosis_to_oncotree. Diagnoses that are not keys of the dict become NaN.
# Note: this adds the column to the existing df_merged in place.
df_merged["label"] = df_merged["primary_diagnosis"].map(diagnosis_to_oncotree)
df_merged.head()

Unnamed: 0,filename,case_id,submitter_id,age,ethnicity,gender,race,primary_diagnosis,label
0,TCGA-A2-A25D-01Z-00-DX1.41DADDB8-3E3F-4F8F-8BE...,3b963d72-ba5c-467b-83c9-fbdb462510a3,TCGA-A2-A25D,89,not hispanic or latino,female,white,"Lobular carcinoma, NOS",ILC
1,TCGA-AR-A5QP-01Z-00-DX1.256FDB13-1F81-42DA-AF6...,3c275152-d04b-440c-9621-2fc05ea977b6,TCGA-AR-A5QP,54,not hispanic or latino,female,white,"Lobular carcinoma, NOS",ILC
2,TCGA-C8-A12P-01Z-00-DX1.670B5DE8-07B0-4E4C-93F...,abdc76db-f85e-4337-a57e-6d098789da03,TCGA-C8-A12P,55,not hispanic or latino,female,asian,"Infiltrating duct carcinoma, NOS",IDC
3,TCGA-E9-A1N8-01Z-00-DX1.1243AB1C-75A3-4A5E-967...,ac075bc0-1b59-4557-beea-541694faee03,TCGA-E9-A1N8,48,not hispanic or latino,female,white,"Infiltrating duct carcinoma, NOS",IDC
4,TCGA-A2-A0YF-01Z-00-DX1.6166E995-0669-43D6-B9C...,ae8c77fe-e6c8-44d5-8265-4a38c637bbef,TCGA-A2-A0YF,67,not hispanic or latino,female,black or african american,"Infiltrating duct carcinoma, NOS",IDC


In [19]:
# Rows per assigned label (value_counts omits rows whose label is NaN),
# followed by the total row count for comparison.
print(df_merged["label"].value_counts())
print(len(df_merged))

label
IDC    805
ILC    204
Name: count, dtype: int64
1133


In [20]:
# Keep only the rows labelled IDC or ILC. Rows with a missing label fail the
# membership test and are therefore excluded.
df_merged_idc_ilc = df_merged[df_merged["label"].isin(["IDC", "ILC"])]
df_merged_idc_ilc.head()

Unnamed: 0,filename,case_id,submitter_id,age,ethnicity,gender,race,primary_diagnosis,label
0,TCGA-A2-A25D-01Z-00-DX1.41DADDB8-3E3F-4F8F-8BE...,3b963d72-ba5c-467b-83c9-fbdb462510a3,TCGA-A2-A25D,89,not hispanic or latino,female,white,"Lobular carcinoma, NOS",ILC
1,TCGA-AR-A5QP-01Z-00-DX1.256FDB13-1F81-42DA-AF6...,3c275152-d04b-440c-9621-2fc05ea977b6,TCGA-AR-A5QP,54,not hispanic or latino,female,white,"Lobular carcinoma, NOS",ILC
2,TCGA-C8-A12P-01Z-00-DX1.670B5DE8-07B0-4E4C-93F...,abdc76db-f85e-4337-a57e-6d098789da03,TCGA-C8-A12P,55,not hispanic or latino,female,asian,"Infiltrating duct carcinoma, NOS",IDC
3,TCGA-E9-A1N8-01Z-00-DX1.1243AB1C-75A3-4A5E-967...,ac075bc0-1b59-4557-beea-541694faee03,TCGA-E9-A1N8,48,not hispanic or latino,female,white,"Infiltrating duct carcinoma, NOS",IDC
4,TCGA-A2-A0YF-01Z-00-DX1.6166E995-0669-43D6-B9C...,ae8c77fe-e6c8-44d5-8265-4a38c637bbef,TCGA-A2-A0YF,67,not hispanic or latino,female,black or african american,"Infiltrating duct carcinoma, NOS",IDC


In [21]:
# Number of rows that remain after restricting to the IDC and ILC labels.
print(len(df_merged_idc_ilc))

1009


In [22]:
# Write the IDC/ILC subset to CSV; index=False leaves the DataFrame index out of the file.
df_merged_idc_ilc.to_csv("clinical/tcga-brca_label.csv", index=False)