In this notebook, using R and TCGABiolinks we download the clinical data associated with many TCGA patients. Both from the TCGA study and other studies. The studies included are:

[
    'LAML', 'ACC', 'BLCA', 'LGG', 'BRCA', 'CESC', 'CHOL', 'LCML', 'COAD',
    'CNTL', 'ESCA', 'FPPP', 'GBM', 'HNSC', 'KICH', 'KIRC', 'KIRP', 'LIHC',
    'LUAD', 'LUSC', 'DLBC', 'MESO', 'MISC', 'OV', 'PAAD', 'PCPG', 'PRAD',
    'READ', 'SARC', 'SKCM', 'STAD', 'TGCT', 'THYM', 'THCA', 'UCS', 'UCEC',
    'UVM'
]

This is to prepare an experiment where we attempt to separate TGCT samples from other types of cancer.

In [1]:
if (!requireNamespace("BiocManager", quietly = TRUE))
    install.packages("BiocManager")

BiocManager::install("TCGAbiolinks", ask = FALSE, verbose = TRUE)

Installing package into ‘/usr/local/lib/R/site-library’
(as ‘lib’ is unspecified)

'getOption("repos")' replaces Bioconductor standard repositories, see
'help("repositories", package = "BiocManager")' for details.
Replacement repositories:
    CRAN: https://cran.rstudio.com

Bioconductor version 3.21 (BiocManager 1.30.26), R 4.5.0 (2025-04-11)

Installing package(s) 'BiocVersion', 'TCGAbiolinks'

system (cmd0): /usr/lib/R/bin/R CMD INSTALL

also installing the dependencies ‘plogr’, ‘png’, ‘Biostrings’, ‘RSQLite’, ‘KEGGREST’, ‘filelock’, ‘UCSC.utils’, ‘GenomeInfoDbData’, ‘matrixStats’, ‘abind’, ‘SparseArray’, ‘AnnotationDbi’, ‘BiocFileCache’, ‘BiocGenerics’, ‘GenomeInfoDb’, ‘XVector’, ‘R.oo’, ‘R.methodsS3’, ‘MatrixGenerics’, ‘Biobase’, ‘S4Arrays’, ‘DelayedArray’, ‘downloader’, ‘biomaRt’, ‘GenomicRanges’, ‘XML’, ‘plyr’, ‘IRanges’, ‘S4Vectors’, ‘R.utils’, ‘SummarizedExperiment’, ‘TCGAbiolinksGUI.data’


foundpkgs: plogr, png, Biostrings, RSQLite, KEGGREST, filelock, UCSC.utils, GenomeInfo

In [1]:
library(TCGAbiolinks)
library(dplyr)


Attaching package: ‘dplyr’


The following objects are masked from ‘package:stats’:

    filter, lag


The following objects are masked from ‘package:base’:

    intersect, setdiff, setequal, union




In [2]:
tcga_projects <- c(
  'LAML', 'ACC', 'BLCA', 'LGG', 'BRCA', 'CESC', 'CHOL', 'LCML', 'COAD',
  'CNTL', 'ESCA', 'FPPP', 'GBM', 'HNSC', 'KICH', 'KIRC', 'KIRP', 'LIHC',
  'LUAD', 'LUSC', 'DLBC', 'MESO', 'MISC', 'OV', 'PAAD', 'PCPG', 'PRAD',
  'READ', 'SARC', 'SKCM', 'STAD', 'TGCT', 'THYM', 'THCA', 'UCS', 'UCEC',
  'UVM'
)

project_ids <- paste0("TCGA-", tcga_projects)

clinical_data_list <- list()

In [3]:
for (project in project_ids) {
  message("Downloading clinical data for: ", project)
  try({
    clinical_data <- GDCquery_clinic(project = project, type = "clinical")

    # Add a column to indicate project source
    clinical_data$project_id <- project

    # Store in list
    clinical_data_list[[project]] <- clinical_data
  }, silent = FALSE)
}

Downloading clinical data for: TCGA-LAML



Error in dplyr::select(., c(submitter_id, days_to_follow_up, disease_response)) : 
  Can't select columns that don't exist.
[31m✖[39m Column `disease_response` doesn't exist.


Downloading clinical data for: TCGA-ACC

Downloading clinical data for: TCGA-BLCA

Downloading clinical data for: TCGA-LGG

Downloading clinical data for: TCGA-BRCA

Downloading clinical data for: TCGA-CESC

Downloading clinical data for: TCGA-CHOL

Downloading clinical data for: TCGA-LCML





|id                        |name                                                                                                |
|:-------------------------|:---------------------------------------------------------------------------------------------------|
|HCMI-CMDC                 |NCI Cancer Model Development for the Human Cancer Model Initiative                                  |
|MMRF-COMMPASS             |Multiple Myeloma CoMMpass Study                                                                     |
|CDDP_EAGLE-1              |CDDP Integrative Analysis of Lung Adenocarcinoma (Phase 2)                                          |
|MP2PRT-WT                 |Molecular Profiling to Predict Response to Treatment - Wilms Tumor                                  |
|TCGA-BRCA                 |Breast Invasive Carcinoma                                                                           |
|TCGA-UCEC                 |Uterine Corpus Endometrial Carcinoma                        

Downloading clinical data for: TCGA-COAD

Downloading clinical data for: TCGA-CNTL





|id                        |name                                                                                                |
|:-------------------------|:---------------------------------------------------------------------------------------------------|
|HCMI-CMDC                 |NCI Cancer Model Development for the Human Cancer Model Initiative                                  |
|MMRF-COMMPASS             |Multiple Myeloma CoMMpass Study                                                                     |
|CDDP_EAGLE-1              |CDDP Integrative Analysis of Lung Adenocarcinoma (Phase 2)                                          |
|MP2PRT-WT                 |Molecular Profiling to Predict Response to Treatment - Wilms Tumor                                  |
|TCGA-BRCA                 |Breast Invasive Carcinoma                                                                           |
|TCGA-UCEC                 |Uterine Corpus Endometrial Carcinoma                        

Downloading clinical data for: TCGA-ESCA

Downloading clinical data for: TCGA-FPPP





|id                        |name                                                                                                |
|:-------------------------|:---------------------------------------------------------------------------------------------------|
|HCMI-CMDC                 |NCI Cancer Model Development for the Human Cancer Model Initiative                                  |
|MMRF-COMMPASS             |Multiple Myeloma CoMMpass Study                                                                     |
|CDDP_EAGLE-1              |CDDP Integrative Analysis of Lung Adenocarcinoma (Phase 2)                                          |
|MP2PRT-WT                 |Molecular Profiling to Predict Response to Treatment - Wilms Tumor                                  |
|TCGA-BRCA                 |Breast Invasive Carcinoma                                                                           |
|TCGA-UCEC                 |Uterine Corpus Endometrial Carcinoma                        

Downloading clinical data for: TCGA-GBM

Downloading clinical data for: TCGA-HNSC

Downloading clinical data for: TCGA-KICH

Downloading clinical data for: TCGA-KIRC

Downloading clinical data for: TCGA-KIRP

Downloading clinical data for: TCGA-LIHC

Downloading clinical data for: TCGA-LUAD

Downloading clinical data for: TCGA-LUSC

Downloading clinical data for: TCGA-DLBC

Downloading clinical data for: TCGA-MESO

Downloading clinical data for: TCGA-MISC





|id                        |name                                                                                                |
|:-------------------------|:---------------------------------------------------------------------------------------------------|
|HCMI-CMDC                 |NCI Cancer Model Development for the Human Cancer Model Initiative                                  |
|MMRF-COMMPASS             |Multiple Myeloma CoMMpass Study                                                                     |
|CDDP_EAGLE-1              |CDDP Integrative Analysis of Lung Adenocarcinoma (Phase 2)                                          |
|MP2PRT-WT                 |Molecular Profiling to Predict Response to Treatment - Wilms Tumor                                  |
|TCGA-BRCA                 |Breast Invasive Carcinoma                                                                           |
|TCGA-UCEC                 |Uterine Corpus Endometrial Carcinoma                        

Downloading clinical data for: TCGA-OV

Downloading clinical data for: TCGA-PAAD

Downloading clinical data for: TCGA-PCPG

Downloading clinical data for: TCGA-PRAD

Downloading clinical data for: TCGA-READ

Downloading clinical data for: TCGA-SARC

Downloading clinical data for: TCGA-SKCM

Downloading clinical data for: TCGA-STAD

Downloading clinical data for: TCGA-TGCT

Downloading clinical data for: TCGA-THYM

Downloading clinical data for: TCGA-THCA

Downloading clinical data for: TCGA-UCS

Downloading clinical data for: TCGA-UCEC

Downloading clinical data for: TCGA-UVM



In [8]:
all_clinical_data <- bind_rows(clinical_data_list)

In [11]:
columns_to_keep <- c(
  "submitter_id", "sample_type", "sample_type_id", "tumor_descriptor", "specimen_type",
  "synchronous_malignancy", "site_of_resection_or_biopsy", "age_at_index", "days_to_birth",
  "primary_diagnosis", "prior_treatments", "prior_malignancy", "tissue_or_organ_of_origin",
  "ajcc_pathologic_stage", "diagnosis_is_primary_disease", "gender"
)

all_clinical_data <- all_clinical_data %>% select(any_of(columns_to_keep))

In [12]:
head(all_clinical_data)

Unnamed: 0_level_0,submitter_id,synchronous_malignancy,site_of_resection_or_biopsy,age_at_index,days_to_birth,primary_diagnosis,prior_malignancy,tissue_or_organ_of_origin,ajcc_pathologic_stage,diagnosis_is_primary_disease,gender
Unnamed: 0_level_1,<chr>,<chr>,<chr>,<int>,<int>,<chr>,<chr>,<chr>,<chr>,<lgl>,<chr>
1,TCGA-OR-A5LF,No,"Adrenal gland, NOS",74,-27179,Adrenal cortical carcinoma,no,Cortex of adrenal gland,,True,female
2,TCGA-OR-A5KB,No,"Adrenal gland, NOS",61,-22550,Adrenal cortical carcinoma,no,Cortex of adrenal gland,,True,female
3,TCGA-OR-A5JH,No,"Adrenal gland, NOS",32,-11994,Adrenal cortical carcinoma,no,Cortex of adrenal gland,,True,female
4,TCGA-PK-A5HC,No,"Adrenal gland, NOS",44,-16182,Adrenal cortical carcinoma,no,Cortex of adrenal gland,,True,female
5,TCGA-OR-A5L2,No,"Adrenal gland, NOS",83,-30535,Adrenal cortical carcinoma,no,Cortex of adrenal gland,,True,female
6,TCGA-P6-A5OG,No,"Adrenal gland, NOS",45,-16659,Adrenal cortical carcinoma,yes,Cortex of adrenal gland,,True,female


In [13]:
write.csv(all_clinical_data, "TCGA_sample_clin_data.csv", row.names = FALSE)