# **Exploratory Data Analysis of GDC API Data**
GDC: Genomic Data Commons

# Importing Libraries

In [1]:
import json
import os
import re

import pandas as pd
import requests

# Constants and Paths

In [2]:
# Base URL HTML from GDC API
GDC_API_URL = 'https://api.gdc.cancer.gov'

# Endpoint for files download
DATA_ENDPOINT = f'{GDC_API_URL}/data'

# Endpoint of files related to cases
FILES_ENDPOINT = f'{GDC_API_URL}/files'

# Data folder path
DATA_PATH = '../../data'

# External data folder path
EXTERNAL_DATA_PATH = f'{DATA_PATH}/external/gdc-tcga'

# Interim data folder path
INTERIM_DATA_PATH = f'{DATA_PATH}/interim/gdc-tcga'

# Data Loading

In [3]:
# DataFrame with the cases of interest
df_cases = pd.read_csv(
    f'{INTERIM_DATA_PATH}/gdc-cases-of-interest.csv'
)

# DataFrame with the files of interest
df_files = pd.read_csv(
    f'{INTERIM_DATA_PATH}/gdc-files-of-interest.csv'
)

# Files Quantification

In [4]:
# Columns used in aggregation
columns = [
    'access',
    'experimental_strategy',
    'data_category',
    'data_type',
    'data_format'
]

# Aggregate data about files and count them
df_files_agg = df_files \
    .groupby(columns) \
    .agg(count=pd.NamedAgg(column='file_id', aggfunc='count'))

# Make a normalized count of the files
df_files_agg['%count'] = df_files_agg['count'] / df_files_agg['count'].sum()
df_files_agg['%count'] = (df_files_agg['%count'] * 100).round(2)

# Print the result of aggregation
df_files_agg

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,count,%count
access,experimental_strategy,data_category,data_type,data_format,Unnamed: 5_level_1,Unnamed: 6_level_1
controlled,RNA-Seq,Sequencing Reads,Aligned Reads,BAM,59954,25.78
controlled,RNA-Seq,Structural Variation,Transcript Fusion,BEDPE,39538,17.0
controlled,RNA-Seq,Structural Variation,Transcript Fusion,TSV,37403,16.08
controlled,RNA-Seq,Transcriptome Profiling,Splice Junction Quantification,TSV,20412,8.78
controlled,miRNA-Seq,Sequencing Reads,Aligned Reads,BAM,18278,7.86
open,RNA-Seq,Transcriptome Profiling,Gene Expression Quantification,TSV,20412,8.78
open,miRNA-Seq,Transcriptome Profiling,Isoform Expression Quantification,TSV,6617,2.85
open,miRNA-Seq,Transcriptome Profiling,Isoform Expression Quantification,TXT,11661,5.01
open,miRNA-Seq,Transcriptome Profiling,miRNA Expression Quantification,TSV,6617,2.85
open,miRNA-Seq,Transcriptome Profiling,miRNA Expression Quantification,TXT,11661,5.01


# Cases and Files
Transcriptome Profiling Open Files

In [5]:
# Define the filtering condition
condition = '(data_category == "Transcriptome Profiling") and (access == "open")'

# Filter files and retrieve case informations
df_files_and_cases = df_files \
    .query(condition) \
    .reset_index(drop=True) \
    .merge(right=df_cases, on='case_id', how='inner')

## Primary Sites by Case

In [6]:
# Aggregate cases according to their primary sites and count them
df_files_and_cases \
    .groupby(['case_id', 'primary_site']) \
    .agg(distinct_sites=pd.NamedAgg(column='primary_site', aggfunc='nunique')) \
    .sort_values(by='distinct_sites', ascending=False)

Unnamed: 0_level_0,Unnamed: 1_level_0,distinct_sites
case_id,primary_site,Unnamed: 2_level_1
0004d251-3f70-4395-b175-c94c2f5b1b81,Liver and intrahepatic bile ducts,1
aadcf26b-c398-489f-a86a-e8db1b5db456,Bladder,1
aaea5f9c-b35b-47f8-93cb-bdc7b81844ff,Breast,1
aaeb1d59-d6ce-4422-9639-f1e83c002d84,Prostate gland,1
aaee2c46-a0c1-4494-9d7f-c98b15a890d9,Hematopoietic and reticuloendothelial systems,1
...,...,...
54d21956-25e4-42df-adbe-6907721fc4b5,"Heart, mediastinum, and pleura",1
54d9a54a-7f5d-4ce4-a1a2-edcb0d3389fd,Hematopoietic and reticuloendothelial systems,1
54d9fa54-0508-4d83-babd-02151ebc24c5,Hematopoietic and reticuloendothelial systems,1
54dab158-eb39-4334-8b5a-bf290e6b8bb1,Other and unspecified parts of tongue,1


## Disease Types by Case

In [7]:
# Aggregate cases according to their disease types and count them
df_files_and_cases \
    .groupby(['case_id', 'disease_type']) \
    .agg(distinct_diseases=pd.NamedAgg(column='disease_type', aggfunc='nunique')) \
    .sort_values(by='distinct_diseases', ascending=False)

Unnamed: 0_level_0,Unnamed: 1_level_0,distinct_diseases
case_id,disease_type,Unnamed: 2_level_1
0004d251-3f70-4395-b175-c94c2f5b1b81,Adenomas and Adenocarcinomas,1
aadb8624-dbd7-424a-ab0b-ed326c088c70,Adenomas and Adenocarcinomas,1
aae0fca8-99e9-4fde-a2e9-b0ea500737f8,Adenomas and Adenocarcinomas,1
aaea5f9c-b35b-47f8-93cb-bdc7b81844ff,Ductal and Lobular Neoplasms,1
aaeb1d59-d6ce-4422-9639-f1e83c002d84,Adenomas and Adenocarcinomas,1
...,...,...
54d21956-25e4-42df-adbe-6907721fc4b5,Mesothelial Neoplasms,1
54d9a54a-7f5d-4ce4-a1a2-edcb0d3389fd,Myeloid Leukemias,1
54d9fa54-0508-4d83-babd-02151ebc24c5,Myeloid Leukemias,1
54dab158-eb39-4334-8b5a-bf290e6b8bb1,Squamous Cell Neoplasms,1


## Files by Case

In [8]:
# Aggregate cases according to the data type of the files and count them
df_files_and_cases \
    .groupby(['case_id', 'data_type']) \
    .agg(distinct_files=pd.NamedAgg(column='file_id', aggfunc='nunique')) \
    .sort_values(by='distinct_files', ascending=False)

Unnamed: 0_level_0,Unnamed: 1_level_0,distinct_files
case_id,data_type,Unnamed: 2_level_1
842402de-519e-4588-ad49-19df18db899b,Gene Expression Quantification,8
3a4afbe7-60d6-4a0d-8166-56a04ff127b0,Gene Expression Quantification,8
9ff6d022-6e23-4f44-a480-1b61929e6ee3,Gene Expression Quantification,6
9ff6d022-6e23-4f44-a480-1b61929e6ee3,Isoform Expression Quantification,6
9ff6d022-6e23-4f44-a480-1b61929e6ee3,miRNA Expression Quantification,6
...,...,...
5c48e21f-1fe9-4007-9b0c-7ca766586fff,miRNA Expression Quantification,1
5c4aebeb-fb6f-4d66-b24a-89ef20de971e,Gene Expression Quantification,1
5c4aebeb-fb6f-4d66-b24a-89ef20de971e,Isoform Expression Quantification,1
5c4aebeb-fb6f-4d66-b24a-89ef20de971e,miRNA Expression Quantification,1


In [9]:
# Calculate the number of cases with more than one associated miRNA-Seq or RNA-Seq file
df_files_and_cases \
    .groupby(['case_id', 'data_type']) \
    .agg(distinct_files=pd.NamedAgg(column='file_id', aggfunc='nunique')) \
    .query('distinct_files > 1') \
    .reset_index() \
    .value_counts(subset='case_id') \
    .count()

3433

## Files by Primary Site

In [10]:
# Aggregate files according to their primary site and count them
df_files_and_cases \
    .groupby('primary_site') \
    .agg(distinct_files=pd.NamedAgg(column='file_id', aggfunc='nunique')) \
    .sort_values(by='distinct_files', ascending=False) \
    .reset_index()

Unnamed: 0,primary_site,distinct_files
0,Hematopoietic and reticuloendothelial systems,9753
1,Bronchus and lung,5483
2,Kidney,5465
3,Thyroid gland,4630
4,Breast,4019
5,Brain,3509
6,Colon,1997
7,Corpus uteri,1845
8,Ovary,1784
9,Prostate gland,1656


# Files
Case ID: 9ff6d022-6e23-4f44-a480-1b61929e6ee3

## Samples

In [11]:
# Fields of interest for the endpoint request
fields = [
    'access',
    'cases.samples.tissue_type',
    'cases.samples.tumor_descriptor',
    'cases.samples.sample_type',
    'created_datetime',
    'data_category',
    'data_format',
    'data_type',
    'experimental_strategy',
    'updated_datetime'
]
fields = ','.join(fields)

# List of UUIDs of files of interest
file_ids = df_files \
    .query('case_id == "9ff6d022-6e23-4f44-a480-1b61929e6ee3"') \
    ['file_id'].to_list()

# Initializa the files DataFrame for this case
df_files_info = pd.DataFrame()

# Request informations about the files of interest to the endpoint
for file_id in file_ids:
    # Filter used in the endpoint request
    filter = {
        'op': '=',
        'content': {
            'field': 'file_id',
            'value': file_id
        }
    }

    # Parameters for the endpoint request
    params = {
        'fields': fields,
        'filters': filter,
        'size': '1'
    }

    # Request file information to the endpoint
    response = requests.post(
        url=FILES_ENDPOINT,
        headers={'Content-Type': 'application/json'},
        json=params
    )

    # Transform the response content to a DataFrame
    json_response = json.loads(response.content.decode('utf-8'))
    df_file_info = pd.json_normalize(json_response['data']['hits'])

    # Concatenate the cases of this project with the others
    if df_files_info.empty == False:
        df_files_info = pd.concat([df_files_info, df_file_info], ignore_index=True)
    else:
        df_files_info = df_file_info.copy()

In [12]:
# Print the records of interest from the DataFrame
pd.set_option('display.max_colwidth', 100)
df_files_info \
    .query('data_category == "Transcriptome Profiling"') \
    .sort_values(by=['data_type', 'created_datetime']) \
    .reset_index(drop=True) \
    [[
        'id',
        'data_type',
        'cases',
        'created_datetime',
        'updated_datetime'
    ]]

Unnamed: 0,id,data_type,cases,created_datetime,updated_datetime
0,b96247db-6f2a-4d26-9ac4-142e1c079e0e,Gene Expression Quantification,"[{'samples': [{'tumor_descriptor': 'Primary', 'sample_type': 'Primary Tumor', 'tissue_type': 'Tu...",2022-01-06T09:46:53.769541-06:00,2024-07-30T21:45:03.814480-05:00
1,93c8678d-7afd-4be2-92cb-c39ad7701b43,Gene Expression Quantification,"[{'samples': [{'tumor_descriptor': 'Not Applicable', 'sample_type': 'Solid Tissue Normal', 'tiss...",2022-01-06T09:46:56.688571-06:00,2024-07-30T21:47:29.474323-05:00
2,d12e199e-8471-4f60-8da2-2b479db61ab4,Gene Expression Quantification,"[{'samples': [{'tumor_descriptor': 'Primary', 'sample_type': 'Primary Tumor', 'tissue_type': 'Tu...",2022-01-06T09:47:32.472999-06:00,2024-07-30T21:53:34.939697-05:00
3,167073fa-9e38-4f8d-af1f-301ed3a8b5f7,Gene Expression Quantification,"[{'samples': [{'tumor_descriptor': 'Primary', 'sample_type': 'Primary Tumor', 'tissue_type': 'Tu...",2022-01-06T09:47:37.123345-06:00,2024-07-30T21:41:07.815061-05:00
4,c54a604e-9379-46f1-938c-e2d09f8538d8,Gene Expression Quantification,"[{'samples': [{'tumor_descriptor': 'Primary', 'sample_type': 'Primary Tumor', 'tissue_type': 'Tu...",2022-01-06T09:47:57.280420-06:00,2024-07-30T21:43:27.830713-05:00
5,ceb4dd9b-6f10-4358-8312-3b126952d3cc,Gene Expression Quantification,"[{'samples': [{'tumor_descriptor': 'Primary', 'sample_type': 'Primary Tumor', 'tissue_type': 'Tu...",2022-01-06T09:55:35.726930-06:00,2024-07-30T21:41:18.605327-05:00
6,0cf6cded-942f-4141-a4a5-35afb7082f37,Isoform Expression Quantification,"[{'samples': [{'tumor_descriptor': 'Not Applicable', 'sample_type': 'Solid Tissue Normal', 'tiss...",2019-10-10T11:23:13.777284-05:00,2024-07-29T13:21:12.651533-05:00
7,d39cc122-925b-4292-9fe1-cab1d031bbd7,Isoform Expression Quantification,"[{'samples': [{'tumor_descriptor': 'Primary', 'sample_type': 'Primary Tumor', 'tissue_type': 'Tu...",2019-12-13T08:15:27.709025-06:00,2024-07-29T10:48:30.938949-05:00
8,0afd73ac-8fbc-418f-9d58-b1da06da4c98,Isoform Expression Quantification,"[{'samples': [{'tumor_descriptor': 'Primary', 'sample_type': 'Primary Tumor', 'tissue_type': 'Tu...",2020-10-16T17:02:06.560539-05:00,2024-07-29T15:23:17.906969-05:00
9,c22a38cc-064b-46b8-a576-2fcbcfec7ceb,Isoform Expression Quantification,"[{'samples': [{'tumor_descriptor': 'Primary', 'sample_type': 'Primary Tumor', 'tissue_type': 'Tu...",2020-10-16T17:06:28.600735-05:00,2024-07-29T15:24:06.021117-05:00


## Download

In [13]:
# UUIDs of files of interest
file_ids = [
    'b96247db-6f2a-4d26-9ac4-142e1c079e0e', # Gene Expression Quantification
    '0cf6cded-942f-4141-a4a5-35afb7082f37', # Isoform Expression Quantification
    '01e3d493-7e2a-4a50-b8cb-2597143a8e1a'  # miRNA Expression Quantification
]

# Download each of the files of interest
for file_id in file_ids:
    # Request file download to the endpoint
    response = requests.get(
        url=f'{DATA_ENDPOINT}/{file_id}', 
        headers={'Content-Type': 'application/json'}
    )

    # Get the file name from endpoint response
    response_head_cd = response.headers['Content-Disposition']
    file_name = re.findall('filename=(.+)', response_head_cd)[0]

    # Store the file in the external data folder
    file_path = os.path.join(EXTERNAL_DATA_PATH, file_name)
    with open(file_path, 'wb') as output_file:
        output_file.write(response.content)