# **Downloading Files of Interest from TCGA-BRCA**
TCGA: The Cancer Genome Atlas

# Importing Libraries

In [1]:
import json
import os

import pandas as pd
import requests

# Constants and Paths

In [2]:
# Base URL of the GDC endpoint that serves file contents
DATA_ENDPOINT = 'https://api.gdc.cancer.gov/data'

# Root data folder (relative path)
DATA_FOLDER = '../../data'

# TCGA-BRCA folder for external data (download target of this notebook)
EXTERNAL_DATA_PATH = DATA_FOLDER + '/external/tcga-brca'

# TCGA-BRCA folder for interim data (CSV tables read by this notebook)
INTERIM_DATA_PATH = DATA_FOLDER + '/interim/tcga-brca'

# Function

In [None]:
def gdc_api_files_download(files, path):
    """
    Downloads files from the GDC (Genomic Data Commons) API and saves them locally.

    Each file is requested from ``{DATA_ENDPOINT}/{file_id}`` and written to
    ``path`` as ``{experimental_strategy}_{file_id}.{data_format}``, with the
    experimental strategy and data format lower-cased.

    Parameters:
    -----------
    files : list of dict
        A list of dictionaries where each dictionary contains metadata for a file. 
        Expected keys in each dictionary:
        - 'file_id' (str): Unique identifier for the file.
        - 'experimental_strategy' (str): Description of the experiment type.
        - 'data_format' (str): File format (e.g., 'TXT', 'CSV').
    path : str
        The local directory where downloaded files will be saved. It is
        created (including missing parents) when it does not exist yet.

    Raises:
    -------
    requests.HTTPError
        If the API answers with a 4xx/5xx status for a file. The download
        loop stops at that file and nothing is written for it.
    requests.Timeout
        If the server cannot be reached or stops sending data for too long.
    """
    # The target folder may not exist on a fresh checkout; open() would fail
    os.makedirs(path, exist_ok=True)

    # Download each file and save it using file metadata
    for file in files:
        # Important file metadata
        file_id = file['file_id']
        file_type = file['experimental_strategy'].lower()
        file_format = file['data_format'].lower()

        # Destination of the file in the external data folder
        file_name = f'{file_type}_{file_id}.{file_format}'
        file_path = os.path.join(path, file_name)

        # Request the object of interest to the endpoint. The body is streamed
        # so large files are not held in memory, and the timeout
        # (connect, read) keeps a stalled connection from hanging forever.
        with requests.get(
            url=f'{DATA_ENDPOINT}/{file_id}',
            headers={'Content-Type': 'application/json'},
            stream=True,
            timeout=(10, 300)
        ) as response:
            # Fail loudly instead of saving an error payload as a data file
            response.raise_for_status()

            # Write the file in chunks of 1 MiB
            with open(file_path, 'wb') as output_file:
                for chunk in response.iter_content(chunk_size=1024 * 1024):
                    output_file.write(chunk)

# Data Loading and Preparation

In [None]:
# Files of interest for the normal tissue analysis
df_normal_files = pd.read_csv(f'{INTERIM_DATA_PATH}/brca-api-files-normal-analysis.csv')

# Cases of interest for the tumor tissue analysis
df_tumor_cases = pd.read_csv(f'{INTERIM_DATA_PATH}/brca-api-cases-tumor-analysis.csv')

# Files of interest for the tumor tissue analysis
df_tumor_files = pd.read_csv(f'{INTERIM_DATA_PATH}/brca-api-files-tumor-analysis.csv')

# Join cases and files on their shared case identifier (only matching rows
# are kept) and keep the columns used by the download cells
df_tumor_cases_files = pd.merge(
    left=df_tumor_cases,
    right=df_tumor_files,
    on='case_id',
    how='inner'
)[['file_id', 'case_id', 'experimental_strategy', 'data_format', 'pam50_mrna']]

# Tumor Tissue Analysis Files

## Basal-like

In [None]:
# Molecular subtype handled by this cell (named `subtype` so the builtin
# `filter` is not shadowed for the rest of the session)
subtype = 'Basal-like'

# File metadata records of the cases labelled with this subtype; a boolean
# mask is used instead of interpolating the value into a query string
files = df_tumor_cases_files \
    .loc[
        df_tumor_cases_files['pam50_mrna'] == subtype,
        ['file_id', 'experimental_strategy', 'data_format']
    ] \
    .to_dict(orient='records')

# Folder path to write files (lower-cased subtype, spaces turned into hyphens)
folder_name = subtype.replace(' ', '-').lower()
path = f'{EXTERNAL_DATA_PATH}/{folder_name}-files'

# Download all files of interest related to the specific molecular subtype
gdc_api_files_download(files, path)

## HER2-enriched

In [None]:
# Molecular subtype handled by this cell (named `subtype` so the builtin
# `filter` is not shadowed for the rest of the session)
subtype = 'HER2-enriched'

# File metadata records of the cases labelled with this subtype; a boolean
# mask is used instead of interpolating the value into a query string
files = df_tumor_cases_files \
    .loc[
        df_tumor_cases_files['pam50_mrna'] == subtype,
        ['file_id', 'experimental_strategy', 'data_format']
    ] \
    .to_dict(orient='records')

# Folder path to write files (lower-cased subtype, spaces turned into hyphens)
folder_name = subtype.replace(' ', '-').lower()
path = f'{EXTERNAL_DATA_PATH}/{folder_name}-files'

# Download all files of interest related to the specific molecular subtype
gdc_api_files_download(files, path)

## Luminal A

In [None]:
# Molecular subtype handled by this cell (named `subtype` so the builtin
# `filter` is not shadowed for the rest of the session)
subtype = 'Luminal A'

# File metadata records of the cases labelled with this subtype; a boolean
# mask is used instead of interpolating the value into a query string
files = df_tumor_cases_files \
    .loc[
        df_tumor_cases_files['pam50_mrna'] == subtype,
        ['file_id', 'experimental_strategy', 'data_format']
    ] \
    .to_dict(orient='records')

# Folder path to write files (lower-cased subtype, spaces turned into hyphens)
folder_name = subtype.replace(' ', '-').lower()
path = f'{EXTERNAL_DATA_PATH}/{folder_name}-files'

# Download all files of interest related to the specific molecular subtype
gdc_api_files_download(files, path)

## Luminal B

In [None]:
# Molecular subtype handled by this cell (named `subtype` so the builtin
# `filter` is not shadowed for the rest of the session)
subtype = 'Luminal B'

# File metadata records of the cases labelled with this subtype; a boolean
# mask is used instead of interpolating the value into a query string
files = df_tumor_cases_files \
    .loc[
        df_tumor_cases_files['pam50_mrna'] == subtype,
        ['file_id', 'experimental_strategy', 'data_format']
    ] \
    .to_dict(orient='records')

# Folder path to write files (lower-cased subtype, spaces turned into hyphens)
folder_name = subtype.replace(' ', '-').lower()
path = f'{EXTERNAL_DATA_PATH}/{folder_name}-files'

# Download all files of interest related to the specific molecular subtype
gdc_api_files_download(files, path)

# Normal Tissue Analysis Files

In [None]:
# Destination folder for the normal tissue files
path = f'{EXTERNAL_DATA_PATH}/normal-tissue-files'

# One metadata record per normal tissue file, restricted to the fields the
# download function reads
files = (
    df_normal_files
    .loc[:, ['file_id', 'experimental_strategy', 'data_format']]
    .to_dict(orient='records')
)

# Fetch every normal tissue file into the destination folder
gdc_api_files_download(files=files, path=path)