# **Downloading Files of Interest from TCGA-BRCA**
TCGA: The Cancer Genome Atlas  
BRCA: Breast Invasive Carcinoma

# Importing Libraries and Configurations

In [1]:
import logging
import os
import sys
from time import sleep

import numpy as np
import pandas as pd
import requests

# Configure basic logging
logging.basicConfig(level=logging.INFO, format='%(levelname)s:%(message)s')

# Add project root to Python's path
sys.path.append(os.path.abspath(os.path.join('..', '..')))

from config import (
    BRCA_RAW_FILES_DIRS,
    BRCA_PROCESSED_FILES_PATHS,
    GDC_API_ENDPOINTS,
)

# Functions

In [None]:
def _discard_partial_file(path):
    """Best-effort removal of a leftover partial download at `path`."""
    try:
        os.remove(path)
    except FileNotFoundError:
        # Nothing to clean up: no partial file was created, or it was
        # already moved to its final location
        pass
    except OSError as os_err:
        logging.warning(f'Could not remove partial file {path}: {os_err}')


def gdc_api_files_download(files, dir_path, retries=5, delay=5):
    """
    Download files from the Genomic Data Commons (GDC) API and save them locally.

    Each file is requested from `<GDC_API_ENDPOINTS['data']>/<file_id>` and saved
    in `dir_path` as `<experimental_strategy>_<file_id>.<data_format>` (strategy
    and format lower-cased). The response is streamed to a temporary `.part`
    file that is moved to the final name only after the download completes, so
    an interrupted download never leaves a truncated file under the final name.

    Failures are logged, not raised: request errors and unexpected errors are
    retried up to `retries` times, while a local file-system error aborts the
    current file immediately and moves on to the next one.

    Parameters:
    -----------
    files : list of dict
        A list of dictionaries where each dictionary contains metadata for a file. 
        Expected keys in each dictionary:
        - 'file_id' (str): unique identifier for the file.
        - 'experimental_strategy' (str): description of the experiment type.
        - 'data_format' (str): file format (e.g., 'TXT', 'CSV').
    dir_path : str
        The local directory where downloaded files will be saved.
    retries : int
        Number of times to retry a download on failure (default is 5).
    delay : int
        Seconds to wait between download retries (default is 5).
    """
    # Size of each streamed chunk written to disk (1 MiB)
    chunk_size = 1024 * 1024

    # Download each file contained in the list
    for file in files:
        # Define all file metadata
        file_id = file['file_id']
        file_type = file['experimental_strategy'].lower()
        file_format = file['data_format'].lower()
        file_name = f'{file_type}_{file_id}.{file_format}'
        file_path = os.path.join(dir_path, file_name)

        # Temporary target living next to the final file, so the final
        # rename stays on the same file system
        partial_path = f'{file_path}.part'

        # Retry logic
        for attempt in range(retries):
            try:
                # Request the file download
                logging.info(f'Downloading file {file_id} [attempt {attempt + 1}]...')

                # URLs always use '/', so join explicitly instead of using
                # os.path.join, which applies the OS path separator
                url = f"{GDC_API_ENDPOINTS['data'].rstrip('/')}/{file_id}"

                # Stream the body so large files are never fully held in memory
                with requests.get(
                    url=url,
                    headers={'Content-Type': 'application/json'},
                    timeout=30,
                    stream=True
                ) as response:
                    response.raise_for_status()

                    # Write the file in the cohort raw data directory
                    with open(partial_path, 'wb') as output_file:
                        for chunk in response.iter_content(chunk_size=chunk_size):
                            output_file.write(chunk)

                # Publish the complete file under its final name
                os.replace(partial_path, file_path)

                # Exit retry loop on success
                logging.info(f'Downloaded and saved to {file_path}')
                break

            # NOTE: RequestException derives from IOError/OSError, so it must be
            # handled before the OSError clause to keep request errors retryable
            except requests.exceptions.RequestException as req_err:
                logging.error(f'Request failed: {req_err}')
            except OSError as os_err:
                # Local file-system problems will not be fixed by retrying
                logging.error(f'File write failed: {os_err}')
                break
            except Exception as e:
                logging.error(f'Unexpected error: {e}')
            finally:
                # After a success the partial file is already gone; after a
                # failure this removes whatever was written so far
                _discard_partial_file(partial_path)

            if attempt < retries - 1:
                logging.info(f'Retrying in {delay} seconds...')
                sleep(delay)
            else:
                logging.warning(f'Failed to download the file after {retries} attempts')

In [None]:
def download_cohort_files(df_files, cohort):
    """
    Download all files associated with a specific cohort.

    The destination directory is looked up in `BRCA_RAW_FILES_DIRS` under the
    cohort name lower-cased with spaces replaced by hyphens (e.g. 'Luminal A'
    -> 'luminal-a'); a missing entry raises `KeyError`.

    Parameters:
    -----------
    df_files : DataFrame
        DataFrame containing file metadata, including 'cohort'. Each selected
        row is passed on as a dict to `gdc_api_files_download`.
    cohort : str
        Name of the cohort to filter files by.
    """
    # Select all files related to the cohort. The value is compared directly
    # instead of being interpolated into a query string, so cohort names that
    # contain quotes cannot break or alter the filter expression
    files = df_files[df_files['cohort'] == cohort].to_dict(orient='records')

    # Raw directory path to write the files
    dir_base_name = cohort.lower().replace(' ', '-')
    path = BRCA_RAW_FILES_DIRS[dir_base_name]

    # Download all files of interest related to the cohort
    gdc_api_files_download(files, path)

# Data Loading and Preparation

In [2]:
# Cases flagged as being of interest
df_cases = (
    pd.read_csv(BRCA_PROCESSED_FILES_PATHS['cases'])
    .query('is_case_of_interest == 1')
)

# Files flagged as being of interest
df_files = (
    pd.read_csv(BRCA_PROCESSED_FILES_PATHS['files'])
    .query('is_file_of_interest == 1')
)

# Pair every file of interest with its case of interest; the inner join
# keeps only files whose case is also of interest
df_files_cohort = df_files.merge(df_cases, on='case_id', how='inner')

# Label each file with its cohort (tumor files take the value of the
# 'pam50_mrna' column, all remaining files are labeled 'Paired Normal'),
# then keep only the columns needed to download the files
df_files_cohort = (
    df_files_cohort
    .assign(
        cohort=lambda merged: np.where(
            merged['is_tumor_file_of_interest'] == 1,
            merged['pam50_mrna'],
            'Paired Normal',
        )
    )
    .loc[:, ['file_id', 'experimental_strategy', 'data_format', 'cohort']]
)

In [3]:
# Display the DataFrame that associates each file with its cohort
# (shown as the cell output because it is the cell's last expression)
df_files_cohort

Unnamed: 0,file_id,experimental_strategy,data_format,cohort
0,dd0e32d6-4704-4abd-82e4-2041fbc19a79,miRNA-Seq,TXT,Luminal B
1,8644d07c-979f-4ec6-928a-afe9229fb9f1,RNA-Seq,TSV,Luminal B
2,09a491b5-1bfb-47d9-963f-69f6c7e6c1e5,RNA-Seq,TSV,HER2-enriched
3,02fc7dd7-7e17-4930-8424-52173c33b29b,miRNA-Seq,TXT,HER2-enriched
4,f8e5691a-e24d-4dba-873e-9ffa3ca12bb6,RNA-Seq,TSV,Luminal B
...,...,...,...,...
1079,863c79a7-57d7-4a02-89ab-4abe5db7e4ff,miRNA-Seq,TXT,Luminal A
1080,24d0d1a9-ce92-4db6-8224-e3dd67733712,miRNA-Seq,TXT,Luminal A
1081,15d7e135-acb7-46ef-a8b1-f424e40a9e4d,RNA-Seq,TSV,Luminal A
1082,8ecef704-ed0f-4700-a34e-46a174033332,miRNA-Seq,TXT,HER2-enriched


# Basal-like Files

In [6]:
# Download the files related to Basal-like tumor tissue
# (one request per file; progress and failures are reported through logging)
download_cohort_files(df_files_cohort, 'Basal-like')

INFO:Downloading file a473ce1d-15e1-4ca2-bcba-fa36367a7f0d [attempt 1]...
INFO:Downloaded and saved to /home/my-roberta/Unicamp/pan-cancer-analysis/data/raw/tcga-brca/basal-like-files/mirna-seq_a473ce1d-15e1-4ca2-bcba-fa36367a7f0d.txt
INFO:Downloading file ed797859-4d7e-40e6-bae3-54ef15ad0f0a [attempt 1]...
INFO:Downloaded and saved to /home/my-roberta/Unicamp/pan-cancer-analysis/data/raw/tcga-brca/basal-like-files/rna-seq_ed797859-4d7e-40e6-bae3-54ef15ad0f0a.tsv
INFO:Downloading file 7fa9ad2e-67a7-449f-be73-9e18dc46b529 [attempt 1]...
INFO:Downloaded and saved to /home/my-roberta/Unicamp/pan-cancer-analysis/data/raw/tcga-brca/basal-like-files/rna-seq_7fa9ad2e-67a7-449f-be73-9e18dc46b529.tsv
INFO:Downloading file 2355503e-8013-4092-9733-eabd4ca15ce6 [attempt 1]...
INFO:Downloaded and saved to /home/my-roberta/Unicamp/pan-cancer-analysis/data/raw/tcga-brca/basal-like-files/mirna-seq_2355503e-8013-4092-9733-eabd4ca15ce6.txt
INFO:Downloading file 809f13dc-caad-4bde-9b5e-606e5d229d42 [atte

# HER2-enriched Files

In [7]:
# Download the files related to HER2-enriched tumor tissue
# (one request per file; progress and failures are reported through logging)
download_cohort_files(df_files_cohort, 'HER2-enriched')

INFO:Downloading file 09a491b5-1bfb-47d9-963f-69f6c7e6c1e5 [attempt 1]...
INFO:Downloaded and saved to /home/my-roberta/Unicamp/pan-cancer-analysis/data/raw/tcga-brca/her2-enriched-files/rna-seq_09a491b5-1bfb-47d9-963f-69f6c7e6c1e5.tsv
INFO:Downloading file 02fc7dd7-7e17-4930-8424-52173c33b29b [attempt 1]...
INFO:Downloaded and saved to /home/my-roberta/Unicamp/pan-cancer-analysis/data/raw/tcga-brca/her2-enriched-files/mirna-seq_02fc7dd7-7e17-4930-8424-52173c33b29b.txt
INFO:Downloading file cee943a8-ea26-4b2f-b8a4-66300ccacfb5 [attempt 1]...
INFO:Downloaded and saved to /home/my-roberta/Unicamp/pan-cancer-analysis/data/raw/tcga-brca/her2-enriched-files/rna-seq_cee943a8-ea26-4b2f-b8a4-66300ccacfb5.tsv
INFO:Downloading file f3a7612e-35c3-43be-8f96-795874f8225c [attempt 1]...
INFO:Downloaded and saved to /home/my-roberta/Unicamp/pan-cancer-analysis/data/raw/tcga-brca/her2-enriched-files/mirna-seq_f3a7612e-35c3-43be-8f96-795874f8225c.txt
INFO:Downloading file 12f8a326-1417-478e-a45e-abdff8

# Luminal A Files

In [6]:
# Download the files related to Luminal A tumor tissue
# (one request per file; progress and failures are reported through logging)
download_cohort_files(df_files_cohort, 'Luminal A')

INFO:Downloading file 072fae9f-12d7-4a6f-bc6d-ffecf0262c16 [attempt 1]...
INFO:Downloaded and saved to /home/my-roberta/Unicamp/pan-cancer-analysis/data/raw/tcga-brca/luminal-a-files/mirna-seq_072fae9f-12d7-4a6f-bc6d-ffecf0262c16.txt
INFO:Downloading file debafec8-b9f4-45f3-aa4c-8e48e58e6a49 [attempt 1]...
INFO:Downloaded and saved to /home/my-roberta/Unicamp/pan-cancer-analysis/data/raw/tcga-brca/luminal-a-files/rna-seq_debafec8-b9f4-45f3-aa4c-8e48e58e6a49.tsv
INFO:Downloading file 5dee4a6f-9cf4-4360-a37b-e7b94aa70dac [attempt 1]...
INFO:Downloaded and saved to /home/my-roberta/Unicamp/pan-cancer-analysis/data/raw/tcga-brca/luminal-a-files/rna-seq_5dee4a6f-9cf4-4360-a37b-e7b94aa70dac.tsv
INFO:Downloading file dc459048-1b84-4b42-945f-cefda93b3bf8 [attempt 1]...
INFO:Downloaded and saved to /home/my-roberta/Unicamp/pan-cancer-analysis/data/raw/tcga-brca/luminal-a-files/mirna-seq_dc459048-1b84-4b42-945f-cefda93b3bf8.txt
INFO:Downloading file 4af55932-ab7a-41e5-aa7f-fb16ab18c629 [attempt 

# Luminal B Files

In [7]:
# Download the files related to Luminal B tumor tissue
# (one request per file; progress and failures are reported through logging)
download_cohort_files(df_files_cohort, 'Luminal B')

INFO:Downloading file dd0e32d6-4704-4abd-82e4-2041fbc19a79 [attempt 1]...
INFO:Downloaded and saved to /home/my-roberta/Unicamp/pan-cancer-analysis/data/raw/tcga-brca/luminal-b-files/mirna-seq_dd0e32d6-4704-4abd-82e4-2041fbc19a79.txt
INFO:Downloading file 8644d07c-979f-4ec6-928a-afe9229fb9f1 [attempt 1]...
INFO:Downloaded and saved to /home/my-roberta/Unicamp/pan-cancer-analysis/data/raw/tcga-brca/luminal-b-files/rna-seq_8644d07c-979f-4ec6-928a-afe9229fb9f1.tsv
INFO:Downloading file f8e5691a-e24d-4dba-873e-9ffa3ca12bb6 [attempt 1]...
INFO:Downloaded and saved to /home/my-roberta/Unicamp/pan-cancer-analysis/data/raw/tcga-brca/luminal-b-files/rna-seq_f8e5691a-e24d-4dba-873e-9ffa3ca12bb6.tsv
INFO:Downloading file 65654739-bed4-4ea7-b75c-ca1f278adcc9 [attempt 1]...
INFO:Downloaded and saved to /home/my-roberta/Unicamp/pan-cancer-analysis/data/raw/tcga-brca/luminal-b-files/mirna-seq_65654739-bed4-4ea7-b75c-ca1f278adcc9.txt
INFO:Downloading file cf5ab327-47ee-40eb-a06c-3a8fef2a1ebb [attempt 

# Paired Normal Files

In [8]:
# Download the files related to Paired Normal tissue
# (one request per file; progress and failures are reported through logging)
download_cohort_files(df_files_cohort, 'Paired Normal')

INFO:Downloading file 3e75bf39-6c1e-460b-956e-3e999daa919c [attempt 1]...
INFO:Downloaded and saved to /home/my-roberta/Unicamp/pan-cancer-analysis/data/raw/tcga-brca/paired-normal-files/mirna-seq_3e75bf39-6c1e-460b-956e-3e999daa919c.txt
INFO:Downloading file ddb8fb65-cf53-41a0-8acd-a538a8754fa5 [attempt 1]...
INFO:Downloaded and saved to /home/my-roberta/Unicamp/pan-cancer-analysis/data/raw/tcga-brca/paired-normal-files/rna-seq_ddb8fb65-cf53-41a0-8acd-a538a8754fa5.tsv
INFO:Downloading file 7c3c4540-a63e-4737-a23d-a9c999c53f39 [attempt 1]...
INFO:Downloaded and saved to /home/my-roberta/Unicamp/pan-cancer-analysis/data/raw/tcga-brca/paired-normal-files/mirna-seq_7c3c4540-a63e-4737-a23d-a9c999c53f39.txt
INFO:Downloading file bd239a8e-56e2-45ca-bc44-bff98b72c1d6 [attempt 1]...
INFO:Downloaded and saved to /home/my-roberta/Unicamp/pan-cancer-analysis/data/raw/tcga-brca/paired-normal-files/rna-seq_bd239a8e-56e2-45ca-bc44-bff98b72c1d6.tsv
INFO:Downloading file 600fc778-f0e3-4362-ad0c-cdb64f