In [1]:
import pandas as pd
import tarfile
import tempfile
import concurrent.futures
import xml.etree.ElementTree as ET

from tqdm import tqdm
from pathlib import Path
from concurrent.futures import as_completed

from api_helpers import get_tcga_projects, get_uuid_from_manifest, download_files

# List all TCGA projects

In [2]:
# Project identifiers to process. parse_xml later splits each one on '-' and uses the
# second segment as an XML namespace prefix, so entries are presumably of the form
# "TCGA-XXX" — TODO confirm against get_tcga_projects.
tcga_projects = get_tcga_projects()

# Generate a survival CSV for each TCGA project from the clinical data files listed in its manifest

In [3]:
def parse_xml(file_path: str, project: str):
    """Extract overall-survival fields for one patient from a clinical XML file.

    Values are taken from the follow-up entry with the highest ``sequence``
    attribute. When the document has no follow-up entries, or the latest entry
    carries no usable data for any of the three fields, the first matching
    elements anywhere in the document are used instead.

    Follow-up schema reference:
    https://github.com/nchbcr/xsd/blob/master/tcga.nci/bcr/xml/clinical/acc/2.7/TCGA_BCR.ACC_Clinical.xsd#L167

    Args:
        file_path: Path of the clinical XML file.
        project: Project identifier; its second '-'-separated segment is
            lower-cased and used as the namespace prefix of the ``follow_ups``
            element (an identifier such as "TCGA-ACC" yields "acc").

    Returns:
        Tuple ``(patient_id, vital_status, days_to_last_follow_up, days_to_death)``.
        Each value is the raw element text; the last three are None when the
        element is absent or empty.
    """
    no_data_statuses = ('Not Applicable', 'Not Available')
    field_tags = (
        'clin_shared:vital_status',
        'clin_shared:days_to_last_followup',
        'clin_shared:days_to_death',
    )

    prefix = project.split('-')[1].lower()

    # Parse the file once: collect the namespace declarations and keep the first
    # started element, which is the document root (fully built once the loop ends).
    namespaces = {}
    root = None
    for event, payload in ET.iterparse(file_path, events=('start-ns', 'start')):
        if event == 'start-ns':
            ns_prefix, ns_uri = payload
            namespaces[ns_prefix] = ns_uri
        elif root is None:
            root = payload

    def read_field(parent, tag):
        """Return ``(text, lacks_data)`` for the first ``tag`` element under ``parent``."""
        element = parent.find(f'.//{tag}', namespaces)
        if element is None:
            # A missing element carries no data; report it as such instead of crashing.
            return None, True
        return element.text, element.get('procurement_status') in no_data_statuses

    patient_id = root.find('.//shared:bcr_patient_barcode', namespaces).text

    # The follow-up entries are the children of the first `<prefix>:follow_ups` element.
    container = root.find(f'.//{prefix}:follow_ups', namespaces)
    follow_ups = (
        []
        if container is None
        else [(int(item.get('sequence')), item) for item in container]
    )

    if follow_ups:
        _, latest_follow_up = max(follow_ups, key=lambda pair: pair[0])
        fields = [read_field(latest_follow_up, tag) for tag in field_tags]

        # Use the follow-up unless every one of its fields is flagged as having no data.
        if not all(lacks_data for _, lacks_data in fields):
            vital_status, days_to_last_follow_up, days_to_death = (text for text, _ in fields)
            return patient_id, vital_status, days_to_last_follow_up, days_to_death

    # Fallback: first occurrence of each field anywhere in the document.
    vital_status, days_to_last_follow_up, days_to_death = (
        read_field(root, tag)[0] for tag in field_tags
    )
    return patient_id, vital_status, days_to_last_follow_up, days_to_death

def retrieve_clinical_data(project: str):
    """Download a project's clinical XML files and write its survival table to CSV.

    File UUIDs are read from ``manifest/clinical_data/{project}.tsv``. The
    downloaded archive is unpacked into a temporary directory, every ``*.xml``
    file found below it is parsed with ``parse_xml``, and one row per file is
    written to ``data/raw/{project}-survival.csv`` with the columns ``samples``,
    ``time`` and ``event``.

    Args:
        project: Project identifier used in the manifest and output file names.

    Returns:
        The message ``"Done processing {project}"``.

    Raises:
        ValueError: If a patient is reported as 'Alive' but has a days_to_death value.
    """
    manifest_path = f"manifest/clinical_data/{project}.tsv"
    file_uuids = get_uuid_from_manifest(manifest_path)

    with tempfile.TemporaryDirectory() as temp_dir:

        archive = download_files(file_uuids)
        # NOTE(review): extractall trusts the member paths of the downloaded archive;
        # consider a tarfile extraction filter if the source is not fully trusted.
        with tarfile.open(fileobj=archive) as tar:
            tar.extractall(temp_dir)

        # One (patient_id, time, event) tuple per parsed XML file.
        rows = []
        for xml_file in Path(temp_dir).glob('**/*.xml'):
            patient_id, vital_status, days_to_last_follow_up, days_to_death = parse_xml(xml_file, project)

            # sanity check
            if vital_status == 'Alive' and days_to_death is not None:
                raise ValueError(f'Alive but days_to_death is not None. {patient_id}')

            # A death is the event; everything else is treated as censored at last follow-up.
            if vital_status == 'Dead':
                rows.append((patient_id, days_to_death, 1))
            else:
                rows.append((patient_id, days_to_last_follow_up, 0))

        df = pd.DataFrame({
            'samples': [row[0] for row in rows],
            'time': [row[1] for row in rows],
            'event': [row[2] for row in rows],
        })

        df.to_csv(f'data/raw/{project}-survival.csv', index=False)

    return f"Done processing {project}"


# Process every project concurrently (one task per project) and show overall progress.
with concurrent.futures.ThreadPoolExecutor() as executor:
    futures = set(
        executor.submit(retrieve_clinical_data, project)
        for project in tcga_projects
    )

    for future in tqdm(as_completed(futures), total=len(futures)):
        # result() re-raises any exception that occurred in the worker thread,
        # so a failed project stops the run instead of passing silently.
        future.result()


100%|██████████| 33/33 [01:42<00:00,  3.12s/it]


In [4]:
# Disabled alternative, kept for reference only: it queries the GDC API directly. We prefer downloading the data via the manifest files (see above).

# def get_clinical_data(project: str):
#     """ Retrieve Overall survival data for a given project"""

#     file_uuids = get_uuid_from_manifest(f"manifest/{project}.tsv")
#     file_uuid_to_sample_id = map_file_uuid_to_sample_id(file_uuids)
#     # sample ids without sample type indicator
#     sample_ids = [sample_id.rsplit('-', 1)[0] for sample_id in file_uuid_to_sample_id.values()]

#     endpoint = 'https://api.gdc.cancer.gov/cases'

#     params = {
#         'filters': json.dumps({
#             'op': 'and',
#             'content':[
#                 {
#                 'op': 'in',
#                 "content": {
#                     # 'project.project_id' - to query by project
#                     "field": "submitter_id", 
#                     "value": sample_ids
#                     }
#                 }
#             ]
#         }),
#         'fields': 'submitter_id,demographic.vital_status,demographic.days_to_death,diagnoses.days_to_last_follow_up',
#         'format': 'JSON',
#         'size': '10000'
#     }

#     response = requests.post(endpoint, json=params)

#     if response.status_code == 200:
#         data = response.json()['data']['hits']

#         patient_ids = []
#         times = []
#         events = []

#         for patient in data:
#             patient_ids.append(patient['submitter_id'])

#             demographic = patient.get('demographic', {})
#             diagnoses = patient.get('diagnoses', {})

#             # sanity checks
#             if len(diagnoses) == 1:
#                 diagnoses = diagnoses[0]
#             elif len(diagnoses) > 1:
#                 raise Exception(f'More than one diagnosis {diagnoses}')
            
#             vital_status = demographic.get('vital_status', None) # vital status (Alive/Dead)
#             days_to_death = demographic.get('days_to_death', None)
#             days_to_last_follow_up = diagnoses.get('days_to_last_follow_up', None)

#             # sanity checks
#             # If status is Dead and both days_to_death and days_to_last_follow_up are recorded, days to death should be greater than days to last follow up
#             if all(v is not None for v in [days_to_death, days_to_last_follow_up]) and vital_status == 'Dead' and days_to_death < days_to_last_follow_up:
#                 raise ValueError(f"days_to_death < days_to_last_follow_up with status Dead. {patient}")

#             if vital_status == 'Alive' and days_to_death is not None:
#                 raise ValueError(f'Alive but days_to_death is not None. {patient}')
            
#             event = 1 if vital_status == 'Dead' else 0

#             if event and all(v is not None for v in [days_to_death, days_to_last_follow_up]):
#                 # if event and both days_to_death and days_to_last_follow_up are recorded, use the max of the two
#                 time = max(days_to_death, days_to_last_follow_up)
#             elif event:
#                 # sometimes days_to_death is not recorded, but days_to_last_follow_up is
#                 time = days_to_death or days_to_last_follow_up
#             else:
#                 # if no event, use days_to_last_follow_up
#                 time = days_to_last_follow_up

#             times.append(time)
#             events.append(event)

#             df = pd.DataFrame({
#                 'samples': patient_ids,
#                 'time': times,
#                 'event': events,
#             })


#             df.to_csv(f'raw_data/{project}-survival.csv', index=False)

#     else:
#         raise ValueError(f'Request failed with status code {response.status_code}')