This metadata vignette lays out how to traverse the metadata in a set of data bundles downloaded from the HCA DSS.

The fields of interest that we will extract include:
- IDs for each entity
- Donor
    - species
    - sex
    - development stage
    - diseases
- Tissue, cell line or organoid
    - organ / organ part / model organ
- Protocols
    - library construction method
    - sequencing method
    - cell count

In [38]:
import os
import json
from pprint import pprint
 
# Directory that holds one sub-directory per downloaded bundle.
rootDir = '.'

# One rule per entity type, tried in order; the first rule whose marker is a
# substring of the file name wins.  Each rule is:
#   (file-name marker, output ID key, core section, ID field inside that
#    section, {output key: top-level field of the metadata document})
_ENTITY_RULES = (
    ('donor_organism', 'donor_id', 'biomaterial_core', 'biomaterial_id',
     {'diseases': 'diseases',
      'species': 'genus_species',
      'dev_stage': 'development_stage'}),
    ('specimen', 'specimen_id', 'biomaterial_core', 'biomaterial_id',
     {'organ': 'organ',
      'organ_part': 'organ_part'}),
    ('cell_line', 'cell_line_id', 'biomaterial_core', 'biomaterial_id',
     {'organ': 'model_organ'}),
    ('organoid', 'organoid_id', 'biomaterial_core', 'biomaterial_id',
     {'organoid': 'model_organ'}),
    ('library_preparation', 'protocol_id', 'protocol_core', 'protocol_id',
     {'library_construction_approach': 'library_construction_approach'}),
    ('sequencing', 'protocol_id', 'protocol_core', 'protocol_id',
     {'sequencing_approach': 'sequencing_approach'}),
)


def _is_metadata_file(fname):
    """Return True if *fname* should be parsed as a metadata JSON document.

    Only ``*.json`` files are parsed, so other files sitting in a bundle
    directory no longer crash ``json.load``.  ``links.json`` and anything
    with ``.ipynb`` in its name stay excluded, as before.
    """
    return (
        fname.endswith('.json')
        and fname != 'links.json'
        and '.ipynb' not in fname
    )


def _summarise(fname, data):
    """Build the summary record for one metadata document.

    Returns a dict with the entity ID, every descriptive field of the
    matching rule that is present in *data*, and ``present_in_bundles``
    set to 1.  Returns None when no rule matches *fname*.

    Raises KeyError if the matching rule's core section or ID is missing.
    """
    for marker, id_key, core, id_field, fields in _ENTITY_RULES:
        if marker not in fname:
            continue
        record = {id_key: data[core][id_field]}
        # Descriptive fields are copied only when present, the way
        # development_stage was already handled; a missing one (e.g.
        # organ_part) no longer aborts the whole walk.
        # NOTE(review): presumably some of these are optional in the
        # metadata schema -- confirm which ones should stay mandatory.
        for out_key, src_key in fields.items():
            if src_key in data:
                record[out_key] = data[src_key]
        record['present_in_bundles'] = 1
        return record
    return None


def collect_properties(root_dir):
    """Walk the bundle directories under *root_dir* and summarise entities.

    Files directly inside *root_dir* are ignored; only its sub-directories
    are inspected.  Each recognised document is stored once, keyed by its
    ``provenance.document_id``; seeing the same ID again only increments
    that record's ``present_in_bundles`` counter.  The path of every newly
    recorded document is printed.

    Returns a dict mapping document ID to summary record.
    Raises KeyError if a parsed document has no ``provenance.document_id``.
    """
    properties = {}
    for dirName, _subdirs, fileList in os.walk(root_dir):
        if dirName == root_dir:
            continue
        for fname in fileList:
            if not _is_metadata_file(fname):
                continue
            path = dirName + '/' + fname
            with open(path, encoding='utf-8') as f:
                data = json.load(f)
            uuid = data['provenance']['document_id']
            if uuid in properties:
                properties[uuid]['present_in_bundles'] += 1
                continue
            record = _summarise(fname, data)
            if record is not None:
                properties[uuid] = record
                print(path)
    return properties


all_properties = collect_properties(rootDir)
pprint(all_properties)

./02e0b19e-6dc0-4d6c-bad3-066894607039/sequencing_protocol_0.json
./02e0b19e-6dc0-4d6c-bad3-066894607039/specimen_from_organism_0.json
./02e0b19e-6dc0-4d6c-bad3-066894607039/donor_organism_0.json
./02e0b19e-6dc0-4d6c-bad3-066894607039/library_preparation_protocol_0.json
./09013319-8bb3-45b4-8506-e24cde25bfa3/specimen_from_organism_0.json
./09013319-8bb3-45b4-8506-e24cde25bfa3/donor_organism_0.json
{'61e629ed-0135-4492-ac8a-5c4ab3ccca8a': {'present_in_bundles': 20,
                                          'protocol_id': 'sequencing_protocol_1',
                                          'sequencing_approach': {'ontology': 'EFO:0008896',
                                                                  'ontology_label': 'RNA-Seq',
                                                                  'text': 'RNA-Seq'}},
 '7b07b9d0-cc0e-4098-9f64-f4a569f7d746': {'diseases': [{'ontology': 'PATO:0000461',
                                                        'ontology_label': 'normal',
      