### Kids First: Pediatric Brain Tumor Atlas: PNOC


You will need the following Python library:
### fasp-scripts

Clone the **fasp-client** branch of fasp-scripts

```git clone -b fasp-client --single-branch https://github.com/ga4gh/fasp-scripts.git```

Change directory to your local copy of fasp-scripts

Install into your favorite python environment

```pip install .```


In [1]:
import os
import sys
import json
import requests
import pandas as pd 
import numpy as np
#import seaborn as sns
from pathlib import Path
from datetime import datetime
import pprint
#from lifelines import KaplanMeierFitter

# Rebind the name `pprint` from the module to a ready-to-call pretty-printer,
# so pprint(obj) can be used directly on any JSON-like structure.
pprint = pprint.PrettyPrinter(indent=1).pprint

# Base URL of the Kids First FHIR server; every query in this notebook is appended to it.
FHIR_SERVER = 'https://kf-api-fhir-service.kidsfirstdrc.org'

# TLS certificate verification for all requests made through the shared session.
# Keep this True: the session carries an authentication cookie, and disabling
# verification exposes it to interception. Only set it to False if you actually
# see certificate errors (e.g. behind a corporate proxy with self-signed certificates).
VERIFY_SSL = True

if not VERIFY_SSL:
    # Silence the per-request InsecureRequestWarning that urllib3 emits when
    # verification has been turned off deliberately.
    requests.packages.urllib3.disable_warnings()



# Kids First uses cookie-based authentication. The cookie value is read from the
# 'Cookie' entry of a JSON key file in the user's home directory.
#kf_cookie = requests.get("https://raw.githubusercontent.com/mitre/fhir-exercises/main/kf_cookie.txt", verify=VERIFY_SSL).text.rstrip()

full_cookie_path = os.path.expanduser('~/.keys/ncpi_prod_fhir_cookie.json')
with open(full_cookie_path) as cookie_file:
    cookies = json.load(cookie_file)
kf_cookie = cookies['Cookie']


# A single requests.Session keeps the Accept header, the TLS-verification setting
# and the auth cookie identical for every request made below.
s = requests.Session()
s.headers['Accept'] = 'application/fhir+json'
s.verify = VERIFY_SSL
s.cookies.set('AWSELBAuthSessionCookie-0', kf_cookie)


# Smoke-test the cookie against the server metadata endpoint: an HTML page in
# the response is treated as an authentication failure.
r = s.get(f"{FHIR_SERVER}/metadata")
if "<!DOCTYPE html>" in r.text:
    print('ERROR: Could not authenticate with Kids First. The cookie may need to be updated',
          file=sys.stderr, end='')
    

# This helper method allows us to easily switch between printing an entire Bundle, or just the first 20 lines.
# Set truncate_for_github = False for actual use,
# or just replace the function with a `return bundle`

def print_bundle(bundle, truncate_for_github = False):
    """Return `bundle` for display, or print only its first 20 JSON lines.

    When `truncate_for_github` is false, or the indented JSON rendering is at
    most 20 lines long, the bundle itself is returned unchanged. Otherwise the
    first 20 lines are printed, followed by a truncation notice, and the
    function returns None.
    """
    if truncate_for_github:
        rendered = json.dumps(bundle, indent=2).split('\n')
        if len(rendered) > 20:
            print('\n'.join(rendered[:20]))
            print('...\nBundle truncated. Change the "print_bundle" function above to print the full content.')
            return None
    return bundle

In [2]:
# Resolves all pages for the bundle. Returns an array with all Bundles, including the original Bundle.
def resolve_pages(bundle):
    """Follow 'next' links from `bundle` and return every page as a list.

    The returned list starts with the original bundle, followed by each
    subsequent page fetched through the shared session `s`. Paging is done
    iteratively, so the number of pages is not limited by Python's recursion
    depth, and a bundle without a 'link' element is treated as a single page.
    """
    bundles = [bundle]
    current = bundle
    while True:
        # First link whose relation is 'next', if any.
        next_url = next(
            (link['url'] for link in current.get('link', []) if link.get('relation') == 'next'),
            None,
        )
        if not next_url:
            break
        current = s.get(next_url).json()
        bundles.append(current)
    return bundles

# NOTE: No cell output.

def runQuery(query):
    """Run a FHIR search and return the resources from all result pages.

    `query` is the part of the URL after the server base, e.g.
    "Patient?gender=male". Every page is fetched via resolve_pages, the
    'resource' of each entry is collected into one flat list, the total is
    printed, and the list is returned. A search that matches nothing returns
    an empty list.
    """
    r = s.get(f"{FHIR_SERVER}/{query}")
    first_bundle = r.json()
    all_bundles = resolve_pages(first_bundle)

    # A Bundle with no matches may omit 'entry' entirely, so default to an
    # empty list instead of raising KeyError.
    resources = [entry['resource'] for bundle in all_bundles for entry in bundle.get('entry', [])]
    print(f"Total  Resources: {len(resources)}")
    return resources

In [48]:
# Look up the ResearchStudy by its id on the FHIR server and show its title.
study_id = 48656
studies = runQuery(f"ResearchStudy?_id={study_id}")
print(study_id)
print(studies[0]['title'])

# The Kids First study id is the value of the identifier whose system is the
# KF dataservice studies endpoint.
kf_study_id = [
    ident['value']
    for ident in studies[0]['identifier']
    if ident['system'] == 'https://kf-api-dataservice.kidsfirstdrc.org/studies/'
][0]
print(kf_study_id)
studies

Total  Resources: 1
48656
Pediatric Brain Tumor Atlas: PNOC
SD_8Y99QZJJ


[{'resourceType': 'ResearchStudy',
  'id': '48656',
  'meta': {'versionId': '2',
   'lastUpdated': '2022-01-18T22:57:56.745+00:00',
   'source': '#83Q7YI9F69RaW6Gn',
   'profile': ['http://hl7.org/fhir/StructureDefinition/ResearchStudy']},
  'identifier': [{'system': 'https://kf-api-dataservice.kidsfirstdrc.org/studies/',
    'value': 'SD_8Y99QZJJ'},
   {'system': 'urn:kids-first:unique-string',
    'value': 'ResearchStudy-SD_8Y99QZJJ'}],
  'title': 'Pediatric Brain Tumor Atlas: PNOC',
  'status': 'completed',
  'category': [{'coding': [{'system': 'http://snomed.info/sct',
      'code': '86049000',
      'display': 'Malignant neoplasm, primary (morphologic abnormality)'}],
    'text': 'CANCER'}],
  'keyword': [{'coding': [{'code': 'Pediatric Brain Tumor Atlas'}]},
   {'coding': [{'code': 'PBTA-PNOC'}]}]}]

In [4]:
# Fetch every Patient that is referenced as the individual of a ResearchSubject
# belonging to the study (reverse chaining via the _has search parameter).
patients = runQuery(f"Patient?_has:ResearchSubject:individual:study={study_id}")

Total  Resources: 84


In [5]:
import json
from collections import Counter
import pandas as pd

# Tally the resource types returned by the patient query, and how often each
# extension URL occurs across all patients.
rTypes = Counter()
# Created once, outside the loop, so the extension tallies accumulate over all
# patients instead of being reset (and lost) on every iteration.
extCounter = Counter()

for p in patients:
    rTypes[p['resourceType']] += 1
    for e in p.get('extension', []):
        extCounter[e['url']] += 1
#print(extCounter)
print (json.dumps(rTypes, indent=3))

{
   "Patient": 84
}


In [6]:
# Pick one patient (position 53 in the query result) to inspect in detail.
# patient_id is reused by the DocumentReference query further down.
n=53
patient_id = patients[n]['id']
patients[n]


{'resourceType': 'Patient',
 'id': '48592',
 'meta': {'versionId': '2',
  'lastUpdated': '2021-11-16T08:30:46.741+00:00',
  'source': '#8OzwRRXz63pnmm2D',
  'profile': ['http://hl7.org/fhir/StructureDefinition/Patient'],
  'tag': [{'code': 'SD_8Y99QZJJ'}]},
 'extension': [{'url': 'http://hl7.org/fhir/us/core/StructureDefinition/us-core-race',
   'extension': [{'url': 'text', 'valueString': 'White'},
    {'url': 'ombCategory',
     'valueCoding': {'system': 'urn:oid:2.16.840.1.113883.6.238',
      'code': '2106-3',
      'display': 'White'}}]},
  {'url': 'http://hl7.org/fhir/us/core/StructureDefinition/us-core-ethnicity',
   'extension': [{'url': 'text', 'valueString': 'Not Hispanic or Latino'},
    {'url': 'ombCategory',
     'valueCoding': {'system': 'urn:oid:2.16.840.1.113883.6.238',
      'code': '2186-5',
      'display': 'Not Hispanic or Latino'}}]}],
 'identifier': [{'value': 'C3078075'},
  {'system': 'https://kf-api-dataservice.kidsfirstdrc.org/participants/',
   'value': 'PT_YR

We can see the data in a more compact form via a DataFrame. This shows that the same set of three attributes is available for all patients.

In [25]:
# Build one row of attributes per patient, keyed by its "Patient/<id>" reference.
# patient_dict is also used later to attach patient attributes to file records.
patient_dict = {}
patient_list = []
patient_ids = []
# Top-level Patient fields to copy when present (only the keys are used).
std_attributes = {'gender':'','birthDate':'','maritalStatus':'','multipleBirthBoolean':''}
for p in patients:
    # rTypes is deliberately not touched here: it was already filled by the
    # resource-type tally, and incrementing it again would double its counts
    # and make this cell change results each time it is re-run.
    # NOTE(review): extCounter is never incremented in this cell, so patient_list
    # ends up holding empty Counters; it is kept only because the commented-out
    # DataFrame line below refers to patient_list.
    extCounter = Counter()
    pRef = f"Patient/{p['id']}"
    for e in p.get('extension', []):
        # Column name is the last path segment of the extension URL, e.g. 'us-core-race'.
        ext = e['url'].split('/')[-1]
        # Use the first nested entry that carries a valueString rather than
        # assuming it is always at position 0.
        extval = next(
            (sub['valueString'] for sub in e.get('extension', []) if 'valueString' in sub),
            None,
        )
        if extval is None:
            continue
        patient_dict.setdefault(pRef, {})[ext] = extval
    for att in std_attributes:
        if att in p:
            patient_dict.setdefault(pRef, {})[att] = p[att]

    patient_list.append(extCounter)
    patient_ids.append(pRef)


pd.set_option("display.max_rows", 30, "display.max_columns", None)
#pdf = pd.DataFrame(patient_list)
pdf = pd.DataFrame.from_dict(patient_dict, orient="index")
pdf

Unnamed: 0,us-core-race,us-core-ethnicity,gender
Patient/687105,Reported Unknown,Not Hispanic or Latino,male
Patient/687104,White,Not Hispanic or Latino,male
Patient/687103,White,Reported Unknown,female
Patient/48650,Not Reported,Not Reported,female
Patient/48647,White,Not Hispanic or Latino,female
...,...,...,...
Patient/48613,Not Reported,Not Reported,unknown
Patient/48583,White,Not Hispanic or Latino,male
Patient/48608,Not Reported,Not Reported,female
Patient/48587,Asian,Not Hispanic or Latino,male


The attributes above are informative but are unlikely to be part of a meaningful query when building a cohort for this study.

What else might we look at? Let's try Observations.

In [32]:
import pandas as pd

# One query returns the study's Patients together with every Observation that
# references them as subject (_revinclude).
resources = runQuery(f"Patient?_has:ResearchSubject:individual:study={study_id}&_revinclude=Observation:subject")

# Accumulators filled while scanning the mixed result list.
patient_observations_dict = {}   # subject reference -> {observation display name: value text}
observations = []                # the raw Observation resources
obsCounter  = Counter()          # observations per subject
codeCounter = Counter()          # occurrences of each observation display name
vccCounter = Counter()           # occurrences of each valueCodeableConcept text
printObsCounts = False
for r in resources:

    # Only the Observation resources are of interest here.
    if r['resourceType'] != 'Observation':
        continue

    subject_id = r['subject']['reference']
    obsCounter[subject_id] +=1
    obs_display_name = r['code']['coding'][0]['display']
    vcc_text = r['valueCodeableConcept']['text']
    codeCounter[obs_display_name] +=1
    vccCounter[vcc_text] +=1
    observations.append(r)

    # A later observation with the same display name overwrites the earlier value.
    patient_observations_dict.setdefault(subject_id, {})[obs_display_name] = vcc_text


#Summarize
print(f"Number of patients with observations {len(obsCounter)}")

if printObsCounts:
    print("Observation count per patient")
    print(json.dumps(obsCounter, indent=3))
print("Coding counts")
#print(json.dumps(codeCounter, indent=3))
pd.set_option("display.max_rows", 30, "display.max_columns", None)
df = pd.DataFrame.from_dict(codeCounter,  orient='index')
display(df)
vccdf= pd.DataFrame.from_dict(vccCounter,  orient='index')
display(vccdf)

Total  Resources: 162
Number of patients with observations 78
Coding counts


Unnamed: 0,0
Clinical status,78


Unnamed: 0,0
Deceased,37
Alive,41


In [9]:
# Show one of the collected Observation resources as an example of their structure.
observations[1]

{'resourceType': 'Observation',
 'id': '48835',
 'meta': {'versionId': '2',
  'lastUpdated': '2021-11-16T08:30:55.161+00:00',
  'source': '#oDhSRqN6No2PCaln',
  'profile': ['https://nih-ncpi.github.io/ncpi-fhir-ig/StructureDefinition/vital-status'],
  'tag': [{'code': 'SD_8Y99QZJJ'}]},
 'identifier': [{'system': 'https://kf-api-dataservice.kidsfirstdrc.org/outcomes/',
   'value': 'OC_17WZQWAC'},
  {'system': 'urn:kids-first:unique-string',
   'value': 'Observation-SD_8Y99QZJJ-OC_17WZQWAC'}],
 'status': 'final',
 'code': {'coding': [{'system': 'http://snomed.info/sct',
    'code': '263493007',
    'display': 'Clinical status'}],
  'text': 'Clinical status'},
 'subject': {'reference': 'Patient/48621'},
 'valueCodeableConcept': {'coding': [{'system': 'http://snomed.info/sct',
    'code': '438949009',
    'display': 'Alive'}],
  'text': 'Alive'}}

In [11]:
# List the DocumentReference resources whose subject is the patient picked earlier.
print(f"Patient/{patient_id}")
documents = runQuery(f"DocumentReference?subject=Patient/{patient_id}")

Patient/48592
Total  Resources: 61


In [12]:
def download(url, file_path):
    '''Download a file from a URL to a local file path.

    `file_path` may start with "~", which is expanded to the user's home
    directory. The response is streamed to disk in chunks so large files are
    not held in memory. Raises requests.HTTPError if the server answers with
    an error status; in that case no local file is created.
    '''
    with requests.get(url, stream=True) as response:
        # Check the status before opening the target, so a failed request does
        # not leave behind an empty file or an error page saved as data.
        response.raise_for_status()
        with open(os.path.expanduser(file_path), "wb") as file:
            for chunk in response.iter_content(chunk_size=1024 * 1024):
                file.write(chunk)

In [50]:
# Set up the DRS client used below to look up file names and access URLs.
# kfDRSClient is imported from fasp.loc (presumably provided by the fasp-scripts
# install described at the top of this notebook) and is given the path to a
# local JSON credentials file.
from fasp.loc import kfDRSClient

cl = kfDRSClient("~/.keys/kf_credentials.json")

In [14]:
# Gene Expression DocumentReferences with security-label U that are tagged with
# this study. The tag is taken from kf_study_id (derived from the ResearchStudy
# lookup earlier) rather than a hard-coded study code, so changing study_id
# keeps this query in step with the rest of the notebook.
query = f"DocumentReference?type:text=Gene%20Expression&security-label=U&_tag={kf_study_id}"
exp_docs = runQuery(query)

Total  Resources: 64


In [15]:
def download_study_files(documents, folder):
    """Resolve each DocumentReference to its DRS object and report what was found.

    For every document in `documents` this prints the document type text, the
    attachment URL and the file name reported by the DRS client `cl`, and
    requests an 's3' access URL for it. The actual download into `folder` is
    currently commented out. After the loop, the format display of the last
    document is printed, followed by a separator line.

    documents: iterable of DocumentReference resources (dicts).
    folder:    destination folder for downloads (only used by the commented-out call).
    """
    d = None
    # Iterate over the documents passed in, not the notebook-global exp_docs.
    for d in documents:
        url = d['content'][0]['attachment']['url']
        print(d['type']['text'])

        print(url)
        # The DRS object id is the last path segment of the attachment URL.
        drs_id = url.split('/')[-1]
        drs_response = cl.get_object(drs_id)
        #print(json.dumps(drs_response, indent=3))
        file_name = drs_response["name"]
        print(file_name)
        d_url= cl.get_access_url(drs_id, 's3')
        #download(d_url,f'{folder}/{file_name}')

    # Only the last document's format is reported; skip it when there were no
    # documents so an empty input does not raise.
    if d is not None:
        print(d['content'][1]['format']['display'])

    print('_'*80)

In [27]:
# Placeholder destination folder: replace it with a real local path before
# enabling the download call inside download_study_files.
folder = '/Users/yourpath/yourfolder'
download_study_files(exp_docs, folder)

Gene Expression
drs://data.kidsfirstdrc.org/65ee3d14-a471-42f2-857f-e75dbd5e8756
fe12f0da-e170-4345-b55c-889c27333ab9.rsem.genes.results.gz
Gene Expression
drs://data.kidsfirstdrc.org/b841fc14-0769-4a95-9824-4d70b75cd5d6
808baf95-9a60-4ecc-a393-5910d054b581.rsem.genes.results.gz
Gene Expression
drs://data.kidsfirstdrc.org/8b8f6e3b-65be-4f93-a259-75fc306ba14a
9ece8e3a-6e4d-4fb4-b12f-c5ab2000fa38.kallisto.abundance.tsv.gz
Gene Expression
drs://data.kidsfirstdrc.org/eb3d5066-5af5-4935-82aa-13fe9f68c3d5
0524d200-76d3-4929-87d1-d68784df44f1.rsem.genes.results.gz
Gene Expression
drs://data.kidsfirstdrc.org/d4541867-d8f1-45a0-b889-4229b53e1ab9
fe12f0da-e170-4345-b55c-889c27333ab9.kallisto.abundance.tsv.gz
Gene Expression
drs://data.kidsfirstdrc.org/bff411a0-b356-4042-b3eb-283556935472
9ece8e3a-6e4d-4fb4-b12f-c5ab2000fa38.rsem.genes.results.gz
Gene Expression
drs://data.kidsfirstdrc.org/38d738d1-c8ec-4be8-be1d-1d88c29e16e0
487ba87c-053d-47dc-a5bc-2e7e77733ace.kallisto.abundance.tsv.gz
Gene Exp

Gene Expression
drs://data.kidsfirstdrc.org/c88afa7f-5249-4ef8-8fe5-f05499eb0c9f
740540f5-54e6-419e-938b-e99604c733c7.rsem.genes.results.gz
Gene Expression
drs://data.kidsfirstdrc.org/d267d9e9-90be-4425-9d16-51f923587f6f
c9f21f29-7c3c-47ee-94ea-c4a7559d251d.kallisto.abundance.tsv.gz
Gene Expression
drs://data.kidsfirstdrc.org/26648053-fd97-4066-bfa1-199af74d470a
701f0e2f-49de-4429-81c8-d5162cdc0990.rsem.genes.results.gz
Gene Expression
drs://data.kidsfirstdrc.org/4c493052-c2db-47ef-9eb1-8b247f2e2b27
701f0e2f-49de-4429-81c8-d5162cdc0990.kallisto.abundance.tsv.gz
Gene Expression
drs://data.kidsfirstdrc.org/6fc6b676-342f-4d31-b3cb-7be9456ddd6b
0fc85294-4ec8-43e9-bc03-b4e4cf894d2f.kallisto.abundance.tsv.gz
Gene Expression
drs://data.kidsfirstdrc.org/2316cb76-a032-474a-865a-b8220d284348
c9f21f29-7c3c-47ee-94ea-c4a7559d251d.rsem.genes.results.gz
tsv
________________________________________________________________________________


In [16]:
def query_kf_fhir(documents, verbose=True):
    """Summarise DocumentReference resources as a DataFrame, one row per document.

    Each row holds the subject reference, the file name reported by the DRS
    client `cl`, the format display, the document type text and the DRS id
    (the last path segment of the attachment URL). With `verbose` set, two
    progress messages are printed first.

    Note: a later cell defines query_kf_fhir again with more columns; once
    that cell has run, it replaces this version.
    """
    if verbose:
        print("running query")
        print("building dataframe")

    rows = []
    for doc in documents:
        attachment_url = doc['content'][0]['attachment']['url']
        doc_type = doc['type']['text']
        object_id = attachment_url.split('/')[-1]
        file_name = cl.get_object(object_id)["name"]
        subject_ref = doc['subject']['reference']
        doc_format = doc['content'][1]['format']['display']
        rows.append({
            "subject_id": subject_ref,
            "file_name": file_name,
            "format": doc_format,
            "type": doc_type,
            "drs_id": object_id,
        })

    return pd.DataFrame(rows)

In [51]:
def query_kf_fhir(documents, verbose=True):
    """Build a DataFrame with one row per DocumentReference, enriched with patient data.

    Each row combines:
      * document fields: attachment URL, DRS URI, full DocumentReference URL,
        file name from the DRS client `cl`, study tag, patient URL, format,
        type and DRS id;
      * the patient's attributes from `patient_dict`, if the subject is present there;
      * the patient's observations from `patient_observations_dict`, if present.

    Subjects missing from either dictionary simply contribute no extra columns
    for that row. With `verbose` set, two progress messages are printed first.

    documents: iterable of DocumentReference resources (dicts).
    Returns:   pandas.DataFrame with one row per document.
    """
    if verbose:
        print("running query")
    #documents = runQuery(query)
    records = []
    if verbose:
        print("building dataframe")
    base = FHIR_SERVER+'/'
    for d in documents:
        url = d['content'][0]['attachment']['url']
        doc_type = d['type']['text']
        # The DRS object id is the last path segment of the attachment URL.
        drs_id = url.split('/')[-1]
        drs_response = cl.get_object(drs_id)
        #print(json.dumps(drs_response, indent=3))
        file_name = drs_response["name"]

        subject_id = d['subject']['reference']

        doc_format = d['content'][1]['format']['display']
        doc_reference = f"DocumentReference/{d['id']}"

        # Only report a DRS URI when the attachment URL actually is one.
        drs_uri = url if url.startswith('drs') else ""

        document_dict = {"document_reference_attachment_uri": url,
                        "drs_uri":drs_uri,
                        "document_reference_reference":base+doc_reference,
                        "file_path":file_name,
                        "specimen_bodySite": "per study",
                        "condition_code":"per study",
                        "research_study_reference":d['meta']['tag'][0]['code'],
                        "patient_reference":base+subject_id,
                        #"specimen_reference":base+d['context']['related'][0]['reference'],
                        "format":doc_format,
                        "type":doc_type,
                        "drs_id":drs_id}
        # .get(..., {}) keeps a document whose subject has no recorded
        # attributes or observations instead of raising KeyError.
        document_dict.update(patient_dict.get(subject_id, {}))
        document_dict.update(patient_observations_dict.get(subject_id, {}))

        records.append(document_dict)

    df = pd.DataFrame(records)
    return df

In [53]:
# Build the per-file table for the gene-expression documents and display it.
df = query_kf_fhir(exp_docs)
df

running query
building dataframe


Unnamed: 0,document_reference_attachment_uri,drs_uri,document_reference_reference,file_path,specimen_bodySite,condition_code,research_study_reference,patient_reference,format,type,drs_id,us-core-race,us-core-ethnicity,gender,Clinical status
0,drs://data.kidsfirstdrc.org/d4541867-d8f1-45a0...,drs://data.kidsfirstdrc.org/d4541867-d8f1-45a0...,https://kf-api-fhir-service.kidsfirstdrc.org/D...,fe12f0da-e170-4345-b55c-889c27333ab9.kallisto....,per study,per study,SD_8Y99QZJJ,https://kf-api-fhir-service.kidsfirstdrc.org/P...,tsv,Gene Expression,d4541867-d8f1-45a0-b889-4229b53e1ab9,Not Reported,Not Reported,unknown,Deceased
1,drs://data.kidsfirstdrc.org/65ee3d14-a471-42f2...,drs://data.kidsfirstdrc.org/65ee3d14-a471-42f2...,https://kf-api-fhir-service.kidsfirstdrc.org/D...,fe12f0da-e170-4345-b55c-889c27333ab9.rsem.gene...,per study,per study,SD_8Y99QZJJ,https://kf-api-fhir-service.kidsfirstdrc.org/P...,tsv,Gene Expression,65ee3d14-a471-42f2-857f-e75dbd5e8756,Not Reported,Not Reported,unknown,Deceased
2,drs://data.kidsfirstdrc.org/d31968c1-ac1f-40b0...,drs://data.kidsfirstdrc.org/d31968c1-ac1f-40b0...,https://kf-api-fhir-service.kidsfirstdrc.org/D...,0524d200-76d3-4929-87d1-d68784df44f1.kallisto....,per study,per study,SD_8Y99QZJJ,https://kf-api-fhir-service.kidsfirstdrc.org/P...,tsv,Gene Expression,d31968c1-ac1f-40b0-bbc6-598b829205c0,Not Reported,Not Reported,unknown,Deceased
3,drs://data.kidsfirstdrc.org/bff411a0-b356-4042...,drs://data.kidsfirstdrc.org/bff411a0-b356-4042...,https://kf-api-fhir-service.kidsfirstdrc.org/D...,9ece8e3a-6e4d-4fb4-b12f-c5ab2000fa38.rsem.gene...,per study,per study,SD_8Y99QZJJ,https://kf-api-fhir-service.kidsfirstdrc.org/P...,tsv,Gene Expression,bff411a0-b356-4042-b3eb-283556935472,Not Reported,Not Reported,unknown,Deceased
4,drs://data.kidsfirstdrc.org/8b8f6e3b-65be-4f93...,drs://data.kidsfirstdrc.org/8b8f6e3b-65be-4f93...,https://kf-api-fhir-service.kidsfirstdrc.org/D...,9ece8e3a-6e4d-4fb4-b12f-c5ab2000fa38.kallisto....,per study,per study,SD_8Y99QZJJ,https://kf-api-fhir-service.kidsfirstdrc.org/P...,tsv,Gene Expression,8b8f6e3b-65be-4f93-a259-75fc306ba14a,Not Reported,Not Reported,unknown,Deceased
5,drs://data.kidsfirstdrc.org/eb3d5066-5af5-4935...,drs://data.kidsfirstdrc.org/eb3d5066-5af5-4935...,https://kf-api-fhir-service.kidsfirstdrc.org/D...,0524d200-76d3-4929-87d1-d68784df44f1.rsem.gene...,per study,per study,SD_8Y99QZJJ,https://kf-api-fhir-service.kidsfirstdrc.org/P...,tsv,Gene Expression,eb3d5066-5af5-4935-82aa-13fe9f68c3d5,Not Reported,Not Reported,unknown,Deceased
6,drs://data.kidsfirstdrc.org/fdfc1715-a963-4914...,drs://data.kidsfirstdrc.org/fdfc1715-a963-4914...,https://kf-api-fhir-service.kidsfirstdrc.org/D...,1a39a1bf-e744-42a9-ad5f-d89d28bb5fa5.rsem.gene...,per study,per study,SD_8Y99QZJJ,https://kf-api-fhir-service.kidsfirstdrc.org/P...,tsv,Gene Expression,fdfc1715-a963-4914-9bfc-ae5e3c1e6dca,Not Reported,Not Reported,unknown,Deceased
7,drs://data.kidsfirstdrc.org/8d5fd864-8345-4891...,drs://data.kidsfirstdrc.org/8d5fd864-8345-4891...,https://kf-api-fhir-service.kidsfirstdrc.org/D...,1a39a1bf-e744-42a9-ad5f-d89d28bb5fa5.kallisto....,per study,per study,SD_8Y99QZJJ,https://kf-api-fhir-service.kidsfirstdrc.org/P...,tsv,Gene Expression,8d5fd864-8345-4891-a152-014ef7864250,Not Reported,Not Reported,unknown,Deceased
8,drs://data.kidsfirstdrc.org/be883ef8-4049-415f...,drs://data.kidsfirstdrc.org/be883ef8-4049-415f...,https://kf-api-fhir-service.kidsfirstdrc.org/D...,487ba87c-053d-47dc-a5bc-2e7e77733ace.rsem.gene...,per study,per study,SD_8Y99QZJJ,https://kf-api-fhir-service.kidsfirstdrc.org/P...,tsv,Gene Expression,be883ef8-4049-415f-8444-3072fae1c71c,Not Reported,Not Reported,unknown,Deceased
9,drs://data.kidsfirstdrc.org/38d738d1-c8ec-4be8...,drs://data.kidsfirstdrc.org/38d738d1-c8ec-4be8...,https://kf-api-fhir-service.kidsfirstdrc.org/D...,487ba87c-053d-47dc-a5bc-2e7e77733ace.kallisto....,per study,per study,SD_8Y99QZJJ,https://kf-api-fhir-service.kidsfirstdrc.org/P...,tsv,Gene Expression,38d738d1-c8ec-4be8-be1d-1d88c29e16e0,Not Reported,Not Reported,unknown,Deceased


In [54]:
# Save the table as a tab-separated text file named after the Kids First study id.
df.to_csv(f"{kf_study_id}_files.txt", sep='\t')