### Kids First: Discovering the Genetic Basis of Human Neuroblastoma; KF-NBL


You will need the following python library
### fasp-scripts

Clone the **fasp-client** branch of fasp-scripts

```git clone -b fasp-client --single-branch https://github.com/ga4gh/fasp-scripts.git```

change directory to local copy of fasp-scripts

Install into your favorite python environment

```pip install .```


In [1]:
import os
import sys
import json
import requests
import pandas as pd 
import numpy as np
#import seaborn as sns
from pathlib import Path
from datetime import datetime
import pprint
#from lifelines import KaplanMeierFitter

pprint = pprint.PrettyPrinter(indent=1).pprint
# pprint() is now available to pretty-print any JSON

FHIR_SERVER = 'https://kf-api-fhir-service.kidsfirstdrc.org'

# Optional: Turn off SSL verification. Useful when dealing with a corporate proxy with self-signed certificates.
# This should be set to True unless you actually see certificate errors.
VERIFY_SSL = False

if not VERIFY_SSL:
    requests.packages.urllib3.disable_warnings()



# Kids First uses cookie-based authentication, so we've pre-fetched a cookie and made it available here.
#kf_cookie = requests.get("https://raw.githubusercontent.com/mitre/fhir-exercises/main/kf_cookie.txt", verify=VERIFY_SSL).text.rstrip()

full_cookie_path = os.path.expanduser('~/.keys/ncpi_prod_fhir_cookie.json')

with open(full_cookie_path) as f:
    cookies = json.load(f)
kf_cookie = cookies['Cookie']


# We make a requests.Session to ensure consistent headers/cookie across all the requests we make
s = requests.Session()
s.headers.update({'Accept': 'application/fhir+json'})
s.verify = VERIFY_SSL
s.cookies['AWSELBAuthSessionCookie-0'] = kf_cookie


# Test out the cookie by querying the server metadata
r = s.get(f"{FHIR_SERVER}/metadata")

if "<!DOCTYPE html>" in r.text:
    sys.stderr.write('ERROR: Could not authenticate with Kids First. The cookie may need to be updated')
    

# This helper method allows us to easily switch between printing an entire Bundle, or just the first 20 lines.
# Set truncate_for_github = False for actual use,
# or just replace the function with a `return bundle`

def print_bundle(bundle, truncate_for_github = False):
    if not truncate_for_github:
        return bundle
    
    lines = json.dumps(bundle, indent=2).split('\n')
    if len(lines) <= 20:
        return bundle
    else:   
        print('\n'.join(lines[:20]))
        print('...\nBundle truncated. Change the "print_bundle" function above to print the full content.')

In [2]:
# Resolves all pages for the bundle. Returns an array with all Bundles, including the original Bundle.
def resolve_pages(bundle):
    next_page_link = next(filter(lambda link: link['relation'] == 'next', bundle['link']), None)
    if next_page_link:
        next_page = s.get(next_page_link['url']).json()
        return [bundle] + resolve_pages(next_page)
    else:
        return [bundle]

# NOTE: No cell output.

def runQuery(query):
    r = s.get(f"{FHIR_SERVER}/{query}")
    first_bundle = r.json()
    all_bundles = resolve_pages(first_bundle)

    resources = [entry['resource'] for bundle in all_bundles for entry in bundle['entry']]
    print(f"Total  Resources: {len(resources)}")
    return resources

In [5]:
study_id = 100031
kf_study_id = 'SD_DYPMEHHF'
phs_id = 'phs001436.v1.p1'

studies = runQuery(f"ResearchStudy?_id={study_id}")
print(study_id)
print(studies[0]['title'])
studies 

Total  Resources: 1
100031
Discovering the Genetic Basis of Human Neuroblastoma: A Gabriella Miller Kids First Pediatric Research Program (Kids First) Project


[{'resourceType': 'ResearchStudy',
  'id': '100031',
  'meta': {'versionId': '2',
   'lastUpdated': '2022-01-19T00:56:41.161+00:00',
   'source': '#N1PVao6WQD1cD5rc',
   'profile': ['http://hl7.org/fhir/StructureDefinition/ResearchStudy']},
  'identifier': [{'system': 'https://kf-api-dataservice.kidsfirstdrc.org/studies/',
    'value': 'SD_DYPMEHHF'},
   {'system': 'urn:kids-first:unique-string',
    'value': 'ResearchStudy-SD_DYPMEHHF'},
   {'system': 'https://www.ncbi.nlm.nih.gov/projects/gap/cgi-bin/study.cgi?study_id=',
    'value': 'phs001436.v1.p1'}],
  'title': 'Discovering the Genetic Basis of Human Neuroblastoma: A Gabriella Miller Kids First Pediatric Research Program (Kids First) Project',
  'status': 'completed',
  'category': [{'coding': [{'system': 'http://snomed.info/sct',
      'code': '86049000',
      'display': 'Malignant neoplasm, primary (morphologic abnormality)'}],
    'text': 'CANCER'}],
  'keyword': [{'coding': [{'code': 'Kids First'}]},
   {'coding': [{'code':

In [6]:
patients = runQuery(f"Patient?_has:ResearchSubject:individual:study={study_id}")

Total  Resources: 1765


In [7]:
import json
from collections import Counter
import pandas as pd

rTypes = Counter()

for p in patients:
    rTypes[p['resourceType']] += 1
    extCounter = Counter()
    if 'extension' in p:
        for e in p['extension']:
            extCounter[e['url']] +=1
    #print(extCounter)
print (json.dumps(rTypes, indent=3))

{
   "Patient": 1765
}


In [8]:
n=283
patient_id = patients[n]['id']
patients[n]


{'resourceType': 'Patient',
 'id': '97800',
 'meta': {'versionId': '2',
  'lastUpdated': '2021-11-16T09:47:04.603+00:00',
  'source': '#N2VY8ugxOeBEjaPv',
  'profile': ['http://hl7.org/fhir/StructureDefinition/Patient'],
  'tag': [{'code': 'SD_DYPMEHHF'}]},
 'identifier': [{'value': 'GMKF-30-PATUVC13'},
  {'system': 'https://kf-api-dataservice.kidsfirstdrc.org/participants/',
   'value': 'PT_7K4J4EZP'},
  {'system': 'urn:kids-first:unique-string',
   'value': 'Patient-SD_DYPMEHHF-PT_7K4J4EZP'}],
 'gender': 'female'}

We can see the data in more compact form via a DataFrame. This shows that the same set of three attributes are available for all patients.

In [9]:
patient_list = []
patient_ids = []
std_attributes = {'gender':'','birthDate':'','maritalStatus':'','multipleBirthBoolean':''}
for p in patients:
    rTypes[p['resourceType']] += 1
    extCounter = Counter()
    for e in p['extension']:
        ext = e['url'].split('/')[-1]
        extCounter[ext] += 1
    for att in std_attributes.keys():
        if att in p:
            extCounter[att] += 1
    patient_list.append(extCounter)
    patient_ids.append(p['id'])

#pd.set_option("display.max_rows", None, "display.max_columns", None)
pdf = pd.DataFrame(patient_list)    
pdf

KeyError: 'extension'

The attributes above are informative but are unlikely to part of a meaningful query in building an asthma cohort.

What else might we look at? Let's try Observations.

In [10]:
import pandas as pd

resources = runQuery(f"Patient?_has:ResearchSubject:individual:study={study_id}&_revinclude=Observation:subject")

observations = []
obsCounter  = Counter()
codeCounter = Counter()
vccCounter = Counter()
printObsCounts = False
for r in resources:

    if r['resourceType'] == 'Observation':
        obsCounter[r['subject']['reference']] +=1
        obs_display_name = r['code']['coding'][0]['display']
        vcc_text = r['valueCodeableConcept']['text']
        codeCounter[obs_display_name] +=1
        vccCounter[vcc_text] +=1
        observations.append(r)



#Summarize
print(f"Number of patients with observations {len(obsCounter.keys())}")

if printObsCounts:
    print("Observation count per patient")
    print(json.dumps(obsCounter, indent=3))
print("Coding counts")
#print(json.dumps(codeCounter, indent=3))
df = pd.DataFrame.from_dict(codeCounter,  orient='index')
pd.set_option("display.max_rows", None, "display.max_columns", None)
display(df)
vccdf= pd.DataFrame.from_dict(vccCounter,  orient='index')
display(vccdf)

Total  Resources: 3555
Number of patients with observations 1702
Coding counts


Unnamed: 0,0
Clinical status,664
family member,1126


Unnamed: 0,0
Alive,488
Mother,610
Father,516
Deceased,176


In [11]:
observations[1]

{'resourceType': 'Observation',
 'id': '98310',
 'meta': {'versionId': '1',
  'lastUpdated': '2021-10-14T21:50:51.947+00:00',
  'source': '#HNzKoDXMgWrAaLRO',
  'profile': ['https://nih-ncpi.github.io/ncpi-fhir-ig/StructureDefinition/family-relationship']},
 'identifier': [{'system': 'https://kf-api-dataservice.kidsfirstdrc.org/family-relationships/',
   'value': 'FR_W9Z95WTS'},
  {'system': 'urn:kids-first:unique-string',
   'value': 'Observation-PT_WFKZC2NN-Mother-PT_9A9Q2YB3'}],
 'status': 'final',
 'code': {'coding': [{'system': 'http://terminology.hl7.org/CodeSystem/v3-RoleCode',
    'code': 'FAMMEMB',
    'display': 'family member'}],
  'text': 'Family relationship'},
 'subject': {'reference': 'Patient/98003'},
 'focus': [{'reference': 'Patient/97990'}],
 'valueCodeableConcept': {'coding': [{'system': 'http://terminology.hl7.org/CodeSystem/v3-RoleCode',
    'code': 'MTH',
    'display': 'mother'}],
  'text': 'Mother'}}

In [12]:
print(f"Patient/{patient_id}")
documents = runQuery(f"DocumentReference?subject=Patient/{patient_id}")

Patient/97800
Total  Resources: 10


In [41]:
for d in documents:
    print(json.dumps(d['content'][0]['attachment'], indent=3))
    print('_'*40)
    print(json.dumps(d['content'][1], indent=3))
    print('_'*80)

{
   "url": "drs://data.kidsfirstdrc.org/ee38b0fe-8de8-47f5-9502-96deacfe09d2"
}
________________________________________
{
   "attachment": {
      "extension": [
         {
            "url": "https://nih-ncpi.github.io/ncpi-fhir-ig/StructureDefinition/file-size",
            "valueDecimal": 10311
         },
         {
            "url": "https://nih-ncpi.github.io/ncpi-fhir-ig/StructureDefinition/hashes",
            "valueCodeableConcept": {
               "coding": [
                  {
                     "display": "md5"
                  }
               ],
               "text": "15d2667aa2df5eeb8edb9e8017e7b833"
            }
         },
         {
            "url": "https://nih-ncpi.github.io/ncpi-fhir-ig/StructureDefinition/hashes",
            "valueCodeableConcept": {
               "coding": [
                  {
                     "display": "sha256"
                  }
               ],
               "text": "9969b80770b9fd85b48e4ecb4296742945969ae228601d391a06bf

{
   "attachment": {
      "extension": [
         {
            "url": "https://nih-ncpi.github.io/ncpi-fhir-ig/StructureDefinition/file-size",
            "valueDecimal": 87637
         },
         {
            "url": "https://nih-ncpi.github.io/ncpi-fhir-ig/StructureDefinition/hashes",
            "valueCodeableConcept": {
               "coding": [
                  {
                     "display": "md5"
                  }
               ],
               "text": "ae1a9a81fbae04b4cb10c78e0c5b6f4b"
            }
         },
         {
            "url": "https://nih-ncpi.github.io/ncpi-fhir-ig/StructureDefinition/hashes",
            "valueCodeableConcept": {
               "coding": [
                  {
                     "display": "sha256"
                  }
               ],
               "text": "a559eb305018456b5fdd261ce6feb1583d50b5ce90b6b401af3d6c18287a102a"
            }
         }
      ],
      "url": "s3://kf-study-us-east-1-prd-sd-8y99qzjj/harmonized-data/simple

In [13]:
def download(url, file_path):
    '''Download a file from a URL to a local file path'''
    with open(os.path.expanduser(file_path), "wb") as file:
        response = requests.get(url)
        file.write(response.content)

In [14]:
# Set up drs client
from fasp.loc import kfDRSClient

cl = kfDRSClient("~/.keys/kf_credentials.json")

In [15]:
exp_docs = runQuery(f"DocumentReference?type:text=Gene%20Expression&security-label=U&_tag={kf_study_id}")

Total  Resources: 672


In [16]:
def download_study_files(documents, folder):
    for d in exp_docs:
        url = d['content'][0]['attachment']['url']
        print(d['type']['text'])

        print(url)
        drs_id = url.split('/')[-1]
        drs_response = cl.get_object(drs_id)
        #print(json.dumps(drs_response, indent=3))
        file_name = drs_response["name"]
        print(file_name)
        d_url= cl.get_access_url(drs_id, 's3')
        #download(d_url,f'{folder}/{file_name}')



    print(d['content'][1]['format']['display'])

    print('_'*80)

In [27]:
folder = '/Users/yourpath/yourfolder'
download_study_files(exp_docs, folder)

Gene Expression
drs://data.kidsfirstdrc.org/65ee3d14-a471-42f2-857f-e75dbd5e8756
fe12f0da-e170-4345-b55c-889c27333ab9.rsem.genes.results.gz
Gene Expression
drs://data.kidsfirstdrc.org/b841fc14-0769-4a95-9824-4d70b75cd5d6
808baf95-9a60-4ecc-a393-5910d054b581.rsem.genes.results.gz
Gene Expression
drs://data.kidsfirstdrc.org/8b8f6e3b-65be-4f93-a259-75fc306ba14a
9ece8e3a-6e4d-4fb4-b12f-c5ab2000fa38.kallisto.abundance.tsv.gz
Gene Expression
drs://data.kidsfirstdrc.org/eb3d5066-5af5-4935-82aa-13fe9f68c3d5
0524d200-76d3-4929-87d1-d68784df44f1.rsem.genes.results.gz
Gene Expression
drs://data.kidsfirstdrc.org/d4541867-d8f1-45a0-b889-4229b53e1ab9
fe12f0da-e170-4345-b55c-889c27333ab9.kallisto.abundance.tsv.gz
Gene Expression
drs://data.kidsfirstdrc.org/bff411a0-b356-4042-b3eb-283556935472
9ece8e3a-6e4d-4fb4-b12f-c5ab2000fa38.rsem.genes.results.gz
Gene Expression
drs://data.kidsfirstdrc.org/38d738d1-c8ec-4be8-be1d-1d88c29e16e0
487ba87c-053d-47dc-a5bc-2e7e77733ace.kallisto.abundance.tsv.gz
Gene Exp

Gene Expression
drs://data.kidsfirstdrc.org/c88afa7f-5249-4ef8-8fe5-f05499eb0c9f
740540f5-54e6-419e-938b-e99604c733c7.rsem.genes.results.gz
Gene Expression
drs://data.kidsfirstdrc.org/d267d9e9-90be-4425-9d16-51f923587f6f
c9f21f29-7c3c-47ee-94ea-c4a7559d251d.kallisto.abundance.tsv.gz
Gene Expression
drs://data.kidsfirstdrc.org/26648053-fd97-4066-bfa1-199af74d470a
701f0e2f-49de-4429-81c8-d5162cdc0990.rsem.genes.results.gz
Gene Expression
drs://data.kidsfirstdrc.org/4c493052-c2db-47ef-9eb1-8b247f2e2b27
701f0e2f-49de-4429-81c8-d5162cdc0990.kallisto.abundance.tsv.gz
Gene Expression
drs://data.kidsfirstdrc.org/6fc6b676-342f-4d31-b3cb-7be9456ddd6b
0fc85294-4ec8-43e9-bc03-b4e4cf894d2f.kallisto.abundance.tsv.gz
Gene Expression
drs://data.kidsfirstdrc.org/2316cb76-a032-474a-865a-b8220d284348
c9f21f29-7c3c-47ee-94ea-c4a7559d251d.rsem.genes.results.gz
tsv
________________________________________________________________________________


In [20]:
import pandas as pd

def create_dataframe(documents):
    records = []
    patient_counter = Counter()
    for d in documents:
        url = d['content'][0]['attachment']['url']
        type = d['type']['text']
        drs_id = url.split('/')[-1]
        drs_response = cl.get_object(drs_id)
        #print(json.dumps(drs_response, indent=3))
        file_name = drs_response["name"]
        #print(file_name)
        subject_id = d['subject']['reference']
        patient_counter[subject_id]+=1
        format = d['content'][1]['format']['display']
        records.append({"subject_id":subject_id,"file_name":file_name,
                        "format":format,"type":type,"drs_id":drs_id})

    print('_'*80)
    print(json.dumps(patient_counter, indent=3))
    df = pd.DataFrame(records)
    return df

In [18]:
df = create_dataframe(exp_docs)
df

________________________________________________________________________________


Unnamed: 0,subject_id,file_name,format,type,drs_id
0,Patient/96519,6b856525-c67b-4d50-b74e-8b4601711e7a.rsem.gene...,tsv,Gene Expression,7dd6f51a-fc56-4d31-8a90-4d5411a364fb
1,Patient/96540,40196649-90f8-4e50-9b54-a29a0ee04fb7.rsem.gene...,tsv,Gene Expression,0480a807-36f5-4eb4-bf75-c3c5da3ad999
2,Patient/96519,6b856525-c67b-4d50-b74e-8b4601711e7a.rsem.isof...,results,Gene Expression,c76106f8-8082-4be1-a46f-2d9ddd9f5521
3,Patient/96519,6b856525-c67b-4d50-b74e-8b4601711e7a.kallisto....,tsv,Gene Expression,5b7f5a3d-6fb0-4fca-a6a2-c38269959c45
4,Patient/96540,40196649-90f8-4e50-9b54-a29a0ee04fb7.rsem.isof...,results,Gene Expression,45ace086-3346-4683-b60f-37beb187b53b
5,Patient/96540,40196649-90f8-4e50-9b54-a29a0ee04fb7.kallisto....,tsv,Gene Expression,93fb0a3b-33e4-4142-9fde-88fdb88f709e
6,Patient/96553,9f3ac9b7-3ed2-481c-87bd-2442166be180.rsem.isof...,results,Gene Expression,6d1ebbc3-b569-4859-805e-cf935b294f4a
7,Patient/96553,9f3ac9b7-3ed2-481c-87bd-2442166be180.kallisto....,tsv,Gene Expression,63309c39-03a9-4e09-aa8c-87148ef1b08e
8,Patient/96553,9f3ac9b7-3ed2-481c-87bd-2442166be180.rsem.gene...,tsv,Gene Expression,243a3228-4bee-4c05-acbe-bf0fd0512c51
9,Patient/96564,fcbdbdd8-6fd6-4bd5-a4bb-40233467bfd3.rsem.gene...,tsv,Gene Expression,cfe245e0-7666-4893-9ef0-aa207784bb86


In [19]:
print(json.dumps(patient_counter, indent=3))

NameError: name 'patient_counter' is not defined

In [21]:
df = create_dataframe(exp_docs)

________________________________________________________________________________
{
   "Patient/96519": 3,
   "Patient/96540": 3,
   "Patient/96553": 3,
   "Patient/96564": 3,
   "Patient/96573": 3,
   "Patient/96657": 3,
   "Patient/96618": 3,
   "Patient/96670": 3,
   "Patient/96716": 3,
   "Patient/96713": 3,
   "Patient/96719": 3,
   "Patient/96757": 3,
   "Patient/96792": 3,
   "Patient/96823": 3,
   "Patient/96857": 3,
   "Patient/96890": 3,
   "Patient/96884": 3,
   "Patient/96900": 3,
   "Patient/96921": 3,
   "Patient/96935": 3,
   "Patient/97009": 3,
   "Patient/96645": 3,
   "Patient/97016": 3,
   "Patient/96681": 3,
   "Patient/96873": 3,
   "Patient/97071": 3,
   "Patient/97113": 3,
   "Patient/97123": 3,
   "Patient/97112": 3,
   "Patient/97155": 3,
   "Patient/97156": 3,
   "Patient/96892": 3,
   "Patient/96882": 3,
   "Patient/97203": 3,
   "Patient/97181": 3,
   "Patient/97243": 3,
   "Patient/97259": 3,
   "Patient/96986": 3,
   "Patient/97285": 3,
   "Patient/97025": 