# Exploration of Queries

## About

This Jupyter notebook is designed to demonstrate interactions with the Blue Box query endpoint as well as the file index.


In [20]:
# imports
import pandas as pd
import matplotlib.pyplot as plt
import urllib.parse
import urllib.request
import json


# this allows plots to appear directly in the notebook
%matplotlib inline

# read URL
def read_url(url):
    req = urllib.request.Request(url)
    with urllib.request.urlopen(req) as response:
        return(response.read())

# remove donors from one array based on content of another
def print_analysis_candidates(donors, donors_to_subtract):
    result = {}
    for donor in donors:
        print (donor)
        donor_id = donor.keys()[0]
        for donor_to_subtract in donors_to_subtract
    
    
# parse sample info
def download_sample_info(uuid, version):
    sample_info = read_url('https://dss.staging.data.humancellatlas.org/v1/files/'+uuid+'?replica=aws&version='+version)
    sample_data = json.loads(sample_info)
    print (sample_data['donor_id'])
    return(sample_data['donor_id'])
    
# parse bundle info to find donor ID
def parse_bundle_info(bundle_url):
    result_dict = {}
    bundle_info = read_url(bundle_url)
    bundle_data = json.loads(bundle_info)
    for file in bundle_data['bundle']['files']:
        if (file['name'] == 'sample.json'):
            donor_id = download_sample_info(file['uuid'], file['version'])
            result_dict[donor_id] = bundle_data
    return(result_dict)

# parse the query output and find bundle GUIDs
def parse_query_result(query_result):
    bundles = []
    q_results = json.loads(query_result)
    for bundle in q_results['results']:
        donor_info = parse_bundle_info(bundle['bundle_url'])
        bundles.append(donor_info)
    return(bundles)

# queries the search endpoint, returns the bundle GUIDs that match
def query_for_bundles(query, url):
    bundles = []
    headers = {"User-Agent": "Mozilla", 'accept': 'application/json', 'content-type': 'application/json'}
    data = json.dumps(query)
    data = data.encode('ascii') # data should be bytes
    req = urllib.request.Request(url, data, headers)
    with urllib.request.urlopen(req) as response:
        the_page = response.read()
        bundles = parse_query_result(the_page)
        return (bundles)
        
        
# queries
url = 'https://dss.staging.data.humancellatlas.org/v1/search?replica=aws'
query_analysis = {
 "es_query": {
   "query": {
     "bool": {
       "must": [
         {
           "match": {
             "manifest.files.name": "analysis.json"
           }
         },
         {
           "match": {
             "files.sample_json.donor.species.ontology": "9606"
           }
         },
         {
           "wildcard": {
             "manifest.files.name": "*fastq.gz"
           }
         }
       ]
     }
   }
 }
}
query_all = {
 "es_query": {
   "query": {
     "bool": {
       "must": [
         {
           "match": {
             "files.sample_json.donor.species.ontology": "9606"
           }
         },
         {
           "wildcard": {
             "manifest.files.name": "*fastq.gz"
           }
         }
       ]
     }
   }
 }
}

# now do the queries
donors_with_analysis = query_for_bundles(query_analysis, url)
donors_all = query_for_bundles(query_all, url)

# go ahead and print summary for donors that don't have analysis
print_analysis_candidates(donors_all, donors_with_analysis)



Q3_DEMO-donor_MGH30
Q3_DEMO-donor_MGH30
Q3_DEMO-donor_MGH30
Q3_DEMO-donor_MGH30
Q3_DEMO-donor_MGH30
Q3_DEMO-donor_MGH30
Q3_DEMO-donor_MGH30
Q3_DEMO-donor_MGH30
Q3_DEMO-donor_MGH30
Q3_DEMO_donor1
Q3_DEMO-donor_MGH30
Q3_DEMO_donor1
Q3_DEMO-donor_MGH30
Q3_DEMO-donor_MGH30
Q3_DEMO-donor_MGH30
Q3_DEMO-donor_MGH30
Q3_DEMO-donor_MGH30
Q3_DEMO-donor_MGH30
Q3_DEMO_donor1
Q3_DEMO-donor_MGH30
Q3_DEMO_donor1
Q3_DEMO-donor_MGH30
Q3_DEMO-donor_MGH30
Q3_DEMO-donor_MGH30
Q3_DEMO-donor_MGH30
Q3_DEMO-donor_MGH30
Q3_DEMO-donor_MGH30
Q3_DEMO-donor_MGH30
Q3_DEMO-donor_MGH30
Q3_DEMO-donor_MGH30
Q3_DEMO-donor_MGH30
Q3_DEMO_donor1
Q3_DEMO-donor_MGH30
Q3_DEMO-donor_MGH30
Q3_DEMO-donor_MGH30
Q3_DEMO_donor1
Q3_DEMO-donor_MGH30
Q3_DEMO-donor_MGH30
Q3_DEMO_donor1
Q3_DEMO_donor1
Q3_DEMO-donor_MGH30
Q3_DEMO-donor_MGH30
Q3_DEMO_donor1
{'Q3_DEMO-donor_MGH30': {'bundle': {'creator_uid': 8008, 'files': [{'content-type': 'binary/octet-stream', 'crc32c': '54da8bc5', 'indexed': False, 'name': 'Aligned.sortedByCoord.out.bam