## Import packages and define useful functions

In [None]:
import json
import sys
import os
import urllib
import string

import numpy as np
import pandas as pd
# Widen the pandas display limits so large tables are shown without truncation.
for _option, _limit in (('display.max_rows', 100),
                        ('display.max_columns', 500)):
    pd.set_option(_option, _limit)


# Make a query to the API via a URL.
def QueryAPI(url,verbose=False):
    """Download every row of a paged API query and return them as one list.

    The query is fetched in pages of 2000 rows by appending
    ``&start_row=<n>&num_rows=<n>`` to ``url``. Paging continues until the
    number of rows received reaches the ``total_rows`` value reported by the
    first response, or until the server returns an empty page.

    Parameters
    ----------
    url : str
        Query URL. The paging parameters are appended with ``&``, so the URL
        must already be in a form that accepts additional ``&``-separated
        parameters.
    verbose : bool, optional
        If True, print each paged URL before it is requested.

    Returns
    -------
    list
        The concatenated ``msg`` entries of all downloaded pages.

    Raises
    ------
    ValueError
        If the ``msg`` field of a response is not a list.
        NOTE(review): presumably the API puts an error string in ``msg`` when
        a query fails -- confirm against the API documentation.
    """
    # A bare `import urllib` does not guarantee that the `request` submodule
    # is loaded, so import it explicitly where it is used.
    import urllib.request

    page_size = 2000
    start_row = 0
    total_rows = -1
    rows = []

    while True:
        pagedUrl = url + '&start_row=%d&num_rows=%d' % (start_row, page_size)

        if verbose:
            print(pagedUrl)

        # The context manager closes the connection even if reading fails.
        with urllib.request.urlopen(pagedUrl) as reply:
            response = json.loads(reply.read())

        page = response['msg']
        if not isinstance(page, list):
            # Extending the row list with a string would silently add one
            # "row" per character, so fail loudly instead.
            raise ValueError(
                'Unexpected response for %s: %r' % (pagedUrl, page))
        rows.extend(page)

        # The total is taken from the first page only.
        if total_rows < 0:
            total_rows = int(response['total_rows'])

        start_row += len(page)

        # An empty page can never advance start_row; stop instead of
        # requesting the same page forever.
        if start_row >= total_rows or not page:
            break

    return rows

## Get all the SectionDataSets information (~1 minute)
Roughly 22000 datasets, for ISH data both Coronal and Sagittal.

Information on the genes analyzed in each specific SectionDataSet is retrieved with the `&include=genes` query parameter and is stored in the `genes` column of the resulting DataFrame.

In [None]:
# Identifiers used to build the API queries. Only product_ID is used in this
# cell; graph_ID is defined here for later use.
graph_ID = 1
product_ID = 1

# Pieces of the SectionDataSet query URL.
BASE = "http://api.brain-map.org/api/v2/data"
target = "/SectionDataSet/query.json?"
# Keep only datasets flagged as not failed and as expression datasets.
criteria = "criteria=[failed$eq'false'][expression$eq'true'],"
products = "products[id$eq%s]" % product_ID
# Ask the API to attach the gene records to every dataset.
inclusions = "&include=genes"
# Fields left out of the response.
exclusions = "&except=" + ",".join([
    "blue_channel",
    "delegate",
    "expression",
    "failed",
    "failed_facet",
    "green_channel",
    "name",
    "qc_date",
    "red_channel",
    "rnaseq_design_id",
    "sphinx_id",
    "storage_directory",
    "weight",
])

url = "".join([BASE, target, criteria, products, inclusions, exclusions])

# Download all pages of the query and tabulate them, one row per dataset.
result = QueryAPI(url, verbose=False)
sDataSet = pd.DataFrame(result)
sDataSet


In [None]:
# Inspect the 'genes' entry of the row labelled 1.
sDataSet.loc[1, 'genes']

## Save dataframe containing information about genes and sectionDatasets

In [None]:
# Build a table of gene descriptions with one row per sectionDataSet, using
# the first entry of each dataset's 'genes' list.
genesDf = pd.DataFrame([geneList[0] for geneList in sDataSet['genes']])

# Rows are addressed by the id of the sectionDataSet they come from.
genesDf.index = pd.Index(sDataSet['id'], name='sectionDataSet_id')

# Keep only the descriptive columns, and rename the gene's own 'id' so it is
# not confused with the dataset id used as the index.
genesDf = genesDf[[
    'id',
    'acronym',
    'original_name',
    'original_symbol',
    'genomic_reference_update_id',
    'alias_tags',
    'entrez_id',
]].rename(columns={'id': 'gene_id'})

genesDf.to_csv('genesDataFrame.csv')

## Clean up the dataframe
Drop data concerning gene details: only retain gene_id-sectionDataset_id mapping 

In [None]:
# Create a gene dictionary such that the gene list of a dataset can be
# looked up using the id of its first gene as the key
genesDict = {geneList[0]['id']: geneList for geneList in sDataSet['genes']}

# Explicit list with the id of the first gene of every dataset
g_id = [geneList[0]['id'] for geneList in sDataSet['genes']]

# Store the gene id next to each dataset, index the table by dataset id and
# discard the columns that are no longer needed.
sDataSet['gene_id'] = g_id
sDataSet.set_index('id', inplace=True)
sDataSet.index = sDataSet.index.rename('sectionDataset_id')
sDataSet.drop(['genes', 'section_thickness'], axis=1, inplace=True)
sDataSet

In [None]:
# Download the set of 316 mid-ontology level brain structures

from allensdk.core.reference_space_cache import ReferenceSpaceCache

# ---- Settings ------------------------------
resolution = 25
reference_space_key = 'annotation/ccf_2017'
# Id of the structure set to retrieve
chosenID = 167587189
# --------------------------------------------

# Reference space cache; its manifest is stored in 'manifest.json'
rspc = ReferenceSpaceCache(
    resolution,
    reference_space_key,
    manifest='manifest.json',
)

# Structure graph with ID 1 is the adult mouse structure graph
tree = rspc.get_structure_tree(structure_graph_id=1)

# Tabulate every structure that belongs to the chosen set
structureList = tree.get_structures_by_set_id([chosenID])
strDf = pd.DataFrame(structureList)

# Show the table ordered by structure id (strDf itself is left unsorted)
strDf.sort_values('id')

## Download expression data (~9 hours)

In [None]:
# Download, for every sectionDataSet, the StructureUnionize records and build
# three tables (rows: datasets, columns: gene_id + one column per structure in
# strDf) holding expression energy, expression density and mean intensity per
# expressing pixel. The tables are saved as CSV files indexed by gene_id.
BASE = "http://api.brain-map.org/api/v2/data"
target = "/StructureUnionize/query.json"
criteria = "?criteria=structure[graph_id$eq{}]".format(graph_ID)

# Loop invariants: the structure ids (table columns) and the dataset count.
areaIDs = strDf['id'].tolist()
tableColumns = ['gene_id'] + areaIDs
dataSetIDs = sDataSet.index.tolist()
geneIDs = sDataSet['gene_id'].tolist()
nDataSets = len(dataSetIDs)

# Rows are collected in plain lists and turned into DataFrames once at the
# end; enlarging a DataFrame one cell at a time reallocates it for every new
# row and becomes very slow with thousands of datasets.
energyRows = []
densityRows = []
intensityRows = []

for idx, (dataSetID, geneID) in enumerate(zip(dataSetIDs, geneIDs)):

    # The gene id of the current experiment is the first value of each row
    geneID = int(geneID)

    # Download expression values for all the areas
    urlDataSet = ",[section_data_set_id$eq{}]".format(dataSetID)
    print("Now downloading dataset #{} ({}/{})...".format(dataSetID,idx+1,nDataSets))

    # Query data from the API
    url = BASE + target + criteria + urlDataSet
    result = QueryAPI(url,verbose=False)

    # Create an addressable dict of all the areas for the current dataset
    areasDict = {x['structure_id']: x for x in result}

    energyRow = [geneID]
    densityRow = [geneID]
    intensityRow = [geneID]

    # Cycle through all the structures of interest.
    # If there is expression data for one of these areas, put it in the table
    # otherwise, put a NaN
    for areaID in areaIDs:
        record = areasDict.get(areaID)
        if record is None:
            energy = np.nan
            density = np.nan
            intensity = np.nan
        else:
            energy = record['expression_energy']
            density = record['expression_density']
            # Mean intensity per expressing pixel; undefined (NaN) when no
            # pixel is expressing, which also avoids a division by zero.
            if record['sum_expressing_pixels'] != 0:
                intensity = record['sum_expressing_pixel_intensity'] / record['sum_expressing_pixels']
            else:
                intensity = np.nan
        energyRow.append(energy)
        densityRow.append(density)
        intensityRow.append(intensity)

    energyRows.append(energyRow)
    densityRows.append(densityRow)
    intensityRows.append(intensityRow)

# Build the three tables in one go
expressionDf1 = pd.DataFrame(energyRows, columns=tableColumns)
expressionDf2 = pd.DataFrame(densityRows, columns=tableColumns)
expressionDf3 = pd.DataFrame(intensityRows, columns=tableColumns)

# Save data as a CSV
expressionDf1.set_index('gene_id', inplace=True)
expressionDf2.set_index('gene_id', inplace=True)
expressionDf3.set_index('gene_id', inplace=True)
expressionDf1.to_csv('gene_expression_ABA_energy_raw.csv')
expressionDf2.to_csv('gene_expression_ABA_density_raw.csv')
expressionDf3.to_csv('gene_expression_ABA_intensity_raw.csv')



In [None]:
# Several datasets can share the same gene_id (the index of the raw tables):
# collapse them into a single row per gene by averaging.
exp1 = expressionDf1.groupby(level=0).mean()
exp2 = expressionDf2.groupby(level=0).mean()
exp3 = expressionDf3.groupby(level=0).mean()

# Save the averaged energy, density and intensity tables
exp1.to_csv('gene_expression_ABA_energy.csv')
exp2.to_csv('gene_expression_ABA_density.csv')
exp3.to_csv('gene_expression_ABA_intensity.csv')

exp1