In [None]:
#%%appyter init
from appyter import magic
# Initialise appyter's notebook magics. The lambda binds the built-in `globals`
# as a default argument, so calling it with no arguments returns this
# notebook's global namespace. Presumably appyter uses that namespace to
# expose template variables to later cells -- TODO confirm against appyter docs.
magic.init(lambda _=globals: _())

In [None]:
%%appyter code_eval
{# Declare the 'primary' form section (title, subtitle and icon) that the input field below is attached to. #}
{% do SectionField(
    name='primary',
    title='CFDE Gene-WG Appyter',
    subtitle='Resolve gene-centric information from CF projects',
    img='geneicon.png',
) %}
{# Gene symbol input, bound to the template variable `gene` and defaulting to ACE2.
   The static genes.json file is presumably the list of autocomplete suggestions -- confirm against appyter's AutocompleteField. #}
{% set gene = AutocompleteField(
    name='gene',
    label='Entrez Gene Symbol',
    description='Please choose a valid entrez gene symbol',
    default='ACE2',
    file_path=url_for('static', filename='genes.json'),
    section='primary',
) %}

In [None]:
%%appyter markdown

# CFDE Gene-WG Appyter: {{ gene.raw_value }}

Given the gene *{{ gene.raw_value }}*, we request information about it from several different DCCs in hopes of creating a comprehensive knowledge report for it.

In [None]:
import requests
import pandas as pd
import urllib.parse
import json
import uuid
from itables import show
from functools import lru_cache
from IPython.display import HTML, display

class RenderJSON(object):
    '''Display JSON data as a collapsible tree in the notebook output.

    Adapted from https://gist.github.com/t27/48b3ac73a1479914f9fe9383e5d45325

    Parameters
    ----------
    json_data : str or any JSON-serializable value
        A ``str`` is taken to already be JSON text and is used verbatim.
        Anything else (dict, list, tuple, number, ``None``...) is encoded with
        ``json.dumps``.

    Raises
    ------
    TypeError
        If a non-string ``json_data`` cannot be JSON-encoded.
    '''
    def __init__(self, json_data):
        if isinstance(json_data, str):
            self.json_str = json_data
        else:
            # Encode every non-string value, not just dicts: a list formatted
            # with %s would otherwise be emitted as its Python repr
            # (None/True/False), which is not valid JavaScript.
            self.json_str = json.dumps(json_data)
        self.uuid = str(uuid.uuid4())

    def _ipython_display_(self):
        # A "</script>" inside a JSON string would end the inline script early;
        # "<\/" is an equivalent JSON/JavaScript escape that the HTML parser ignores.
        script_safe_json = self.json_str.replace('</', '<\\/')
        display(HTML('<div id="{}" style="height: auto; width:100%;"></div>'.format(self.uuid)))
        # NOTE(review): rawgit.com announced its shutdown; confirm this URL
        # still resolves or switch to a maintained CDN.
        display(HTML("""<script>
        require(["https://rawgit.com/caldwell/renderjson/master/renderjson.js"], function() {
          renderjson.set_show_to_level(1)
          document.getElementById('%s').appendChild(renderjson(%s))
        });</script>
        """ % (self.uuid, script_safe_json)))


In [None]:
%%appyter code_exec
{# Bind the submitted field value to the Python variable `gene`; presumably `{{ gene }}` renders as a quoted string literal -- confirm against appyter's field rendering. #}
gene = {{ gene }}

## GTEx

<https://gtexportal.org/home/>

We query the [GTEx API](https://gtexportal.org/home/api-docs/index.html) to identify tissue sites that significantly express the gene in question.

In [None]:
@lru_cache()
def gtex_singleTissueEqtl(geneSymbol, datasetId='gtex_v8'):
    '''Fetch the single-tissue eQTL records that GTEx reports for a gene.

    Parameters
    ----------
    geneSymbol : str
        Gene symbol sent as the ``geneSymbol`` query parameter.
    datasetId : str
        GTEx dataset identifier sent as the ``datasetId`` query parameter.

    Returns
    -------
    The value stored under ``'singleTissueEqtl'`` in the JSON response.
    Results are memoized per argument combination, so repeated calls return
    the same object; do not mutate it.

    Raises
    ------
    requests.HTTPError
        If the API answers with a 4xx/5xx status.
    '''
    res = requests.get(
        'https://gtexportal.org/rest/v1/association/singleTissueEqtl',
        params=dict(
            format='json',
            geneSymbol=geneSymbol,
            datasetId=datasetId,
        ),
    )
    # Fail with the real HTTP error rather than a KeyError/JSON decoding
    # error on an error page.
    res.raise_for_status()
    return res.json()['singleTissueEqtl']

In [None]:
# Tabulate the GTEx eQTL records for the selected gene.
gtex_results = pd.DataFrame(gtex_singleTissueEqtl(gene))
# Lead with the columns of interest; every other column keeps its original order.
columns = ['tissueSiteDetailId', 'pValue', 'nes'] + [
    column for column in gtex_results.columns
    if column not in ('tissueSiteDetailId', 'pValue', 'nes')
]
# reindex (unlike plain column selection) creates any missing leading column,
# so a gene without eQTL records yields an empty table instead of an error.
gtex_results = gtex_results.reindex(columns=columns)
gtex_results.sort_values('pValue')

## IDG

<https://druggablegenome.net/>

We query the [Harmonizome API](https://maayanlab.cloud/Harmonizome/documentation) for associations with various biological entities in a standardized set of numerous omics datasets, as detailed [here](https://maayanlab.cloud/Harmonizome/about).

In [None]:
@lru_cache()
def idg_harmonizome_geneInfo(gene, showAssociations=True, version='1.0'):
    '''Fetch the Harmonizome record for a gene.

    Parameters
    ----------
    gene : str
        Gene symbol, inserted as a path segment of the request URL.
    showAssociations : bool
        Sent JSON-encoded (``'true'``/``'false'``) as the ``showAssociations``
        query parameter.
    version : str
        API version, inserted as a path segment of the request URL.

    Returns
    -------
    The decoded JSON response. Results are memoized per argument combination,
    so repeated calls return the same object; do not mutate it.

    Raises
    ------
    requests.HTTPError
        If the API answers with a 4xx/5xx status.
    '''
    # safe='' also escapes "/" so a value can never add extra path segments.
    version_segment = urllib.parse.quote(version, safe='')
    gene_segment = urllib.parse.quote(gene, safe='')
    res = requests.get(
        f"https://maayanlab.cloud/Harmonizome/api/{version_segment}/gene/{gene_segment}",
        params=dict(
            showAssociations=json.dumps(showAssociations),
        ),
    )
    # Surface HTTP failures directly instead of returning an error payload.
    res.raise_for_status()
    return res.json()

In [None]:
# Fetch and show the raw Harmonizome record for the selected gene.
idg_geneInfo = idg_harmonizome_geneInfo(gene)
display(RenderJSON(idg_geneInfo))

# Flatten each association together with its nested 'geneSet' fields into one
# row, discard incomplete rows, and order by standardized value.
idg_geneAssociations = (
    pd.DataFrame([
        dict(**association['geneSet'], **association)
        for association in idg_geneInfo['associations']
    ])
    .drop(columns=['geneSet'])
    .dropna()
    .sort_values(by='standardizedValue')
)
display(idg_geneAssociations)

## LINCS

<https://lincsproject.org/>

<b style="color: red">Work in progress -- we need to update the backend to support this query which will return the rank of a gene's expression for each signature; giving us a bunch of drugs in LINCS for which this gene is highly ranked</b>

### Signature Commons

We query the LINCS data to identify drugs that can be used to significantly upregulate or downregulate a target human gene using the [Signature Commons LINCS API](https://maayanlab.cloud/sigcom-lincs/#/API).

In [None]:
@lru_cache()
def lincs_sigcom_meta_gene(gene):
    '''Look up Signature Commons entities whose symbol or synonyms match a gene.

    Parameters
    ----------
    gene : str
        Value matched against ``meta.symbol`` and ``meta.synonyms``.

    Returns
    -------
    The decoded JSON response of the metadata ``entities`` endpoint. Results
    are memoized per gene, so repeated calls return the same object; do not
    mutate it.

    Raises
    ------
    requests.HTTPError
        If the API answers with a 4xx/5xx status.
    '''
    entity_filter = {
        'where': {
            'or': [
                {'meta.symbol': gene},
                {'meta.synonyms': gene},
            ],
        }
    }
    res = requests.get(
        'https://maayanlab.cloud/sigcom-lincs/metadata-api/entities',
        params=dict(filter=json.dumps(entity_filter)),
    )
    # Surface HTTP failures directly instead of returning an error payload.
    res.raise_for_status()
    return res.json()

@lru_cache()
def lincs_sigcom_data_gene(geneInfo):
    '''Run a rank enrichment for the given entities against every listed library.

    Parameters
    ----------
    geneInfo : str
        JSON text encoding a list of entity objects, each with an ``'id'`` key.
        It is passed as a string so that the argument is hashable for
        ``lru_cache``.

    Returns
    -------
    tuple
        The ``'results'`` entries of every library's response, concatenated in
        library order. A materialized tuple is returned rather than a
        generator: ``lru_cache`` would otherwise cache the generator object and
        hand back an exhausted iterator on every later call with the same
        argument.

    Raises
    ------
    requests.HTTPError
        If any request answers with a 4xx/5xx status.
    '''
    # Loop-invariant: decode the entity ids once rather than once per library.
    entity_ids = [entity['id'] for entity in json.loads(geneInfo)]
    listing = requests.post('https://maayanlab.cloud/sigcom-lincs/data-api/api/v1/listdata')
    listing.raise_for_status()
    results = []
    for library in listing.json()['repositories']:
        res = requests.post(
            'https://maayanlab.cloud/sigcom-lincs/data-api/api/v1/enrich/rank',
            json=dict(
                database=library['uuid'],
                entities=entity_ids,
                signatures=[],
            ),
        )
        res.raise_for_status()
        results.extend(res.json()['results'])
    return tuple(results)

In [None]:
# Resolve the user-selected gene (previously hard-coded to 'STAT3') to its
# Signature Commons entities.
lincs_geneInfo = lincs_sigcom_meta_gene(gene)
# Encode once: the JSON text is both the hashable argument of the cached
# lookup below and the form in which RenderJSON accepts list payloads.
lincs_geneInfo_json = json.dumps(lincs_geneInfo)
display(RenderJSON(lincs_geneInfo_json))
lincs_drugs = list(lincs_sigcom_data_gene(lincs_geneInfo_json))
display(RenderJSON(json.dumps(lincs_drugs)))

### Enrichr

We query Enrichr via [its API](https://maayanlab.cloud/Enrichr/help#api) to identify gene set libraries and terms that contain the gene.

In [None]:
@lru_cache()
def enrichr_geneMap(gene):
    '''Fetch Enrichr's gene map for a gene.

    Parameters
    ----------
    gene : str
        Gene symbol sent as the ``gene`` query parameter.

    Returns
    -------
    The decoded JSON response; the request asks for ``json=true`` and
    ``setup=true``. Results are memoized per gene, so repeated calls return
    the same object; do not mutate it.

    Raises
    ------
    requests.HTTPError
        If the API answers with a 4xx/5xx status.
    '''
    enabled = json.dumps(True)
    res = requests.get(
        'https://maayanlab.cloud/Enrichr/genemap',
        params=dict(gene=gene, json=enabled, setup=enabled),
    )
    # Surface HTTP failures directly instead of returning an error payload.
    res.raise_for_status()
    return res.json()

In [None]:
# Fetch and show the raw Enrichr gene map for the selected gene.
lincs_enrichr_geneInfo = enrichr_geneMap(gene)
display(RenderJSON(lincs_enrichr_geneInfo))

# One row per (library name, term) containing the gene, joined with the
# library descriptions on the library name.
lincs_enrichr_results = pd.merge(
    left=pd.DataFrame(
        [
            dict(name=name, term=term)
            for name, terms in lincs_enrichr_geneInfo['gene'].items()
            for term in terms
        ],
        # Explicit columns keep the merge key present even when the gene
        # appears in no library (otherwise the merge raises KeyError).
        columns=['name', 'term'],
    ),
    right=pd.DataFrame(lincs_enrichr_geneInfo['descriptions']),
    on='name',
)
lincs_enrichr_results

## GlyGen

<https://www.glygen.org/>

GlyGen collects extensive protein product information related to Glycans and permits accessing that information over [their API](https://api.glygen.org/).

In [None]:
@lru_cache()
def glygen_geneNameSearch(recommended_gene_name, organism_taxon_id=9606, verify=True):
    '''Search GlyGen by recommended gene name.

    Parameters
    ----------
    recommended_gene_name : str
        Gene name placed in the JSON-encoded ``query`` parameter.
    organism_taxon_id : int
        Taxonomy id placed under ``organism.id`` in the query.
    verify : bool or str
        Forwarded to ``requests.get(verify=...)``. Certificate verification is
        on by default. The block previously hard-coded ``verify=False`` as a
        workaround for SSL errors on one machine; pass ``verify=False`` (or a
        CA bundle path) explicitly if that workaround is still needed.

    Returns
    -------
    The decoded JSON response. Results are memoized per argument combination,
    so repeated calls return the same object; do not mutate it.

    Raises
    ------
    requests.HTTPError
        If the API answers with a 4xx/5xx status.
    '''
    query = dict(
        recommended_gene_name=recommended_gene_name,
        organism=dict(
            id=organism_taxon_id
        ),
    )
    res = requests.get(
        'https://api.glygen.org/directsearch/gene/',
        params=dict(query=json.dumps(query)),
        verify=verify,
    )
    # Surface HTTP failures directly instead of returning an error payload.
    res.raise_for_status()
    return res.json()

In [None]:
# Fetch and show the raw GlyGen search response for the selected gene.
glygen_geneInfo = glygen_geneNameSearch(gene)
display(RenderJSON(glygen_geneInfo))
# Summarise the first hit; say so explicitly when GlyGen has no match
# instead of failing with an IndexError.
if glygen_geneInfo['results']:
    display(pd.Series(glygen_geneInfo['results'][0]))
else:
    print(f"GlyGen returned no results for {gene}")

## Tissue Consensus

We try to merge the common results into a single table that presents the evidence from each independent DCC side by side. Tissue was chosen simply because most DCCs seemed to have some knowledge to provide about it. Ideally we would also merge on other connections, such as diseases.

In [None]:
# Collapse the GTEx eQTL records to one row per tissue site, ordered by
# p-value, then add the columns shared by every per-DCC evidence table.
# NOTE(review): .max() keeps each tissue's largest (least significant)
# p-value -- confirm this is intended rather than .min().
gtex_tissue_rankings = (
    gtex_results
    .groupby('tissueSiteDetailId')['pValue']
    .max()
    .sort_values()
    .rename_axis('name')
    .reset_index(name='pValue')
    .assign(
        rank=lambda frame: frame['pValue'].rank(),
        present=lambda frame: (frame['pValue'] < 0.05).astype(int),
        **{'from': 'gtex'},
    )
)
gtex_tissue_rankings

In [None]:
# Rank the IDG (Harmonizome) associations whose name mentions "tissue".
# The score is the negated absolute standardized value, so an ascending sort
# puts the largest-magnitude associations first.
idg_geneAssociations['score'] = -idg_geneAssociations['standardizedValue'].abs()
idg_tissue_rankings = (
    idg_geneAssociations
    .loc[idg_geneAssociations['name'].str.lower().str.contains('tissue')]
    .set_index('name')['score']
    .sort_values()
    .reset_index()
    .assign(
        rank=lambda frame: (-frame['score'].abs()).rank(),
        # zscore > 3 to be significant
        present=lambda frame: (frame['score'].abs() > 3).astype(int),
        **{'from': 'idg'},
    )
)
idg_tissue_rankings

In [None]:
# Keep the Enrichr hits whose library name mentions "tissue": the library
# becomes the evidence and the matched term becomes the tissue name.
lincs_tissue_evidence = (
    lincs_enrichr_results[
        lincs_enrichr_results['name'].str.lower().str.contains('tissue')
    ]
    .rename(columns={'name': 'evidence', 'term': 'name'})
    .drop(columns='description')
    # Tag provenance like the GTEx/IDG/GlyGen tables do, so these rows are
    # not left with a blank 'from' in the aggregated view.
    .assign(present=1, **{'from': 'lincs'})
)
lincs_tissue_evidence

In [None]:
# Get curated knowledge about the gene's expression in tissues according to GlyGen.
# Tolerate a search with no hits, or a first hit without 'expression_tissue',
# by producing an empty table instead of raising IndexError/KeyError.
glygen_results = glygen_geneInfo['results']
glygen_expression = glygen_results[0].get('expression_tissue', []) if glygen_results else []
glygen_tissue_evidence = pd.DataFrame(
    [
        {
            # 'tissue' is either a plain string or an object carrying a 'name'.
            'name': result['tissue'] if isinstance(result['tissue'], str) else result['tissue']['name'],
            'present': int(result['present'] == 'yes'),
            'evidence': ','.join(
                f"{evidence['id']} ({evidence['database']})"
                for evidence in result['evidence']
            ),
        }
        for result in glygen_expression
    ],
    # Explicit columns keep the sort key available when there are no rows.
    columns=['name', 'present', 'evidence'],
).sort_values('present', ascending=False)
glygen_tissue_evidence['from'] = 'glygen'
glygen_tissue_evidence

This table shows an aggregated view of the evidence from the different DCCs for the presence of the gene's expression in each tissue.

In [None]:
# Stack the per-DCC tissue tables into one frame, order rows by rank and then
# by presence, and blank out the cells a source does not provide.
aggregated_tissue_rankings = (
    pd.concat(
        [
            gtex_tissue_rankings,
            idg_tissue_rankings,
            lincs_tissue_evidence,
            glygen_tissue_evidence,
        ],
        ignore_index=True,
    )
    .sort_values(by=['rank', 'present'])
    .fillna('')
)
show(aggregated_tissue_rankings, maxBytes=0)