# Data Distillery Use Case
For a specific drug transporter or drug processing enzyme, find the tissue where these transporters and enzymes are highly expressed (GTEx), and the drugs that may induce or suppress the expression of these genes (LINCS).

In [1]:
import os
import requests
import ipycytoscape
from py2neo import Graph
import seaborn as sns
import random
from dotenv import load_dotenv
import pandas as pd
from IPython.display import display, Markdown
import json

### Loading environment variables and connecting to the Neo4j database

In [2]:
load_dotenv()
graph = Graph(os.getenv('NEO4j_URL'), auth=(os.getenv('NEO4J_USER'), os.getenv('NEO4J_PASSWORD')))

In [3]:
gene = "CES1"
limit = 5
table = 1

display(Markdown("## %s gene"%gene))

## CES1 gene

Carboxylesterase 1

In [4]:


if "HGNC:" in gene:
    endpoint = 'https://maayanlab.cloud/sigcom-lincs/metadata-api/entities?filter={"where": {"meta.dbxrefs": {"fullTextSearch": "%s"}}}'%gene
else:
    endpoint = 'https://maayanlab.cloud/sigcom-lincs/metadata-api/entities?filter={"where": {"meta.symbol":"%s"}}'%gene

res = requests.get(endpoint)
if res.ok and res.json():
    lincs_id = res.json()[0]["id"]
    gene_symbol = res.json()[0]["meta"]["symbol"] + " gene"
    hgnc = [i  for i in res.json()[0]["meta"]["dbxrefs"] if i.startswith("HGNC:")] 
    if hgnc:
        gene_id = hgnc[0].replace("HGNC:HGNC:", "HGNC:")
    ensemblid = res.json()[0]["meta"]["ensemblid"]

### Cypher query function
This function is built so we can start building cypher queries quickly (same as using Neo4j console). Feel free to look at the following libraries for more information:
* [py2neo](https://py2neo.org/2021.1/)
* [cytoscape.js](https://js.cytoscape.org/)
* [ipycytoscape](https://github.com/cytoscape/ipycytoscape)

In [5]:
style = [{
                            "selector": 'node',
                            "style": {
                            'background-color': 'data(color)',
                            'border-color': 'data(borderColor)',
                            'border-width': 'data(borderWidth)',
                            'label': 'data(label)',
                            "text-valign": "center",
                            "text-halign": "center",
                            'width': "50",
                            'height': "50",
                            }
                        },
                        {
                            "selector": 'edge',
                            "style": {
                            'curve-style': 'straight',
                            'line-color': 'data(lineColor)',
                            'width': '3',
                            'label': 'data(relation)',
                            "text-rotation": "autorotate",
                            "text-margin-x": "0px",
                            "text-margin-y": "0px",
                            'font-size': '12px',
                            'target-arrow-shape': "data(directed)",
                            'target-endpoint': 'outside-to-node',
                            'source-endpoint': 'outside-to-node',
                            'target-arrow-color': 'data(lineColor)',
                            }
                        },
                        {
                            "selector": 'node.highlight',
                            "style": {
                                'border-color': 'gray',
                                'border-width': '2px',
                                'font-weight': 'bold',
                                'font-size': '18px',
                                'width': "90",
                                'height': "90",
                            }
                        },
                        {
                            "selector": 'node.focused',
                            "style": {
                                'border-color': 'gray',
                                'border-width': '2px',
                                'font-weight': 'bold',
                                'font-size': '18px',
                                'width': "90",
                                'height': "90",
                            }
                        },
                        {
                            "selector": 'edge.focusedColored',
                            "style": {
                                'line-color': '#F8333C',
                                'width': '6'
                            }
                        },
                        {
                            "selector": 'node.semitransp',
                            "style":{ 'opacity': '0.5' }
                        },
                        {
                            "selector": 'node.focusedSemitransp',
                            "style":{ 'opacity': '0.5' }
                        },
                        {
                            "selector": 'edge.colored',
                            "style": {
                                'line-color': '#F8333C',
                                'target-arrow-color': '#F8333C',
                                'width': '6'
                            }
                        },
                        {
                            "selector": 'edge.semitransp',
                            "style":{ 'opacity': '0.5' }
                        },
                        {
                            "selector": 'edge.focusedSemitransp',
                            "style":{ 'opacity': '0.5' }
                        }]

In [6]:
node_types = {
    ":Code": "CODE",
    ":Concept": "CUI",
    ":Term": "name",
}

node_ids = {
    ":Code": "CODE",
    ":Concept": "CUI",
    ":Term": "SUI",
}

In [7]:
palette = sns.color_palette().as_hex()
def cypher_query(query, input_gene, table):
    results = graph.run(query).data()
    nodes = {}
    edges = []
    colors = {}
    relations = {}
    for i in results:
        for vals in i.values():
            # for node in vals.nodes:
            #     node_type = str(node.labels)
            #     node_id = node[node_ids[node_type]]
            #     node_label = node[node_types[node_type]]
            #     nodes[node_id] = node_label
            for relation in vals.relationships:
                start = relation.nodes[0]
                start_type = str(start.labels)
                start_id = start[node_ids[start_type]]
                start_label = start[node_types[start_type]]
                end = relation.nodes[1]
                end_type = str(end.labels)
                end_id = end[node_ids[end_type]]
                end_label = end[node_types[end_type]]
                relation_name = type(relation).__name__
                # print(start_type, relation_name, end_type)
                
                if start_id not in nodes:
                    nodes[start_id] = {"id": start_id}
                # if end_id not in nodes:
                #     nodes[end_id] = {}
                if end_type == ":Code":
                    nodes[start_id]["label"] = end_label
                elif end_type == ":Term":
                    nodes[start_id]["label"] = end_label
                elif end_type == ":Concept":
                    edges.append({
                        "source": start_id,
                        "target": end_id,
                        "relation": relation_name,
                        "SAB": relation["SAB"],
                        "evidence_class": relation["evidence_class"],
                    })
                if relation_name not in relations:
                    relations[relation_name] = pd.DataFrame("-", index=[], columns=["name", "relation", "SAB", "evidence_class"])
    for edge in edges:
        start = nodes[edge["source"]]
        end = nodes[edge["target"]]
        relation = edge["relation"]
        df = relations[relation]
        if relation not in colors:
            colors[relation] = palette[len(colors) % len(sns.color_palette())]
        color = colors[relation]
        if start['id'] == input_gene or start['label'] == input_gene:
            df.at[end["id"], "relation"] = edge["relation"]
            df.at[end["id"], "SAB"] = edge["SAB"]
            df.at[end["id"], "evidence_class"] = edge["evidence_class"]    
            df.at[end["id"], "name"] = end.get("label", end.get("id", None))
            nodes[end["id"]]["color"] = color
        elif end['id'] == input_gene or end['label'] == input_gene:
            df.at[start["id"], "relation"] = edge["relation"]
            df.at[start["id"], "SAB"] = edge["SAB"]
            df.at[start["id"], "evidence_class"] = edge["evidence_class"]    
            df.at[start["id"], "name"] = start.get("label", start.get("id", None))
            nodes[start["id"]]["color"] = color
    if "positively_regulated_by" in relations:
        df = relations["positively_regulated_by"]
        df.index.name = "id"
        display(df)
        display(Markdown("**Table %d** %s is up-regulated by the following drugs."%(table, gene_symbol)))
        table += 1
    if "negatively_regulated_by" in relations:
        df = relations["negatively_regulated_by"]
        df.index.name = "id"
        display(df)
        display(Markdown("**Table %d** %s is down-regulated by the following drugs."%(table, gene_symbol)))
        table += 1
    if "expressed_in" in relations:
        df = relations["expressed_in"]
        df.index.name = "id"
        display(df)
        display(Markdown("**Table %d** %s is expressed in the following tissues."%(table, gene_symbol)))
        table += 1
    cytoscapeobj = ipycytoscape.CytoscapeWidget()
    cytoscapeobj.graph.add_graph_from_json({
        "nodes": nodes.values(),
        "edges": edges
    }) 
    cytoscapeobj.set_style(style)
    display(cytoscapeobj)
    return table

In [8]:
def cypher(query, table, input_gene):
    if query == "": return
    results = graph.run(query).data()
    nodes = {}
    edges = []
    colors = {}
    relations = {}
    for i in results:
        for vals in i.values():
            for node in vals.nodes:
                label = str(node.labels)
                if label not in colors:
                    colors[label] = palette[len(colors) % len(sns.color_palette())]
                color = colors[label]
                n = {"kind": label, "color": color}
                for k,v in node.items():
                    n[k] = v
                if 'GTEXEXP' in n:
                    n["label"] = n['GTEXEXP']
                nodes[n["id"]] = n
            for relation in vals.relationships:
                r = {
                    "kind": "relation",
                    "source": relation.nodes[0]["id"],
                    "target": relation.nodes[1]["id"]
                    }
                for k,v in relation.items():
                    r[k] = v
                r["relation"] = r["relation"].replace("_"," ")
                edges.append(r)
                if r["relation"] not in relations:
                    relations[r["relation"]] = pd.DataFrame("-", index=[], columns=["name", "relation", "SAB", "evidence"])
    for edge in edges:
        start = nodes[edge["source"]]
        end = nodes[edge["target"]]
        relation = edge["relation"]
        df = relations[relation]
        if relation not in colors:
            colors[relation] = palette[len(colors) % len(sns.color_palette())]
        color = colors[relation]
        if start['id'] == input_gene or start['label'] == input_gene:
            df.at[end["id"], "relation"] = edge["relation"]
            df.at[end["id"], "SAB"] = edge["SAB"]
            df.at[end["id"], "evidence"] = edge.get("evidence", "-")
            df.at[end["id"], "name"] = end.get("label", end.get("id", None))
            # nodes[end["id"]]["color"] = color
        elif end['id'] == input_gene or end['label'] == input_gene:
            df.at[start["id"], "relation"] = edge["relation"]
            df.at[start["id"], "SAB"] = edge["SAB"]
            df.at[start["id"], "evidence"] = edge.get("evidence", "-")
            df.at[start["id"], "name"] = start.get("label", start.get("id", None))
            # nodes[start["id"]]["color"] = color
    if "positively regulated by" in relations:
        df = relations["positively regulated by"]
        df.index.name = "id"
        display(df)
        display(Markdown("**Table %d** %s is up-regulated by the following drugs."%(table, gene_symbol)))
        table += 1
    if "negatively regulated by" in relations:
        df = relations["negatively regulated by"]
        df.index.name = "id"
        display(df)
        display(Markdown("**Table %d** %s is down-regulated by the following drugs."%(table, gene_symbol)))
        table += 1
    if "expressed in" in relations:
        df = relations["expressed in"]
        df.index.name = "id"
        display(df)
        display(Markdown("**Table %d** %s is expressed in the following tissues."%(table, gene_symbol)))
        table += 1
    
    cytoscapeobj = ipycytoscape.CytoscapeWidget()
    cytoscapeobj.graph.add_graph_from_json({
        "nodes": list(nodes.values()),
        "edges": edges
    }) 
    cytoscapeobj.set_style(style)
    display(cytoscapeobj)
    return table


## Knowledge Graph relationships

In [9]:
# query = '''
# MATCH p=(a:Code {CODE: "%s"})-[]-(b: Concept)-[r1:negatively_regulated_by]-(c: Concept)-[r2]-(d: Term) RETURN p ORDER BY r1.evidence_class DESC LIMIT %d
# UNION
# MATCH p=(a:Code {CODE: "%s"})-[]-(b: Concept)-[r1:positively_regulated_by]-(c: Concept)-[r2]-(d: Term) RETURN p ORDER BY r1.evidence_class DESC LIMIT %d
# UNION
# MATCH p=(a:Code {CODE: "%s"})-[]-(b: Concept)-[r1:expressed_in]-(c: Concept)-[r2]-(d: Code) RETURN p ORDER BY r1.evidence_class DESC LIMIT %d
# '''%(gene_id, limit, gene_id, limit, gene_id, limit)

# table = cypher_query(query, gene_id, table)

In [10]:
query = '''
MATCH p=(a:`Gene or Genome` {label: "%s"})-[r1:`negatively regulates`]-(b:Drug) RETURN p ORDER BY r1.evidence DESC LIMIT %d
UNION
MATCH p=(a:`Gene or Genome` {label: "%s"})-[r1:`positively regulates`]-(b:Drug) RETURN p ORDER BY r1.evidence DESC LIMIT %d
UNION
MATCH p=(a:`Gene or Genome` {label: "%s"})-[r1:`expressed in`]-(b:GTEXEXP) RETURN p ORDER BY r1.evidence DESC LIMIT %d
'''%(gene_symbol, limit, gene_symbol, limit, gene_symbol, limit)

table = cypher(query, table, gene_symbol)

Unnamed: 0_level_0,name,relation,SAB,evidence
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
GTEXEXP:ENSG00000198848-12-Liver CUI,GTEXEXP:ENSG00000198848-12-Liver,expressed in,GTEXEXP,500.0
GTEXEXP:ENSG00000198848-12-Lung CUI,GTEXEXP:ENSG00000198848-12-Lung,expressed in,GTEXEXP,200.0
GTEXEXP:ENSG00000198848-12-Artery-Aorta CUI,GTEXEXP:ENSG00000198848-12-Artery-Aorta,expressed in,GTEXEXP,78.0
GTEXEXP:ENSG00000198848-12-Artery-Tibial CUI,GTEXEXP:ENSG00000198848-12-Artery-Tibial,expressed in,GTEXEXP,75.0
GTEXEXP:ENSG00000198848-12-Colon-Sigmoid CUI,GTEXEXP:ENSG00000198848-12-Colon-Sigmoid,expressed in,GTEXEXP,75.0


**Table 1** CES1 gene is expressed in the following tissues.

CytoscapeWidget(cytoscape_layout={'name': 'cola'}, cytoscape_style=[{'selector': 'node', 'style': {'background…

## Querying the API

In [11]:
signatures = None
query = {
    "where": {
        "library": "54198d6e-fe17-5ef8-91ac-02b425761653"
    }
}
res = requests.get("https://maayanlab.cloud/sigcom-lincs/metadata-api/entities/%s/signatures?filter=%s"%(lincs_id, json.dumps(query)))
if res.ok:
    signatures = res.json()
    up_sigs = []
    down_sigs = []
    for i in signatures:
        if i["direction"] == "up":
            up_sigs.append(i)
        else:
            down_sigs.append(i)
    up_sigs = sorted(up_sigs, key=lambda x: x['score'], reverse=True)[0:limit]
    down_sigs = sorted(down_sigs, key=lambda x: x['score'])[0:limit]

KeyboardInterrupt: 

In [None]:
nodes = {
    gene_id: {"id": gene_id, "label": gene}
}
edges = []

In [None]:
up_df = pd.DataFrame("-", index=[], columns=["name", "direction", "cell_line", "tissue", "uberon", "score"])
for i in up_sigs:
    sig_id = i["meta"]["local_id"]
    pert = i["meta"]["pert_name"]
    cell_line = i["meta"]["cell_line"]
    tissue = i["meta"]["tissue"]
    uberon = i["meta"]["anatomy"]
    score = i["score"]
    up_df.loc[sig_id] = [pert, "up", cell_line, tissue, uberon, score]
    nodes[sig_id] = {
        "id": sig_id,
        "label": "%s_%s_%s"%(pert, cell_line, tissue),
        "color": palette[0]
    }
    edges.append({
        "source": sig_id,
        "relation": "up-regulates",
        "target": gene_id,
        "score": score
    })
up_df.index.name = "signatures"
display(Markdown("### Up-regulated drugs"))
display(up_df)
display(Markdown("**Table %d** LINCS Chemical perurbagens that up-regulates %s ranked by characteristic direction coefficient."%(table, gene_symbol)))
table += 1

down_df = pd.DataFrame("-", index=[], columns=["name", "direction", "cell_line", "tissue", "uberon", "score"])
for i in down_sigs:
    sig_id = i["meta"]["local_id"]
    pert = i["meta"]["pert_name"]
    cell_line = i["meta"]["cell_line"]
    tissue = i["meta"]["tissue"]
    uberon = i["meta"]["anatomy"]
    score = i["score"]
    down_df.loc[sig_id] = [pert, "up", cell_line, tissue, uberon, score]
    nodes[sig_id] = {
        "id": sig_id,
        "label": "%s_%s_%s"%(pert, cell_line, tissue),
        "color": palette[1]
    }
    edges.append({
        "source": sig_id,
        "relation": "down-regulates",
        "target": gene_id,
        "score": score
    })
down_df.index.name = "signatures"
display(Markdown("### Down-regulated drugs"))
display(down_df)
display(Markdown("**Table %d** LINCS Chemical perurbagens that down-regulates %s ranked by characteristic direction coefficient."%(table, gene_symbol)))
table += 1

### Up-regulated drugs

Unnamed: 0_level_0,name,direction,cell_line,tissue,uberon,score
signatures,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
REP.B023_THP1_24H_B19_palmitoylethanolamide_2.22uM,palmitoylethanolamide,up,THP1,blood,UBERON:0000178,0.081235
REP.B020_HUVEC.A_24H_D05_nelarabine_0.03uM,nelarabine,up,HUVEC,umbilical cord,UBERON:0002331,0.079654
REP.B005_A549_24H_H10_merimepodib_0.08uM,merimepodib,up,A549,lung,UBERON:0002048,0.076532
REP.A023_HELA_24H_H01_atorvastatin_10uM,atorvastatin,up,HELA,uterine cervix,UBERON:0000002,0.073997
LTC004_HME1_24H_L04_WYE-125132_0.02uM,WYE-125132,up,HME1,breast,UBERON:0000310,0.073741


**Table 4** LINCS Chemical perurbagens that up-regulates CES1 gene ranked by characteristic direction coefficient.

### Down-regulated drugs

Unnamed: 0_level_0,name,direction,cell_line,tissue,uberon,score
signatures,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
REP.A018_THP1_24H_B19_fluticasone-propionate_10uM,fluticasone-propionate,up,THP1,blood,UBERON:0000178,-0.078147
CPC019_MCF7_6H_M17_BRD-K67414432_10uM,BRD-K67414432,up,MCF7,breast,UBERON:0000310,-0.077881
ASG003_A549_6H_H10_marbofloxacin_10uM,marbofloxacin,up,A549,lung,UBERON:0002048,-0.07723
ASG003_XC.P933_24H_B21_CV-1808_0.12uM,CV-1808,up,XC.P933,epithelium,UBERON:0000483,-0.075018
REP.B017_HEK293_24H_C04_ibutamoren_0.08uM,ibutamoren,up,HEK293,kidney,UBERON:0002113,-0.074348


**Table 5** LINCS Chemical perurbagens that down-regulates CES1 gene ranked by characteristic direction coefficient.

## GTEx

In [None]:
res = requests.get("https://gtexportal.org/api/v2/reference/geneSearch?geneId=%s"%ensemblid)
try:
    gencodeId = res.json()["data"][0]["gencodeId"]
    res = requests.get("https://gtexportal.org/api/v2/expression/medianGeneExpression?gencodeId=%s"%gencodeId)
    if res.ok:
        exp = res.json()["data"]
        exp = sorted(exp, key=lambda x: x['median'], reverse=True)[0:limit]

        tissue_exp = pd.DataFrame("-", index=[], columns=[ "uberon", "median expression"])
        for i in exp:
            tissue_exp.loc[i["tissueSiteDetailId"]] = [i["ontologyId"], i["median"]]
            nodes[i["tissueSiteDetailId"]] = {
                "id": i["tissueSiteDetailId"],
                "label": i["tissueSiteDetailId"],
                "color": palette[2]
            }
            edges.append({
                "source": gene_id,
                "relation": "expressed in",
                "target": i["tissueSiteDetailId"],
                "score": i["median"]
            })
        tissue_exp.index.name = "tissue"
        display(tissue_exp)
        display(Markdown("**Table %d** Tissues expressing %s ranked by median expression."%(table, gene_symbol)))
        table += 1
except Exception as e:
    print(e)
    print("Could not resolve gencode id")

Unnamed: 0_level_0,uberon,median expression
tissue,Unnamed: 1_level_1,Unnamed: 2_level_1
Liver,UBERON:0001114,403.205
Lung,UBERON:0008952,180.276
Artery_Aorta,UBERON:0001496,77.1052
Colon_Sigmoid,UBERON:0001159,74.7568
Artery_Tibial,UBERON:0007610,74.3284


**Table 6** Tissues expressing CES1 gene ranked by median expression.

In [None]:
cytoscapeobj = ipycytoscape.CytoscapeWidget()
cytoscapeobj.graph.add_graph_from_json({
    "nodes": nodes.values(),
    "edges": edges
}) 
cytoscapeobj.set_style(style)
display(cytoscapeobj)

CytoscapeWidget(cytoscape_layout={'name': 'cola'}, cytoscape_style=[{'selector': 'node', 'style': {'background…

In [None]:
import csv
counter = 0
phen_sabs = set()
with open("../neo4j/import/CUI-CUIs.csv") as o:
    csv_reader = csv.reader(o)
    for row in csv_reader:
        counter+=1
        if row[2] == "has_phenotype":
            phen_sabs.add(row[3])

In [None]:
phen_sabs

{'DOID', 'EFO', 'OMIM'}

In [None]:
127920727 - 1432805

126487922

In [None]:
payload = [
    {"name": "positively regulated by", "limit": 5},
    {"name": "negatively regulated by", "limit": 5}, 
    {"name": "expressed in", "limit": 5}]
res = requests.get('http://localhost:3000/api/knowledge_graph?start=Gene%20or%20Genome&start_term=CES1%20gene&start_field=label&relation='+json.dumps(payload))
results = res.json()

In [None]:
nodes = {}
edges = []
for i in results:
    if i["data"]["kind"] == 'Relation':
        edges.append(i)
    else:
        nodes[i["data"]["id"]] = i["data"]
cytoscapeobj = ipycytoscape.CytoscapeWidget()
cytoscapeobj.graph.add_graph_from_json({
    "nodes": nodes.values(),
    "edges": edges
}) 
cytoscapeobj.set_style(style)
display(cytoscapeobj)

CytoscapeWidget(cytoscape_layout={'name': 'cola'}, cytoscape_style=[{'selector': 'node', 'style': {'background…

In [None]:
nodes

{'C1413347': {'id': 'C1413347',
  'kind': 'Gene or Genome',
  'label': 'CES1 gene',
  'properties': {'LNC': 'LNC LP203636-8',
   'MTH': 'MTH NOCODE',
   'ENSEMBL': 'ENSEMBL ENSG00000198848.13',
   'OMIM': 'OMIM 114835',
   'ENTREZ lowerbound': 55802851,
   'type_combined': 'Gene or Genome',
   'label': 'CES1 gene',
   'ENTREZ': 'ENTREZ 1066',
   'type': 'Gene or Genome',
   'ENSEMBL upperbound': 55833337,
   'HGNC': 'HGNC HGNC:1863',
   'NCI': 'NCI C116034',
   'id': 'C1413347',
   'ENTREZ upperbound': 55812974,
   'ENSEMBL lowerbound': 55802851},
  'color': '#ff8a80',
  'node_type': 1},
 'UFVCQ0hFTSA2NzM2MQ==': {'id': 'UFVCQ0hFTSA2NzM2MQ==',
  'kind': 'Drug',
  'label': 'Thiomersal',
  'properties': {'PUBCHEM': 'PUBCHEM 67361',
   'type_combined': 'Drug',
   'id': 'UFVCQ0hFTSA2NzM2MQ==',
   'label': 'Thiomersal',
   'type': 'Drug'},
  'color': '#C5E1A5',
  'node_type': 0},
 'UFVCQ0hFTSAyMzUzODI=': {'id': 'UFVCQ0hFTSAyMzUzODI=',
  'kind': 'Drug',
  'label': 'SIB-1893',
  'properties': 