# Expansion of nodes using Monarch APIs
* https://api.monarchinitiative.org/api/#/
* https://scigraph-ontology.monarchinitiative.org/scigraph/docs/#/

In [2]:
import pandas as pd
import requests
import json
from pandas.io.json import json_normalize

In [3]:
# input gene
gene = 'NCBIGene:55768'

# output files
path = 'ngly1-human-expansion/ngly1_human'

## Graph queries (SciGraph)

In [78]:
# api address
api = 'https://scigraph-ontology.monarchinitiative.org/scigraph'
endpoint = '/graph'

In [79]:
# get neighbors (JSON content)
r = requests.get('{}{}/neighbors/{}'.format(api,endpoint,gene))
r.headers

{'Transfer-Encoding': 'chunked', 'Connection': 'keep-alive', 'Server': 'nginx/1.10.0 (Ubuntu)', 'Content-Encoding': 'gzip', 'Content-Type': 'application/json', 'Vary': 'Accept, Accept-Encoding', 'Cache-Control': 'no-transform, max-age=7200', 'Date': 'Tue, 23 May 2017 21:21:37 GMT'}

In [80]:
# Read results 
r.json()

{'edges': [{'meta': {'isDefinedBy': ['https://data.monarchinitiative.org/ttl/ncbigene.ttl'],
    'lbl': ['http://purl.obolibrary.org/obo/IAO_0000136']},
   'obj': 'NCBIGene:55768',
   'pred': 'IAO:0000136',
   'sub': 'PMID:12477932'},
  {'meta': {'isDefinedBy': ['https://data.monarchinitiative.org/ttl/ncbigene.ttl'],
    'lbl': ['http://purl.obolibrary.org/obo/IAO_0000136']},
   'obj': 'NCBIGene:55768',
   'pred': 'IAO:0000136',
   'sub': 'PMID:14749736'},
  {'meta': {'isDefinedBy': ['https://data.monarchinitiative.org/ttl/ncbigene.ttl'],
    'lbl': ['http://purl.obolibrary.org/obo/IAO_0000136']},
   'obj': 'NCBIGene:55768',
   'pred': 'IAO:0000136',
   'sub': 'PMID:15610852'},
  {'meta': {'isDefinedBy': ['https://data.monarchinitiative.org/ttl/ncbigene.ttl'],
    'lbl': ['in taxon']},
   'obj': 'NCBITaxon:9606',
   'pred': 'RO:0002162',
   'sub': 'NCBIGene:55768'},
  {'meta': {'isDefinedBy': ['https://data.monarchinitiative.org/ttl/ncbigene.ttl'],
    'lbl': ['http://purl.obolibrary.o

In [81]:
with open('{}_neighbors.json'.format(path), 'w') as f:
    json.dump(r.json(), f, sort_keys=True, indent=4)

In [82]:
neighbors_df = json_normalize(r.json(), 'nodes')
neighbors_df.head()

Unnamed: 0,id,lbl,meta
0,PMID:10831608,,"{'types': ['cliqueLeader', 'Node', 'NamedIndiv..."
1,MonarchData:ncbigene.ttl,,"{'types': ['Ontology', 'cliqueLeader', 'Node']}"
2,PMID:22190034,,"{'types': ['cliqueLeader', 'Node', 'NamedIndiv..."
3,PMID:16055502,,"{'types': ['cliqueLeader', 'Node', 'NamedIndiv..."
4,OMIM:610661,,"{'category': ['gene', 'sequence feature'], 'ty..."


In [83]:
neighbors_df[neighbors_df.id.str.startswith('NCBIGene')]
# the other ncbigene terms are discontinued ones (replaced by ncbi:55768)

Unnamed: 0,id,lbl,meta
5,NCBIGene:55768,NGLY1,{'http://www.geneontology.org/formats/oboInOwl...
14,NCBIGene:95041,LOC95041,{'http://www.w3.org/2000/01/rdf-schema#label':...
34,NCBIGene:80175,FLJ12409,{'http://www.w3.org/2002/07/owl#deprecated': [...


In [84]:
# Analyze neighbours' types
neighbors_df.id.unique()
print('nodes: {}'.format(len(neighbors_df.id.unique())))

nodes: 38


In [85]:
neighbors_df['node_type'] = neighbors_df.id.apply(lambda x: x.split(':')[0])
neighbors_df.node_type.value_counts()

PMID           28
NCBIGene        3
CHR             1
NCBITaxon       1
OMIM            1
MonarchData     1
ENSEMBL         1
SO              1
HGNC            1
Name: node_type, dtype: int64

In [86]:
# conclusion: neighbors are taxon, chr position, so, xref (ensembl, hgnc, omim) and provenance

In [87]:
# Filters
# Filter by interaction_type. BUT what are the strings per interaction_type???

In [88]:
# get reachable nodes (JSON content)
r = requests.get('{}{}/reachablefrom/{}'.format(api,endpoint,gene))
r.headers

{'Connection': 'keep-alive', 'Server': 'nginx/1.10.0 (Ubuntu)', 'Content-Length': '446', 'Content-Encoding': 'gzip', 'Content-Type': 'application/json', 'Vary': 'Accept, Accept-Encoding', 'Cache-Control': 'no-transform, max-age=7200', 'Date': 'Tue, 23 May 2017 21:21:40 GMT'}

In [89]:
# Read results 
r.json()
reach_df = json_normalize(r.json(), 'nodes')
reach_df.head()

Unnamed: 0,id,lbl,meta
0,CHR:9606chr3p24.2,,"{'types': ['cliqueLeader', 'Node', 'NamedIndiv..."
1,MonarchData:ncbigene.ttl,,"{'types': ['Ontology', 'cliqueLeader', 'Node']}"
2,NCBITaxon:9606,Homo sapiens,{'http://www.geneontology.org/formats/oboInOwl...
3,SO:0001217,protein_coding_gene,{'http://www.geneontology.org/formats/oboInOwl...


In [90]:
reach_df.id.unique()
print('nodes: {}'.format(len(reach_df.id.unique())))

nodes: 4


In [91]:
reach_df['node_type'] = reach_df.id.apply(lambda x: x.split(':')[0])
reach_df.node_type.value_counts()

CHR            1
MonarchData    1
SO             1
NCBITaxon      1
Name: node_type, dtype: int64

In [92]:
# conclusion: reachables are taxon, chr, so

## Edge Queries (Monarch)

In [4]:
# api address
api = 'https://api.monarchinitiative.org/api'
endpoint = '/bioentity'

In [94]:
# get gene info
r = requests.get('{}{}/gene/{}'.format(api,endpoint,gene))
#r = requests.get('https://api.monarchinitiative.org/api/bioentity/gene/%s/phenotypes/'%gene, headers={'Accept':'application/json'})
r.headers
r.json()

{'categories': ['gene', 'sequence feature'],
 'chromosome': {'categories': None,
  'consider': None,
  'deprecated': None,
  'description': None,
  'id': None,
  'label': None,
  'replaced_by': None,
  'synonyms': None,
  'taxon': {'id': None, 'label': None},
  'types': None,
  'xrefs': None},
 'consider': None,
 'deprecated': None,
 'description': None,
 'disease_associations': [{'evidence_graph': {'edges': [{'obj': ':.well-known/genid/genoGM25347',
      'pred': 'GENO:0000222',
      'sub': ':.well-known/genid/person-3200-1'},
     {'obj': 'NCBIGene:55768',
      'pred': 'GENO:0000418',
      'sub': 'ClinVarVariant:126423'},
     {'obj': ':.well-known/genid/610661.0004-610661.0005',
      'pred': 'GENO:0000382',
      'sub': ':.well-known/genid/genoGM25990'},
     {'obj': 'OMIM:615273',
      'pred': 'GENO:0000840',
      'sub': 'ClinVarVariant:50962'},
     {'obj': 'OMIM:615273',
      'pred': 'RO:0002200',
      'sub': ':.well-known/genid/person-3200-1'},
     {'obj': 'OMIM:615273'

In [95]:
with open('{}_id.json'.format(path), 'w') as f:
    json.dump(r.json(), f, sort_keys=True, indent=4)

In [96]:
# get gene-gene interactions
r = requests.get('{}{}/gene/{}/interactions/'.format(api,endpoint,gene))
nassociations = len(r.json()['objects'])
print('Number of nodes associated are {}'.format(nassociations))

Number of nodes associated are 17


In [97]:
with open('{}_interactions.json'.format(path), 'w') as f:
    json.dump(r.json(), f, sort_keys=True, indent=4)

In [98]:
ggi_df = json_normalize(r.json(), 'objects')
ggi_df.columns = ['gene_id']
ggi_df.head(2)

Unnamed: 0,gene_id
0,NCBIGene:10906
1,NCBIGene:11124


In [99]:
ggi_df.to_csv('{}_ggi.tsv'.format(path), sep='\t', index=False, header=True)

In [100]:
# get gene-phenotype
r = requests.get('{}{}/gene/{}/phenotypes/'.format(api,endpoint,gene))
nassociations = len(r.json()['associations'])
print('Number of nodes associated are {}'.format(nassociations))

Number of nodes associated are 8


In [101]:
with open('{}_phenotypes.json'.format(path), 'w') as f:
    json.dump(r.json(), f, sort_keys=True, indent=4)

In [102]:
gph_df = json_normalize(r.json(), 'objects')
gph_df.columns = ['phenotype_id']
gph_df.head(2)

Unnamed: 0,phenotype_id
0,HP:0001252
1,HP:0001263


In [103]:
gph_df.to_csv('{}_gene_phenotype.tsv'.format(path), sep='\t', index=False, header=True)

In [104]:
# Get gene-disease
r = requests.get('{}{}/gene/{}/diseases/'.format(api,endpoint,gene))
nassociations = len(r.json()['objects'])
print('Number of nodes associated are {}'.format(nassociations))

Number of nodes associated are 1


In [105]:
with open('{}_diseases.json'.format(path), 'w') as f:
    json.dump(r.json(), f, sort_keys=True, indent=4)

In [106]:
r_dict = r.json()
gda_df = json_normalize(r_dict, 'objects')
gda_df.columns = ['disease_id']
gda_df.head(2)

Unnamed: 0,disease_id
0,OMIM:615273


In [107]:
gda_df.to_csv('{}_gene_disease.tsv'.format(path), sep='\t', index=False, header=True)

In [108]:
# get gene-function
r = requests.get('{}{}/gene/{}/function/'.format(api, endpoint, gene))
nassociations = len(r.json()['objects'])
print('Number of nodes associated are {}'.format(nassociations))
r.json()

Number of nodes associated are 0


{'associations': [],
 'compact_associations': None,
 'facet_counts': {'isa_partof_closure': {}, 'taxon_label': {}},
 'facet_pivot': None,
 'numFound': None,
 'objects': [],
 'start': None}

In [109]:
# get gene-expressedInAnatomy
r = requests.get('{}{}/gene/{}/expressed/'.format(api, endpoint, gene))
nassociations = len(r.json()['objects'])
print('Number of nodes associated are {}'.format(nassociations))

Number of nodes associated are 20


In [110]:
with open('{}_expressed.json'.format(path), 'w') as f:
    json.dump(r.json(), f, sort_keys=True, indent=4)

In [111]:
r_data = r.json()
gaa_df = json_normalize(r_data, 'objects')
gaa_df.columns = ['anatomy_id']
gaa_df.head(2)

Unnamed: 0,anatomy_id
0,CL:0002328
1,UBERON:0000007


In [112]:
gaa_df.to_csv('{}_gene_anatomy.tsv'.format(path), sep='\t', index=False, header=True)

In [113]:
# get gene-pub
r = requests.get('{}{}/gene/{}/pubs/'.format(api, endpoint, gene))
nassociations = len(r.json()['objects'])
print('Number of nodes associated are {}'.format(nassociations))

Number of nodes associated are 0


In [114]:
# get gene-homolog
r = requests.get('{}{}/gene/{}/homologs/'.format(api, endpoint, gene))
nassociations = len(r.json()['objects'])
print('Number of nodes associated are {}'.format(nassociations))

Number of nodes associated are 14


In [115]:
with open('{}_homologs.json'.format(path), 'w') as f:
    json.dump(r.json(), f, sort_keys=True, indent=4)

In [116]:
r_dict = r.json()
gha_df = json_normalize(r_dict, 'objects')
gha_df.columns = ['homolog_id']
gha_df.head(2)

Unnamed: 0,homolog_id
0,FlyBase:FBgn0033050
1,MGI:1913276


In [117]:
gha_df.to_csv('{}_gene_homolog.tsv'.format(path), sep='\t', index=False, header=True)

## Query Wikidata for Knowldege.Bio

In [5]:
# api address:
api = 'https://query.wikidata.org/sparql'

In [6]:
def generate_table(header, results):
    df = {}
    for res_d in results:
        for head in header:
            df[head] = []
        
    for res_d in results:
        for head in header:
            try:
                value = res_d[head]['value']
            except:
                value = 'NA'
            if value.startswith('http'):
                namespace, value = value.rsplit('/', 1)
            aux = df[head]
            aux.append(value)
            df[head] = aux
            
    try:
        results_df = pd.DataFrame.from_dict(df)
    except e:
        print(e)
        print(df)
        
    results_df = results_df[header]
    return results_df

In [120]:
# get NCBIGene:
query = """SELECT DISTINCT ?id ?item ?itemLabel (group_concat(distinct ?itemaltLabel; separator="|") as ?altLabel) ?itemDesc
WHERE
{
  {?item wdt:P351 ?id .} # ncbi gene
  values ?id {"55768"}
  OPTIONAL{
  ?item skos:altLabel ?itemaltLabel .
    FILTER(LANG(?itemaltLabel) = "en")
  ?item schema:description ?itemDesc .
    FILTER(LANG(?itemDesc) = "en")
  }
  SERVICE wikibase:label { bd:serviceParam wikibase:language "en" }
}
group by ?item ?id ?itemLabel ?itemDesc"""

In [121]:
r = requests.post(api, data={'query': query}, headers={'Accept':'application/sparql-results+json'})

In [122]:
header_l = r.json()['head']['vars']
results_l = r.json()['results']['bindings']
df = generate_table(header_l, results_l)
df

Unnamed: 0,id,item,itemLabel,altLabel,itemDesc
0,55768,Q18042037,NGLY1,CDDG|CDG1V|N-glycanase 1|NGLY1|PNG1|PNGase,protein-coding gene in the species Homo sapiens


In [123]:
df.to_csv('{}_subject_concept_kb.tsv'.format(path), sep='\t', index=False, header=True)

In [124]:
# get uberon
# get input_list
input_df = pd.read_table('/home/nuria/workspace/monarch/ngly1_human_expansion/ngly1_human_gene_anatomy.tsv')
input_df = input_df[input_df.anatomy_id.str.contains('UBERON')]
input_df['id'] = input_df.anatomy_id.apply(lambda x: '"' + str(x.split(':')[1]) + '"')
input_l = list(input_df['id'])
input_s = ' '.join(input_l)
input_s

'"0000007" "0000014" "0000074" "0000167" "0000922" "0000945" "0001103" "0001166" "0001264" "0001304" "0001723" "0002358" "0004358" "0004359" "0004736" "0004801" "0007371" "0007616" "0011876"'

In [125]:
query = """SELECT DISTINCT ?id ?item ?itemLabel (group_concat(distinct ?itemaltLabel; separator="|") as ?altLabel) ?itemDesc
WHERE
{
  ?item wdt:P1554 ?id .
  values ?id {""" + input_s + """}
  OPTIONAL{
  ?item skos:altLabel ?itemaltLabel .
    FILTER(LANG(?itemaltLabel) = "en")
  ?item schema:description ?itemDesc .
    FILTER(LANG(?itemDesc) = "en")
  }
  SERVICE wikibase:label { bd:serviceParam wikibase:language "en" }
}
group by ?item ?id ?itemLabel ?itemDesc"""

In [126]:
r = requests.post(api, data={'query': query}, headers={'Accept':'application/sparql-results+json'})
#r.json()

In [127]:
header_l = r.json()['head']['vars']
results_l = r.json()['results']['bindings']
df = generate_table(header_l, results_l)
df.head(2)

Unnamed: 0,id,item,itemLabel,altLabel,itemDesc
0,945,Q1029907,stomach,,
1,7371,Q30030505,superior surface of tongue,,


In [128]:
# merge input with response
input_df = input_df[['anatomy_id']]
input_df['id'] = input_df.anatomy_id.apply(lambda x: x.split(':')[1])
output_df = input_df.merge(df, how="left")
output_df = output_df[['anatomy_id', 'item', 'itemLabel', 'altLabel', 'itemDesc']]
output_df.head(2)

Unnamed: 0,anatomy_id,item,itemLabel,altLabel,itemDesc
0,UBERON:0000007,Q156871,pituitary gland,hypophysis,endocrine gland
1,UBERON:0000014,Q30029801,zone of skin,,


In [129]:
output_df.to_csv('{}_anatomies_concept_kb.tsv'.format(path), sep='\t', index=False, header=True)

In [9]:
# get orthologs (ncbigene)
# get input_list
input_df = pd.read_table('/home/nuria/workspace/monarch/ngly1_human_expansion/ngly1_human_gene_homolog.tsv')
input_df = input_df[input_df.homolog_id.str.contains('NCBIGene')]
input_df['id'] = input_df.homolog_id.apply(lambda x: '"' + str(x.split(':')[1]) + '"')
input_l = list(input_df['id'])
input_s = ' '.join(input_l)
input_s

'"100031666" "100059456" "100522951" "100562703" "100682555" "173028" "361014" "460233" "533736" "553627" "700842" "835019"'

In [10]:
# query
query = """SELECT DISTINCT ?id ?item ?itemLabel (group_concat(distinct ?itemaltLabel; separator="|") as ?altLabel) ?itemDesc
WHERE
{
  {?item wdt:P351 ?id .} # ncbi gene
  values ?id {""" + input_s + """}
  OPTIONAL{
  ?item skos:altLabel ?itemaltLabel .
    FILTER(LANG(?itemaltLabel) = "en")
  ?item schema:description ?itemDesc .
    FILTER(LANG(?itemDesc) = "en")
  }
  SERVICE wikibase:label { bd:serviceParam wikibase:language "en" }
}
group by ?item ?id ?itemLabel ?itemDesc"""

In [11]:
r = requests.post(api, data={'query': query}, headers={'Accept':'application/sparql-results+json'})
#r.json()

In [12]:
header_l = r.json()['head']['vars']
results_l = r.json()['results']['bindings']
df = generate_table(header_l, results_l)
df.head(2)

Unnamed: 0,id,item,itemLabel,altLabel,itemDesc
0,533736,Q30031022,NGLY1,N-glycanase 1,protein-coding gene in the species Bos taurus
1,700842,Q30031074,NGLY1,N-glycanase 1,protein-coding gene in the species Macaca mulatta


In [13]:
# merge input with response
input_df = input_df[['homolog_id']]
input_df['id'] = input_df.homolog_id.apply(lambda x: x.split(':')[1])
output_df = input_df.merge(df, how="left")
output_df = output_df[['homolog_id', 'item', 'itemLabel', 'altLabel', 'itemDesc']]
output_df.head(2)

Unnamed: 0,homolog_id,item,itemLabel,altLabel,itemDesc
0,NCBIGene:100031666,Q30030707,NGLY1,N-glycanase 1,protein-coding gene in the species Monodelphis...
1,NCBIGene:100059456,Q30030795,NGLY1,N-glycanase 1,protein-coding gene in the species Equus caballus


In [14]:
ncbigene_df = output_df

In [15]:
# get orthologs (mgi)
# get input_list
input_df = pd.read_table('/home/nuria/workspace/monarch/ngly1_human_expansion/ngly1_human_gene_homolog.tsv')
input_df = input_df[input_df.homolog_id.str.contains('MGI')]
input_df['id'] = input_df.homolog_id.apply(lambda x: '"' + str(x) + '"')
input_l = list(input_df['id'])
input_s = ' '.join(input_l)
input_s

'"MGI:1913276"'

In [16]:
# query
query = """SELECT DISTINCT ?id ?item ?itemLabel (group_concat(distinct ?itemaltLabel; separator="|") as ?altLabel) ?itemDesc
WHERE
{
  {?item wdt:P671 ?id .} # mgi gene
  values ?id {""" + input_s + """}
  OPTIONAL{
  ?item skos:altLabel ?itemaltLabel .
    FILTER(LANG(?itemaltLabel) = "en")
  ?item schema:description ?itemDesc .
    FILTER(LANG(?itemDesc) = "en")
  }
  SERVICE wikibase:label { bd:serviceParam wikibase:language "en" }
}
group by ?item ?id ?itemLabel ?itemDesc"""

r = requests.post(api, data={'query': query}, headers={'Accept':'application/sparql-results+json'})
#r.json()

header_l = r.json()['head']['vars']
results_l = r.json()['results']['bindings']
df = generate_table(header_l, results_l)
df.head(2)

# merge input with response
input_df = input_df[['homolog_id']]
input_df['id'] = input_df.homolog_id
output_df = input_df.merge(df, how="left")
output_df = output_df[['homolog_id', 'item', 'itemLabel', 'altLabel', 'itemDesc']]
mgi_df = output_df
output_df.head(2)

Unnamed: 0,homolog_id,item,itemLabel,altLabel,itemDesc
0,MGI:1913276,Q18258933,Ngly1,N-glycanase 1|PNGase|Ngly1|1110002C09Rik|Png1,protein-coding gene in the species Mus musculus


In [17]:
# get orthologs (fly) 
# get input_list
input_df = pd.read_table('/home/nuria/workspace/monarch/ngly1_human_expansion/ngly1_human_gene_homolog.tsv')
input_df = input_df[input_df.homolog_id.str.contains('FlyBase')]
input_df['id'] = input_df.homolog_id.apply(lambda x: '"' + str(x.split(':')[1]) + '"')
input_l = list(input_df['id'])
input_s = ' '.join(input_l)
input_s

'"FBgn0033050"'

In [18]:
# query
query = """SELECT DISTINCT ?id ?item ?itemLabel (group_concat(distinct ?itemaltLabel; separator="|") as ?altLabel) ?itemDesc
WHERE
{
  {?item wdt:P3852 ?id .} # ncbi gene
  values ?id {""" + input_s + """}
  OPTIONAL{
  ?item skos:altLabel ?itemaltLabel .
    FILTER(LANG(?itemaltLabel) = "en")
  ?item schema:description ?itemDesc .
    FILTER(LANG(?itemDesc) = "en")
  }
  SERVICE wikibase:label { bd:serviceParam wikibase:language "en" }
}
group by ?item ?id ?itemLabel ?itemDesc"""

r = requests.post(api, data={'query': query}, headers={'Accept':'application/sparql-results+json'})
#r.json()

header_l = r.json()['head']['vars']
results_l = r.json()['results']['bindings']
df = generate_table(header_l, results_l)
df.head(2)

# merge input with response
input_df = input_df[['homolog_id']]
input_df['id'] = input_df.homolog_id.apply(lambda x: x.split(':')[1])
output_df = input_df.merge(df, how="left")
output_df = output_df[['homolog_id', 'item', 'itemLabel', 'altLabel', 'itemDesc']]
fly_df = output_df
output_df.head(2)

Unnamed: 0,homolog_id,item,itemLabel,altLabel,itemDesc
0,FlyBase:FBgn0033050,Q29733620,Pngl,PNGase|CG7865|Dmel_CG7865|Dmel\CG7865|png1|PNG...,protein-coding gene in the species Drosophila ...


In [19]:
output_df = pd.concat([fly_df,mgi_df,ncbigene_df])
output_df

Unnamed: 0,homolog_id,item,itemLabel,altLabel,itemDesc
0,FlyBase:FBgn0033050,Q29733620,Pngl,PNGase|CG7865|Dmel_CG7865|Dmel\CG7865|png1|PNG...,protein-coding gene in the species Drosophila ...
0,MGI:1913276,Q18258933,Ngly1,N-glycanase 1|PNGase|Ngly1|1110002C09Rik|Png1,protein-coding gene in the species Mus musculus
0,NCBIGene:100031666,Q30030707,NGLY1,N-glycanase 1,protein-coding gene in the species Monodelphis...
1,NCBIGene:100059456,Q30030795,NGLY1,N-glycanase 1,protein-coding gene in the species Equus caballus
2,NCBIGene:100522951,Q30030866,NGLY1,N-glycanase 1,protein-coding gene in the species Sus scrofa
3,NCBIGene:100562703,Q30030907,NGLY1,N-glycanase 1,protein-coding gene in the species Anolis caro...
4,NCBIGene:100682555,Q30030945,NGLY1,N-glycanase 1,protein-coding gene in the species Canis lupus...
5,NCBIGene:173028,Q29697143,png-1,CELE_F56G4.5|Peptide-N(4)-(N-acetyl-beta-gluco...,protein-coding gene in the species Caenorhabdi...
6,NCBIGene:361014,Q24413748,Ngly1,N-glycanase 1|Ngly1,protein-coding gene in the species Rattus norv...
7,NCBIGene:460233,Q30030979,NGLY1,N-glycanase 1,protein-coding gene in the species Pan troglod...


In [20]:
output_df.to_csv('{}_homologs_concept_kb.tsv'.format(path), sep='\t', index=False, header=True)

In [142]:
# get NCBIGene (ggi):
# get input_list
input_df = pd.read_table('/home/nuria/workspace/monarch/ngly1_human_expansion/ngly1_human_ggi.tsv')
input_df['id'] = input_df.gene_id.apply(lambda x: '"' + str(x.split(':')[1]) + '"')
input_l = list(input_df['id'])
input_s = ' '.join(input_l)
input_s

'"10906" "11124" "137886" "1956" "26270" "29979" "351" "55968" "57159" "5886" "5887" "6733" "7415" "79139" "83606" "84866" "8702"'

In [143]:
# query
query = """SELECT DISTINCT ?id ?item ?itemLabel (group_concat(distinct ?itemaltLabel; separator="|") as ?altLabel) ?itemDesc
WHERE
{
  {?item wdt:P351 ?id .} # ncbi gene
  values ?id {""" + input_s + """}
  OPTIONAL{
  ?item skos:altLabel ?itemaltLabel .
    FILTER(LANG(?itemaltLabel) = "en")
  ?item schema:description ?itemDesc .
    FILTER(LANG(?itemDesc) = "en")
  }
  SERVICE wikibase:label { bd:serviceParam wikibase:language "en" }
}
group by ?item ?id ?itemLabel ?itemDesc"""

r = requests.post(api, data={'query': query}, headers={'Accept':'application/sparql-results+json'})
#r.json()

In [144]:
header_l = r.json()['head']['vars']
results_l = r.json()['results']['bindings']
df = generate_table(header_l, results_l)

# merge input with response
input_df = input_df[['gene_id']]
input_df['id'] = input_df.gene_id.apply(lambda x: x.split(':')[1])
output_df = input_df.merge(df, how="left")
output_df = output_df[['gene_id', 'item', 'itemLabel', 'altLabel', 'itemDesc']]
output_df.head(2)

Unnamed: 0,gene_id,item,itemLabel,altLabel,itemDesc
0,NCBIGene:10906,Q18035937,TRAFD1,FLN29|TRAF-type zinc finger domain containing ...,protein-coding gene in the species Homo sapiens
1,NCBIGene:11124,Q18036138,FAF1,CGI-03|FAF1|Fas associated factor 1|hFAF1|HFAF...,protein-coding gene in the species Homo sapiens


In [145]:
output_df.to_csv('{}_ncbi_interactors_concept_kb.tsv'.format(path), sep='\t', index=False, header=True)

In [146]:
# get phenotypes (hp)
# get input_list
input_df = pd.read_table('/home/nuria/workspace/monarch/ngly1_human_expansion/ngly1_human_gene_phenotype.tsv')
input_df = input_df[input_df.phenotype_id.str.contains('HP')]
input_df['id'] = input_df.phenotype_id.apply(lambda x: '"' + str(x) + '"')
input_l = list(input_df['id'])
input_s = ' '.join(input_l)
input_s

'"HP:0001252" "HP:0001263" "HP:0001773" "HP:0002910" "HP:0004305" "HP:0009830" "HP:0012448" "HP:0200055"'

In [147]:
# query
query = """SELECT DISTINCT ?id ?item ?itemLabel (group_concat(distinct ?itemaltLabel; separator="|") as ?altLabel) ?itemDesc
WHERE
{
  {?item wdt:P3841 ?id .} # mgi gene
  values ?id {""" + input_s + """}
  OPTIONAL{
  ?item skos:altLabel ?itemaltLabel .
    FILTER(LANG(?itemaltLabel) = "en")
  ?item schema:description ?itemDesc .
    FILTER(LANG(?itemDesc) = "en")
  }
  SERVICE wikibase:label { bd:serviceParam wikibase:language "en" }
}
group by ?item ?id ?itemLabel ?itemDesc"""

r = requests.post(api, data={'query': query}, headers={'Accept':'application/sparql-results+json'})
#r.json()

In [148]:
header_l = r.json()['head']['vars']
results_l = r.json()['results']['bindings']
df = generate_table(header_l, results_l)
df.head(2)

# merge input with response
input_df = input_df[['phenotype_id']]
input_df['id'] = input_df.phenotype_id
output_df = input_df.merge(df, how="left")
output_df = output_df[['phenotype_id', 'item', 'itemLabel', 'altLabel', 'itemDesc']]
output_df.head(2)

Unnamed: 0,phenotype_id,item,itemLabel,altLabel,itemDesc
0,HP:0001252,Q1753547,Hypotonia,Low Muscle Tone|Muscular hypotonia,Abnormally low muscle tone
1,HP:0001263,Q1142806,developmental disability,,


In [149]:
output_df.to_csv('{}_phenotypes_concept_kb.tsv'.format(path), sep='\t', index=False, header=True)

In [150]:
# get taxon, chr, so