In [1]:
import pandas as pd
import requests
from pprint import pprint
import ontobio

In [2]:
# get gene sets from github
base_url = "https://raw.githubusercontent.com/NCATS-Tangerine/cq-notebooks/master/FA_gene_sets/"
FA_1_core_complex = "FA_1_core_complex.txt"
columns = ['gene_curie', 'gene_symbol']
fa_genes = pd.read_csv(base_url + FA_1_core_complex, sep='\t', names=columns)

In [3]:
fa_genes

Unnamed: 0,gene_curie,gene_symbol
0,NCBIGene:2175,FANCA
1,NCBIGene:2187,FANCB
2,NCBIGene:2176,FANCC
3,NCBIGene:2178,FANCE
4,NCBIGene:2188,FANCF
5,NCBIGene:2189,FANCG
6,NCBIGene:55120,FANCL
7,NCBIGene:57697,FANCM
8,NCBIGene:2177,FANCD2
9,NCBIGene:55215,FANCI


In [4]:
class BioLinkWrapper(object):
    def __init__(self):
        self.endpoint = 'https://api.monarchinitiative.org/api/'
        
    def get_gene(self, gene_curie):
        url = '{}bioentity/gene/{}'.format(self.endpoint, gene_curie)
        response = requests.get(url)
        return response.json()

    def get_orthologs(self, gene_curie):
        url = '{}bioentity/gene/{}/homologs/'.format(self.endpoint, gene_curie)
        response = requests.get(url)
        return response.json()

    def get_phenotypes(self,gene_curie):
        url = '{}bioentity/gene/{}/phenotypes/'.format(self.endpoint, gene_curie)
        response = requests.get(url)
        return response.json()
    
    def get_diseases(self, gene_curie):
        url = '{}bioentity/gene/{}/diseases/'.format(self.endpoint, gene_curie)
        response = requests.get(url)
        return response.json()
    
    def get_interactions(self, gene_curie):
        url = '{}bioentity/gene/{}/interactions/'.format(self.endpoint, gene_curie)
        response = requests.get(url)
        return response.json()
    
    def get_functions(self, gene_curie):
        url = '{}bioentity/gene/{}/functions/'.format(self.endpoint, gene_curie)
        response = requests.get(url)
        return response.json()
    
    def get_disease_models(self, disease_curie):
        url = '{}/bioentity/disease/{}/models/'.format(self.endpoint, disease_curie)
        response = requests.get(url)
        return response.json()
    
    def get_all_phenotypes_for_taxon(self, taxon_curie):
        # get phenotypes associated with taxid
        url = "mart/gene/phenotype/{}".format(self.endpoint, taxon_curie)
        response = requests.get(url)
        return response.json()
    
    def get_gene_function(self, gene_curie):
        # get function associated with gene
        url = "{}bioentity/gene/{}/function/".format(self.endpoint, gene_curie)
        response = requests.get(url)
        return response.json()
    

# Get orthologs

In [5]:
fa_genes_orthologs = list()
BLW = BioLinkWrapper()
for index, gene in fa_genes.iterrows():
    orthologs = BLW.get_orthologs(gene[0])
    for orth in orthologs['associations']:
        orth_dict = {
            'gene_name': gene[1],
            'gene_curie': gene[0],
            'ortholog_name': orth['object']['label'],
            'ortholog_curie': orth['object']['id'],
            'orth_tax_label': orth['object']['taxon']['label'],
            'orth_tax_id': orth['object']['taxon']['id']
        }
        fa_genes_orthologs.append(orth_dict)

In [6]:
orth_columns  = [ 'gene_name', 'gene_curie', 'ortholog_name', 'ortholog_curie', 'orth_tax_label', 'orth_tax_id']
orth_df = pd.DataFrame(data=fa_genes_orthologs, columns=orth_columns)
orth_df

Unnamed: 0,gene_name,gene_curie,ortholog_name,ortholog_curie,orth_tax_label,orth_tax_id
0,FANCA,NCBIGene:2175,Fanca,RGD:1311380,Rattus norvegicus,NCBITaxon:10116
1,FANCA,NCBIGene:2175,FANCA,NCBIGene:100621453,Sus scrofa,NCBITaxon:9823
2,FANCA,NCBIGene:2175,FANCA,NCBIGene:100052495,Equus caballus,NCBITaxon:9796
3,FANCA,NCBIGene:2175,Fanca,MGI:1341823,Mus musculus,NCBITaxon:10090
4,FANCA,NCBIGene:2175,FANCA,NCBIGene:100080126,Ornithorhynchus anatinus,NCBITaxon:9258
5,FANCA,NCBIGene:2175,FANCA,NCBIGene:415854,Gallus gallus,NCBITaxon:9031
6,FANCA,NCBIGene:2175,FANCA,NCBIGene:454393,Pan troglodytes,NCBITaxon:9598
7,FANCA,NCBIGene:2175,FANCA,NCBIGene:618375,Bos taurus,NCBITaxon:9913
8,FANCA,NCBIGene:2175,FANCA,NCBIGene:100027499,Monodelphis domestica,NCBITaxon:13616
9,FANCB,NCBIGene:2187,FANCB,NCBIGene:616549,Bos taurus,NCBITaxon:9913


In [58]:
def query_mygene(gene_curie):
    gene_curie = gene_curie.replace('NCBIGene:', '')
    url = 'https://mygene.info/v3/query?q={}&fields=all'.format(gene_curie)
    hit = requests.get(url)
    hit = hit.json()
    ukb = 'UniProtKB:{}'.format(hit['hits'][0]['uniprot']['Swiss-Prot'])
    return ukb

In [None]:
orth_df.head()

In [7]:
from ontobio.ontol_factory import OntologyFactory

# Create ontology object, for GO
# Transparently uses remote SPARQL service.
# (May take a few seconds to run first time, Jupyter will show '*'. BE PATIENT, do
# not re-execute cell)
ofactory = OntologyFactory()
ont = ofactory.create('go')

In [8]:
from ontobio.io.gafparser import GafParser
from ontobio.assoc_factory import AssociationSetFactory

p = GafParser()
afactory = AssociationSetFactory()

def make_assocs(group):
    url = "http://geneontology.org/gene-associations/gene_association.{}.gz".format(group)
    if group == 'human':
        url = "http://geneontology.org/gene-associations/goa_human.gaf.gz"
    assocs = p.parse(url)
    return afactory.create_from_assocs(assocs, ontology=ont)

In [9]:
asoc_mouse = make_assocs('mgi')

In [54]:
mouse_orthologs = orth_df.loc[orth_df['orth_tax_id'] == 'NCBITaxon:10090']
mouse_ortho_sims = list()
for index, row in mouse_orthologs.iterrows():
    gene_name = row[0]
    gene_curie = row[1]
    ortholog_name = row[2]
    ortholog_curie = row[3]
    ortholog_taxon_curie = row[4]
    ortholog_taxon_name = row[5]
    mo = 'MGI:{}'.format(ortholog_curie)
    for mgene in list(asoc_mouse.subject_label_map.keys()):
        amScore = asoc_mouse.jaccard_similarity(mo, mgene)     
        if amScore > .7 and amScore < 1:
            mouse_ortho_sims.append({
                    'gene_name': gene_name,
                    'gene_curie': gene_curie,
                    'ortholog_name': ortholog_name,
                    'ortholog_curie': row[3],
                    'ortholog_taxon_curie': row[4],
                    'ortholog_taxon_name': row[5],
                    'non_fa_hit_name': asoc_mouse.label(mgene),
                    'non_fa_hit_curie': mgene.replace('MGI:MGI:', 'MGI:'),
                    'sim_score' : amScore
                })

In [55]:
mouse_sims_columns = [ 'gene_name','gene_curie', 'ortholog_name', 
                      'ortholog_curie', 'ortholog_taxon_curie', 
                      'ortholog_taxon_name', 'non_fa_hit_name', 'non_fa_hit_curie', 'sim_score' ]
mouse_sims_df = pd.DataFrame(data=mouse_ortho_sims, columns=mouse_sims_columns )

In [56]:
mouse_sims_df

Unnamed: 0,gene_name,gene_curie,ortholog_name,ortholog_curie,ortholog_taxon_curie,ortholog_taxon_name,non_fa_hit_name,non_fa_hit_curie,sim_score
0,FANCA,NCBIGene:2175,Fanca,MGI:1341823,Mus musculus,NCBITaxon:10090,Fancd2,MGI:2448480,0.705882
1,FANCE,NCBIGene:2178,Fance,MGI:1920025,Mus musculus,NCBITaxon:10090,Cfap45,MGI:1919120,0.833333
2,FANCE,NCBIGene:2178,Fance,MGI:1920025,Mus musculus,NCBITaxon:10090,Fnbp4,MGI:1860513,0.75
3,FANCE,NCBIGene:2178,Fance,MGI:1920025,Mus musculus,NCBITaxon:10090,Rnu6,MGI:97989,0.807692
4,FANCE,NCBIGene:2178,Fance,MGI:1920025,Mus musculus,NCBITaxon:10090,Rprd2,MGI:1922387,0.709677
5,FANCE,NCBIGene:2178,Fance,MGI:1920025,Mus musculus,NCBITaxon:10090,Caskin2,MGI:2157062,0.730769
6,FANCE,NCBIGene:2178,Fance,MGI:1920025,Mus musculus,NCBITaxon:10090,Zbtb34,MGI:2685195,0.833333
7,FANCE,NCBIGene:2178,Fance,MGI:1920025,Mus musculus,NCBITaxon:10090,Pwwp2b,MGI:2142008,0.833333
8,FANCE,NCBIGene:2178,Fance,MGI:1920025,Mus musculus,NCBITaxon:10090,Evx1os,MGI:1917843,0.709677
9,FANCE,NCBIGene:2178,Fance,MGI:1920025,Mus musculus,NCBITaxon:10090,Lyrm1,MGI:1921169,0.714286


In [57]:
asoc_hsap = make_assocs('human')

In [64]:
hsap_ortho_sims = list()
for index, row in fa_genes.iterrows():
    ukb = query_mygene(row[0])
    for hgene in list(asoc_hsap.subject_label_map.keys()):
        amScore = asoc_hsap.jaccard_similarity(ukb, hgene)
        if amScore > .7 and amScore < 1:
            hsap_ortho_sims.append({
                    'gene_name': row[0],
                    'gene_curie': row[1],
                    'non_fa_hit_name': asoc_hsap.label(hgene),
                    'non_fa_hit_curie': hgene,
                    'sim_score' : amScore
                })

In [66]:
hsap_ortho_sim_columns = [ 'gene_name','gene_curie', 'non_fa_hit_name', 'non_fa_hit_curie', 'sim_score' ]
hsap_ortho_sims_df = pd.DataFrame(data=hsap_ortho_sims, columns=hsap_ortho_sim_columns )

In [67]:
hsap_ortho_sims_df

Unnamed: 0,gene_name,gene_curie,non_fa_hit_name,non_fa_hit_curie,sim_score
0,NCBIGene:2187,FANCB,FAAP24,UniProtKB:Q9BTP7,0.90566
1,NCBIGene:2187,FANCB,FAAP100,UniProtKB:Q0VG06,0.813559
2,NCBIGene:2187,FANCB,FANCE,UniProtKB:Q9HB96,0.958333
3,NCBIGene:2187,FANCB,INIP,UniProtKB:Q9NRY2,0.865385
4,NCBIGene:2178,FANCE,FAAP24,UniProtKB:Q9BTP7,0.867925
5,NCBIGene:2178,FANCE,FANCB,UniProtKB:Q8NB91,0.958333
6,NCBIGene:2178,FANCE,FAAP100,UniProtKB:Q0VG06,0.779661
7,NCBIGene:2178,FANCE,INIP,UniProtKB:Q9NRY2,0.826923


In [68]:
pwd

'/Users/timputman/CODE/NCATS/cq_notebooks/cq-notebooks/Orange_Demonstrator_1_CQs/OrangeQ1.2_GO_Functional_Similarity'