In [1]:
import pandas as pd
import requests
from pprint import pprint
from collections import defaultdict

In [2]:
# get gene sets from github
base_url = "https://raw.githubusercontent.com/NCATS-Tangerine/cq-notebooks/master/FA_gene_sets/"
FA_1_core_complex = "FA_1_core_complex.txt"
columns = ['gene_curie', 'gene_symbol']
fa_genes = pd.read_csv(base_url + FA_1_core_complex, sep='\t', names=columns)

In [3]:
fa_genes

Unnamed: 0,gene_curie,gene_symbol
0,NCBIGene:2175,FANCA
1,NCBIGene:2187,FANCB
2,NCBIGene:2176,FANCC
3,NCBIGene:2178,FANCE
4,NCBIGene:2188,FANCF
5,NCBIGene:2189,FANCG
6,NCBIGene:55120,FANCL
7,NCBIGene:57697,FANCM
8,NCBIGene:2177,FANCD2
9,NCBIGene:55215,FANCI


In [6]:
def get_orthologs(gene_curie):
    url = 'https://api.monarchinitiative.org/api/bioentity/gene/{}/homologs/'.format(gene_curie)
    response = requests.get(url)
    return response.json()

def get_phenotypes(gene_curie):
    url = 'https://api.monarchinitiative.org/api/bioentity/gene/{}/phenotypes/'.format(gene_curie)
    response = requests.get(url)
    return response.json()

In [7]:
ortho_pheno = list()
for index, gene in fa_genes.iterrows():
    orhtologs = get_orthologs(gene[0])
    for assoc in orhtologs['associations']:
        phenotypes = get_phenotypes(assoc['object']['id'])
        if len(phenotypes['associations']) > 0:
            for passoc in phenotypes['associations']:
                result = {
                    'gene_name': gene[1], 
                    'gene_curie': gene[0], 
                    'ortholog_name': assoc['object']['label'], 
                    'ortholog_curie': assoc['object']['id'], 
                    'model_name':  assoc['object']['taxon']['label'], 
                    'model_curie': assoc['object']['taxon']['id'],
                    'phenotype_name': passoc['object']['label'], 
                    'phenotype_curie': passoc['object']['id']
                }
                ortho_pheno.append(result)
columns = ['gene_name', 'gene_curie', 'ortholog_name', 'ortholog_curie', 'model_name', 'model_curie', 'phenotype_name', 'phenotype_curie'] 
phenotypes = pd.DataFrame(data=ortho_pheno, columns=columns)
phenotypes

Unnamed: 0,gene_name,gene_curie,ortholog_name,ortholog_curie,model_name,model_curie,phenotype_name,phenotype_curie
0,FANCA,NCBIGene:2175,Fanca,MGI:1341823,Mus musculus,NCBITaxon:10090,Abnormality of cell physiology,HP:0011017
1,FANCA,NCBIGene:2175,Fanca,MGI:1341823,Mus musculus,NCBITaxon:10090,absent ovarian follicles,MP:0002777
2,FANCA,NCBIGene:2175,Fanca,MGI:1341823,Mus musculus,NCBITaxon:10090,Thrombocytopenia,HP:0001873
3,FANCA,NCBIGene:2175,Fanca,MGI:1341823,Mus musculus,NCBITaxon:10090,female gonad development phenotype,GO:0008585PHENOTYPE
4,FANCA,NCBIGene:2175,Fanca,MGI:1341823,Mus musculus,NCBITaxon:10090,abnormal cranium morphology,MP:0000438
5,FANCA,NCBIGene:2175,Fanca,MGI:1341823,Mus musculus,NCBITaxon:10090,embryonic growth retardation,MP:0003984
6,FANCA,NCBIGene:2175,Fanca,MGI:1341823,Mus musculus,NCBITaxon:10090,Abnormality of the ovary,HP:0000137
7,FANCA,NCBIGene:2175,Fanca,MGI:1341823,Mus musculus,NCBITaxon:10090,ovary hemorrhage,MP:0004834
8,FANCA,NCBIGene:2175,Fanca,MGI:1341823,Mus musculus,NCBITaxon:10090,Decreased testicular size,HP:0008734
9,FANCA,NCBIGene:2175,Fanca,MGI:1341823,Mus musculus,NCBITaxon:10090,Hyperplasia of the Leydig cells,HP:0010791


In [56]:
OWLSIM_API = "http://owlsim3.monarchinitiative.org/api"
# Use phenodigm algorithm, with a cutoff of 70/100
matcher = 'phenodigm'
score_cutoff = 70
result_set = []
for index, row in fa_genes.iterrows():
    if index < 10:
        phenos = phenotypes['phenotype_curie'].tolist()
        params = {
            'id': phenos
        }
        url = "{}/match/{}".format(OWLSIM_API, matcher)
        req = requests.get(url, params=params)
        owlsim_results = req.json()
        try:
            for match in owlsim_results['matches']:
                if match['rawScore'] >= score_cutoff:
                    pprint(match)
#                 pprint(match)
#                 result = [row['gene_curie'],index, match['matchId'],
#                           match['matchLabel'], match['rawScore']]
#                 result_set.append(result)
        except TypeError as e:
            # TypeError when score is NaN
            #print(e)
            #print(match)
            continue


# # # Create a table of query gene, matched gene, and sim score
column_names = ['query_gene', 'query_symbol', 'match_gene', 'match_symbol', 'sim_score']
result_frame = pd.DataFrame(data=result_set, columns=column_names)
result_set

{'matchId': 'MGI:1341823',
 'matchLabel': 'Fanca',
 'percentageScore': 7313,
 'rank': 1,
 'rawScore': 73.12643145053599,
 'score': 7312.643145053599,
 'significance': 'NaN'}
{'matchId': 'MGI:1914280',
 'matchLabel': 'Fancl',
 'percentageScore': 7016,
 'rank': 2,
 'rawScore': 70.1562345616751,
 'score': 7015.623456167511,
 'significance': 'NaN'}
{'matchId': 'MGI:1341823',
 'matchLabel': 'Fanca',
 'percentageScore': 7313,
 'rank': 1,
 'rawScore': 73.12643145053599,
 'score': 7312.643145053599,
 'significance': 'NaN'}
{'matchId': 'MGI:1914280',
 'matchLabel': 'Fancl',
 'percentageScore': 7016,
 'rank': 2,
 'rawScore': 70.1562345616751,
 'score': 7015.623456167511,
 'significance': 'NaN'}
{'matchId': 'MGI:1341823',
 'matchLabel': 'Fanca',
 'percentageScore': 7313,
 'rank': 1,
 'rawScore': 73.12643145053599,
 'score': 7312.643145053599,
 'significance': 'NaN'}
{'matchId': 'MGI:1914280',
 'matchLabel': 'Fancl',
 'percentageScore': 7016,
 'rank': 2,
 'rawScore': 70.1562345616751,
 'score': 70

KeyboardInterrupt: 

In [51]:
result_set

[]