In [1]:
import requests
import json
from pprint import pprint
import pandas as pd

### General Solr Query

In [2]:
def query_solr(category, curie, timeout=60):
    """Query the Monarch Initiative Solr (golr) index for associations.

    The request is filtered to documents whose ``subject_category`` equals
    ``category`` and whose ``object_closure`` contains ``curie``. At most 250
    rows are requested, starting at offset 0.

    Args:
        category: Value matched against the ``subject_category`` field
            (e.g. ``'variant'``).
        curie: Identifier matched against the ``object_closure`` field.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The ``response`` member of the Solr JSON reply (callers in this
        notebook read its ``docs`` list).

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.Timeout: If the server does not answer within ``timeout``.
    """
    solr_url = 'https://solr.monarchinitiative.org/solr/golr/select/'
    params = {
        'qt': 'standard',
        'wt': 'json',
        'rows': '250',
        'start': '0',
        'fl': '*,score',
        'fq': ['subject_category:"{}"'.format(category),'object_closure:"{}"'.format(curie)],
        'q': '*:*'
    }
    # Without an explicit timeout requests waits indefinitely on a stalled
    # connection, which would hang the notebook kernel.
    r = requests.get(solr_url, params=params, timeout=timeout)
    # Fail loudly on HTTP errors instead of trying to parse an error page.
    r.raise_for_status()
    return json.loads(r.text)['response']

### Gene 2 Variants Solr Query

In [3]:
def solr_gene2variants(gene_curie):
    """Return the variant identifiers Solr associates with a gene.

    Runs ``query_solr('variant', gene_curie)`` and, for every returned
    document, decodes its JSON ``evidence_graph`` field and takes the ``sub``
    value of the first edge.

    Args:
        gene_curie: Gene identifier passed through to ``query_solr``.

    Returns:
        A list with one entry per returned document, in result order.
    """
    response = query_solr('variant', gene_curie)
    # Each document stores its evidence graph as a JSON-encoded string.
    evidence_graphs = (json.loads(hit['evidence_graph']) for hit in response['docs'])
    return [graph['edges'][0]['sub'] for graph in evidence_graphs]

### Variant 2 Disease Solr Query

In [4]:
def query_solr_var2dis(varID, timeout=60):
    """Query the Monarch Initiative Solr (golr) index for variant-to-disease associations.

    The request is filtered to documents with ``object_category`` "disease",
    ``subject_category`` "variant" and a ``subject_closure`` containing
    ``varID``. At most 250 rows are requested, starting at offset 0.

    Args:
        varID: Variant identifier matched against the ``subject_closure`` field.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The ``response`` member of the Solr JSON reply (callers in this
        notebook read its ``numFound`` and ``docs`` entries).

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.Timeout: If the server does not answer within ``timeout``.
    """
    solr_url = 'https://solr.monarchinitiative.org/solr/golr/select/'
    params = {
        'qt': 'standard',
        'wt': 'json',
        'rows': '250',
        'start': '0',
        'fl': '*,score',
        'fq': [
            'object_category:"disease"',
            'subject_category:"variant"',
            'subject_closure:"{}"'.format(varID)],
        'q': '*:*'
    }
    # Without an explicit timeout requests waits indefinitely on a stalled
    # connection; this function is called once per variant, so one hung
    # request would block the whole run.
    r = requests.get(solr_url, params=params, timeout=timeout)
    # Fail loudly on HTTP errors instead of trying to parse an error page.
    r.raise_for_status()
    return json.loads(r.text)['response']

### Get All Fanconi Anemia (FA) Associated Genes

In [5]:
# Fetch the FA gene set from GitHub. The file is read as tab-separated text
# and its two columns are named gene_curie and gene_symbol.
base_url = "https://raw.githubusercontent.com/NCATS-Tangerine/cq-notebooks/master/FA_gene_sets/"
FA_all_genes = "FA_4_all_genes.txt"
columns = ['gene_curie', 'gene_symbol']
fa_genes_url = base_url + FA_all_genes
fa_genes = pd.read_csv(
    fa_genes_url,
    sep='\t',
    names=columns,
)
fa_genes

Unnamed: 0,gene_curie,gene_symbol
0,NCBIGene:2175,FANCA
1,NCBIGene:2187,FANCB
2,NCBIGene:2176,FANCC
3,NCBIGene:2178,FANCE
4,NCBIGene:2188,FANCF
5,NCBIGene:2189,FANCG
6,NCBIGene:55120,FANCL
7,NCBIGene:57697,FANCM
8,NCBIGene:2177,FANCD2
9,NCBIGene:55215,FANCI


### Useful Functions

In [6]:
def variant_count(val, dataframe, field):
    """List the ``variant_curie`` values of rows where ``field`` equals ``val``.

    Despite the name, this returns the matching values themselves, not their
    count.

    Args:
        val: Value to look for.
        dataframe: DataFrame that has a ``variant_curie`` column and a column
            named ``field``.
        field: Name of the column compared against ``val``.

    Returns:
        A list of ``variant_curie`` values in row order (empty if nothing matches).
    """
    matches = dataframe[field] == val
    return dataframe.loc[matches, 'variant_curie'].tolist()

In [33]:
# Relation CURIEs that mark a variant as disease-causing. In the results table
# they carry the labels pathogenic_for_condition (GENO:0000840) and
# likely_pathogenic_for_condition (GENO:0000841).
pathogenic_relations = {'GENO:0000840', 'GENO:0000841'}

tv_data = []      # one summary record per gene
g2v2d_data = []   # one record per retained gene -> variant -> disease association
for index, row in fa_genes.iterrows():
    # Read fields by column label. Integer keys on a label-indexed Series are a
    # positional fallback that pandas has deprecated.
    gene_curie = row['gene_curie']
    gene_name = row['gene_symbol']
    variants = solr_gene2variants(gene_curie)
    tot_var_results = {
        'gene_name': gene_name,
        'gene_curie': gene_curie,
        '#variants': len(variants)
    }
    dis_count = 0
    unique_diseases = list()
    for var in variants:
        var2dis = query_solr_var2dis(varID=var)
        if var2dis['numFound'] == 0:
            continue
        # NOTE(review): only the first returned association is inspected; any
        # further disease associations of the same variant are ignored.
        # Confirm this is intended.
        doc = var2dis['docs'][0]
        if doc['relation'] not in pathogenic_relations:
            continue
        dis_count += 1
        association = {
            'gene_name': gene_name,
            'gene_curie': gene_curie,
            'variant_name': doc['subject_label'],
            'variant_curie': doc['subject'],
            'relation_name': doc['relation_label'],
            'relation_curie': doc['relation'],
            'disease_curie': doc['object'],
            'disease_name': doc['object_label']
        }
        unique_diseases.append(doc['object_label'])
        g2v2d_data.append(association)
    tot_var_results['#disease_variants'] = dis_count
    # Distinct diseases are counted by label, not by CURIE.
    tot_var_results['#unique_diseases'] = len(set(unique_diseases))
    tv_data.append(tot_var_results)
tv_columns = ['gene_name', 'gene_curie', '#variants', '#disease_variants', '#unique_diseases']
total_variants = pd.DataFrame(data=tv_data, columns=tv_columns)
g2v2d_columns = ['gene_name', 'gene_curie', 'variant_name', 'variant_curie',
                'relation_name', 'relation_curie', 'disease_name', 'disease_curie']
gene2variant2disease_table = pd.DataFrame(data=g2v2d_data, columns=g2v2d_columns)
# Persist the association table next to the notebook (the index is written as the first column).
gene2variant2disease_table.to_csv('FA_Pathogenic_variants_Solr.csv', sep=',')

In [34]:
# Per-gene summary: number of variants, number of variants whose first disease
# association is pathogenic / likely pathogenic, and number of distinct disease labels.
total_variants

Unnamed: 0,gene_name,gene_curie,#variants,#disease_variants,#unique_diseases
0,FANCA,NCBIGene:2175,193,32,1
1,FANCB,NCBIGene:2187,45,7,1
2,FANCC,NCBIGene:2176,219,41,2
3,FANCE,NCBIGene:2178,34,4,2
4,FANCF,NCBIGene:2188,34,6,1
5,FANCG,NCBIGene:2189,37,9,1
6,FANCL,NCBIGene:55120,36,7,1
7,FANCM,NCBIGene:57697,44,0,0
8,FANCD2,NCBIGene:2177,74,6,1
9,FANCI,NCBIGene:55215,67,7,1


In [35]:
# One row per gene -> variant -> disease association whose relation is
# GENO:0000840 or GENO:0000841; the same table is written to FA_Pathogenic_variants_Solr.csv.
gene2variant2disease_table

Unnamed: 0,gene_name,gene_curie,variant_name,variant_curie,relation_name,relation_curie,disease_name,disease_curie
0,FANCA,NCBIGene:2175,NM_000135.2(FANCA):c.3624C>T (p.Ser1208=),ClinVarVariant:419528,likely_pathogenic_for_condition,GENO:0000841,"Fanconi Anemia, Complementation Group a",OMIM:227650
1,FANCA,NCBIGene:2175,NM_000135.3(FANCA):c.2398G>T (p.Glu800Ter),ClinVarVariant:435129,pathogenic_for_condition,GENO:0000840,"Fanconi Anemia, Complementation Group a",OMIM:227650
2,FANCA,NCBIGene:2175,NM_000135.3(FANCA):c.2151+1G>A,ClinVarVariant:435130,pathogenic_for_condition,GENO:0000840,"Fanconi Anemia, Complementation Group a",OMIM:227650
3,FANCA,NCBIGene:2175,NM_000135.2(FANCA):c.2574C>G (p.Ser858Arg),ClinVarVariant:134256,pathogenic_for_condition,GENO:0000840,"Fanconi Anemia, Complementation Group a",OMIM:227650
4,FANCA,NCBIGene:2175,NM_000135.2(FANCA):c.3720_3724delAAACA (p.Glu1...,ClinVarVariant:3448,pathogenic_for_condition,GENO:0000840,"Fanconi Anemia, Complementation Group a",OMIM:227650
5,FANCA,NCBIGene:2175,NM_000135.3(FANCA):c.1340C>G (p.Ser447Ter),ClinVarVariant:435132,pathogenic_for_condition,GENO:0000840,"Fanconi Anemia, Complementation Group a",OMIM:227650
6,FANCA,NCBIGene:2175,NM_000135.2(FANCA):c.2762A>T (p.Lys921Ile),ClinVarVariant:203995,likely_pathogenic_for_condition,GENO:0000841,"Fanconi Anemia, Complementation Group a",OMIM:227650
7,FANCA,NCBIGene:2175,NC_000016.9:g.(89829945_89831038)_(89836251_89...,ClinVarVariant:402242,likely_pathogenic_for_condition,GENO:0000841,"Fanconi Anemia, Complementation Group a",OMIM:227650
8,FANCA,NCBIGene:2175,NM_000135.3(FANCA):c.3558dup (p.Arg1187Glufs),ClinVarVariant:3444,pathogenic_for_condition,GENO:0000840,"Fanconi Anemia, Complementation Group a",OMIM:227650
9,FANCA,NCBIGene:2175,NM_000135.2(FANCA):c.811C>T (p.Gln271Ter),ClinVarVariant:371093,likely_pathogenic_for_condition,GENO:0000841,"Fanconi Anemia, Complementation Group a",OMIM:227650


In [36]:
# Presumably the IPython automagic form of %pwd (no variable named pwd is defined
# in the visible cells); it displays the kernel's current working directory.
# NOTE(review): the rendered output exposes an absolute local path. Consider
# clearing this cell's output before sharing the notebook.
pwd

'/Users/timputman/CODE/NCATS/cq_notebooks/cq-notebooks/Orange_QB1_Benchmark_CQs/QB1.2_FA_Gene_Pathogenic_Variants'