In [94]:
import pandas as pd
import xmltodict
from collections import defaultdict
import requests
import operator
from string import punctuation
from tqdm.notebook import trange, tqdm

In [63]:
def isNaN(num):
    """Tell whether ``num`` is NaN by testing if it differs from itself.

    Relies on ``num != num``; a float NaN never compares equal to itself,
    whereas ordinary numbers, strings and ``None`` do.
    """
    differs_from_itself = num != num
    return differs_from_itself

In [71]:
def is_gene_name(gene_name):
    """Return True if ``gene_name`` looks like a plain gene symbol.

    Only the part before the first '.' is inspected, so an isoform-style
    suffix (e.g. ``NAME.1``) is tolerated. Any punctuation character in that
    leading part makes the name invalid.
    """
    # Everything after the first dot is treated as an isoform suffix and ignored.
    stem = gene_name.split('.')[0]
    return not any(symbol in punctuation for symbol in stem)

In [72]:
def geneshot_gene_similarity(genes, similarity, timeout=120):
    """Query the GeneShot ``associate`` endpoint for genes related to ``genes``.

    Parameters
    ----------
    genes : str or list of str
        One gene symbol or a list of symbols; a single string is wrapped in a list.
    similarity : str
        Value sent as the ``similarity`` field of the request payload
        (the notebook uses ``'coexpression'``).
    timeout : float or tuple, optional
        Passed to ``requests.post``. Without it a stalled connection would
        block the calling loop indefinitely.

    Returns
    -------
    The decoded JSON body of the response.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status.
    requests.Timeout
        If the server does not answer within ``timeout``.
    """
    url = 'https://maayanlab.cloud/geneshot/api/associate'
    if isinstance(genes, str):
        genes = [genes]
    payload = {'similarity': similarity, 'gene_list': genes}
    # NOTE(review): an empty Cookie header is sent explicitly; its purpose is not
    # evident from this notebook -- kept unchanged.
    headers = {'Cookie': ''}
    response = requests.post(url, headers=headers, json=payload, timeout=timeout)
    # Fail here with a clear HTTP error rather than later on a missing JSON key.
    response.raise_for_status()
    return response.json()

In [73]:
def extract_genes(gs_gene_sim, key = 'simScore', top=100):
    """Pick the ``top`` best-scoring gene symbols from a GeneShot association dict.

    Parameters
    ----------
    gs_gene_sim : dict
        Mapping of gene name -> dict of metrics; ``metrics[key]`` is used as the score.
    key : str
        Metric to rank by.
    top : int
        Maximum number of genes to return. Values <= 0 yield an empty list.

    Returns
    -------
    list of str
        Alphabetically sorted gene symbols with isoform suffixes (``.N``) removed
        and names containing punctuation dropped. An empty list is returned when
        the scores look unusable: fewer than 20% distinct values (relative to
        ``top``) among the first ``top`` scores, or any NaN score.
    """
    limit = int(top)
    if limit <= 0:
        # Nothing requested; also avoids dividing by zero in the diversity check.
        return []

    gene_res = {gene: metrics[key] for gene, metrics in gs_gene_sim.items()}
    scores = list(gene_res.values())

    # Reject results whose scores are (almost) all identical.
    # NOTE(review): the ratio is taken against `top`, not against the number of
    # scores actually present, so inputs with fewer than 0.2 * top entries are
    # always rejected -- confirm this is intended.
    distinct_scores = len(set(scores[:limit]))
    if distinct_scores / limit < 0.2:
        return []

    # Reject results containing any NaN score.
    if any(isNaN(score) for score in scores):
        return []

    # Highest score first; ties are broken by gene name, descending.
    ranked = sorted(gene_res.items(), key=operator.itemgetter(1, 0), reverse=True)
    # Collapse isoforms onto their base symbol. A dict is used instead of a set
    # because it keeps first-insertion order (guaranteed from Python 3.7), which
    # preserves the ranking; the stored score may be overwritten by a lower one
    # but only the keys are used.
    collapsed = {name.split('.')[0]: score for name, score in ranked if is_gene_name(name)}
    return sorted(list(collapsed)[:limit])

In [57]:
# Download from http://www.orphadata.org/data/xml/en_product6.xml and save in UTF-8 encoding
# Read explicitly as UTF-8 (the platform default may differ) and close the file once parsed.
with open('en_product6.xml', encoding='utf-8') as orpha_xml:
    disease_list = xmltodict.parse(orpha_xml.read())['JDBOR']['DisorderList']['Disorder']

In [103]:
# Build {disorder name: [associated genes..., GeneShot co-expressed genes...]}.
orpha_dict = defaultdict(list)
for disease in tqdm(disease_list):
    name = disease['Name']['#text']
    associations = disease['DisorderGeneAssociationList'].get('DisorderGeneAssociation') or []
    # xmltodict yields a single dict for one child element and a list for several;
    # normalise to a list so both cases share one code path.
    if isinstance(associations, dict):
        associations = [associations]
    seed_genes = [association['Gene']['Symbol'] for association in associations]
    if not seed_genes:
        # No gene associated with this disorder: nothing to augment.
        continue
    aug_genes = extract_genes(geneshot_gene_similarity(seed_genes, 'coexpression')['association'])
    # Seed genes first, then augmented ones; dict.fromkeys drops repeated symbols
    # while keeping order (the seed list must be added exactly once).
    orpha_dict[name] = list(dict.fromkeys(orpha_dict[name] + seed_genes + aug_genes))

  0%|          | 0/3847 [00:00<?, ?it/s]

In [104]:
# One line per disorder: name, an empty second column, then tab-separated gene symbols.
# Lines are built before the file is opened so a formatting error cannot leave a truncated file.
gmt_lines = sorted('{}\t\t{}'.format(k, '\t'.join(v)) for k, v in orpha_dict.items())
# Write as UTF-8 explicitly: disorder names may contain non-ASCII characters and the
# platform default encoding is not guaranteed to handle them.
with open('orphanet.gmt', 'w', encoding='utf-8') as orph_file:
    orph_file.write('\n'.join(gmt_lines))