# Orphanet

Author: John Erol Evangelista <br/>
Date: 01-19 <br/>
Data Source: http://www.orphadata.org

In [1]:
import sys, datetime, os, csv, json
import importlib
import time
import xml.etree.ElementTree as ET
from urllib.parse import quote

import numpy as np
import pandas as pd
import requests
%matplotlib inline

if "/Users/maayan/sigsets/Harmonizome/HarmonizomePythonScripts" not in sys.path:
    sys.path.append("/Users/maayan/sigsets/Harmonizome/HarmonizomePythonScripts")
import utility_functions as uf
from entrez_helper import entrez

In [2]:
# Build the Entrez client used for the PMID lookups later in the notebook.
# The second argument was previously a key-like string literal committed with
# the notebook; it is now read from the ENTREZ_API_KEY environment variable so
# the credential never lives in the file. A missing variable raises KeyError
# here instead of failing later with an opaque error.
# NOTE(review): the previously committed value should be revoked/rotated.
ez = entrez("JohnErol.Evangelista@mssm.edu", os.environ["ENTREZ_API_KEY"])

In [3]:
# Re-import utility_functions so edits made to the module on disk are picked up
# without restarting the kernel.
importlib.reload(uf)

<module 'utility_functions' from '/Users/maayan/sigsets/Harmonizome/HarmonizomePythonScripts/utility_functions.py'>

## Load Data

In [4]:
# Directory prefix for the input files read below (absolute, machine-specific;
# the trailing slash matters because file names are appended by concatenation).
data_path = '/Users/maayan/sigsets/Harmonizome/Data/'

In [5]:
# Read the disease table (ORPHA number + disease name) and preview it.
df = pd.read_csv(os.path.join(data_path, "orphadisease.csv"))
df.head()

Unnamed: 0,ORPHA number,Disease Name
0,289157,1-alpha-hydroxylase deficiency
1,431361,"2,4-dienoyl-CoA reductase\ndeficiency"
2,976,"2,8-dihydroxyadenine urolithiasis"
3,79154,2-aminoadipic 2-oxoadipic aciduria
4,391417,2-methyl-3-hydroxybutyric aciduria


In [6]:
# Collect the distinct ORPHA numbers in `a` and count every row visited in
# `tot`; any value that cannot be converted to int is printed for inspection.
a = set()
tot = 0
for orpha_number in df["ORPHA number"]:
    tot += 1
    try:
        a.add(int(orpha_number))
    except Exception:
        print(orpha_number)

In [7]:
# Show the dtype pandas inferred for each column of the table.
df.dtypes

ORPHA number      int64
 Disease Name    object
dtype: object

In [8]:
# Total number of rows visited by the counting loop.
tot

18359

In [9]:
# Number of distinct ORPHA numbers collected in `a`.
len(a)

7101

### Read XML

In [10]:
# Parse the Orphadata disease list and keep a handle on the document root.
tree = ET.parse(os.path.join(data_path, "OrphaData_DiseaseList.xml"))
root = tree.getroot()

In [11]:
# Remove subtypes, categorical, and deprecated entries
#
# A disorder is written to Disorder_list.csv only when ALL of these hold:
#   * its DisorderType name does not contain "subtype";
#   * none of its DisorderFlag labels is in EXCLUDED_FLAGS;
#   * its TextAuto/Info text (if any) does not mention "group of diseases".
# `tot` ends up holding the number of disorders written (header excluded).
EXCLUDED_FLAGS = {"Category", "Deprecated entity", "Head of classification"}

tot = 0
# newline="" is what the csv module expects so it controls line endings itself.
with open("/Users/maayan/sigsets/Harmonizome/Output/Orphanet/Disorder_list.csv", "w", newline="") as o:
    csv_writer = csv.writer(o)
    # Header row: this file is later loaded with pd.read_csv and the "Disease"
    # column is looked up by name; without a header the first disorder would
    # be consumed as column names.
    csv_writer.writerow(["ORPHA ID", "Disease", "Type"])
    for disorder in root.find("DisorderList").findall("Disorder"):
        orphaID = int(disorder.find("OrphaNumber").text)
        name = disorder.find("Name").text
        disorderType = disorder.find("DisorderType").find("Name").text
        flags = disorder.find("DisorderFlagList").findall("DisorderFlag")

        unflagged = "subtype" not in disorderType
        for flag in flags:
            label = flag.find("Label").text
            if label in EXCLUDED_FLAGS:
                unflagged = False

        # findtext returns None when TextAuto or Info is absent. The previous
        # `if textauto` check relied on Element truthiness, which reflects the
        # number of children rather than whether the element exists.
        text = disorder.findtext("TextAuto/Info")
        if text and "group of diseases" in text:
            unflagged = False

        if unflagged:
            csv_writer.writerow([orphaID, name, disorderType])
            tot += 1
tot

6340

In [28]:
# Load the filtered disorder list back from disk and preview it.
# NOTE(review): this rebinds `df`, which previously held orphadisease.csv, and
# read_csv takes the first line of the file as column names.
df = pd.read_csv("/Users/maayan/sigsets/Harmonizome/Output/Orphanet/Disorder_list.csv")
df.head()

Unnamed: 0,ORPHA ID,Disease,Type
0,166024,"Multiple epiphyseal dysplasia, Al-Gazali type",Disease
1,166032,"Multiple epiphyseal dysplasia, with miniepiphyses",Disease
2,58,Alexander disease,Disease
3,166029,"Multiple epiphyseal dysplasia, with severe pro...",Disease
4,61,Alpha-mannosidosis,Disease


In [30]:
# Plain Python list of the disease names, in file order.
rare_diseases = df["Disease"].tolist()

In [31]:
# Display the disease names.
# NOTE(review): this renders the whole list (thousands of entries); a slice
# such as rare_diseases[:10] would keep the notebook output readable.
rare_diseases

['Multiple epiphyseal dysplasia, Al-Gazali type',
 'Multiple epiphyseal dysplasia, with miniepiphyses',
 'Alexander disease',
 'Multiple epiphyseal dysplasia, with severe proximal femoral dysplasia',
 'Alpha-mannosidosis',
 'Metaphyseal chondrodysplasia, Kaitila type',
 'Brachydactyly-short stature-retinitis pigmentosa syndrome',
 'Aspartylglucosaminuria',
 'Multiple sulfatase deficiency',
 'Beta-mannosidosis',
 'Canavan disease',
 'Pontocerebellar hypoplasia type 5',
 'Pontocerebellar hypoplasia type 4',
 'Cystinosis',
 'Pontocerebellar hypoplasia type 6',
 'Farber disease',
 'Fucosidosis',
 'Glycogen storage disease due to acid maltase deficiency',
 'Glycogen storage disease due to glycogen debranching enzyme deficiency',
 'Glycogen storage disease due to muscle glycogen phosphorylase deficiency',
 'Glycogen storage disease due to glycogen branching enzyme deficiency',
 'Glycogen storage disease due to muscle phosphofructokinase deficiency',
 'Stickler syndrome type 3',
 'Glycogen st

## Get PMIDs

In [125]:
# Load the PMID lookup that was produced for the GARD library; it is passed to
# the Entrez helper in the next step.
GARD_pmid_file = "/Users/maayan/sigsets/Harmonizome/Output/GARD/GARD_PMID.json"
with open(GARD_pmid_file) as gard_handle:
    GARD_pmids = json.load(gard_handle)

In [126]:
# Number of top-level entries loaded from GARD_PMID.json.
len(GARD_pmids)

6328

In [130]:
# Look up PMIDs for every disease name via the Entrez helper. The result is
# consumed below as a mapping of disease name -> iterable of PMIDs.
# NOTE(review): presumably GARD_pmids serves as a cache of already-known terms
# and timeout1/timeout2 are pause lengths in seconds — confirm in entrez_helper.
rare_disease_pmids = ez.get_all_pmids_of_list(rare_diseases, GARD_pmids, timeout1=1, timeout2=5)

Processing term Mosaic trisomy 9-sclerotic bones syndromene syndromedeficiencylete IFNgammaR1 deficiencyibitor22)dromeic traumaencydrome

In [131]:
# Persist the disease -> PMIDs mapping as JSON.
with open("/Users/maayan/sigsets/Harmonizome/Output/Orphanet/Orphanet_PMID.json", "w") as pmid_handle:
    json.dump(rare_disease_pmids, pmid_handle)

## Load Mapping File, GeneRIF, and AutoRIF

In [132]:
# Headerless mapping file, indexed by its first two columns.
mappingDFHMR_updated = pd.read_csv(
    '/Users/maayan/sigsets/Harmonizome/Data/mappingFileHMR_2019.tsv',
    sep='\t',
    header=None,
    index_col=[0, 1],
).sort_index()

# Gene symbol/ID table, indexed by its first and third columns; looked up
# below with (taxon id, gene id) keys.
getGeneIDsHMR_updated = pd.read_csv(
    '/Users/maayan/sigsets/Harmonizome/Data/GeneSymbolsAndIDSHMR_2019.tsv',
    sep='\t',
    index_col=[0, 2],
).sort_index()

In [134]:
# GeneRIF table indexed by its first and third columns; it is queried below
# with (taxon id, PMID) keys and read through its "Gene ID" column.
geneRIF = pd.read_csv('/Users/maayan/sigsets/Harmonizome/Data/generifs_basic', sep='\t', index_col=[0,2]).sort_index()

In [135]:
# Headerless AutoRIF table indexed by its second column (queried below by
# PMID); column 0 is read as the gene symbol.
autoRIF = pd.read_csv('/Users/maayan/sigsets/Harmonizome/Data/autorif.tsv', sep='\t', header=None, index_col=1)

## GeneRIF Geneset

In [136]:
# NCBI Taxonomy ID for Homo sapiens; used as the first level of the
# (taxon, id) index keys in the lookups below.
HUMAN_TAXON_ID = 9606

In [137]:
# Write the Orphanet GeneRIF gene-set library in GMT form: one line per
# disease, "<disease>\t\t<gene>\t<gene>...". For every PMID of a disease the
# human GeneRIF rows are looked up, their gene IDs are mapped to approved
# symbols, and diseases that end up with no genes are skipped.
with open("/Users/maayan/sigsets/Harmonizome/Output/Orphanet/Orphanet_GeneRIF.gmt", "w") as o:
    for disease, pmids in rare_disease_pmids.items():
        sys.stdout.write("Processing disease %s\r" % (disease))
        sys.stdout.flush()  # make the progress line appear immediately
        genes = set()
        for pmid in pmids:
            rif_key = (HUMAN_TAXON_ID, int(pmid))
            if rif_key not in geneRIF.index:
                continue
            for gene_id in geneRIF.loc[rif_key, "Gene ID"]:
                symbol_key = (HUMAN_TAXON_ID, gene_id)
                if symbol_key in getGeneIDsHMR_updated.index:
                    genes.add(getGeneIDsHMR_updated.loc[symbol_key, "Human, Mouse, and Rat Approved Symbol"])
        if genes:
            # Sorted so the file is identical from run to run; iterating the
            # set directly yields an arbitrary order.
            o.write(disease + "\t\t" + "\t".join(sorted(genes)) + "\n")

Processing disease Mosaic trisomy 9-sclerotic bones syndromene syndromedeficiencylete IFNgammaR1 deficiencyibitor22)dromeic traumaencydrome

## AutoRIF Geneset

In [138]:
# Write the Orphanet AutoRIF gene-set library in GMT form: one line per
# disease, "<disease>\t\t<gene>\t<gene>...". Genes are taken from column 0 of
# the AutoRIF rows matching each PMID; diseases with no genes are skipped.
with open("/Users/maayan/sigsets/Harmonizome/Output/Orphanet/Orphanet_AutoRIF.gmt", "w") as o:
    for disease, pmids in rare_disease_pmids.items():
        sys.stdout.write("Processing disease %s\r" % (disease))
        sys.stdout.flush()  # make the progress line appear immediately
        genes = set()
        for pmid in pmids:
            pmid = int(pmid)
            if pmid not in autoRIF.index:
                continue
            gene = autoRIF.loc[pmid, 0]
            if isinstance(gene, str):
                genes.add(gene)
            else:
                # Several rows share this PMID, so .loc returned a collection
                # of symbols rather than a single string.
                genes.update(gene)
        if genes:
            # Sorted so the file is identical from run to run; iterating the
            # set directly yields an arbitrary order.
            o.write(disease + "\t\t" + "\t".join(sorted(genes)) + "\n")

Processing disease Mosaic trisomy 9-sclerotic bones syndromene syndromedeficiencylete IFNgammaR1 deficiencyibitor22)dromeic traumaencydrome

## GeneShot API

In [14]:
# GeneShot search endpoint; %s is replaced with the search term (disease name).
API_url = "https://amp.pharm.mssm.edu/geneshot/api/search/%s"

In [15]:
# disease name -> tab-joined gene list, filled by the GeneShot loop below.
# Re-running this cell discards everything fetched so far.
gene_lib = {}

In [32]:
# Query GeneShot once per disease and store the returned genes in gene_lib as a
# tab-joined string. Diseases with two or fewer genes are not stored.
#
# * The disease name is percent-encoded before it is placed in the URL path, so
#   characters such as '/', '?', '#' or '%' cannot change the request target.
# * Each request has a timeout, and network errors are retried the same way as
#   non-200 responses: up to 5 retries, 30 s apart. After the last failure the
#   most recent error is raised.
# * Diseases already present in gene_lib are skipped, so an interrupted run can
#   be resumed by re-running this cell.
for disease in rare_diseases:
    if disease in gene_lib:
        continue
    sys.stdout.write("Processing disease %s\r" % (disease))
    sys.stdout.flush()
    url = API_url % quote(disease, safe="")

    failure = None
    for attempt in range(6):  # 1 initial try + 5 retries
        if attempt:
            time.sleep(30)
        try:
            res = requests.get(url, timeout=120)
        except requests.RequestException as err:
            failure = err
            continue
        if res.status_code == 200:
            break
        failure = Exception(res.status_code)
    else:
        raise failure

    # presumably a mapping keyed by gene symbol — TODO confirm against the API
    genes = res.json()["gene_count"]
    # The original counter (`if i > 2000: break`) admitted 2001 entries; that
    # cap is preserved. NOTE(review): confirm whether 2000 was intended.
    gene_set = set(list(genes)[:2001])
    # Sorted so the stored string (and the GMT written from it) is reproducible.
    gene_list = "\t".join(sorted(gene_set))
    if len(gene_set) > 2:
        gene_lib[disease] = gene_list
    time.sleep(1)  # be polite to the API between diseases


Processing disease Mosaic trisomy 9-sclerotic bones syndromene syndromedeficiencylete IFNgammaR1 deficiencyibitor22)dromeic traumaencydrome

In [33]:
# Number of diseases for which GeneShot returned more than two genes.
len(gene_lib)

2501

In [34]:
# Dump the GeneShot library as GMT: "<disease>\t\t<tab-joined genes>" per line.
with open("/Users/maayan/sigsets/Harmonizome/Output/Orphanet/Orphanet_GeneShot.gmt", "w") as gmt_handle:
    for disease, gene_list in gene_lib.items():
        gmt_handle.write("\t\t".join((disease, gene_list)) + "\n")

## GeneShot AutoRIF

In [35]:
# GeneShot AutoRIF search endpoint; %s is replaced with the search term.
Autorif_API_url = "https://amp.pharm.mssm.edu/geneshot/api/search/auto/%s"

In [36]:
# disease name -> tab-joined gene list, filled by the AutoRIF loop below.
# Re-running this cell discards everything fetched so far.
autorif_lib = {}

In [None]:
# Query the GeneShot AutoRIF endpoint once per disease and store the returned
# genes in autorif_lib as a tab-joined string. Diseases with two or fewer genes
# are not stored. Diseases already in autorif_lib are skipped, so the cell can
# be re-run to resume an interrupted pass; one disease name is excluded
# explicitly, as in the original cell.
#
# * The disease name is percent-encoded before it is placed in the URL path, so
#   characters such as '/', '?', '#' or '%' cannot change the request target.
# * Each request has a timeout, and network errors are retried the same way as
#   non-200 responses: up to 5 retries, 30 s apart. After the last failure the
#   most recent error is raised.
for disease in rare_diseases:
    if disease in autorif_lib or disease == 'Persistent migraine aura without infarction (also known as "Visual Snow")':
        continue
    sys.stdout.write("Processing disease %s\r" % (disease))
    sys.stdout.flush()
    url = Autorif_API_url % quote(disease, safe="")

    failure = None
    for attempt in range(6):  # 1 initial try + 5 retries
        if attempt:
            time.sleep(30)
        try:
            res = requests.get(url, timeout=120)
        except requests.RequestException as err:
            failure = err
            continue
        if res.status_code == 200:
            break
        failure = Exception(res.status_code)
    else:
        raise failure

    # presumably a mapping keyed by gene symbol — TODO confirm against the API
    genes = res.json()["gene_count"]
    # The original counter (`if i > 2000: break`) admitted 2001 entries; that
    # cap is preserved. NOTE(review): confirm whether 2000 was intended.
    gene_set = set(list(genes)[:2001])
    # Sorted so the stored string is reproducible from run to run.
    gene_list = "\t".join(sorted(gene_set))
    if len(gene_set) > 2:
        autorif_lib[disease] = gene_list
    time.sleep(1)  # be polite to the API between diseases

Processing disease Fatal mitochondrial disease due to combined oxidative phosphorylation defect type 3iciencyrome-Hirschsprung disease

In [40]:
# Number of diseases stored in autorif_lib so far.
len(autorif_lib)

994