# National Organization for Rare Disorders list

Author: John Erol Evangelista <br/>
Date: 01-19 <br/>
Data Source: https://rarediseases.org

In [1]:
import sys, datetime, os
import numpy as np
import pandas as pd
import importlib
import requests
from bs4 import BeautifulSoup
import time
%matplotlib inline
if "/Users/maayan/sigsets/Harmonizome/HarmonizomePythonScripts" not in sys.path:
    sys.path.append("/Users/maayan/sigsets/Harmonizome/HarmonizomePythonScripts")
import utility_functions as uf
from entrez_helper import entrez


In [30]:
import json, csv

In [2]:
ez = entrez("JohnErol.Evangelista@mssm.edu", "83a36edabe6b67dec8d71446a47b7c3b3e09")

## Extract diseases from website

In [3]:
baseUrl = "https://rarediseases.org/for-patients-and-families/information-resources/rare-disease-information/"
pages = 82

In [4]:
rare_diseases = []
for page in range(pages):
    if page == 1:
        suffix = ""
    else:
        suffix = "page/%d" % (page+1)
    sys.stdout.write("Processing page %d\r" % (page+1))
    url = baseUrl + suffix
    res = requests.get(url)
    soup = BeautifulSoup(res.text, 'html.parser')
    lists = soup.find('div', class_="rare-diseases-db-index")
    for entry in lists.findAll("article"):
        rare_diseases.append(entry.text.strip())
    time.sleep(1)

Processing page 82

In [24]:
len(set(rare_diseases))

1215

## Get PMIDs

In [19]:
# rare_disease_pmids = {}
for disease in rare_diseases:
    # Continue where it failed
#     sys.stdout.write("Processing disease %s\r" % (disease))
    if disease not in rare_disease_pmids:
        sys.stdout.write("Processing disease %s\r" % (disease))
        time.sleep(0.3)
        for i in range(5):
            try:
                pmids = ez.get_pmid(disease)
            except Exception as e:
                if i == 4:
                    raise e
                else:
                    time.sleep(5)
                    continue
        rare_disease_pmids[disease] = pmids

In [21]:
len(rare_disease_pmids)

1215

In [27]:
data_path = "/Users/maayan/sigsets/Harmonizome/Output/NORD/NORD_PMID.json"
with open(data_path, "w") as o:
    o.write(json.dumps(rare_disease_pmids))

## Load Mapping File, GeneRIF, and AutoRIF

In [74]:
mappingDFHMR_updated = pd.read_csv('/Users/maayan/sigsets/Harmonizome/Data/mappingFileHMR_2019.tsv', sep='\t', header=None, index_col=[0,1]).sort_index()
getGeneIDsHMR_updated = pd.read_csv('/Users/maayan/sigsets/Harmonizome/Data/GeneSymbolsAndIDSHMR_2019.tsv', sep='\t', index_col=[0,2]).sort_index()


In [41]:
geneRIF = pd.read_csv('/Users/maayan/sigsets/Harmonizome/Data/generifs_basic', sep='\t', index_col=[0,2]).sort_index()

In [47]:
autoRIF = pd.read_csv('/Users/maayan/sigsets/Harmonizome/Data/autorif.tsv', sep='\t', header=None, index_col=1)

## GeneRIF Gene Set

In [93]:
HUMAN_TAXON_ID = 9606

In [94]:
getGeneIDsHMR_updated.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,"Human, Mouse, and Rat Approved Symbol"
tax_id,Entrez Gene ID(supplied by NCBI),Unnamed: 2_level_1
9606,1,A1BG
9606,2,A2M
9606,3,A2MP1
9606,9,NAT1
9606,10,NAT2


In [156]:
with open("/Users/maayan/sigsets/Harmonizome/Output/NORD/NORD_GeneRIF.gmt", "w") as o:
    for disease, pmids in rare_disease_pmids.items():
        gmt_line = disease + "\t\t"
        sys.stdout.write("Processing disease %s\r" % (disease))
        genes = set([])
        for pmid in pmids:
            pmid = int(pmid)
            if (HUMAN_TAXON_ID, pmid) in geneRIF.index:
                gene_ids = geneRIF.loc[(HUMAN_TAXON_ID,pmid), "Gene ID"]
                for gene_id in gene_ids:
                    if (HUMAN_TAXON_ID, gene_id) in getGeneIDsHMR_updated.index:
                        gene = getGeneIDsHMR_updated.loc[(HUMAN_TAXON_ID, gene_id), "Human, Mouse, and Rat Approved Symbol"]
                        genes.add(gene)
                        # gmt_line += ("\t" + gene)
        if genes:
            gmt_line += ("\t".join(list(genes)) +"\n")        
            o.write(gmt_line)

Processing disease Zollinger Ellison Syndromerseive Autophagyiciency (LCAD)omeilation, and Autonomic Dysregulation (FIRES)

## AutoRIF Gene Set

In [160]:
with open("/Users/maayan/sigsets/Harmonizome/Output/NORD/NORD_AutoRIF.gmt", "w") as o:
    for disease, pmids in rare_disease_pmids.items():
        gmt_line = disease + "\t\t"
        sys.stdout.write("Processing disease %s\r" % (disease))
        genes = set([])
        for pmid in pmids:
            pmid = int(pmid)
            if pmid in autoRIF.index:
                gene = autoRIF.loc[pmid, 0]
                if not isinstance(gene, str): # Multiple genes
                    for g in gene:
                        genes.add(g)
                else:
                    genes.add(gene)
               # gmt_line += ("\t" + genes)
        if genes:
            gmt_line += ("\t".join(list(genes)) +"\n")        
            o.write(gmt_line)

Processing disease Zollinger Ellison Syndromerseive Autophagyiciency (LCAD)omeilation, and Autonomic Dysregulation (FIRES)

In [151]:
(9606, 12851857) in geneRIF.index

True