# FDA Orphan Disease List

Author: John Erol Evangelista <br/>
Date: 01-19 <br/>
Data Source: https://www.accessdata.fda.gov/scripts/opdlisting/oopd/index.cfm; https://rarediseases.info.nih.gov/diseases/fda-orphan-drugs

In [1]:
import sys, datetime, os, json
import numpy as np
import pandas as pd
import importlib
import xml.etree.ElementTree as ET
import requests
from bs4 import BeautifulSoup
import time

%matplotlib inline
if "/Users/maayan/sigsets/Harmonizome/HarmonizomePythonScripts" not in sys.path:
    sys.path.append("/Users/maayan/sigsets/Harmonizome/HarmonizomePythonScripts")
import utility_functions as uf
from entrez_helper import entrez

In [2]:
ez = entrez("JohnErol.Evangelista@mssm.edu", "83a36edabe6b67dec8d71446a47b7c3b3e09")

## Fetch csv file

In [3]:
data_path = '/Users/maayan/sigsets/Harmonizome/Data/'

In [4]:
df = pd.read_csv(data_path + "AllOrphanFDA.csv")
df.head()

Unnamed: 0,Generic Name,Trade Name,Designation Date,Designation,Orphan Drug Status,FDA Approval Status,Approved Indication,Marketing Approval Date,Exclusivity End Date,Contact Company,Contact Address 1,Contact Address 2,Contact City,Contact State,Contact Zip,Contact Country,CF Grid Key
0,"(2S,3S,4E,6S,7R,10R)-7,10-Dihydroxy-3,7-dimeth...",,6/22/17,Treatment of chronic myelomonocytic leukemia (...,Designated,,,ÊN/A,ÊN/A,H3 Biomedicine Inc. (US Research Subsidiary of...,300 Technology Square,Floor 5,Cambridge,Massachusetts,2139.0,USA,581717
1,"(2S,4R)-1-(2-(3-acetyl-5-(2-methylpyrimidin-5-...",,11/2/17,Treatment of paroxysmal nocturnal hemoglobinuria,Designated,,,ÊN/A,ÊN/A,"Achillion Pharmaceuticals, Inc",300 George Street,,New Haven,Connecticut,6511.0,USA,512715
2,1-(2-Nitro-imidazolyl)-3-[18F]fluoro-2-propano...,,1/6/16,As a diagnostic for clinical management of sof...,Designated,,,ÊN/A,ÊN/A,"Advanced Imaging Projects, LLC",7947 Brookside Ct.,,Lake Worth,Florida,33467.0,USA,509715
3,ascorbic acid,,5/11/09,Treatment of Charcot-Marie-Tooth disease type 1A.,Designated,,,ÊN/A,ÊN/A,Murigenetics SAS,Faculte de Medecine Timone,13005 Marseille,,,,France,280909
4,bosentan,Tracleer,10/6/00,Treatment of pulmonary arterial hypertension,Designated/Approved,Approved for Orphan Indication,Treatment of pulmonary arterial hypertension.,Ê11/20/2001Ê,Ê11/20/2008,"Actelion Pharmaceuticals, Ltd.",1840 Gateway Drive,Suite 300,Cherry Hill,New Jersey,8002.0,USA,134200


## Extract diseases from website

In [8]:
baseUrl = "https://rarediseases.info.nih.gov/diseases/fda-orphan-drugs/"

In [16]:
rare_diseases = []
for num in range(27):
    if num == 26:
        sym = "0-9"
    else:
        sym = chr(ord('A')+num)
    sys.stdout.write("Processing: %s\r" % (sym))
    url = baseUrl + sym
    res = requests.get(url)
    soup = BeautifulSoup(res.text, 'html.parser')
    lists = soup.find('ul', class_="MedicalProductsDiseaseLinks")
    for entry in lists.findAll("li"):
        disease = entry.find("a").text
        if not "\n -" in disease:
            rare_diseases.append(disease.strip())
    time.sleep(1)

Processing: 0-9

In [17]:
len(rare_diseases)

380

In [18]:
with open("/Users/maayan/sigsets/Harmonizome/Output/FDAOrphan/Disease_list.json", "w") as o:
    o.write(json.dumps(rare_diseases))

In [20]:
#Get Edited version
with open("/Users/maayan/sigsets/Harmonizome/Output/FDAOrphan/Disease_list.json") as o:
    rare_diseases = json.loads(o.read())

In [21]:
len(rare_diseases)

334

## Get PMIDs

In [22]:
# Get disease dict from another library
GARD_pmid_file = "/Users/maayan/sigsets/Harmonizome/Output/GARD/GARD_PMID.json"
with open(GARD_pmid_file) as o:
    GARD_pmids = json.loads(o.read())

In [23]:
rare_disease_pmids = ez.get_all_pmids_of_list(rare_diseases, GARD_pmids, timeout1=1, timeout2=5)

Processing term Zygomycosisllison syndromeadiopathiciated periodic syndromesyndromeal defect

In [24]:
with open("/Users/maayan/sigsets/Harmonizome/Output/FDAOrphan/FDAOrphan_PMID.json", "w") as o:
    o.write(json.dumps(rare_disease_pmids))

## Load Mapping File, GeneRIF, and AutoRIF

In [25]:
mappingDFHMR_updated = pd.read_csv('/Users/maayan/sigsets/Harmonizome/Data/mappingFileHMR_2019.tsv', sep='\t', header=None, index_col=[0,1]).sort_index()
getGeneIDsHMR_updated = pd.read_csv('/Users/maayan/sigsets/Harmonizome/Data/GeneSymbolsAndIDSHMR_2019.tsv', sep='\t', index_col=[0,2]).sort_index()

In [27]:
geneRIF = pd.read_csv('/Users/maayan/sigsets/Harmonizome/Data/generifs_basic', sep='\t', index_col=[0,2]).sort_index()

In [28]:
autoRIF = pd.read_csv('/Users/maayan/sigsets/Harmonizome/Data/autorif.tsv', sep='\t', header=None, index_col=1)

## GeneRIF Geneset

In [29]:
HUMAN_TAXON_ID = 9606

In [30]:
with open("/Users/maayan/sigsets/Harmonizome/Output/FDAOrphan/FDAOrphan_GeneRIF.gmt", "w") as o:
    for disease, pmids in rare_disease_pmids.items():
        gmt_line = disease + "\t\t"
        sys.stdout.write("Processing disease %s\r" % (disease))
        genes = set([])
        for pmid in pmids:
            pmid = int(pmid)
            if (HUMAN_TAXON_ID, pmid) in geneRIF.index:
                gene_ids = geneRIF.loc[(HUMAN_TAXON_ID,pmid), "Gene ID"]
                for gene_id in gene_ids:
                    if (HUMAN_TAXON_ID, gene_id) in getGeneIDsHMR_updated.index:
                        gene = getGeneIDsHMR_updated.loc[(HUMAN_TAXON_ID, gene_id), "Human, Mouse, and Rat Approved Symbol"]
                        genes.add(gene)
                        # gmt_line += ("\t" + gene)
        if genes:
            gmt_line += ("\t".join(list(genes)) +"\n")        
            o.write(gmt_line)

Processing disease Zygomycosisllison syndromeadiopathiciated periodic syndromesyndromeal defect

## AutoRIF Geneset

In [31]:
with open("/Users/maayan/sigsets/Harmonizome/Output/FDAOrphan/FDAOrphan_AutoRIF.gmt", "w") as o:
    for disease, pmids in rare_disease_pmids.items():
        gmt_line = disease + "\t\t"
        sys.stdout.write("Processing disease %s\r" % (disease))
        genes = set([])
        for pmid in pmids:
            pmid = int(pmid)
            if pmid in autoRIF.index:
                gene = autoRIF.loc[pmid, 0]
                if not isinstance(gene, str): # Multiple genes
                    for g in gene:
                        genes.add(g)
                else:
                    genes.add(gene)
               # gmt_line += ("\t" + genes)
        if genes:
            gmt_line += ("\t".join(list(genes)) +"\n")        
            o.write(gmt_line)

Processing disease Zygomycosisllison syndromeadiopathiciated periodic syndromesyndromeal defect