# Genetic and Rare Diseases Information Center (GARD) list

Author: John Erol Evangelista <br/>
Date: 01-19 <br/>
Data Source: https://rarediseases.info.nih.gov/diseases

In [1]:
import sys, datetime, os, json
import numpy as np
import pandas as pd
import importlib
import xml.etree.ElementTree as ET
import requests
from bs4 import BeautifulSoup
import time

%matplotlib inline
# Put the directory that presumably holds the project's helper modules
# (imported right after this) on the import path, once.
# NOTE(review): machine-specific absolute path -- adjust when running elsewhere.
_HELPER_SCRIPTS_DIR = "/Users/maayan/sigsets/Harmonizome/HarmonizomePythonScripts"
if _HELPER_SCRIPTS_DIR not in sys.path:
    sys.path.append(_HELPER_SCRIPTS_DIR)
import utility_functions as uf
from entrez_helper import entrez

In [2]:
# Build the Entrez helper client used for the PMID lookups.
# The secret is read from the environment instead of being committed with the
# notebook. The key that used to be hard-coded here should be treated as
# exposed and rotated.
# NOTE(review): the second argument looks like an NCBI API key -- confirm
# against entrez_helper, which is not visible from this notebook.
_entrez_email = os.environ.get("ENTREZ_EMAIL", "JohnErol.Evangelista@mssm.edu")
_entrez_api_key = os.environ.get("ENTREZ_API_KEY")
if not _entrez_api_key:
    raise RuntimeError(
        "Set the ENTREZ_API_KEY environment variable before running this notebook."
    )
ez = entrez(_entrez_email, _entrez_api_key)

## Extract diseases from website

In [3]:
# Common prefix of the GARD browse-by-first-letter pages; the scraping loop
# appends the page symbol (a letter A-Z or "0-9") to it.
baseUrl = "https://rarediseases.info.nih.gov/diseases/browse-by-first-letter/"

In [4]:
# Collect disease names from every browse-by-first-letter page:
# the 26 pages "A".."Z" followed by the "0-9" page.
rare_diseases = []
page_symbols = [chr(ord('A') + offset) for offset in range(26)] + ["0-9"]
for sym in page_symbols:
    sys.stdout.write("Processing: %s\r" % (sym))
    url = baseUrl + sym
    # A timeout keeps a stalled connection from hanging the notebook, and
    # raise_for_status() stops on an HTTP error instead of parsing an error page.
    res = requests.get(url, timeout=30)
    res.raise_for_status()
    soup = BeautifulSoup(res.text, 'html.parser')
    lists = soup.find('ul', class_="MedicalProductsDiseaseLinks")
    if lists is None:
        # find() returns None when the page has no matching <ul>; report it
        # rather than failing with an AttributeError on the next line.
        sys.stderr.write("No disease list found at %s; skipping\n" % url)
    else:
        for entry in lists.find_all("li"):
            # Entries whose text contains "\n -" are left out
            # (presumably items carrying nested sub-entries -- TODO confirm).
            if "\n -" not in entry.text:
                rare_diseases.append(entry.text.strip())
    # Pause between requests to be polite to the server.
    time.sleep(1)

Processing: 0-9

## Get PMIDs

In [5]:
# Get disease dict from another library
NORD_pmid_file = "/Users/maayan/sigsets/Harmonizome/Output/NORD/NORD_PMID.json"
with open(NORD_pmid_file) as o:
    NORD_pmids = json.loads(o.read())

In [6]:
# Sanity check: number of top-level entries loaded from the NORD PMID file.
len(NORD_pmids)

1215

In [7]:
# Look up PMIDs for every scraped disease name through the Entrez helper.
# NOTE(review): NORD_pmids is passed in as well -- presumably so terms already
# fetched for NORD are reused; the meaning of timeout1/timeout2 is defined in
# entrez_helper, which is not visible here -- TODO confirm both.
rare_disease_pmids = ez.get_all_pmids_of_list(rare_diseases, NORD_pmids, timeout1=1, timeout2=5)

Processing term 8q12 microduplication syndromeon syndromeciencyH defect)ephaly club neoplasiaaresisctual disabilityy bones

In [8]:
# Sanity check: number of entries returned by the PMID lookup.
len(rare_disease_pmids)

6328

In [9]:
# Save the PMID lookup result as JSON so it can be reloaded later.
data_path = "/Users/maayan/sigsets/Harmonizome/Output/GARD/GARD_PMID.json"
with open(data_path, "w") as pmid_out:
    serialized_pmids = json.dumps(rare_disease_pmids)
    pmid_out.write(serialized_pmids)

## Load Mapping File, GeneRIF, and AutoRIF

In [10]:
# Gene mapping tables, both tab-separated and sorted on a two-level index.
# The first file has no header row and is indexed on columns 0 and 1.
mappingDFHMR_updated = pd.read_csv(
    '/Users/maayan/sigsets/Harmonizome/Data/mappingFileHMR_2019.tsv',
    sep='\t',
    header=None,
    index_col=[0, 1],
).sort_index()
# The second file is indexed on columns 0 and 2; the GeneRIF cell looks rows up
# with (HUMAN_TAXON_ID, gene_id) keys.
getGeneIDsHMR_updated = pd.read_csv(
    '/Users/maayan/sigsets/Harmonizome/Data/GeneSymbolsAndIDSHMR_2019.tsv',
    sep='\t',
    index_col=[0, 2],
).sort_index()

In [12]:
# GeneRIF table, tab-separated and sorted on a two-level index built from
# columns 0 and 2; the GeneRIF cell queries it with (HUMAN_TAXON_ID, pmid) keys.
geneRIF = pd.read_csv(
    '/Users/maayan/sigsets/Harmonizome/Data/generifs_basic',
    sep='\t',
    index_col=[0, 2],
).sort_index()

In [13]:
# AutoRIF table: tab-separated, no header row, indexed on column 1.
# The AutoRIF cell queries it by PMID and reads column 0 as the gene.
autoRIF = pd.read_csv(
    '/Users/maayan/sigsets/Harmonizome/Data/autorif.tsv',
    sep='\t',
    header=None,
    index_col=1,
)

## GeneRIF Geneset

In [14]:
# NCBI Taxonomy identifier for Homo sapiens; used as the first level of the
# (taxon, id) keys in the GeneRIF and gene-symbol lookups.
HUMAN_TAXON_ID = 9606

In [16]:
# Write the GeneRIF-based gene-set library in GMT layout:
#   <disease> TAB <empty description> TAB <gene> TAB <gene> ...
# Diseases for which no gene is found are not written.
with open("/Users/maayan/sigsets/Harmonizome/Output/GARD/GARD_GeneRIF.gmt", "w") as o:
    for disease, pmids in rare_disease_pmids.items():
        sys.stdout.write("Processing disease %s\r" % (disease))
        genes = set()
        for pmid in pmids:
            rif_key = (HUMAN_TAXON_ID, int(pmid))
            if rif_key not in geneRIF.index:
                continue
            # .loc with a full two-level key yields a Series when several rows
            # match and may yield a bare scalar for a single row on a unique
            # index; normalize so both shapes iterate the same way.
            gene_ids = geneRIF.loc[rif_key, "Gene ID"]
            if not isinstance(gene_ids, pd.Series):
                gene_ids = [gene_ids]
            for gene_id in gene_ids:
                symbol_key = (HUMAN_TAXON_ID, gene_id)
                if symbol_key not in getGeneIDsHMR_updated.index:
                    continue
                symbols = getGeneIDsHMR_updated.loc[symbol_key, "Human, Mouse, and Rat Approved Symbol"]
                # Same normalization for the symbol lookup (scalar or Series).
                if isinstance(symbols, pd.Series):
                    genes.update(symbols)
                else:
                    genes.add(symbols)
        if genes:
            # Sort so the file is identical from run to run; set iteration
            # order of strings is not stable across interpreter sessions.
            o.write(disease + "\t\t" + "\t".join(sorted(genes)) + "\n")

Processing disease 8q12 microduplication syndromeon syndromeciencyH defect)ephaly club neoplasiaaresisctual disabilityy bones

## AutoRIF Geneset

In [18]:
# Write the AutoRIF-based gene-set library in GMT layout:
#   <disease> TAB <empty description> TAB <gene> TAB <gene> ...
# Diseases for which no gene is found are not written.
with open("/Users/maayan/sigsets/Harmonizome/Output/GARD/GARD_AutoRIF.gmt", "w") as o:
    for disease, pmids in rare_disease_pmids.items():
        sys.stdout.write("Processing disease %s\r" % (disease))
        genes = set()
        for pmid in pmids:
            pmid = int(pmid)
            if pmid not in autoRIF.index:
                continue
            gene = autoRIF.loc[pmid, 0]
            if isinstance(gene, str):
                # Exactly one row for this PMID: .loc gives the bare value.
                genes.add(gene)
            else:
                # Several rows share this PMID: .loc gives a collection of genes.
                genes.update(gene)
        if genes:
            # Sort so the file is identical from run to run; set iteration
            # order of strings is not stable across interpreter sessions.
            o.write(disease + "\t\t" + "\t".join(sorted(genes)) + "\n")

Processing disease 8q12 microduplication syndromeon syndromeciencyH defect)ephaly club neoplasiaaresisctual disabilityy bones