# RD Connect

Author: John Erol Evangelista <br/>
Date: 01-19 <br/>
Data Source: https://rd-connect.eu/

In [1]:
import sys, datetime, os
import importlib
import time
import json, csv
from urllib.parse import quote

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

%matplotlib inline
if "/Users/maayan/sigsets/Harmonizome/HarmonizomePythonScripts" not in sys.path:
    sys.path.append("/Users/maayan/sigsets/Harmonizome/HarmonizomePythonScripts")
import utility_functions as uf
from entrez_helper import entrez

In [2]:
# Credentials for the project's entrez helper are read from the environment so
# that the secret is never stored in (or shared with) the notebook.
# The key that used to be hard-coded here should be treated as leaked and rotated.
# NOTE(review): the second argument is presumably an NCBI E-utilities API key;
# confirm against entrez_helper.entrez.
ENTREZ_EMAIL = os.environ.get("ENTREZ_EMAIL", "JohnErol.Evangelista@mssm.edu")
# Raises KeyError naming the variable when ENTREZ_API_KEY is not set.
ENTREZ_API_KEY = os.environ["ENTREZ_API_KEY"]
ez = entrez(ENTREZ_EMAIL, ENTREZ_API_KEY)

In [3]:
# Input file for this notebook: RD_connect_Sample_2019-01-03_23_13_37.csv
# (its full path is assigned to csv_filename in the extraction section below).

## Extract diseases from csv file

In [4]:
# Absolute path of the RD-Connect sample CSV that the extraction cell reads.
# The two adjacent literals are concatenated at compile time into one string.
csv_filename = (
    "/Users/maayan/sigsets/Harmonizome/Data/"
    "RD_connect_Sample_2019-01-03_23_13_37.csv"
)

In [7]:
# First-column labels that do not name a disease and are therefore skipped.
NON_DISEASE_LABELS = {"Undiagnosed", "Healthy subject", "Invalid Data", "Healthy Control"}

# Collect the distinct first-column values of the CSV into `rare_disease`.
# newline="" is what the csv module requires so that quoted fields containing
# line breaks are parsed correctly.
with open(csv_filename, newline="") as o:
    rare_disease = {
        row[0]
        for row in csv.reader(o)
        # csv.reader yields [] for a blank line; skip it instead of letting
        # row[0] raise IndexError.
        if row
        and row[0] not in NON_DISEASE_LABELS
        # Labels flagged as obsolete are dropped as well.
        and "OBSOLETE" not in row[0]
    }

In [8]:
# Number of distinct first-column labels kept by the extraction cell above.
len(rare_disease)

210

In [13]:
# Remove the literal "Disease" entry, presumably the CSV header cell picked up
# by the extraction loop (TODO confirm), so that only disease names remain.
# A new set is built, so re-running this cell is harmless.
rare_disease = rare_disease.difference({"Disease"})

## Get PMIDs

In [14]:
# Load the PMID dictionary that was produced for another library (GARD).
GARD_pmid_file = "/Users/maayan/sigsets/Harmonizome/Output/GARD/GARD_PMID.json"
with open(GARD_pmid_file) as gard_handle:
    # Parse the JSON document straight from the open file handle.
    GARD_pmids = json.load(gard_handle)

In [16]:
# Number of top-level entries in the GARD PMID JSON loaded in the previous cell.
len(GARD_pmids)

6328

In [18]:
# Fetch PMIDs for every RD-Connect disease name through the project's entrez helper.
# The result is consumed below as a mapping of disease name -> iterable of PMIDs.
# NOTE(review): GARD_pmids is presumably passed in as a store of already-known
# disease -> PMID results, and timeout1/timeout2 are presumably delays in
# seconds; confirm against entrez_helper.get_all_pmids_of_list.
rare_disease_pmids = ez.get_all_pmids_of_list(rare_disease, GARD_pmids, timeout1=1, timeout2=5)

Processing term Autosomal recessive limb-girdle muscular dystrophy type 2Ieuropathyal dementia

In [26]:
# Persist the disease -> PMIDs mapping as JSON so later runs can reuse it.
pmid_json_path = "/Users/maayan/sigsets/Harmonizome/Output/RDConnect/RDConnect_PMID.json"
with open(pmid_json_path, "w") as pmid_json_handle:
    serialized_pmids = json.dumps(rare_disease_pmids)
    pmid_json_handle.write(serialized_pmids)

## Load Mapping File, GeneRIF, and AutoRIF

In [19]:
# Mapping table: tab-separated, no header row; columns 0 and 1 form a sorted
# two-level index.
mappingDFHMR_updated = pd.read_csv(
    '/Users/maayan/sigsets/Harmonizome/Data/mappingFileHMR_2019.tsv',
    sep='\t',
    header=None,
    index_col=[0, 1],
).sort_index()

# Gene symbol / ID table: tab-separated with a header row; columns 0 and 2 form
# a sorted two-level index that the GeneRIF cell queries as (taxon id, gene id).
getGeneIDsHMR_updated = pd.read_csv(
    '/Users/maayan/sigsets/Harmonizome/Data/GeneSymbolsAndIDSHMR_2019.tsv',
    sep='\t',
    index_col=[0, 2],
).sort_index()

In [21]:
# GeneRIF table: tab-separated with a header row. Columns 0 and 2 become a
# sorted two-level index, which the GeneRIF cell below queries as
# (HUMAN_TAXON_ID, pmid) to read the "Gene ID" column.
geneRIF = pd.read_csv(
    '/Users/maayan/sigsets/Harmonizome/Data/generifs_basic',
    sep='\t',
    index_col=[0, 2],
).sort_index()

In [22]:
# AutoRIF table: tab-separated, no header row. Column 1 becomes the index
# (queried by PMID in the AutoRIF cell below) and column 0 is read as the gene.
autoRIF = pd.read_csv(
    '/Users/maayan/sigsets/Harmonizome/Data/autorif.tsv',
    sep='\t',
    header=None,
    index_col=1,
)

## GeneRIF Geneset

In [23]:
# NCBI Taxonomy ID for Homo sapiens; used as the first level of the
# (taxon id, ...) index lookups in the GeneRIF cell below.
HUMAN_TAXON_ID = 9606

In [24]:
# Build the GeneRIF gene-set library: one GMT row per disease, formatted as
# "<disease>\t\t<gene>\t<gene>...\n" (the description field is left empty).
# Diseases that end up with no gene are not written.
with open("/Users/maayan/sigsets/Harmonizome/Output/RDConnect/RDConnect_GeneRIF.gmt", "w") as o:
    for disease, pmids in rare_disease_pmids.items():
        sys.stdout.write("Processing disease %s\r" % (disease))
        # Flush so the "\r" progress line is shown immediately, matching the
        # GeneShot cell.
        sys.stdout.flush()
        genes = set()
        for pmid in pmids:
            # NOTE(review): the cast presumably aligns the PMID with an integer
            # PMID level in the geneRIF index; confirm the stored type.
            pmid = int(pmid)
            if (HUMAN_TAXON_ID, pmid) not in geneRIF.index:
                continue
            gene_ids = geneRIF.loc[(HUMAN_TAXON_ID, pmid), "Gene ID"]
            for gene_id in gene_ids:
                # Keep only genes that have an approved symbol in the mapping table.
                if (HUMAN_TAXON_ID, gene_id) in getGeneIDsHMR_updated.index:
                    genes.add(
                        getGeneIDsHMR_updated.loc[
                            (HUMAN_TAXON_ID, gene_id),
                            "Human, Mouse, and Rat Approved Symbol",
                        ]
                    )
        if genes:
            # Set iteration order for strings changes between interpreter runs
            # (hash randomisation); sorting makes the file reproducible.
            o.write(disease + "\t\t" + "\t".join(sorted(genes)) + "\n")

Processing disease Autosomal recessive limb-girdle muscular dystrophy type 2Ieuropathyal dementia

## AutoRIF Geneset

In [25]:
# Build the AutoRIF gene-set library: one GMT row per disease, formatted as
# "<disease>\t\t<gene>\t<gene>...\n" (the description field is left empty).
# Diseases that end up with no gene are not written.
with open("/Users/maayan/sigsets/Harmonizome/Output/RDConnect/RDConnect_AutoRIF.gmt", "w") as o:
    for disease, pmids in rare_disease_pmids.items():
        sys.stdout.write("Processing disease %s\r" % (disease))
        # Flush so the "\r" progress line is shown immediately, matching the
        # GeneShot cell.
        sys.stdout.flush()
        genes = set()
        for pmid in pmids:
            # NOTE(review): the cast presumably aligns the PMID with an integer
            # autoRIF index; confirm the stored type.
            pmid = int(pmid)
            if pmid not in autoRIF.index:
                continue
            gene = autoRIF.loc[pmid, 0]
            if isinstance(gene, str):
                # Exactly one row for this PMID: .loc returned a scalar.
                genes.add(gene)
            else:
                # Several rows for this PMID: .loc returned a collection of genes.
                genes.update(gene)
        if genes:
            # Set iteration order for strings changes between interpreter runs
            # (hash randomisation); sorting makes the file reproducible.
            o.write(disease + "\t\t" + "\t".join(sorted(genes)) + "\n")

Processing disease Autosomal recessive limb-girdle muscular dystrophy type 2Ieuropathyal dementia

## GeneShot API

In [32]:
# Display the set of disease names that the GeneShot loop below iterates over.
rare_disease

{'22q11.2 deletion syndrome',
 'Acromegaly',
 'Acute intermittent porphyria',
 'Adams-Oliver syndrome',
 'Addison disease',
 'Adult-onset autosomal recessive cerebellar ataxia',
 'Alexander disease',
 'Alpha-1-antitrypsin deficiency',
 'Amyotrophic lateral sclerosis',
 'Aniridia',
 'Apert syndrome',
 'Atypical hemolytic-uremic syndrome',
 'Autosomal dominant Charcot-Marie-Tooth disease type 2D',
 'Autosomal dominant Charcot-Marie-Tooth disease type 2N',
 'Autosomal dominant Charcot-Marie-Tooth disease type 2O',
 'Autosomal dominant cerebellar ataxia',
 'Autosomal dominant epidermolytic ichthyosis',
 'Autosomal dominant limb-girdle muscular dystrophy type 1B',
 'Autosomal dominant limb-girdle muscular dystrophy type 1C',
 'Autosomal dominant progressive external ophthalmoplegia',
 'Autosomal dominant spastic paraplegia type 4',
 'Autosomal erythropoietic protoporphyria',
 'Autosomal recessive congenital ichthyosis',
 'Autosomal recessive limb-girdle muscular dystrophy type 2A',
 'Autoso

In [33]:
# GeneShot search endpoint template; "%s" is filled with the disease name in
# the query loop below.
API_url = "https://amp.pharm.mssm.edu/geneshot/api/search/%s"

In [37]:
# Upper bound on the number of genes read per disease. The previous
# counter-based cut-off (`if i > 2000: break` before the increment) admitted
# 2001 genes, so the same bound is kept.
# NOTE(review): confirm whether a cap of 2000 was actually intended.
MAX_GENES_PER_DISEASE = 2001
# A disease is written only when it has more than this many genes.
MIN_GENES_EXCLUSIVE = 2
# Fail instead of hanging forever if the GeneShot server stops responding.
REQUEST_TIMEOUT_SECONDS = 120
# Pause between requests to avoid hammering the API.
REQUEST_PAUSE_SECONDS = 0.3

# Build the GeneShot gene-set library: one GMT row per disease, formatted as
# "<disease>\t\t<gene>\t<gene>...\n" (the description field is left empty).
with open("/Users/maayan/sigsets/Harmonizome/Output/RDConnect/RDConnect_GeneShot.gmt", "w") as o:
    # Sorted iteration keeps the row order reproducible across kernel runs.
    for disease in sorted(rare_disease):
        sys.stdout.write("Processing disease %s\r" % (disease))
        sys.stdout.flush()
        # Percent-encode the term so reserved characters in a disease name
        # ("/", "?", "#", ...) cannot alter the URL path.
        url = API_url % quote(disease, safe="")
        res = requests.get(url, timeout=REQUEST_TIMEOUT_SECONDS)
        # Surface HTTP errors explicitly rather than failing later on JSON parsing.
        res.raise_for_status()
        genes = res.json()["gene_count"]
        # Keep the first MAX_GENES_PER_DISEASE entries in response order.
        gene_set = set(list(genes)[:MAX_GENES_PER_DISEASE])
        if len(gene_set) > MIN_GENES_EXCLUSIVE:
            # Sorting removes the run-to-run variation of set iteration order.
            o.write(disease + "\t\t" + "\t".join(sorted(gene_set)) + "\n")
        time.sleep(REQUEST_PAUSE_SECONDS)


Processing disease Autosomal recessive limb-girdle muscular dystrophy type 2Ieuropathyal dementia