In [1]:
from pyteomics import fasta
import pandas as pd
import numpy as np
import re
import time
from datetime import datetime
import json
import os

### Data directories

In [17]:
#Data path
datadir = os.path.abspath("../../Data/")

#Datasets
OT_targets   = datadir+"/OpenTargets/targets" #OpenTargets | "targets" data
OT_directevi = datadir+"/OpenTargets/associationByOverallDirect" #OpenTargets | Disease association - Direct evidence data
OT_indireevi   = datadir+"/OpenTargets/associationByOverallIndirect" #OpenTargets | Disease association - Indirect evidence data

Uniprot_hsap = datadir+"/Uniprotdb/human_proteome/UP000005640_9606.fasta" #Uniprot h. sapiens proteome
Genemap      = datadir+"/Uniprotdb/HUMAN_9606_idmapping.dat" #Uniprot gene ID mapping

Orpha_en     = datadir+"/Disease_ontology/orpha_en_product1.json" #Orphanet rare disease xef


# Rare disease analysis

Rare diseases are conditions that affect a  percentage of the population, typically fewer than 200,000 people in the U.S. or 1 in 2,000 in Europe. This imprecision in the definition makes determining the precise number of "rare diseases" , however estimates and curation efforts cite approximately 7000 known diseases.

The low incidence within the general population of these rare diseases make them economically unappealing for traditional drug discovery processes. This unmet need may be overcome through the repurposing of existing drugs to modulate and improve patient outcomes in understudied rare disease areas.


- https://health.ec.europa.eu/medicinal-products/orphan-medicinal-products_en    
- https://www.sciencedirect.com/science/article/pii/S0012369218300643


## Goal/s 

1. Generate graph database of rare disease ontologies linked to protein intel
2. Identify / integrate relevant datasets into graph db
3. Perform network analysis to identify drugs to repurpose for rare disease targets

## To-do

- Identify core datasets
- Parse datasets  
- Initialise graph
- Identify potential tractable targets / diseases 

## Datasets

### Protein intel

- Uniprot 
- OpenTargets
    - "targets"

### Disease intel / ontology

- Orphanet (ORPHA)
    - https://www.orphadata.com/orphanet-scientific-knowledge-files  
    - A rare disease nomenclature / ontology resource. Contains curated files crossreferencing a subset of rare diseases across different ontologies. 

- Disease Ontology (DOID)
- OpenTargets
    - "associationByOverallDirect"
    - "associationByOverallIndirect"

## Functions

### Protein data parsing

In [3]:
def parseUniprot(fastain):
    #Generate dic of Uniprot: Gene, Sequences for proteome
    proteome = {}
    seq_file = fasta.UniProt(fastain)
    for prot in seq_file:
        unid = prot[0]['id']
        name = prot[0]['name']
        gene = prot[0]['gene_id']
        seq  = prot[1]
        #print(prot[0])
        proteome.setdefault(unid, {"Gene": gene, "Name": name, "Seq": seq})
    return proteome

def unitoGene(genein):
    #Generate dic of Uniprot to Gene names
    with open(genein) as f:
        read = f.readlines()
    unigene = {}
    for i in read:
        splitr = i.split("\t")
        if splitr[1] == "Gene_Name" or splitr[1] == "Gene_Synonym":
            unigene.setdefault(splitr[0], [])
            unigene[splitr[0]].append(splitr[2].strip("\n"))
    return unigene 

def parseOTtarget(data_dir, df):
    #Parse OpenTargets "targets" intel, return df
    #Add new cols to df
    intelcol = {"description": "functionDescriptions", \
                "subcell_loc": "subcellularLocations", \
                "db_entries": "dbXrefs", \
                "bio_path": "pathways", \
                "tx_approaches": "tractability"}
    for col in intelcol:
        df[col] = [list() for x in range(len(df.index))]

    data = []
    #Read in multi part .json to data list 
    for file in os.listdir(data_dir):
        path = data_dir + "/" + str(file).replace("._", "")
        with open(path, "r") as f:
            for line in f:
                data.append(json.loads(line))
    
    #Retrieve intel from protein coding entries, annotate df
    protlist = {}
    for entry in data:
        try:
            entid = entry['proteinIds'][0]['id']
        except:
            entid = ""
        if entid in df.index:
            for j in intelcol:
                if intelcol[j] in entry:
                    df.loc[entid, j].append(entry[intelcol[j]])                                               

    return df

### Disease intel / ontologies

In [39]:
def parseOrpha(datain):
    ##TO-DO
        #Alter output to df with cols = ontologies    
    #Read in Orphanet json, return dataframe of Orpha code / Name / other database crossreferences
    with open(datain) as f:
        dataj = json.load(f)
    outdic = {}
    for i in dataj["JDBOR"][0]["DisorderList"][0]["Disorder"]:
        orpha = i["OrphaCode"]
        nam   = i["Name"][0]["label"]
        xrefs = {}
        try:
            for j in i["ExternalReferenceList"][0]['ExternalReference']:
                xrefs.setdefault(j['Source'], {"ID": j['id'], "Mapping": j['DisorderMappingRelation'][0]["Name"][0]["label"]})
        except:
            pass
        outdic.setdefault(orpha, {"Name": nam, "Xref": xrefs})

    outdf = pd.DataFrame.from_dict(outdic)

    return outdf

def parseOTdisease(dataloc):
    data = {}
    for file in os.listdir(dataloc):
        if ".json" in str(file):
            with open(dataloc + "/" + str(file).replace("._", "")) as f:
                for line in f.readlines():
                    linedict = json.loads(line)
                    data.setdefault(linedict["diseaseId"], [linedict])
                    data[linedict["diseaseId"]].append(linedict)
    return data

In [15]:
out = parseOrpha(Orpha_en)
display(out)
print(len(out))
print(out['162516'])
print(out.keys())

Unnamed: 0,166024,58,166032,61,166029,166038,93,166035,585,118,...,619948,619979,619972,619249,619284,619340,619363,619360,619233,619238
Name,Multiple epiphyseal dysplasia-macrocephaly-fac...,Alexander disease,Multiple epiphyseal dysplasia-miniepiphyses sy...,Alpha-mannosidosis,Multiple epiphyseal dysplasia-severe proximal ...,"Metaphyseal chondrodysplasia, Kaitila type",Aspartylglucosaminuria,Brachydactyly-short stature-retinitis pigmento...,Multiple sulfatase deficiency,Beta-mannosidosis,...,Early-onset autoimmunity-autoinflammation-immu...,Developmental delay-immunodeficiency-leukoence...,CADINS disease,Rare hereditary connective tissue disease,Narcolepsy,Inherited hematologic cancer-predisposing synd...,NOCARH syndrome,NON RARE IN EUROPE: Isolated hereditary persis...,Hereditary persistence of fetal hemoglobin-int...,Rare hereditary autoinflammatory disease
Xref,"{'ICD-11': {'ID': '212360', 'Mapping': 'NTBT (...","{'OMIM': {'ID': '3649', 'Mapping': 'E (Exact m...","{'MeSH': {'ID': '222274', 'Mapping': 'E (Exact...","{'MedDRA': {'ID': '223996', 'Mapping': 'E (Exa...","{'MeSH': {'ID': '222275', 'Mapping': 'E (Exact...","{'MeSH': {'ID': '222276', 'Mapping': 'E (Exact...","{'MedDRA': {'ID': '104504', 'Mapping': 'E (Exa...","{'ICD-10': {'ID': '193810', 'Mapping': 'NTBT (...","{'MeSH': {'ID': '222277', 'Mapping': 'E (Exact...","{'GARD': {'ID': '240540', 'Mapping': 'E (Exact...",...,"{'ICD-10': {'ID': '210932', 'Mapping': 'NTBT (...","{'OMIM': {'ID': '214686', 'Mapping': 'E (Exact...","{'ICD-10': {'ID': '211707', 'Mapping': 'NTBT (...","{'UMLS': {'ID': '219271', 'Mapping': 'E (Exact...","{'MedDRA': {'ID': '224351', 'Mapping': 'E (Exa...","{'UMLS': {'ID': '219273', 'Mapping': 'E (Exact...","{'UMLS': {'ID': '219274', 'Mapping': 'E (Exact...","{'ICD-10': {'ID': '206924', 'Mapping': 'E (Exa...","{'UMLS': {'ID': '221826', 'Mapping': 'E (Exact...","{'UMLS': {'ID': '247950', 'Mapping': 'E (Exact..."


2
Name    Isolated congenital nasal pyriform aperture st...
Xref    {'UMLS': {'ID': '216412', 'Mapping': 'E (Exact...
Name: 162516, dtype: object
Index(['166024', '58', '166032', '61', '166029', '166038', '93', '166035',
       '585', '118',
       ...
       '619948', '619979', '619972', '619249', '619284', '619340', '619363',
       '619360', '619233', '619238'],
      dtype='object', length=11023)


## Main code

In [37]:
#Define default proteome 
start = time.time()
proteome = parseUniprot(Uniprot_hsap)
#Update gene to likely recent id
genemap  = unitoGene(Genemap)
for i in proteome:
    try:
        proteome[i]["Gene"] = genemap[i][0]
    except:
        pass
#Generate dataframe from dic
initdf = pd.DataFrame.from_dict(proteome, orient='index')
print("Time taken | Initial proteome = ", round(time.time() - start, 2))

#Annotate with OpenTarget intel
start = time.time()
initdf = parseOTtarget(OT_targets, initdf)
print("Time taken | OpenTargets targets intel = ", round(time.time() - start, 2))

#Generate rare disease crossreference
start = time.time()
Diseasexref = parseOrpha(Orpha_en)
print("Time taken | Rare disese xref = ", round(time.time() - start, 2))

#Disease associations
start = time.time()
OT_direct = parseOTdisease(OT_directevi)
OT_indir  = parseOTdisease(OT_indireevi)
print("Time taken | OpenTargets disease intel = ", round(time.time() - start, 2))

Time taken | Initial proteome =  1.32
Time taken | OpenTargets targets intel =  6.24
Time taken | Rare disese xref =  0.69
Time taken | OpenTargets disease intel =  14.99


In [5]:
display(initdf.loc["P04637"])

Gene                                                          TP53
Name                                    Cellular tumor antigen p53
Seq              MEEPQSDPSVEPPLSQETFSDLWKLLPENNVLSPLPSQAMDDLMLS...
description      [[Acts as a tumor suppressor in many tumor typ...
subcell_loc      [[{'location': 'Cytoplasm', 'source': 'uniprot...
db_entries       [[{'id': '11998', 'source': 'HGNC'}, {'id': '1...
bio_path         [[{'pathwayId': 'R-HSA-3232118', 'pathway': 'S...
tx_approaches    [[{'modality': 'SM', 'id': 'Approved Drug', 'v...
Name: P04637, dtype: object

In [38]:
print(OT_direct.keys())

dict_keys(['Orphanet_382', 'Orphanet_383', 'Orphanet_384', 'Orphanet_385', 'Orphanet_388', 'Orphanet_38874', 'Orphanet_39041', 'Orphanet_391307', 'Orphanet_391311', 'Orphanet_391316', 'Orphanet_391320', 'Orphanet_391327', 'Orphanet_391330', 'Orphanet_391343', 'Orphanet_391348', 'Orphanet_391351', 'Orphanet_391366', 'Orphanet_391372', 'Orphanet_391376', 'Orphanet_391384', 'Orphanet_391389', 'Orphanet_391392', 'Orphanet_391397', 'Orphanet_391408', 'Orphanet_391411', 'Orphanet_391417', 'Orphanet_391428', 'Orphanet_391457', 'Orphanet_391474', 'Orphanet_391487', 'Orphanet_391641', 'Orphanet_391646', 'Orphanet_391665', 'Orphanet_391677', 'Orphanet_392', 'Orphanet_393', 'Orphanet_394', 'Orphanet_394529', 'Orphanet_394532', 'Orphanet_395', 'Orphanet_397590', 'Orphanet_397593', 'Orphanet_397596', 'Orphanet_397606', 'Orphanet_397612', 'Orphanet_397615', 'Orphanet_397618', 'Orphanet_397623', 'Orphanet_397685', 'Orphanet_397692', 'Orphanet_397695', 'Orphanet_397709', 'Orphanet_397715', 'Orphanet_3

In [40]:
OT_direct['Orphanet_382']

[{'diseaseId': 'Orphanet_382',
  'targetId': 'ENSG00000004848',
  'score': 0.06731485792832437,
  'evidenceCount': 5},
 {'diseaseId': 'Orphanet_382',
  'targetId': 'ENSG00000004848',
  'score': 0.06731485792832437,
  'evidenceCount': 5},
 {'diseaseId': 'Orphanet_382',
  'targetId': 'ENSG00000006468',
  'score': 0.03722408296384317,
  'evidenceCount': 1},
 {'diseaseId': 'Orphanet_382',
  'targetId': 'ENSG00000006611',
  'score': 0.031244280915044692,
  'evidenceCount': 1},
 {'diseaseId': 'Orphanet_382',
  'targetId': 'ENSG00000007384',
  'score': 0.03649970644866115,
  'evidenceCount': 1},
 {'diseaseId': 'Orphanet_382',
  'targetId': 'ENSG00000007402',
  'score': 0.061138030805772056,
  'evidenceCount': 5},
 {'diseaseId': 'Orphanet_382',
  'targetId': 'ENSG00000008277',
  'score': 0.031798650697071745,
  'evidenceCount': 1},
 {'diseaseId': 'Orphanet_382',
  'targetId': 'ENSG00000022355',
  'score': 0.07017731139305698,
  'evidenceCount': 4},
 {'diseaseId': 'Orphanet_382',
  'targetId': 