In [1]:
from pyteomics import fasta
import pandas as pd
import numpy as np
import re
import time
from datetime import datetime
import json
import os

### Data directories

In [2]:
#Data path
datadir = os.path.abspath("../../Data/")

#Datasets
OT_targets   = datadir+"/OpenTargets/targets" #OpenTargets | "targets" data
OT_directevi = datadir+"/OpenTargets/associationByOverallDirect" #OpenTargets | Disease association - Direct evidence data
OT_indireevi  = datadir+"/OpenTargets/associationByOverallIndirect" #OpenTargets | Disease association - Indirect evidence data

Uniprot_hsap = datadir+"/Uniprotdb/human_proteome/UP000005640_9606.fasta" #Uniprot h. sapiens proteome
Genemap      = datadir+"/Uniprotdb/HUMAN_9606_idmapping.dat" #Uniprot gene ID mapping

Orpha_en     = datadir+"/Disease_ontology/orpha_en_product1.json" #Orphanet rare disease xef


## Functions

### Protein data parsing

In [3]:
def parseUniprot(fastain):
    #Generate dic of Uniprot: Gene, Sequences for proteome
    proteome = {}
    seq_file = fasta.UniProt(fastain)
    for prot in seq_file:
        unid = prot[0]['id']
        name = prot[0]['name']
        gene = prot[0]['gene_id']
        seq  = prot[1]
       # print(prot[0])
        proteome.setdefault(unid, {"Gene": gene, "Name": name, "Seq": seq})
    return proteome

def unitoGene(genein):
    #Generate dic of Uniprot to Gene names
    with open(genein) as f:
        read = f.readlines()
    unigene = {}
    for i in read:
        splitr = i.split("\t")
        if splitr[1] == "Gene_Name" or splitr[1] == "Gene_Synonym":
            unigene.setdefault(splitr[0], [])
            unigene[splitr[0]].append(splitr[2].strip("\n"))
    return unigene 

def parseOTtarget(data_dir, df):
    #Parse OpenTargets "targets" intel, return df
    #Add new cols to df
    intelcol = {"Ensembl_gene": 'id', \
                "Ensembl_trans": 'transcriptIds', \
                "Description": "functionDescriptions", \
                "Subcell_loc": "subcellularLocations", \
                "DB_entries": "dbXrefs", \
                "Bio_path": "pathways", \
                "Tx_approaches": "tractability"}
    for col in intelcol:
        df[col] = [list() for x in range(len(df.index))]

    data = []
    #Read in multi part .json to data list 
    for file in os.listdir(data_dir):
        path = data_dir + "/" + str(file).replace("._", "")
        with open(path, "r") as f:
            for line in f:
                data.append(json.loads(line))
    #print("Here")
    #Retrieve intel from protein coding entries, annotate df
    protlist = {}
    for entry in data:
        try:
            entid = entry['proteinIds'][0]['id']
        except:
            entid = ""
        #print(entid)
        if entid in df.index:
            for j in intelcol:
                if intelcol[j] in entry:
                    df.loc[entid, j].append(entry[intelcol[j]])
    return df

### Disease intel / ontologies

In [4]:
def parseOrpha(datain):
    ##TO-DO
        #Alter output to df with cols = ontologies    
    #Read in Orphanet json, return dataframe of Orpha code / Name / other database crossreferences
    with open(datain) as f:
        dataj = json.load(f)
    outdic = {}
    for i in dataj["JDBOR"][0]["DisorderList"][0]["Disorder"]:
        orpha = i["OrphaCode"]
        nam   = i["Name"][0]["label"]
        xrefs = {}
        try:
            for j in i["ExternalReferenceList"][0]['ExternalReference']:
                xrefs.setdefault(j['Source'], {"ID": j['id'], "Mapping": j['DisorderMappingRelation'][0]["Name"][0]["label"]})
        except:
            pass
        outdic.setdefault(orpha, {"Name": nam, "Xref": xrefs})

    outdf = pd.DataFrame.from_dict(outdic)

    return outdf.transpose() 

def parseOTdisease(dataloc):
    data = {}
    for file in os.listdir(dataloc):
        if ".json" in str(file):
            with open(dataloc + "/" + str(file).replace("._", "")) as f:
                for line in f.readlines():
                    linedict = json.loads(line)
                    data.setdefault(linedict["diseaseId"], [linedict])
                    data[linedict["diseaseId"]].append(linedict)
    return data

def assocUniprotDisease(datadic, ensgenetouni, diseasexref):
    #Generate dictionary linking Uniprot ID to disease associations + scores
    uniprotdisease = {}
    for i in datadic:
        for ent in datadic[i]:
            try:
                Orpha   = ent['diseaseId'].split("_")[1]
                Score   = round(ent['score'], 3)
                Uniprot = ensgenetouni[ent['targetId']]
                Disnam  = diseasexref.loc[Orpha]["Name"]
                uniprotdisease.setdefault(Uniprot, {Orpha: {"Disease": Disnam, "Score": Score}})
                uniprotdisease[Uniprot][Orpha] = {"Disease": Disnam, "Score": Score}
            except:
                pass
    return uniprotdisease

# Main code

### Prepare datasets for graph

In [5]:
#Define default proteome 
start = time.time()
proteome = parseUniprot(Uniprot_hsap)
#Update gene to likely recent id
genemap  = unitoGene(Genemap)
for i in proteome:
    try:
        proteome[i]["Gene"] = genemap[i][0]
    except:
        pass
#Generate dataframe from dic
initdf = pd.DataFrame.from_dict(proteome, orient='index')
print("Time taken | Initial proteome = ", round(time.time() - start, 2))

#Annotate with OpenTarget intel
start = time.time()
initdf = parseOTtarget(OT_targets, initdf)
initdf = initdf.astype(str)

#Lookup dic of ensemble gene ids to uniprot
ensgenetouni = {}
for index, row in initdf.iterrows():
    val = row["Ensembl_gene"]
    for i in row["Ensembl_gene"][1:-1].replace(" ", "").split(","):
        ensgenetouni.setdefault(i.replace("\'", ""), index)
print("Time taken | OpenTargets targets intel = ", round(time.time() - start, 2))
        
#Generate rare disease crossreference
start = time.time()
diseasexref = parseOrpha(Orpha_en)
print("Time taken | Rare disese xref = ", round(time.time() - start, 2))

#Read in disease associations
start = time.time()
OT_direct = parseOTdisease(OT_directevi)
OT_indir  = parseOTdisease(OT_indireevi)
print("Time taken | OpenTargets disease intel = ", round(time.time() - start, 2))

#Generate dictionaries of Uniprot to disease assocations
start = time.time()
dirdic = assocUniprotDisease(OT_direct, ensgenetouni, diseasexref)
inddic = assocUniprotDisease(OT_indir, ensgenetouni, diseasexref)
print("Time taken | Uniprot disease association = ", round(time.time() - start, 2))

Time taken | Initial proteome =  1.27
Time taken | OpenTargets targets intel =  5.79
Time taken | Rare disese xref =  0.76
Time taken | OpenTargets disease intel =  14.85
Time taken | Uniprot disease association =  59.03


### Convert data to graph format

### Test work

In [7]:
display(initdf)

Unnamed: 0,Gene,Name,Seq,Ensembl_gene,Ensembl_trans,Description,Subcell_loc,DB_entries,Bio_path,Tx_approaches
A0A075B6U7,TRAJ23,T cell receptor alpha joining 23 (Fragment),XIYNQGGKLIFGQGTELSVKP,['ENSG00000211866'],[['ENST00000390514']],[],[],"[[{'id': '12052', 'source': 'HGNC'}]]",[],"[[{'modality': 'SM', 'id': 'Approved Drug', 'v..."
A0A2R8YED5,OR5BS1P,Olfactory receptor,MEVSNMTTVTVFILLGLSNNPQVQALLFVLFLVIYLLTLLGNLLMV...,['ENSG00000198678'],[['ENST00000328207']],[],[],"[[{'id': '19627', 'source': 'HGNC'}]]",[],"[[{'modality': 'SM', 'id': 'Approved Drug', 'v..."
A2RRL7,TMEM213,Transmembrane protein 213,MQRLPAATRATLILSLAFASLHSACSAEASSSNSSSLTAHHPDPGT...,['ENSG00000214128'],"[['ENST00000458494', 'ENST00000442682', 'ENST0...",[[]],[[{'location': 'Membrane ; Single-pass type I ...,"[[{'id': '27220', 'source': 'HGNC'}, {'id': 'I...",[],"[[{'modality': 'SM', 'id': 'Approved Drug', 'v..."
A6NHS1,YK042,Putative uncharacterized protein ENSP00000347057,MVLLAGTRPQGGEARCMIPPPPSPLLGAQVEEDRTEFKEFQDFSSL...,[],[],[],[],[],[],[]
A6NL46,YF016,Putative UPF0607 protein ENSP00000332738,MRLCLIPWNTTPHRVLPPVVWSAPSRKKPVLSARNSMMFGHLSPVR...,[],[],[],[],[],[],[]
...,...,...,...,...,...,...,...,...,...,...
Q9UPU3,SORCS3,VPS10 domain-containing receptor SorCS3,MEAARTERPAGRPGAPLVRTGLLLLSTWVLAGAEITWDATGGPGRP...,['ENSG00000156395'],"[['ENST00000393176', 'ENST00000369701']]",[[]],[[{'location': 'Membrane; Single-pass type I m...,"[[{'id': '16699', 'source': 'HGNC'}, {'id': 'I...",[],"[[{'modality': 'SM', 'id': 'Approved Drug', 'v..."
Q9Y2Z9,COQ6,"Ubiquinone biosynthesis monooxygenase COQ6, mi...",MAARLVSRCGAVRAAPHSGPLVSWRRWSGASTDTVYDVVVSGGGLV...,['ENSG00000119723'],"[['ENST00000553922', 'ENST00000557780', 'ENST0...",[['FAD-dependent monooxygenase required for th...,[[{'location': 'Mitochondrion inner membrane ;...,"[[{'id': '20233', 'source': 'HGNC'}, {'id': 'R...","[[{'pathwayId': 'R-HSA-2142789', 'pathway': 'U...","[[{'modality': 'SM', 'id': 'Approved Drug', 'v..."
Q9Y328,NSG2,Neuronal vesicle trafficking-associated protein 2,MVKLNSNPSEKGTKPPSVEDGFQTVPLITPLEVNHLQLPAPEKVIV...,['ENSG00000170091'],"[['ENST00000517902', 'ENST00000517587', 'ENST0...",[[]],[[{'location': 'Membrane ; Single- pass type I...,"[[{'id': '24955', 'source': 'HGNC'}, {'id': 'I...",[],"[[{'modality': 'SM', 'id': 'Approved Drug', 'v..."
Q9Y5S2,CDC42BPB,Serine/threonine-protein kinase MRCK beta,MSAKVRLKKLEQLLLDGPWRNESALSVETLLDVLVCLYTECSHSAL...,['ENSG00000198752'],"[['ENST00000559790', 'ENST00000558321', 'ENST0...",[['Serine/threonine-protein kinase which is an...,"[[{'location': 'Cytoplasm', 'source': 'uniprot...","[[{'id': '1738', 'source': 'HGNC'}, {'id': '3Q...","[[{'pathwayId': 'R-HSA-9013409', 'pathway': 'R...","[[{'modality': 'SM', 'id': 'Approved Drug', 'v..."


In [15]:
out = parseOrpha(Orpha_en)
display(out)
print(len(out))
print(out['162516'])
print(out.keys())

Unnamed: 0,166024,58,166032,61,166029,166038,93,166035,585,118,...,619948,619979,619972,619249,619284,619340,619363,619360,619233,619238
Name,Multiple epiphyseal dysplasia-macrocephaly-fac...,Alexander disease,Multiple epiphyseal dysplasia-miniepiphyses sy...,Alpha-mannosidosis,Multiple epiphyseal dysplasia-severe proximal ...,"Metaphyseal chondrodysplasia, Kaitila type",Aspartylglucosaminuria,Brachydactyly-short stature-retinitis pigmento...,Multiple sulfatase deficiency,Beta-mannosidosis,...,Early-onset autoimmunity-autoinflammation-immu...,Developmental delay-immunodeficiency-leukoence...,CADINS disease,Rare hereditary connective tissue disease,Narcolepsy,Inherited hematologic cancer-predisposing synd...,NOCARH syndrome,NON RARE IN EUROPE: Isolated hereditary persis...,Hereditary persistence of fetal hemoglobin-int...,Rare hereditary autoinflammatory disease
Xref,"{'ICD-11': {'ID': '212360', 'Mapping': 'NTBT (...","{'OMIM': {'ID': '3649', 'Mapping': 'E (Exact m...","{'MeSH': {'ID': '222274', 'Mapping': 'E (Exact...","{'MedDRA': {'ID': '223996', 'Mapping': 'E (Exa...","{'MeSH': {'ID': '222275', 'Mapping': 'E (Exact...","{'MeSH': {'ID': '222276', 'Mapping': 'E (Exact...","{'MedDRA': {'ID': '104504', 'Mapping': 'E (Exa...","{'ICD-10': {'ID': '193810', 'Mapping': 'NTBT (...","{'MeSH': {'ID': '222277', 'Mapping': 'E (Exact...","{'GARD': {'ID': '240540', 'Mapping': 'E (Exact...",...,"{'ICD-10': {'ID': '210932', 'Mapping': 'NTBT (...","{'OMIM': {'ID': '214686', 'Mapping': 'E (Exact...","{'ICD-10': {'ID': '211707', 'Mapping': 'NTBT (...","{'UMLS': {'ID': '219271', 'Mapping': 'E (Exact...","{'MedDRA': {'ID': '224351', 'Mapping': 'E (Exa...","{'UMLS': {'ID': '219273', 'Mapping': 'E (Exact...","{'UMLS': {'ID': '219274', 'Mapping': 'E (Exact...","{'ICD-10': {'ID': '206924', 'Mapping': 'E (Exa...","{'UMLS': {'ID': '221826', 'Mapping': 'E (Exact...","{'UMLS': {'ID': '247950', 'Mapping': 'E (Exact..."


2
Name    Isolated congenital nasal pyriform aperture st...
Xref    {'UMLS': {'ID': '216412', 'Mapping': 'E (Exact...
Name: 162516, dtype: object
Index(['166024', '58', '166032', '61', '166029', '166038', '93', '166035',
       '585', '118',
       ...
       '619948', '619979', '619972', '619249', '619284', '619340', '619363',
       '619360', '619233', '619238'],
      dtype='object', length=11023)


In [6]:
display(pd.DataFrame.from_dict(dirdic))

Unnamed: 0,Q96QS3,P50549,Q9Y6N9,Q96CC6,Q9NY47,Q9P0K1,P14867,P51787,P54803,Q8TDD5,...,Q9H347,Q8TDM5,Q8N715,Q86UU5,Q496A3,O95396,Q12962,Q8NHS1,Q401N2,Q9ULG3
382,{'Disease': 'Guanidinoacetate methyltransferas...,{'Disease': 'Guanidinoacetate methyltransferas...,{'Disease': 'Guanidinoacetate methyltransferas...,{'Disease': 'Guanidinoacetate methyltransferas...,{'Disease': 'Guanidinoacetate methyltransferas...,{'Disease': 'Guanidinoacetate methyltransferas...,{'Disease': 'Guanidinoacetate methyltransferas...,{'Disease': 'Guanidinoacetate methyltransferas...,{'Disease': 'Guanidinoacetate methyltransferas...,{'Disease': 'Guanidinoacetate methyltransferas...,...,,,,,,,,,,
391411,"{'Disease': 'Atypical juvenile parkinsonism', ...",,,,"{'Disease': 'Atypical juvenile parkinsonism', ...",,"{'Disease': 'Atypical juvenile parkinsonism', ...","{'Disease': 'Atypical juvenile parkinsonism', ...","{'Disease': 'Atypical juvenile parkinsonism', ...",,...,,,,,,,,,,
397725,{'Disease': 'COASY protein-associated neurodeg...,,,,,,,,{'Disease': 'COASY protein-associated neurodeg...,,...,,,,,,,,,,
397933,{'Disease': 'Severe intellectual disability-pr...,,,,,,,,,,...,,,,,,,,,,
397946,{'Disease': 'Autosomal spastic paraplegia type...,{'Disease': 'Autosomal spastic paraplegia type...,{'Disease': 'Autosomal spastic paraplegia type...,{'Disease': 'Autosomal spastic paraplegia type...,{'Disease': 'Autosomal spastic paraplegia type...,,{'Disease': 'Autosomal spastic paraplegia type...,{'Disease': 'Autosomal spastic paraplegia type...,{'Disease': 'Autosomal spastic paraplegia type...,{'Disease': 'Autosomal spastic paraplegia type...,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
100974,,,,,,,,,,,...,,,,,,,,,,
2699,,,,,,,,,,,...,,,,,,,,,,
254920,,,,,,,,,,,...,,,,,,,,,,
502444,,,,,,,,,,,...,,,,,,,,,,


In [170]:
display(initdf.loc[["P04637"]])

Unnamed: 0,Gene,Name,Seq,Ensembl_gene,Ensembl_trans,Description,Subcell_loc,DB_entries,Bio_path,Tx_approaches
P04637,TP53,Cellular tumor antigen p53,MEEPQSDPSVEPPLSQETFSDLWKLLPENNVLSPLPSQAMDDLMLS...,['ENSG00000141510'],"[['ENST00000359597', 'ENST00000622645', 'ENST0...",[['Acts as a tumor suppressor in many tumor ty...,"[[{'location': 'Cytoplasm', 'source': 'uniprot...","[[{'id': '11998', 'source': 'HGNC'}, {'id': '1...","[[{'pathwayId': 'R-HSA-3232118', 'pathway': 'S...","[[{'modality': 'SM', 'id': 'Approved Drug', 'v..."


In [171]:
display(initdf.loc[["Q96QS3"]])

Unnamed: 0,Gene,Name,Seq,Ensembl_gene,Ensembl_trans,Description,Subcell_loc,DB_entries,Bio_path,Tx_approaches
Q96QS3,ARX,Homeobox protein ARX,MSNQYQEEGCSERPECKSKSPTLLSSYCIDSILGRRSPCKMRLLGA...,['ENSG00000004848'],"[['ENST00000379044', 'ENST00000637394', 'ENST0...",[['Transcription factor required for normal br...,"[[{'location': 'Nucleus', 'source': 'uniprot'}]]","[[{'id': '18060', 'source': 'HGNC'}, {'id': 'I...",[],"[[{'modality': 'SM', 'id': 'Approved Drug', 'v..."


In [42]:
OT_direct['Orphanet_397692']

[{'diseaseId': 'Orphanet_397692',
  'targetId': 'ENSG00000090534',
  'score': 0.3695798546847018,
  'evidenceCount': 1},
 {'diseaseId': 'Orphanet_397692',
  'targetId': 'ENSG00000090534',
  'score': 0.3695798546847018,
  'evidenceCount': 1},
 {'diseaseId': 'Orphanet_397692',
  'targetId': 'ENSG00000102977',
  'score': 0.6185329512431468,
  'evidenceCount': 5},
 {'diseaseId': 'Orphanet_397692',
  'targetId': 'ENSG00000117400',
  'score': 0.3695798546847018,
  'evidenceCount': 1}]

In [181]:
display(Diseasexref)

Unnamed: 0,Name,Xref
166024,Multiple epiphyseal dysplasia-macrocephaly-fac...,"{'ICD-11': {'ID': '212360', 'Mapping': 'NTBT (..."
58,Alexander disease,"{'OMIM': {'ID': '3649', 'Mapping': 'E (Exact m..."
166032,Multiple epiphyseal dysplasia-miniepiphyses sy...,"{'MeSH': {'ID': '222274', 'Mapping': 'E (Exact..."
61,Alpha-mannosidosis,"{'MedDRA': {'ID': '223996', 'Mapping': 'E (Exa..."
166029,Multiple epiphyseal dysplasia-severe proximal ...,"{'MeSH': {'ID': '222275', 'Mapping': 'E (Exact..."
...,...,...
619340,Inherited hematologic cancer-predisposing synd...,"{'UMLS': {'ID': '219273', 'Mapping': 'E (Exact..."
619363,NOCARH syndrome,"{'UMLS': {'ID': '219274', 'Mapping': 'E (Exact..."
619360,NON RARE IN EUROPE: Isolated hereditary persis...,"{'ICD-10': {'ID': '206924', 'Mapping': 'E (Exa..."
619233,Hereditary persistence of fetal hemoglobin-int...,"{'UMLS': {'ID': '221826', 'Mapping': 'E (Exact..."


In [18]:
display(Diseasexref.loc["382"])

Name        Guanidinoacetate methyltransferase deficiency
Xref    {'GARD': {'ID': '241305', 'Mapping': 'E (Exact...
Name: 382, dtype: object