In [5]:
%run "../scripts/data_processing.py"
%run "../scripts/node_edge_generation.py"
%run "../scripts/graph_functions.py"

In [6]:
from pyteomics import fasta
import pandas as pd
import numpy as np
import re
import time
from datetime import datetime
import json
import os
import subprocess
import csv
from neo4j import GraphDatabase

In [7]:
#Data path
datadir = os.path.abspath("../../Data/")

#Datasets
OT_targets    = datadir+"/OpenTargets/targets" #OpenTargets | "targets" data
OT_directevi  = datadir+"/OpenTargets/associationByOverallDirect" #OpenTargets | Disease association - Direct evidence data
OT_indireevi  = datadir+"/OpenTargets/associationByOverallIndirect" #OpenTargets | Disease association - Indirect evidence data
diseasedb     = datadir+"/OpenTargets/Disease" #OpenTargets "Disease" intel

Uniprot_hsap  = datadir+"/Uniprotdb/human_proteome/UP000005640_9606.fasta" #Uniprot h. sapiens proteome
Unisprot      = datadir+"/Uniprotdb/uniprot_sprot_human.dat" #Uniprot core dataset, contains functional intel
Genemap       = datadir+"/Uniprotdb/HUMAN_9606_idmapping.dat" #Uniprot gene ID mapping

Orpha_en      = datadir+"/Disease_ontology/orpha_en_product1.json" #Orphanet rare disease xef

# Functions

## Data input

In [None]:
def genUniprot(unidata, genein):
    #Generate Uniprot <> Gene dictionary
    with open(genein) as f:
        read = f.readlines()
    unigene = {}
    for i in read:
        splitr = i.split("\t")
        if splitr[1] == "Gene_Name" or splitr[1] == "Gene_Synonym":
            unigene.setdefault(splitr[0], [])
            unigene[splitr[0]].append(splitr[2].strip("\n"))

    #Generate df of human proteome & dic of gene names to uniprot ids
    unidic    = {}
    fastain   = fasta.UniProt(unidata)
    count = 0
    for uni in fastain:
        count += 1
        unid = uni[0]['id']
        name = uni[0]['name']
        try:
            gene = unigene[unid][0]
        except:
            gene = uni[0]['gene_id']
        seq  = uni[1]
        unidic.setdefault(unid, {"Uniprot": unid, "Gene": gene, "Name": name, "Seq": seq})

    unidf = pd.DataFrame.from_dict(unidic, orient='index')

    return unidf

def parseOTtarget(data_dir, df):
    #Parse OpenTargets "targets" intel, return df
    #Add new intel cols to df
    intelcol = {"Ensembl_gene": 'id', \
                "Ensembl_trans": 'transcriptIds', \
                "Description": "functionDescriptions", \
                "Subcell_loc": "subcellularLocations", \
                "DB_entries": "dbXrefs", \
                "Bio_path": "pathways", \
                "Tx_approaches": "tractability"}
    for col in intelcol:
        df[col] = [list() for x in range(len(df.index))]
    
    data = []
    #Read in multi part .json to data list 
    for file in os.listdir(data_dir):
        path = data_dir + "/" + str(file).replace("._", "")
        with open(path, "r") as f:
            for line in f:
                data.append(json.loads(line))
                
    #Retrieve intel from protein coding entries, annotate df
    ensemtouni = {}
    for entry in data:
        try:
            entid = entry['proteinIds'][0]['id']
        except:
            entid = ""

        if entid in df.index:
            for j in intelcol:
               #if intelcol[j] in entry and j != "Bio_path":
                if intelcol[j] in entry and j not in ["Bio_path", "Subcell_loc"]:
                    df.loc[entid, j].append(entry[intelcol[j]])
                #Simplify biological pathways for easier querying
                elif intelcol[j] in entry and j == "Bio_path":
                    for path in entry[intelcol[j]]:
                        df.loc[entid, j].append([path['pathwayId'], path['pathway']])
                elif j == "Ensembl_gene":
                    ensemtouni.setdefault(entry[intelcol[j]], entid) 
                elif intelcol[j] in entry and j == "Subcell_loc":
                    allsubloc = []
                    if len(entry[intelcol[j]]) > 0:
                        for sub in entry[intelcol[j]]:
                            allsubloc.append(sub["location"].lower())   
                    df.loc[entid, j].append(allsubloc[0:])
                   # df.loc[entid, j] = allsubloc
                    
    
    return df, ensemtouni

def parseOTdisease(dataloc):
    #Parse OT disease <> protein evidence
    data = {}
    for file in os.listdir(dataloc):
        if ".json" in str(file):
            with open(dataloc + "/" + str(file).replace("._", "")) as f:
                for line in f.readlines():
                    linedict = json.loads(line)
                    data.setdefault(linedict["diseaseId"], [linedict])
                    data[linedict["diseaseId"]].append(linedict)
    return data

def findDisease(database):
    #Parse high level OT disease intel
    data = {}
    count = 0
    for file in os.listdir(database):
        if ".json" in str(file):
            count += 1
            try:
                with open(database + "/" + str(file)) as f:
                    for line in f.readlines():
                        linedict = json.loads(line)
                        data.setdefault(linedict["id"], linedict)
            except:
                pass

    return data

def funcGen(unisprot):
    #Return a dictionary of Uniprot to function annotations
    outdic = {}
    with open(unisprot, "r") as f:
        infile = f.read().replace(";", " ")
        splitin= infile.split("//\n")
    #Iterate per entry, split block, retrieve Uniprot + individual functions
    for uni in splitin:
        count = 0
        allfunc = [line for line in uni.split("\n") if line.startswith('FT')]
        allac = [line for line in uni.split("\n") if line.startswith('AC')]
        try:
            uniprot = re.search(r"\bAC\s+([A-Za-z0-9]+)", uni).group().split()[1]
        except:
            uniprot = ""
        funcdic = {}
        func = {}
        curannot = ""
        #Iterate over functions, append to out dic
        for i in allfunc:
            match = re.match(r"^\S+(\s+)\S+", i)
            if len(match.group(1)) == 3:
                funcdic.setdefault(count, func)
                curannot = ""
                spliti = i.split()
                count += 1
                nums = re.findall(r'\d+', spliti[-1])
                nums = [int(num) for num in nums]
                func = {"type": spliti[1].replace(";", " "), "range": nums, "note": "", "evidence": "", "id": ""}
            else:
                if re.search(r"(?<=\/)\w+=+", i) != None:
                    search = re.search(r"(?<=\/)\w+=+", i)
                    curannot = search.group()[:-1]
                    func[curannot] = [i.split("=")[1].replace("\"", "")] 
                    spliti = i.split("/id=")
                else:
                    func[curannot] = func[curannot][0]+" ".join(i.split()[1:])
        outdic.setdefault(uniprot, funcdic)
    return outdic

def parseOrpha(datain):
    ##TO-DO
        #Alter output to df with cols = ontologies    
    #Read in Orphanet json, return dataframe of Orpha code / Name / other database crossreferences
    with open(datain) as f:
        dataj = json.load(f)
    outdic = {}
    for i in dataj["JDBOR"][0]["DisorderList"][0]["Disorder"]:
        orpha = i["OrphaCode"]
        nam   = i["Name"][0]["label"]
        xrefs = {}
        try:
            for j in i["ExternalReferenceList"][0]['ExternalReference']:
                xrefs.setdefault(j['Source'], {"ID": j['id'], "Mapping": j['DisorderMappingRelation'][0]["Name"][0]["label"]})
        except:
            pass
        outdic.setdefault(orpha, {"Name": nam, "Xref": xrefs})

    outdf = pd.DataFrame.from_dict(outdic)

    return outdf.transpose() 

def assocUniprotDisease(datadic, ensgenetouni, diseasexref):
    #Generate dictionary linking Uniprot ID to disease associations + scores
    uniprotdisease = {}
    for i in datadic:
        for ent in datadic[i]:
            try:
                Orpha   = int(ent['diseaseId'].split("_")[1])
                Score   = round(ent['score'], 3)
                Uniprot = ensgenetouni[ent['targetId']]
                Disnam  = diseasexref.loc[str(Orpha)]["Name"]
                uniprotdisease.setdefault(Uniprot, {Orpha: {"Disease": Disnam, "Score": Score}})
                uniprotdisease[Uniprot][Orpha] = {"Disease": Disnam, "Score": Score}
            except:
                pass
    return uniprotdisease

## Node / Edge generation

In [80]:
def genProtNode(df):
    #Generate Uniprot Protein node
    count = 0
    outcsv = [["id:ID", ":LABEL", "uniprot", "gene", "name", "seq", "ens_gene", "ens_trans", "description", "db_entries", "bio_path", "tx_approach", "subcellular_location"]]
    nodecodec = {}
    for index, row in df.iterrows():
        count += 1
        outcsv.append(["P"+str(count), "Protein", row["Uniprot"], row['Gene'], row['Name'], row['Seq'], row['Ensembl_gene'], row['Ensembl_trans'],\
                        row['Description'], row['DB_entries'], row['Bio_path'], row['Tx_approaches'], row["Subcell_loc"]])
        nodecodec.setdefault(index, "P"+str(count))
    return outcsv, nodecodec

def genFuncNode(funcdic):
    #Generate Uniprot Function node
    count = 0
    outcsv = [["id:ID", ":LABEL", "type",  "range", "alt_id", "evidence", "note"]]
    nodecodec = {}
    for uni in funcdic:
        for func in funcdic[uni]: 
            if len(funcdic[uni][func]) > 0:
                count += 1
                outcsv.append(["F"+str(count), "Function", funcdic[uni][func]["type"], funcdic[uni][func]["range"], funcdic[uni][func]["id"], funcdic[uni][func]["evidence"], funcdic[uni][func]["note"]])
                nodecodec.setdefault("F"+str(count), uni)
    return outcsv, nodecodec

def genDisNode(disdic, disedge):
    #Generate OpenTarget disease node
    count = 0
    outcsv = [["id:ID", ":LABEL", "id_ont", "name", "description"]]
    nodecodec = {}
    for dis in disdic:
        if dis in disedge:
            try:
                desc = disdic[dis]["description"]
            except:
                desc = ""
            count += 1
            outcsv.append(["D"+str(count), "Disease", dis, disdic[dis]["name"], desc.replace("\n", " ")])
            nodecodec.setdefault(dis, "D"+str(count))
    return outcsv, nodecodec

def genSubcellNode(df):
    #Subcellular localisation annotations for broad categories
    uniloc = {"Intracellular": [], "Extracellular": [], "Membrane": []}

    #Identify unique locations
    indiv_loc = {}
    for loc_list in df["Subcell_loc"].values:
        for ent in loc_list:
            for loc in ent:
                if loc not in indiv_loc:
                    indiv_loc.setdefault(loc, "")
                    
    #Add broad subcellular locations based on keywords
    for loc in indiv_loc:
        if any( i in loc.lower().split() for i in ["secreted", "secret", "extracellular"]):
            indiv_loc[loc] = "extracellular"
        else:
            if any( i in loc.lower().split() for i in ["membrane", "wall", "lipid"]):
                indiv_loc[loc] = "membrane"
            else:
                indiv_loc[loc] = "intracellular"
    
    #Generate Subcellular Location node
    count = 0
    outcsv = [["id:ID", ":LABEL", "location", "intra_mem_extra"]]
    nodecodec = {}
    for loc in indiv_loc:
        count += 1
        outcsv.append(["S"+str(count), "Subcellular", loc, indiv_loc[loc]])
        nodecodec.setdefault(loc, "S"+str(count))

    return outcsv, nodecodec

def genProtSubEdge(protcod, subcelcod, df):
    #Generate Protein <> Subcellular location edges
    outcsv = [[":START_ID",":END_ID", ":TYPE"]]
    for sub in subcelcod:
        subdf = df[df["Subcell_loc"].apply(lambda x: sub in x[0] if x else False)]
        for index, row in subdf.iterrows():
            outcsv.append([protcod[row["Uniprot"]], subcelcod[sub], "is_location"])
    return outcsv

def genProtFuncEdge(funcod, protcod):
    #Generate Protein <> Functional edges
    outcsv = [[":START_ID",":END_ID", ":TYPE"]]
    count = 0
    for i in funcod:
        if funcod[i] in protcod:
            outcsv.append([i, protcod[funcod[i]], "is_feature"])

    return outcsv

def genDisProtEdge(disdic, protcod, discod, ensdic):
    #Generate Disease <> Functional edges
    outcsv = [[":START_ID",":END_ID", ":TYPE", "score"]]
    for i in disdic:
        for j in disdic[i]:
            if j["targetId"] in ensdic:
                outcsv.append([protcod[ensdic[j["targetId"]]], discod[i], "direct_evidence", float(j["score"])])
    return outcsv

## Graph read / write

In [None]:
def nodeToCSVprep(genenode, diseasenode):
    #Generate out lists for .csv out
    geneout = [["id:ID", ":LABEL", "name", "ensemble_id"]]
    for i in genenode:
        geneout.append([genenode[i]["ID"], "Gene", i, genenode[i]["Ensemble"]])
        
    disout = [["id:ID", ":LABEL", "name", "uri"]]
    for i in diseasenode:
        disout.append([diseasenode[i]["ID"], "Disease", i, diseasenode[i]["URI"]])
        
    return geneout, disout

def writecsv(infile, outfile):
    #Write out csv
    with open(outfile, 'w', newline='') as csvfile:
        writer = csv.writer(csvfile, delimiter=";")
        writer.writerows(infile)

def run_query(query):
    #Submit query to graph database, retrieve output as df
    with driver.session() as session:
        result = session.run(query)
        rows = [record.data() for record in result]
        df = pd.DataFrame(rows)
        return df

# Main script

## Data prep

In [9]:
#Read in Uniprot proteome intel
start = time.time()
unidf = genUniprot(Uniprot_hsap, Genemap)
print("Time taken | Uniprot initial proteome = ", round(time.time() - start, 2))

#Read in OT target intel - DONE
start = time.time()
unidf, ensemtouni = parseOTtarget(OT_targets, unidf)
print("Time taken | OpenTargets targets intel = ", round(time.time() - start, 2))

#Lookup dic of ensemble gene ids to uniprot
start = time.time()
ensgenetouni = {}
genetouni    = {}
for index, row in unidf.iterrows():
    val = row["Ensembl_gene"]
   # print(val)
    for i in row["Ensembl_gene"]:
        ensgenetouni.setdefault(i.replace("\'", ""), index)
    genetouni.setdefault(row["Gene"], index)
print("Time taken | Ensemble <> Uniprot | Gene <> Uniprot lookup dic = ", round(time.time() - start, 2))

#Read in OT functional intel 
start = time.time()
funcdic = funcGen(Unisprot)
print("Time taken | Uniprot function intel = ", round(time.time() - start, 2))

#Read in OT disease intel
start = time.time()
diseasego = findDisease(diseasedb) 
print("Time taken | OpenTargets init disease intel = ", round(time.time() - start, 2))

#Read in disease associations
start = time.time()
OT_direct = parseOTdisease(OT_directevi)
OT_indir  = parseOTdisease(OT_indireevi)
print("Time taken | OpenTargets dir/ind disease intel = ", round(time.time() - start, 2))

#Generate rare disease crossreference
start = time.time()
diseasexref = parseOrpha(Orpha_en)
print("Time taken | Rare disese xref = ", round(time.time() - start, 2))

#Generate dictionaries of Uniprot to disease assocations
start = time.time()
dirdic = assocUniprotDisease(OT_direct, ensgenetouni, diseasexref)
inddic = assocUniprotDisease(OT_indir, ensgenetouni, diseasexref)
print("Time taken | Uniprot disease association = ", round(time.time() - start, 2))

Time taken | Uniprot initial proteome =  1.35
Time taken | OpenTargets targets intel =  7.76
Time taken | Ensemble <> Uniprot | Gene <> Uniprot lookup dic =  0.32
Time taken | Uniprot function intel =  8.57
Time taken | OpenTargets init disease intel =  0.21
Time taken | OpenTargets dir/ind disease intel =  18.5
Time taken | Rare disese xref =  1.1
Time taken | Uniprot disease association =  65.88


## Node / Edge generation

In [10]:
#Generate nodes / edges csv
protnodes, protcodec = genProtNode(unidf)
funcnodes, funccodec = genFuncNode(funcdic)
disnodes, discodec   = genDisNode(diseasego, OT_direct)
subcelnodes, subcelcodec = genSubcellNode(unidf)
profuncedge          = genProtFuncEdge(funccodec, protcodec)
disprotedge          = genDisProtEdge(OT_direct, protcodec, discodec, ensgenetouni)
prosubedge           = genProtSubEdge(protcodec, subcelcodec, unidf)

#transnod, transedge  = genscRNANodeEdge(scdf, genetouni, protcodec)

In [81]:
#Write out node + edge csv files
if os.path.exists("./Results") == False:
    os.mkdir("./Results")

#Write out nodes / edges csv
writecsv(protnodes, "./Results/protnodes.csv")
writecsv(disnodes, "./Results/disnodes.csv")
writecsv(funcnodes, "./Results/funcnodes.csv")
writecsv(subcelnodes, "./Results/subnodes.csv")

writecsv(profuncedge, "./Results/profuncedge.csv")
writecsv(disprotedge, "./Results/disprotedge.csv")
writecsv(prosubedge, "./Results/prosubedge.csv")

## Start graph

In [82]:
def startGraph(passloc, directory):
    #Password
    with open(passloc, "r") as f:
        password = f.read()
    
    curdir = os.path.abspath(directory)
    #Commands for stopping / generating / starting neo4j
    cmd0 = ["sudo", "-S", "neo4j", "stop"]
    cmd1 = [ \
            "sudo", "-S", "neo4j-admin", "database", "import", "full", "neo4j", \
            "--nodes="+ curdir+"/protnodes.csv", \
            "--nodes="+ curdir+"/disnodes.csv", \
            "--nodes="+ curdir+"/funcnodes.csv", \
            "--nodes="+ curdir+"/subnodes.csv", \
            "--relationships="+ curdir+"/profuncedge.csv", \
            "--relationships="+ curdir+"/disprotedge.csv", \
            "--relationships="+ curdir+"/prosubedge.csv", \
            "--delimiter=;", \
            "--array-delimiter=|", \
            "--overwrite-destination", \
            "--verbose"
            ]
    cmd2 = ["sudo", "-S", "neo4j", "start", "--verbose"]
    
    #Stop server
    result = subprocess.run(cmd0, input=password+"\n", capture_output=True, text=True)
    if result.returncode == 0:
        print("Graph stopped")
    else:
        print(f"Stop failed with error:\n{result.stderr}")
    
    #Generate graph
    result = subprocess.run(cmd1, input=password+"\n", capture_output=True, text=True)
    if result.returncode == 0:
        print("Database import successful!")
    else:
        print(f"Import failed with error:\n{result.stderr}")
    
    #Start graph
    result = subprocess.run(cmd2, input=password+"\n", capture_output=True, text=True)
    if result.returncode == 0:
        print("Graph started")
    else:
        print(f"Start failed with error:\n{result.stderr}")


startGraph("./pass_ent.txt", "./Results/")

Graph stopped
Database import successful!
Graph started


In [5]:
startGraph("./pass_ent.txt", "./Results/")

Graph stopped
Database import successful!
Graph started


## Run cypher

In [11]:
uri = "bolt://localhost:7687" 
driver = GraphDatabase.driver(uri)

#“Which traits are associated with gene $gene’s_name?”
query_1 = [f'match (p:Protein)\n \
            where not p.subcellular_location contains "[]" \
            return p.uniprot as Uniprot, p.subcellular_location as Subcellular_location, p.seq as Sequence']
#print(f"\nWhich traits are associated with gene {gene_name}?")
results_1 = run_query(query_1[0], driver)
display(results_1)

#Close connection
driver.close()

Unnamed: 0,Uniprot,Subcellular_location,Sequence
0,A2RRL7,[['membrane ; single-pass type i membrane prot...,MQRLPAATRATLILSLAFASLHSACSAEASSSNSSSLTAHHPDPGT...
1,A8MYZ6,"[['cytoplasm', 'nucleus']]",MAAKLRAHQVDVDPDFAPQSRPRSCTWPLPQPDLAGDEDGALGAGV...
2,O14804,[['cell membrane ; multi-pass membrane protein']],MRAVFIQGAEEHPAAFCYQVNGSCPRTVHTLGIQLVIYLACAAGML...
3,O75459,"[['nucleoli fibrillar center', 'mitochondria',...",MGFLRRLIYRRRPMIYVESSEESSDEQPDEVESPTQSQDSTPAEER...
4,O75912,"[['cell projection, axon', 'cell projection, d...",MDAAGRGCHLLPLPAARGPARAPAAAAAAAASPPGPCSGAACAPSA...
...,...,...,...
18417,Q16526,"[['cytoplasm', 'nucleus', 'microtubules', 'nuc...",MGVNAVHWFRKGLRLHDNPALKECIQGADTIRCVYILDPWFAGSSN...
18418,Q3B7T1,"[['nucleus', 'nucleoplasm', 'centrosome', 'cyt...",MGDAKEAGAEGPPAGAAARGGLSLLSQGESEESSAQGSALFLGGNE...
18419,Q499Z4,"[['nucleus', 'nucleoplasm']]",MFATSGAVAAGKPYSCSECGKSFCYSSVLLRHERAHGGDGRFRCLE...
18420,Q5H8A4,[['endoplasmic reticulum membrane ; multi-pass...,MRLGSGTFATCCVAIEVLGIAVFLRGFFPAPVRSSARAEHGAEPPA...


In [None]:
call {match (d:Disease)-[de1:direct_evidence]-(p1:Protein)
where toFloat(de1.score) > 0.6
return p1, de1, d}
match (d)-[de2]-(p2:Protein)-[ife1:is_feature]-(f:Function)
where toLower(f.note) contains toLower(p1.gene)
and not p1.gene = p2.gene
and toFloat(de2.score) > 0.6
return p2.gene as Gene_1, p1.gene as Gene_2, d.name as Disease, collect(distinct f.note) as Function 
order by size(Function) desc limit 50 

# Function improvements

In [23]:
unidf

Unnamed: 0,Uniprot,Gene,Name,Seq,Ensembl_gene,Ensembl_trans,Description,Subcell_loc,DB_entries,Bio_path,Tx_approaches
A0A075B6U7,A0A075B6U7,TRAJ23,T cell receptor alpha joining 23 (Fragment),XIYNQGGKLIFGQGTELSVKP,[ENSG00000211866],[[ENST00000390514]],[],[],"[[{'id': '12052', 'source': 'HGNC'}]]",[],"[[{'modality': 'SM', 'id': 'Approved Drug', 'v..."
A0A2R8YED5,A0A2R8YED5,OR5BS1P,Olfactory receptor,MEVSNMTTVTVFILLGLSNNPQVQALLFVLFLVIYLLTLLGNLLMV...,[ENSG00000198678],[[ENST00000328207]],[],[],"[[{'id': '19627', 'source': 'HGNC'}]]",[],"[[{'modality': 'SM', 'id': 'Approved Drug', 'v..."
A2RRL7,A2RRL7,TMEM213,Transmembrane protein 213,MQRLPAATRATLILSLAFASLHSACSAEASSSNSSSLTAHHPDPGT...,[ENSG00000214128],"[[ENST00000458494, ENST00000442682, ENST000004...",[[]],[[membrane ; single-pass type i membrane prote...,"[[{'id': '27220', 'source': 'HGNC'}, {'id': 'I...",[],"[[{'modality': 'SM', 'id': 'Approved Drug', 'v..."
A6NHS1,A6NHS1,YK042,Putative uncharacterized protein ENSP00000347057,MVLLAGTRPQGGEARCMIPPPPSPLLGAQVEEDRTEFKEFQDFSSL...,[],[],[],[],[],[],[]
A6NL46,A6NL46,YF016,Putative UPF0607 protein ENSP00000332738,MRLCLIPWNTTPHRVLPPVVWSAPSRKKPVLSARNSMMFGHLSPVR...,[],[],[],[],[],[],[]
...,...,...,...,...,...,...,...,...,...,...,...
Q9UPU3,Q9UPU3,SORCS3,VPS10 domain-containing receptor SorCS3,MEAARTERPAGRPGAPLVRTGLLLLSTWVLAGAEITWDATGGPGRP...,[ENSG00000156395],"[[ENST00000393176, ENST00000369701]]",[[]],[[membrane; single-pass type i membrane protein]],"[[{'id': '16699', 'source': 'HGNC'}, {'id': 'I...",[],"[[{'modality': 'SM', 'id': 'Approved Drug', 'v..."
Q9Y2Z9,Q9Y2Z9,COQ6,"Ubiquinone biosynthesis monooxygenase COQ6, mi...",MAARLVSRCGAVRAAPHSGPLVSWRRWSGASTDTVYDVVVSGGGLV...,[ENSG00000119723],"[[ENST00000553922, ENST00000557780, ENST000003...",[[FAD-dependent monooxygenase required for the...,[[mitochondrion inner membrane ; peripheral me...,"[[{'id': '20233', 'source': 'HGNC'}, {'id': 'R...","[[R-HSA-2142789, Ubiquinol biosynthesis]]","[[{'modality': 'SM', 'id': 'Approved Drug', 'v..."
Q9Y328,Q9Y328,NSG2,Neuronal vesicle trafficking-associated protein 2,MVKLNSNPSEKGTKPPSVEDGFQTVPLITPLEVNHLQLPAPEKVIV...,[ENSG00000170091],"[[ENST00000517902, ENST00000517587, ENST000005...",[[]],[[membrane ; single- pass type ii membrane pro...,"[[{'id': '24955', 'source': 'HGNC'}, {'id': 'I...",[],"[[{'modality': 'SM', 'id': 'Approved Drug', 'v..."
Q9Y5S2,Q9Y5S2,CDC42BPB,Serine/threonine-protein kinase MRCK beta,MSAKVRLKKLEQLLLDGPWRNESALSVETLLDVLVCLYTECSHSAL...,[ENSG00000198752],"[[ENST00000559790, ENST00000558321, ENST000005...",[[Serine/threonine-protein kinase which is an ...,"[[cytoplasm, cell membrane ; peripheral membra...","[[{'id': '1738', 'source': 'HGNC'}, {'id': '3Q...","[[R-HSA-9013409, RHOJ GTPase cycle], [R-HSA-90...","[[{'modality': 'SM', 'id': 'Approved Drug', 'v..."


In [51]:
def genSubcellNode(df):
    #Subcellular localisation annotations for broad categories
    uniloc = {"Intracellular": [], "Extracellular": [], "Membrane": []}

    #Identify unique locations
    indiv_loc = {}
    for loc_list in df["Subcell_loc"].values:
        for ent in loc_list:
            for loc in ent:
                if loc not in indiv_loc:
                    indiv_loc.setdefault(loc, "")
                    
    #Add broad subcellular locations based on keywords
    for loc in indiv_loc:
        if any( i in loc.lower().split() for i in ["secreted", "secret", "extracellular"]):
            indiv_loc[loc] = "extracellular"
        else:
            if any( i in loc.lower().split() for i in ["membrane", "wall", "lipid"]):
                indiv_loc[loc] = "membrane"
            else:
                indiv_loc[loc] = "intracellular"
    
    #Generate Subcellular Location node
    count = 0
    outcsv = [["id:ID", ":LABEL", "location", "intra_mem_extra"]]
    nodecodec = {}
    for loc in indiv_loc:
        count += 1
        outcsv.append(["S"+str(count), "Subcellular", loc, indiv_loc[loc]])
        nodecodec.setdefault(loc, "S"+str(count))

    return outcsv, nodecodec

def genProtSubEdge(protcod, subcelcod, df):
    #Generate Protein <> Subcellular location edges
    outcsv = [[":START_ID",":END_ID", ":TYPE"]]
    for sub in subcelcod:
        subdf = df[df["Subcell_loc"].apply(lambda x: sub in x[0] if x else False)]
        for index, row in subdf.iterrows():
            outcsv.append([protcod[row["Uniprot"]], subcelcod[sub], "is_location"])
    return outcsv

subcelnodes, subcelcodec = genSubcellNode(unidf)
protnodes, protcodec = genProtNode(unidf)

prosubedge      = genProtSubEdge(protcodec, subcelcodec, unidf)

for i in subcelnodes:
    print(i)


['id:ID', ':LABEL', 'location', 'intra_mem_extra']
['S1', 'Subcellular', 'membrane ; single-pass type i membrane protein', 'membrane']
['S2', 'Subcellular', 'cytoplasm', 'intracellular']
['S3', 'Subcellular', 'nucleus', 'intracellular']
['S4', 'Subcellular', 'cell membrane ; multi-pass membrane protein', 'membrane']
['S5', 'Subcellular', 'nucleoli fibrillar center', 'intracellular']
['S6', 'Subcellular', 'mitochondria', 'intracellular']
['S7', 'Subcellular', 'nucleoplasm', 'intracellular']
['S8', 'Subcellular', 'cell projection, axon', 'intracellular']
['S9', 'Subcellular', 'cell projection, dendrite', 'intracellular']
['S10', 'Subcellular', 'cell junction, synapse, presynapse', 'intracellular']
['S11', 'Subcellular', 'cell junction, synapse, postsynapse', 'intracellular']
['S12', 'Subcellular', 'cell junction, synapse, postsynaptic density', 'intracellular']
['S13', 'Subcellular', 'cell junction, synapse, synaptic cell membrane', 'membrane']
['S14', 'Subcellular', 'cytoplasmic vesicle

In [63]:
display(unidf.iloc[100]["Subcell_loc"][0])

['secreted', 'cell membrane']

In [70]:
display(unidf[unidf["Subcell_loc"].apply(lambda x: 'cell membrane' in x[0] if x else False)])

Unnamed: 0,Uniprot,Gene,Name,Seq,Ensembl_gene,Ensembl_trans,Description,Subcell_loc,DB_entries,Bio_path,Tx_approaches
A0A075B6I3,A0A075B6I3,IGLV11-55,Probable non-functional immunoglobulin lambda ...,MALTPLLLLLLSHCTGSLSRPVLTQPPSLSASPGATARLPCTLSSD...,[ENSG00000211641],[[ENST00000390286]],[[Probable non-functional open reading frame (...,"[[secreted, cell membrane]]","[[{'id': '5886', 'source': 'HGNC'}, {'id': 'IP...",[],"[[{'modality': 'SM', 'id': 'Approved Drug', 'v..."
P29474,P29474,NOS3,"Nitric oxide synthase, endothelial",MGNLKSVAQEPGPPCGLGLGLGLGLCGKQGPATPAPEPSRAPASLL...,[ENSG00000164867],"[[ENST00000297494, ENST00000461406, ENST000004...",[[Produces nitric oxide (NO) which is implicat...,"[[cell membrane, membrane, caveola, cytoplasm,...","[[{'id': '7876', 'source': 'HGNC'}, {'id': '1M...","[[R-HSA-9009391, Extra-nuclear estrogen signal...","[[{'modality': 'SM', 'id': 'Approved Drug', 'v..."
Q9H7D0,Q9H7D0,DOCK5,Dedicator of cytokinesis protein 5,MARWIPTKRQKYGVAIYNYNASQDVELSLQIGDTVHILEMYEGWYR...,[ENSG00000147459],"[[ENST00000276440, ENST00000479547, ENST000004...",[[Guanine nucleotide exchange factor (GEF) for...,"[[cytoplasm, cell membrane, cytosol]]","[[{'id': '23476', 'source': 'HGNC'}, {'id': 'R...","[[R-HSA-983231, Factors involved in megakaryoc...","[[{'modality': 'SM', 'id': 'Approved Drug', 'v..."
O75038,O75038,PLCH2,"1-phosphatidylinositol 4,5-bisphosphate phosph...",MSGPWPSPDSRTKGTVAWLAEVLLWVGGSVVLSSEWQLGPLVERCM...,[ENSG00000149527],"[[ENST00000473964, ENST00000449969, ENST000003...",[[The production of the second messenger molec...,"[[cytoplasm, cell membrane]]","[[{'id': '29037', 'source': 'HGNC'}, {'id': 'R...","[[R-HSA-1855204, Synthesis of IP3 and IP4 in t...","[[{'modality': 'SM', 'id': 'Approved Drug', 'v..."
P09471,P09471,GNAO1,Guanine nucleotide-binding protein G(o) subuni...,MGCTLSAEERAALERSKAIEKNLKEDGISAAKDVKLLLLGAGESGK...,[ENSG00000087258],"[[ENST00000639787, ENST00000640560, ENST000006...",[[Guanine nucleotide-binding proteins (G prote...,"[[cell membrane, membrane]]","[[{'id': '4389', 'source': 'HGNC'}, {'id': '6F...","[[R-HSA-4086398, Ca2+ pathway]]","[[{'modality': 'SM', 'id': 'Approved Drug', 'v..."
...,...,...,...,...,...,...,...,...,...,...,...
Q5EBL8,Q5EBL8,PDZD11,PDZ domain-containing protein 11,MDSRIPYDDYPVVFLPAYENPPAWIPPHERVHHPDYNNELTQFLPR...,[ENSG00000120509],"[[ENST00000239666, ENST00000473667, ENST000003...",[[Mediates docking of ADAM10 to zonula adheren...,"[[[isoform 2]: secreted, [isoform 1]: cytoplas...","[[{'id': '28034', 'source': 'HGNC'}, {'id': 'R...","[[R-HSA-199220, Vitamin B5 (pantothenate) meta...","[[{'modality': 'SM', 'id': 'Approved Drug', 'v..."
Q641Q2,Q641Q2,WASHC2A,WASH complex subunit 2A,MMNRTTPDQELAPASEPVWERPWSVEEIRRSSQSWSLAADAGLLQF...,[ENSG00000099290],"[[ENST00000611324, ENST00000454806, ENST000002...",[[Acts at least in part as component of the WA...,"[[early endosome membrane, cell membrane, vesi...","[[{'id': '23416', 'source': 'HGNC'}, {'id': 'I...",[],"[[{'modality': 'SM', 'id': 'Approved Drug', 'v..."
O60437,O60437,PPL,Periplakin,MNSLFRKRNKGKYSPTVQTRSISNKELSELIEQLQKNADQVEKNIV...,[ENSG00000118898],"[[ENST00000588556, ENST00000589090, ENST000003...",[[Component of the cornified envelope of kerat...,"[[cell junction, desmosome, cytoplasm, cytoske...","[[{'id': '9273', 'source': 'HGNC'}, {'id': '4Q...","[[R-HSA-8851680, Butyrophilin (BTN) family int...","[[{'modality': 'SM', 'id': 'Approved Drug', 'v..."
P35125,P35125,USP6,Ubiquitin carboxyl-terminal hydrolase 6,MDMVENADSLQAQERKDILMKYDKGHRAGLPEDKGPEPVGINSSID...,[ENSG00000129204],"[[ENST00000575709, ENST00000574788, ENST000002...",[[Deubiquitinase with an ATP-independent isope...,"[[cell membrane, cytoplasm, endosome, vesicles]]","[[{'id': '12629', 'source': 'HGNC'}, {'id': 'I...",[],"[[{'modality': 'SM', 'id': 'Approved Drug', 'v..."


In [72]:
print(protcodec)

{'A0A075B6U7': 'P1', 'A0A2R8YED5': 'P2', 'A2RRL7': 'P3', 'A6NHS1': 'P4', 'A6NL46': 'P5', 'A8MUI8': 'P6', 'A8MUU9': 'P7', 'A8MVJ9': 'P8', 'A8MWP6': 'P9', 'A8MYZ6': 'P10', 'O00370': 'P11', 'O14804': 'P12', 'O75459': 'P13', 'O75912': 'P14', 'O94855': 'P15', 'P07099': 'P16', 'P0C7X3': 'P17', 'P0C851': 'P18', 'P0C879': 'P19', 'P12757': 'P20', 'P16118': 'P21', 'P19013': 'P22', 'P19838': 'P23', 'P30044': 'P24', 'P31275': 'P25', 'P31930': 'P26', 'P35716': 'P27', 'P41227': 'P28', 'P45381': 'P29', 'P48552': 'P30', 'P60153': 'P31', 'P78344': 'P32', 'Q02413': 'P33', 'Q0P670': 'P34', 'Q13069': 'P35', 'Q13387': 'P36', 'Q13480': 'P37', 'Q13519': 'P38', 'Q1T7F1': 'P39', 'Q3C1V9': 'P40', 'Q499Y3': 'P41', 'Q5MIZ7': 'P42', 'Q5SW24': 'P43', 'Q5TBC7': 'P44', 'Q5TCQ9': 'P45', 'Q5TYX0': 'P46', 'Q5VZ52': 'P47', 'Q64ET8': 'P48', 'Q6ISB3': 'P49', 'Q6P531': 'P50', 'Q6UWI4': 'P51', 'Q6ZSA8': 'P52', 'Q6ZSY5': 'P53', 'Q6ZTI0': 'P54', 'Q6ZUG5': 'P55', 'Q6ZVH6': 'P56', 'Q6ZVQ6': 'P57', 'Q70Z44': 'P58', 'Q7RTV5': 'P59

In [74]:
print(subcelcodec)

{'membrane ; single-pass type i membrane protein': 'S1', 'cytoplasm': 'S2', 'nucleus': 'S3', 'cell membrane ; multi-pass membrane protein': 'S4', 'nucleoli fibrillar center': 'S5', 'mitochondria': 'S6', 'nucleoplasm': 'S7', 'cell projection, axon': 'S8', 'cell projection, dendrite': 'S9', 'cell junction, synapse, presynapse': 'S10', 'cell junction, synapse, postsynapse': 'S11', 'cell junction, synapse, postsynaptic density': 'S12', 'cell junction, synapse, synaptic cell membrane': 'S13', 'cytoplasmic vesicle, secretory vesicle, synaptic vesicle membrane': 'S14', 'cytoplasm, cytosol': 'S15', 'cytoplasmic vesicle, copii-coated vesicle membrane ; peripheral membrane protein ; cytoplasmic side': 'S16', 'endoplasmic reticulum membrane ; peripheral membrane protein ; cytoplasmic side': 'S17', 'vesicles': 'S18', 'microsome membrane ; single-pass type iii membrane protein': 'S19', 'endoplasmic reticulum membrane ; single-pass type iii membrane protein': 'S20', 'membrane ; multi-pass membrane p

In [79]:
len(prosubedge)

52241

In [75]:
unidf

Unnamed: 0,Uniprot,Gene,Name,Seq,Ensembl_gene,Ensembl_trans,Description,Subcell_loc,DB_entries,Bio_path,Tx_approaches
A0A075B6U7,A0A075B6U7,TRAJ23,T cell receptor alpha joining 23 (Fragment),XIYNQGGKLIFGQGTELSVKP,[ENSG00000211866],[[ENST00000390514]],[],[],"[[{'id': '12052', 'source': 'HGNC'}]]",[],"[[{'modality': 'SM', 'id': 'Approved Drug', 'v..."
A0A2R8YED5,A0A2R8YED5,OR5BS1P,Olfactory receptor,MEVSNMTTVTVFILLGLSNNPQVQALLFVLFLVIYLLTLLGNLLMV...,[ENSG00000198678],[[ENST00000328207]],[],[],"[[{'id': '19627', 'source': 'HGNC'}]]",[],"[[{'modality': 'SM', 'id': 'Approved Drug', 'v..."
A2RRL7,A2RRL7,TMEM213,Transmembrane protein 213,MQRLPAATRATLILSLAFASLHSACSAEASSSNSSSLTAHHPDPGT...,[ENSG00000214128],"[[ENST00000458494, ENST00000442682, ENST000004...",[[]],[[membrane ; single-pass type i membrane prote...,"[[{'id': '27220', 'source': 'HGNC'}, {'id': 'I...",[],"[[{'modality': 'SM', 'id': 'Approved Drug', 'v..."
A6NHS1,A6NHS1,YK042,Putative uncharacterized protein ENSP00000347057,MVLLAGTRPQGGEARCMIPPPPSPLLGAQVEEDRTEFKEFQDFSSL...,[],[],[],[],[],[],[]
A6NL46,A6NL46,YF016,Putative UPF0607 protein ENSP00000332738,MRLCLIPWNTTPHRVLPPVVWSAPSRKKPVLSARNSMMFGHLSPVR...,[],[],[],[],[],[],[]
...,...,...,...,...,...,...,...,...,...,...,...
Q9UPU3,Q9UPU3,SORCS3,VPS10 domain-containing receptor SorCS3,MEAARTERPAGRPGAPLVRTGLLLLSTWVLAGAEITWDATGGPGRP...,[ENSG00000156395],"[[ENST00000393176, ENST00000369701]]",[[]],[[membrane; single-pass type i membrane protein]],"[[{'id': '16699', 'source': 'HGNC'}, {'id': 'I...",[],"[[{'modality': 'SM', 'id': 'Approved Drug', 'v..."
Q9Y2Z9,Q9Y2Z9,COQ6,"Ubiquinone biosynthesis monooxygenase COQ6, mi...",MAARLVSRCGAVRAAPHSGPLVSWRRWSGASTDTVYDVVVSGGGLV...,[ENSG00000119723],"[[ENST00000553922, ENST00000557780, ENST000003...",[[FAD-dependent monooxygenase required for the...,[[mitochondrion inner membrane ; peripheral me...,"[[{'id': '20233', 'source': 'HGNC'}, {'id': 'R...","[[R-HSA-2142789, Ubiquinol biosynthesis]]","[[{'modality': 'SM', 'id': 'Approved Drug', 'v..."
Q9Y328,Q9Y328,NSG2,Neuronal vesicle trafficking-associated protein 2,MVKLNSNPSEKGTKPPSVEDGFQTVPLITPLEVNHLQLPAPEKVIV...,[ENSG00000170091],"[[ENST00000517902, ENST00000517587, ENST000005...",[[]],[[membrane ; single- pass type ii membrane pro...,"[[{'id': '24955', 'source': 'HGNC'}, {'id': 'I...",[],"[[{'modality': 'SM', 'id': 'Approved Drug', 'v..."
Q9Y5S2,Q9Y5S2,CDC42BPB,Serine/threonine-protein kinase MRCK beta,MSAKVRLKKLEQLLLDGPWRNESALSVETLLDVLVCLYTECSHSAL...,[ENSG00000198752],"[[ENST00000559790, ENST00000558321, ENST000005...",[[Serine/threonine-protein kinase which is an ...,"[[cytoplasm, cell membrane ; peripheral membra...","[[{'id': '1738', 'source': 'HGNC'}, {'id': '3Q...","[[R-HSA-9013409, RHOJ GTPase cycle], [R-HSA-90...","[[{'modality': 'SM', 'id': 'Approved Drug', 'v..."
