In [1]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:80% !important; }</style>"))

# Import neo4j DB: 3/4

Code to translate the PIS model spreadsheet (currently v2.7.9_PIS-model.xlsx, sheet "Reactions") to a neo4j database.

## Setup

In [2]:
import pandas as pd
import re
import numpy as np
import os
from IPython.display import Image

In [3]:
import helpers

In [4]:
from importlib import reload

In [5]:
from pathlib import Path

base_path = Path("..")
input_path = base_path / "data" / "raw"
output_path = base_path / "data" / "parsed"

Connect to graph via docker-compose link. See http://localhost:7474/browser/

In [6]:
from py2neo import Graph, Node, Relationship

In [7]:
graph = Graph(host="neo4j")

## Read in sheets

In [8]:
sheets = [#(file, sheet_name)
    ("v2.7.9_PIS-model.xlsx", "Reactions"),
#     ("v2.7.5_PIS-model.xlsx", "Reactions"),
#     ("v2.7.5_PIS-model.xlsx", "Reactions_New"), 
#     ("Model_CK.xlsx", "Reactions_new"), 
#     ("v2.7.2_PIS-model-JALR.xlsx", "Reactions_New")
]

In [9]:
# resave xlsx as tsv
drops = ['FOXMES', 'Legacy:Process', 'Legacy:ReactionMode', "Comment"]#, 'ConnID']
col_rename = {
    'Status':'Status',
    'AddedBy':'AddedBy',
    'Species':'Species',
    'ID':'input1_ID',
    'level':'input1_level',
    'localisation':'input1_localisation',
    'type':'input1_type',
    'ID.1':'input2_ID',
    'level.1':'input2_level',
    'localisation.1':'input2_localisation',
    'type.1':'input2_type',
    'ID.2':'input3_ID',
    'level.2':'input3_level',
    'localisation.2':'input3_localisation',
    'type.2':'input3_type',
    'ReactionEffect':'ReactionEffect',
    'ReactionMode':'ReactionMode',
    'Modifications':'Modifications',
    'ID.3':'output1_ID',
    'level.3':'output1_level',
    'localisation.3':'output1_localisation',
    'type.3':'output1_type',
    'TrustLevel':'TrustLevel',
    'Literature':'Literature',
    'AdditionalInfo':'AdditionalInfo',
    'Comment':'Comment',
    'Model-v':'ModelV',
    'KINETICS':'kinetics', 
    'ConnID': 'ConnID'
}

# The parsed-output directory may be missing on a fresh checkout; `to_csv` does not create it.
output_path.mkdir(parents=True, exist_ok=True)

for file_name, sheet_name in sheets:

    print(file_name, sheet_name)

    file_path = input_path / file_name

    base_name, extension = os.path.splitext(file_name)
    new_file_path = output_path / f'{base_name}-{sheet_name}.tsv'

    #if os.path.exists(new_file_path):
    #    continue

    # Every cell is read as text; the strings listed in helpers.empty_strings
    # are treated as missing values.
    df = pd.read_excel(file_path,
                    sheet_name=sheet_name,
                    header=[1],
                    dtype=str,
                    na_values=helpers.empty_strings)

    #df = df[~df["AddedBy"].isna()]
    #if 'Status' in df.columns:
    #    df = df[df['Status'].isin(["forCB", "forCB_INVENTED", np.nan])]

    # Drop the legacy/comment columns that are present, plus pandas' "Unnamed: N" filler columns.
    to_drop = list(set(drops) & set(df.columns)) + list(df.filter(regex=("Unnamed.*")).columns)
    df = df.drop(columns=to_drop)

    # Fail with the offending column names rather than a bare KeyError from the lookup.
    unmapped = [c for c in df.columns if c not in col_rename]
    if unmapped:
        raise KeyError(
            f"Columns of {file_name} [{sheet_name}] missing from col_rename: {unmapped}"
        )
    df.columns = [col_rename[c] for c in df.columns]

    # Provenance tag: <Status>-<workbook>-<sheet>. Rows without a Status get a missing origin.
    df['origin'] = df['Status'] + f'-{base_name}-{sheet_name}'

    df.to_csv(new_file_path, sep="\t", index=None)

v2.7.9_PIS-model.xlsx Reactions


In [10]:
dfs = []

for file_name, sheet_name in sheets:

    print(file_name, sheet_name)
    base_name, extension = os.path.splitext(file_name)

    file_path = output_path / f'{base_name}-{sheet_name}.tsv'

    # dtype=str keeps every column textual, matching how the sheet was read from Excel.
    # Without it pandas re-infers dtypes, and an empty or numeric-looking column
    # breaks the `.str.strip()` pass that is applied to every column afterwards.
    df = pd.read_csv(file_path, sep="\t", dtype=str)

    dfs.append(df)

v2.7.9_PIS-model.xlsx Reactions


In [11]:
df_edges = pd.concat(dfs, sort=False)
df_edges.reset_index(drop=True, inplace=True)

In [12]:
df_edges.head()

Unnamed: 0,Status,AddedBy,ConnID,Species,input1_ID,input1_level,input1_localisation,input1_type,input2_ID,input2_level,...,Modifications,output1_ID,output1_level,output1_localisation,output1_type,TrustLevel,Literature,AdditionalInfo,ModelV,origin
0,forCB,KG,Conn001,ath,L-Met,family,ER,metabolite,SAMS,clade,...,,SAMe,family,ER,metabolite,"[R1] targetted experiments (e.g. Y2H, BIFC)",AraCyc:ETHYL-PWY,SAMS catalyse L-Met to SAMe reaction.,v1.0,forCB-v2.7.9_PIS-model-Reactions
1,forCB,KG,Conn002,ath,SAMe,family,ER,metabolite,ACS,family,...,,ACC,family,ER,metabolite,"[R1] targetted experiments (e.g. Y2H, BIFC)",AraCyc:ETHYL-PWY,ACS catalyse ACC to SAMe reaction.,v1.0,forCB-v2.7.9_PIS-model-Reactions
2,forCB,KG,Conn003,ath,ACC,family,ER,metabolite,ACO,family,...,,ET,family,ER,metabolite,"[R1] targetted experiments (e.g. Y2H, BIFC)",AraCyc:ETHYL-PWY,ACO catalyse ACC to ET reaction.,v1.0,forCB-v2.7.9_PIS-model-Reactions
3,forCB,KG,Conn004,ath,Cu2+,family,cytoplasm,metabolite,HMA,family,...,translocation,Cu2+,family,ER,metabolite,"[R1] targetted experiments (e.g. Y2H, BIFC)",DOI:10.1105/tpc.001768 (Ethylene Biosynthesis ...,Copper gets transported from the cytoplasm to ...,v1.0,forCB-v2.7.9_PIS-model-Reactions
4,forCB,KG,Conn005,ath,ETR,family,ER,protein,Cu2+,family,...,,ETR,family,ER,protein [active],"[R1] targetted experiments (e.g. Y2H, BIFC)",DOI:10.1105/tpc.001768 (Ethylene Biosynthesis ...,Copper activates the membrane bound ethylene r...,v1.0,forCB-v2.7.9_PIS-model-Reactions


In [13]:
df_edges['ReactionMode'].value_counts()

binding / oligomerisation                     104
catalysis / auto-catalysis                     79
transcriptional / translational induction      60
protein activation                             52
degradation / secretion                        32
cleavage / auto-cleavage                       29
transcriptional / translational repression     19
translocation                                   5
protein deactivation                            4
dissociation                                    1
Name: ReactionMode, dtype: int64

In [14]:
# Trim surrounding whitespace in every text column.
# Numeric columns are skipped: the `.str` accessor raises AttributeError on them
# (e.g. an all-empty column that pandas inferred as float64).
for c in df_edges.columns:
    if pd.api.types.is_numeric_dtype(df_edges[c]):
        continue
    df_edges[c] = df_edges[c].str.strip()

In [15]:
x = df_edges[df_edges['AddedBy']=='x'].index
print(x)
#df_edges.drop(x, inplace=True)

Int64Index([], dtype='int64')


In [16]:
for x in df_edges["TrustLevel"].unique():
    print(x)

[R1] targetted experiments (e.g. Y2H, BIFC)
[Ry] invented reaction
[R4] indirect reaction
[Rx] incomplete/unspecific reaction
[R2] high-throughput experiment (e.g. ChIP-seq)
[R3] in-silico prediction


In [17]:
def _trust_level_code(label):
    """Return the short trust code embedded in a TrustLevel label.

    The code is one of R1, R2, R3, R4, Rx, Ry or the word "undefined".

    Raises:
        ValueError: if `label` is not a string or contains no trust code. The
            message names the offending value (previously this surfaced as an
            AttributeError on `None.groups()`).
    """
    # [1-4xy] instead of [1|2|3|4|x|y]: inside a character class `|` is a literal
    # pipe, so the old pattern also accepted the bogus code "R|".
    match = re.search(r"(R[1-4xy]|undefined)", label) if isinstance(label, str) else None
    if match is None:
        raise ValueError(f"Cannot extract a trust level from TrustLevel value {label!r}")
    return match.group(1)


df_edges['trust_level'] = df_edges["TrustLevel"].apply(_trust_level_code)
#df_edges['observed_species'] = df_edges['Species'].apply(helpers.lower_string)
#df_edges['species_also_observed_in'] = df_edges["Species"].apply(helpers.rest_of_items)
#df_edges['Comment'] = df_edges['Comment'].fillna("")
df_edges['AdditionalInfo'] = df_edges['AdditionalInfo'].fillna("")

In [18]:
df_edges['AddedBy'] = df_edges['AddedBy'].apply(lambda x: x.upper())
df_edges["AddedBy"].unique()

array(['KG', 'MZ', 'ZR', 'MPE', 'ACR', 'MAK', 'ST', 'SB', 'JALR', 'AG'],
      dtype=object)

In [19]:
df_edges.loc[df_edges['ModelV'].isna(), 'ModelV'] = 'vNA'
df_edges['ModelV'].unique()

array(['v1.0', 'v2.5', 'v2.7', 'v2.6', 'vNA'], dtype=object)

In [20]:
# See here for keys https://www.utf8-chartable.de/unicode-utf8-table.pl 

def only_asci(x):
    """Return `x` with every non-ASCII character dropped."""
    return "".join(filter(str.isascii, x))

def find_non_ascii(x):
    """Report whether the string form of `x` contains non-ASCII characters.

    Each offending character is printed with its code point and UTF-8 bytes,
    followed by the full text. Returns True if any were found, else False.
    """
    text = str(x)
    offenders = [ch for ch in text if not ch.isascii()]
    for ch in offenders:
        print('    ', ch, ord(ch), ch.encode())
    if not offenders:
        return False
    print(text)
    return True

# UTF-8 byte sequence -> ASCII replacement, applied one after another by `replacer`.
ascii_replacers = {
    b'\xc2\xa0'         : b' ',            # U+00A0 no-break space
    b'\xe2\x80\xa6'     : b'...',          # …
    b'\xe2\x80\x8b'     : b'',             # U+200B zero width space (invisible)
    b'\xe2\x80\x93'     : b'-',            # –
    
    b'\xce\xb1'         : b'&alpha;',      # α
    b'\xc3\x9f'         : b'&beta;',       # ß (sharp s; presumably typed as a look-alike for beta - TODO confirm)
    b'\xce\xb2'         : b'&beta;',       # β
    
    # some "prime" symbols...
    b'\xe2\x80\x98'     : b'&prime;',      # ‘ Left Single Quotation Mark
    b'\xe2\x80\x99'     : b'&prime;',      # ’ Right Single Quotation Mark
    b'\xc2\xb4'         : b'&prime;',      # ´ Acute Accent
    # actual prime
    b'\xe2\x80\xb2'     : b'&prime;',      # ′ Prime
    
    # Sorry accents :(
    b'\xc5\xa0'         : b'S',            # Š
    b'\xc5\xa1'         : b's',            # š
    b'\xc5\xbd'         : b'Z',            # Ž
    b'\xc4\x8d'         : b'c'             # č
}

def replacer(x, verbose=False):
    """Replace the characters listed in `ascii_replacers` with ASCII equivalents.

    Args:
        x: the value to clean. Anything that is not a string (NaN, None, ...)
            is returned unchanged.
        verbose: when True, print "'old' : 'new'" for every value that changed.

    Returns:
        The cleaned string with leading/trailing whitespace stripped, or `x`
        itself when it is not a string.
    """
    # Was `type(x) == float`, which let None and other non-string values fall
    # through to `.encode` and crash.
    if not isinstance(x, str):
        return x
    y = x.encode('utf-8')
    for old, new in ascii_replacers.items():
        y = y.replace(old, new)
    y = y.decode('utf-8')
    
    # The comparison happens before stripping, so whitespace-only changes are not reported.
    if verbose and ( y != x):
        print(f"'{x}' : '{y}'")
    
    return y.strip()


In [21]:
bad_cols = []
for c in df_edges.columns:
    print(c, "\n-------------")
    if any(df_edges[c].apply(find_non_ascii)):
        bad_cols.append(c)
    print()

Status 
-------------

AddedBy 
-------------

ConnID 
-------------

Species 
-------------

input1_ID 
-------------
     β 946 b'\xce\xb2'
all-trans-β-carotene
     β 946 b'\xce\xb2'
9-cis-β-carotene
     ′ 8242 b'\xe2\x80\xb2'
     β 946 b'\xce\xb2'
9-cis-10′-apo-β-carotenal

input1_level 
-------------

input1_localisation 
-------------

input1_type 
-------------

input2_ID 
-------------

input2_level 
-------------

input2_localisation 
-------------

input2_type 
-------------

input3_ID 
-------------

input3_level 
-------------

input3_localisation 
-------------

input3_type 
-------------

ReactionEffect 
-------------

ReactionMode 
-------------

Modifications 
-------------

output1_ID 
-------------
     β 946 b'\xce\xb2'
9-cis-β-carotene
     ′ 8242 b'\xe2\x80\xb2'
     β 946 b'\xce\xb2'
9-cis-10′-apo-β-carotenal

output1_level 
-------------

output1_localisation 
-------------

output1_type 
-------------

TrustLevel 
-------------

Literature 
-------------
     

In [22]:
for c in bad_cols:
    print(c, "\n-------------")
    df_edges[c] = df_edges[c].apply(replacer, verbose=True)
    print()

input1_ID 
-------------
'all-trans-β-carotene' : 'all-trans-&beta;-carotene'
'9-cis-β-carotene' : '9-cis-&beta;-carotene'
'9-cis-10′-apo-β-carotenal' : '9-cis-10&prime;-apo-&beta;-carotenal'

output1_ID 
-------------
'9-cis-β-carotene' : '9-cis-&beta;-carotene'
'9-cis-10′-apo-β-carotenal' : '9-cis-10&prime;-apo-&beta;-carotenal'

Literature 
-------------
'DOI:10.1073/pnas.0605528103 (ETHYLENE-INSENSITIVE5 encodes…) | DOI:10.​1104/​pp.​107.​104299 (RTE1 Is a Golgi-Associated and ETR1-Dependent…)' : 'DOI:10.1073/pnas.0605528103 (ETHYLENE-INSENSITIVE5 encodes...) | DOI:10.1104/pp.107.104299 (RTE1 Is a Golgi-Associated and ETR1-Dependent...)'
'DOI:10.1073/pnas.0605528103 (ETHYLENE-INSENSITIVE5 encodes…) | DOI:10.​1104/​pp.​107.​104299 (RTE1 Is a Golgi-Associated and ETR1-Dependent…)' : 'DOI:10.1073/pnas.0605528103 (ETHYLENE-INSENSITIVE5 encodes...) | DOI:10.1104/pp.107.104299 (RTE1 Is a Golgi-Associated and ETR1-Dependent...)'
'DOI:10.1073/pnas.0605528103 (ETHYLENE-INSENSITIVE5 encodes…

In [23]:
for c in bad_cols:
    print(c, "\n-------------")
    if any(df_edges[c].apply(find_non_ascii)):
        print()

input1_ID 
-------------
output1_ID 
-------------
Literature 
-------------
AdditionalInfo 
-------------


In [24]:
# also need to look for ' (single quote) and " (double quote),
# which are used instead of prime (may cause string issues?)

# replacing 5' with 5&prime; and 3' with 3&prime;
def find_quotes(x):
    """Return True (and print the text) if the string form of `x` contains ' or "."""
    text = str(x)
    has_quote = any(mark in text for mark in ('"', "'"))
    if has_quote:
        print(text)
    return has_quote
        

def quote_replacer(x, verbose=False):
    """Rewrite 5' and 3' as 5&prime; and 3&prime;.

    A float input (i.e. a missing value) is treated as the empty string.
    With `verbose`, changed values are printed as "'old' : 'new'".
    """
    original = "" if type(x) == float else x

    replaced = original
    for quoted, primed in (("5'", "5&prime;"), ("3'", "3&prime;")):
        replaced = replaced.replace(quoted, primed)

    if verbose and replaced != original:
        print(f"'{original}' : '{replaced}'")

    return replaced



In [25]:
bad_cols = []
for c in df_edges.columns:
    print(c, "\n-------------")
    if any(df_edges[c].apply(find_quotes)):
        bad_cols.append(c)
    print()

Status 
-------------

AddedBy 
-------------

ConnID 
-------------

Species 
-------------

input1_ID 
-------------

input1_level 
-------------

input1_localisation 
-------------

input1_type 
-------------

input2_ID 
-------------

input2_level 
-------------

input2_localisation 
-------------

input2_type 
-------------

input3_ID 
-------------

input3_level 
-------------

input3_localisation 
-------------

input3_type 
-------------

ReactionEffect 
-------------

ReactionMode 
-------------

Modifications 
-------------

output1_ID 
-------------

output1_level 
-------------

output1_localisation 
-------------

output1_type 
-------------

TrustLevel 
-------------

Literature 
-------------

AdditionalInfo 
-------------
actually binding - acts as inhibition as it outcompetes bdingin to promotor, can be extended to include binding to HISTONE DEACETYLASE 6 (HDA6) 'interacts with JAZs and EIN3/EIL1 as a co-repressor'
RTM proteins block the long distance transport of plan

In [26]:
for c in bad_cols:
    print(c, "\n-------------")
    df_edges[c] = df_edges[c].apply(quote_replacer, verbose=True)
    print()

AdditionalInfo 
-------------
'Isopentenyladenosine-5'-triphosphate + H2O <=> Isopentenyladenosine-5'-diphosphate + Orthophosphate; https://www.genome.jp/entry/R08061;' : 'Isopentenyladenosine-5&prime;-triphosphate + H2O <=> Isopentenyladenosine-5&prime;-diphosphate + Orthophosphate; https://www.genome.jp/entry/R08061;'
'Isopentenyladenosine-5'-diphosphate + H2O <=> N6-(delta2-Isopentenyl)-adenosine 5'-monophosphate + Orthophosphate; https://www.genome.jp/entry/R08062;' : 'Isopentenyladenosine-5&prime;-diphosphate + H2O <=> N6-(delta2-Isopentenyl)-adenosine 5&prime;-monophosphate + Orthophosphate; https://www.genome.jp/entry/R08062;'



In [27]:
for c in bad_cols:
    print(c, "\n-------------")
    if any(df_edges[c].apply(find_quotes)):
        print()

AdditionalInfo 
-------------
actually binding - acts as inhibition as it outcompetes bdingin to promotor, can be extended to include binding to HISTONE DEACETYLASE 6 (HDA6) 'interacts with JAZs and EIN3/EIL1 as a co-repressor'
RTM proteins block the long distance transport of plant viruses. They don't directly interact with CP, but the process of RTM resistance involves at least 5 different proteins.
RTM proteins block the long distance transport of plant viruses. They don't directly interact with CP, but the process of RTM resistance involves at least 5 different proteins.
...Upon pathogen challenge, the defense repressor AtSR1 is degradedthrough the SR1IP1-CUL3-mediated ubiquitin/proteasome pathway torelieve the repression of EDS1 transcription exerted by Ca2+/CaM signaling...These results indicatethat SR1IP1-CUL3 E3/proteasome-mediated degradation of AtSR1 is a critical mechanism to relieve the suppression of immune responses exerted by Ca2+/CaM/AtSR1. '|'...The negative regulation

In [28]:
# Assign the filled column back rather than calling fillna(inplace=True) on the
# column selection: under pandas Copy-on-Write the in-place call only changes a
# temporary object and leaves df_edges untouched.
df_edges["Literature"] = df_edges["Literature"].fillna("")

In [29]:
# format literature sources

# EC number preceded by "ec", "ec:" or "ec " and followed by whitespace, "]", ",", "."
# or end of text, e.g. "ec:3.3.3.-". Meant to be applied to lower-cased text.
# Raw string: the pattern is unchanged, but escapes such as \: and \d are no
# longer invalid string escape sequences.
re_ec = r"ec(?:\:|\s)?(\d+(?:\.(?:\-|\d+)){1,3}(?:\.n\d+)?)(?:\s|$|\]|,|\.)"


def doi_list(x):
    """Return every DOI mentioned in `x` as lower-case "doi:<id>" strings.

    `x` is lower-cased first. A DOI is the text that follows "doi:" or "doi/"
    up to the next whitespace (or the end of the text); trailing dots are
    removed. Returns an empty list when nothing matches.
    """
    # re.findall always returns a list (never None), so no None branch is needed.
    matches = re.findall(r"(?:doi)(?:\:|\/)\s*(.+?)(?:\s|$)", x.lower())
    return ["doi:" + m.rstrip('.') for m in matches]

def pubmed_list(x):
    """Return every PubMed ID mentioned in `x` as lower-case "pmid:<id>" strings.

    `x` is lower-cased first. An ID is the text that follows "pmid:" up to the
    next whitespace (or the end of the text); trailing dots are removed.
    Returns an empty list when nothing matches.
    """
    # re.findall always returns a list (never None), so no None branch is needed.
    matches = re.findall(r"(?:pmid)\:\s*(.+?)(?:\s|$)", x.lower())
    return ["pmid:" + m.rstrip('.') for m in matches]


def format_literature(row):
    """Collect the database references of one reaction row as a comma-joined string.

    Args:
        row: mapping/Series with the string fields 'Literature' and
            'AdditionalInfo', plus 'ConnID' (used only in diagnostics).

    Returns:
        The unique "<db>:<id>" references, sorted and joined with ",".
        All text is lower-cased before matching, so the identifiers are
        lower-case. The string is empty when nothing was recognised.

    Side effects:
        Prints one diagnostic line per problem found in 'Literature'
        (BLANK, UNKOWN DB, NOVALUE, BADorMISSING), each tagged with the ConnID.
    """
    issued = False
    dbs_list = []

    # ---- 'Literature': whole-text patterns first ----
    x = row['Literature'].lower().strip()
    doi_match = re.findall(r"(?:doi)(?:\:|\/)\s*(.+?)(?:\s|$)", x)
    dbs_list += [f"doi:{idf.strip().rstrip('.')}" for idf in doi_match]

    pubmed_match = re.findall(r"pmid(?:\:|)\s*(\d+)", x)
    dbs_list += [f"pmid:{idf}" for idf in pubmed_match]

    pubmedc_match = re.findall(r"pmcid(?:\:|)\s*(pmc\d+)", x)
    dbs_list += [f"pmcid:{idf}" for idf in pubmedc_match]

    # ---- 'Literature': then each "|"-separated entry ----
    for key in x.split("|"):
        if key =="":
            print(f"BLANK\\{row['ConnID']}")
            issued = True

        elif ":" in key:
            if "aracyc" in key:
                aracyc_string = "aracyc:" + key.split(":")[1].strip()
                dbs_list.append(aracyc_string)
            elif "kegg" in key:
                kegg_string = "kegg:" + key.split(":")[1].strip()
                dbs_list.append(kegg_string)
            elif "doi" in key:
                # already fetched
                continue
            elif ("pmcid" in key) or ("pmid" in key):
                # already fetched
                continue
            else:
                print(f"UNKOWN DB\\{row['ConnID']}\\{key}")
                issued = True
        elif "invented" in key:
            dbs_list.append("invented:reason")
        else:
            print(f"NOVALUE\\{row['ConnID']}\\{key}")
            issued = True

    # Nothing recognised and nothing reported yet: flag the row once.
    if (len(dbs_list)==0) and not issued:
        print(f"BADorMISSING\\{row['ConnID']}\\{x}")

    # ---- 'AdditionalInfo': free text, pattern matching only ----
    x = row['AdditionalInfo']
    x = x.lower()

    # EC:3.3.3.-
    ec_match = re.findall(re_ec, x)
    dbs_list += [f"ec:{idf}" for idf in ec_match]

    pubmed_match = re.findall(r"pmid(?:\:|)\s*(\d+)", x)
    dbs_list += [f"pmid:{idf}" for idf in pubmed_match]

    pubmedc_match = re.findall(r"pmcid(?:\:|)\s*(pmc\d+)", x)
    dbs_list += [f"pmcid:{idf}" for idf in pubmedc_match]

    doi_match = re.findall(r"doi(?:\:|\/)\s*(.+?)(?:\s|$)", x)
    dbs_list += [f"doi:{idf.strip().rstrip('.')}" for idf in doi_match]

    kegg_match = re.findall(r"((?:k|map|ko|ec|rn)\d{5})", x)
    dbs_list += [f"kegg:{idf}" for idf in kegg_match]

    # The text was lower-cased above, so the pattern must be lower-case as well;
    # the previous "NCBI ID: (.+)" could never match.
    ncbi_nuccore_match = re.findall(r"ncbi id: (.+)", x)
    dbs_list += [f"ncbi_nuccore:{idf}" for idf in ncbi_nuccore_match]

    # sorted(): set iteration order varies between runs, which made the column
    # differ from one execution to the next.
    return ','.join(sorted(set(dbs_list)))

In [30]:
df_edges['external_links'] = df_edges.apply(format_literature, axis=1)

BLANK\Conn039
BLANK\Conn115
NOVALUE\Conn116\embo j., 20 (2001), pp. 5400-5411 direct interaction between the arabidopsis disease resistance signaling proteins, eds1 and pad4 
BLANK\Conn117
NOVALUE\Conn118\embo j., 20 (2001), pp. 5400-5411 direct interaction between the arabidopsis disease resistance signaling proteins, eds1 and pad4 
BLANK\Conn119
NOVALUE\Conn120\embo j., 20 (2001), pp. 5400-5411 direct interaction between the arabidopsis disease resistance signaling proteins, eds1 and pad4 
BLANK\Conn178
BLANK\Conn179
NOVALUE\Conn194\? kg need to find reference
NOVALUE\Conn267\ 10.1073/pnas.98.4.2065
BLANK\Conn309


In [31]:
df_edges[df_edges['external_links']==''][['ConnID', 'origin', 'Literature', 'AdditionalInfo']]

Unnamed: 0,ConnID,origin,Literature,AdditionalInfo
38,Conn039,forCB-v2.7.9_PIS-model-Reactions,,
114,Conn115,forCB-v2.7.9_PIS-model-Reactions,,This is viral PTI.
116,Conn117,forCB-v2.7.9_PIS-model-Reactions,,
118,Conn119,forCB-v2.7.9_PIS-model-Reactions,,"RNA silencing inhibits virus, part of PTI."
177,Conn178,forCB-v2.7.9_PIS-model-Reactions,,"have to check which one, not sure that this mp..."
178,Conn179,forCB-v2.7.9_PIS-model-Reactions,,"have to check which one, not sure that this mp..."
193,Conn194,forCB-v2.7.9_PIS-model-Reactions,? Kg need to find reference,
308,Conn309,forCB-v2.7.9_PIS-model-Reactions,,


In [32]:
df_edges[["ConnID",  "external_links", "Literature", "AdditionalInfo"]].to_csv(output_path / "reactions-lit-check.tsv", sep="\t", index=None)

In [33]:
df_edges.reset_index(inplace=True, drop=True)

In [34]:
save_df = df_edges.copy()
#df_edges = save_df.copy()

In [35]:
df_edges.head()

Unnamed: 0,Status,AddedBy,ConnID,Species,input1_ID,input1_level,input1_localisation,input1_type,input2_ID,input2_level,...,output1_level,output1_localisation,output1_type,TrustLevel,Literature,AdditionalInfo,ModelV,origin,trust_level,external_links
0,forCB,KG,Conn001,ath,L-Met,family,ER,metabolite,SAMS,clade,...,family,ER,metabolite,"[R1] targetted experiments (e.g. Y2H, BIFC)",AraCyc:ETHYL-PWY,SAMS catalyse L-Met to SAMe reaction.,v1.0,forCB-v2.7.9_PIS-model-Reactions,R1,aracyc:ethyl-pwy
1,forCB,KG,Conn002,ath,SAMe,family,ER,metabolite,ACS,family,...,family,ER,metabolite,"[R1] targetted experiments (e.g. Y2H, BIFC)",AraCyc:ETHYL-PWY,ACS catalyse ACC to SAMe reaction.,v1.0,forCB-v2.7.9_PIS-model-Reactions,R1,aracyc:ethyl-pwy
2,forCB,KG,Conn003,ath,ACC,family,ER,metabolite,ACO,family,...,family,ER,metabolite,"[R1] targetted experiments (e.g. Y2H, BIFC)",AraCyc:ETHYL-PWY,ACO catalyse ACC to ET reaction.,v1.0,forCB-v2.7.9_PIS-model-Reactions,R1,aracyc:ethyl-pwy
3,forCB,KG,Conn004,ath,Cu2+,family,cytoplasm,metabolite,HMA,family,...,family,ER,metabolite,"[R1] targetted experiments (e.g. Y2H, BIFC)",DOI:10.1105/tpc.001768 (Ethylene Biosynthesis ...,Copper gets transported from the cytoplasm to ...,v1.0,forCB-v2.7.9_PIS-model-Reactions,R1,doi:10.1105/tpc.001768
4,forCB,KG,Conn005,ath,ETR,family,ER,protein,Cu2+,family,...,family,ER,protein [active],"[R1] targetted experiments (e.g. Y2H, BIFC)",DOI:10.1105/tpc.001768 (Ethylene Biosynthesis ...,Copper activates the membrane bound ethylene r...,v1.0,forCB-v2.7.9_PIS-model-Reactions,R1,doi:10.1105/tpc.001768


In [36]:
for x in ['input1', 'input2', 'input3', 'output1']:
    df_edges.loc[:, x + "_ID"] = df_edges[x + "_ID"].apply(helpers.reorder_ids)

In [37]:
# Spreadsheet node type -> node form used in the graph.
node_type_to_node_form_dict = {
    "gene":"gene",
    
    "protein":"protein",
    "protein [inactivated]":"protein", 
    "protein [activated]":"protein_active",
    'protein [active]': "protein_active",
    'protein_active': 'protein_active',
    
    "ncRNA":"ncRNA",
    "plant_ncRNA":"ncRNA",
    'ta-siRNA':"ta-siRNA", 
        
    "complex":"complex", 
    "plant_complex":"complex",
    'complex [active]': "complex_active",
    'complex_active': "complex_active",
    
    "metabolite":"metabolite",
    
    "process":"process", 
    "process_active":"process_active", 
    'process [active]':"process_active",

    np.nan:"", 
}

# Node type -> form to use when the node ID itself is flagged as active.
# The bracketed spellings accepted by node_type_to_node_form_dict are listed here
# too; without them an active-flagged ID with such a type raised KeyError.
node_type_to_active_form_dict = {
    "protein [active]":"protein_active",
    "protein [activated]":"protein_active",
    "protein":"protein_active",
    "protein_active":"protein_active",
    
    "complex":"complex_active", 
    "complex [active]":"complex_active",
    "complex_active": "complex_active",
    
    "process":"process_active", 
    "process [active]":"process_active",
    'process_active':"process_active",
}

# Node type -> inactive counterpart of the form.
node_type_to_inactive_form_dict = {
    "protein":"protein",
    "protein_active":"protein",
    "protein [inactivated]":"protein",
    
    "complex":"complex", 
    "complex_active": "complex",
    
    "process":"process", 
    'process_active':"process",
}



def get_node_form(row, prefix="input1"):
    """Return the node form for one (ID, type) pair.

    Args:
        row: Series whose first element is the node ID and whose second
            element is the node type.
        prefix: not used by the function; kept so existing calls still work.

    Returns:
        np.nan when the ID is missing (a float), otherwise the form looked up
        from the type. If the ID contains "(a)" or "[active]", the active form
        is used instead and the decision is printed.

    Raises:
        KeyError: if the type is not a key of the lookup table being used.
    """
    id_ = row.iloc[0]
    # isinstance also covers NumPy float subclasses, unlike `type(id_) == float`.
    if isinstance(id_, float):
        return np.nan
    
    type_ = row.iloc[1]
    form = node_type_to_node_form_dict[type_]
    # Plain substring tests replace the non-raw regex literals "\(a\)" and
    # "\[active\]", which contain invalid string escape sequences.
    if ("(a)" in id_) or ("[active]" in id_):
        form = node_type_to_active_form_dict[type_]
        print(id_, type_, "-->", form)
        
    return form

for prefix in ['input1', 'input2', 'input3', 'output1']:
    id_col, type_col, new_form_col  =\
        [prefix + x for x in ('_ID',  '_type',  '_form')]
    
    #x = df_edges[[id_col, type_col]].dropna(how='all')
    
    print(prefix)
    df_edges[new_form_col] = df_edges[[id_col, type_col]].apply(get_node_form, prefix=prefix, axis=1)

input1
ARR-A(a)(p) protein --> protein_active
ARR-B(a)(p) protein --> protein_active
ARR-B(a)(p) protein --> protein_active
input2
AHK2,3,4(a) protein [active] --> protein_active
AHK2,3,4(a) protein [active] --> protein_active
ARR-B(a)(p) protein [active] --> protein_active
AHP1,2,3,4,5(a) protein [active] --> protein_active
AHP1,2,3,4,5(a) protein [active] --> protein_active
input3
output1
AHK2,3,4(a) protein [active] --> protein_active
AHK2,3,4(a) protein [active] --> protein_active
AHK2,3,4(a) protein [active] --> protein_active
AHK2,3,4(a) protein [active] --> protein_active
AHP1,2,3,4,5(a) protein [active] --> protein_active
ARR-A(a)(p) protein --> protein_active
ARR-B(a)(p) protein --> protein_active


In [38]:
def remove_brackets(x):
    """Remove the "(a)" and "(p)" markers from a node ID.

    Args:
        x: node ID string, or a float (missing value).

    Returns:
        np.nan for a float input; otherwise the ID without any "(a)" / "(p)"
        substrings. Changed IDs are printed as "<old> <new>".
    """
    # Builtin float instead of np.float: the alias was deprecated in NumPy 1.20
    # and removed in 1.24, where `np.float` raises AttributeError.
    if isinstance(x, float):
        return np.nan

    s = x.replace("(a)", "").replace("(p)", "")
    if s != x:
        print(x, s)
    return s

In [39]:
for x in ['input1', 'input2', 'input3', 'output1']:
    df_edges.loc[:, x + "_ID"] = df_edges[x + "_ID"].apply(remove_brackets)

ARR-A(a)(p) ARR-A
ARR-B(a)(p) ARR-B
ARR-B(a)(p) ARR-B
AHK2,3,4(a) AHK2,3,4
AHK2,3,4(a) AHK2,3,4
ARR-B(a)(p) ARR-B
AHP1,2,3,4,5(a) AHP1,2,3,4,5
AHP1,2,3,4,5(a) AHP1,2,3,4,5
AHK2,3,4(a) AHK2,3,4
AHK2,3,4(a) AHK2,3,4
AHK2,3,4(a) AHK2,3,4
AHK2,3,4(a) AHK2,3,4
AHP1,2,3,4,5(a) AHP1,2,3,4,5
ARR-A(a)(p) ARR-A
ARR-B(a)(p) ARR-B


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  if type(x) == np.float:


In [40]:
df_edges['species'] = df_edges['Species'].apply(lambda x: ",".join(x.lower().split('/')))

In [41]:
df_edges['species'].unique()

array(['ath', 'ath,nbe', 'ath,stu', 'all', 'ath,nta', 'stu', 'osa',
       'ath,osa', 'ath,osa,psa', 'ath,osa,sly,zma', 'ath,osa,phy'],
      dtype=object)

In [42]:
node_localisation_dict = {
    'nuc':'nucleus',
    'er':'endoplasmic reticulum',
    'golgi':'golgi apparatus', 
    'mitochondria?': 'putative:mitochondrion', 
    'cytoplasm?': 'putative:cytoplasm', 
}


good_localisations = set([
     'nucleus',
     'nucleolus',
     'cytoplasm',
     'vacuole',
     'endoplasmic reticulum',
     'chloroplast',
     'mitochondrion',
     'golgi apparatus',
     'peroxisome',
     'apoplast',
     'extracellular'
])

good_localisations.update(['putative:' + s for s in good_localisations])


def node_localisation_std(x):
    """Map a raw localisation label to its standard name.

    Non-string input yields "putative:cytoplasm". Strings are lower-cased,
    translated through node_localisation_dict when listed there, and returned
    if they belong to good_localisations; anything else is printed and
    replaced by "".
    """
    if type(x) != str:
        return "putative:cytoplasm"

    label = x.lower()
    standard = node_localisation_dict.get(label, label)

    if standard not in good_localisations:
        print(standard)
        return ""
    return standard

# Collect the raw localisation labels seen in the sheet (node_localisations) and the
# standardised labels written to the new "<prefix>_location" columns (new_localisation).
node_localisations = set()
new_localisation = set()
for prefix in ['input1', 'input2', 'input3', 'output1']:
    # type_col is built alongside the other column names but is not used below.
    id_col, type_col, localisation_col, location_col  =\
        [prefix + x for x in ('_ID',  '_type',  '_localisation', '_location')]
    
    # Rows where the ID and the localisation are both missing are ignored.
    x = df_edges[['ConnID', 'origin', id_col, localisation_col]].dropna(how='all', subset=[ id_col, localisation_col])
    for _, y in x.iterrows():
        # Report nodes that have an ID but an empty localisation.
        if (not (y[id_col] in helpers.empty_strings)) and (y[localisation_col] in helpers.empty_strings + ['']):
            print(y['origin'], "|", y['ConnID'], "|", prefix, "|", y[id_col], "|", y[localisation_col])
    
    
    node_localisations.update(x[localisation_col])
    
    #print(prefix)
    # Standardise over the full frame (not the filtered view), so every row gets a value.
    df_edges[location_col] = df_edges[localisation_col].apply(node_localisation_std)
    
    new_localisation.update(df_edges[location_col])

In [43]:
node_localisations

{'ER',
 'Golgi',
 'chloroplast',
 'cytoplasm',
 'cytoplasm?',
 'extracellular',
 'mitochondria?',
 'nucleus',
 'peroxisome',
 'vacuole'}

In [44]:
new_localisation

{'chloroplast',
 'cytoplasm',
 'endoplasmic reticulum',
 'extracellular',
 'golgi apparatus',
 'nucleus',
 'peroxisome',
 'putative:cytoplasm',
 'putative:mitochondrion',
 'vacuole'}

In [45]:
good_localisations

{'apoplast',
 'chloroplast',
 'cytoplasm',
 'endoplasmic reticulum',
 'extracellular',
 'golgi apparatus',
 'mitochondrion',
 'nucleolus',
 'nucleus',
 'peroxisome',
 'putative:apoplast',
 'putative:chloroplast',
 'putative:cytoplasm',
 'putative:endoplasmic reticulum',
 'putative:extracellular',
 'putative:golgi apparatus',
 'putative:mitochondrion',
 'putative:nucleolus',
 'putative:nucleus',
 'putative:peroxisome',
 'putative:vacuole',
 'vacuole'}

In [46]:
print('reaction_mode_dict = {')
for s in df_edges['ReactionMode'].unique():
    print(f"\t'{s}':'{'/'.join([x.lower().strip() for x in str(s).split('/')])}',")
print('}')

reaction_mode_dict = {
	'catalysis / auto-catalysis':'catalysis/auto-catalysis',
	'translocation':'translocation',
	'protein activation':'protein activation',
	'binding / oligomerisation':'binding/oligomerisation',
	'protein deactivation':'protein deactivation',
	'transcriptional / translational repression':'transcriptional/translational repression',
	'transcriptional / translational induction':'transcriptional/translational induction',
	'degradation / secretion':'degradation/secretion',
	'nan':'nan',
	'dissociation':'dissociation',
	'cleavage / auto-cleavage':'cleavage/auto-cleavage',
}


In [47]:
# # Old version before edit
# reaction_mode_dict = {
# 	'catalysis / auto-catalysis':'catalysis/auto-catalysis',
# 	'translocation':'translocation',
# 	'protein activation':'protein activation',
# 	'binding / oligomerisation':'binding/oligomerisation',
# 	'protein deactivation':'protein deactivation',
# 	'transcription/translation repression':'transcription/translation repression',
# 	'transcription/translation induction':'transcription/translation induction',
# 	'degradation / secretion':'degradation/secretion',
# 	'dissociation':'dissociation',
# 	'translation':'translation',
# 	'transcription':'transcription',
# 	'by binding':'by binding',
# 	'nan':'nan',
# 	'cleavage / auto-cleavage':'cleavage/auto-cleavage',
# 	'binding/oligomerisation':'binding/oligomerisation',
# 	'protein phosphorylation':'protein phosphorylation',
# }

# Old version after edit
# reaction_mode_dict = {
# 	'catalysis / auto-catalysis':'catalysis/auto-catalysis',
# 	'translocation':'translocation',
# 	'protein activation':'protein activation',
# 	'binding / oligomerisation':'binding/oligomerisation',
# 	'protein deactivation':'protein deactivation',
# 	'transcription/translation repression':'transcription/translation repression',
# 	'transcription/translation induction':'transcription/translation induction',
# 	'degradation / secretion':'degradation/secretion',
# 	'dissociation':'dissociation',
# 	'translation':'bad',
# 	'transcription':'bad',
# 	'by binding':'bad',
# 	np.nan:'bad',
# 	'cleavage / auto-cleavage':'bad',
# 	'binding/oligomerisation':'binding/oligomerisation',
# 	'protein phosphorylation':'bad',
# }

reaction_mode_dict = {
	'catalysis / auto-catalysis':'catalysis',
	'translocation':'translocation',
	'protein activation':'protein activation',
	'binding / oligomerisation':'binding/oligomerisation',
	'protein deactivation':'protein deactivation',
	'transcriptional / translational repression':'transcriptional/translational repression',
	'transcriptional / translational induction':'transcriptional/translational activation',
	'degradation / secretion':'degradation/secretion',
	np.nan:'undefined',
	'dissociation':'dissociation',
	'cleavage / auto-cleavage':'cleavage/auto-cleavage',
}

In [48]:
df_edges['reaction_type'] = df_edges['ReactionMode'].apply(lambda x: reaction_mode_dict[x])

In [49]:
df_edges[df_edges['reaction_type']=='undefined'][['origin', 'ConnID', 'ReactionMode', 'ReactionEffect', 'reaction_type']]

Unnamed: 0,origin,ConnID,ReactionMode,ReactionEffect,reaction_type
31,_TBD-v2.7.9_PIS-model-Reactions,Conn032,,inhibition,undefined
60,_TBD-v2.7.9_PIS-model-Reactions,Conn061,,inhibition,undefined
87,_TBD-v2.7.9_PIS-model-Reactions,Conn088,,inhibition,undefined
88,_TBD-v2.7.9_PIS-model-Reactions,Conn089,,inhibition,undefined
91,_TBD-v2.7.9_PIS-model-Reactions,Conn092,,activation,undefined
120,_TBD-v2.7.9_PIS-model-Reactions,Conn121,,inhibition,undefined
150,_TBD-v2.7.9_PIS-model-Reactions,Conn151,,inhibition,undefined
209,_TBD-v2.7.9_PIS-model-Reactions,Conn210,,inhibition,undefined
310,_TBD-v2.7.9_PIS-model-Reactions,Conn311,,inhibition,undefined


In [50]:
df_edges.to_csv("parsed_reactions.tsv", sep="\t")

# END