# Import neo4j DB: 3/4

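Code to translate the `Reactions` sheet of the PIS-model workbook (currently `v2.7.6_PIS-model.xlsx`, as configured in the `sheets` list below) to the neo4j database. 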

## Setup

In [1]:
import pandas as pd
import re
import numpy as np
import os
from IPython.display import Image

In [2]:
import helpers

In [3]:
from importlib import reload

In [4]:
from pathlib import Path

base_path = Path("..")
input_path = base_path / "data" / "raw"
output_path = base_path / "data" / "parsed"

Connect to graph via docker-compose link. See http://localhost:7474/browser/

In [5]:
from py2neo import Graph, Node, Relationship

In [6]:
graph = Graph(host="neo4j")

In [7]:
node_labels = helpers.node_labels

In [8]:
reaction_relationships = [
    'ACTIVATE',
    'INHIBIT',
    'PRODUCT',
    'SUBSTRATE',
    'TRANSLOCATE_FROM',
    'TRANSLOCATE_TO'
]

## Read in sheets

In [9]:
sheets = [#(file, sheet_name)
    ("v2.7.6_PIS-model.xlsx", "Reactions"),
#     ("v2.7.5_PIS-model.xlsx", "Reactions"),
#     ("v2.7.5_PIS-model.xlsx", "Reactions_New"), 
#     ("Model_CK.xlsx", "Reactions_new"), 
#     ("v2.7.2_PIS-model-JALR.xlsx", "Reactions_New")
]

In [10]:
# Re-save every configured Excel sheet as a TSV file with normalised column names.

# Columns that are discarded outright whenever a sheet contains them.
drops = ['FOXMES', 'Legacy:Process', 'Legacy:ReactionMode', "Comment"]

# Mapping from sheet header to the column name used in the rest of the notebook.
# Most headers are kept verbatim ...
col_rename = {
    header: header
    for header in (
        'Status', 'AddedBy', 'Species', 'ReactionEffect', 'ReactionMode',
        'Modifications', 'TrustLevel', 'Literature', 'AdditionalInfo',
        'Comment', 'ConnID',
    )
}
# ... two are renamed ...
col_rename.update({'Model-v': 'ModelV', 'KINETICS': 'kinetics'})
# ... and the repeated ID/level/localisation/type groups (which pandas
# disambiguates with ".1", ".2", ".3" suffixes) get a slot prefix instead.
for suffix, slot in (('', 'input1'), ('.1', 'input2'), ('.2', 'input3'), ('.3', 'output1')):
    for field in ('ID', 'level', 'localisation', 'type'):
        col_rename[f'{field}{suffix}'] = f'{slot}_{field}'

for file_name, sheet_name in sheets:

    print(file_name, sheet_name)

    base_name = os.path.splitext(file_name)[0]
    source_xlsx = input_path / file_name
    new_file_path = output_path / f'{base_name}-{sheet_name}.tsv'

    df = pd.read_excel(
        source_xlsx,
        sheet_name=sheet_name,
        header=[1],
        dtype=str,
        na_values=helpers.empty_strings,
    )

    # Remove the explicitly listed columns plus any header-less ("Unnamed...") ones.
    listed_cols = set(drops) & set(df.columns)
    unnamed_cols = df.filter(regex="Unnamed.*").columns
    df = df.drop(columns=[*listed_cols, *unnamed_cols])

    # A header missing from col_rename raises KeyError here, which flags
    # unexpected columns in the workbook.
    df.columns = [col_rename[header] for header in df.columns]

    # Provenance tag: "<Status>-<workbook>-<sheet>".
    df['origin'] = df['Status'] + f'-{base_name}-{sheet_name}'

    df.to_csv(new_file_path, sep="\t", index=None)

v2.7.6_PIS-model.xlsx Reactions


In [11]:
# Read back the TSV written for every configured (workbook, sheet) pair.
dfs = []

for workbook, sheet in sheets:
    print(workbook, sheet)
    stem = os.path.splitext(workbook)[0]
    dfs.append(pd.read_csv(output_path / f'{stem}-{sheet}.tsv', sep="\t"))

v2.7.6_PIS-model.xlsx Reactions


In [12]:
df_edges = pd.concat(dfs, sort=False)
df_edges.reset_index(drop=True, inplace=True)

In [13]:
df_edges.head()

Unnamed: 0,Status,AddedBy,ConnID,Species,input1_ID,input1_level,input1_localisation,input1_type,input2_ID,input2_level,...,Modifications,output1_ID,output1_level,output1_localisation,output1_type,TrustLevel,Literature,AdditionalInfo,ModelV,origin
0,forCB,KG,Conn001,ath,L-Met,family,ER,metabolite,SAMS,clade,...,,SAMe,family,ER,metabolite,"[R1] targetted experiments (e.g. Y2H, BIFC)",AraCyc:ETHYL-PWY,SAMS catalyse L-Met to SAMe reaction.,v1.0,forCB-v2.7.6_PIS-model-Reactions
1,forCB,KG,Conn002,ath,SAMe,family,ER,metabolite,ACS,family,...,,ACC,family,ER,metabolite,"[R1] targetted experiments (e.g. Y2H, BIFC)",AraCyc:ETHYL-PWY,ACS catalyse ACC to SAMe reaction.,v1.0,forCB-v2.7.6_PIS-model-Reactions
2,forCB,KG,Conn003,ath,ACC,family,ER,metabolite,ACO,family,...,,ET,family,ER,metabolite,"[R1] targetted experiments (e.g. Y2H, BIFC)",AraCyc:ETHYL-PWY,ACO catalyse ACC to ET reaction.,v1.0,forCB-v2.7.6_PIS-model-Reactions
3,forCB,KG,Conn004,ath,Cu2+,family,cytoplasm,metabolite,HMA,family,...,translocation,Cu2+,family,ER,metabolite,"[R1] targetted experiments (e.g. Y2H, BIFC)",DOI:10.1105/tpc.001768 (Ethylene Biosynthesis ...,Copper gets transported from the cytoplasm to ...,v1.0,forCB-v2.7.6_PIS-model-Reactions
4,forCB,KG,Conn005,ath,ETR,family,ER,protein,Cu2+,family,...,,ETR,family,ER,protein [active],"[R1] targetted experiments (e.g. Y2H, BIFC)",DOI:10.1105/tpc.001768 (Ethylene Biosynthesis ...,Copper activates the membrane bound ethylene r...,v1.0,forCB-v2.7.6_PIS-model-Reactions


In [14]:
x = df_edges[df_edges['AddedBy']=='x'].index
print(x)
#df_edges.drop(x, inplace=True)

Int64Index([], dtype='int64')


In [15]:
for x in df_edges["TrustLevel"].unique():
    print(x)

[R1] targetted experiments (e.g. Y2H, BIFC)
[Ry] invented reaction
[R4] indirect reaction
[Rx] incomplete/unspecific reaction
[R2] high-throughput experiment (e.g. ChIP-seq)
[R3] in-silico prediction


In [16]:
# Extract the short trust code (e.g. "R1") from labels such as
# "[R1] targetted experiments (e.g. Y2H, BIFC)".
# Inside [...] a "|" is a literal character, not alternation, so the class is
# written as [1-4xy]; the former "[1|2|3|4|x|y]" would also have accepted "R|".
trust_code_re = re.compile(r"(R[1-4xy]|undefined)")
df_edges['trust_level'] = df_edges["TrustLevel"].apply(lambda x: trust_code_re.search(x).group(1))
#df_edges['observed_species'] = df_edges['Species'].apply(helpers.lower_string)
#df_edges['species_also_observed_in'] = df_edges["Species"].apply(helpers.rest_of_items)
#df_edges['Comment'] = df_edges['Comment'].fillna("")
df_edges['AdditionalInfo'] = df_edges['AdditionalInfo'].fillna("")

In [17]:
df_edges['AddedBy'] = df_edges['AddedBy'].apply(lambda x: x.upper())
df_edges["AddedBy"].unique()

array(['KG', 'MZ', 'ZR', 'MPE', 'ACR', 'MAK', 'ST', 'SB', 'JALR', 'AG'],
      dtype=object)

In [18]:
df_edges.loc[df_edges['ModelV'].isna(), 'ModelV'] = 'vNA'
df_edges['ModelV'].unique()

array(['v1.0', 'v2.5', 'v2.7', 'v2.6'], dtype=object)

In [19]:
# See here for keys https://www.utf8-chartable.de/unicode-utf8-table.pl 

def only_asci(x):
    """Return ``x`` with every non-ASCII character dropped."""
    return "".join(filter(str.isascii, x))

def find_non_ascii(x):
    """Report whether ``str(x)`` contains any non-ASCII character.

    Every offending character is printed together with its code point and
    its UTF-8 bytes, followed by the full text.

    Returns:
        True if at least one non-ASCII character was found, otherwise False.
    """
    text = str(x)
    offenders = [ch for ch in text if not ch.isascii()]
    for ch in offenders:
        print(ch, ord(ch), ch.encode())
    if not offenders:
        return False
    print(text)
    return True

# UTF-8 byte sequences that are swapped for an ASCII stand-in by replacer().
ascii_replacers = {
    b'\xc2\xa0'         : b' ',            # no-break space (U+00A0)
    b'\xe2\x80\xa6'     : b'...',          # … horizontal ellipsis (U+2026)
    b'\xe2\x80\x8b'     : b'',             # zero-width space (U+200B)
    b'\xe2\x80\x93'     : b'-',            # – en dash (U+2013)
    b'\xce\xb1'         : b'&alpha;',       # α
    b'\xc3\x9f'         : b'&beta;',        # ß (sharp s, mapped to beta like β below)
    b'\xce\xb2'         : b'&beta;',        # β
    b'\xe2\x80\x98'     : b'&prime;',       # ‘ left single quotation mark
    b'\xe2\x80\x99'     : b'&prime;',       # ’ right single quotation mark
    b'\xc2\xb4'         : b'&prime;',       # ´ acute accent
    # Accented letters are reduced to their base letter.
    b'\xc5\xa0'         : b'S',            # Š
    b'\xc5\xa1'         : b's',            # š
    b'\xc5\xbd'         : b'Z',            # Ž
    b'\xc4\x8d'         : b'c'             # č
}


def replacer(x, verbose=False):
    """Replace the characters listed in ``ascii_replacers`` within ``x``.

    Args:
        x: Cell value. Anything that is not a ``str`` (NaN, None, numbers, ...)
            is returned unchanged instead of raising on ``.encode``.
        verbose: If true, print ``'<old>' : '<new>'`` whenever a replacement
            changed the text.

    Returns:
        The converted text with surrounding whitespace stripped, or ``x``
        itself when it is not a string.
    """
    if not isinstance(x, str):
        return x

    y = x.encode('utf-8')
    for old, new in ascii_replacers.items():
        y = y.replace(old, new)
    y = y.decode('utf-8')

    # Compared before stripping, so a pure whitespace trim is not reported.
    if verbose and (y != x):
        print(f"'{x}' : '{y}'")

    return y.strip()


In [20]:
# Collect every column that still holds at least one non-ASCII character.
bad_cols = []
for column in df_edges.columns:
    print(column, "\n-------------")
    flagged = df_edges[column].apply(find_non_ascii)
    if flagged.any():
        bad_cols.append(column)
    print()

Status 
-------------

AddedBy 
-------------

ConnID 
-------------

Species 
-------------

input1_ID 
-------------
´ 180 b'\xc2\xb4'
9-cis-b-apo-10´-carotenal

input1_level 
-------------

input1_localisation 
-------------

input1_type 
-------------

input2_ID 
-------------

input2_level 
-------------

input2_localisation 
-------------

input2_type 
-------------

input3_ID 
-------------

input3_level 
-------------

input3_localisation 
-------------

input3_type 
-------------

ReactionEffect 
-------------

ReactionMode 
-------------

Modifications 
-------------

output1_ID 
-------------
´ 180 b'\xc2\xb4'
9-cis-b-apo-10´-carotenal

output1_level 
-------------

output1_localisation 
-------------

output1_type 
-------------

TrustLevel 
-------------

Literature 
-------------
… 8230 b'\xe2\x80\xa6'
​ 8203 b'\xe2\x80\x8b'
​ 8203 b'\xe2\x80\x8b'
​ 8203 b'\xe2\x80\x8b'
​ 8203 b'\xe2\x80\x8b'
… 8230 b'\xe2\x80\xa6'
DOI:10.1073/pnas.0605528103 (ETHYLENE-INSENSITIVE5 encode

In [21]:
# Clean the flagged columns, echoing every value that changes.
for column in bad_cols:
    print(column, "\n-------------")
    df_edges[column] = df_edges[column].apply(lambda value: replacer(value, verbose=True))
    print()

input1_ID 
-------------
'9-cis-b-apo-10´-carotenal' : '9-cis-b-apo-10&prime;-carotenal'

output1_ID 
-------------
'9-cis-b-apo-10´-carotenal' : '9-cis-b-apo-10&prime;-carotenal'

Literature 
-------------
'DOI:10.1073/pnas.0605528103 (ETHYLENE-INSENSITIVE5 encodes…) | DOI:10.​1104/​pp.​107.​104299 (RTE1 Is a Golgi-Associated and ETR1-Dependent…)' : 'DOI:10.1073/pnas.0605528103 (ETHYLENE-INSENSITIVE5 encodes...) | DOI:10.1104/pp.107.104299 (RTE1 Is a Golgi-Associated and ETR1-Dependent...)'
'DOI:10.1073/pnas.0605528103 (ETHYLENE-INSENSITIVE5 encodes…) | DOI:10.​1104/​pp.​107.​104299 (RTE1 Is a Golgi-Associated and ETR1-Dependent…)' : 'DOI:10.1073/pnas.0605528103 (ETHYLENE-INSENSITIVE5 encodes...) | DOI:10.1104/pp.107.104299 (RTE1 Is a Golgi-Associated and ETR1-Dependent...)'
'DOI:10.1073/pnas.0605528103 (ETHYLENE-INSENSITIVE5 encodes…) | DOI:10.1073/pnas.1214848109 (CTR1 phosphorylates the central regulator EIN2...) | DOI:10.1093/mp/ssq036 (New insight in ethylene signaling...) | DOI:

In [22]:
# Re-scan the cleaned columns; anything printed here survived the replacement.
for column in bad_cols:
    print(column, "\n-------------")
    still_flagged = df_edges[column].apply(find_non_ascii)
    if still_flagged.any():
        print()

input1_ID 
-------------
output1_ID 
-------------
Literature 
-------------
AdditionalInfo 
-------------


In [23]:
# also need to look for ' (single quote) and " (double quote),
# which are used instead of the prime symbol (may cause string issues?)

# replacing 5' with 5&prime; and 3' with 3&prime;
def find_quotes(x):
    """Report whether ``str(x)`` contains a single or a double quote.

    A matching text is printed once. Returns True on a match, else False.
    """
    text = str(x)
    if '"' in text or "'" in text:
        print(text)
        return True
    return False
        

def quote_replacer(x, verbose=False):
    """Rewrite ``5'`` and ``3'`` as ``5&prime;`` and ``3&prime;``.

    A float input (i.e. a missing value) is treated as the empty string.
    With ``verbose`` set, each changed text is printed as ``'<old>' : '<new>'``.
    """
    text = "" if type(x) == float else x
    fixed = text.replace("5'", "5&prime;").replace("3'", "3&prime;")

    if verbose and fixed != text:
        print(f"'{text}' : '{fixed}'")

    return fixed



In [24]:
# Collect every column in which at least one value contains a quote character.
bad_cols = []
for column in df_edges.columns:
    print(column, "\n-------------")
    flagged = df_edges[column].apply(find_quotes)
    if flagged.any():
        bad_cols.append(column)
    print()

Status 
-------------

AddedBy 
-------------

ConnID 
-------------

Species 
-------------

input1_ID 
-------------

input1_level 
-------------

input1_localisation 
-------------

input1_type 
-------------

input2_ID 
-------------

input2_level 
-------------

input2_localisation 
-------------

input2_type 
-------------

input3_ID 
-------------

input3_level 
-------------

input3_localisation 
-------------

input3_type 
-------------

ReactionEffect 
-------------

ReactionMode 
-------------

Modifications 
-------------

output1_ID 
-------------

output1_level 
-------------

output1_localisation 
-------------

output1_type 
-------------

TrustLevel 
-------------

Literature 
-------------

AdditionalInfo 
-------------
actually binding - acts as inhibition as it outcompetes bdingin to promotor, can be extended to include binding to HISTONE DEACETYLASE 6 (HDA6) 'interacts with JAZs and EIN3/EIL1 as a co-repressor'
RTM proteins block the long distance transport of plan

In [25]:
# Convert 5'/3' to prime entities in the flagged columns, echoing each change.
for column in bad_cols:
    print(column, "\n-------------")
    df_edges[column] = df_edges[column].apply(lambda value: quote_replacer(value, verbose=True))
    print()

AdditionalInfo 
-------------



In [26]:
# Re-scan the flagged columns and list the values that still contain quotes.
for column in bad_cols:
    print(column, "\n-------------")
    still_flagged = df_edges[column].apply(find_quotes)
    if still_flagged.any():
        print()

AdditionalInfo 
-------------
actually binding - acts as inhibition as it outcompetes bdingin to promotor, can be extended to include binding to HISTONE DEACETYLASE 6 (HDA6) 'interacts with JAZs and EIN3/EIL1 as a co-repressor'
RTM proteins block the long distance transport of plant viruses. They don't directly interact with CP, but the process of RTM resistance involves at least 5 different proteins.
RTM proteins block the long distance transport of plant viruses. They don't directly interact with CP, but the process of RTM resistance involves at least 5 different proteins.
HC-Pro from PVY with Y2H, BiFC. Arabidopsis 20S proteasome subunits. Elena's gathered literature and PPIs (Y2H).
Elena's gathered literature and PPIs (Y2H).
Elena's gathered literature and PPIs (Y2H).
Elena's gathered literature and PPIs (Y2H).
Elena's gathered literature and PPIs (Y2H).
Elena's gathered literature and PPIs (Y2H).
Elena's gathered literature and PPIs (Y2H).
Elena's gathered literature and PPIs (Y2H

In [27]:
# Assign the filled column back explicitly: calling fillna(..., inplace=True) on
# a selected column is chained assignment and is not guaranteed to modify
# df_edges itself (it does not under pandas Copy-on-Write).
df_edges["Literature"] = df_edges["Literature"].fillna("")

In [28]:
# format literature sources

# EC numbers such as "EC:3.3.3.-"; the number must be followed by whitespace,
# the end of the text, "]", "," or ".".
re_ec = r"ec(?:\:|\s)?(\d+(?:\.(?:\-|\d+)){1,3}(?:\.n\d+)?)(?:\s|$|\]|,|\.)"

# All patterns below are applied to lower-cased text.
_RE_DOI = re.compile(r"doi(?:\:|\/)\s*(.+?)(?:\s|$)")
_RE_PMID_LOOSE = re.compile(r"pmid\:\s*(.+?)(?:\s|$)")
_RE_PMID = re.compile(r"pmid(?:\:|)\s*(\d+)")
_RE_PMCID = re.compile(r"pmcid(?:\:|)\s*(pmc\d+)")
_RE_KEGG = re.compile(r"((?:k|map|ko|ec|rn)\d{5})")
_RE_NCBI_NUCCORE = re.compile(r"ncbi id: (.+)")


def doi_list(x):
    """Return every DOI found in ``x`` as lower-case ``"doi:<id>"`` strings."""
    return ["doi:" + m.rstrip('.') for m in _RE_DOI.findall(x.lower())]


def pubmed_list(x):
    """Return every ``pmid:`` reference found in ``x`` as ``"pmid:<id>"`` strings."""
    return ["pmid:" + m.rstrip('.') for m in _RE_PMID_LOOSE.findall(x.lower())]


def _reference_links(text):
    """Collect the doi/pmid/pmcid links contained in lower-cased ``text``."""
    links = [f"doi:{idf.strip().rstrip('.')}" for idf in _RE_DOI.findall(text)]
    links += [f"pmid:{idf}" for idf in _RE_PMID.findall(text)]
    links += [f"pmcid:{idf}" for idf in _RE_PMCID.findall(text)]
    return links


def format_literature(row):
    """Build the ``external_links`` string for one reaction row.

    Links are gathered from ``row['Literature']`` (doi, pmid, pmcid, aracyc,
    kegg, "invented") and from ``row['AdditionalInfo']`` (ec, pmid, pmcid, doi,
    kegg identifiers, NCBI nuccore ids). All text is lower-cased first, so
    the emitted identifiers are lower-case as well.

    Each ``|``-separated Literature entry that cannot be interpreted is
    reported on stdout, prefixed with BLANK, UNKOWN DB or NOVALUE and the
    row's ``ConnID``; BADorMISSING is printed when nothing was found and no
    other message was issued.

    Args:
        row: Mapping / Series with the keys ``Literature``, ``AdditionalInfo``
            (both strings) and ``ConnID``.

    Returns:
        The de-duplicated links joined by ``,`` in sorted order, so the
        result is identical from run to run.
    """
    issued = False

    x = row['Literature'].lower().strip()
    dbs_list = _reference_links(x)

    for key in x.split("|"):
        if key == "":
            print(f"BLANK\\{row['ConnID']}")
            issued = True

        elif ":" in key:
            if "aracyc" in key:
                dbs_list.append("aracyc:" + key.split(":")[1].strip())
            elif "kegg" in key:
                dbs_list.append("kegg:" + key.split(":")[1].strip())
            elif ("doi" in key) or ("pmcid" in key) or ("pmid" in key):
                # already collected by _reference_links above
                continue
            else:
                print(f"UNKOWN DB\\{row['ConnID']}\\{key}")
                issued = True
        elif "invented" in key:
            dbs_list.append("invented:reason")
        else:
            print(f"NOVALUE\\{row['ConnID']}\\{key}")
            issued = True

    if (len(dbs_list) == 0) and not issued:
        print(f"BADorMISSING\\{row['ConnID']}\\{x}")

    ################
    info = row['AdditionalInfo'].lower()

    # EC:3.3.3.-
    dbs_list += [f"ec:{idf}" for idf in re.findall(re_ec, info)]
    dbs_list += _reference_links(info)
    dbs_list += [f"kegg:{idf}" for idf in _RE_KEGG.findall(info)]
    # The text is lower-cased, so the marker has to be matched in lower case;
    # the former upper-case "NCBI ID: " pattern could never match.
    dbs_list += [f"ncbi_nuccore:{idf}" for idf in _RE_NCBI_NUCCORE.findall(info)]

    return ','.join(sorted(set(dbs_list)))

In [29]:
df_edges['external_links'] = df_edges.apply(format_literature, axis=1)

BLANK\Conn039
BLANK\Conn115
NOVALUE\Conn116\embo j., 20 (2001), pp. 5400-5411 direct interaction between the arabidopsis disease resistance signaling proteins, eds1 and pad4
BLANK\Conn117
NOVALUE\Conn118\embo j., 20 (2001), pp. 5400-5411 direct interaction between the arabidopsis disease resistance signaling proteins, eds1 and pad4
BLANK\Conn119
NOVALUE\Conn120\embo j., 20 (2001), pp. 5400-5411 direct interaction between the arabidopsis disease resistance signaling proteins, eds1 and pad4
BLANK\Conn178
BLANK\Conn179
NOVALUE\Conn194\? kg need to find reference
NOVALUE\Conn264\kegg 
NOVALUE\Conn265\kegg 
NOVALUE\Conn266\kegg 
NOVALUE\Conn267\kegg 
NOVALUE\Conn267\ 10.1073/pnas.98.4.2065
NOVALUE\Conn268\kegg 
NOVALUE\Conn269\kegg 
NOVALUE\Conn270\kegg 
NOVALUE\Conn271\kegg 
NOVALUE\Conn272\kegg 
NOVALUE\Conn273\kegg 
NOVALUE\Conn274\kegg 
BLANK\Conn309


In [30]:
df_edges[["ConnID",  "external_links", "Literature", "AdditionalInfo"]].to_csv(output_path / "reactions-lit-check.tsv", sep="\t", index=None)

In [31]:
df_edges.reset_index(inplace=True, drop=True)

In [32]:
save_df = df_edges.copy()
#df_edges = save_df.copy()

In [33]:
df_edges.head()

Unnamed: 0,Status,AddedBy,ConnID,Species,input1_ID,input1_level,input1_localisation,input1_type,input2_ID,input2_level,...,output1_level,output1_localisation,output1_type,TrustLevel,Literature,AdditionalInfo,ModelV,origin,trust_level,external_links
0,forCB,KG,Conn001,ath,L-Met,family,ER,metabolite,SAMS,clade,...,family,ER,metabolite,"[R1] targetted experiments (e.g. Y2H, BIFC)",AraCyc:ETHYL-PWY,SAMS catalyse L-Met to SAMe reaction.,v1.0,forCB-v2.7.6_PIS-model-Reactions,R1,aracyc:ethyl-pwy
1,forCB,KG,Conn002,ath,SAMe,family,ER,metabolite,ACS,family,...,family,ER,metabolite,"[R1] targetted experiments (e.g. Y2H, BIFC)",AraCyc:ETHYL-PWY,ACS catalyse ACC to SAMe reaction.,v1.0,forCB-v2.7.6_PIS-model-Reactions,R1,aracyc:ethyl-pwy
2,forCB,KG,Conn003,ath,ACC,family,ER,metabolite,ACO,family,...,family,ER,metabolite,"[R1] targetted experiments (e.g. Y2H, BIFC)",AraCyc:ETHYL-PWY,ACO catalyse ACC to ET reaction.,v1.0,forCB-v2.7.6_PIS-model-Reactions,R1,aracyc:ethyl-pwy
3,forCB,KG,Conn004,ath,Cu2+,family,cytoplasm,metabolite,HMA,family,...,family,ER,metabolite,"[R1] targetted experiments (e.g. Y2H, BIFC)",DOI:10.1105/tpc.001768 (Ethylene Biosynthesis ...,Copper gets transported from the cytoplasm to ...,v1.0,forCB-v2.7.6_PIS-model-Reactions,R1,doi:10.1105/tpc.001768
4,forCB,KG,Conn005,ath,ETR,family,ER,protein,Cu2+,family,...,family,ER,protein [active],"[R1] targetted experiments (e.g. Y2H, BIFC)",DOI:10.1105/tpc.001768 (Ethylene Biosynthesis ...,Copper activates the membrane bound ethylene r...,v1.0,forCB-v2.7.6_PIS-model-Reactions,R1,doi:10.1105/tpc.001768


In [34]:
for x in ['input1', 'input2', 'input3', 'output1']:
    df_edges.loc[:, x + "_ID"] = df_edges[x + "_ID"].apply(helpers.reorder_ids)

In [39]:
# Raw "type" value from the sheet -> node form.
node_type_to_node_form_dict = {
    "gene": "gene",

    "protein": "protein",
    "protein [inactivated]": "protein",
    "protein [activated]": "protein_active",
    'protein [active]': "protein_active",

    "ncRNA": "ncRNA",
    "plant_ncRNA": "ncRNA",
    'ta-siRNA': "ta-siRNA",

    "complex": "complex",
    "plant_complex": "complex",
    'complex [active]': "complex_active",

    "metabolite": "metabolite",

    "process": "process",
    'process [active]': "process_active",

    np.nan: "",
}

# Node form -> its active counterpart.
node_type_to_active_form_dict = {
    "protein": "protein_active",
    "protein_active": "protein_active",

    "complex": "complex_active",
    "complex_active": "complex_active",

    "process": "process_active",
    'process_active': "process_active",
}

# Node form (or raw type) -> its inactive counterpart.
node_type_to_inactive_form_dict = {
    "protein": "protein",
    "protein_active": "protein",
    "protein [inactivated]": "protein",

    "complex": "complex",
    "complex_active": "complex",

    "process": "process",
    'process_active': "process",
}


def get_node_form(row, prefix="input1"):
    """Derive the node form from an ``(ID, type)`` row.

    Args:
        row: Series whose first element is the node ID and whose second
            element is the raw node type.
        prefix: Unused; kept so existing ``apply(..., prefix=...)`` calls work.

    Returns:
        ``np.nan`` when the ID is missing (a float), otherwise the form looked
        up in ``node_type_to_node_form_dict``. IDs carrying an ``(a)`` or
        ``[active]`` marker are promoted to the active form and printed.

    Raises:
        KeyError: for a type that is not listed, or for a marked ID whose
            form has no active counterpart.
    """
    id_ = row.iloc[0]
    # Missing IDs arrive as float NaN. ``np.float`` no longer exists in
    # current NumPy releases, so test against the builtin float instead.
    if isinstance(id_, float):
        return np.nan

    type_ = row.iloc[1]
    form = node_type_to_node_form_dict[type_]
    if re.search(r"\(a\)|\[active\]", id_):
        # The active table is keyed by node form, so look up the form rather
        # than the raw type (raw types such as "protein [active]" are not keys).
        form = node_type_to_active_form_dict[form]
        print(id_, type_, form)

    return form

# Add a "<slot>_form" column for every input/output slot.
for prefix in ('input1', 'input2', 'input3', 'output1'):
    print(prefix)
    id_and_type = df_edges[[f"{prefix}_ID", f"{prefix}_type"]]
    df_edges[f"{prefix}_form"] = id_and_type.apply(get_node_form, axis=1, prefix=prefix)

input1
ARR-B(a)(p) protein protein_active
input2
AHK2,3,4(a) protein protein_active
AHK2,3,4(a) protein protein_active
ARR-B(a)(p) protein protein_active
ARR-A(a)(p) protein protein_active
AHP1,2,3,4,5(a)(p) protein protein_active
AHP1,2,3,4,5(a)(p) protein protein_active
input3
output1
AHK2,3,4(a) protein protein_active
AHK2,3,4(a) protein protein_active
AHK2,3,4(a) protein protein_active
AHK2,3,4(a) protein protein_active
AHP1,2,3,4,5(a)(p) protein protein_active
ARR-A(a)(p) protein protein_active
ARR-A(a)(p) protein protein_active
ARR-B(a)(p) protein protein_active


In [40]:
def remove_brackets(x):
    """Strip the ``(a)`` and ``(p)`` markers from a node ID.

    Args:
        x: Node ID string, or a float NaN for an empty cell.

    Returns:
        ``np.nan`` for a missing (float) ID, otherwise the ID without the
        markers. Every ID that changes is printed as ``<old> <new>``.
    """
    # ``np.float`` no longer exists in current NumPy releases; missing IDs
    # are float NaN, so test against the builtin float.
    if isinstance(x, float):
        return np.nan

    s = x.replace("(a)", "").replace("(p)", "")
    if s != x:
        print(x, s)
    return s

In [41]:
for x in ['input1', 'input2', 'input3', 'output1']:
    df_edges.loc[:, x + "_ID"] = df_edges[x + "_ID"].apply(remove_brackets)

ARR-B(a)(p) ARR-B
AHK2,3,4(a) AHK2,3,4
AHK2,3,4(a) AHK2,3,4
ARR-B(a)(p) ARR-B
ARR-A(a)(p) ARR-A
AHP1,2,3,4,5(a)(p) AHP1,2,3,4,5
AHP1,2,3,4,5(a)(p) AHP1,2,3,4,5
AHK2,3,4(a) AHK2,3,4
AHK2,3,4(a) AHK2,3,4
AHK2,3,4(a) AHK2,3,4
AHK2,3,4(a) AHK2,3,4
AHP1,2,3,4,5(a)(p) AHP1,2,3,4,5
ARR-A(a)(p) ARR-A
ARR-A(a)(p) ARR-A
ARR-B(a)(p) ARR-B


In [42]:
df_edges['species'] = df_edges['Species'].apply(lambda x: ",".join(x.lower().split('/')))

In [43]:
df_edges['species'].unique()

array(['ath', 'ath,nbe', 'ath,stu', 'all', 'ath,nta', 'stu', 'osa',
       'ath,osa', 'ath,osa,psa', 'ath,osa,sly,zma', 'ath,osa,phy'],
      dtype=object)

## Protein, Clade, Family keys

In [44]:
# For every node label, fetch the set of distinct node names present in the graph.
node_dict = {}
for label in node_labels:
    records = graph.run(f"MATCH (n:{label}) RETURN DISTINCT n.name").data()
    names = {record['n.name'] for record in records}
    print(label, len(names))
    node_dict[label] = names

PlantCoding 162
PlantNonCoding 9
PlantAbstract 7
Complex 3
ExternalEntity 3
ExternalCoding 14
ExternalNonCoding 0
ExternalAbstract 0
Process 6
MetaboliteFamily 11
Metabolite 102
PseudoNode 0


In [45]:
file_name = os.path.join(output_path / "bio_elements.tsv")
df_bioelements = pd.read_csv(file_name, sep="\t")

In [46]:
df_bioelements.head()

Unnamed: 0,AddedBy,species,NodeLabel,NodeType,Family,Clade,NodeID,NodeName,external_links,NodeDescription,AdditionalInfo,Process,ModelV,ModelStatus,gmm_ocd,GMM_Description,GMM_ShortName,GMM_Synonyms
0,ZR,all,Complex,plant_complex,SCF,SCF,SCF,SCF,,SCF,,,v2.7,use,,,,
1,ZR,all,Complex,plant_complex,WD/bHLH/MYB,WD/bHLH/MYB,WD/bHLH/MYB,WD/bHLH/MYB,,WD/bHLH/MYB,,,v2.7,use,,,,
2,ZR,all,Complex,plant_complex,ribosome,ribosome,ribosome,ribosome,,ribosome,,,v2.6,use,,,,
3,KG,ath,PlantAbstract,plant_abstract,BA2H,BA2H,BA2H,BA2H,,BA2H,Benzoic acid 2-hydroxylase: Isolated from toba...,Hormone:SA,v1.0,ignore,,,,
4,KG,ath,PlantAbstract,plant_abstract,IPL,IPL,IPL,IPL,,Arabidopsis contains two ICS genes but has no ...,,Hormone:SA,v1.0,ignore,,,,


In [47]:
# Species codes that occur in the bio-elements table, without the
# catch-all entries 'plant_all' and 'all'.
all_species = [
    code
    for code in df_bioelements['species'].unique()
    if code not in ('plant_all', 'all')
]
all_species

['ath', 'osa', 'stu', 'sly']

In [48]:
def pick_the_set(x):
    """Return the first ``set`` found in the iterable ``x``.

    Returns an empty set when ``x`` contains no set. The previous fallback
    was ``{}``, which is an empty dict, so the return type was inconsistent.
    """
    for v in x:
        if isinstance(v, set):
            return v
    return set()


def get_species_homologues(level):
    """Summarise ``df_bioelements`` per value of the column ``level``.

    Args:
        level: Name of the ``df_bioelements`` column to group by (the notebook
            calls this with 'NodeName', 'Family' and 'Clade').

    Returns:
        A DataFrame with one row per ``level`` value (``level`` is a regular
        column again), the aggregated descriptive columns listed below and one
        ``<species>_homologues`` column per entry of ``all_species`` holding
        the set of NodeIDs for that species.

    Reads the module-level ``df_bioelements`` and ``all_species``.
    """
    # Set of NodeIDs for every (level, species) combination.
    df_level_species  = df_bioelements.groupby([level, 'species']).agg({
      'NodeID':lambda x: set(x), 
    })
    # Move 'species' (index level 1) back into a column; `level` stays the index.
    df_level_species.reset_index(1, inplace=True)

    cols = []
    for specie in all_species:
        col = specie + '_homologues'
        cols.append(col)
        # Only rows belonging to this species get a value in its column;
        # all other rows stay NaN there.
        df_level_species.loc[df_level_species['species'] == specie, col] =\
         df_level_species[df_level_species['species'] == specie]['NodeID']
    
    # Collapse to one row per level value: for every species column keep the
    # set found among the rows (pick_the_set supplies the fallback otherwise).
    df_level_species = df_level_species[cols].groupby(level).agg({
        y:pick_the_set for y in cols
    })
    
#     df_level = df_bioelements.groupby(level).agg({
#           'AddedBy':lambda x:list(x)[0], 
#           'NodeLabel':lambda x:list(x)[0], 
#           'NodeDescription':lambda x: ', '.join(list(set(x))), 
#           'AdditionalInfo':lambda x: helpers.list_to_string(x), 
#           'Process':lambda x:list(x)[0], 
#           'ModelV':helpers.get_latest_model, 
#           'Species':lambda x: set(x), 
#     })    

    # Descriptive columns per level value. Missing values are replaced by ''
    # first, so the string joins below never see NaN.
    # NOTE(review): the helpers.* aggregators are defined outside this
    # notebook; their exact behaviour is not visible here.
    df_level = df_bioelements.fillna('').groupby(level).agg({
          'AddedBy':lambda x:list(x)[0],    # first value of the group
          'NodeLabel':lambda x:list(x)[0],  # first value of the group
          'NodeDescription':lambda x: ', '.join(list(set(x))), 
          'AdditionalInfo':lambda x: helpers.list_to_string(x), 
          'Process':lambda x:list(x)[0],    # first value of the group
          'ModelV':helpers.get_latest_model, 
          'species':lambda x: ', '.join(list(set(x))),
          'ModelStatus':helpers.get_model_status,
          'external_links':lambda x: ', '.join([s for s in x if not s=='']),  # skip blanks
          'gmm_ocd':lambda x: ', '.join(list(set(x))),
          'GMM_Description':lambda x: ', '.join(list(set(x))),
          'GMM_ShortName':lambda x: ', '.join(list(set(x))),
          'GMM_Synonyms':lambda x: ', '.join(list(set(x))),
    })

    # Attach the per-species homologue sets (index-aligned on the level value).
    df_level = df_level.join(df_level_species[cols])
    df_level.reset_index(inplace=True)
    
    return df_level

In [49]:
id_to_name = df_bioelements[['NodeID', 'NodeName']]
id_to_name.duplicated().sum()

0

In [50]:
# these are multiple IDs to same NodeName (orthologues?)
id_to_name[id_to_name['NodeName'].duplicated(keep=False)].sort_values("NodeName")#['NodeName'].unique()

Unnamed: 0,NodeID,NodeName
212,AT4G02780,CPS
213,Sotub06g034690.1.1,CPS
214,Sotub08g006560.1.1,CPS.x1
215,Solyc06g084240.2.1,CPS.x1
216,Sotub08g020310.1.1,CPS.x2
217,Solyc08g005710.3.1,CPS.x2
354,Sotub06g023200.1.1,GA20ox.x1
355,Solyc06g050110.2.1,GA20ox.x1
356,Sotub09g017720.1.1,GA20ox.x2
357,Solyc09g009110.3.1,GA20ox.x2


In [51]:
# One "<species>_homologues" column per species of interest.
homologue_cols = [f"{species}_homologues" for species in all_species]

# Per-level summary tables, indexed by the level they were grouped on.
df_nodes = get_species_homologues('NodeName').set_index('NodeName')
df_families = get_species_homologues("Family").set_index("Family")
df_clades = get_species_homologues('Clade').set_index('Clade')

# node name to node IDs
node_ids_key = {col: df_nodes[col].to_dict() for col in homologue_cols}

# clade name to node IDs
clade_ids_key = {col: df_clades[col].to_dict() for col in homologue_cols}

# family name to node IDs
family_ids_key = {col: df_families[col].to_dict() for col in homologue_cols}

# node name / clade name to family name
node_to_family = df_bioelements.set_index("NodeName")["Family"].to_dict()
clade_to_family = df_bioelements.set_index("Clade")["Family"].to_dict()

In [52]:
def save_dict(d, file):
    """Write ``d`` to ``file`` as one ``key<TAB>value`` line per entry."""
    lines = (f"{key}\t{value}\n" for key, value in d.items())
    with open(file, "w") as out:
        out.writelines(lines)

In [53]:
save_dict(node_to_family, output_path / "node_to_family.tsv")
save_dict(clade_to_family, output_path / "clade_to_family.tsv")

In [54]:
def convert_node_to_family(x):
    """Resolve one reaction participant to a family-level ID and a node label.

    Args:
        x: Row whose ``.values`` unpack to ``(ID, type, level, ConnID, origin)``.

    Returns:
        ``(family_id, new_label)``. Both are ``np.nan`` when the ID is listed
        in ``helpers.empty_strings``; either may be ``None`` when it could not
        be resolved (a diagnostic line is printed in that case).

    Side effects:
        Appends to the module-level list ``complexes_to_add`` and updates the
        module-level sets ``missing_in_components`` and ``replace_w_family``.
        Reads ``node_dict``, ``node_labels``, ``clade_to_family`` and
        ``node_to_family``.
    """
    # pathogen proteins are listed as proteins, so cannot use dict
    id_, type_, level_, ConnID, origin = x.values
      
    # Empty slot (e.g. an unused input2/input3): nothing to resolve.
    if id_ in helpers.empty_strings:
        return np.nan, np.nan

    new_label = None
    family_id = None
    
    ########################
    # Simple Cases
    ########################
    # Complexes keep their own ID; ones not yet in the graph are recorded.
    if type_ in ['complex', 'complex [active]', 'complex [activated]', 'complex [inactive]', 'plant_complex']:
        if not (id_ in node_dict["Complex"]):
            complexes_to_add.append(id_)
        new_label = 'Complex'
        family_id = id_
    
    # Metabolites keep their own ID; the label is whichever metabolite label
    # lists the ID (Metabolite is checked before MetaboliteFamily).
    elif type_ in ["metabolite"]:
        for label in ["Metabolite", "MetaboliteFamily"]:
            if id_ in node_dict[label]:
                new_label = label
                break
        if not new_label:
            missing_in_components.update([id_])        
            print(origin, "|", ConnID, "|", id_, "|", type_, "|", level_, " |a (label) not a listed metabolite")
        family_id = id_

    # Processes are only accepted when already listed; otherwise both return
    # values stay None.
    elif type_ in ['process']:
        if (id_ in node_dict["Process"]):
            family_id = id_
            new_label = "Process"
        else:
            print(origin, "|", ConnID, "|", id_, "|", type_, "|", level_,  " |b (label) process not a listed process")
        
    else:
        ########################
        # family ID
        ########################
        # Map clade/node IDs up to their family. IDs missing from the lookup
        # tables are checked against the external node labels instead.
        check_external = False
        if level_ == "family":    
            family_id = id_
        elif level_ in ["clade", "clade/orthologue"]:
            try:
                family_id = clade_to_family[id_]
                replace_w_family.update([id_])
            except KeyError:
                check_external = True
        elif level_ == "node":
            try:
                family_id = node_to_family[id_]
                replace_w_family.update([id_])
            except KeyError:
                check_external = True
        
        if check_external:
            for label in ["ExternalEntity", "ExternalCoding", "ExternalNonCoding"]:
                if id_ in node_dict[label]:
                    new_label = label
                    family_id = id_
                    break
                    
        # Also reached for any level value other than the ones handled above.
        if not family_id:
            print(origin, "|", ConnID, "|", id_, "|", type_, "|", level_,  " |c (family id) could not convert to family/external")
                
        ########################
        # Label
        ########################
        # Only needed when the external check did not already set a label.
        if (family_id) and (not new_label):
            id_labels = [] #looping just in case an id occurs mutiple times
            for label in node_labels:
                if family_id in node_dict[label]:
                    id_labels.append(label)

            # Exactly one matching label is required; otherwise new_label stays None.
            if len(id_labels) == 1:
                new_label = id_labels[0]
            elif len(id_labels) > 1:
                print(origin, "|", ConnID, "|", id_, "|", type_, "|", level_,  " |d (label) many labels fit")
                missing_in_components.update([id_])       

            else:
                print(origin, "|", ConnID, "|", id_, "|", type_, "|", level_,  " |e (label) could not find label")
                missing_in_components.update([id_])        

    return family_id, new_label
        

In [55]:
# Accumulators filled as a side effect of convert_node_to_family.
complexes_to_add = []
missing_in_components = set()
replace_w_family = set()

# Resolve every input/output slot to its family-level ID and node label.
for prefix in ('input1', 'input2', 'input3', 'output1'):
    source_cols = [f"{prefix}_ID", f"{prefix}_type", f"{prefix}_level", 'ConnID', 'origin']
    target_cols = [f"{prefix}_newID", f"{prefix}_label"]
    df_edges[target_cols] = df_edges[source_cols].apply(
        convert_node_to_family, axis=1, result_type='expand'
    )


forCB-v2.7.6_PIS-model-Reactions | Conn264 | Geranylgeranyl-PP | metabolite | family  |a (label) not a listed metabolite
[TBD]-v2.7.6_PIS-model-Reactions | Conn314 | All-trans-b-carotene | metabolite | family  |a (label) not a listed metabolite
[TBD:cleavage-auto-cleavage]-v2.7.6_PIS-model-Reactions | Conn315 | 9-cis-b-carotene | metabolite | node  |a (label) not a listed metabolite
[TBD:cleavage-auto-cleavage]-v2.7.6_PIS-model-Reactions | Conn316 | 9-cis-b-apo-10&prime;-carotenal | metabolite | node  |a (label) not a listed metabolite
[TBD]-v2.7.6_PIS-model-Reactions | Conn317 | CL | metabolite | node  |a (label) not a listed metabolite
[TBD:cleavage-auto-cleavage]-v2.7.6_PIS-model-Reactions | Conn318 | SLs | metabolite | family  |a (label) not a listed metabolite
[TBD]-v2.7.6_PIS-model-Reactions | Conn319 | D14 | protein [activated] | family  |e (label) could not find label
[TBD]-v2.7.6_PIS-model-Reactions | Conn320 | D53 | protein [activated] | family  |e (label) could not find labe

In [56]:
df_edges.head()

Unnamed: 0,Status,AddedBy,ConnID,Species,input1_ID,input1_level,input1_localisation,input1_type,input2_ID,input2_level,...,output1_form,species,input1_newID,input1_label,input2_newID,input2_label,input3_newID,input3_label,output1_newID,output1_label
0,forCB,KG,Conn001,ath,L-Met,family,ER,metabolite,SAMS,clade,...,metabolite,ath,L-Met,Metabolite,SAM,PlantCoding,,,SAMe,Metabolite
1,forCB,KG,Conn002,ath,SAMe,family,ER,metabolite,ACS,family,...,metabolite,ath,SAMe,Metabolite,ACS,PlantCoding,,,ACC,Metabolite
2,forCB,KG,Conn003,ath,ACC,family,ER,metabolite,ACO,family,...,metabolite,ath,ACC,Metabolite,ACO,PlantCoding,,,ET,Metabolite
3,forCB,KG,Conn004,ath,Cu2+,family,cytoplasm,metabolite,HMA,family,...,metabolite,ath,Cu2+,Metabolite,HMA,PlantCoding,,,Cu2+,Metabolite
4,forCB,KG,Conn005,ath,ETR,family,ER,protein,Cu2+,family,...,protein_active,ath,ETR,PlantCoding,Cu2+,Metabolite,,,ETR,PlantCoding


In [57]:
def get_homologues(x, prefix=""):
    """Collect the species-specific homologues of one reaction participant.

    Parameters
    ----------
    x : row-like
        Object whose ``.values`` unpacks to exactly three items, in order:
        participant ID, level ('node', 'clade' or 'family') and node label.
    prefix : str
        Slot name (e.g. 'input1') prepended to every returned key.

    Returns
    -------
    dict
        One ``"{prefix}_{species}_homologues"`` entry for each member of the
        notebook-level ``all_species``. Each value is the result of
        ``helpers.list_to_string`` applied to the homologues found, or to an
        empty list when nothing was looked up.

    Notes
    -----
    Only participants labelled 'PlantCoding', 'PlantNonCoding' or
    'PlantAbstract' are looked up, in ``node_ids_key`` / ``clade_ids_key`` /
    ``family_ids_key`` depending on the level. An ID that is absent from the
    selected table is not handled here.
    """
    id_, level_, label_ = x.values

    # Every species column is always present; it stays empty unless filled below.
    homologues = {f"{prefix}_{specie}_homologues": "" for specie in all_species}

    if label_ in ('PlantCoding', 'PlantNonCoding', 'PlantAbstract'):
        for specie in all_species:
            species_key = f"{specie}_homologues"

            # The level decides which lookup table holds this ID.
            if level_ == 'node':
                lookup = node_ids_key
            elif level_ == 'clade':
                lookup = clade_ids_key
            elif level_ == 'family':
                lookup = family_ids_key
            else:
                # Any other level: leave the empty default in place.
                continue

            homologues[f"{prefix}_{species_key}"] = lookup[species_key][id_]

    return {
        column: helpers.list_to_string(list(found))
        for column, found in homologues.items()
    }

In [58]:
# Build one frame of homologue columns per reaction slot.
new_dfs = []
for prefix in ('input1', 'input2', 'input3', 'output1'):
    print(prefix)
    id_col = f"{prefix}_ID"
    level_col = f"{prefix}_level"
    new_label_col = f"{prefix}_label"

    # get_homologues returns a dict per row; 'expand' turns its keys into columns.
    new_df = df_edges[[id_col, level_col, new_label_col]].apply(
        get_homologues,
        axis=1,
        result_type='expand',
        prefix=prefix,
    )
    new_dfs.append(new_df)

input1
input2
input3
output1


In [59]:
# Place the per-slot homologue frames side by side in a single frame.
homologues_df = pd.concat(new_dfs, axis="columns", sort=False)
homologues_df.head()

Unnamed: 0,input1_ath_homologues,input1_osa_homologues,input1_sly_homologues,input1_stu_homologues,input2_ath_homologues,input2_osa_homologues,input2_sly_homologues,input2_stu_homologues,input3_ath_homologues,input3_osa_homologues,input3_sly_homologues,input3_stu_homologues,output1_ath_homologues,output1_osa_homologues,output1_sly_homologues,output1_stu_homologues
0,,,,,"AT2G36880,AT1G02500,AT4G01850,AT3G17390",,,,,,,,,,,
1,,,,,"AT4G37770,AT4G26200,AT5G51690,AT1G62960,AT1G01...",,,,,,,,,,,
2,,,,,"AT1G77330,AT1G05010,AT1G12010,AT1G62380,AT2G19590",,,,,,,,,,,
3,,,,,"AT4G33520,AT5G21930,AT1G63440,AT5G44790",,,,,,,,,,,
4,"AT2G40940,AT1G66340,AT3G23150,AT3G04580,AT1G04310",,,,,,,,,,,,"AT2G40940,AT1G66340,AT3G23150,AT3G04580,AT1G04310",,,


In [60]:
# Show every homologue column for the row labelled 0 (long cells are truncated in the head() table).
homologues_df.loc[0]

input1_ath_homologues                                            
input1_osa_homologues                                            
input1_sly_homologues                                            
input1_stu_homologues                                            
input2_ath_homologues     AT2G36880,AT1G02500,AT4G01850,AT3G17390
input2_osa_homologues                                            
input2_sly_homologues                                            
input2_stu_homologues                                            
input3_ath_homologues                                            
input3_osa_homologues                                            
input3_sly_homologues                                            
input3_stu_homologues                                            
output1_ath_homologues                                           
output1_osa_homologues                                           
output1_sly_homologues                                           
output1_st

In [61]:
# Attach the homologue columns to the edge table, aligned on the row index.
# Homologue columns left over from an earlier run of this cell are dropped
# first: DataFrame.join raises ValueError on overlapping column names, so
# without the drop this cell could only be executed once per kernel session.
df_edges = df_edges.drop(columns=homologues_df.columns, errors="ignore").join(
    homologues_df, sort=False
)

In [62]:
# Preview the first rows of the edge table, now including the *_homologues columns.
df_edges.head()

Unnamed: 0,Status,AddedBy,ConnID,Species,input1_ID,input1_level,input1_localisation,input1_type,input2_ID,input2_level,...,input2_sly_homologues,input2_stu_homologues,input3_ath_homologues,input3_osa_homologues,input3_sly_homologues,input3_stu_homologues,output1_ath_homologues,output1_osa_homologues,output1_sly_homologues,output1_stu_homologues
0,forCB,KG,Conn001,ath,L-Met,family,ER,metabolite,SAMS,clade,...,,,,,,,,,,
1,forCB,KG,Conn002,ath,SAMe,family,ER,metabolite,ACS,family,...,,,,,,,,,,
2,forCB,KG,Conn003,ath,ACC,family,ER,metabolite,ACO,family,...,,,,,,,,,,
3,forCB,KG,Conn004,ath,Cu2+,family,cytoplasm,metabolite,HMA,family,...,,,,,,,,,,
4,forCB,KG,Conn005,ath,ETR,family,ER,protein,Cu2+,family,...,,,,,,,"AT2G40940,AT1G66340,AT3G23150,AT3G04580,AT1G04310",,,


In [63]:
# Sheet spellings that differ from the standard compartment name.
# Keys are lower-case because values are lower-cased before the lookup.
node_localisation_dict = {
    'nuc': 'nucleus',
    'er': 'endoplasmic reticulum',
    'golgi': 'golgi apparatus',
    'mitochondria?': 'putative:mitochondrion',
    'cytoplasm?': 'putative:cytoplasm',
}


# Standard compartment names that are accepted as-is.
good_localisations = set([
     'nucleus',
     'nucleolus',
     'cytoplasm',
     'vacuole',
     'endoplasmic reticulum',
     'chloroplast',
     'mitochondrion',
     'golgi apparatus',
     'peroxisome',
     'apoplast',
     'extracellular'
])

# Every compartment is also accepted with a 'putative:' prefix.
good_localisations.update(['putative:' + s for s in good_localisations])


def node_localisation_std(x):
    """Standardise one raw localisation value.

    Parameters
    ----------
    x : object
        Raw cell value; anything that is not a string (e.g. NaN for an
        empty cell) is treated as missing.

    Returns
    -------
    str
        * ``"putative:cytoplasm"`` when ``x`` is not a string;
        * the standard name when the stripped, lower-cased value is either a
          key of ``node_localisation_dict`` or already a member of
          ``good_localisations``;
        * ``""`` otherwise, after printing the unrecognised value.
    """
    # isinstance (rather than an exact type comparison) so that str
    # subclasses such as numpy.str_ are not mistaken for missing values.
    if not isinstance(x, str):
        return "putative:cytoplasm"

    # Surrounding whitespace is ignored so that e.g. "ER " is still recognised.
    x = x.strip().lower()
    x = node_localisation_dict.get(x, x)

    if x in good_localisations:
        return x

    # Unrecognised value: print it for review and store an empty localisation.
    print(x)
    return ""

# Raw localisation values seen in the sheet, and the standardised values derived from them.
node_localisations = set()
new_localisation = set()
for prefix in ('input1', 'input2', 'input3', 'output1'):
    id_col, type_col, localisation_col, location_col = (
        f"{prefix}{suffix}"
        for suffix in ('_ID', '_type', '_localisation', '_location')
    )

    # Rows of this slot that carry an ID and/or a localisation.
    slot_rows = df_edges[['ConnID', 'origin', id_col, localisation_col]].dropna(
        subset=[id_col, localisation_col], how='all'
    )

    # Report participants that have an ID but an empty localisation.
    for _, row in slot_rows.iterrows():
        has_id = row[id_col] not in helpers.empty_strings
        if has_id and row[localisation_col] in helpers.empty_strings + ['']:
            print(row['origin'], "|", row['ConnID'], "|", prefix, "|", row[id_col], "|", row[localisation_col])

    node_localisations.update(slot_rows[localisation_col])

    # The standardised value goes into a separate '<prefix>_location' column;
    # the raw '<prefix>_localisation' column is left untouched.
    df_edges[location_col] = df_edges[localisation_col].apply(node_localisation_std)
    new_localisation.update(df_edges[location_col])

In [64]:
# Raw localisation values collected from all four reaction slots.
node_localisations

{'ER',
 'Golgi',
 'chloroplast',
 'cytoplasm',
 'cytoplasm?',
 'extracellular',
 'mitochondria?',
 'nucleus',
 'peroxisome',
 'vacuole'}

In [65]:
# Standardised values that were written to the *_location columns.
new_localisation

{'chloroplast',
 'cytoplasm',
 'endoplasmic reticulum',
 'extracellular',
 'golgi apparatus',
 'nucleus',
 'peroxisome',
 'putative:cytoplasm',
 'putative:mitochondrion',
 'vacuole'}

In [66]:
# Full set of accepted standardised names, including the 'putative:' variants.
good_localisations

{'apoplast',
 'chloroplast',
 'cytoplasm',
 'endoplasmic reticulum',
 'extracellular',
 'golgi apparatus',
 'mitochondrion',
 'nucleolus',
 'nucleus',
 'peroxisome',
 'putative:apoplast',
 'putative:chloroplast',
 'putative:cytoplasm',
 'putative:endoplasmic reticulum',
 'putative:extracellular',
 'putative:golgi apparatus',
 'putative:mitochondrion',
 'putative:nucleolus',
 'putative:nucleus',
 'putative:peroxisome',
 'putative:vacuole',
 'vacuole'}

In [67]:
# Print a ready-to-paste dict literal covering every ReactionMode value in the
# sheet; each suggested value is the mode lower-cased with the spaces around '/' removed.
print('reaction_mode_dict = {')
for mode in df_edges['ReactionMode'].unique():
    normalised = '/'.join(part.lower().strip() for part in str(mode).split('/'))
    print(f"\t'{mode}':'{normalised}',")
print('}')

reaction_mode_dict = {
	'catalysis / auto-catalysis':'catalysis/auto-catalysis',
	'translocation':'translocation',
	'protein activation':'protein activation',
	'binding / oligomerisation':'binding/oligomerisation',
	'protein deactivation':'protein deactivation',
	'transcriptional / translational repression':'transcriptional/translational repression',
	'transcriptional / translational induction':'transcriptional/translational induction',
	'degradation / secretion':'degradation/secretion',
	'nan':'nan',
	'dissociation':'dissociation',
	'cleavage / auto-cleavage':'cleavage/auto-cleavage',
}


In [71]:
# Maps each 'ReactionMode' spelling used in the sheet to its standardised
# reaction type. A missing mode (NaN) maps to 'bad'.
#
# History of superseded mappings (previously kept here as commented-out dicts):
# - the induction/repression modes were spelled
#   'transcription/translation repression' and 'transcription/translation induction';
# - 'binding/oligomerisation' (without spaces) was accepted as an extra key;
# - 'translation', 'transcription', 'by binding' and 'protein phosphorylation'
#   were first mapped to themselves and later marked 'bad';
# - 'cleavage / auto-cleavage' was marked 'bad' in the previous revision and
#   is a valid mode again below.
reaction_mode_dict = {
    'catalysis / auto-catalysis': 'catalysis/auto-catalysis',
    'translocation': 'translocation',
    'protein activation': 'protein activation',
    'binding / oligomerisation': 'binding/oligomerisation',
    'protein deactivation': 'protein deactivation',
    'transcriptional / translational repression': 'transcriptional/translational repression',
    'transcriptional / translational induction': 'transcriptional/translational induction',
    'degradation / secretion': 'degradation/secretion',
    np.nan: 'bad',
    'dissociation': 'dissociation',
    'cleavage / auto-cleavage': 'cleavage/auto-cleavage',
}

In [72]:
# Translate each raw ReactionMode into its standardised reaction type.
# Missing modes are detected with pd.isna instead of being looked up directly:
# NaN never compares equal to itself, so `reaction_mode_dict[<nan>]` only
# succeeds when the cell holds the very same object as the `np.nan` key.
# Unknown non-missing modes still raise KeyError, so a new spelling in the
# sheet cannot slip through unnoticed.
df_edges['reaction_type'] = df_edges['ReactionMode'].apply(
    lambda mode: reaction_mode_dict[np.nan] if pd.isna(mode) else reaction_mode_dict[mode]
)

In [73]:
# List, for review, the reactions whose mode was mapped to 'bad'.
df_edges[df_edges['reaction_type']=='bad'][['origin', 'ConnID', 'ReactionMode']]

Unnamed: 0,origin,ConnID,ReactionMode
31,_TBD-v2.7.6_PIS-model-Reactions,Conn032,
60,_TBD-v2.7.6_PIS-model-Reactions,Conn061,
87,_TBD-v2.7.6_PIS-model-Reactions,Conn088,
88,_TBD-v2.7.6_PIS-model-Reactions,Conn089,
91,_TBD-v2.7.6_PIS-model-Reactions,Conn092,
120,_TBD-v2.7.6_PIS-model-Reactions,Conn121,
150,_TBD-v2.7.6_PIS-model-Reactions,Conn151,
209,_TBD-v2.7.6_PIS-model-Reactions,Conn210,
310,_TBD-v2.7.6_PIS-model-Reactions,Conn311,
401,[TBD]-v2.7.6_PIS-model-Reactions,Conn402,


In [74]:
# Sanity check: every row must have its own ConnID.
df_edges['ConnID'].is_unique

True

In [75]:
# Derive a reaction identifier from each row's index label.
df_edges['RxID'] = [f"rxID_{row_label}" for row_label in df_edges.index]

In [76]:
# Sanity check: the generated RxID values must not repeat.
df_edges['RxID'].is_unique

True

In [77]:
# Save the processed edge table as a tab-separated file; the row index is written as the first column.
df_edges.to_csv(output_path / "edges-sheet.tsv", sep="\t")

In [78]:
# Un-prefixed homologue column name for each species, e.g. 'ath_homologues'.
homologue_cols = [f"{specie}_homologues" for specie in all_species]

In [79]:
# Species codes for which homologue columns are generated.
all_species

['ath', 'osa', 'stu', 'sly']

In [80]:
# Write the collected complexes to a file, one entry per line.
with open(output_path / "complexes_to_add.tsv", "w") as out:
    out.writelines(f"{c}\n" for c in complexes_to_add)

# END