In [1]:
version = "v0.0.5"

In [2]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:80% !important; }</style>"))

# Import neo4j DB: 4/5

Code to translate v2.7.4_PIS-model.xlsx to neo4j database. 

## Setup

In [3]:
from collections import defaultdict

In [4]:
import pandas as pd
import re
import numpy as np
import os

In [5]:
from py2neo import Graph, Node, Relationship

In [6]:
import helpers

In [7]:
from importlib import reload

Connect to graph via docker-compose link. See http://localhost:7474/browser/

In [8]:
graph = Graph(host="neo4j")

In [9]:
from pathlib import Path

base_path = Path("..")
parsed_path = base_path / "data" / "parsed"

## Components summary

In [10]:
q = '''MATCH (n) RETURN DISTINCT n.name AS name'''
nodes = graph.run(q).data()
all_nodes_in_components = set([(d["name"]) for d in nodes])

In [11]:
all_nodes_in_components

{'&alpha;/&beta; hydroxylase',
 '&beta;-carotene isomerase',
 '12,13-EOT',
 '12-OH-JA-Ile',
 '13-HPOT',
 '4CLL',
 '6K1',
 '6K2',
 '9-cis-&beta;-carotene',
 '9-cis-10&prime;-apo-&beta;-carotenal',
 'AAO',
 'ACC',
 'ACH',
 'ACO',
 'ACS',
 'ACX',
 'ACX2|CAT2',
 'ACX3|CAT2',
 'ADK',
 'ADP',
 'ADT',
 'AGO',
 'AGO1,5,7,10|CI',
 'AGO1,5,7,10|HC-Pro',
 'AHK',
 'AHP',
 'AMP',
 'AOC',
 'AOS',
 'AOX',
 'APT',
 'APX',
 'ARF',
 'ARR',
 'ASK',
 'ATP',
 'ATPB',
 'ATPB|HC-Pro',
 'AUX-signalling',
 'Ade',
 'Ado',
 'Anthocyanin-accumulation',
 'BA',
 'BA2H',
 'BAK1|FLS2|flg22',
 'BG',
 'BIK1',
 'BIK1|Ca2+',
 'BR1',
 'CAM',
 'CAMTA',
 'CAM|Ca2+',
 'CAT',
 'CAT|LSD1',
 'CBP60G',
 'CCD',
 'CDPK',
 'CI',
 'CI|PSAK',
 'CKX',
 'CL',
 'CLA',
 'CLH',
 'CM',
 'CML40|HC-Pro',
 'CML42|Ca2+',
 'CO',
 'CO2-deficiency',
 'COI1',
 'COI1|JA-Ile|SCF',
 'COI1|OMR1',
 'COI1|RBCS-3B',
 'CO|OBE1',
 'CP',
 'CPS',
 'CP|CPIP1,2b',
 'CP|CPIP2a',
 'CRT',
 'CRT1|ETR1',
 'CRT1|HC-Pro',
 'CRT2|HC-Pro',
 'CRT3|HC-Pro',
 'CRT|Ca2+',


In [12]:
len(all_nodes_in_components)

411

In [13]:
node_labels = helpers.node_labels

In [14]:
node_dict = {}
for label in node_labels:
    q = '''MATCH (n:%s) RETURN DISTINCT n.name'''%label
    s = set([d['n.name'] for d in graph.run(q).data()])
    print(label, len(s))
    node_dict[label] = s


PlantCoding 162
PlantNonCoding 9
PlantAbstract 7
ForeignEntity 3
ForeignCoding 14
ForeignNonCoding 0
ForeignAbstract 0
Complex 94
Process 6
MetaboliteFamily 6
Metabolite 116
Reaction 0


In [15]:
all_species = ['ath', 'osa', 'stu', 'sly']

### Reactions sheet

In [16]:
df_edges = pd.read_csv(parsed_path / "edges-sheet.tsv", sep="\t", index_col=0)

In [17]:
homolgue_cols_old = ['input1_ath_homologues', 'input1_osa_homologues', 'input1_sly_homologues', 'input1_stu_homologues', 
          'input2_ath_homologues', 'input2_osa_homologues', 'input2_sly_homologues', 'input2_stu_homologues', 
          'input3_ath_homologues', 'input3_osa_homologues', 'input3_sly_homologues', 'input3_stu_homologues', 
          'output1_ath_homologues', 'output1_osa_homologues', 'output1_sly_homologues', 'output1_stu_homologues']
for col in homolgue_cols_old:
    df_edges[col].fillna('', inplace=True)


In [18]:
df_edges.head(10)

Unnamed: 0,Status,AddedBy,ConnID,Species,input1_ID,input1_level,input1_localisation,input1_type,input2_ID,input2_level,...,output1_ath_homologues,output1_stu_homologues,output1_sly_homologues,output1_osa_homologues,input1_location,input2_location,input3_location,output1_location,reaction_type,reaction_id
0,forCB,KG,Conn001,ath,L-Met,family,ER,metabolite,SAMS,clade,...,,,,,endoplasmic reticulum,endoplasmic reticulum,putative:cytoplasm,endoplasmic reticulum,catalysis/auto-catalysis,rx00001
1,forCB,KG,Conn002,ath,SAMe,family,ER,metabolite,ACS,family,...,,,,,endoplasmic reticulum,endoplasmic reticulum,putative:cytoplasm,endoplasmic reticulum,catalysis/auto-catalysis,rx00002
2,forCB,KG,Conn003,ath,ACC,family,ER,metabolite,ACO,family,...,,,,,endoplasmic reticulum,endoplasmic reticulum,putative:cytoplasm,endoplasmic reticulum,catalysis/auto-catalysis,rx00003
3,forCB,KG,Conn004,ath,Cu2+,family,cytoplasm,metabolite,HMA,family,...,,,,,cytoplasm,endoplasmic reticulum,putative:cytoplasm,endoplasmic reticulum,translocation,rx00004
4,forCB,KG,Conn005,ath,ETR,family,ER,protein,Cu2+,family,...,"AT1G04310,AT3G04580,AT3G23150,AT2G40940,AT1G66340",,,,endoplasmic reticulum,endoplasmic reticulum,putative:cytoplasm,endoplasmic reticulum,protein activation,rx00005
5,forCB,KG,Conn006,ath,CTR,family,cytoplasm,protein [active],ETR,family,...,,,,,cytoplasm,endoplasmic reticulum,putative:cytoplasm,endoplasmic reticulum,binding/oligomerisation,rx00006
6,forCB,KG,Conn007,ath,CTR|ETR,family,ER,complex,RTE1,family,...,,,,,endoplasmic reticulum,golgi apparatus,putative:cytoplasm,endoplasmic reticulum,protein activation,rx00007
7,_TBD,KG,Conn008,ath,ET,family,cytoplasm,metabolite,ETR,family,...,,,,,cytoplasm,cytoplasm,putative:cytoplasm,cytoplasm,binding/oligomerisation,rx00008
8,forCB,KG,Conn009,ath,EIN2,node,ER,protein,ETP|SCF,family,...,AT5G03280,,,,endoplasmic reticulum,cytoplasm,putative:cytoplasm,endoplasmic reticulum,protein activation,rx00009
9,forCB,KG,Conn010,ath,CTR|ETR,family,ER,complex,EIN2,family,...,AT5G03280,,,,endoplasmic reticulum,endoplasmic reticulum,putative:cytoplasm,endoplasmic reticulum,protein deactivation,rx00010


In [19]:
df_edges[df_edges['input1_ID']=="DZ"][['input1_ID', 'input1_newID', 'input1_label']]

Unnamed: 0,input1_ID,input1_newID,input1_label
341,DZ,DZ,Metabolite
345,DZ,DZ,Metabolite
346,DZ,DZ,Metabolite
347,DZ,DZ,Metabolite
348,DZ,DZ,Metabolite
349,DZ,DZ,Metabolite
350,DZ,DZ,Metabolite
351,DZ,DZ,Metabolite
352,DZ,DZ,Metabolite


In [20]:
helpers.empty_strings

['-', '?', '[empty]', 'nan', 'n.a.', nan, '[undefined]', '']

In [21]:
def generate_list(subdf, ids, new_name, homologues=True):
    col_suffixes = ['_newID', '_location', '_label', '_form']
    new_col_suffixes = ['_name', '_location', '_label', '_form']

    if homologues:
        col_suffixes += homologue_cols
        new_col_suffixes += homologue_cols
        
    for old_suf, new_suf in zip(col_suffixes, new_col_suffixes):
        new_col = new_name + new_suf
        old_cols = [id_ + old_suf for id_ in ids]
        subdf[new_col] = subdf[old_cols].apply(lambda x: [i for i in x.values], axis=1)
        
        
def rename_target(subdf, id_, new_name, homologues=True):
    col_suffixes = ['_newID', '_location', '_label', '_form']
    new_col_suffixes = ['_name', '_location', '_label', '_form']

    if homologues:
        col_suffixes += homologue_cols
        new_col_suffixes += homologue_cols

    for old_suf, new_suf in zip(col_suffixes, new_col_suffixes):
        new_col = new_name + new_suf
        old_col = id_ + old_suf
        subdf[new_col] = subdf[old_col]

In [22]:
def get_x_nodes(df, x):
    rows_with_x = set()
    for i, row in df.iterrows():
        for col_prefix in ['input1', 'input2', 'input3', 'output1']:
            if row[col_prefix + "_newID"] in x:
                rows_with_x.add(i)
    return rows_with_x

In [23]:
def number_input_different(df, homologues=True, catalyst=False):
    ''' If catalyst is True, it is the last "input" col. '''

    if catalyst:
        # two inputs, input2 -> catalyst
        subdf2 = df[df["input3_newID"].isna()].copy()
        generate_list(subdf2, ['input1'], 'substrate', homologues=homologues)
        rename_target(subdf2, 'input2', 'catalyst', homologues=homologues)

        # three inputs, input3 -> catalyst
        subdf3 = df[~df["input3_newID"].isna()].copy()
        generate_list(subdf3, ['input1', 'input2'], 'substrate', homologues=homologues)        
        rename_target(subdf3, 'input3', 'catalyst', homologues=homologues) 
        
    else:
        # two inputs
        subdf2 = df[df["input3_newID"].isna()].copy()
        generate_list(subdf2, ['input1', 'input2'], 'substrate', homologues=homologues)

        # three inputs
        subdf3 = df[~df["input3_newID"].isna()].copy()
        generate_list(subdf3, ['input1', 'input2', 'input3'], 'substrate', homologues=homologues)
    
    # combine
    new_subdf = subdf2.append(subdf3)
    rename_target(new_subdf, 'output1', 'product', homologues=homologues)
    
    return new_subdf

In [24]:
df_translate = df_edges[['reaction_id', 
          'input1_ID', 'input2_ID', 'input3_ID', 'output1_ID', 
          'input1_newID', 'input2_newID', 'input3_newID', 'output1_newID']].copy()


In [25]:
translate = defaultdict(dict)
for _, row in df_translate.iterrows():
    for suffix in ['input1', 'input2', 'input3', 'output1']:
        old = f"{suffix}_ID"
        new = f"{suffix}_newID"
        if not (row[new] is np.nan):
            translate[row['reaction_id']][row[new]] = row[old]

In [26]:
import json

In [27]:
with open("family-to-og-ID-v0.0.4.json", "w") as out:
    json.dump(translate, out)

In [28]:
# transcription genes
substrate_cols_wo_homologues = ['substrate_name', 'substrate_form', 'substrate_label', 'substrate_location']
product_cols_wo_homologues = ['product_name', 'product_form', 'product_label',  'product_location']
catalyst_cols_wo_homologues = ['catalyst_name', 'catalyst_form', 'catalyst_label', 'catalyst_location']

homologue_cols = [f"_{x}_homologues" for x in all_species]

substrate_cols = [ f'substrate{x}' for x in ['_name', '_label', '_form', '_location']] +\
                [f"substrate{x}" for x in homologue_cols]
catalyst_cols = [ f'catalyst{x}' for x in ['_name', '_label', '_form', '_location']] +\
                [f"catalyst{x}" for x in homologue_cols] 
product_cols = [ f'product{x}' for x in ['_name', '_label', '_form', '_location']] +\
                [f"product{x}" for x in homologue_cols]

reaction_standard_columns = ['AddedBy', 'Species', 
       'AdditionalInfo',  'external_links', 'trust_level',
       'ModelV', 'ReactionEffect', 'reaction_type', 'Modifications', 'reaction_id']

In [29]:
def read_dict(file):
    d = {}
    with open(file, "r") as f:
        for line in f:
            key, value = line.strip().split("\t")
            d[key] = value
    return d

In [30]:
node_to_family = read_dict(parsed_path / "node_to_family.tsv")
clade_to_family = read_dict(parsed_path / "clade_to_family.tsv")

In [31]:
def node_id_to_node_label(id_):

    query = '''MATCH (s) WHERE s.name=$x 
               RETURN s.name AS substrate_name, labels(s) AS substrate_label'''
    
    cursor = graph.run(query, x=id_)
    d = cursor.data()
    
    if len(d) == 0:
        print(id_, d, "no hit")
        return ""
    elif len(d) == 1:
        s = set(d[0]['substrate_label']) - set(['Family'])
        return s.pop()
    else:
        print(id_, d, 'multiple hits') # should be impossible
        return ""

In [32]:
def get_name(ids_):
    names = []
    for x in ids_:
        label1 = node_id_to_node_label(x)
        if not (label1 == ''):
            names.append(x)
        else:
            if x in clade_to_family:
                names.append(clade_to_family[x])
                print('using as clade id to get family\t', x, "\t", clade_to_family[x])

            elif x in node_to_family:
                names.append(node_to_family[x])
                print('using as node id to get family\t', x, "\t", node_to_family[x])

            else:
                names.append(x)
    
    return names

# Add reactions

In [33]:
def pretty_print_result(t, df, qr, input_type, multiplier=1):
    if input_type in ['nodes', 'relationships']:
        key = input_type + '_created'
    else:
        key = input_type
    
    try:
        stat = qr.stats()[key]
    except:
        stat = 0
    
    print(f"{t:20}\t{df.shape[0]:3}\t{stat:3}", end="")
    if df.shape[0]*multiplier == stat:
        print()
    
    elif df.shape[0]*multiplier < stat:
        print(f"\t**too many {input_type} created**")
    else:
        print(f"\t**not all {input_type} created**")

In [34]:
df_edges.index.duplicated().sum()

0

## binding / oligomerisation

In [35]:
key = 'binding/oligomerisation'
reaction_type = "BINDING_OGLIMERISATION"
subdf = df_edges.loc[df_edges['reaction_type'] == key].copy()
print(reaction_type, subdf.shape[0])

BINDING_OGLIMERISATION 104


In [36]:
binding_wo_catalyst = subdf.loc[subdf['Modifications'] != 'with catalyst']
binding_w_catalyst = subdf.loc[subdf['Modifications'] == 'with catalyst']

In [37]:
subdf_wo_catalyst = number_input_different(binding_wo_catalyst)
subdf_w_catalyst = number_input_different(binding_w_catalyst, catalyst=True)

In [38]:
# make reaction nodes
f = f"{reaction_type}-reaction.tsv"
subdf[reaction_standard_columns].to_csv(f"../data/import/{f}", sep="\t", index=None)
query = helpers.reaction_node_query(f, name="line.reaction_id")

qr = graph.run(query)
pretty_print_result('Reaction', subdf, qr, 'nodes')

Reaction            	104	104


### Without catalyst

In [39]:
explode_cols = substrate_cols
exploded_new_subdf = helpers.unnesting(subdf_wo_catalyst, explode_cols)#.drop_duplicates()

In [40]:
# binding substrate edges
want_cols = reaction_standard_columns + substrate_cols

for t, this_subdf in exploded_new_subdf.groupby("substrate_label"):
    f = f"{reaction_type}-{t}-substrate_edges.tsv"
    this_subdf[want_cols].to_csv(f"../data/import/{f}", index=None, sep="\t")
    
    query = helpers.make_create_reaction_edge_query(f, 'SUBSTRATE', 
                                                    "substrate", "dne",
                                                    source_label=t, 
                                                    target_label="Reaction", 
                                                    target_name="line.reaction_id"
                                                   )

    qr = graph.run(query)

    pretty_print_result(t, this_subdf, qr, 'relationships')


Complex             	 16	 16
ForeignCoding       	 30	 30
ForeignEntity       	  3	  3
Metabolite          	 32	 32
PlantAbstract       	  4	  4
PlantCoding         	131	131
Process             	  3	  3


In [41]:
f = f"{reaction_type}-product_edges.tsv"
want_cols = reaction_standard_columns + product_cols
subdf_wo_catalyst[want_cols].to_csv(f'../data/import/{f}', sep="\t", index=None)

In [42]:
# binding product edges
query = helpers.make_create_reaction_edge_query(f, "PRODUCT", 
                                                "dne", "product",
                                                source_name="line.reaction_id", 
                                                source_label="Reaction"
                                               )
qr = graph.run(query)
pretty_print_result('-', subdf_wo_catalyst, qr, 'relationships')

-                   	103	103


### With catalyst

In [43]:
explode_cols = substrate_cols
exploded_new_subdf = helpers.unnesting(subdf_w_catalyst, explode_cols).drop_duplicates()

In [44]:
# binding substrate edges

want_cols = reaction_standard_columns + substrate_cols

for t, this_subdf in exploded_new_subdf.groupby("substrate_label"):
    f = f"{reaction_type}-wicat-{t}-substrate_edges.tsv"
    this_subdf[want_cols].to_csv(f"../data/import/{f}", index=None, sep="\t")
    
    query = helpers.make_create_reaction_edge_query(f, 'SUBSTRATE',
                                                    "substrate", "dne",
                                                    source_label=t, target_label="Reaction", 
                                                    target_name="line.reaction_id"
                                                   )
    qr = graph.run(query)
    pretty_print_result(t, this_subdf, qr, 'relationships')

PlantCoding         	  1	  1


In [45]:
# binding catalyst edges

want_cols = reaction_standard_columns + catalyst_cols

for t, this_subdf in subdf_w_catalyst.groupby("catalyst_label"):
    f = f"{reaction_type}-wicat-{t}-catalyst_label_edges.tsv"
    this_subdf[want_cols].to_csv(f"../data/import/{f}", index=None, sep="\t")
    
    query = helpers.make_create_reaction_edge_query(f, 'ACTIVATES',
                                                    "catalyst", "dne",
                                                    source_label=t, target_label="Reaction", 
                                                    target_name="line.reaction_id"
                                                   )
    qr = graph.run(query)
    pretty_print_result(t, subdf_w_catalyst, qr, 'relationships')

PlantCoding         	  1	  1


In [46]:
f = f"{reaction_type}-wicat-catalyst-product_edges.tsv"
want_cols = reaction_standard_columns + product_cols
subdf_w_catalyst[want_cols].to_csv(f'../data/import/{f}', sep="\t", index=None)

In [47]:
# binding product edges
query = helpers.make_create_reaction_edge_query(f, "PRODUCT",
                                                "dne", "product", 
                                                source_name="line.reaction_id"
                                               )
qr = graph.run(query)
pretty_print_result('-', subdf_w_catalyst, qr, 'relationships')

-                   	  1	  1


## catalysis / auto-catalysis

In [48]:
key = 'catalysis/auto-catalysis'
reaction_type = 'CATALYSIS'
subdf = df_edges[df_edges['reaction_type'] == key]
print(reaction_type, subdf.shape[0])

CATALYSIS 79


In [49]:
subdf.duplicated().sum()

0

In [50]:
# make reaction nodes
f = f"{reaction_type}-wicat-reaction.tsv"
subdf[reaction_standard_columns].to_csv(f"../data/import/{f}", sep="\t", index=None)
query = helpers.reaction_node_query(f, name="line.reaction_id")

qr = graph.run(query)
pretty_print_result('Reaction', subdf, qr, 'nodes')

Reaction            	 79	 79


In [51]:
catalysis_wo_catalyst = subdf[subdf['input2_ID'].isna() & subdf['input3_ID'].isna()]
catalysis_w_catalyst = subdf[~(subdf['input2_ID'].isna() & subdf['input3_ID'].isna())]

### With catalyst

In [52]:
subdf_w_catalyst = number_input_different(catalysis_w_catalyst, catalyst=True)
new_subdf = number_input_different(subdf_w_catalyst, catalyst=True)

In [53]:
# substrate to reaction
exploded_new_subdf = helpers.unnesting(new_subdf, substrate_cols)
want_cols = reaction_standard_columns + substrate_cols
for t, this_subdf in exploded_new_subdf.groupby("substrate_label"):
    f = f"{reaction_type}-wicat-{t}-substrate_edges.tsv"
    this_subdf[want_cols].to_csv(f"../data/import/{f}", index=None, sep="\t")
    
    query = helpers.make_create_reaction_edge_query(f, 'SUBSTRATE', 
                                                    "substrate", "dne",
                                                    source_label=t, target_label="Reaction", 
                                                    target_name="line.reaction_id"
                                                   )
    qr = graph.run(query)
    pretty_print_result(t, this_subdf, qr, 'relationships')

Metabolite          	 79	 79
PlantCoding         	 21	 21


In [54]:
# catalyst to reaction
want_cols = reaction_standard_columns + catalyst_cols

for t, this_subdf in subdf_w_catalyst.groupby("catalyst_label"):
    f = f"{reaction_type}-wicat-{t}-catalyst_edges.tsv"
    this_subdf[want_cols].to_csv(f"../data/import/{f}", index=None, sep="\t")
    
    query = helpers.make_create_reaction_edge_query(f, 'ACTIVATES', 
                                                    "catalyst", "dne",
                                                    source_label=t, target_label="Reaction", 
                                                    target_name="line.reaction_id"
                                                   )
    qr = graph.run(query)
    pretty_print_result(t, this_subdf, qr, 'relationships')

Metabolite          	 21	 21
MetaboliteFamily    	  1	  1
PlantAbstract       	  3	  3
PlantCoding         	 52	 52
Process             	  1	  1


In [55]:
# product to reaction
want_cols = reaction_standard_columns + product_cols
for t, this_subdf in subdf_w_catalyst.groupby("product_label"):
    f = f"{reaction_type}-wicat-{t}-product_edges.tsv"    
    this_subdf[want_cols].to_csv(f"../data/import/{f}", index=None, sep="\t")
    
    query = helpers.make_create_reaction_edge_query(f, 'PRODUCT', 
                                                    "dne", "product",
                                                    target_label=t, source_label="Reaction", 
                                                    source_name="line.reaction_id"
                                                   )    
    qr = graph.run(query)

    pretty_print_result(t, this_subdf, qr, 'relationships')

Metabolite          	 76	 76
MetaboliteFamily    	  2	  2


### Without catalyst

In [56]:
rename_target(catalysis_wo_catalyst, 'output1', 'product')
rename_target(catalysis_wo_catalyst, 'input1',  'substrate')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  subdf[new_col] = subdf[old_col]


In [57]:
catalysis_wo_catalyst['reaction_id']

378    rx00379
Name: reaction_id, dtype: object

In [58]:
# binding substrate edges
want_cols = reaction_standard_columns + substrate_cols

for t, this_subdf in catalysis_wo_catalyst.groupby("substrate_label"):
    f = f"{reaction_type}-{t}-substrate_edges.tsv"
    this_subdf[want_cols].to_csv(f"../data/import/{f}", index=None, sep="\t")
    
    query = helpers.make_create_reaction_edge_query(f, 'SUBSTRATE', 
                                                    "substrate", "dne",
                                                    source_label=t, 
                                                    target_label="Reaction", 
                                                    target_name="line.reaction_id"
                                                   )

    qr = graph.run(query)

    pretty_print_result(t, this_subdf, qr, 'relationships')


MetaboliteFamily    	  1	  1


In [59]:
f = f"{reaction_type}-product_edges.tsv"
want_cols = reaction_standard_columns + product_cols
catalysis_wo_catalyst[want_cols].to_csv(f'../data/import/{f}', sep="\t", index=None)

In [60]:
# binding product edges
query = helpers.make_create_reaction_edge_query(f, "PRODUCT", 
                                                "dne", "product",
                                                source_name="line.reaction_id", 
                                                source_label="Reaction"
                                               )
qr = graph.run(query)
pretty_print_result('-', catalysis_wo_catalyst, qr, 'relationships')

-                   	  1	  1


In [61]:
q = '''MATCH ()-[r:SUBSTRATE {reaction_type:"catalysis/auto-catalysis"}]->() RETURN r'''
c = graph.run(q).data()
len(c) # should be ??

101

In [62]:
q = '''MATCH ()-[r:PRODUCT {reaction_type:"catalysis/auto-catalysis"}]->() RETURN r'''
c = graph.run(q).data()
len(c) # should be ??

79

## dissociation

In [63]:
key = 'dissociation'
reaction_type = 'DISSOCIATION'
subdf = df_edges[df_edges['reaction_type'] == key].copy()
print(reaction_type, subdf.shape[0])

DISSOCIATION 1


In [64]:
subdf

Unnamed: 0,Status,AddedBy,ConnID,Species,input1_ID,input1_level,input1_localisation,input1_type,input2_ID,input2_level,...,output1_ath_homologues,output1_stu_homologues,output1_sly_homologues,output1_osa_homologues,input1_location,input2_location,input3_location,output1_location,reaction_type,reaction_id
95,forCB,KG,Conn096,ath,NPR1|NPR1,family,cytoplasm,complex,TRX-H,family,...,AT1G64280,,,,cytoplasm,cytoplasm,putative:cytoplasm,cytoplasm,dissociation,rx00096


In [65]:
rename_target(subdf, 'output1', 'product')
rename_target(subdf, 'input1',  'substrate')
rename_target(subdf, 'input2',  'catalyst')

In [66]:
# make reaction nodes
f = f"{reaction_type}-reaction.tsv"
subdf[reaction_standard_columns].to_csv(f"../data/import/{f}", sep="\t", index=None)
query = helpers.reaction_node_query(f, name="line.reaction_id")

qr = graph.run(query)
pretty_print_result('Reaction', subdf, qr, 'nodes')

Reaction            	  1	  1


In [67]:
# substrate to reaction edge
f =  f"{reaction_type}-{t}-substrate_edges.tsv" 
want_cols = reaction_standard_columns + substrate_cols
subdf.to_csv(f"../data/import/{f}", sep="\t", index=None)

query = helpers.make_create_reaction_edge_query(f, 'SUBSTRATE',
                                                "substrate", "dne", 
                                                target_label="Reaction", 
                                                target_name="line.reaction_id"
                                                
                              )
qr = graph.run(query)
pretty_print_result("-", subdf, qr, 'relationships')

-                   	  1	  1


In [68]:
# catalyst to reaction edge

f =  f"{reaction_type}-{t}-catalyst_edges.tsv" 
want_cols = reaction_standard_columns + catalyst_cols
subdf.to_csv(f"../data/import/{f}", sep="\t", index=None)

query = helpers.make_create_reaction_edge_query(f, 'ACTIVATES',
                                                "catalyst" , "dne", 
                                                target_label="Reaction", 
                                                target_name="line.reaction_id"
                              )
qr = graph.run(query)
pretty_print_result("-", subdf, qr, 'relationships')

-                   	  1	  1


In [69]:
# product to reaction edge

f =  f"{reaction_type}-{t}-product_edges.tsv" 
want_cols = reaction_standard_columns + product_cols
subdf.to_csv(f"../data/import/{f}", sep="\t", index=None)

query = helpers.make_create_reaction_edge_query(f, 'PRODUCT',
                                                "dne" , "product", 
                                                source_name="line.reaction_id", 
                                                source_label="Reaction"
                                               
                              )
qr = graph.run(query)
pretty_print_result("-", subdf, qr, 'relationships')

-                   	  1	  1


## degradation

In [70]:
key = 'degradation/secretion'
reaction_type = 'DEGRADATION_SECRETION'
subdf = df_edges[df_edges['reaction_type'] == key].copy()
print(reaction_type, subdf.shape[0])

DEGRADATION_SECRETION 32


In [71]:
rename_target(subdf, 'input1',  'catalyst')
rename_target(subdf, 'input2',  'substrate')

In [72]:
# make reaction nodes
f = f"{reaction_type}-reaction.tsv"
subdf[reaction_standard_columns].to_csv(f"../data/import/{f}", sep="\t", index=None)
query = helpers.reaction_node_query(f, name="line.reaction_id")

qr = graph.run(query)
pretty_print_result('Reaction', subdf, qr, 'nodes')

Reaction            	 32	 32


In [73]:
# substrate to reaction edge

f =  f"{reaction_type}-substrate_edges.tsv" 
want_cols = reaction_standard_columns + substrate_cols
subdf[want_cols].to_csv(f"../data/import/{f}", sep="\t", index=None)

query = helpers.make_create_reaction_edge_query(f, 'SUBSTRATE',
                                                "substrate", "dne", 
                                                target_label="Reaction", 
                                                target_name="line.reaction_id"
                              )
qr = graph.run(query)
pretty_print_result('-', subdf, qr, 'relationships')

-                   	 32	 32


In [74]:
# catalyst to reaction edge

f =  f"{reaction_type}-catalyst_edges.tsv" 
want_cols = reaction_standard_columns + catalyst_cols
subdf[want_cols].to_csv(f"../data/import/{f}", sep="\t", index=None)

query = helpers.make_create_reaction_edge_query(f, 'ACTIVATES',
                                                "catalyst", "dne", 
                                                target_label="Reaction", 
                                                target_name="line.reaction_id"
                              )
qr = graph.run(query)
pretty_print_result('-', subdf, qr, 'relationships')

-                   	 32	 32


## translocation

In [75]:
key = 'translocation'
reaction_type = 'TRANSLOCATION'
subdf = df_edges[df_edges['reaction_type'] == key].copy()
print(reaction_type, subdf.shape[0])

TRANSLOCATION 5


In [76]:
rename_target(subdf, 'output1', 'product')
rename_target(subdf, 'input1',  'substrate')
rename_target(subdf, 'input2',  'catalyst')

In [77]:
# make reaction nodes
f = f"{reaction_type}-reaction.tsv"
subdf[reaction_standard_columns].to_csv(f"../data/import/{f}", sep="\t", index=None)
query = helpers.reaction_node_query(f, name="line.reaction_id")

qr = graph.run(query)
pretty_print_result('Reaction', subdf, qr, 'nodes')

Reaction            	  5	  5


In [78]:
# substrate to reaction edge

f =  f"{reaction_type}-substrate_edges.tsv" 
want_cols = reaction_standard_columns + substrate_cols
subdf[want_cols].to_csv(f"../data/import/{f}", sep="\t", index=None)

query = helpers.make_create_reaction_edge_query(f, 'TRANSLOCATE_FROM',
                                                "substrate", "dne", 
                                                target_label="Reaction", 
                                                target_name="line.reaction_id"
                              )
qr = graph.run(query)
pretty_print_result('-', subdf, qr, 'relationships')

-                   	  5	  5


In [79]:
# catalyst to reaction edge

f =  f"{reaction_type}-catalyst_edges.tsv" 
want_cols = reaction_standard_columns + catalyst_cols
subdf[want_cols].to_csv(f"../data/import/{f}", sep="\t", index=None)

query = helpers.make_create_reaction_edge_query(f, 'ACTIVATES',
                                                "catalyst", "dne", 
                                                target_label="Reaction", 
                                                target_name="line.reaction_id"
                              )
qr = graph.run(query)
pretty_print_result('-', subdf, qr, 'relationships')

-                   	  5	  5


In [80]:
# product edge
f =  f"{reaction_type}--product_edges.tsv" 
want_cols = reaction_standard_columns + product_cols
subdf[want_cols].to_csv(f"../data/import/{f}", sep="\t", index=None)

query = helpers.make_create_reaction_edge_query(f, 'TRANSLOCATE_TO',
                                                "dne" , "product", 
                                                source_name="line.reaction_id", 
                                                source_label="Reaction"
                              )
qr = graph.run(query)
pretty_print_result('-', subdf, qr, 'relationships')

-                   	  5	  5


## protein activation

In [81]:
key = 'protein activation'
reaction_type = 'PROTEIN_ACTIVATION'
subdf = df_edges[df_edges['reaction_type'] == key].copy()
print(reaction_type, subdf.shape[0])

PROTEIN_ACTIVATION 52


In [82]:
rename_target(subdf, 'output1', 'product')
rename_target(subdf, 'input2',  'catalyst')
rename_target(subdf, 'input1',  'substrate')

In [83]:
# make reaction nodes
f = f"{reaction_type}-reaction.tsv"
subdf[reaction_standard_columns].to_csv(f"../data/import/{f}", sep="\t", index=None)
query = helpers.reaction_node_query(f, name="line.reaction_id")

qr = graph.run(query)
pretty_print_result('Reaction', subdf, qr, 'nodes')

Reaction            	 52	 52


In [84]:
# substrate to reaction edge

f =  f"{reaction_type}-substrate_edges.tsv" 
want_cols = reaction_standard_columns + substrate_cols
subdf[want_cols].to_csv(f"../data/import/{f}", sep="\t", index=None)

query = helpers.make_create_reaction_edge_query(f, 'SUBSTRATE',
                                                "substrate", "dne", 
                                                target_label="Reaction", 
                                                target_name="line.reaction_id"
                              )
qr = graph.run(query)
pretty_print_result('-', subdf, qr, 'relationships')

-                   	 52	 52


In [85]:
# catalyst to reaction edge

f =  f"{reaction_type}-catalyst_edges.tsv" 
want_cols = reaction_standard_columns + catalyst_cols
subdf[want_cols].to_csv(f"../data/import/{f}", sep="\t", index=None)

query = helpers.make_create_reaction_edge_query(f, 'ACTIVATES',
                                                "catalyst", "dne", 
                                                target_label="Reaction", 
                                                target_name="line.reaction_id"
                              )
qr = graph.run(query)
pretty_print_result('-', subdf, qr, 'relationships')

-                   	 52	 52


In [86]:
# product to reaction edge
f =  f"{reaction_type}-product_edges.tsv" 
want_cols = reaction_standard_columns + product_cols
subdf[want_cols].to_csv(f"../data/import/{f}", sep="\t", index=None)

query = helpers.make_create_reaction_edge_query(f, 'PRODUCT', "dne" , "product", 
                                                source_name="line.reaction_id", 
                                                source_label="Reaction"
                              )
qr = graph.run(query)
pretty_print_result('-', subdf, qr, 'relationships')

-                   	 52	 52


## protein deactivation

In [87]:
key = 'protein deactivation'
reaction_type = 'PROTEIN_DEACTIVATION'
subdf = df_edges[df_edges['reaction_type'] == key].copy()
print(reaction_type, subdf.shape[0])

PROTEIN_DEACTIVATION 4


In [88]:
rename_target(subdf, 'output1', 'product')
rename_target(subdf, 'input2',  'substrate')
rename_target(subdf, 'input1',  'catalyst')

In [89]:
# make reaction nodes
f = f"{reaction_type}-reaction.tsv"
subdf[reaction_standard_columns].to_csv(f"../data/import/{f}", sep="\t", index=None)
query = helpers.reaction_node_query(f, name="line.reaction_id")

qr = graph.run(query)
pretty_print_result('Reaction', subdf, qr, 'nodes')

Reaction            	  4	  4


In [90]:
# substrate to reaction edge

f =  f"{reaction_type}-substrate_edges.tsv" 
want_cols = reaction_standard_columns + substrate_cols
subdf[want_cols].to_csv(f"../data/import/{f}", sep="\t", index=None)

query = helpers.make_create_reaction_edge_query(f, 'SUBSTRATE',
                                                "substrate", "dne", 
                                                target_label="Reaction", 
                                                target_name="line.reaction_id"
                              )
qr = graph.run(query)
pretty_print_result('-', subdf, qr, 'relationships')

-                   	  4	  4


In [91]:
# catalyst to reaction edge

f =  f"{reaction_type}-catalyst_edges.tsv" 
want_cols = reaction_standard_columns + catalyst_cols
subdf[want_cols].to_csv(f"../data/import/{f}", sep="\t", index=None)

query = helpers.make_create_reaction_edge_query(f, 'ACTIVATES',
                                                "catalyst", "dne", 
                                                target_label="Reaction", 
                                                target_name="line.reaction_id"
                              )
qr = graph.run(query)
pretty_print_result('-', subdf, qr, 'relationships')

-                   	  4	  4


In [92]:
# product to reaction edge
f =  f"{reaction_type}-product_edges.tsv" 
want_cols = reaction_standard_columns + product_cols
subdf[want_cols].to_csv(f"../data/import/{f}", sep="\t", index=None)

query = helpers.make_create_reaction_edge_query(f, 'PRODUCT', "dne" , "product", 
                                                source_name="line.reaction_id", 
                                                source_label="Reaction"
                              )
qr = graph.run(query)
pretty_print_result('-', subdf, qr, 'relationships')

-                   	  4	  4


In [93]:
# catalyst to product edge
f =  f"{reaction_type}-product_edges.tsv" 
want_cols = reaction_standard_columns + catalyst_cols + product_cols
subdf.to_csv(f"../data/import/{f}", sep="\t", index=None)

query = helpers.make_create_reaction_edge_query(f, 'PRODUCT',
                                                "catalyst" , "product"
                              )
qr = graph.run(query)
pretty_print_result('-', subdf, qr, 'relationships')

-                   	  4	  4


## transcriptional / translational regulation

In [94]:
reaction_type = 'TRANSCRIPTIONAL_TRANSLATIONAL'
keys = ['transcriptional/translational induction', 'transcriptional/translational repression']
subdf = df_edges[df_edges['reaction_type'].isin(keys)]
print(reaction_type, subdf.shape[0])

TRANSCRIPTIONAL_TRANSLATIONAL 79


In [95]:
def number_input_different_reverse(df, homologues=True):
    ''' If catalyst is True, they are the first "input" cols. 
    last one is left as substrate. '''

    # two inputs, input1 -> catalyst
    subdf2 = df[df["input3_newID"].isna()].copy()
    generate_list(subdf2, ['input1'], 'catalyst', homologues=homologues)
    rename_target(subdf2, 'input2', 'substrate', homologues=homologues)

    # three inputs, input1, input2 -> catalyst
    subdf3 = df[~df["input3_newID"].isna()].copy()
    generate_list(subdf3, ['input1', 'input2'], 'catalyst', homologues=homologues)        
    rename_target(subdf3, 'input3', 'substrate', homologues=homologues)
     
    # combine
    new_subdf = subdf2.append(subdf3)
    rename_target(new_subdf, 'output1', 'product', homologues=homologues)
    
    return new_subdf

In [96]:
new_subdf = number_input_different_reverse(subdf, homologues=True)

In [97]:
new_subdf[new_subdf['substrate_name'] != new_subdf['product_name']][["Status", "ConnID", 'substrate_name', 'product_name' ]]

Unnamed: 0,Status,ConnID,substrate_name,product_name
44,_TBD,Conn045,OMR,COI1|OMR1
92,_TBD,Conn093,PAD4,"NPR1|PAD4|TGA2,5,6"
115,_TBD,Conn116,EDS5,"EDS5|NPR1|TGA2,5,6"
117,_TBD,Conn118,EDS1,"EDS1|NPR1|TGA2,5,6"
119,_TBD,Conn120,PAD4,"NPR1|PAD4|TGA2,5,6"
148,_TBD,Conn149,GST,GSTU24|ROS


In [98]:
new_subdf[(  (new_subdf['substrate_form'] == 'gene') & (new_subdf['substrate_location'] != 'nucleus')  )][["Status", "ConnID"] + substrate_cols_wo_homologues]

Unnamed: 0,Status,ConnID,substrate_name,substrate_form,substrate_label,substrate_location
27,forCB,Conn028,LOX,gene,PlantCoding,chloroplast
97,forCB,Conn098,WRKY,gene,PlantCoding,cytoplasm
110,forCB,Conn111,TAS3,gene,PlantNonCoding,cytoplasm
123,forCB,Conn124,WRKY,gene,PlantCoding,cytoplasm
160,forCB,Conn161,PAD4,gene,PlantCoding,cytoplasm
186,forCB,Conn187,ICS,gene,PlantCoding,chloroplast
187,forCB,Conn188,ICS,gene,PlantCoding,chloroplast
188,forCB,Conn189,ICS,gene,PlantCoding,chloroplast
189,forCB,Conn190,ICS,gene,PlantCoding,chloroplast
192,forCB,Conn193,ACO,gene,PlantCoding,endoplasmic reticulum


In [99]:
#manualfix
new_subdf.loc[(  (new_subdf['substrate_form'] == 'gene') & (new_subdf['substrate_location'] != 'nucleus')  ), 'substrate_location'] = 'nucleus'

In [100]:
new_subdf[(  (new_subdf['product_form'].isin(['rna', 'ncRNA']) ))]#& (new_subdf['substrate_location'] != 'nucleus')  )][["Status", "ConnID"] + substrate_cols_wo_homologues]

Unnamed: 0,Status,AddedBy,ConnID,Species,input1_ID,input1_level,input1_localisation,input1_type,input2_ID,input2_level,...,substrate_stu_homologues,substrate_sly_homologues,product_name,product_location,product_label,product_form,product_ath_homologues,product_osa_homologues,product_stu_homologues,product_sly_homologues
110,forCB,KG,Conn111,ath,MIR390,family,cytoplasm,ncRNA,TAS3,family,...,,,TAS3,cytoplasm,PlantNonCoding,ncRNA,"AT3G17185,AT5G49615",,,


In [101]:
new_subdf[(  (new_subdf['substrate_form'].isin(['rna', 'ncRNA'])) & (new_subdf['substrate_location'] != 'cytoplams')  )][["Status", "ConnID"] + substrate_cols_wo_homologues]

Unnamed: 0,Status,ConnID,substrate_name,substrate_form,substrate_label,substrate_location


In [102]:
new_subdf[(  (new_subdf['product_form'] == 'protein') & (new_subdf['product_location'] != 'cytoplasm')  )][["Status", "ConnID"] + product_cols_wo_homologues]

Unnamed: 0,Status,ConnID,product_name,product_form,product_label,product_location
11,forCB,Conn012,EBF,protein,PlantCoding,nucleus
20,forCB,Conn021,PR3,protein,PlantCoding,nucleus
21,forCB,Conn022,PR4,protein,PlantCoding,nucleus
24,forCB,Conn025,PDF1,protein,PlantCoding,nucleus
27,forCB,Conn028,LOX,protein,PlantCoding,chloroplast
45,forCB,Conn046,JAZ,protein,PlantCoding,nucleus
47,forCB,Conn048,MYC,protein,PlantCoding,nucleus
48,forCB,Conn049,CLH,protein,PlantCoding,nucleus
49,forCB,Conn050,JAL,protein,PlantCoding,nucleus
50,forCB,Conn051,PR13,protein,PlantCoding,nucleus


In [103]:
# manualfix
new_subdf.loc[(  (new_subdf['product_form'] == 'protein') & (new_subdf['product_location'] != 'cytoplasm')  ), 'product_location'] = 'cytoplasm'

In [104]:
new_subdf[~new_subdf['substrate_label'].isin(["PlantCoding", "PlantNonCoding"])][["Status", "ConnID"] + substrate_cols_wo_homologues] #+ catalyst_cols_wo_homologues + product_cols_wo_homologues]

Unnamed: 0,Status,ConnID,substrate_name,substrate_form,substrate_label,substrate_location
62,forCB,Conn063,Anthocyanin-accumulation,process,Process,cytoplasm
63,forCB,Conn064,Trichome-initiation,process,Process,cytoplasm


In [105]:
new_subdf[~new_subdf['product_label'].isin(["PlantCoding", "PlantNonCoding"])][["Status", "ConnID"] + product_cols_wo_homologues]

Unnamed: 0,Status,ConnID,product_name,product_form,product_label,product_location
44,_TBD,Conn045,COI1|OMR1,complex,Complex,nucleus
62,forCB,Conn063,Anthocyanin-accumulation,process_active,Process,cytoplasm
63,forCB,Conn064,Trichome-initiation,process_active,Process,cytoplasm
92,_TBD,Conn093,"NPR1|PAD4|TGA2,5,6",complex,Complex,nucleus
115,_TBD,Conn116,"EDS5|NPR1|TGA2,5,6",complex,Complex,nucleus
117,_TBD,Conn118,"EDS1|NPR1|TGA2,5,6",complex,Complex,nucleus
119,_TBD,Conn120,"NPR1|PAD4|TGA2,5,6",complex,Complex,nucleus
148,_TBD,Conn149,GSTU24|ROS,complex,Complex,nucleus


## Reaction nodes

In [106]:
new_subdf[new_subdf['reaction_id'].isin(['rx00148', 'rx00185'])][substrate_cols_wo_homologues + product_cols_wo_homologues + catalyst_cols_wo_homologues]

Unnamed: 0,substrate_name,substrate_form,substrate_label,substrate_location,product_name,product_form,product_label,product_location,catalyst_name,catalyst_form,catalyst_label,catalyst_location
147,EDS1,gene,PlantCoding,nucleus,EDS1,protein,PlantCoding,cytoplasm,[CAMTA],[protein],[PlantCoding],[nucleus]
184,EDS1,gene,PlantCoding,nucleus,EDS1,protein,PlantCoding,cytoplasm,[CAMTA],[protein],[PlantCoding],[nucleus]


In [107]:
# make reaction nodes
f = f"{reaction_type}-reaction.tsv"
new_subdf[reaction_standard_columns].to_csv(f"../data/import/{f}", sep="\t", index=None)
query = helpers.reaction_node_query(f, name="line.reaction_id")

qr = graph.run(query)
pretty_print_result('Reaction', new_subdf, qr, 'nodes')

Reaction            	 79	 79


## induction

In [108]:
edge_label = "ACTIVATES"
reaction_type = 'transcriptional/translational induction'
reaction_type_nice = 'transcriptional_translational_induction'
act_subdf = new_subdf[new_subdf['reaction_type']==reaction_type]

In [109]:
# substrate to reaction edge
f =  f"{reaction_type_nice}-substrate_edges.tsv" 
want_cols = reaction_standard_columns + substrate_cols
act_subdf[want_cols].to_csv(f"../data/import/{f}", sep="\t", index=None)

query = helpers.make_create_reaction_edge_query(f, 'SUBSTRATE',
                                                "substrate", "dne", 
                                                target_label="Reaction", 
                                                target_name="line.reaction_id"
                              )
qr = graph.run(query)
pretty_print_result('-', act_subdf, qr, 'relationships')

-                   	 60	 60


In [110]:
# catalyst to reaction edge
exploded_new_subdf = helpers.unnesting(act_subdf, catalyst_cols)

f =  f"{reaction_type_nice}-catalyst_edges.tsv" 
want_cols = reaction_standard_columns + catalyst_cols
exploded_new_subdf[want_cols].to_csv(f"../data/import/{f}", sep="\t", index=None)

query = helpers.make_create_reaction_edge_query(f, 'ACTIVATES',
                                                "catalyst", "dne", 
                                                target_label="Reaction", 
                                                target_name="line.reaction_id"
                              )
qr = graph.run(query)
pretty_print_result('-', exploded_new_subdf, qr, 'relationships')

-                   	 62	 62


In [111]:
# product to reaction edge
f =  f"{reaction_type_nice}-product_edges.tsv" 
want_cols = reaction_standard_columns + product_cols
act_subdf[want_cols].to_csv(f"../data/import/{f}", sep="\t", index=None)

query = helpers.make_create_reaction_edge_query(f, 'PRODUCT', "dne" , "product", 
                                                source_name="line.reaction_id", 
                                                source_label="Reaction"
                              )
qr = graph.run(query)
pretty_print_result('-', act_subdf, qr, 'relationships')

-                   	 60	 60


## repression

In [112]:
reaction_type = 'transcriptional/translational repression'
reaction_type_nice = 'transcriptional_translational_repression'
inh_subdf = new_subdf[new_subdf['reaction_type']==reaction_type]

In [113]:
# substrate to reaction edge
f =  f"{reaction_type_nice}-substrate_edges.tsv" 
want_cols = reaction_standard_columns + substrate_cols
inh_subdf[want_cols].to_csv(f"../data/import/{f}", sep="\t", index=None)

query = helpers.make_create_reaction_edge_query(f, 'SUBSTRATE',
                                                "substrate", "dne", 
                                                target_label="Reaction", 
                                                target_name="line.reaction_id"
                              )
qr = graph.run(query)
pretty_print_result('-', inh_subdf, qr, 'relationships')

-                   	 19	 19


In [114]:
# catalyst to reaction edge
exploded_new_subdf = helpers.unnesting(inh_subdf, catalyst_cols)


f =  f"{reaction_type_nice}-catalyst_edges.tsv" 
want_cols = reaction_standard_columns + catalyst_cols
exploded_new_subdf[want_cols].to_csv(f"../data/import/{f}", sep="\t", index=None)

query = helpers.make_create_reaction_edge_query(f, 'INHIBITS',
                                                "catalyst", "dne", 
                                                target_label="Reaction", 
                                                target_name="line.reaction_id"
                              )
qr = graph.run(query)
pretty_print_result('-', exploded_new_subdf, qr, 'relationships')

-                   	 19	 19


In [115]:
# product to reaction edge
f =  f"{reaction_type_nice}-product_edges.tsv" 
want_cols = reaction_standard_columns + product_cols
inh_subdf[want_cols].to_csv(f"../data/import/{f}", sep="\t", index=None)

query = helpers.make_create_reaction_edge_query(f, 'PRODUCT', "dne" , "product", 
                                                source_name="line.reaction_id", 
                                                source_label="Reaction"
                              )
qr = graph.run(query)
pretty_print_result('-', inh_subdf, qr, 'relationships')

-                   	 19	 19


## cleavage/auto-cleavage

In [116]:
#Image(filename='./reaction_types/activation/cleavage_autocleavage.png')

In [117]:
key = 'cleavage/auto-cleavage'
reaction_type = 'CLEAVAGE_AUTOCLEAVAGE'
subdf = df_edges[df_edges['reaction_type'] == key].copy()
print(reaction_type, subdf.shape[0])

CLEAVAGE_AUTOCLEAVAGE 29


In [118]:
# make reaction nodes
f = f"{reaction_type}-wicat-reaction.tsv"
subdf[reaction_standard_columns].to_csv(f"../data/import/{f}", sep="\t", index=None)
query = helpers.reaction_node_query(f, name="line.reaction_id")

qr = graph.run(query)
pretty_print_result('Reaction', subdf, qr, 'nodes')

Reaction            	 29	 29


In [119]:
cleavage_wo_catalyst = subdf[subdf['input2_ID'].isna() & subdf['input3_ID'].isna()]
cleavage_w_catalyst = subdf[~(subdf['input2_ID'].isna() & subdf['input3_ID'].isna())]

## with catalyst

In [120]:
rename_target(cleavage_w_catalyst, 'output1', 'product')
rename_target(cleavage_w_catalyst, 'input1',  'substrate')
rename_target(cleavage_w_catalyst, 'input2',  'catalyst')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  subdf[new_col] = subdf[old_col]


In [121]:
# substrate to reaction edge
f =  f"{reaction_type}-wicat-substrate_edges.tsv" 
want_cols = reaction_standard_columns + substrate_cols
cleavage_w_catalyst[want_cols].to_csv(f"../data/import/{f}", sep="\t", index=None)

query = helpers.make_create_reaction_edge_query(f, 'SUBSTRATE',
                                                "substrate", "dne", 
                                                target_label="Reaction", 
                                                target_name="line.reaction_id"
                              )
qr = graph.run(query)
pretty_print_result('-', cleavage_w_catalyst, qr, 'relationships')

-                   	 19	 19


In [122]:
# catalyst to reaction edge

f =  f"{reaction_type}-wicat-catalyst_edges.tsv" 
want_cols = reaction_standard_columns + catalyst_cols
cleavage_w_catalyst[want_cols].to_csv(f"../data/import/{f}", sep="\t", index=None)

query = helpers.make_create_reaction_edge_query(f, 'ACTIVATES',
                                                "catalyst", "dne", 
                                                target_label="Reaction", 
                                                target_name="line.reaction_id"
                              )
qr = graph.run(query)
pretty_print_result('-', cleavage_w_catalyst, qr, 'relationships')

-                   	 19	 19


In [123]:
# product to reaction edge
f =  f"{reaction_type}-wicat-product_edges.tsv" 
want_cols = reaction_standard_columns + product_cols
cleavage_w_catalyst[want_cols].to_csv(f"../data/import/{f}", sep="\t", index=None)

query = helpers.make_create_reaction_edge_query(f, 'PRODUCT', "dne" , "product", 
                                                source_name="line.reaction_id", 
                                                source_label="Reaction"
                              )
qr = graph.run(query)
pretty_print_result('-', cleavage_w_catalyst, qr, 'relationships')

-                   	 19	 19


## w/o catalyst

In [124]:
rename_target(cleavage_wo_catalyst, 'output1', 'product')
rename_target(cleavage_wo_catalyst, 'input1',  'substrate')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  subdf[new_col] = subdf[old_col]


In [125]:
# substrate to reaction edge
f =  f"{reaction_type}-substrate_edges.tsv" 
want_cols = reaction_standard_columns + substrate_cols
cleavage_wo_catalyst[want_cols].to_csv(f"../data/import/{f}", sep="\t", index=None)

query = helpers.make_create_reaction_edge_query(f, 'SUBSTRATE',
                                                "substrate", "dne", 
                                                target_label="Reaction", 
                                                target_name="line.reaction_id"
                              )
qr = graph.run(query)
pretty_print_result('-', cleavage_wo_catalyst, qr, 'relationships')

-                   	 10	 10


In [126]:
# product to reaction edge
f =  f"{reaction_type}-product_edges.tsv" 
want_cols = reaction_standard_columns + product_cols
cleavage_wo_catalyst[want_cols].to_csv(f"../data/import/{f}", sep="\t", index=None)

query = helpers.make_create_reaction_edge_query(f, 'PRODUCT', "dne" , "product", 
                                                source_name="line.reaction_id", 
                                                source_label="Reaction"
                              )
qr = graph.run(query)
pretty_print_result('-', cleavage_wo_catalyst, qr, 'relationships')

-                   	 10	 10


## undefined

In [127]:
#Image(filename='./reaction_types/...png')

In [128]:
key = 'undefined'
reaction_type = 'UNDEFINED'
subdf = df_edges[df_edges['reaction_type'] == key].copy()
print(reaction_type, subdf.shape[0])

UNDEFINED 9


In [129]:
rename_target(subdf, 'output1', 'product')
rename_target(subdf, 'input2',  'substrate')
rename_target(subdf, 'input1',  'catalyst')

In [130]:
subdf[substrate_cols_wo_homologues + product_cols_wo_homologues + catalyst_cols_wo_homologues]

Unnamed: 0,substrate_name,substrate_form,substrate_label,substrate_location,product_name,product_form,product_label,product_location,catalyst_name,catalyst_form,catalyst_label,catalyst_location
31,OPDA,metabolite,Metabolite,putative:cytoplasm,OPDA,metabolite,Metabolite,putative:cytoplasm,SA,metabolite,Metabolite,putative:cytoplasm
60,potyvirus,protein_active,ForeignEntity,putative:cytoplasm,potyvirus,protein,ForeignEntity,putative:cytoplasm,RTM1|RTM2|RTM3,complex,Complex,putative:cytoplasm
87,EDS1|PAD4,complex_active,Complex,putative:cytoplasm,EDS1|PAD4,complex,Complex,putative:cytoplasm,NPR,protein,PlantCoding,putative:cytoplasm
88,EDS1|PAD4,complex_active,Complex,putative:cytoplasm,EDS1|PAD4,complex,Complex,putative:cytoplasm,MAPK,protein,PlantCoding,putative:cytoplasm
91,EDS5,protein,PlantCoding,putative:cytoplasm,EDS5,protein_active,PlantCoding,putative:cytoplasm,EDS1|PAD4,complex,Complex,putative:cytoplasm
120,AUX-signalling,process_active,Process,putative:cytoplasm,AUX-signalling,process,Process,putative:cytoplasm,ARF,protein,PlantCoding,putative:cytoplasm
150,TGA,protein_active,PlantCoding,putative:cytoplasm,TGA,protein,PlantCoding,putative:cytoplasm,GRX,protein_active,PlantCoding,putative:cytoplasm
209,PR1,protein_active,PlantCoding,putative:cytoplasm,PR1,protein,PlantCoding,putative:cytoplasm,EIN2,protein,PlantCoding,putative:cytoplasm
310,CAT,protein_active,PlantCoding,putative:cytoplasm,CAT,protein,PlantCoding,putative:cytoplasm,SA,metabolite,Metabolite,putative:cytoplasm


In [131]:
subdf[reaction_standard_columns]

Unnamed: 0,AddedBy,Species,AdditionalInfo,external_links,trust_level,ModelV,ReactionEffect,reaction_type,Modifications,reaction_id
31,KG,ath,,doi:10.1007/s00344-003-0027-6,R4,v1.0,inhibition,undefined,,rx00032
60,ZR,ath,RTM proteins block the long distance transport...,doi:10.1371/journal.pone.0039169,R4,v2.7,inhibition,undefined,,rx00061
87,KG,ath,Check if this simplification would work (inste...,doi:10.1016/s1369-5266(03)00058-x,R4,v1.0,inhibition,undefined,,rx00088
88,KG,ath,Check if this simplification would work (inste...,"doi:10.1111/j.1365-313x.2006.02806.x,doi:10.11...",R4,v1.0,inhibition,undefined,,rx00089
91,KG,ath,,"doi:10.1016/s1369-5266(03)00058-x,doi:10.1111/...",R4,v1.0,activation,undefined,,rx00092
120,KG,ath,demonstrating that ARF2 is a repressor of auxi...,doi:10.1093/jxb/erq010,R4,v2.7,inhibition,undefined,,rx00121
150,KG,ath,This inactivation of tga256 diables binding on...,doi:10.1093/mp/ssr113,R1,v2.7,inhibition,undefined,,rx00151
209,ZR,ath,,doi:10.1104/pp.104.056028,R4,v2.7,inhibition,undefined,,rx00210
310,SB,ath,SA decreased CAT2 activity in a dose-dependent...,doi:10.1016/j.chom.2017.01.007,R4,v2.7,inhibition,undefined,,rx00311


In [132]:
activation_subdf = subdf[subdf['ReactionEffect'] == 'activation' ]
inhibition_subdf = subdf[subdf['ReactionEffect'] == 'inhibition' ]

In [133]:
# make reaction nodes
f = f"{reaction_type}-reaction.tsv"
subdf[reaction_standard_columns].to_csv(f"../data/import/{f}", sep="\t", index=None)
query = helpers.reaction_node_query(f, name="line.reaction_id")

qr = graph.run(query)
pretty_print_result('Reaction', subdf, qr, 'nodes')

Reaction            	  9	  9


In [134]:
# substrate to reaction edge
f =  f"{reaction_type}-wicat-substrate_edges.tsv" 
want_cols = reaction_standard_columns + substrate_cols
subdf[want_cols].to_csv(f"../data/import/{f}", sep="\t", index=None)

query = helpers.make_create_reaction_edge_query(f, 'SUBSTRATE',
                                                "substrate", "dne", 
                                                target_label="Reaction", 
                                                target_name="line.reaction_id"
                              )
qr = graph.run(query)
pretty_print_result('-', subdf, qr, 'relationships')

-                   	  9	  9


In [135]:
# product to reaction edge
f =  f"{reaction_type}-wicat-product_edges.tsv" 
want_cols = reaction_standard_columns + product_cols
subdf[want_cols].to_csv(f"../data/import/{f}", sep="\t", index=None)

query = helpers.make_create_reaction_edge_query(f, 'PRODUCT', "dne" , "product", 
                                                source_name="line.reaction_id", 
                                                source_label="Reaction"
                              )
qr = graph.run(query)
pretty_print_result('-', subdf, qr, 'relationships')

-                   	  9	  9


## activation

In [136]:
activation_subdf[substrate_cols_wo_homologues + catalyst_cols_wo_homologues + product_cols_wo_homologues]

Unnamed: 0,substrate_name,substrate_form,substrate_label,substrate_location,catalyst_name,catalyst_form,catalyst_label,catalyst_location,product_name,product_form,product_label,product_location
91,EDS5,protein,PlantCoding,putative:cytoplasm,EDS1|PAD4,complex,Complex,putative:cytoplasm,EDS5,protein_active,PlantCoding,putative:cytoplasm


In [137]:
# catalyst to reaction edge

f =  f"{reaction_type}-catalyst_edges.tsv" 
want_cols = reaction_standard_columns + catalyst_cols
activation_subdf[want_cols].to_csv(f"../data/import/{f}", sep="\t", index=None)

query = helpers.make_create_reaction_edge_query(f, 'ACTIVATES',
                                                "catalyst", "dne", 
                                                target_label="Reaction", 
                                                target_name="line.reaction_id"
                              )
qr = graph.run(query)
pretty_print_result('-', activation_subdf, qr, 'relationships')

-                   	  1	  1


## inhibition

In [138]:
inhibition_subdf[substrate_cols_wo_homologues + catalyst_cols_wo_homologues + product_cols_wo_homologues]

Unnamed: 0,substrate_name,substrate_form,substrate_label,substrate_location,catalyst_name,catalyst_form,catalyst_label,catalyst_location,product_name,product_form,product_label,product_location
31,OPDA,metabolite,Metabolite,putative:cytoplasm,SA,metabolite,Metabolite,putative:cytoplasm,OPDA,metabolite,Metabolite,putative:cytoplasm
60,potyvirus,protein_active,ForeignEntity,putative:cytoplasm,RTM1|RTM2|RTM3,complex,Complex,putative:cytoplasm,potyvirus,protein,ForeignEntity,putative:cytoplasm
87,EDS1|PAD4,complex_active,Complex,putative:cytoplasm,NPR,protein,PlantCoding,putative:cytoplasm,EDS1|PAD4,complex,Complex,putative:cytoplasm
88,EDS1|PAD4,complex_active,Complex,putative:cytoplasm,MAPK,protein,PlantCoding,putative:cytoplasm,EDS1|PAD4,complex,Complex,putative:cytoplasm
120,AUX-signalling,process_active,Process,putative:cytoplasm,ARF,protein,PlantCoding,putative:cytoplasm,AUX-signalling,process,Process,putative:cytoplasm
150,TGA,protein_active,PlantCoding,putative:cytoplasm,GRX,protein_active,PlantCoding,putative:cytoplasm,TGA,protein,PlantCoding,putative:cytoplasm
209,PR1,protein_active,PlantCoding,putative:cytoplasm,EIN2,protein,PlantCoding,putative:cytoplasm,PR1,protein,PlantCoding,putative:cytoplasm
310,CAT,protein_active,PlantCoding,putative:cytoplasm,SA,metabolite,Metabolite,putative:cytoplasm,CAT,protein,PlantCoding,putative:cytoplasm


In [139]:
# catalyst to reaction edge

f =  f"{reaction_type}-catalyst_edges.tsv" 
want_cols = reaction_standard_columns + catalyst_cols
inhibition_subdf[want_cols].to_csv(f"../data/import/{f}", sep="\t", index=None)

query = helpers.make_create_reaction_edge_query(f, 'INHIBITS',
                                                "catalyst", "dne", 
                                                target_label="Reaction", 
                                                target_name="line.reaction_id"
                              )
qr = graph.run(query)
pretty_print_result('-', inhibition_subdf, qr, 'relationships')

-                   	  8	  8


# END 