In [1]:
version = "v0.1.0"

In [2]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:80% !important; }</style>"))

# Import neo4j DB: 4/5

Code to translate v2.7.4_PIS-model.xlsx to neo4j database. 

## Setup

In [3]:
from collections import defaultdict

In [4]:
import pandas as pd
import re
import numpy as np
import os

In [5]:
from py2neo import Graph, Node, Relationship

In [6]:
import helpers

In [7]:
from importlib import reload

Connect to graph via docker-compose link. See http://localhost:7474/browser/

In [8]:
graph = Graph(host="neo4j")

In [9]:
from pathlib import Path

base_path = Path("..")
parsed_path = base_path / "data" / "parsed"

## Components summary

In [10]:
q = '''MATCH (n) RETURN DISTINCT n.name AS name'''
nodes = graph.run(q).data()
all_nodes_in_components = set([(d["name"]) for d in nodes])

In [11]:
all_nodes_in_components

{'&alpha;/&beta; hydroxylase',
 '&beta;-carotene isomerase',
 '12,13-EOT',
 '12-OH-JA-Ile',
 '13-HPOT',
 '4CLL',
 '6K1',
 '6K2',
 '9-cis-&beta;-carotene',
 '9-cis-10&prime;-apo-&beta;-carotenal',
 'AAO',
 'AAO[AT1G04580,AT2G27150,AT3G43600,AT4G34890,AT4G34900,AT5G20960]',
 'ACC',
 'ACH',
 'ACH[AT2G30720,AT5G48370]',
 'ACO',
 'ACO[AT1G05010,AT1G12010,AT1G62380,AT1G77330,AT2G19590]',
 'ACS',
 'ACS2[AT1G01480]',
 'ACS6[AT4G11280]',
 'ACS[AT1G01480,AT1G62960,AT2G22810,AT3G49700,AT3G61510,AT4G08040,AT4G11280,AT4G26200,AT4G37770,AT5G51690,AT5G65800]',
 'ACX',
 'ACX2[AT5G65110]',
 'ACX2|CAT2',
 'ACX3[AT1G06290]',
 'ACX3|CAT2',
 'ACX[AT1G06290,AT1G06310,AT2G35690,AT3G06690,AT3G51840,AT4G16760,AT5G65110]',
 'ADK',
 'ADK[AT3G09820,AT5G03300]',
 'ADP',
 'ADT',
 'ADT[AT1G08250,AT1G11790,AT2G27820,AT3G07630,AT3G44720,AT5G22630]',
 'AGO',
 'AGO1,5,7,10[AT1G48410,AT1G69440,AT2G27880,AT5G43810]',
 'AGO1,5,7,10|CI',
 'AGO1,5,7,10|HC-Pro',
 'AHK',
 'AHK2,3,4[AT1G27320,AT2G01830,AT5G35750]',
 'AHP',
 'AH

In [12]:
len(all_nodes_in_components)

656

In [13]:
reload(helpers)

<module 'helpers' from '/home/jovyan/work/helpers.py'>

In [14]:
node_dict = {}
for label in helpers.reaction_participant_labels:
    if label == "Metabolite":
        q = '''MATCH (n:%s) WHERE NOT n:MetaboliteFamily RETURN DISTINCT n.name'''%label
    else:
        q = '''MATCH (n:%s) RETURN DISTINCT n.name'''%label
    s = set([d['n.name'] for d in graph.run(q).data()])
    print(label, len(s))
    node_dict[label] = s

    

FunctionalCluster 245
ForeignEntity 3
ForeignCoding 14
ForeignNonCoding 0
ForeignAbstract 0
Complex 94
Process 6
MetaboliteFamily 6
Metabolite 110


In [15]:
all_species = ['ath', 'osa', 'stu', 'sly']

### Reactions sheet

In [16]:
df_edges = pd.read_csv(parsed_path / "edges-sheet.tsv", sep="\t", index_col=0)

In [17]:
df_edges.head(10)

Unnamed: 0,Status,AddedBy,ConnID,Species,input1_ID,input1_level,input1_localisation,input1_type,input2_ID,input2_level,...,output1_family,input1_newID,input2_newID,input3_newID,output1_newID,input1_location,input2_location,input3_location,output1_location,reaction_type
0,forCB,KG,Conn001,ath,L-Met,family,ER,metabolite,SAMS,clade,...,,L-Met,"SAMS[AT1G02500,AT2G36880,AT3G17390,AT4G01850]",,SAMe,endoplasmic reticulum,endoplasmic reticulum,putative:cytoplasm,endoplasmic reticulum,catalysis/auto-catalysis
1,forCB,KG,Conn002,ath,SAMe,family,ER,metabolite,ACS,family,...,,SAMe,"ACS[AT1G01480,AT1G62960,AT2G22810,AT3G49700,AT...",,ACC,endoplasmic reticulum,endoplasmic reticulum,putative:cytoplasm,endoplasmic reticulum,catalysis/auto-catalysis
2,forCB,KG,Conn003,ath,ACC,family,ER,metabolite,ACO,family,...,,ACC,"ACO[AT1G05010,AT1G12010,AT1G62380,AT1G77330,AT...",,ET,endoplasmic reticulum,endoplasmic reticulum,putative:cytoplasm,endoplasmic reticulum,catalysis/auto-catalysis
3,forCB,KG,Conn004,ath,Cu2+,family,cytoplasm,metabolite,HMA,family,...,,Cu2+,"HMA[AT1G63440,AT4G33520,AT5G21930,AT5G44790]",,Cu2+,cytoplasm,endoplasmic reticulum,putative:cytoplasm,endoplasmic reticulum,translocation
4,forCB,KG,Conn005,ath,ETR,family,ER,protein,Cu2+,family,...,ETR,"ETR[AT1G04310,AT1G66340,AT2G40940,AT3G04580,AT...",Cu2+,,"ETR[AT1G04310,AT1G66340,AT2G40940,AT3G04580,AT...",endoplasmic reticulum,endoplasmic reticulum,putative:cytoplasm,endoplasmic reticulum,protein activation
5,forCB,KG,Conn006,ath,CTR,family,cytoplasm,protein [active],ETR,family,...,,"CTR[AT4G24480,AT5G03730]","ETR[AT1G04310,AT1G66340,AT2G40940,AT3G04580,AT...",,CTR|ETR,cytoplasm,endoplasmic reticulum,putative:cytoplasm,endoplasmic reticulum,binding/oligomerisation
6,forCB,KG,Conn007,ath,CTR|ETR,family,ER,complex,RTE1,family,...,,CTR|ETR,RTE1[AT2G26070],,CTR|ETR,endoplasmic reticulum,golgi apparatus,putative:cytoplasm,endoplasmic reticulum,protein activation
7,_TBD,KG,Conn008,ath,ET,family,cytoplasm,metabolite,ETR,family,...,,ET,"ETR[AT1G04310,AT1G66340,AT2G40940,AT3G04580,AT...",,ET|ETR,cytoplasm,cytoplasm,putative:cytoplasm,cytoplasm,binding/oligomerisation
8,forCB,KG,Conn009,ath,EIN2,node,ER,protein,ETP|SCF,family,...,EIN2,EIN2[AT5G03280],ETP|SCF,,EIN2[AT5G03280],endoplasmic reticulum,cytoplasm,putative:cytoplasm,endoplasmic reticulum,protein activation
9,forCB,KG,Conn010,ath,CTR|ETR,family,ER,complex,EIN2,family,...,EIN2,CTR|ETR,EIN2[AT5G03280],,EIN2[AT5G03280],endoplasmic reticulum,endoplasmic reticulum,putative:cytoplasm,endoplasmic reticulum,protein deactivation


In [18]:
df_edges[df_edges['input1_ID']=="DZ"][['input1_ID', 'input1_newID', 'input1_label']]

Unnamed: 0,input1_ID,input1_newID,input1_label
341,DZ,DZ,Metabolite
345,DZ,DZ,Metabolite
346,DZ,DZ,Metabolite
347,DZ,DZ,Metabolite
348,DZ,DZ,Metabolite
349,DZ,DZ,Metabolite
350,DZ,DZ,Metabolite
351,DZ,DZ,Metabolite
352,DZ,DZ,Metabolite


In [19]:
helpers.empty_strings

['-', '?', '[empty]', 'nan', 'n.a.', nan, '[undefined]', '']

In [20]:
def generate_list(subdf, ids, new_name):
    col_suffixes = ['_newID', '_location', '_label', '_form']
    new_col_suffixes = ['_name', '_location', '_label', '_form']
    
    for old_suf, new_suf in zip(col_suffixes, new_col_suffixes):
        new_col = new_name + new_suf
        old_cols = [id_ + old_suf for id_ in ids]
        subdf[new_col] = subdf[old_cols].apply(lambda x: [i for i in x.values], axis=1)
        
        
def rename_target(subdf, id_, new_name):
    col_suffixes = ['_newID', '_location', '_label', '_form']
    new_col_suffixes = ['_name', '_location', '_label', '_form']
    
    for old_suf, new_suf in zip(col_suffixes, new_col_suffixes):
        new_col = new_name + new_suf
        old_col = id_ + old_suf
        subdf[new_col] = subdf[old_col]

In [21]:
def get_x_nodes(df, x):
    rows_with_x = set()
    for i, row in df.iterrows():
        for col_prefix in ['input1', 'input2', 'input3', 'output1']:
            if row[col_prefix + "_newID"] in x:
                rows_with_x.add(i)
    return rows_with_x

In [22]:
def number_input_different(df, catalyst=False):
    ''' If catalyst is True, it is the last "input" col. '''

    if catalyst:
        # two inputs, input2 -> catalyst
        subdf2 = df[df["input3_newID"].isna()].copy()
        generate_list(subdf2, ['input1'], 'substrate')
        rename_target(subdf2, 'input2', 'catalyst')

        # three inputs, input3 -> catalyst
        subdf3 = df[~df["input3_newID"].isna()].copy()
        generate_list(subdf3, ['input1', 'input2'], 'substrate')        
        rename_target(subdf3, 'input3', 'catalyst') 
        
    else:
        # two inputs
        subdf2 = df[df["input3_newID"].isna()].copy()
        generate_list(subdf2, ['input1', 'input2'], 'substrate')

        # three inputs
        subdf3 = df[~df["input3_newID"].isna()].copy()
        generate_list(subdf3, ['input1', 'input2', 'input3'], 'substrate')
    
    # combine
    new_subdf = subdf2.append(subdf3)
    rename_target(new_subdf, 'output1', 'product')
    
    return new_subdf

In [23]:
# transcription genes

substrate_cols = [ f'substrate{x}' for x in ['_name', '_label', '_form', '_location']] 
catalyst_cols = [ f'catalyst{x}' for x in ['_name', '_label', '_form', '_location']] 
product_cols = [ f'product{x}' for x in ['_name', '_label', '_form', '_location']] 

reaction_standard_columns = ['AddedBy', 'Species', 
       'AdditionalInfo',  'external_links', 'trust_level',
       'ModelV', 'ReactionEffect', 'reaction_type', 'Modifications', 'reaction_id']

# Add reactions

In [24]:
def pretty_print_result(t, df, qr, input_type, multiplier=1):
    if input_type in ['nodes', 'relationships']:
        key = input_type + '_created'
    else:
        key = input_type
    
    try:
        stat = qr.stats()[key]
    except:
        stat = 0
    
    print(f"{t:20}\t{df.shape[0]:3}\t{stat:3}", end="")
    if df.shape[0]*multiplier == stat:
        print()
    
    elif df.shape[0]*multiplier < stat:
        print(f"\t**too many {input_type} created**")
    else:
        print(f"\t**not all {input_type} created**")

In [25]:
df_edges.index.duplicated().sum()

0

## binding / oligomerisation

In [26]:
key = 'binding/oligomerisation'
reaction_type = "BINDING_OGLIMERISATION"
subdf = df_edges.loc[df_edges['reaction_type'] == key].copy()
print(reaction_type, subdf.shape[0])

BINDING_OGLIMERISATION 104


In [27]:
binding_wo_catalyst = subdf.loc[subdf['Modifications'] != 'with catalyst']
binding_w_catalyst = subdf.loc[subdf['Modifications'] == 'with catalyst']

In [28]:
subdf_wo_catalyst = number_input_different(binding_wo_catalyst)
subdf_w_catalyst = number_input_different(binding_w_catalyst, catalyst=True)

In [29]:
# make reaction nodes
f = f"{reaction_type}-reaction.tsv"
subdf[reaction_standard_columns].to_csv(f"../data/import/{f}", sep="\t", index=None)
query = helpers.reaction_node_query(f, name="line.reaction_id")

qr = graph.run(query)
pretty_print_result('Reaction', subdf, qr, 'nodes')

Reaction            	104	104


### Without catalyst

In [30]:
explode_cols = substrate_cols
exploded_new_subdf = helpers.unnesting(subdf_wo_catalyst, explode_cols)#.drop_duplicates()

In [31]:
reload(helpers)

<module 'helpers' from '/home/jovyan/work/helpers.py'>

In [32]:
# binding substrate edges
want_cols = reaction_standard_columns + substrate_cols

for t, this_subdf in exploded_new_subdf.groupby("substrate_label"):
    f = f"{reaction_type}-{t}-substrate_edges.tsv"
    this_subdf[want_cols].to_csv(f"../data/import/{f}", index=None, sep="\t")
    if t in helpers.plant_node_labels:
        labels = ['FunctionalCluster', t]
    else:
        labels = t
    query = helpers.make_create_reaction_edge_query(f, 'SUBSTRATE', 
                                                    "substrate", "dne",
                                                    source_label=labels, 
                                                    target_label="Reaction", 
                                                    target_name="line.reaction_id"
                                                   )

    qr = graph.run(query)

    pretty_print_result(t, this_subdf, qr, 'relationships')


Complex             	 16	 16
ForeignCoding       	 30	 30
ForeignEntity       	  3	  3
Metabolite          	 32	 32
PlantAbstract       	  4	  4
PlantCoding         	131	131
Process             	  3	  3


In [33]:
subdf_wo_catalyst.shape[0]

103

In [34]:
# binding product edges
want_cols = reaction_standard_columns + product_cols

for t, this_subdf in subdf_wo_catalyst.groupby("product_label"):
    f = f"{reaction_type}-{t}-product_edges.tsv"
    this_subdf[want_cols].to_csv(f'../data/import/{f}', sep="\t", index=None)
    if t in helpers.plant_node_labels:
        labels = ['FunctionalCluster', t]
    else:
        labels = t
    query = helpers.make_create_reaction_edge_query(f, "PRODUCT", 
                                                    "dne", "product",
                                                    source_name="line.reaction_id", 
                                                    source_label="Reaction",
                                                    target_label=labels
                                                   )
    qr = graph.run(query)
    pretty_print_result('-', this_subdf, qr, 'relationships')

-                   	 90	 90
-                   	  9	  9
-                   	  4	  4


### With catalyst

In [35]:
explode_cols = substrate_cols
exploded_new_subdf = helpers.unnesting(subdf_w_catalyst, explode_cols).drop_duplicates()

In [36]:
# binding substrate edges
want_cols = reaction_standard_columns + substrate_cols

for t, this_subdf in exploded_new_subdf.groupby("substrate_label"):
    f = f"{reaction_type}-wicat-{t}-substrate_edges.tsv"
    this_subdf[want_cols].to_csv(f"../data/import/{f}", index=None, sep="\t")
    if t in helpers.plant_node_labels:
        labels = ['FunctionalCluster', t]
    else:
        labels = t
    query = helpers.make_create_reaction_edge_query(f, 'SUBSTRATE', 
                                                    "substrate", "dne",
                                                    source_label=labels, 
                                                    target_label="Reaction", 
                                                    target_name="line.reaction_id"
                                                   )

    qr = graph.run(query)

    pretty_print_result(t, this_subdf, qr, 'relationships')


PlantCoding         	  1	  1


In [37]:
# binding catalyst edges

want_cols = reaction_standard_columns + catalyst_cols

for t, this_subdf in exploded_new_subdf.groupby("substrate_label"):
    f = f"{reaction_type}-wicat-{t}-catalyst_label_edges.tsv"
    this_subdf[want_cols].to_csv(f"../data/import/{f}", index=None, sep="\t")
    if t in helpers.plant_node_labels:
        labels = ['FunctionalCluster', t]
    else:
        labels = t
    query = helpers.make_create_reaction_edge_query(f, 'ACTIVATES', 
                                                    "catalyst", "dne",
                                                    source_label=labels, 
                                                    target_label="Reaction", 
                                                    target_name="line.reaction_id"
                                                   )

    qr = graph.run(query)

    pretty_print_result(t, this_subdf, qr, 'relationships')
    

PlantCoding         	  1	  1


In [38]:
f = f"{reaction_type}-wicat-catalyst-product_edges.tsv"
want_cols = reaction_standard_columns + product_cols
subdf_w_catalyst[want_cols].to_csv(f'../data/import/{f}', sep="\t", index=None)

In [39]:
# binding product edges
want_cols = reaction_standard_columns + product_cols

for t, this_subdf in subdf_w_catalyst.groupby("product_label"):
    f = f"{reaction_type}-wicat-{t}-catalyst-product_edges.tsv"
    this_subdf[want_cols].to_csv(f'../data/import/{f}', sep="\t", index=None)
    if t in helpers.plant_node_labels:
        labels = ['FunctionalCluster', t]
    else:
        labels = t
    query = helpers.make_create_reaction_edge_query(f, "PRODUCT", 
                                                    "dne", "product",
                                                    source_name="line.reaction_id", 
                                                    source_label="Reaction",
                                                    target_label=labels
                                                   )
    qr = graph.run(query)
    pretty_print_result(t, this_subdf, qr, 'relationships')

Complex             	  1	  1


## catalysis / auto-catalysis

In [40]:
key = 'catalysis/auto-catalysis'
reaction_type = 'CATALYSIS'
subdf = df_edges[df_edges['reaction_type'] == key]
print(reaction_type, subdf.shape[0])

CATALYSIS 79


In [41]:
subdf.duplicated().sum()

0

In [42]:
# make reaction nodes
f = f"{reaction_type}-reaction.tsv"
subdf[reaction_standard_columns].to_csv(f"../data/import/{f}", sep="\t", index=None)
query = helpers.reaction_node_query(f, name="line.reaction_id")

qr = graph.run(query)
pretty_print_result('Reaction', subdf, qr, 'nodes')

Reaction            	 79	 79


In [43]:
catalysis_wo_catalyst = subdf[subdf['input2_ID'].isna() & subdf['input3_ID'].isna()]
catalysis_w_catalyst = subdf[~(subdf['input2_ID'].isna() & subdf['input3_ID'].isna())]

### With catalyst

In [44]:
subdf_w_catalyst = number_input_different(catalysis_w_catalyst, catalyst=True)
new_subdf = number_input_different(subdf_w_catalyst, catalyst=True)

In [45]:
# substrate to reaction
exploded_new_subdf = helpers.unnesting(new_subdf, substrate_cols)
want_cols = reaction_standard_columns + substrate_cols
for t, this_subdf in exploded_new_subdf.groupby("substrate_label"):
    f = f"{reaction_type}-wicat-{t}-substrate_edges.tsv"
    this_subdf[want_cols].to_csv(f"../data/import/{f}", index=None, sep="\t")
    if t in helpers.plant_node_labels:
        labels = ['FunctionalCluster', t]
    else:
        labels = t    
    query = helpers.make_create_reaction_edge_query(f, 'SUBSTRATE', 
                                                    "substrate", "dne",
                                                    source_label=labels, target_label="Reaction", 
                                                    target_name="line.reaction_id"
                                                   )
    qr = graph.run(query)
    pretty_print_result(t, this_subdf, qr, 'relationships')

Metabolite          	 79	 79
PlantCoding         	 21	 21


In [46]:
# catalyst to reaction
want_cols = reaction_standard_columns + catalyst_cols

for t, this_subdf in subdf_w_catalyst.groupby("catalyst_label"):
    f = f"{reaction_type}-wicat-{t}-catalyst_edges.tsv"
    this_subdf[want_cols].to_csv(f"../data/import/{f}", index=None, sep="\t")
    if t in helpers.plant_node_labels:
        labels = ['FunctionalCluster', t]
    else:
        labels = t        
    query = helpers.make_create_reaction_edge_query(f, 'ACTIVATES', 
                                                    "catalyst", "dne",
                                                    source_label=labels, target_label="Reaction", 
                                                    target_name="line.reaction_id"
                                                   )
    qr = graph.run(query)
    pretty_print_result(t, this_subdf, qr, 'relationships')

Metabolite          	 21	 21
MetaboliteFamily    	  1	  1
PlantAbstract       	  3	  3
PlantCoding         	 52	 52
Process             	  1	  1


In [47]:
# product to reaction
want_cols = reaction_standard_columns + product_cols
for t, this_subdf in subdf_w_catalyst.groupby("product_label"):
    f = f"{reaction_type}-wicat-{t}-product_edges.tsv"    
    this_subdf[want_cols].to_csv(f"../data/import/{f}", index=None, sep="\t")
    if t in helpers.plant_node_labels:
        labels = ['FunctionalCluster', t]
    else:
        labels = t       
    query = helpers.make_create_reaction_edge_query(f, 'PRODUCT', 
                                                    "dne", "product",
                                                    target_label=labels, source_label="Reaction", 
                                                    source_name="line.reaction_id"
                                                   )    
    qr = graph.run(query)

    pretty_print_result(t, this_subdf, qr, 'relationships')

Metabolite          	 76	 76
MetaboliteFamily    	  2	  2


### Without catalyst

In [48]:
rename_target(catalysis_wo_catalyst, 'output1', 'product')
rename_target(catalysis_wo_catalyst, 'input1',  'substrate')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  subdf[new_col] = subdf[old_col]


In [49]:
catalysis_wo_catalyst['reaction_id']

378    rx00379
Name: reaction_id, dtype: object

In [50]:
# binding substrate edges
want_cols = reaction_standard_columns + substrate_cols

for t, this_subdf in catalysis_wo_catalyst.groupby("substrate_label"):
    f = f"{reaction_type}-{t}-substrate_edges.tsv"
    this_subdf[want_cols].to_csv(f"../data/import/{f}", index=None, sep="\t")
    if t in helpers.plant_node_labels:
        labels = ['FunctionalCluster', t]
    else:
        labels = t          
    query = helpers.make_create_reaction_edge_query(f, 'SUBSTRATE', 
                                                    "substrate", "dne",
                                                    source_label=labels, 
                                                    target_label="Reaction", 
                                                    target_name="line.reaction_id"
                                                   )

    qr = graph.run(query)

    pretty_print_result(t, this_subdf, qr, 'relationships')


MetaboliteFamily    	  1	  1


In [51]:
# binding product edges
want_cols = reaction_standard_columns + product_cols

for t, this_subdf in catalysis_wo_catalyst.groupby("product_label"):
    f = f"{reaction_type}-{t}-product_edges.tsv"
    this_subdf[want_cols].to_csv(f'../data/import/{f}', sep="\t", index=None)
    if t in helpers.plant_node_labels:
        labels = ['FunctionalCluster', t]
    else:
        labels = t
    query = helpers.make_create_reaction_edge_query(f, "PRODUCT", 
                                                    "dne", "product",
                                                    source_name="line.reaction_id", 
                                                    source_label="Reaction",
                                                    target_label=labels
                                                   )
    qr = graph.run(query)
    pretty_print_result('-', this_subdf, qr, 'relationships')

-                   	  1	  1


In [52]:
q = '''MATCH ()-[r:SUBSTRATE {reaction_type:"catalysis/auto-catalysis"}]->() RETURN r'''
c = graph.run(q).data()
len(c) # should be ??

101

In [53]:
q = '''MATCH ()-[r:PRODUCT {reaction_type:"catalysis/auto-catalysis"}]->() RETURN r'''
c = graph.run(q).data()
len(c) # should be ??

79

## dissociation

In [54]:
key = 'dissociation'
reaction_type = 'DISSOCIATION'
subdf = df_edges[df_edges['reaction_type'] == key].copy()
print(reaction_type, subdf.shape[0])

DISSOCIATION 1


In [55]:
subdf

Unnamed: 0,Status,AddedBy,ConnID,Species,input1_ID,input1_level,input1_localisation,input1_type,input2_ID,input2_level,...,output1_family,input1_newID,input2_newID,input3_newID,output1_newID,input1_location,input2_location,input3_location,output1_location,reaction_type
95,forCB,KG,Conn096,ath,NPR1|NPR1,family,cytoplasm,complex,TRX-H,family,...,NPR,NPR1|NPR1,"TRX-H[AT1G19730,AT1G45145,AT5G42980]",,NPR1[AT1G64280],cytoplasm,cytoplasm,putative:cytoplasm,cytoplasm,dissociation


In [56]:
rename_target(subdf, 'output1', 'product')
rename_target(subdf, 'input1',  'substrate')
rename_target(subdf, 'input2',  'catalyst')

In [57]:
# make reaction nodes
f = f"{reaction_type}-reaction.tsv"
subdf[reaction_standard_columns].to_csv(f"../data/import/{f}", sep="\t", index=None)
query = helpers.reaction_node_query(f, name="line.reaction_id")

qr = graph.run(query)
pretty_print_result('Reaction', subdf, qr, 'nodes')

Reaction            	  1	  1


In [58]:
# substrate to reaction edge
f =  f"{reaction_type}-{t}-substrate_edges.tsv" 
want_cols = reaction_standard_columns + substrate_cols
subdf.to_csv(f"../data/import/{f}", sep="\t", index=None)

query = helpers.make_create_reaction_edge_query(f, 'SUBSTRATE',
                                                "substrate", "dne", 
                                                target_label="Reaction", 
                                                target_name="line.reaction_id"
                                                
                              )
qr = graph.run(query)
pretty_print_result("-", subdf, qr, 'relationships')

-                   	  1	  1


In [59]:
# catalyst to reaction edge

f =  f"{reaction_type}-{t}-catalyst_edges.tsv" 
want_cols = reaction_standard_columns + catalyst_cols
subdf.to_csv(f"../data/import/{f}", sep="\t", index=None)

query = helpers.make_create_reaction_edge_query(f, 'ACTIVATES',
                                                "catalyst" , "dne", 
                                                target_label="Reaction", 
                                                target_name="line.reaction_id"
                              )
qr = graph.run(query)
pretty_print_result("-", subdf, qr, 'relationships')

-                   	  1	  1


In [60]:
reload(helpers)

<module 'helpers' from '/home/jovyan/work/helpers.py'>

In [61]:
# product to reaction edge

f =  f"{reaction_type}-{t}-product_edges.tsv" 
want_cols = reaction_standard_columns + product_cols
subdf.to_csv(f"../data/import/{f}", sep="\t", index=None)

query = helpers.make_create_reaction_edge_query(f, 'PRODUCT',
                                                "dne" , "product", 
                                                source_name="line.reaction_id", 
                                                source_label="Reaction"
                                               
                              )
qr = graph.run(query)
pretty_print_result("-", subdf, qr, 'relationships')

-                   	  1	  1


## degradation

In [62]:
key = 'degradation/secretion'
reaction_type = 'DEGRADATION_SECRETION'
subdf = df_edges[df_edges['reaction_type'] == key].copy()
print(reaction_type, subdf.shape[0])

DEGRADATION_SECRETION 32


In [63]:
rename_target(subdf, 'input1',  'catalyst')
rename_target(subdf, 'input2',  'substrate')

In [64]:
# make reaction nodes
f = f"{reaction_type}-reaction.tsv"
subdf[reaction_standard_columns].to_csv(f"../data/import/{f}", sep="\t", index=None)
query = helpers.reaction_node_query(f, name="line.reaction_id")

qr = graph.run(query)
pretty_print_result('Reaction', subdf, qr, 'nodes')

Reaction            	 32	 32


In [65]:
# substrate to reaction edge

f =  f"{reaction_type}-substrate_edges.tsv" 
want_cols = reaction_standard_columns + substrate_cols
subdf[want_cols].to_csv(f"../data/import/{f}", sep="\t", index=None)

query = helpers.make_create_reaction_edge_query(f, 'SUBSTRATE',
                                                "substrate", "dne", 
                                                target_label="Reaction", 
                                                target_name="line.reaction_id"
                              )
qr = graph.run(query)
pretty_print_result('-', subdf, qr, 'relationships')

-                   	 32	 32


In [66]:
# catalyst to reaction edge

f =  f"{reaction_type}-catalyst_edges.tsv" 
want_cols = reaction_standard_columns + catalyst_cols
subdf[want_cols].to_csv(f"../data/import/{f}", sep="\t", index=None)

query = helpers.make_create_reaction_edge_query(f, 'ACTIVATES',
                                                "catalyst", "dne", 
                                                target_label="Reaction", 
                                                target_name="line.reaction_id"
                              )
qr = graph.run(query)
pretty_print_result('-', subdf, qr, 'relationships')

-                   	 32	 32


## translocation

In [67]:
key = 'translocation'
reaction_type = 'TRANSLOCATION'
subdf = df_edges[df_edges['reaction_type'] == key].copy()
print(reaction_type, subdf.shape[0])

TRANSLOCATION 5


In [68]:
rename_target(subdf, 'output1', 'product')
rename_target(subdf, 'input1',  'substrate')
rename_target(subdf, 'input2',  'catalyst')

In [69]:
# make reaction nodes
f = f"{reaction_type}-reaction.tsv"
subdf[reaction_standard_columns].to_csv(f"../data/import/{f}", sep="\t", index=None)
query = helpers.reaction_node_query(f, name="line.reaction_id")

qr = graph.run(query)
pretty_print_result('Reaction', subdf, qr, 'nodes')

Reaction            	  5	  5


In [70]:
# substrate to reaction edge

f =  f"{reaction_type}-substrate_edges.tsv" 
want_cols = reaction_standard_columns + substrate_cols
subdf[want_cols].to_csv(f"../data/import/{f}", sep="\t", index=None)

query = helpers.make_create_reaction_edge_query(f, 'TRANSLOCATE_FROM',
                                                "substrate", "dne", 
                                                target_label="Reaction", 
                                                target_name="line.reaction_id"
                              )
qr = graph.run(query)
pretty_print_result('-', subdf, qr, 'relationships')

-                   	  5	  5


In [71]:
# catalyst to reaction edge

f =  f"{reaction_type}-catalyst_edges.tsv" 
want_cols = reaction_standard_columns + catalyst_cols
subdf[want_cols].to_csv(f"../data/import/{f}", sep="\t", index=None)

query = helpers.make_create_reaction_edge_query(f, 'ACTIVATES',
                                                "catalyst", "dne", 
                                                target_label="Reaction", 
                                                target_name="line.reaction_id"
                              )
qr = graph.run(query)
pretty_print_result('-', subdf, qr, 'relationships')

-                   	  5	  5


In [72]:
# product edge
f =  f"{reaction_type}--product_edges.tsv" 
want_cols = reaction_standard_columns + product_cols
subdf[want_cols].to_csv(f"../data/import/{f}", sep="\t", index=None)

query = helpers.make_create_reaction_edge_query(f, 'TRANSLOCATE_TO',
                                                "dne" , "product", 
                                                source_name="line.reaction_id", 
                                                source_label="Reaction"
                              )
qr = graph.run(query)
pretty_print_result('-', subdf, qr, 'relationships')

-                   	  5	  5


## protein activation

In [73]:
key = 'protein activation'
reaction_type = 'PROTEIN_ACTIVATION'
subdf = df_edges[df_edges['reaction_type'] == key].copy()
print(reaction_type, subdf.shape[0])

PROTEIN_ACTIVATION 52


In [74]:
rename_target(subdf, 'output1', 'product')
rename_target(subdf, 'input2',  'catalyst')
rename_target(subdf, 'input1',  'substrate')

In [75]:
# make reaction nodes
f = f"{reaction_type}-reaction.tsv"
subdf[reaction_standard_columns].to_csv(f"../data/import/{f}", sep="\t", index=None)
query = helpers.reaction_node_query(f, name="line.reaction_id")

qr = graph.run(query)
pretty_print_result('Reaction', subdf, qr, 'nodes')

Reaction            	 52	 52


In [76]:
# substrate to reaction edge

f =  f"{reaction_type}-substrate_edges.tsv" 
want_cols = reaction_standard_columns + substrate_cols
subdf[want_cols].to_csv(f"../data/import/{f}", sep="\t", index=None)

query = helpers.make_create_reaction_edge_query(f, 'SUBSTRATE',
                                                "substrate", "dne", 
                                                target_label="Reaction", 
                                                target_name="line.reaction_id"
                              )
qr = graph.run(query)
pretty_print_result('-', subdf, qr, 'relationships')

-                   	 52	 52


In [77]:
# catalyst to reaction edge

f =  f"{reaction_type}-catalyst_edges.tsv" 
want_cols = reaction_standard_columns + catalyst_cols
subdf[want_cols].to_csv(f"../data/import/{f}", sep="\t", index=None)

query = helpers.make_create_reaction_edge_query(f, 'ACTIVATES',
                                                "catalyst", "dne", 
                                                target_label="Reaction", 
                                                target_name="line.reaction_id"
                              )
qr = graph.run(query)
pretty_print_result('-', subdf, qr, 'relationships')

-                   	 52	 52


In [78]:
# product to reaction edge
f =  f"{reaction_type}-product_edges.tsv" 
want_cols = reaction_standard_columns + product_cols
subdf[want_cols].to_csv(f"../data/import/{f}", sep="\t", index=None)

query = helpers.make_create_reaction_edge_query(f, 'PRODUCT', "dne" , "product", 
                                                source_name="line.reaction_id", 
                                                source_label="Reaction"
                              )
qr = graph.run(query)
pretty_print_result('-', subdf, qr, 'relationships')

-                   	 52	 52


## protein deactivation

In [79]:
key = 'protein deactivation'
reaction_type = 'PROTEIN_DEACTIVATION'
subdf = df_edges[df_edges['reaction_type'] == key].copy()
print(reaction_type, subdf.shape[0])

PROTEIN_DEACTIVATION 4


In [80]:
rename_target(subdf, 'output1', 'product')
rename_target(subdf, 'input2',  'substrate')
rename_target(subdf, 'input1',  'catalyst')

In [81]:
# make reaction nodes
f = f"{reaction_type}-reaction.tsv"
subdf[reaction_standard_columns].to_csv(f"../data/import/{f}", sep="\t", index=None)
query = helpers.reaction_node_query(f, name="line.reaction_id")

qr = graph.run(query)
pretty_print_result('Reaction', subdf, qr, 'nodes')

Reaction            	  4	  4


In [82]:
# substrate to reaction edge

f =  f"{reaction_type}-substrate_edges.tsv" 
want_cols = reaction_standard_columns + substrate_cols
subdf[want_cols].to_csv(f"../data/import/{f}", sep="\t", index=None)

query = helpers.make_create_reaction_edge_query(f, 'SUBSTRATE',
                                                "substrate", "dne", 
                                                target_label="Reaction", 
                                                target_name="line.reaction_id"
                              )
qr = graph.run(query)
pretty_print_result('-', subdf, qr, 'relationships')

-                   	  4	  4


In [83]:
# catalyst to reaction edge

f =  f"{reaction_type}-catalyst_edges.tsv" 
want_cols = reaction_standard_columns + catalyst_cols
subdf[want_cols].to_csv(f"../data/import/{f}", sep="\t", index=None)

query = helpers.make_create_reaction_edge_query(f, 'ACTIVATES',
                                                "catalyst", "dne", 
                                                target_label="Reaction", 
                                                target_name="line.reaction_id"
                              )
qr = graph.run(query)
pretty_print_result('-', subdf, qr, 'relationships')

-                   	  4	  4


In [84]:
# product to reaction edge
f =  f"{reaction_type}-product_edges.tsv" 
want_cols = reaction_standard_columns + product_cols
subdf[want_cols].to_csv(f"../data/import/{f}", sep="\t", index=None)

query = helpers.make_create_reaction_edge_query(f, 'PRODUCT', 
                                                "dne" , "product", 
                                                source_name="line.reaction_id", 
                                                source_label="Reaction"
                              )
qr = graph.run(query)
pretty_print_result('-', subdf, qr, 'relationships')

-                   	  4	  4


## transcriptional / translational regulation

In [85]:
reaction_type = 'TRANSCRIPTIONAL_TRANSLATIONAL'
keys = ['transcriptional/translational induction', 'transcriptional/translational repression']
subdf = df_edges[df_edges['reaction_type'].isin(keys)]
print(reaction_type, subdf.shape[0])

TRANSCRIPTIONAL_TRANSLATIONAL 79


In [86]:
def number_input_different_reverse(df):
    ''' If catalyst is True, they are the first "input" cols. 
    last one is left as substrate. '''

    # two inputs, input1 -> catalyst
    subdf2 = df[df["input3_newID"].isna()].copy()
    generate_list(subdf2, ['input1'], 'catalyst')
    rename_target(subdf2, 'input2', 'substrate')

    # three inputs, input1, input2 -> catalyst
    subdf3 = df[~df["input3_newID"].isna()].copy()
    generate_list(subdf3, ['input1', 'input2'], 'catalyst')        
    rename_target(subdf3, 'input3', 'substrate')
     
    # combine
    new_subdf = subdf2.append(subdf3)
    rename_target(new_subdf, 'output1', 'product')
    
    return new_subdf

In [87]:
new_subdf = number_input_different_reverse(subdf)

In [88]:
new_subdf[new_subdf['substrate_name'] != new_subdf['product_name']][["Status", "ConnID", 'substrate_name', 'product_name' ]]

Unnamed: 0,Status,ConnID,substrate_name,product_name
44,_TBD,Conn045,OMR1[AT3G10050],COI1|OMR1
92,_TBD,Conn093,PAD4[AT3G52430],"NPR1|PAD4|TGA2,5,6"
115,_TBD,Conn116,"EDS5[AT2G21340,AT4G39030]","EDS5|NPR1|TGA2,5,6"
117,_TBD,Conn118,EDS1[AT3G48090],"EDS1|NPR1|TGA2,5,6"
119,_TBD,Conn120,PAD4[AT3G52430],"NPR1|PAD4|TGA2,5,6"
148,_TBD,Conn149,GSTU24[AT1G17170],GSTU24|ROS


In [89]:
new_subdf[(  (new_subdf['substrate_form'] == 'gene') & (new_subdf['substrate_location'] != 'nucleus')  )][["Status", "ConnID"] + substrate_cols]

Unnamed: 0,Status,ConnID,substrate_name,substrate_label,substrate_form,substrate_location
27,forCB,Conn028,"LOX[AT1G17420,AT1G55020,AT1G67560,AT1G72520,AT...",PlantCoding,gene,chloroplast
97,forCB,Conn098,WRKY70[AT3G56400],PlantCoding,gene,cytoplasm
110,forCB,Conn111,"TAS3[AT3G17185,AT5G49615]",PlantNonCoding,gene,cytoplasm
123,forCB,Conn124,WRKY53[AT4G23810],PlantCoding,gene,cytoplasm
160,forCB,Conn161,PAD4[AT3G52430],PlantCoding,gene,cytoplasm
186,forCB,Conn187,ICS1[AT1G74710],PlantCoding,gene,chloroplast
187,forCB,Conn188,ICS1[AT1G74710],PlantCoding,gene,chloroplast
188,forCB,Conn189,ICS1[AT1G74710],PlantCoding,gene,chloroplast
189,forCB,Conn190,ICS1[AT1G74710],PlantCoding,gene,chloroplast
192,forCB,Conn193,"ACO[AT1G05010,AT1G12010,AT1G62380,AT1G77330,AT...",PlantCoding,gene,endoplasmic reticulum


In [90]:
#manualfix
new_subdf.loc[(  (new_subdf['substrate_form'] == 'gene') & (new_subdf['substrate_location'] != 'nucleus')  ), 'substrate_location'] = 'nucleus'

In [91]:
new_subdf[(  (new_subdf['product_form'].isin(['rna', 'ncRNA']) ))]#& (new_subdf['substrate_location'] != 'nucleus')  )][["Status", "ConnID"] + substrate_cols_wo_homologues]

Unnamed: 0,Status,AddedBy,ConnID,Species,input1_ID,input1_level,input1_localisation,input1_type,input2_ID,input2_level,...,catalyst_label,catalyst_form,substrate_name,substrate_location,substrate_label,substrate_form,product_name,product_location,product_label,product_form
110,forCB,KG,Conn111,ath,MIR390,family,cytoplasm,ncRNA,TAS3,family,...,[PlantNonCoding],[ncRNA],"TAS3[AT3G17185,AT5G49615]",nucleus,PlantNonCoding,gene,"TAS3[AT3G17185,AT5G49615]",cytoplasm,PlantNonCoding,ncRNA


In [92]:
new_subdf[(  (new_subdf['substrate_form'].isin(['rna', 'ncRNA'])) & (new_subdf['substrate_location'] != 'cytoplams')  )][["Status", "ConnID"] + substrate_cols]

Unnamed: 0,Status,ConnID,substrate_name,substrate_label,substrate_form,substrate_location


In [93]:
new_subdf[(  (new_subdf['product_form'] == 'protein') & (new_subdf['product_location'] != 'cytoplasm')  )][["Status", "ConnID"] + product_cols]

Unnamed: 0,Status,ConnID,product_name,product_label,product_form,product_location
11,forCB,Conn012,"EBF[AT2G25490,AT5G25350]",PlantCoding,protein,nucleus
20,forCB,Conn021,PR3[AT3G12500],PlantCoding,protein,nucleus
21,forCB,Conn022,PR4[AT3G04720],PlantCoding,protein,nucleus
24,forCB,Conn025,"PDF1.2[AT2G26020,AT5G44420,AT5G44430]",PlantCoding,protein,nucleus
27,forCB,Conn028,"LOX[AT1G17420,AT1G55020,AT1G67560,AT1G72520,AT...",PlantCoding,protein,chloroplast
45,forCB,Conn046,"JAZ[AT1G17380,AT1G19180,AT1G30135,AT1G48500,AT...",PlantCoding,protein,nucleus
47,forCB,Conn048,"MYC[AT1G32640,AT4G17880,AT5G46760]",PlantCoding,protein,nucleus
48,forCB,Conn049,"CLH[AT1G19670,AT5G43860]",PlantCoding,protein,nucleus
49,forCB,Conn050,JR1[AT3G16470],PlantCoding,protein,nucleus
50,forCB,Conn051,"PR13[AT1G12660,AT1G12663,AT1G12665,AT1G12672,A...",PlantCoding,protein,nucleus


In [94]:
# manualfix
new_subdf.loc[(  (new_subdf['product_form'] == 'protein') & (new_subdf['product_location'] != 'cytoplasm')  ), 'product_location'] = 'cytoplasm'

In [95]:
new_subdf[~new_subdf['substrate_label'].isin(["PlantCoding", "PlantNonCoding"])][["Status", "ConnID"] + substrate_cols] #+ catalyst_cols_wo_homologues + product_cols_wo_homologues]

Unnamed: 0,Status,ConnID,substrate_name,substrate_label,substrate_form,substrate_location
62,forCB,Conn063,Anthocyanin-accumulation,Process,process,cytoplasm
63,forCB,Conn064,Trichome-initiation,Process,process,cytoplasm


In [96]:
new_subdf[~new_subdf['product_label'].isin(["PlantCoding", "PlantNonCoding"])][["Status", "ConnID"] + product_cols]

Unnamed: 0,Status,ConnID,product_name,product_label,product_form,product_location
44,_TBD,Conn045,COI1|OMR1,Complex,complex,nucleus
62,forCB,Conn063,Anthocyanin-accumulation,Process,process_active,cytoplasm
63,forCB,Conn064,Trichome-initiation,Process,process_active,cytoplasm
92,_TBD,Conn093,"NPR1|PAD4|TGA2,5,6",Complex,complex,nucleus
115,_TBD,Conn116,"EDS5|NPR1|TGA2,5,6",Complex,complex,nucleus
117,_TBD,Conn118,"EDS1|NPR1|TGA2,5,6",Complex,complex,nucleus
119,_TBD,Conn120,"NPR1|PAD4|TGA2,5,6",Complex,complex,nucleus
148,_TBD,Conn149,GSTU24|ROS,Complex,complex,nucleus


## Reaction nodes

In [97]:
new_subdf[new_subdf['reaction_id'].isin(['rx00148', 'rx00185'])][substrate_cols + product_cols + catalyst_cols]

Unnamed: 0,substrate_name,substrate_label,substrate_form,substrate_location,product_name,product_label,product_form,product_location,catalyst_name,catalyst_label,catalyst_form,catalyst_location
147,EDS1[AT3G48090],PlantCoding,gene,nucleus,EDS1[AT3G48090],PlantCoding,protein,cytoplasm,[CAMTA3[AT2G22300]],[PlantCoding],[protein],[nucleus]
184,EDS1[AT3G48090],PlantCoding,gene,nucleus,EDS1[AT3G48090],PlantCoding,protein,cytoplasm,[CAMTA3[AT2G22300]],[PlantCoding],[protein],[nucleus]


In [98]:
# make reaction nodes
f = f"{reaction_type}-reaction.tsv"
new_subdf[reaction_standard_columns].to_csv(f"../data/import/{f}", sep="\t", index=None)
query = helpers.reaction_node_query(f, name="line.reaction_id")

qr = graph.run(query)
pretty_print_result('Reaction', new_subdf, qr, 'nodes')

Reaction            	 79	 79


## induction

In [99]:
edge_label = "ACTIVATES"
reaction_type = 'transcriptional/translational induction'
reaction_type_nice = 'transcriptional_translational_induction'
act_subdf = new_subdf[new_subdf['reaction_type']==reaction_type]

In [100]:
# substrate to reaction edge
f =  f"{reaction_type_nice}-substrate_edges.tsv" 
want_cols = reaction_standard_columns + substrate_cols
act_subdf[want_cols].to_csv(f"../data/import/{f}", sep="\t", index=None)

query = helpers.make_create_reaction_edge_query(f, 'SUBSTRATE',
                                                "substrate", "dne", 
                                                target_label="Reaction", 
                                                target_name="line.reaction_id"
                              )
qr = graph.run(query)
pretty_print_result('-', act_subdf, qr, 'relationships')

-                   	 60	 60


In [101]:
# catalyst to reaction edge
exploded_new_subdf = helpers.unnesting(act_subdf, catalyst_cols)

f =  f"{reaction_type_nice}-catalyst_edges.tsv" 
want_cols = reaction_standard_columns + catalyst_cols
exploded_new_subdf[want_cols].to_csv(f"../data/import/{f}", sep="\t", index=None)

query = helpers.make_create_reaction_edge_query(f, 'ACTIVATES',
                                                "catalyst", "dne", 
                                                target_label="Reaction", 
                                                target_name="line.reaction_id"
                              )
qr = graph.run(query)
pretty_print_result('-', exploded_new_subdf, qr, 'relationships')

-                   	 62	 62


In [102]:
# product to reaction edge
f =  f"{reaction_type_nice}-product_edges.tsv" 
want_cols = reaction_standard_columns + product_cols
act_subdf[want_cols].to_csv(f"../data/import/{f}", sep="\t", index=None)

query = helpers.make_create_reaction_edge_query(f, 'PRODUCT', "dne" , "product", 
                                                source_name="line.reaction_id", 
                                                source_label="Reaction"
                              )
qr = graph.run(query)
pretty_print_result('-', act_subdf, qr, 'relationships')

-                   	 60	 60


## repression

In [103]:
reaction_type = 'transcriptional/translational repression'
reaction_type_nice = 'transcriptional_translational_repression'
inh_subdf = new_subdf[new_subdf['reaction_type']==reaction_type]

In [104]:
# substrate to reaction edge
f =  f"{reaction_type_nice}-substrate_edges.tsv" 
want_cols = reaction_standard_columns + substrate_cols
inh_subdf[want_cols].to_csv(f"../data/import/{f}", sep="\t", index=None)

query = helpers.make_create_reaction_edge_query(f, 'SUBSTRATE',
                                                "substrate", "dne", 
                                                target_label="Reaction", 
                                                target_name="line.reaction_id"
                              )
qr = graph.run(query)
pretty_print_result('-', inh_subdf, qr, 'relationships')

-                   	 19	 19


In [105]:
# catalyst to reaction edge
exploded_new_subdf = helpers.unnesting(inh_subdf, catalyst_cols)


f =  f"{reaction_type_nice}-catalyst_edges.tsv" 
want_cols = reaction_standard_columns + catalyst_cols
exploded_new_subdf[want_cols].to_csv(f"../data/import/{f}", sep="\t", index=None)

query = helpers.make_create_reaction_edge_query(f, 'INHIBITS',
                                                "catalyst", "dne", 
                                                target_label="Reaction", 
                                                target_name="line.reaction_id"
                              )
qr = graph.run(query)
pretty_print_result('-', exploded_new_subdf, qr, 'relationships')

-                   	 19	 19


In [106]:
# product to reaction edge
f =  f"{reaction_type_nice}-product_edges.tsv" 
want_cols = reaction_standard_columns + product_cols
inh_subdf[want_cols].to_csv(f"../data/import/{f}", sep="\t", index=None)

query = helpers.make_create_reaction_edge_query(f, 'PRODUCT', "dne" , "product", 
                                                source_name="line.reaction_id", 
                                                source_label="Reaction"
                              )
qr = graph.run(query)
pretty_print_result('-', inh_subdf, qr, 'relationships')

-                   	 19	 19


## cleavage/auto-cleavage

In [107]:
#Image(filename='./reaction_types/activation/cleavage_autocleavage.png')

In [108]:
key = 'cleavage/auto-cleavage'
reaction_type = 'CLEAVAGE_AUTOCLEAVAGE'
subdf = df_edges[df_edges['reaction_type'] == key].copy()
print(reaction_type, subdf.shape[0])

CLEAVAGE_AUTOCLEAVAGE 29


In [109]:
# make reaction nodes
f = f"{reaction_type}-wicat-reaction.tsv"
subdf[reaction_standard_columns].to_csv(f"../data/import/{f}", sep="\t", index=None)
query = helpers.reaction_node_query(f, name="line.reaction_id")

qr = graph.run(query)
pretty_print_result('Reaction', subdf, qr, 'nodes')

Reaction            	 29	 29


In [110]:
cleavage_wo_catalyst = subdf[subdf['input2_ID'].isna() & subdf['input3_ID'].isna()]
cleavage_w_catalyst = subdf[~(subdf['input2_ID'].isna() & subdf['input3_ID'].isna())]

## with catalyst

In [111]:
rename_target(cleavage_w_catalyst, 'output1', 'product')
rename_target(cleavage_w_catalyst, 'input1',  'substrate')
rename_target(cleavage_w_catalyst, 'input2',  'catalyst')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  subdf[new_col] = subdf[old_col]


In [112]:
# substrate to reaction edge
f =  f"{reaction_type}-wicat-substrate_edges.tsv" 
want_cols = reaction_standard_columns + substrate_cols
cleavage_w_catalyst[want_cols].to_csv(f"../data/import/{f}", sep="\t", index=None)

query = helpers.make_create_reaction_edge_query(f, 'SUBSTRATE',
                                                "substrate", "dne", 
                                                target_label="Reaction", 
                                                target_name="line.reaction_id"
                              )
qr = graph.run(query)
pretty_print_result('-', cleavage_w_catalyst, qr, 'relationships')

-                   	 19	 19


In [113]:
# catalyst to reaction edge

f =  f"{reaction_type}-wicat-catalyst_edges.tsv" 
want_cols = reaction_standard_columns + catalyst_cols
cleavage_w_catalyst[want_cols].to_csv(f"../data/import/{f}", sep="\t", index=None)

query = helpers.make_create_reaction_edge_query(f, 'ACTIVATES',
                                                "catalyst", "dne", 
                                                target_label="Reaction", 
                                                target_name="line.reaction_id"
                              )
qr = graph.run(query)
pretty_print_result('-', cleavage_w_catalyst, qr, 'relationships')

-                   	 19	 19


In [114]:
# product to reaction edge
f =  f"{reaction_type}-wicat-product_edges.tsv" 
want_cols = reaction_standard_columns + product_cols
cleavage_w_catalyst[want_cols].to_csv(f"../data/import/{f}", sep="\t", index=None)

query = helpers.make_create_reaction_edge_query(f, 'PRODUCT', "dne" , "product", 
                                                source_name="line.reaction_id", 
                                                source_label="Reaction"
                              )
qr = graph.run(query)
pretty_print_result('-', cleavage_w_catalyst, qr, 'relationships')

-                   	 19	 19


## w/o catalyst

In [115]:
rename_target(cleavage_wo_catalyst, 'output1', 'product')
rename_target(cleavage_wo_catalyst, 'input1',  'substrate')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  subdf[new_col] = subdf[old_col]


In [116]:
# substrate to reaction edge
f =  f"{reaction_type}-substrate_edges.tsv" 
want_cols = reaction_standard_columns + substrate_cols
cleavage_wo_catalyst[want_cols].to_csv(f"../data/import/{f}", sep="\t", index=None)

query = helpers.make_create_reaction_edge_query(f, 'SUBSTRATE',
                                                "substrate", "dne", 
                                                target_label="Reaction", 
                                                target_name="line.reaction_id"
                              )
qr = graph.run(query)
pretty_print_result('-', cleavage_wo_catalyst, qr, 'relationships')

-                   	 10	 10


In [117]:
# product to reaction edge
f =  f"{reaction_type}-product_edges.tsv" 
want_cols = reaction_standard_columns + product_cols
cleavage_wo_catalyst[want_cols].to_csv(f"../data/import/{f}", sep="\t", index=None)

query = helpers.make_create_reaction_edge_query(f, 'PRODUCT', "dne" , "product", 
                                                source_name="line.reaction_id", 
                                                source_label="Reaction"
                              )
qr = graph.run(query)
pretty_print_result('-', cleavage_wo_catalyst, qr, 'relationships')

-                   	 10	 10


## undefined

In [118]:
#Image(filename='./reaction_types/...png')

In [119]:
key = 'undefined'
reaction_type = 'UNDEFINED'
subdf = df_edges[df_edges['reaction_type'] == key].copy()
print(reaction_type, subdf.shape[0])

UNDEFINED 9


In [120]:
rename_target(subdf, 'output1', 'product')
rename_target(subdf, 'input2',  'substrate')
rename_target(subdf, 'input1',  'catalyst')

In [121]:
subdf[substrate_cols + product_cols + catalyst_cols]

Unnamed: 0,substrate_name,substrate_label,substrate_form,substrate_location,product_name,product_label,product_form,product_location,catalyst_name,catalyst_label,catalyst_form,catalyst_location
31,OPDA,Metabolite,metabolite,putative:cytoplasm,OPDA,Metabolite,metabolite,putative:cytoplasm,SA,Metabolite,metabolite,putative:cytoplasm
60,potyvirus,ForeignEntity,protein_active,putative:cytoplasm,potyvirus,ForeignEntity,protein,putative:cytoplasm,RTM1|RTM2|RTM3,Complex,complex,putative:cytoplasm
87,EDS1|PAD4,Complex,complex_active,putative:cytoplasm,EDS1|PAD4,Complex,complex,putative:cytoplasm,NPR1[AT1G64280],PlantCoding,protein,putative:cytoplasm
88,EDS1|PAD4,Complex,complex_active,putative:cytoplasm,EDS1|PAD4,Complex,complex,putative:cytoplasm,MPK4[AT4G01370],PlantCoding,protein,putative:cytoplasm
91,"EDS5[AT2G21340,AT4G39030]",PlantCoding,protein,putative:cytoplasm,"EDS5[AT2G21340,AT4G39030]",PlantCoding,protein_active,putative:cytoplasm,EDS1|PAD4,Complex,complex,putative:cytoplasm
120,AUX-signalling,Process,process_active,putative:cytoplasm,AUX-signalling,Process,process,putative:cytoplasm,ARF2[AT5G62000],PlantCoding,protein,putative:cytoplasm
150,"TGA2,5,6[AT3G12250,AT5G06950,AT5G06960]",PlantCoding,protein_active,putative:cytoplasm,"TGA2,5,6[AT3G12250,AT5G06950,AT5G06960]",PlantCoding,protein,putative:cytoplasm,GRX480[AT1G28480],PlantCoding,protein_active,putative:cytoplasm
209,"PR1[AT1G50060,AT2G14580,AT2G14610,AT4G33710,AT...",PlantCoding,protein_active,putative:cytoplasm,"PR1[AT1G50060,AT2G14580,AT2G14610,AT4G33710,AT...",PlantCoding,protein,putative:cytoplasm,EIN2[AT5G03280],PlantCoding,protein,putative:cytoplasm
310,CAT2[AT4G35090],PlantCoding,protein_active,putative:cytoplasm,CAT2[AT4G35090],PlantCoding,protein,putative:cytoplasm,SA,Metabolite,metabolite,putative:cytoplasm


In [122]:
subdf[reaction_standard_columns]

Unnamed: 0,AddedBy,Species,AdditionalInfo,external_links,trust_level,ModelV,ReactionEffect,reaction_type,Modifications,reaction_id
31,KG,ath,,doi:10.1007/s00344-003-0027-6,R4,v1.0,inhibition,undefined,,rx00032
60,ZR,ath,RTM proteins block the long distance transport...,doi:10.1371/journal.pone.0039169,R4,v2.7,inhibition,undefined,,rx00061
87,KG,ath,Check if this simplification would work (inste...,doi:10.1016/s1369-5266(03)00058-x,R4,v1.0,inhibition,undefined,,rx00088
88,KG,ath,Check if this simplification would work (inste...,"doi:10.1104/pp.112.194647,doi:10.1111/j.1365-3...",R4,v1.0,inhibition,undefined,,rx00089
91,KG,ath,,"doi:10.1111/j.1365-313x.2004.02241.x,doi:10.10...",R4,v1.0,activation,undefined,,rx00092
120,KG,ath,demonstrating that ARF2 is a repressor of auxi...,doi:10.1093/jxb/erq010,R4,v2.7,inhibition,undefined,,rx00121
150,KG,ath,This inactivation of tga256 diables binding on...,doi:10.1093/mp/ssr113,R1,v2.7,inhibition,undefined,,rx00151
209,ZR,ath,,doi:10.1104/pp.104.056028,R4,v2.7,inhibition,undefined,,rx00210
310,SB,ath,SA decreased CAT2 activity in a dose-dependent...,doi:10.1016/j.chom.2017.01.007,R4,v2.7,inhibition,undefined,,rx00311


In [123]:
activation_subdf = subdf[subdf['ReactionEffect'] == 'activation' ]
inhibition_subdf = subdf[subdf['ReactionEffect'] == 'inhibition' ]

In [124]:
# make reaction nodes
f = f"{reaction_type}-reaction.tsv"
subdf[reaction_standard_columns].to_csv(f"../data/import/{f}", sep="\t", index=None)
query = helpers.reaction_node_query(f, name="line.reaction_id")

qr = graph.run(query)
pretty_print_result('Reaction', subdf, qr, 'nodes')

Reaction            	  9	  9


In [125]:
# substrate to reaction edge
f =  f"{reaction_type}-wicat-substrate_edges.tsv" 
want_cols = reaction_standard_columns + substrate_cols
subdf[want_cols].to_csv(f"../data/import/{f}", sep="\t", index=None)

query = helpers.make_create_reaction_edge_query(f, 'SUBSTRATE',
                                                "substrate", "dne", 
                                                target_label="Reaction", 
                                                target_name="line.reaction_id"
                              )
qr = graph.run(query)
pretty_print_result('-', subdf, qr, 'relationships')

-                   	  9	  9


In [126]:
# product to reaction edge
f =  f"{reaction_type}-wicat-product_edges.tsv" 
want_cols = reaction_standard_columns + product_cols
subdf[want_cols].to_csv(f"../data/import/{f}", sep="\t", index=None)

query = helpers.make_create_reaction_edge_query(f, 'PRODUCT', "dne" , "product", 
                                                source_name="line.reaction_id", 
                                                source_label="Reaction"
                              )
qr = graph.run(query)
pretty_print_result('-', subdf, qr, 'relationships')

-                   	  9	  9


## activation

In [127]:
activation_subdf[substrate_cols + product_cols + catalyst_cols]

Unnamed: 0,substrate_name,substrate_label,substrate_form,substrate_location,product_name,product_label,product_form,product_location,catalyst_name,catalyst_label,catalyst_form,catalyst_location
91,"EDS5[AT2G21340,AT4G39030]",PlantCoding,protein,putative:cytoplasm,"EDS5[AT2G21340,AT4G39030]",PlantCoding,protein_active,putative:cytoplasm,EDS1|PAD4,Complex,complex,putative:cytoplasm


In [128]:
# catalyst to reaction edge

f =  f"{reaction_type}-catalyst_edges.tsv" 
want_cols = reaction_standard_columns + catalyst_cols
activation_subdf[want_cols].to_csv(f"../data/import/{f}", sep="\t", index=None)

query = helpers.make_create_reaction_edge_query(f, 'ACTIVATES',
                                                "catalyst", "dne", 
                                                target_label="Reaction", 
                                                target_name="line.reaction_id"
                              )
qr = graph.run(query)
pretty_print_result('-', activation_subdf, qr, 'relationships')

-                   	  1	  1


## inhibition

In [129]:
inhibition_subdf[substrate_cols + product_cols + catalyst_cols]

Unnamed: 0,substrate_name,substrate_label,substrate_form,substrate_location,product_name,product_label,product_form,product_location,catalyst_name,catalyst_label,catalyst_form,catalyst_location
31,OPDA,Metabolite,metabolite,putative:cytoplasm,OPDA,Metabolite,metabolite,putative:cytoplasm,SA,Metabolite,metabolite,putative:cytoplasm
60,potyvirus,ForeignEntity,protein_active,putative:cytoplasm,potyvirus,ForeignEntity,protein,putative:cytoplasm,RTM1|RTM2|RTM3,Complex,complex,putative:cytoplasm
87,EDS1|PAD4,Complex,complex_active,putative:cytoplasm,EDS1|PAD4,Complex,complex,putative:cytoplasm,NPR1[AT1G64280],PlantCoding,protein,putative:cytoplasm
88,EDS1|PAD4,Complex,complex_active,putative:cytoplasm,EDS1|PAD4,Complex,complex,putative:cytoplasm,MPK4[AT4G01370],PlantCoding,protein,putative:cytoplasm
120,AUX-signalling,Process,process_active,putative:cytoplasm,AUX-signalling,Process,process,putative:cytoplasm,ARF2[AT5G62000],PlantCoding,protein,putative:cytoplasm
150,"TGA2,5,6[AT3G12250,AT5G06950,AT5G06960]",PlantCoding,protein_active,putative:cytoplasm,"TGA2,5,6[AT3G12250,AT5G06950,AT5G06960]",PlantCoding,protein,putative:cytoplasm,GRX480[AT1G28480],PlantCoding,protein_active,putative:cytoplasm
209,"PR1[AT1G50060,AT2G14580,AT2G14610,AT4G33710,AT...",PlantCoding,protein_active,putative:cytoplasm,"PR1[AT1G50060,AT2G14580,AT2G14610,AT4G33710,AT...",PlantCoding,protein,putative:cytoplasm,EIN2[AT5G03280],PlantCoding,protein,putative:cytoplasm
310,CAT2[AT4G35090],PlantCoding,protein_active,putative:cytoplasm,CAT2[AT4G35090],PlantCoding,protein,putative:cytoplasm,SA,Metabolite,metabolite,putative:cytoplasm


In [130]:
# catalyst to reaction edge

f =  f"{reaction_type}-catalyst_edges.tsv" 
want_cols = reaction_standard_columns + catalyst_cols
inhibition_subdf[want_cols].to_csv(f"../data/import/{f}", sep="\t", index=None)

query = helpers.make_create_reaction_edge_query(f, 'INHIBITS',
                                                "catalyst", "dne", 
                                                target_label="Reaction", 
                                                target_name="line.reaction_id"
                              )
qr = graph.run(query)
pretty_print_result('-', inhibition_subdf, qr, 'relationships')

-                   	  8	  8


# END 