In [140]:
version = "v0.0.4"

In [1]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:80% !important; }</style>"))

# Import neo4j DB: 4/5

Code to translate v2.7.4_PIS-model.xlsx to neo4j database. 

## Setup

In [2]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:80% !important; }</style>"))

In [3]:
from collections import defaultdict

In [4]:
import pandas as pd
import re
import numpy as np
import os

In [5]:
from py2neo import Graph, Node, Relationship

In [6]:
import helpers

In [7]:
from importlib import reload

Connect to graph via docker-compose link. See http://localhost:7474/browser/

In [8]:
graph = Graph(host="neo4j")

In [9]:
from pathlib import Path

base_path = Path("..")
parsed_path = base_path / "data" / "parsed"

## Components summary

In [10]:
q = '''MATCH (n) RETURN DISTINCT n.name AS name'''
nodes = graph.run(q).data()
all_nodes_in_components = set([(d["name"]) for d in nodes])

In [11]:
all_nodes_in_components

{'&alpha;/&beta; hydroxylase',
 '&beta;-carotene isomerase',
 '12,13-EOT',
 '12-OH-JA-Ile',
 '13-HPOT',
 '4CLL',
 '6K1',
 '6K2',
 '9-cis-&beta;-carotene',
 '9-cis-10&prime;-apo-&beta;-carotenal',
 'AAO',
 'ACC',
 'ACH',
 'ACO',
 'ACS',
 'ACX',
 'ACX2|CAT2',
 'ACX3|CAT2',
 'ADK',
 'ADP',
 'ADT',
 'AGO',
 'AGO1,5,7,10|CI',
 'AGO1,5,7,10|HC-Pro',
 'AHK',
 'AHP',
 'AMP',
 'AOC',
 'AOS',
 'AOX',
 'APT',
 'APX',
 'ARF',
 'ARR',
 'ASK',
 'ATP',
 'ATPB',
 'ATPB|HC-Pro',
 'AUX-signalling',
 'Ade',
 'Ado',
 'Anthocyanin-accumulation',
 'BA',
 'BA2H',
 'BAK1|FLS2|flg22',
 'BG',
 'BIK1',
 'BIK1|Ca2+',
 'BR1',
 'CAM',
 'CAMTA',
 'CAM|Ca2+',
 'CAT',
 'CAT|LSD1',
 'CBP60G',
 'CCD',
 'CDPK',
 'CI',
 'CI|PSAK',
 'CKX',
 'CL',
 'CLA',
 'CLH',
 'CM',
 'CML40|HC-Pro',
 'CML42|Ca2+',
 'CO',
 'CO2-deficiency',
 'COI1',
 'COI1|JA-Ile|SCF',
 'COI1|OMR1',
 'COI1|RBCS-3B',
 'CO|OBE1',
 'CP',
 'CPS',
 'CP|CPIP1,2b',
 'CP|CPIP2a',
 'CRT',
 'CRT1|ETR1',
 'CRT1|HC-Pro',
 'CRT2|HC-Pro',
 'CRT3|HC-Pro',
 'CRT|Ca2+',


In [12]:
len(all_nodes_in_components)

411

In [13]:
node_labels = helpers.node_labels

In [14]:
node_dict = {}
for label in node_labels:
    q = '''MATCH (n:%s) RETURN DISTINCT n.name'''%label
    s = set([d['n.name'] for d in graph.run(q).data()])
    print(label, len(s))
    node_dict[label] = s


PlantCoding 162
PlantNonCoding 9
PlantAbstract 7
Complex 94
ExternalEntity 3
ExternalCoding 14
ExternalNonCoding 0
ExternalAbstract 0
Process 6
MetaboliteFamily 6
Metabolite 110
PseudoNode 0


In [15]:
all_species = ['ath', 'osa', 'stu', 'sly']

### Reactions sheet

In [16]:
df_edges = pd.read_csv(parsed_path / "edges-sheet.tsv", sep="\t", index_col=0)

In [17]:
homolgue_cols_old = ['input1_ath_homologues', 'input1_osa_homologues', 'input1_sly_homologues', 'input1_stu_homologues', 
          'input2_ath_homologues', 'input2_osa_homologues', 'input2_sly_homologues', 'input2_stu_homologues', 
          'input3_ath_homologues', 'input3_osa_homologues', 'input3_sly_homologues', 'input3_stu_homologues', 
          'output1_ath_homologues', 'output1_osa_homologues', 'output1_sly_homologues', 'output1_stu_homologues']
for col in homolgue_cols_old:
    df_edges[col].fillna('', inplace=True)


In [18]:
df_edges.head(10)

Unnamed: 0,Status,AddedBy,ConnID,Species,input1_ID,input1_level,input1_localisation,input1_type,input2_ID,input2_level,...,output1_ath_homologues,output1_stu_homologues,output1_sly_homologues,output1_osa_homologues,input1_location,input2_location,input3_location,output1_location,reaction_type,reaction_id
0,forCB,KG,Conn001,ath,L-Met,family,ER,metabolite,SAMS,clade,...,,,,,endoplasmic reticulum,endoplasmic reticulum,putative:cytoplasm,endoplasmic reticulum,catalysis/auto-catalysis,rx00001
1,forCB,KG,Conn002,ath,SAMe,family,ER,metabolite,ACS,family,...,,,,,endoplasmic reticulum,endoplasmic reticulum,putative:cytoplasm,endoplasmic reticulum,catalysis/auto-catalysis,rx00002
2,forCB,KG,Conn003,ath,ACC,family,ER,metabolite,ACO,family,...,,,,,endoplasmic reticulum,endoplasmic reticulum,putative:cytoplasm,endoplasmic reticulum,catalysis/auto-catalysis,rx00003
3,forCB,KG,Conn004,ath,Cu2+,family,cytoplasm,metabolite,HMA,family,...,,,,,cytoplasm,endoplasmic reticulum,putative:cytoplasm,endoplasmic reticulum,translocation,rx00004
4,forCB,KG,Conn005,ath,ETR,family,ER,protein,Cu2+,family,...,"AT2G40940,AT1G66340,AT3G04580,AT1G04310,AT3G23150",,,,endoplasmic reticulum,endoplasmic reticulum,putative:cytoplasm,endoplasmic reticulum,protein activation,rx00005
5,forCB,KG,Conn006,ath,CTR,family,cytoplasm,protein [active],ETR,family,...,,,,,cytoplasm,endoplasmic reticulum,putative:cytoplasm,endoplasmic reticulum,binding/oligomerisation,rx00006
6,forCB,KG,Conn007,ath,CTR|ETR,family,ER,complex,RTE1,family,...,,,,,endoplasmic reticulum,golgi apparatus,putative:cytoplasm,endoplasmic reticulum,protein activation,rx00007
7,_TBD,KG,Conn008,ath,ET,family,cytoplasm,metabolite,ETR,family,...,,,,,cytoplasm,cytoplasm,putative:cytoplasm,cytoplasm,binding/oligomerisation,rx00008
8,forCB,KG,Conn009,ath,EIN2,node,ER,protein,ETP|SCF,family,...,AT5G03280,,,,endoplasmic reticulum,cytoplasm,putative:cytoplasm,endoplasmic reticulum,protein activation,rx00009
9,forCB,KG,Conn010,ath,CTR|ETR,family,ER,complex,EIN2,family,...,AT5G03280,,,,endoplasmic reticulum,endoplasmic reticulum,putative:cytoplasm,endoplasmic reticulum,protein deactivation,rx00010


In [19]:
df_edges[df_edges['input1_ID']=="DZ"][['input1_ID', 'input1_newID', 'input1_label']]

Unnamed: 0,input1_ID,input1_newID,input1_label
341,DZ,DZ,Metabolite
345,DZ,DZ,Metabolite
346,DZ,DZ,Metabolite
347,DZ,DZ,Metabolite
348,DZ,DZ,Metabolite
349,DZ,DZ,Metabolite
350,DZ,DZ,Metabolite
351,DZ,DZ,Metabolite
352,DZ,DZ,Metabolite


In [20]:
helpers.empty_strings

['-', '?', '[empty]', 'nan', 'n.a.', nan, '[undefined]', '']

In [21]:
def generate_list(subdf, ids, new_name, homologues=True):
    col_suffixes = ['_newID', '_location', '_label', '_form']
    new_col_suffixes = ['_name', '_location', '_label', '_form']

    if homologues:
        col_suffixes += homologue_cols
        new_col_suffixes += homologue_cols
        
    for old_suf, new_suf in zip(col_suffixes, new_col_suffixes):
        new_col = new_name + new_suf
        old_cols = [id_ + old_suf for id_ in ids]
        subdf[new_col] = subdf[old_cols].apply(lambda x: [i for i in x.values], axis=1)
        
        
def rename_target(subdf, id_, new_name, homologues=True):
    col_suffixes = ['_newID', '_location', '_label', '_form']
    new_col_suffixes = ['_name', '_location', '_label', '_form']

    if homologues:
        col_suffixes += homologue_cols
        new_col_suffixes += homologue_cols

    for old_suf, new_suf in zip(col_suffixes, new_col_suffixes):
        new_col = new_name + new_suf
        old_col = id_ + old_suf
        subdf[new_col] = subdf[old_col]

In [22]:
def get_x_nodes(df, x):
    rows_with_x = set()
    for i, row in df.iterrows():
        for col_prefix in ['input1', 'input2', 'input3', 'output1']:
            if row[col_prefix + "_newID"] in x:
                rows_with_x.add(i)
    return rows_with_x

In [23]:
def number_input_different(df, homologues=True, catalyst=False):
    ''' If catalyst is True, it is the last "input" col. '''

    if catalyst:
        # two inputs, input2 -> catalyst
        subdf2 = df[df["input3_newID"].isna()].copy()
        generate_list(subdf2, ['input1'], 'substrate', homologues=homologues)
        rename_target(subdf2, 'input2', 'catalyst', homologues=homologues)

        # three inputs, input3 -> catalyst
        subdf3 = df[~df["input3_newID"].isna()].copy()
        generate_list(subdf3, ['input1', 'input2'], 'substrate', homologues=homologues)        
        rename_target(subdf3, 'input3', 'catalyst', homologues=homologues) 
        
    else:
        # two inputs
        subdf2 = df[df["input3_newID"].isna()].copy()
        generate_list(subdf2, ['input1', 'input2'], 'substrate', homologues=homologues)

        # three inputs
        subdf3 = df[~df["input3_newID"].isna()].copy()
        generate_list(subdf3, ['input1', 'input2', 'input3'], 'substrate', homologues=homologues)
    
    # combine
    new_subdf = subdf2.append(subdf3)
    rename_target(new_subdf, 'output1', 'product', homologues=homologues)
    
    return new_subdf

In [24]:
df_translate = df_edges[['reaction_id', 
          'input1_ID', 'input2_ID', 'input3_ID', 'output1_ID', 
          'input1_newID', 'input2_newID', 'input3_newID', 'output1_newID']].copy()


In [25]:
translate = defaultdict(dict)
for _, row in df_translate.iterrows():
    for suffix in ['input1', 'input2', 'input3', 'output1']:
        old = f"{suffix}_ID"
        new = f"{suffix}_newID"
        if not (row[new] is np.nan):
            translate[row['reaction_id']][row[new]] = row[old]

In [26]:
import json

In [141]:
with open("family-to-og-ID-v0.0.4.json", "w") as out:
    json.dump(translate, out)

In [28]:
# transcription genes
substrate_cols_wo_homologues = ['substrate_name', 'substrate_form', 'substrate_label', 'substrate_location']
product_cols_wo_homologues = ['product_name', 'product_form', 'product_label',  'product_location']
catalyst_cols_wo_homologues = ['catalyst_name', 'catalyst_form', 'catalyst_label', 'catalyst_location']

homologue_cols = [f"_{x}_homologues" for x in all_species]

substrate_cols = [ f'substrate{x}' for x in ['_name', '_label', '_form', '_location']] +\
                [f"substrate{x}" for x in homologue_cols]
catalyst_cols = [ f'catalyst{x}' for x in ['_name', '_label', '_form', '_location']] +\
                [f"catalyst{x}" for x in homologue_cols] 
product_cols = [ f'product{x}' for x in ['_name', '_label', '_form', '_location']] +\
                [f"product{x}" for x in homologue_cols]

reaction_standard_columns = ['AddedBy', 'Species', 
       'AdditionalInfo',  'external_links', 'trust_level',
       'ModelV', 'ReactionEffect', 'reaction_type', 'Modifications', 'reaction_id']

In [29]:
def read_dict(file):
    d = {}
    with open(file, "r") as f:
        for line in f:
            key, value = line.strip().split("\t")
            d[key] = value
    return d

In [30]:
node_to_family = read_dict(parsed_path / "node_to_family.tsv")
clade_to_family = read_dict(parsed_path / "clade_to_family.tsv")

In [31]:
def node_id_to_node_label(id_):

    query = '''MATCH (s) WHERE s.name=$x 
               RETURN s.name AS substrate_name, labels(s) AS substrate_label'''
    
    cursor = graph.run(query, x=id_)
    d = cursor.data()
    
    if len(d) == 0:
        print(id_, d, "no hit")
        return ""
    elif len(d) == 1:
        s = set(d[0]['substrate_label']) - set(['Family'])
        return s.pop()
    else:
        print(id_, d, 'multiple hits') # should be impossible
        return ""

In [32]:
def get_name(ids_):
    names = []
    for x in ids_:
        label1 = node_id_to_node_label(x)
        if not (label1 == ''):
            names.append(x)
        else:
            if x in clade_to_family:
                names.append(clade_to_family[x])
                print('using as clade id to get family\t', x, "\t", clade_to_family[x])

            elif x in node_to_family:
                names.append(node_to_family[x])
                print('using as node id to get family\t', x, "\t", node_to_family[x])

            else:
                names.append(x)
    
    return names

# Add reactions

In [33]:
def pretty_print_result(t, df, qr, input_type, multiplier=1):
    if input_type in ['nodes', 'relationships']:
        key = input_type + '_created'
    else:
        key = input_type
    
    try:
        stat = qr.stats()[key]
    except:
        stat = 0
    
    print(f"{t:20}\t{df.shape[0]:3}\t{stat:3}", end="")
    if df.shape[0]*multiplier == stat:
        print()
    
    elif df.shape[0]*multiplier < stat:
        print(f"\t**too many {input_type} created**")
    else:
        print(f"\t**not all {input_type} created**")

In [34]:
df_edges.index.duplicated().sum()

0

## binding / oligomerisation

In [35]:
key = 'binding/oligomerisation'
reaction_type = "BINDING_OGLIMERISATION"
subdf = df_edges.loc[df_edges['reaction_type'] == key].copy()
print(reaction_type, subdf.shape[0])

BINDING_OGLIMERISATION 104


In [36]:
binding_wo_catalyst = subdf.loc[subdf['Modifications'] != 'with catalyst']
binding_w_catalyst = subdf.loc[subdf['Modifications'] == 'with catalyst']

In [37]:
subdf_wo_catalyst = number_input_different(binding_wo_catalyst)
subdf_w_catalyst = number_input_different(binding_w_catalyst, catalyst=True)

### Without catalyst

In [38]:
binding_wo_catalyst.columns

Index(['Status', 'AddedBy', 'ConnID', 'Species', 'input1_ID', 'input1_level',
       'input1_localisation', 'input1_type', 'input2_ID', 'input2_level',
       'input2_localisation', 'input2_type', 'input3_ID', 'input3_level',
       'input3_localisation', 'input3_type', 'ReactionEffect', 'ReactionMode',
       'Modifications', 'output1_ID', 'output1_level', 'output1_localisation',
       'output1_type', 'TrustLevel', 'Literature', 'AdditionalInfo', 'ModelV',
       'origin', 'trust_level', 'external_links', 'input1_form', 'input2_form',
       'input3_form', 'output1_form', 'species', 'input1_newID',
       'input1_label', 'input2_newID', 'input2_label', 'input3_newID',
       'input3_label', 'output1_newID', 'output1_label',
       'input1_ath_homologues', 'input1_stu_homologues',
       'input1_sly_homologues', 'input1_osa_homologues',
       'input2_ath_homologues', 'input2_stu_homologues',
       'input2_sly_homologues', 'input2_osa_homologues',
       'input3_ath_homologues', 'inp

In [39]:
subdf_wo_catalyst[['ConnID'] + substrate_cols]

Unnamed: 0,ConnID,substrate_name,substrate_label,substrate_form,substrate_location,substrate_ath_homologues,substrate_osa_homologues,substrate_stu_homologues,substrate_sly_homologues
5,Conn006,"[CTR, ETR]","[PlantCoding, PlantCoding]","[protein_active, protein_active]","[cytoplasm, endoplasmic reticulum]","[AT4G24480,AT5G03730, AT2G40940,AT1G66340,AT3G...","[, ]","[, ]","[, ]"
7,Conn008,"[ET, ETR]","[Metabolite, PlantCoding]","[metabolite, protein_active]","[cytoplasm, cytoplasm]","[, AT2G40940,AT1G66340,AT3G04580,AT1G04310,AT3...","[, ]","[, ]","[, ]"
10,Conn011,"[ETP, SCF]","[PlantCoding, Complex]","[protein, complex]","[cytoplasm, cytoplasm]","[AT3G17570,AT3G18320,AT3G24700,AT3G18980,AT2G0...","[, ]","[, ]","[, ]"
15,Conn016,"[EBF, SCF]","[PlantCoding, Complex]","[protein, complex]","[cytoplasm, cytoplasm]","[AT5G25350,AT2G25490, ]","[, ]","[, ]","[, ]"
17,Conn018,"[JAZ, EIN3(like)]","[PlantCoding, PlantCoding]","[protein, protein_active]","[nucleus, nucleus]","[AT5G13220,AT1G74950,AT2G34600,AT1G48500,AT3G4...","[, ]","[, ]","[, ]"
...,...,...,...,...,...,...,...,...,...
336,Conn338,"[cZ, UGT77, Glu]","[Metabolite, PlantCoding, Metabolite]","[metabolite, protein, metabolite]","[putative:cytoplasm, putative:cytoplasm, putat...","[, AT5G05870,AT5G05860, ]","[, , ]","[, , ]","[, , ]"
337,Conn339,"[cZ, UGT85, Glu]","[Metabolite, PlantCoding, Metabolite]","[metabolite, protein, metabolite]","[putative:cytoplasm, putative:cytoplasm, putat...","[, AT1G22400, ]","[, , ]","[, , ]","[, , ]"
338,Conn340,"[cZ, UGT85, Glu]","[Metabolite, PlantCoding, Metabolite]","[metabolite, protein, metabolite]","[putative:cytoplasm, putative:cytoplasm, putat...","[, AT1G22400, ]","[, , ]","[, , ]","[, , ]"
339,Conn341,"[cZ, UGT73, Glu]","[Metabolite, PlantCoding, Metabolite]","[metabolite, protein, metabolite]","[putative:cytoplasm, putative:cytoplasm, putat...","[, AT2G36750,AT2G36800, ]","[, , ]","[, , ]","[, , ]"


In [40]:
# make pseudo nodes
f = f"{reaction_type}-pseudo.tsv"
subdf_wo_catalyst[reaction_standard_columns].to_csv(f"../data/import/{f}", sep="\t", index=None)
query = helpers.pseudo_node_query(f, name="line.reaction_id")

In [41]:
qr = graph.run(query)
pretty_print_result('PseudoNode', subdf_wo_catalyst, qr, 'nodes')

PseudoNode          	103	103


In [42]:
subdf_wo_catalyst

Unnamed: 0,Status,AddedBy,ConnID,Species,input1_ID,input1_level,input1_localisation,input1_type,input2_ID,input2_level,...,substrate_stu_homologues,substrate_sly_homologues,product_name,product_location,product_label,product_form,product_ath_homologues,product_osa_homologues,product_stu_homologues,product_sly_homologues
5,forCB,KG,Conn006,ath,CTR,family,cytoplasm,protein [active],ETR,family,...,"[, ]","[, ]",CTR|ETR,endoplasmic reticulum,Complex,complex_active,,,,
7,_TBD,KG,Conn008,ath,ET,family,cytoplasm,metabolite,ETR,family,...,"[, ]","[, ]",ET|ETR,cytoplasm,Complex,complex_active,,,,
10,forCB,KG,Conn011,ath,ETP,family,cytoplasm,protein,SCF,family,...,"[, ]","[, ]",ETP|SCF,cytoplasm,Complex,complex_active,,,,
15,forCB,KG,Conn016,ath,EBF,family,cytoplasm,protein,SCF,family,...,"[, ]","[, ]",EBF|SCF,cytoplasm,Complex,complex_active,,,,
17,forCB,KG,Conn018,ath,JAZ,family,nucleus,protein,EIN3(like),family,...,"[, ]","[, ]",EIN3(like)|JAZ,nucleus,Complex,complex_active,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
336,[TBD],AG,Conn338,Ath,cZ,node,cytoplasm?,metabolite,"UGT76C1,C2",clade,...,"[, , ]","[, , ]",cZ9G,putative:cytoplasm,Metabolite,metabolite,,,,
337,[TBD],AG,Conn339,Ath,cZ,node,cytoplasm?,metabolite,UGT85A1,clade,...,"[, , ]","[, , ]",cZOG,putative:cytoplasm,Metabolite,metabolite,,,,
338,[TBD],AG,Conn340,Ath,cZ,node,cytoplasm?,metabolite,UGT85A1,clade,...,"[, , ]","[, , ]",cZROG,putative:cytoplasm,Metabolite,metabolite,,,,
339,[TBD],AG,Conn341,Ath,cZ,node,cytoplasm?,metabolite,"UGT73C1,C5",clade,...,"[, , ]","[, , ]",cZROG,putative:cytoplasm,Metabolite,metabolite,,,,


In [43]:
explode_cols = substrate_cols
exploded_new_subdf = helpers.unnesting(subdf_wo_catalyst, explode_cols)#.drop_duplicates()

In [44]:
# binding substrate edges
want_cols = reaction_standard_columns + substrate_cols

for t, this_subdf in exploded_new_subdf.groupby("substrate_label"):
    f = f"{reaction_type}-{t}-substrate_edges.tsv"
    this_subdf[want_cols].to_csv(f"../data/import/{f}", index=None, sep="\t")
    
    query = helpers.make_create_reaction_edge_query(f, 'SUBSTRATE', 
                                                    "substrate", "dne",
                                                    source_label=t, 
                                                    target_label="PseudoNode", 
                                                    target_name="line.reaction_id"
                                                   )

    qr = graph.run(query)

    pretty_print_result(t, this_subdf, qr, 'relationships')


Complex             	 16	 16
ExternalCoding      	 30	 30
ExternalEntity      	  3	  3
Metabolite          	 32	 32
PlantAbstract       	  4	  4
PlantCoding         	131	131
Process             	  3	  3


In [45]:
!head ../data/import/$f

AddedBy	Species	AdditionalInfo	external_links	trust_level	ModelV	ReactionEffect	reaction_type	Modifications	reaction_id	substrate_name	substrate_label	substrate_form	substrate_location	substrate_ath_homologues	substrate_osa_homologues	substrate_stu_homologues	substrate_sly_homologues
KG	all	This is viral PTI.		Rx	v2.6	activation	binding/oligomerisation		rx00115	RNA-silencing	Process	process	cytoplasm				
KG	all			Rx	v2.6	inhibition	binding/oligomerisation		rx00117	RNA-silencing	Process	process	cytoplasm				
KG	all	RNA silencing inhibits virus, part of PTI.		Rx	v2.6	inhibition	binding/oligomerisation		rx00119	RNA-silencing	Process	process	cytoplasm				


In [46]:
exploded_new_subdf[want_cols]

Unnamed: 0,AddedBy,Species,AdditionalInfo,external_links,trust_level,ModelV,ReactionEffect,reaction_type,Modifications,reaction_id,substrate_name,substrate_label,substrate_form,substrate_location,substrate_ath_homologues,substrate_osa_homologues,substrate_stu_homologues,substrate_sly_homologues
5,KG,ath,"When ETR is active, more ETR(a)/CTR(a) complex...","doi:10.1104/pp.107.104299,doi:10.1073/pnas.060...",R1,v1.0,activation,binding/oligomerisation,,rx00006,CTR,PlantCoding,protein_active,cytoplasm,"AT4G24480,AT5G03730",,,
5,KG,ath,"When ETR is active, more ETR(a)/CTR(a) complex...","doi:10.1104/pp.107.104299,doi:10.1073/pnas.060...",R1,v1.0,activation,binding/oligomerisation,,rx00006,ETR,PlantCoding,protein_active,endoplasmic reticulum,"AT2G40940,AT1G66340,AT3G04580,AT1G04310,AT3G23150",,,
7,KG,ath,Modelled as ET competitively binding CTR(a) co...,doi:10.1105/tpc.001768,R1,v1.0,activation,binding/oligomerisation,,rx00008,ET,Metabolite,metabolite,cytoplasm,,,,
7,KG,ath,Modelled as ET competitively binding CTR(a) co...,doi:10.1105/tpc.001768,R1,v1.0,activation,binding/oligomerisation,,rx00008,ETR,PlantCoding,protein_active,cytoplasm,"AT2G40940,AT1G66340,AT3G04580,AT1G04310,AT3G23150",,,
10,KG,ath,Technically the ETR(a)-CTR(a) complex keeps EI...,"doi:10.1093/mp/ssq036,doi:10.1042/bj20091102,d...",R1,v2.5,activation,binding/oligomerisation,,rx00011,ETP,PlantCoding,protein,cytoplasm,"AT3G17570,AT3G18320,AT3G24700,AT3G18980,AT2G04...",,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
384,AG,Ath,,doi:10.1002/cbin.10805,R1,v2.7,activation,binding/oligomerisation,,rx00385,cZ,Metabolite,metabolite,putative:cytoplasm,,,,
385,AG,Ath,,doi:10.1002/cbin.10805,R3,v2.7,activation,binding/oligomerisation,,rx00386,AHK,PlantCoding,protein,putative:cytoplasm,"AT1G27320,AT5G35750,AT2G01830",,,
385,AG,Ath,,doi:10.1002/cbin.10805,R3,v2.7,activation,binding/oligomerisation,,rx00386,DZ,Metabolite,metabolite,putative:cytoplasm,,,,
386,AG,Ath,,doi:10.1002/cbin.10805,R1,v2.7,activation,binding/oligomerisation,,rx00387,AHK,PlantCoding,protein,putative:cytoplasm,"AT1G27320,AT5G35750,AT2G01830",,,


In [47]:
subdf_wo_catalyst[subdf_wo_catalyst['ConnID'].isin(exploded_new_subdf[exploded_new_subdf["substrate_label"]=="nan"]['ConnID'].unique())][['ConnID', 'input1_newID', 'input2_newID', 'input3_newID', 'substrate_name']]

Unnamed: 0,ConnID,input1_newID,input2_newID,input3_newID,substrate_name


In [48]:
for x in this_subdf['ConnID'].unique():
    print(x)

Conn115
Conn117
Conn119


In [49]:
f = f"{reaction_type}-product_edges.tsv"
want_cols = reaction_standard_columns + product_cols
subdf_wo_catalyst[want_cols].to_csv(f'../data/import/{f}', sep="\t", index=None)

In [50]:
subdf_wo_catalyst[subdf_wo_catalyst["product_name"].isna()][product_cols + ["output1_ID", "output1_newID", "ConnID"]]

Unnamed: 0,product_name,product_label,product_form,product_location,product_ath_homologues,product_osa_homologues,product_stu_homologues,product_sly_homologues,output1_ID,output1_newID,ConnID


In [51]:
# binding product edges
query = helpers.make_create_reaction_edge_query(f, "PRODUCT", 
                                                "dne", "product",
                                                source_name="line.reaction_id", 
                                                source_label="PseudoNode"
                                               )
qr = graph.run(query)
pretty_print_result('-', subdf_wo_catalyst, qr, 'relationships')

-                   	103	103


In [52]:
q = '''LOAD CSV WITH HEADERS FROM  'file:///BINDING_OGLIMERISATION-product_edges.tsv' AS line FIELDTERMINATOR '	'
       OPTIONAL MATCH (source:PseudoNode { name:line.reaction_id})
       OPTIONAL MATCH (target { name:line.product_name})

        WITH source, target, line
            WHERE source IS NULL OR target is NULL
            RETURN target, line.reaction_id, line.product_name
           '''


result = graph.run(q)

In [53]:
d = result.data(); d

[]

In [54]:
rx_ids = [x['line.reaction_id'] for x in d]

In [55]:
subdf_wo_catalyst[subdf_wo_catalyst['reaction_id'].isin(rx_ids)][['Status', 'ConnID', 'output1_ID']]

Unnamed: 0,Status,ConnID,output1_ID


### With catalyst

In [56]:
subdf_w_catalyst

Unnamed: 0,Status,AddedBy,ConnID,Species,input1_ID,input1_level,input1_localisation,input1_type,input2_ID,input2_level,...,catalyst_stu_homologues,catalyst_sly_homologues,product_name,product_location,product_label,product_form,product_ath_homologues,product_osa_homologues,product_stu_homologues,product_sly_homologues
93,forCB,KG,Conn094,ath,NPR1,node,cytoplasm,protein,NPR1,node,...,,,NPR1|NPR1,cytoplasm,Complex,complex_active,,,,


In [57]:
subdf_w_catalyst.shape[0]

1

In [58]:
explode_cols = substrate_cols
exploded_new_subdf = helpers.unnesting(subdf_w_catalyst, explode_cols).drop_duplicates()

In [59]:
# binding substrate edges

want_cols = reaction_standard_columns + substrate_cols + catalyst_cols

for t, this_subdf in exploded_new_subdf.groupby("substrate_label"):
    f = f"{reaction_type}-{t}-catalyst-substrate_edges.tsv"
    this_subdf[want_cols].to_csv(f"../data/import/{f}", index=None, sep="\t")
    
    query = helpers.make_create_reaction_edge_query(f, 'SUBSTRATE',
                                                    "substrate", "catalyst",
                                                    source_label=t
                                                   )
    qr = graph.run(query)
    pretty_print_result(t, this_subdf, qr, 'relationships')

PlantCoding         	  1	  1


In [60]:
f = f"{reaction_type}-catalyst-product_edges.tsv"
want_cols = reaction_standard_columns + catalyst_cols + product_cols
subdf_w_catalyst[want_cols].to_csv(f'../data/import/{f}', sep="\t", index=None)

In [61]:
# binding product edges
query = helpers.make_create_reaction_edge_query(f, "PRODUCT",
                                                "catalyst", "product"
                                               )
qr = graph.run(query)
pretty_print_result('-', subdf_w_catalyst, qr, 'relationships')

-                   	  1	  1


## catalysis / auto-catalysis

In [62]:
key = 'catalysis/auto-catalysis'
reaction_type = 'CATALYSIS'
subdf = df_edges[df_edges['reaction_type'] == key]
print(reaction_type, subdf.shape[0])

CATALYSIS 79


In [63]:
# with and without catalyst

In [64]:
subdf[subdf['input2_ID'].isna() & subdf['input3_ID'].isna()]

Unnamed: 0,Status,AddedBy,ConnID,Species,input1_ID,input1_level,input1_localisation,input1_type,input2_ID,input2_level,...,output1_ath_homologues,output1_stu_homologues,output1_sly_homologues,output1_osa_homologues,input1_location,input2_location,input3_location,output1_location,reaction_type,reaction_id
378,[TBD],AG,Conn389,Ath,tZRMP,node,cytoplasm?,metabolite,,,...,,,,,putative:cytoplasm,putative:cytoplasm,putative:cytoplasm,putative:cytoplasm,catalysis/auto-catalysis,rx00379


In [65]:
catalysis_wo_catalyst = subdf[subdf['input2_ID'].isna() & subdf['input3_ID'].isna()]
catalysis_w_catalyst = subdf[~(subdf['input2_ID'].isna() & subdf['input3_ID'].isna())]

### With catalyst

In [66]:
subdf_w_catalyst = number_input_different(catalysis_w_catalyst, catalyst=True)
new_subdf = number_input_different(subdf_w_catalyst, catalyst=True)

In [67]:
exploded_new_subdf = helpers.unnesting(new_subdf, substrate_cols)

In [68]:
# substrate to catalyst
want_cols = reaction_standard_columns + substrate_cols + catalyst_cols

for t, this_subdf in exploded_new_subdf.groupby("substrate_label"):
    f = f"{reaction_type}-{t}-substrate_edges.tsv"
    this_subdf[want_cols].to_csv(f"../data/import/{f}", index=None, sep="\t")
    
    query = helpers.make_create_reaction_edge_query(f, 'SUBSTRATE', 
                                                    "substrate", "catalyst",
                                                    source_label=t
                                                   )
    qr = graph.run(query)
    pretty_print_result(t, this_subdf, qr, 'relationships')

Metabolite          	 79	 79
PlantCoding         	 21	 21


In [69]:
q = '''LOAD CSV WITH HEADERS FROM  'file:///CATALYSIS-Metabolite-substrate_edges.tsv' AS line FIELDTERMINATOR '	'
       OPTIONAL MATCH (source { name:line.substrate_name})
       OPTIONAL MATCH (target { name:line.catalyst_name})

        WITH source, target, line
            WHERE source IS NULL OR target is NULL
            RETURN source.name, line.substrate_name,  target.name, line.catalyst_name, line.reaction_id
           '''

result = graph.run(q)
result.data()

[]

In [70]:
# catalyst to product
want_cols = reaction_standard_columns + catalyst_cols + product_cols
for t, this_subdf in new_subdf.groupby("product_label"):
    f = f"{reaction_type}-{t}-product_edges.tsv"    
    this_subdf[want_cols].to_csv(f"../data/import/{f}", index=None, sep="\t")
    
    query = helpers.make_create_reaction_edge_query(f, 'PRODUCT', 
                                                    "catalyst", "product",
                                                    target_label=t
                                                   )    
    qr = graph.run(query)

    pretty_print_result(t, this_subdf, qr, 'relationships')

Metabolite          	 76	 76
MetaboliteFamily    	  2	  2


### Without catalyst

In [71]:
rename_target(catalysis_wo_catalyst, 'output1', 'product')
rename_target(catalysis_wo_catalyst, 'input1',  'substrate')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  subdf[new_col] = subdf[old_col]


In [72]:
catalysis_wo_catalyst['reaction_id']

378    rx00379
Name: reaction_id, dtype: object

In [73]:
# make pseudo nodes
f = f"{reaction_type}-pseudo.tsv"
catalysis_wo_catalyst[reaction_standard_columns].to_csv(f"../data/import/{f}", sep="\t", index=None)
query = helpers.pseudo_node_query(f, name="line.reaction_id")

In [74]:
qr = graph.run(query)
pretty_print_result('PseudoNode', catalysis_wo_catalyst, qr, 'nodes')

PseudoNode          	  1	  1


In [75]:
# binding substrate edges
want_cols = reaction_standard_columns + substrate_cols

for t, this_subdf in catalysis_wo_catalyst.groupby("substrate_label"):
    f = f"{reaction_type}-{t}-substrate_edges.tsv"
    this_subdf[want_cols].to_csv(f"../data/import/{f}", index=None, sep="\t")
    
    query = helpers.make_create_reaction_edge_query(f, 'SUBSTRATE', 
                                                    "substrate", "dne",
                                                    source_label=t, 
                                                    target_label="PseudoNode", 
                                                    target_name="line.reaction_id"
                                                   )

    qr = graph.run(query)

    pretty_print_result(t, this_subdf, qr, 'relationships')


MetaboliteFamily    	  1	  1


In [76]:
f = f"{reaction_type}-product_edges.tsv"
want_cols = reaction_standard_columns + product_cols
catalysis_wo_catalyst[want_cols].to_csv(f'../data/import/{f}', sep="\t", index=None)

In [77]:
# binding product edges
query = helpers.make_create_reaction_edge_query(f, "PRODUCT", 
                                                "dne", "product",
                                                source_name="line.reaction_id", 
                                                source_label="PseudoNode"
                                               )
qr = graph.run(query)
pretty_print_result('-', catalysis_wo_catalyst, qr, 'relationships')

-                   	  1	  1


In [78]:
q = '''MATCH ()-[r:SUBSTRATE {reaction_type:"catalysis/auto-catalysis"}]->() RETURN r'''
c = graph.run(q).data()
len(c) # should be ??

101

In [79]:
q = '''MATCH ()-[r:PRODUCT {reaction_type:"catalysis/auto-catalysis"}]->() RETURN r'''
c = graph.run(q).data()
len(c) # should be ??

79

## dissociation

In [80]:
key = 'dissociation'
reaction_type = 'DISSOCIATION'
subdf = df_edges[df_edges['reaction_type'] == key].copy()
print(reaction_type, subdf.shape[0])

DISSOCIATION 1


In [81]:
subdf

Unnamed: 0,Status,AddedBy,ConnID,Species,input1_ID,input1_level,input1_localisation,input1_type,input2_ID,input2_level,...,output1_ath_homologues,output1_stu_homologues,output1_sly_homologues,output1_osa_homologues,input1_location,input2_location,input3_location,output1_location,reaction_type,reaction_id
95,forCB,KG,Conn096,ath,NPR1|NPR1,family,cytoplasm,complex,TRX-H,family,...,AT1G64280,,,,cytoplasm,cytoplasm,putative:cytoplasm,cytoplasm,dissociation,rx00096


In [82]:
rename_target(subdf, 'output1', 'product')
rename_target(subdf, 'input1',  'substrate')
rename_target(subdf, 'input2',  'catalyst')

In [83]:
# substrate to node edge

f =  f"{reaction_type}-{t}-substrate_edges.tsv" 
want_cols = reaction_standard_columns + substrate_cols + catalyst_cols
subdf.to_csv(f"../data/import/{f}", sep="\t", index=None)

query = helpers.make_create_reaction_edge_query(f, 'SUBSTRATE',
                                                "substrate", "catalyst"
                              )
qr = graph.run(query)
pretty_print_result("-", subdf, qr, 'relationships')

-                   	  1	  1


In [84]:
# catalyst to product edge

f =  f"{reaction_type}-{t}-product_edges.tsv" 
want_cols = reaction_standard_columns + catalyst_cols + product_cols
subdf.to_csv(f"../data/import/{f}", sep="\t", index=None)

query = helpers.make_create_reaction_edge_query(f, 'PRODUCT',
                                                "catalyst" , "product"
                              )
qr = graph.run(query)
pretty_print_result("-", subdf, qr, 'relationships')

-                   	  1	  1


## degradation

In [85]:
key = 'degradation/secretion'
reaction_type = 'DEGRADATION_SECRETION'
subdf = df_edges[df_edges['reaction_type'] == key].copy()
print(reaction_type, subdf.shape[0])

DEGRADATION_SECRETION 32


In [86]:
rename_target(subdf, 'input1',  'catalyst')
rename_target(subdf, 'input2',  'substrate')

In [87]:
# substrate to catalyst edge

f =  f"{reaction_type}-substrate_edges.tsv" 
want_cols = reaction_standard_columns + substrate_cols + catalyst_cols
subdf[want_cols].to_csv(f"../data/import/{f}", sep="\t", index=None)

query = helpers.make_create_reaction_edge_query(f, 'SUBSTRATE',
                                                "substrate", "catalyst"
                              )
qr = graph.run(query)
pretty_print_result('-', subdf, qr, 'relationships')

-                   	 32	 32


In [88]:

q = '''LOAD CSV WITH HEADERS FROM  'file:///DEGRADATION_SECRETION-substrate_edges.tsv' AS line FIELDTERMINATOR '	'
       OPTIONAL MATCH (source { name:line.substrate_name})
       OPTIONAL MATCH (target { name:line.catalyst_name})

        WITH source, target, line
            WHERE source IS NULL OR target is NULL
            RETURN source.name, line.substrate_name,  target.name, line.catalyst_name, line.reaction_id
           '''

result = graph.run(q)
result.data()

[]

In [89]:
subdf[subdf['reaction_id'] == 'rx00320']

Unnamed: 0,Status,AddedBy,ConnID,Species,input1_ID,input1_level,input1_localisation,input1_type,input2_ID,input2_level,...,catalyst_stu_homologues,catalyst_sly_homologues,substrate_name,substrate_location,substrate_label,substrate_form,substrate_ath_homologues,substrate_osa_homologues,substrate_stu_homologues,substrate_sly_homologues
319,[TBD],JALR,Conn320,ath/osa,D14|MAX2|SCF,family,cytoplasm?,complex [active],D53,node,...,,,Class I Clp ATPase,putative:cytoplasm,PlantCoding,protein_active,,OS11G0104300,,


## translocation

In [90]:
key = 'translocation'
reaction_type = 'TRANSLOCATION'
subdf = df_edges[df_edges['reaction_type'] == key].copy()
print(reaction_type, subdf.shape[0])

TRANSLOCATION 5


In [91]:
rename_target(subdf, 'output1', 'product')
rename_target(subdf, 'input1',  'substrate')
rename_target(subdf, 'input2',  'catalyst')

In [92]:
# substrate to catalyst edge

f =  f"{reaction_type}-substrate_edges.tsv" 
want_cols = reaction_standard_columns + substrate_cols + catalyst_cols
subdf[want_cols].to_csv(f"../data/import/{f}", sep="\t", index=None)

query = helpers.make_create_reaction_edge_query(f, 'TRANSLOCATE_FROM',
                                                "substrate", "catalyst"
                              )
qr = graph.run(query)
pretty_print_result('-', subdf, qr, 'relationships')

-                   	  5	  5


In [93]:
# catalyst to product edge
f =  f"{reaction_type}--product_edges.tsv" 
want_cols = reaction_standard_columns + catalyst_cols + product_cols
subdf[want_cols].to_csv(f"../data/import/{f}", sep="\t", index=None)

query = helpers.make_create_reaction_edge_query(f, 'TRANSLOCATE_TO',
                                                "catalyst" , "product"
                              )
qr = graph.run(query)
pretty_print_result('-', subdf, qr, 'relationships')

-                   	  5	  5


## protein activation

In [94]:
key = 'protein activation'
reaction_type = 'PROTEIN_ACTIVATION'
subdf = df_edges[df_edges['reaction_type'] == key].copy()
print(reaction_type, subdf.shape[0])

PROTEIN_ACTIVATION 52


In [95]:
rename_target(subdf, 'output1', 'product')
rename_target(subdf, 'input2',  'catalyst')
rename_target(subdf, 'input1',  'substrate')

In [96]:
# substrate to node edge

f =  f"{reaction_type}-substrate_edges.tsv" 
want_cols = reaction_standard_columns + substrate_cols + catalyst_cols
subdf[want_cols].to_csv(f"../data/import/{f}", sep="\t", index=None)

query = helpers.make_create_reaction_edge_query(f, 'SUBSTRATE',
                                                "substrate", "catalyst"
                              )
qr = graph.run(query)
pretty_print_result('-', subdf, qr, 'relationships')

-                   	 52	 52


In [97]:
# catalyst to product edge
f =  f"{reaction_type}-product_edges.tsv" 
want_cols = reaction_standard_columns + catalyst_cols + product_cols
subdf[want_cols].to_csv(f"../data/import/{f}", sep="\t", index=None)

query = helpers.make_create_reaction_edge_query(f, 'PRODUCT',
                                                "catalyst" , "product"
                              )
qr = graph.run(query)
pretty_print_result('-', subdf, qr, 'relationships')

-                   	 52	 52


## protein deactivation

In [98]:
key = 'protein deactivation'
reaction_type = 'PROTEIN_DEACTIVATION'
subdf = df_edges[df_edges['reaction_type'] == key].copy()
print(reaction_type, subdf.shape[0])

PROTEIN_DEACTIVATION 4


In [99]:
rename_target(subdf, 'output1', 'product')
rename_target(subdf, 'input2',  'substrate')
rename_target(subdf, 'input1',  'catalyst')

In [100]:
# substrate to catalyst edge
f =  f"{reaction_type}-substrate_edges.tsv" 
want_cols = reaction_standard_columns + substrate_cols + catalyst_cols
subdf.to_csv(f"../data/import/{f}", sep="\t", index=None)

query = helpers.make_create_reaction_edge_query(f, 'SUBSTRATE',
                                                "substrate", "catalyst"
                              )
qr = graph.run(query)
pretty_print_result('-', subdf, qr, 'relationships')

-                   	  4	  4


In [101]:
# catalyst to product edge
f =  f"{reaction_type}-product_edges.tsv" 
want_cols = reaction_standard_columns + catalyst_cols + product_cols
subdf.to_csv(f"../data/import/{f}", sep="\t", index=None)

query = helpers.make_create_reaction_edge_query(f, 'PRODUCT',
                                                "catalyst" , "product"
                              )
qr = graph.run(query)
pretty_print_result('-', subdf, qr, 'relationships')

-                   	  4	  4


## transcriptional / translational regulation

In [102]:
reaction_type = 'TRANSCRIPTIONAL_TRANSLATIONAL'
keys = ['transcriptional/translational induction', 'transcriptional/translational repression']
subdf = df_edges[df_edges['reaction_type'].isin(keys)]
print(reaction_type, subdf.shape[0])

TRANSCRIPTIONAL_TRANSLATIONAL 79


In [103]:
def number_input_different_reverse(df, homologues=True):
    ''' If catalyst is True, they are the first "input" cols. 
    last one is left as substrate. '''

    # two inputs, input1 -> catalyst
    subdf2 = df[df["input3_newID"].isna()].copy()
    generate_list(subdf2, ['input1'], 'catalyst', homologues=homologues)
    rename_target(subdf2, 'input2', 'substrate', homologues=homologues)

    # three inputs, input1, input2 -> catalyst
    subdf3 = df[~df["input3_newID"].isna()].copy()
    generate_list(subdf3, ['input1', 'input2'], 'catalyst', homologues=homologues)        
    rename_target(subdf3, 'input3', 'substrate', homologues=homologues)
     
    # combine
    new_subdf = subdf2.append(subdf3)
    rename_target(new_subdf, 'output1', 'product', homologues=homologues)
    
    return new_subdf

In [104]:
new_subdf = number_input_different_reverse(subdf, homologues=True)

In [105]:
new_subdf[new_subdf['substrate_name'] != new_subdf['product_name']][["Status", "ConnID", 'substrate_name', 'product_name' ]]

Unnamed: 0,Status,ConnID,substrate_name,product_name
44,_TBD,Conn045,OMR,COI1|OMR1
92,_TBD,Conn093,PAD4,"NPR1|PAD4|TGA2,5,6"
115,_TBD,Conn116,EDS5,"EDS5|NPR1|TGA2,5,6"
117,_TBD,Conn118,EDS1,"EDS1|NPR1|TGA2,5,6"
119,_TBD,Conn120,PAD4,"NPR1|PAD4|TGA2,5,6"
148,_TBD,Conn149,GST,GSTU24|ROS


In [106]:
new_subdf[(  (new_subdf['substrate_form'] == 'gene') & (new_subdf['substrate_location'] != 'nucleus')  )][["Status", "ConnID"] + substrate_cols_wo_homologues]

Unnamed: 0,Status,ConnID,substrate_name,substrate_form,substrate_label,substrate_location
27,forCB,Conn028,LOX,gene,PlantCoding,chloroplast
97,forCB,Conn098,WRKY,gene,PlantCoding,cytoplasm
110,forCB,Conn111,TAS3,gene,PlantNonCoding,cytoplasm
123,forCB,Conn124,WRKY,gene,PlantCoding,cytoplasm
160,forCB,Conn161,PAD4,gene,PlantCoding,cytoplasm
186,forCB,Conn187,ICS,gene,PlantCoding,chloroplast
187,forCB,Conn188,ICS,gene,PlantCoding,chloroplast
188,forCB,Conn189,ICS,gene,PlantCoding,chloroplast
189,forCB,Conn190,ICS,gene,PlantCoding,chloroplast
192,forCB,Conn193,ACO,gene,PlantCoding,endoplasmic reticulum


In [107]:
#manualfix
new_subdf.loc[(  (new_subdf['substrate_form'] == 'gene') & (new_subdf['substrate_location'] != 'nucleus')  ), 'substrate_location'] = 'nucleus'

In [108]:
new_subdf[(  (new_subdf['product_form'].isin(['rna', 'ncRNA']) ))]#& (new_subdf['substrate_location'] != 'nucleus')  )][["Status", "ConnID"] + substrate_cols_wo_homologues]

Unnamed: 0,Status,AddedBy,ConnID,Species,input1_ID,input1_level,input1_localisation,input1_type,input2_ID,input2_level,...,substrate_stu_homologues,substrate_sly_homologues,product_name,product_location,product_label,product_form,product_ath_homologues,product_osa_homologues,product_stu_homologues,product_sly_homologues
110,forCB,KG,Conn111,ath,MIR390,family,cytoplasm,ncRNA,TAS3,family,...,,,TAS3,cytoplasm,PlantNonCoding,ncRNA,"AT5G49615,AT3G17185",,,


In [109]:
new_subdf[(  (new_subdf['substrate_form'].isin(['rna', 'ncRNA'])) & (new_subdf['substrate_location'] != 'cytoplams')  )][["Status", "ConnID"] + substrate_cols_wo_homologues]

Unnamed: 0,Status,ConnID,substrate_name,substrate_form,substrate_label,substrate_location


In [110]:
new_subdf[(  (new_subdf['product_form'] == 'protein') & (new_subdf['product_location'] != 'cytoplasm')  )][["Status", "ConnID"] + product_cols_wo_homologues]

Unnamed: 0,Status,ConnID,product_name,product_form,product_label,product_location
11,forCB,Conn012,EBF,protein,PlantCoding,nucleus
20,forCB,Conn021,PR3,protein,PlantCoding,nucleus
21,forCB,Conn022,PR4,protein,PlantCoding,nucleus
24,forCB,Conn025,PDF1,protein,PlantCoding,nucleus
27,forCB,Conn028,LOX,protein,PlantCoding,chloroplast
45,forCB,Conn046,JAZ,protein,PlantCoding,nucleus
47,forCB,Conn048,MYC,protein,PlantCoding,nucleus
48,forCB,Conn049,CLH,protein,PlantCoding,nucleus
49,forCB,Conn050,JAL,protein,PlantCoding,nucleus
50,forCB,Conn051,PR13,protein,PlantCoding,nucleus


In [111]:
# manualfix
new_subdf.loc[(  (new_subdf['product_form'] == 'protein') & (new_subdf['product_location'] != 'cytoplasm')  ), 'product_location'] = 'cytoplasm'

In [112]:
new_subdf[~new_subdf['substrate_label'].isin(["PlantCoding", "PlantNonCoding"])][["Status", "ConnID"] + substrate_cols_wo_homologues] #+ catalyst_cols_wo_homologues + product_cols_wo_homologues]

Unnamed: 0,Status,ConnID,substrate_name,substrate_form,substrate_label,substrate_location
62,forCB,Conn063,Anthocyanin-accumulation,process,Process,cytoplasm
63,forCB,Conn064,Trichome-initiation,process,Process,cytoplasm


In [113]:
new_subdf[~new_subdf['product_label'].isin(["PlantCoding", "PlantNonCoding"])][["Status", "ConnID"] + product_cols_wo_homologues]

Unnamed: 0,Status,ConnID,product_name,product_form,product_label,product_location
44,_TBD,Conn045,COI1|OMR1,complex,Complex,nucleus
62,forCB,Conn063,Anthocyanin-accumulation,process_active,Process,cytoplasm
63,forCB,Conn064,Trichome-initiation,process_active,Process,cytoplasm
92,_TBD,Conn093,"NPR1|PAD4|TGA2,5,6",complex,Complex,nucleus
115,_TBD,Conn116,"EDS5|NPR1|TGA2,5,6",complex,Complex,nucleus
117,_TBD,Conn118,"EDS1|NPR1|TGA2,5,6",complex,Complex,nucleus
119,_TBD,Conn120,"NPR1|PAD4|TGA2,5,6",complex,Complex,nucleus
148,_TBD,Conn149,GSTU24|ROS,complex,Complex,nucleus


In [114]:
exploded_new_subdf = helpers.unnesting(new_subdf, catalyst_cols)

In [115]:
edge_label = "ACTIVATES"
reaction_type = 'transcriptional/translational induction'
reaction_type_nice = 'transcriptional_translational_induction'
act_subdf = exploded_new_subdf[exploded_new_subdf['reaction_type']==reaction_type]

# substrate to catalyst edge
f =  f"{reaction_type_nice}-{edge_label}.tsv" 
want_cols = reaction_standard_columns + catalyst_cols + product_cols + substrate_cols
act_subdf[want_cols].to_csv(f"../data/import/{f}", sep="\t", index=None)

In [116]:
query = helpers.make_create_requlatory_edge_query(f, edge_label,
                                        "catalyst", "substrate", "product"
                              )
qr = graph.run(query)
pretty_print_result('-', act_subdf, qr, 'relationships')

-                   	 62	 62


In [117]:
edge_label = "INHIBITS"
reaction_type = 'transcriptional/translational repression'
reaction_type_nice = 'transcriptional_translational_repression'
inh_subdf = exploded_new_subdf[exploded_new_subdf['reaction_type']==reaction_type]

# substrate to catalyst edge
f =  f"{reaction_type_nice}-{edge_label}.tsv" 
want_cols = reaction_standard_columns + catalyst_cols + product_cols + substrate_cols
inh_subdf[want_cols].to_csv(f"../data/import/{f}", sep="\t", index=None)

In [118]:
query = helpers.make_create_requlatory_edge_query(f, edge_label,
                                        "catalyst", "substrate", "product"
                              )
qr = graph.run(query)
pretty_print_result('-', inh_subdf, qr, 'relationships')

-                   	 19	 19


## cleavage/auto-cleavage

In [119]:
#Image(filename='./reaction_types/activation/cleavage_autocleavage.png')

In [120]:
key = 'cleavage/auto-cleavage'
reaction_type = 'CLEAVAGE_AUTOCLEAVAGE'
subdf = df_edges[df_edges['reaction_type'] == key].copy()
print(reaction_type, subdf.shape[0])

CLEAVAGE_AUTOCLEAVAGE 29


In [121]:
cleavage_wo_catalyst = subdf[subdf['input2_ID'].isna() & subdf['input3_ID'].isna()]
cleavage_w_catalyst = subdf[~(subdf['input2_ID'].isna() & subdf['input3_ID'].isna())]

In [122]:
rename_target(cleavage_w_catalyst, 'output1', 'product')
rename_target(cleavage_w_catalyst, 'input1',  'substrate')
rename_target(cleavage_w_catalyst, 'input2',  'catalyst')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  subdf[new_col] = subdf[old_col]


In [123]:
# substrate to node edge

f =  f"{reaction_type}-{t}-substrate_edges.tsv" 
want_cols = reaction_standard_columns + substrate_cols + catalyst_cols
cleavage_w_catalyst.to_csv(f"../data/import/{f}", sep="\t", index=None)

query = helpers.make_create_reaction_edge_query(f, 'SUBSTRATE',
                                                "substrate", "catalyst"
                              )
qr = graph.run(query)
pretty_print_result("-", cleavage_w_catalyst, qr, 'relationships')

-                   	 19	 19


In [124]:
# catalyst to product edge

f =  f"{reaction_type}-{t}-product_edges.tsv" 
want_cols = reaction_standard_columns + catalyst_cols + product_cols
cleavage_w_catalyst.to_csv(f"../data/import/{f}", sep="\t", index=None)

query = helpers.make_create_reaction_edge_query(f, 'PRODUCT',
                                                "catalyst" , "product"
                              )
qr = graph.run(query)
pretty_print_result("-", cleavage_w_catalyst, qr, 'relationships')

-                   	 19	 19


In [125]:
rename_target(cleavage_wo_catalyst, 'output1', 'product')
rename_target(cleavage_wo_catalyst, 'input1',  'substrate')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  subdf[new_col] = subdf[old_col]


In [126]:
# make pseudo nodes
f = f"{reaction_type}-pseudo.tsv"
cleavage_wo_catalyst[reaction_standard_columns].to_csv(f"../data/import/{f}", sep="\t", index=None)
query = helpers.pseudo_node_query(f, name="line.reaction_id")

In [127]:
qr = graph.run(query)
pretty_print_result('PseudoNode', cleavage_wo_catalyst, qr, 'nodes')

PseudoNode          	 10	 10


In [128]:
# binding substrate edges
want_cols = reaction_standard_columns + substrate_cols

for t, this_subdf in cleavage_wo_catalyst.groupby("substrate_label"):
    f = f"{reaction_type}-{t}-substrate_edges.tsv"
    this_subdf[want_cols].to_csv(f"../data/import/{f}", index=None, sep="\t")
    
    query = helpers.make_create_reaction_edge_query(f, 'SUBSTRATE', 
                                                    "substrate", "dne",
                                                    source_label=t, 
                                                    target_label="PseudoNode", 
                                                    target_name="line.reaction_id"
                                                   )

    qr = graph.run(query)

    pretty_print_result(t, this_subdf, qr, 'relationships')


Metabolite          	 10	 10


In [129]:
f = f"{reaction_type}-product_edges.tsv"
want_cols = reaction_standard_columns + product_cols
cleavage_wo_catalyst[want_cols].to_csv(f'../data/import/{f}', sep="\t", index=None)

In [130]:
# binding product edges
query = helpers.make_create_reaction_edge_query(f, "PRODUCT", 
                                                "dne", "product",
                                                source_name="line.reaction_id", 
                                                source_label="PseudoNode"
                                               )
qr = graph.run(query)
pretty_print_result('-', cleavage_wo_catalyst, qr, 'relationships')

-                   	 10	 10


## undefined

In [131]:
#Image(filename='./reaction_types/...png')

In [132]:
key = 'undefined'
reaction_type = 'UNDEFINED'
subdf = df_edges[df_edges['reaction_type'] == key].copy()
print(reaction_type, subdf.shape[0])

UNDEFINED 9


In [133]:
rename_target(subdf, 'output1', 'product')
rename_target(subdf, 'input2',  'substrate')
rename_target(subdf, 'input1',  'catalyst')

In [134]:
activation_subdf = subdf[subdf['ReactionEffect'] == 'activation' ]
inhibition_subdf = subdf[subdf['ReactionEffect'] == 'inhibition' ]

In [135]:
edge_label = "ACTIVATES"

# substrate to catalyst edge
f =  f"{reaction_type}-{edge_label}.tsv" 
want_cols = reaction_standard_columns + catalyst_cols + substrate_cols
activation_subdf[want_cols].to_csv(f"../data/import/{f}", sep="\t", index=None)

In [136]:
query = helpers.make_create_requlatory_edge_query(f, edge_label,
                                        "catalyst", "substrate", "product"
                              )
qr = graph.run(query)
pretty_print_result('-', activation_subdf, qr, 'relationships')

-                   	  1	  1


In [137]:
edge_label = "INHIBITS"

# substrate to catalyst edge
f =  f"{reaction_type}-{edge_label}.tsv" 
want_cols = reaction_standard_columns + catalyst_cols + product_cols + substrate_cols
inhibition_subdf[want_cols].to_csv(f"../data/import/{f}", sep="\t", index=None)

In [138]:
query = helpers.make_create_requlatory_edge_query(f, edge_label,
                                        "catalyst", "substrate", "product"
                              )
qr = graph.run(query)
pretty_print_result('-', inhibition_subdf, qr, 'relationships')

-                   	  8	  8


# END 