In [1]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:80% !important; }</style>"))

# Import neo4j DB: 4/4

Code to translate v2.7.4_PIS-model.xlsx to neo4j database. 

## Setup

In [2]:
import pandas as pd
import re
import numpy as np
import os
from IPython.display import Image
from IPython.display import display

In [3]:
from py2neo import Graph, Node, Relationship

In [4]:
import helpers

In [5]:
from importlib import reload

Connect to graph via docker-compose link. See http://localhost:7474/browser/

In [6]:
graph = Graph(host="neo4j")

In [7]:
from pathlib import Path

base_path = Path("..")
parsed_path = base_path / "data" / "parsed"

In [8]:
all_species = ['ath', 'osa', 'stu', 'sly']

### Reactions sheet

In [9]:
df_edges = pd.read_csv(parsed_path / "edges-sheet.tsv", sep="\t", index_col=0)

In [10]:
df_edges.head()

Unnamed: 0,Status,AddedBy,ConnID,Species,input1_ID,input1_level,input1_localisation,input1_type,input2_ID,input2_level,...,output1_ath_homologues,output1_stu_homologues,output1_sly_homologues,output1_osa_homologues,input1_location,input2_location,input3_location,output1_location,reaction_type,reaction_id
0,forCB,KG,Conn001,ath,L-Met,family,ER,metabolite,SAMS,clade,...,,,,,endoplasmic reticulum,endoplasmic reticulum,putative:cytoplasm,endoplasmic reticulum,catalysis/auto-catalysis,rx00001
1,forCB,KG,Conn002,ath,SAMe,family,ER,metabolite,ACS,family,...,,,,,endoplasmic reticulum,endoplasmic reticulum,putative:cytoplasm,endoplasmic reticulum,catalysis/auto-catalysis,rx00002
2,forCB,KG,Conn003,ath,ACC,family,ER,metabolite,ACO,family,...,,,,,endoplasmic reticulum,endoplasmic reticulum,putative:cytoplasm,endoplasmic reticulum,catalysis/auto-catalysis,rx00003
3,forCB,KG,Conn004,ath,Cu2+,family,cytoplasm,metabolite,HMA,family,...,,,,,cytoplasm,endoplasmic reticulum,putative:cytoplasm,endoplasmic reticulum,translocation,rx00004
4,forCB,KG,Conn005,ath,ETR,family,ER,protein,Cu2+,family,...,"AT3G04580,AT2G40940,AT1G66340,AT3G23150,AT1G04310",,,,endoplasmic reticulum,endoplasmic reticulum,putative:cytoplasm,endoplasmic reticulum,protein activation,rx00005


In [11]:
def generate_list(subdf, ids, new_name, homologues=True):
    col_suffixes = ['_newID', '_location', '_label', '_form']
    new_col_suffixes = ['_name', '_location', '_label', '_form']

    if homologues:
        col_suffixes += homologue_cols
        new_col_suffixes += homologue_cols
        
    for old_suf, new_suf in zip(col_suffixes, new_col_suffixes):
        new_col = new_name + new_suf
        old_cols = [id_ + old_suf for id_ in ids]
        subdf[new_col] = subdf[old_cols].apply(lambda x: [i for i in x.values], axis=1)
        
        
def rename_target(subdf, id_, new_name, homologues=True):
    col_suffixes = ['_newID', '_location', '_label', '_form']
    new_col_suffixes = ['_name', '_location', '_label', '_form']

    if homologues:
        col_suffixes += homologue_cols
        new_col_suffixes += homologue_cols

    for old_suf, new_suf in zip(col_suffixes, new_col_suffixes):
        new_col = new_name + new_suf
        old_col = id_ + old_suf
        subdf[new_col] = subdf[old_col]

In [12]:
def get_x_nodes(df, x):
    rows_with_x = set()
    for i, row in df.iterrows():
        for col_prefix in ['input1', 'input2', 'input3', 'output1']:
            if row[col_prefix + "_newID"] in x:
                rows_with_x.add(i)
    return rows_with_x

In [13]:
def number_input_different(df, homologues=True, catalyst=False):
    ''' If catalyst is True, it is the last "input" col. '''

    if catalyst:
        # two inputs, input2 -> catalyst
        subdf2 = df[df["input3_newID"].isna()].copy()
        generate_list(subdf2, ['input1'], 'substrate', homologues=homologues)
        rename_target(subdf2, 'input2', 'catalyst', homologues=homologues)

        # three inputs, input3 -> catalyst
        subdf3 = df[~df["input3_newID"].isna()].copy()
        generate_list(subdf3, ['input1', 'input2'], 'substrate', homologues=homologues)        
        rename_target(subdf3, 'input3', 'catalyst', homologues=homologues)
        
    else:
        # two inputs
        subdf2 = df[df["input3_newID"].isna()].copy()
        generate_list(subdf2, ['input1', 'input2'], 'substrate', homologues=homologues)

        # three inputs
        subdf3 = df[~df["input3_newID"].isna()].copy()
        generate_list(subdf3, ['input1', 'input2', 'input3'], 'substrate', homologues=homologues)
    
    # combine
    new_subdf = subdf2.append(subdf3)
    rename_target(new_subdf, 'output1', 'product', homologues=homologues)
    
    return new_subdf

In [14]:
# transcription genes
substrate_cols_wo_homologues = ['substrate_name', 'substrate_form', 'substrate_label', 'substrate_location']
product_cols_wo_homologues = ['product_name', 'product_form', 'product_label',  'product_location']
catalyst_cols_wo_homologues = ['catalyst_name', 'catalyst_form', 'catalyst_label', 'catalyst_location']

homologue_cols = [f"_{x}_homologues" for x in all_species]

substrate_cols = [ f'substrate{x}' for x in ['_name', '_label', '_form', '_location']] +\
                [f"substrate{x}" for x in homologue_cols]
catalyst_cols = [ f'catalyst{x}' for x in ['_name', '_label', '_form', '_location']] +\
                [f"catalyst{x}" for x in homologue_cols] 
product_cols = [ f'product{x}' for x in ['_name', '_label', '_form', '_location']] +\
                [f"product{x}" for x in homologue_cols]

reaction_standard_columns = ['AddedBy', 'Species', 
       'AdditionalInfo',  'external_links', 'trust_level',
       'ModelV', 'ReactionEffect', 'reaction_type', 'Modifications', 'reaction_id']

In [15]:
def read_dict(file):
    d = {}
    with open(file, "r") as f:
        for line in f:
            key, value = line.strip().split("\t")
            d[key] = value
    return d

In [16]:
node_to_family = read_dict(parsed_path / "node_to_family.tsv")
clade_to_family = read_dict(parsed_path / "clade_to_family.tsv")

In [17]:
def node_id_to_node_label(id_):

    query = '''MATCH (s) WHERE s.name=$x 
               RETURN s.name AS substrate_name, labels(s) AS substrate_label'''
    
    cursor = graph.run(query, x=id_)
    d = cursor.data()
    
    if len(d) == 0:
        print(id_, d, "no hit")
        return ""
    elif len(d) == 1:
        s = set(d[0]['substrate_label']) - set(['Family'])
        return s.pop()
    else:
        print(id_, d, 'multiple hits') # should be impossible
        return ""

### Add complexes

In [18]:
label = "Complex"
q = '''MATCH (n:%s) RETURN DISTINCT n.name'''%label
already_defined_complexes = set([d['n.name'] for d in graph.run(q).data()])
already_defined_complexes

{'SCF', 'WD/bHLH/MYB', 'ribosome'}

In [19]:
complexes_to_add = set(pd.read_csv(parsed_path / "complexes_to_add.tsv", sep="\t", header=None)[0]) - set(already_defined_complexes)
len(complexes_to_add)

91

In [20]:
rows_w_new_complex = get_x_nodes(df_edges, complexes_to_add)

In [21]:
want_cols = ['reaction_type', 'Modifications']
for prefix in ['input1', 'input2', 'input3', 'output1']:
    want_cols += [f"{prefix}_{x}" for x in ['newID', 'location', 'label', 'form']]

df_new_complex = df_edges.loc[rows_w_new_complex, want_cols]

In [22]:
# first complexes defined by binding/oglimerisation reactions
key = 'binding/oligomerisation'
subdf = df_new_complex.loc[df_new_complex['reaction_type'] == key]

binding_w_catalyst = subdf.loc[subdf['Modifications'] == 'with catalyst']
binding_wo_catalyst = subdf.loc[subdf['Modifications'] != 'with catalyst']

In [23]:
subdf_wo_catalyst = number_input_different(binding_wo_catalyst, homologues=False)
subdf_w_catalyst = number_input_different(binding_w_catalyst, homologues=False, catalyst=True)

In [24]:
subdf_wo_catalyst = subdf_wo_catalyst[['substrate_name', 'substrate_label', 'substrate_form', 'output1_newID', 'product_name']]
subdf_w_catalyst = subdf_w_catalyst[['substrate_name', 'substrate_label', 'substrate_form', 'output1_newID', 'product_name']]

In [25]:
new_subdf = pd.concat([subdf_wo_catalyst, subdf_w_catalyst])

In [26]:
new_subdf.drop_duplicates('product_name', keep='first', inplace=True)

In [27]:
new_subdf.head()

Unnamed: 0,substrate_name,substrate_label,substrate_form,output1_newID,product_name
5,"[CTR, ETR]","[PlantCoding, PlantCoding]","[protein_active, protein_active]",CTR|ETR,CTR|ETR
7,"[ET, ETR]","[Metabolite, PlantCoding]","[metabolite, protein_active]",ET|ETR,ET|ETR
10,"[ETP, SCF]","[PlantCoding, Complex]","[protein, complex]",ETP|SCF,ETP|SCF
15,"[EBF, SCF]","[PlantCoding, Complex]","[protein, complex]",EBF|SCF,EBF|SCF
17,"[JAZ, EIN3(like)]","[PlantCoding, PlantCoding]","[protein, protein_active]",EIN3(like)|JAZ,EIN3(like)|JAZ


In [28]:
binding_defined_complexes = set(new_subdf['product_name'].values)

In [29]:
len(binding_defined_complexes)

85

In [30]:
# other complexes
other_complexes = set()
for i , row in df_new_complex.iterrows():
    for col_prefix in ['input1', 'input2', 'input3', 'output1']:
        if row[col_prefix + "_label"] == 'Complex':
            c = row[col_prefix + "_newID"]
            if not ((c in binding_defined_complexes) or (c in already_defined_complexes)):
                other_complexes.add(row[col_prefix + "_newID"])

In [31]:
other_complexes

{'CAM|Ca2+',
 'COI1|OMR1',
 'EDS1|NPR1|TGA2,5,6',
 'EDS5|NPR1|TGA2,5,6',
 'GSTU24|ROS',
 'NPR1|PAD4|TGA2,5,6'}

In [32]:
other_complexes = pd.DataFrame(other_complexes, columns=['output1_newID'])

In [33]:
def get_subunits(x):
    if '|' in x:
        return x.split('|')
    else:
        return []

In [34]:
other_complexes['substrate_og_name'] = other_complexes['output1_newID'].apply(get_subunits)
other_complexes['substrate_form'] = other_complexes['substrate_og_name'].apply(lambda x: ["" for c in x])
other_complexes['product_name'] = other_complexes['output1_newID']

In [35]:
other_complexes

Unnamed: 0,output1_newID,substrate_og_name,substrate_form,product_name
0,GSTU24|ROS,"[GSTU24, ROS]","[, ]",GSTU24|ROS
1,"EDS1|NPR1|TGA2,5,6","[EDS1, NPR1, TGA2,5,6]","[, , ]","EDS1|NPR1|TGA2,5,6"
2,CAM|Ca2+,"[CAM, Ca2+]","[, ]",CAM|Ca2+
3,"EDS5|NPR1|TGA2,5,6","[EDS5, NPR1, TGA2,5,6]","[, , ]","EDS5|NPR1|TGA2,5,6"
4,COI1|OMR1,"[COI1, OMR1]","[, ]",COI1|OMR1
5,"NPR1|PAD4|TGA2,5,6","[NPR1, PAD4, TGA2,5,6]","[, , ]","NPR1|PAD4|TGA2,5,6"


In [36]:
def get_name(ids_):
    names = []
    for x in ids_:
        label1 = node_id_to_node_label(x)
        if not (label1 == ''):
            names.append(x)
        else:
            if x in clade_to_family:
                names.append(clade_to_family[x])
                print('using as clade id to get family\t', x, "\t", clade_to_family[x])

            elif x in node_to_family:
                names.append(node_to_family[x])
                print('using as node id to get family\t', x, "\t", node_to_family[x])

            else:
                names.append(x)
    
    return names

In [37]:
def components_missing_label(ids_):
    labels = []
    for x in ids_:
        label1 = node_id_to_node_label(x)
        if label1 == '':
            if x in clade_to_family:
                label1 = node_id_to_node_label(clade_to_family[x])
                print('using as clade id to get family label\t', x, "\t", label1)
            elif x in node_to_family:
                label1 = node_id_to_node_label(node_to_family[x])
                print('using as node id to get family label\t', x, "\t", label1)
        
        if label == '':
            print(x)
        labels.append(label1)
    return labels


In [38]:
other_complexes['substrate_name'] = other_complexes['substrate_og_name'].apply(get_name)
other_complexes['substrate_label'] = other_complexes['substrate_name'].apply(components_missing_label)

GSTU24 [] no hit
using as node id to get family	 GSTU24 	 GST
NPR1 [] no hit
using as node id to get family	 NPR1 	 NPR
TGA2,5,6 [] no hit
using as clade id to get family	 TGA2,5,6 	 TGA
NPR1 [] no hit
using as node id to get family	 NPR1 	 NPR
TGA2,5,6 [] no hit
using as clade id to get family	 TGA2,5,6 	 TGA
OMR1 [] no hit
using as clade id to get family	 OMR1 	 OMR
NPR1 [] no hit
using as node id to get family	 NPR1 	 NPR
TGA2,5,6 [] no hit
using as clade id to get family	 TGA2,5,6 	 TGA


In [39]:
del other_complexes['substrate_og_name']

In [40]:
new_complexes = other_complexes.append(new_subdf, sort=True)

In [41]:
new_complexes.columns

Index(['output1_newID', 'product_name', 'substrate_form', 'substrate_label',
       'substrate_name'],
      dtype='object')

In [42]:
# save new complexes 
label = 'Complex'
f = f'{label}-new-components.tsv'
want_cols = 'product_name'
new_complexes[want_cols].to_csv(f'../data/import/{f}', sep="\t", index=None, header=True)

In [43]:
print(new_complexes.shape[0])

91


In [44]:
query = helpers.bioelement_node_query(f, "Complex", 
                           n_name="line.product_name")
#print(query)
qr = graph.run(query)
if not new_complexes.shape[0] == qr.stats()['nodes_created']:
     raise Exception

In [45]:
exploded_new_subdf = helpers.unnesting(new_complexes, ['substrate_name', 'substrate_label', 'substrate_form']).drop_duplicates()

In [46]:
exploded_new_subdf[exploded_new_subdf['product_name']=='D14|MAX2|SCF']

Unnamed: 0,substrate_name,substrate_label,substrate_form,output1_newID,product_name
318,&alpha;/&beta; hydroxylase,PlantCoding,protein_active,D14|MAX2|SCF,D14|MAX2|SCF
318,F-box/LRR-repeat,PlantCoding,protein_active,D14|MAX2|SCF,D14|MAX2|SCF
318,SCF,Complex,complex,D14|MAX2|SCF,D14|MAX2|SCF


In [47]:
exploded_new_subdf.tail()

Unnamed: 0,substrate_name,substrate_label,substrate_form,output1_newID,product_name
312,CAT,PlantCoding,protein,ACX3|CAT2,ACX3|CAT2
312,ACX,PlantCoding,protein,ACX3|CAT2,ACX3|CAT2
318,&alpha;/&beta; hydroxylase,PlantCoding,protein_active,D14|MAX2|SCF,D14|MAX2|SCF
318,F-box/LRR-repeat,PlantCoding,protein_active,D14|MAX2|SCF,D14|MAX2|SCF
318,SCF,Complex,complex,D14|MAX2|SCF,D14|MAX2|SCF


In [48]:
# component to complex edges
edge_type = 'COMPONENT_OF'
want_cols = ['substrate_name', 'substrate_form', 'substrate_label', 'product_name']

for t, this_subdf in exploded_new_subdf[exploded_new_subdf['substrate_label'].isin(helpers.node_labels)].groupby("substrate_label"):
    f = f'{edge_type}-{label}-{t}-edges.tsv'  
    this_subdf[want_cols].to_csv(f"../data/import/{f}", index=None, sep="\t")

    query = helpers.make_create_type_of_edge_query(f, 'COMPONENT_OF',
                           source_label=t, target_label="Complex",
                           source_name="line.substrate_name", target_name="line.product_name",
                           #source_form="line.substrate_form"
                          )
    #print(query)
    qr = graph.run(query)

    print(t, this_subdf.shape[0], qr.stats()['relationships_created'])    
    if not this_subdf.shape[0] == qr.stats()['relationships_created']:
        print("\tnot all edges created")

Complex 16 16
ForeignCoding 29 29
ForeignEntity 2 2
Metabolite 10 10
MetaboliteFamily 1 1
PlantAbstract 4 4
PlantCoding 128 128
Process 2 2


# END 