In [1]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:80% !important; }</style>"))

# Import neo4j DB: 4/?

Code to translate v2.7.4_PIS-model.xlsx to neo4j database. 

## Setup

In [2]:
import pandas as pd
import re
import numpy as np
import os
from IPython.display import Image
from IPython.display import display

In [3]:
from py2neo import Graph, Node, Relationship

In [4]:
import helpers

In [5]:
from importlib import reload

Connect to graph via docker-compose link. See http://localhost:7474/browser/

In [6]:
graph = Graph(host="neo4j")

In [7]:
from pathlib import Path

base_path = Path("..")
parsed_path = base_path / "data" / "parsed"

In [8]:
all_species = ['ath', 'osa', 'stu', 'sly']

### Reactions sheet

In [9]:
df_edges = pd.read_csv(parsed_path / "edges-sheet.tsv", sep="\t", index_col=0)
df_edges.head()

Unnamed: 0,Status,AddedBy,ConnID,Species,input1_ID,input1_level,input1_localisation,input1_type,input2_ID,input2_level,...,reaction_type,reaction_id,input1_label,input2_label,input3_label,output1_label,input1_newID,input2_newID,input3_newID,output1_newID
0,forCB,KG,Conn001,ath,L-Met,family,ER,metabolite,SAMS,clade,...,catalysis,rx00001,Metabolite,PlantCoding,,Metabolite,L-Met,"SAMS[AT1G02500,AT2G36880,AT3G17390,AT4G01850]",,SAMe
1,forCB,KG,Conn002,ath,SAMe,family,ER,metabolite,ACS,family,...,catalysis,rx00002,Metabolite,PlantCoding,,Metabolite,SAMe,"ACS[AT1G01480,AT2G22810,AT3G49700,AT3G61510,AT...",,ACC
2,forCB,KG,Conn003,ath,ACC,family,ER,metabolite,ACO,family,...,catalysis,rx00003,Metabolite,PlantCoding,,Metabolite,ACC,"ACO[AT1G05010,AT1G12010,AT1G62380,AT1G77330,AT...",,ET
3,forCB,KG,Conn004,ath,Cu2+,family,cytoplasm,metabolite,HMA,family,...,translocation,rx00004,Metabolite,PlantCoding,,Metabolite,Cu2+,"HMA[AT1G63440,AT5G44790]",,Cu2+
4,forCB,KG,Conn005,ath,ETR,family,ER,protein,Cu2+,family,...,protein activation,rx00005,PlantCoding,Metabolite,,PlantCoding,"ETR[AT1G04310,AT1G66340,AT2G40940,AT3G04580,AT...",Cu2+,,"ETR[AT1G04310,AT1G66340,AT2G40940,AT3G04580,AT..."


In [10]:
translate_functional_clusters = pd.read_csv(parsed_path / "functional_clusters.tsv", sep="\t")
translate_functional_clusters = translate_functional_clusters.set_index(['node_name', 'level', 'species'])
translate_functional_clusters.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,family,reaction_id,members,ath_homologues,stu_homologues,sly_homologues,osa_homologues,functional_cluster_name,labels
node_name,level,species,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
AAO,family,ath,AAO,rx00081,"AT1G04580,AT2G27150,AT5G20960","AT2G27150,AT5G20960,AT1G04580",,,,"AAO[AT1G04580,AT2G27150,AT5G20960]",PlantCoding
ACH,family,ath,ACH,rx00038,"AT2G30720,AT5G48370","AT2G30720,AT5G48370",,,,"ACH[AT2G30720,AT5G48370]",PlantCoding
ACO,family,ath,ACO,rx00003,"AT1G05010,AT1G12010,AT1G62380,AT1G77330,AT2G19590","AT1G62380,AT1G12010,AT2G19590,AT1G05010,AT1G77330",,,,"ACO[AT1G05010,AT1G12010,AT1G62380,AT1G77330,AT...",PlantCoding
ACS,family,ath,ACS,rx00002,"AT1G01480,AT2G22810,AT3G49700,AT3G61510,AT4G08...","AT4G26200,AT4G11280,AT3G49700,AT3G61510,AT4G37...",,,,"ACS[AT1G01480,AT2G22810,AT3G49700,AT3G61510,AT...",PlantCoding
ACS2,node,ath,ACS,rx00248,AT1G01480,AT1G01480,,,,ACS2[AT1G01480],PlantCoding


In [11]:
def generate_list(subdf, ids, new_name, homologues=True):
    col_suffixes = ['_newID', '_location', '_label', '_form']
    new_col_suffixes = ['_name', '_location', '_label', '_form']

    if homologues:
        col_suffixes += homologue_cols
        new_col_suffixes += homologue_cols
        
    for old_suf, new_suf in zip(col_suffixes, new_col_suffixes):
        new_col = new_name + new_suf
        old_cols = [id_ + old_suf for id_ in ids]
        subdf[new_col] = subdf[old_cols].apply(lambda x: [i for i in x.values], axis=1)
        
        
def rename_target(subdf, id_, new_name, homologues=True):
    col_suffixes = ['_newID', '_location', '_label', '_form']
    new_col_suffixes = ['_name', '_location', '_label', '_form']

    if homologues:
        col_suffixes += homologue_cols
        new_col_suffixes += homologue_cols

    for old_suf, new_suf in zip(col_suffixes, new_col_suffixes):
        new_col = new_name + new_suf
        old_col = id_ + old_suf
        subdf[new_col] = subdf[old_col]

In [12]:
def get_x_nodes(df, x):
    rows_with_x = set()
    for i, row in df.iterrows():
        for col_prefix in ['input1', 'input2', 'input3', 'output1']:
            if row[col_prefix + "_ID"] in x:
                rows_with_x.add(i)
    return rows_with_x

In [13]:
def number_input_different(df, homologues=True, catalyst=False):
    ''' If catalyst is True, it is the last "input" col. '''

    if catalyst:
        # two inputs, input2 -> catalyst
        subdf2 = df[df["input3_newID"].isna()].copy()
        generate_list(subdf2, ['input1'], 'substrate', homologues=homologues)
        rename_target(subdf2, 'input2', 'catalyst', homologues=homologues)

        # three inputs, input3 -> catalyst
        subdf3 = df[~df["input3_newID"].isna()].copy()
        generate_list(subdf3, ['input1', 'input2'], 'substrate', homologues=homologues)        
        rename_target(subdf3, 'input3', 'catalyst', homologues=homologues)
        
    else:
        # two inputs
        subdf2 = df[df["input3_newID"].isna()].copy()
        generate_list(subdf2, ['input1', 'input2'], 'substrate', homologues=homologues)

        # three inputs
        subdf3 = df[~df["input3_newID"].isna()].copy()
        generate_list(subdf3, ['input1', 'input2', 'input3'], 'substrate', homologues=homologues)
    
    # combine
    new_subdf = subdf2.append(subdf3)
    rename_target(new_subdf, 'output1', 'product', homologues=homologues)
    
    return new_subdf

In [14]:
# transcription genes
substrate_cols_wo_homologues = ['substrate_name', 'substrate_form', 'substrate_label', 'substrate_location']
product_cols_wo_homologues = ['product_name', 'product_form', 'product_label',  'product_location']
catalyst_cols_wo_homologues = ['catalyst_name', 'catalyst_form', 'catalyst_label', 'catalyst_location']

# homologue_cols = [f"_{x}_homologues" for x in all_species]

substrate_cols = [ f'substrate{x}' for x in ['_name', '_label', '_form', '_location']] #+#\
               # [f"substrate{x}" for x in homologue_cols]
catalyst_cols = [ f'catalyst{x}' for x in ['_name', '_label', '_form', '_location']] #+\
                #[f"catalyst{x}" for x in homologue_cols] 
product_cols = [ f'product{x}' for x in ['_name', '_label', '_form', '_location']] #+\
               # [f"product{x}" for x in homologue_cols]

reaction_standard_columns = ['AddedBy', 'Species', 
       'AdditionalInfo',  'external_links', 'trust_level',
       'ModelV', 'ReactionEffect', 'reaction_type', 'Modifications', 'reaction_id']



In [15]:
# def read_dict(file):
#     d = {}
#     with open(file, "r") as f:
#         for line in f:
#             key, value = line.strip().split("\t")
#             d[key] = value
#     return d

### Add complexes

In [16]:
label = "Complex"
q = '''MATCH (n:%s) RETURN DISTINCT n.name'''%label
already_defined_complexes = set([d['n.name'] for d in graph.run(q).data()])
already_defined_complexes

{'SCF', 'WD/bHLH/MYB', 'ribosome'}

In [17]:
complexes_to_add = set(pd.read_csv(parsed_path / "complexes_to_add.tsv", sep="\t", header=None)[0]) - set(already_defined_complexes)
len(complexes_to_add)

91

In [18]:
rows_w_new_complex = get_x_nodes(df_edges, complexes_to_add)

In [19]:
want_cols = ['reaction_type', 'Modifications', 'Species']
for prefix in ['input1', 'input2', 'input3', 'output1']:
    want_cols += [f"{prefix}_{x}" for x in ['newID', 'location', 'label', 'form']]

df_new_complex = df_edges.loc[rows_w_new_complex, want_cols]

In [20]:
# first complexes defined by binding/oglimerisation reactions
key = 'binding/oligomerisation'
subdf = df_new_complex.loc[df_new_complex['reaction_type'] == key]

binding_w_catalyst = subdf.loc[subdf['Modifications'] == 'with catalyst']
binding_wo_catalyst = subdf.loc[subdf['Modifications'] != 'with catalyst']

In [21]:
subdf_wo_catalyst = number_input_different(binding_wo_catalyst, homologues=False)
subdf_w_catalyst = number_input_different(binding_w_catalyst, homologues=False, catalyst=True)

In [22]:
subdf_wo_catalyst = subdf_wo_catalyst[['substrate_name', 'substrate_label', 'substrate_form', 'output1_newID', 'product_name']]
subdf_w_catalyst = subdf_w_catalyst[['substrate_name', 'substrate_label', 'substrate_form', 'output1_newID', 'product_name']]

In [23]:
new_subdf = pd.concat([subdf_wo_catalyst, subdf_w_catalyst])

In [24]:
new_subdf.drop_duplicates('product_name', keep='first', inplace=True)

In [25]:
new_subdf['substrate_label'] = new_subdf['substrate_label'].apply(lambda x: [z + ":FunctionalCluster" if (z in helpers.plant_node_labels) else z  for z in x ])

In [26]:
new_subdf.head()

Unnamed: 0,substrate_name,substrate_label,substrate_form,output1_newID,product_name
5,"[CTR[AT5G03730], ETR[AT1G04310,AT1G66340,AT2G4...","[PlantCoding:FunctionalCluster, PlantCoding:Fu...","[protein_active, protein_active]",CTR|ETR,CTR|ETR
7,"[ET, ETR[AT1G04310,AT1G66340,AT2G40940,AT3G045...","[Metabolite, PlantCoding:FunctionalCluster]","[metabolite, protein_active]",ET|ETR,ET|ETR
10,"[ETP[AT3G18910,AT3G18980], SCF]","[PlantCoding:FunctionalCluster, Complex]","[protein, complex]",ETP|SCF,ETP|SCF
15,"[EBF[AT2G25490,AT5G25350], SCF]","[PlantCoding:FunctionalCluster, Complex]","[protein, complex]",EBF|SCF,EBF|SCF
17,"[JAZ[AT1G17380,AT1G19180,AT1G30135,AT1G48500,A...","[PlantCoding:FunctionalCluster, PlantCoding:Fu...","[protein, protein_active]",EIN3(like)|JAZ,EIN3(like)|JAZ


In [27]:
binding_defined_complexes = set(new_subdf['product_name'].values)

In [28]:
len(binding_defined_complexes)

85

In [30]:
# other complexes
other_complexes_set = set()
for i , row in df_new_complex.iterrows():
    for col_prefix in ['input1', 'input2', 'input3', 'output1']:
        if row[col_prefix + "_label"] == 'Complex':
            c = row[col_prefix + "_newID"]
            if not ((c in binding_defined_complexes) or (c in already_defined_complexes)):
                other_complexes_set.add((row['Species'], row[col_prefix + "_newID"]))
print(other_complexes_set)

{('ath', 'CAM|Ca2+'), ('ath', 'NPR1|PAD4|TGA2,5,6'), ('ath', 'EDS1|NPR1|TGA2,5,6'), ('ath', 'GSTU24|ROS'), ('ath/nbe', 'COI1|OMR1'), ('ath', 'EDS5|NPR1|TGA2,5,6')}


In [31]:
def get_subunits(x):
    if '|' in x:
        return x.split('|')
    else:
        return []

In [32]:
def get_name_label(x):
    ids_ = x['substrate_og_name']
    species = x['species']
    species = ','.join(species.split('/'))
    names = []
    labels = []
    levels_ = ['node', 'clade', 'family']
    for id_ in ids_:
        functional_cluster = None
        label = None
        for level_ in levels_:
            try: 
                functional_cluster = translate_functional_clusters.loc[(id_, level_, species)]['functional_cluster_name']
                label = translate_functional_clusters.loc[(id_, level_, species)]['labels']
                #print(id_, level_, functional_cluster, label)
            except:
                #print(id_, level_)
                pass

        #display(functional_cluster)
        if functional_cluster:
            label = "FunctionalCluster:" + label
        else:
            functional_cluster, label = node_id_to_node(id_)
        if not functional_cluster:
            print("ERROR: cannot identify correct subunit:", id_, level_)

            
        names.append(functional_cluster)
        labels.append(label)
    return names, labels


def clean_labels(labels):
	for x in ['Family', 'Plant', 'Foreign', 'Node']:
		if x in labels:
			labels.remove(x)
	return labels[0]

def node_id_to_node(id_):

    query = '''MATCH (s) WHERE s.name=$x 
               RETURN s.name AS name, labels(s) AS labels'''
    
    cursor = graph.run(query, x=id_)
    d = cursor.data()
    
    if len(d) == 0:
        print(id_, d, "no hit")
        return None, None
    elif len(d) == 1:
        label = clean_labels(d[0]['labels'])
        name = d[0]['name']
        return name, label
    else:
        print(id_, d, 'multiple hits') # should be impossible
        return ""

In [33]:
other_complexes = pd.DataFrame(other_complexes_set, columns=['species', 'output1_newID'])
other_complexes['substrate_og_name'] = other_complexes['output1_newID'].apply(get_subunits)
other_complexes['substrate_form'] = other_complexes['substrate_og_name'].apply(lambda x: ["" for c in x])
other_complexes['product_name'] = other_complexes['output1_newID']
other_complexes

Unnamed: 0,species,output1_newID,substrate_og_name,substrate_form,product_name
0,ath,CAM|Ca2+,"[CAM, Ca2+]","[, ]",CAM|Ca2+
1,ath,"NPR1|PAD4|TGA2,5,6","[NPR1, PAD4, TGA2,5,6]","[, , ]","NPR1|PAD4|TGA2,5,6"
2,ath,"EDS1|NPR1|TGA2,5,6","[EDS1, NPR1, TGA2,5,6]","[, , ]","EDS1|NPR1|TGA2,5,6"
3,ath,GSTU24|ROS,"[GSTU24, ROS]","[, ]",GSTU24|ROS
4,ath/nbe,COI1|OMR1,"[COI1, OMR1]","[, ]",COI1|OMR1
5,ath,"EDS5|NPR1|TGA2,5,6","[EDS5, NPR1, TGA2,5,6]","[, , ]","EDS5|NPR1|TGA2,5,6"


In [34]:
other_complexes[['substrate_name', 'substrate_label']] = other_complexes[['species', 'substrate_og_name']].apply(get_name_label, axis=1, result_type='expand')
other_complexes

Unnamed: 0,species,output1_newID,substrate_og_name,substrate_form,product_name,substrate_name,substrate_label
0,ath,CAM|Ca2+,"[CAM, Ca2+]","[, ]",CAM|Ca2+,"[CAM[AT1G66410,AT3G01830,AT4G20780], Ca2+]","[FunctionalCluster:PlantCoding, Metabolite]"
1,ath,"NPR1|PAD4|TGA2,5,6","[NPR1, PAD4, TGA2,5,6]","[, , ]","NPR1|PAD4|TGA2,5,6","[NPR1[AT1G64280], PAD4[AT3G52430], TGA2,5,6[AT...","[FunctionalCluster:PlantCoding, FunctionalClus..."
2,ath,"EDS1|NPR1|TGA2,5,6","[EDS1, NPR1, TGA2,5,6]","[, , ]","EDS1|NPR1|TGA2,5,6","[EDS1[AT3G48090], NPR1[AT1G64280], TGA2,5,6[AT...","[FunctionalCluster:PlantCoding, FunctionalClus..."
3,ath,GSTU24|ROS,"[GSTU24, ROS]","[, ]",GSTU24|ROS,"[GSTU24[AT1G17170], ROS]","[FunctionalCluster:PlantCoding, MetaboliteFamily]"
4,ath/nbe,COI1|OMR1,"[COI1, OMR1]","[, ]",COI1|OMR1,"[COI1, OMR1[AT3G10050]]","[PlantCoding, FunctionalCluster:PlantCoding]"
5,ath,"EDS5|NPR1|TGA2,5,6","[EDS5, NPR1, TGA2,5,6]","[, , ]","EDS5|NPR1|TGA2,5,6","[EDS5[AT4G39030], NPR1[AT1G64280], TGA2,5,6[AT...","[FunctionalCluster:PlantCoding, FunctionalClus..."


In [35]:
del other_complexes['substrate_og_name']
del other_complexes['species']

In [36]:
new_complexes = other_complexes.append(new_subdf, sort=True).reset_index(drop=True)
new_complexes.head()

Unnamed: 0,output1_newID,product_name,substrate_form,substrate_label,substrate_name
0,CAM|Ca2+,CAM|Ca2+,"[, ]","[FunctionalCluster:PlantCoding, Metabolite]","[CAM[AT1G66410,AT3G01830,AT4G20780], Ca2+]"
1,"NPR1|PAD4|TGA2,5,6","NPR1|PAD4|TGA2,5,6","[, , ]","[FunctionalCluster:PlantCoding, FunctionalClus...","[NPR1[AT1G64280], PAD4[AT3G52430], TGA2,5,6[AT..."
2,"EDS1|NPR1|TGA2,5,6","EDS1|NPR1|TGA2,5,6","[, , ]","[FunctionalCluster:PlantCoding, FunctionalClus...","[EDS1[AT3G48090], NPR1[AT1G64280], TGA2,5,6[AT..."
3,GSTU24|ROS,GSTU24|ROS,"[, ]","[FunctionalCluster:PlantCoding, MetaboliteFamily]","[GSTU24[AT1G17170], ROS]"
4,COI1|OMR1,COI1|OMR1,"[, ]","[PlantCoding, FunctionalCluster:PlantCoding]","[COI1, OMR1[AT3G10050]]"


In [37]:
exploded_new_subdf = helpers.unnesting(new_complexes, ['substrate_name', 'substrate_label', 'substrate_form']).drop_duplicates()
exploded_new_subdf[exploded_new_subdf['product_name']=='NPR1|PAD4|TGA2,5,6']

Unnamed: 0,substrate_name,substrate_label,substrate_form,output1_newID,product_name
1,NPR1[AT1G64280],FunctionalCluster:PlantCoding,,"NPR1|PAD4|TGA2,5,6","NPR1|PAD4|TGA2,5,6"
1,PAD4[AT3G52430],FunctionalCluster:PlantCoding,,"NPR1|PAD4|TGA2,5,6","NPR1|PAD4|TGA2,5,6"
1,"TGA2,5,6[AT3G12250,AT5G06950,AT5G06960]",FunctionalCluster:PlantCoding,,"NPR1|PAD4|TGA2,5,6","NPR1|PAD4|TGA2,5,6"


In [38]:
# save new complexes 
label = 'Complex'
f = f'{label}-new-components.tsv'
want_cols = 'product_name'
new_complexes[want_cols].to_csv(f'../data/import/{f}', sep="\t", index=None, header=True)

In [39]:
print(new_complexes.shape[0])

91


In [40]:
query = helpers.bioelement_node_query(f, "Complex", 
                           n_name="line.product_name")
#print(query)
qr = graph.run(query)
if not new_complexes.shape[0] == qr.stats()['nodes_created']:
     raise Exception

In [41]:
exploded_new_subdf[exploded_new_subdf['product_name']=='D14|MAX2|SCF']

Unnamed: 0,substrate_name,substrate_label,substrate_form,output1_newID,product_name
89,D14[AT3G03990],PlantCoding:FunctionalCluster,protein_active,D14|MAX2|SCF,D14|MAX2|SCF
89,MAX2[AT2G42620],PlantCoding:FunctionalCluster,protein_active,D14|MAX2|SCF,D14|MAX2|SCF
89,SCF,Complex,complex,D14|MAX2|SCF,D14|MAX2|SCF


In [42]:
exploded_new_subdf.tail()

Unnamed: 0,substrate_name,substrate_label,substrate_form,output1_newID,product_name
88,BAK1[AT4G33430],PlantCoding:FunctionalCluster,protein,BAK1|FLS2|flg22,BAK1|FLS2|flg22
89,D14[AT3G03990],PlantCoding:FunctionalCluster,protein_active,D14|MAX2|SCF,D14|MAX2|SCF
89,MAX2[AT2G42620],PlantCoding:FunctionalCluster,protein_active,D14|MAX2|SCF,D14|MAX2|SCF
89,SCF,Complex,complex,D14|MAX2|SCF,D14|MAX2|SCF
90,NPR1[AT1G64280],PlantCoding:FunctionalCluster,protein,NPR1|NPR1,NPR1|NPR1


In [43]:
# component to complex edges
edge_type = 'COMPONENT_OF'
want_cols = ['substrate_name', 'substrate_form', 'substrate_label', 'product_name']

for t, this_subdf in exploded_new_subdf.groupby("substrate_label"):
    f = f'{edge_type}-{label}-{t}-edges.tsv'  
    print(t, this_subdf.shape[0])
    this_subdf[want_cols].to_csv(f"../data/import/{f}", index=None, sep="\t")

    query = helpers.make_create_type_of_edge_query(f, edge_type,
                           source_label=t, target_label="Complex",
                           source_name="line.substrate_name", target_name="line.product_name",
                           #source_form="line.substrate_form"
                          )
    print(query)
    qr = graph.run(query)
    
    r_created = qr.stats()['relationships_created']
    print(t, this_subdf.shape[0], r_created)    
    if not this_subdf.shape[0] == r_created:
        print("\tnot all edges created")

Complex 16
USING PERIODIC COMMIT 500
           LOAD CSV WITH HEADERS FROM  'file:///COMPONENT_OF-Complex-Complex-edges.tsv' AS line FIELDTERMINATOR '	'
           
           MATCH (source:Complex { name:line.substrate_name}),
                 (target:Complex { name:line.product_name})
           
           CREATE (source)-[:COMPONENT_OF {
                        added_by:line.AddedBy,
                        additional_information: line.AdditionalInfo, 
                        model_version:line.ModelV,
                        model_status:line.ModelStatus,
                        
                        pathway:line.Process
                        }]->(target)
Complex 16 16
ForeignCoding 29
USING PERIODIC COMMIT 500
           LOAD CSV WITH HEADERS FROM  'file:///COMPONENT_OF-Complex-ForeignCoding-edges.tsv' AS line FIELDTERMINATOR '	'
           
           MATCH (source:ForeignCoding { name:line.substrate_name}),
                 (target:Complex { name:line.product_name})
      

# END 