In [None]:
# `display` is imported from the public `IPython.display` module: importing it
# from `IPython.core.display` is deprecated.
from IPython.display import display, HTML

# Widen the notebook container to 80% of the browser window.
display(HTML("<style>.container { width:80% !important; }</style>"))

# Import neo4j DB: 4/?

Code to translate `v2.7.4_PIS-model.xlsx` into a neo4j database.

## Setup

In [None]:
import pandas as pd
import re
import numpy as np
import os
from IPython.display import Image
from IPython.display import display

In [None]:
from py2neo import Graph, Node, Relationship

In [None]:
import helpers

In [None]:
from importlib import reload

Connect to graph via docker-compose link. See http://localhost:7474/browser/

In [None]:
# Open the py2neo connection used by every query below; the server is
# addressed by the host name "neo4j".
graph = Graph(host="neo4j")

In [None]:
from pathlib import Path

# Base directory relative to this notebook, and the folder with the parsed TSV inputs.
base_path = Path("..")
parsed_path = base_path / "data" / "parsed"

In [None]:
# Species codes. In this notebook they are only referenced by the
# commented-out `homologue_cols` definition further down.
# NOTE(review): presumably KEGG-style organism codes — confirm.
all_species = ['ath', 'osa', 'stu', 'sly']

### Reactions sheet

In [None]:
# Load the parsed reactions ("edges") sheet; the first column of the file becomes the index.
df_edges = pd.read_csv(parsed_path / "edges-sheet.tsv", sep="\t", index_col=0)
df_edges.head()

In [None]:
# Functional-cluster lookup table, indexed by (node_name, level, species).
translate_functional_clusters = (
    pd.read_csv(parsed_path / "functional_clusters.tsv", sep="\t")
    .set_index(['node_name', 'level', 'species'])
)
translate_functional_clusters.head()

In [None]:
def generate_list(subdf, ids, new_name, homologues=True):
    """Combine the per-input columns of several inputs into list-valued columns.

    For every attribute suffix, the columns ``<id><suffix>`` of all ``ids`` are
    gathered row by row into one list and stored in ``<new_name><suffix>``
    (``_newID`` is stored as ``_name``). ``subdf`` is modified in place and
    nothing is returned.

    When ``homologues`` is true, the suffixes in the module-level
    ``homologue_cols`` are processed as well; that name must be defined
    before calling with ``homologues=True``.
    """
    suffix_pairs = [
        ('_newID', '_name'),
        ('_location', '_location'),
        ('_label', '_label'),
        ('_form', '_form'),
    ]
    if homologues:
        suffix_pairs.extend((suffix, suffix) for suffix in homologue_cols)

    for source_suffix, target_suffix in suffix_pairs:
        source_columns = [id_ + source_suffix for id_ in ids]
        # One list per row, ordered like `ids`.
        subdf[new_name + target_suffix] = subdf[source_columns].apply(
            lambda row: list(row.values), axis=1
        )
        
        
def rename_target(subdf, id_, new_name, homologues=True):
    """Copy the attribute columns of one input/output under a new prefix.

    Each column ``<id_><suffix>`` is copied to ``<new_name><suffix>``
    (``_newID`` is stored as ``_name``). ``subdf`` is modified in place, the
    source columns are kept, and nothing is returned.

    When ``homologues`` is true, the suffixes in the module-level
    ``homologue_cols`` are copied as well; that name must be defined
    before calling with ``homologues=True``.
    """
    suffix_pairs = [
        ('_newID', '_name'),
        ('_location', '_location'),
        ('_label', '_label'),
        ('_form', '_form'),
    ]
    if homologues:
        suffix_pairs.extend((suffix, suffix) for suffix in homologue_cols)

    for source_suffix, target_suffix in suffix_pairs:
        subdf[new_name + target_suffix] = subdf[id_ + source_suffix]

In [None]:
def get_x_nodes(df, x):
    """Return the set of index labels of rows that reference any ID in ``x``.

    A row qualifies when at least one of its ``input1_ID``, ``input2_ID``,
    ``input3_ID`` or ``output1_ID`` values is contained in ``x``.
    """
    id_columns = [prefix + "_ID" for prefix in ('input1', 'input2', 'input3', 'output1')]
    return {
        index_label
        for index_label, row in df.iterrows()
        # The inner list is built in full so every ID column is read for every row.
        if any([row[column] in x for column in id_columns])
    }

In [None]:
def number_input_different(df, homologues=True, catalyst=False):
    """Build substrate/catalyst/product columns for reactions with two or three inputs.

    Rows are split on whether ``input3_newID`` is missing (two inputs) or
    present (three inputs). For each group the inputs are collected into
    list-valued ``substrate_*`` columns via ``generate_list``. If ``catalyst``
    is true, the last used input column (``input2`` for two-input rows,
    ``input3`` for three-input rows) is copied to ``catalyst_*`` instead of
    being treated as a substrate. ``output1`` is copied to ``product_*``.

    Parameters
    ----------
    df : pandas.DataFrame
        Reaction rows with ``input1..3_*`` and ``output1_*`` columns.
    homologues : bool
        Forwarded to ``generate_list`` / ``rename_target``.
    catalyst : bool
        Whether the last input column is a catalyst.

    Returns
    -------
    pandas.DataFrame
        Two-input rows followed by three-input rows, with the new columns
        added. ``df`` itself is not modified (copies are used).
    """
    two_inputs_mask = df["input3_newID"].isna()
    subdf2 = df[two_inputs_mask].copy()
    subdf3 = df[~two_inputs_mask].copy()

    if catalyst:
        # two inputs, input2 -> catalyst
        generate_list(subdf2, ['input1'], 'substrate', homologues=homologues)
        rename_target(subdf2, 'input2', 'catalyst', homologues=homologues)

        # three inputs, input3 -> catalyst
        generate_list(subdf3, ['input1', 'input2'], 'substrate', homologues=homologues)
        rename_target(subdf3, 'input3', 'catalyst', homologues=homologues)
    else:
        # every input is a substrate
        generate_list(subdf2, ['input1', 'input2'], 'substrate', homologues=homologues)
        generate_list(subdf3, ['input1', 'input2', 'input3'], 'substrate', homologues=homologues)

    # combine; pd.concat replaces DataFrame.append, which was removed in pandas 2.0
    new_subdf = pd.concat([subdf2, subdf3])
    rename_target(new_subdf, 'output1', 'product', homologues=homologues)

    return new_subdf

In [None]:
# Column groups used when building reaction tables.
# Homologue columns are currently disabled: `homologue_cols` (one
# "_<species>_homologues" suffix per entry of `all_species`) is not defined,
# so the *_cols lists below hold only the four basic attributes.
substrate_cols_wo_homologues = [
    'substrate_' + field for field in ('name', 'form', 'label', 'location')
]
product_cols_wo_homologues = [
    'product_' + field for field in ('name', 'form', 'label', 'location')
]
catalyst_cols_wo_homologues = [
    'catalyst_' + field for field in ('name', 'form', 'label', 'location')
]

substrate_cols = ['substrate_name', 'substrate_label', 'substrate_form', 'substrate_location']
catalyst_cols = ['catalyst_name', 'catalyst_label', 'catalyst_form', 'catalyst_location']
product_cols = ['product_name', 'product_label', 'product_form', 'product_location']

# Reaction-level columns.
reaction_standard_columns = [
    'AddedBy',
    'Species',
    'AdditionalInfo',
    'external_links',
    'trust_level',
    'ModelV',
    'ReactionEffect',
    'reaction_type',
    'Modifications',
    'reaction_id',
]



In [None]:
# def read_dict(file):
#     d = {}
#     with open(file, "r") as f:
#         for line in f:
#             key, value = line.strip().split("\t")
#             d[key] = value
#     return d

### Add complexes

In [None]:
# Names of the Complex nodes that already exist in the database.
label = "Complex"
q = f'''MATCH (n:{label}) RETURN DISTINCT n.name'''
already_defined_complexes = {record['n.name'] for record in graph.run(q).data()}
already_defined_complexes

In [None]:
# Complex names listed in the first column of complexes_to_add.tsv (the file
# has no header row), minus those already present in the database.
complexes_to_add = set(pd.read_csv(parsed_path / "complexes_to_add.tsv", sep="\t", header=None)[0]) - set(already_defined_complexes)
len(complexes_to_add)

In [None]:
# Index labels of the reactions whose input/output `_ID` columns mention a complex that still has to be added.
rows_w_new_complex = get_x_nodes(df_edges, complexes_to_add)

In [None]:
# Columns needed to define complexes: reaction metadata plus the
# ID/location/label/form of every input and of the output.
want_cols = ['reaction_type', 'Modifications', 'Species']
for prefix in ['input1', 'input2', 'input3', 'output1']:
    want_cols += [f"{prefix}_{x}" for x in ['newID', 'location', 'label', 'form']]

# `rows_w_new_complex` is a set. pandas no longer accepts a set as a `.loc`
# indexer, and a set has no defined order, so select the rows with a boolean
# mask instead; this keeps the row order of `df_edges`.
df_new_complex = df_edges.loc[df_edges.index.isin(rows_w_new_complex), want_cols]

In [None]:
# First: complexes defined by binding/oligomerisation reactions.
key = 'binding/oligomerisation'
subdf = df_new_complex.loc[df_new_complex['reaction_type'] == key]

# Split on the 'Modifications' column: reactions marked 'with catalyst' vs. all others.
binding_w_catalyst = subdf.loc[subdf['Modifications'] == 'with catalyst']
binding_wo_catalyst = subdf.loc[subdf['Modifications'] != 'with catalyst']

In [None]:
# Build substrate_*/product_* (and catalyst_*) columns; homologue columns are not used here.
subdf_wo_catalyst = number_input_different(binding_wo_catalyst, homologues=False)
subdf_w_catalyst = number_input_different(binding_w_catalyst, homologues=False, catalyst=True)

In [None]:
# Keep only the columns that describe a complex (product) and its components (substrates).
# The catalyst columns are dropped here, so a catalyst is not recorded as a component.
subdf_wo_catalyst = subdf_wo_catalyst[['substrate_name', 'substrate_label', 'substrate_form', 'output1_newID', 'product_name']]
subdf_w_catalyst = subdf_w_catalyst[['substrate_name', 'substrate_label', 'substrate_form', 'output1_newID', 'product_name']]

In [None]:
# Stack both groups: rows without a catalyst first, then rows with a catalyst.
new_subdf = pd.concat([subdf_wo_catalyst, subdf_w_catalyst])

In [None]:
# One definition per complex: if several reactions produce the same product, the first row wins.
# NOTE: modifies `new_subdf` in place.
new_subdf.drop_duplicates('product_name', keep='first', inplace=True)

In [None]:
# Append ":FunctionalCluster" to every substrate label that is listed in
# `helpers.plant_node_labels`; other labels are left unchanged.
new_subdf['substrate_label'] = new_subdf['substrate_label'].apply(lambda x: [z + ":FunctionalCluster" if (z in helpers.plant_node_labels) else z  for z in x ])

In [None]:
# Preview the binding-defined complexes.
new_subdf.head()

In [None]:
# Names of the complexes that are defined by a binding/oligomerisation reaction.
binding_defined_complexes = set(new_subdf['product_name'].values)

In [None]:
# Number of binding-defined complexes.
len(binding_defined_complexes)

In [None]:
# Other complexes: any input/output labelled 'Complex' that is neither defined
# by a binding reaction nor already in the database. Collected as
# (Species, complex name) pairs, so the same complex can appear once per
# distinct 'Species' value.
other_complexes_set = set()
for i , row in df_new_complex.iterrows():
    for col_prefix in ['input1', 'input2', 'input3', 'output1']:
        if row[col_prefix + "_label"] == 'Complex':
            c = row[col_prefix + "_newID"]
            if not ((c in binding_defined_complexes) or (c in already_defined_complexes)):
                other_complexes_set.add((row['Species'], row[col_prefix + "_newID"]))
print(other_complexes_set)

In [None]:
def get_subunits(x):
    """Split a '|'-separated complex name into its subunit names.

    Returns an empty list when the name contains no '|'.
    """
    return x.split('|') if '|' in x else []

In [None]:
def get_name_label(x):
    """Resolve the subunit IDs of one complex to node names and labels.

    Parameters
    ----------
    x : mapping / pandas.Series
        Must provide ``'substrate_og_name'`` (iterable of subunit IDs) and
        ``'species'`` (string; '/'-separated values are re-joined with ',').

    Returns
    -------
    (names, labels) : tuple of lists
        One entry per subunit ID. A subunit found in
        ``translate_functional_clusters`` gets its functional-cluster name and
        the label ``"FunctionalCluster:<labels>"``; otherwise the node is
        looked up in the database via ``node_id_to_node``. Subunits that
        cannot be resolved are reported with ``print`` and their name is
        falsy (``None``).
    """
    ids_ = x['substrate_og_name']
    species = ','.join(x['species'].split('/'))
    names = []
    labels = []
    levels_ = ['node', 'clade', 'family']
    for id_ in ids_:
        functional_cluster = None
        label = None
        # Every level is tried and the last level with an entry wins
        # (same precedence as before: family over clade over node).
        for level_ in levels_:
            try:
                cluster_row = translate_functional_clusters.loc[(id_, level_, species)]
                cluster_name = cluster_row['functional_cluster_name']
                cluster_label = cluster_row['labels']
            except KeyError:
                # No entry for this (id, level, species): expected, try the next level.
                # Only KeyError is ignored so that genuine errors are not hidden.
                continue
            # Name and label are always taken from the same row.
            functional_cluster, label = cluster_name, cluster_label

        if functional_cluster:
            label = "FunctionalCluster:" + label
        else:
            # Not a functional cluster: fall back to an exact-name lookup in the graph.
            functional_cluster, label = node_id_to_node(id_)
        if not functional_cluster:
            print("ERROR: cannot identify correct subunit:", id_, level_)

        names.append(functional_cluster)
        labels.append(label)
    return names, labels


def clean_labels(labels):
    """Return the first label that is not one of the generic node labels.

    The generic labels 'Family', 'Plant', 'Foreign' and 'Node' are ignored.
    The input list is left untouched.

    Raises
    ------
    IndexError
        If no label remains after ignoring the generic ones.
    """
    generic_labels = ('Family', 'Plant', 'Foreign', 'Node')
    specific_labels = [label for label in labels if label not in generic_labels]
    if not specific_labels:
        raise IndexError(f"no specific label left after removing generic labels from {list(labels)!r}")
    return specific_labels[0]

def node_id_to_node(id_):
    """Look up a node by exact ``name`` in the graph database.

    Returns
    -------
    (name, label) : tuple
        The node's name and its label as reduced by ``clean_labels`` when
        exactly one node matches. ``(None, None)`` when no node or more than
        one node matches; both cases are reported with ``print``.
    """
    query = '''MATCH (s) WHERE s.name=$x 
               RETURN s.name AS name, labels(s) AS labels'''

    records = graph.run(query, x=id_).data()

    if len(records) == 1:
        record = records[0]
        return record['name'], clean_labels(record['labels'])

    if len(records) == 0:
        print(id_, records, "no hit")
    else:
        print(id_, records, 'multiple hits') # should be impossible
    # Always return a pair so callers can unpack `name, label = node_id_to_node(...)`.
    return None, None

In [None]:
# One row per (species, complex) pair that has no binding reaction defining it.
other_complexes = pd.DataFrame(other_complexes_set, columns=['species', 'output1_newID'])
# Subunit IDs are derived from the '|'-separated complex name.
other_complexes['substrate_og_name'] = other_complexes['output1_newID'].apply(get_subunits)
# No form information is available for these subunits: one empty string per subunit.
other_complexes['substrate_form'] = other_complexes['substrate_og_name'].apply(lambda x: ["" for c in x])
other_complexes['product_name'] = other_complexes['output1_newID']
other_complexes

In [None]:
# Resolve every subunit ID to a node name and label; `get_name_label` returns
# a (names, labels) pair that `result_type='expand'` spreads over the two new columns.
other_complexes[['substrate_name', 'substrate_label']] = other_complexes[['species', 'substrate_og_name']].apply(get_name_label, axis=1, result_type='expand')
other_complexes

In [None]:
# Drop the helper columns that are no longer needed.
# NOTE: not idempotent — re-running this cell raises KeyError.
del other_complexes['substrate_og_name']
del other_complexes['species']

In [None]:
# All complexes to create: the name-derived ones followed by the binding-defined ones.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0;
# sort=True orders the columns alphabetically when the two frames' columns differ.
new_complexes = pd.concat([other_complexes, new_subdf], sort=True).reset_index(drop=True)
new_complexes.head()

In [None]:
# NOTE(review): `helpers.unnesting` presumably expands the three list-valued
# columns into one row per component — confirm in helpers.
exploded_new_subdf = helpers.unnesting(new_complexes, ['substrate_name', 'substrate_label', 'substrate_form']).drop_duplicates()
# Spot check of one complex.
exploded_new_subdf[exploded_new_subdf['product_name']=='NPR1|PAD4|TGA2,5,6']

In [None]:
# Save the names of the new complexes as a single-column TSV (with header).
# `label` and `f` are reused by later cells: `f` is passed to the node query
# and `label` is used in the edge file names.
label = 'Complex'
f = f'{label}-new-components.tsv'
want_cols = 'product_name'
new_complexes[want_cols].to_csv(f'../data/import/{f}', sep="\t", index=None, header=True)

In [None]:
# Number of complexes written to the import file.
print(new_complexes.shape[0])

In [None]:
# Create the Complex nodes from the file written above and verify that
# exactly one node was created per row.
query = helpers.bioelement_node_query(f, "Complex",
                           n_name="line.product_name")
qr = graph.run(query)
nodes_created = qr.stats()['nodes_created']
if new_complexes.shape[0] != nodes_created:
    # Same exception type as before, but now saying what went wrong.
    raise Exception(
        f"Expected {new_complexes.shape[0]} Complex nodes to be created, "
        f"but the database reported {nodes_created}"
    )

In [None]:
# Spot check of one complex.
exploded_new_subdf[exploded_new_subdf['product_name']=='D14|MAX2|SCF']

In [None]:
# Preview the last component rows.
exploded_new_subdf.tail()

In [None]:
# Component-to-complex edges: one import file and one query per substrate label.
edge_type = 'COMPONENT_OF'
want_cols = ['substrate_name', 'substrate_form', 'substrate_label', 'product_name']

# `label` is the module-level variable set in the save cell ('Complex');
# `t` is the substrate label of the current group.
for t, this_subdf in exploded_new_subdf.groupby("substrate_label"):
    f = f'{edge_type}-{label}-{t}-edges.tsv'
    print(t, this_subdf.shape[0])
    this_subdf[want_cols].to_csv(f"../data/import/{f}", index=None, sep="\t")

    # Source nodes are matched by label `t` and name; the substrate form is not used.
    query = helpers.make_create_type_of_edge_query(f, edge_type,
                           source_label=t, target_label="Complex",
                           source_name="line.substrate_name", target_name="line.product_name",
                           #source_form="line.substrate_form"
                          )
    print(query)
    qr = graph.run(query)

    # Compare the number of created relationships with the number of rows;
    # a mismatch is only reported, it does not stop the loop.
    r_created = qr.stats()['relationships_created']
    print(t, this_subdf.shape[0], r_created)
    if not this_subdf.shape[0] == r_created:
        print("\tnot all edges created")

# END 