In [None]:
# Widen the notebook container so wide DataFrames are easier to read.
# `display` and `HTML` are imported from the public `IPython.display` module;
# `IPython.core.display` is a deprecated location for these names.
from IPython.display import display, HTML
display(HTML("<style>.container { width:80% !important; }</style>"))

# Import neo4j DB: 2/4

Code to translate `v2.7.4_PIS-model.xlsx` into a Neo4j database.

## Setup

In [None]:
import pandas as pd
import re
import numpy as np
import os
from IPython.display import Image, display

In [None]:
from py2neo import Graph, Node, Relationship

In [None]:
import helpers

In [None]:
from importlib import reload

Connect to graph via docker-compose link. See http://localhost:7474/browser/

In [None]:
# Open the py2neo connection used by every cell below.
# NOTE(review): "neo4j" is presumably the docker-compose service name — confirm against the compose file.
graph = Graph(host="neo4j")

In [None]:
# Smoke-test the connection with a small read-only query.
q = graph.run("MATCH (n) RETURN n LIMIT 10")

In [None]:
# Display the statistics reported for the smoke-test query.
q.stats()

In [None]:
# Node labels whose uniqueness constraints are dropped and recreated below;
# the list is defined in the project-local `helpers` module.
node_labels = helpers.node_labels

In [None]:
from pathlib import Path

# Repository root relative to this notebook, and the folder with parsed tables.
base_path = Path("..")
parsed_path = base_path.joinpath("data", "parsed")

In [None]:
# Start from scratch: wipe the whole graph, then drop the per-label uniqueness
# constraints on `name`.
graph.delete_all()

for label in node_labels:
    try:
        graph.schema.drop_uniqueness_constraint(label, "name")
    except Exception as e:
        # Best effort: the drop presumably fails when the constraint does not
        # exist yet (e.g. on a fresh database). Report it instead of silently
        # swallowing it, and let KeyboardInterrupt/SystemExit propagate.
        print(f"Could not drop uniqueness constraint on {label}.name: {e}")

## Constraints on node names

A uniqueness constraint automatically adds an index on the constrained property.

In [None]:
# Create one named uniqueness constraint on `name` per node label.
# Failures are printed rather than raised, so one failing label does not stop
# the loop.
# NOTE(review): `CREATE CONSTRAINT ... ON ... ASSERT` is the Neo4j 4.x syntax;
# Neo4j 5 expects `FOR ... REQUIRE` — confirm the server version in use.
for label in node_labels:
    q = f'''CREATE CONSTRAINT unique_name_{label.lower()}
        ON (node:{label}) ASSERT node.name IS UNIQUE'''
    try: 
        qr = graph.run(q)
        # Number of constraints the server reports as added by this statement.
        print(label, qr.stats()['constraints_added'])
    except Exception as e:
        print(e)

## Read in

In [None]:
# Load the tab-separated component table from the parsed-data folder.
path = parsed_path.joinpath("components.tsv")
df_components = pd.read_csv(path, delimiter="\t")

## Metabolites

In [None]:
# Work on an independent copy of the rows labelled as metabolites.
label = 'Metabolite'
is_metabolite = df_components['NodeLabel'].eq(label)
df_metabolites = df_components.loc[is_metabolite].copy()

In [None]:
# Inspect which columns are available on the metabolite rows.
df_metabolites.columns

In [None]:
# Show rows whose NodeName already appeared in an earlier row.
# Presumably expected to be empty, since node names must be unique — verify.
df_metabolites[df_metabolites['NodeName'].duplicated()]

In [None]:
# Columns exported to the metabolite import file.
want_columns = [
    'AddedBy',
    'Family',
    'NodeName',
    'external_links',
    'NodeDescription',
    'AdditionalInfo',
    'Process',
    'ModelV',
    'ModelStatus',
]

In [None]:
# Write the selected metabolite columns to the import folder, without the index.
f = '{}-components.tsv'.format(label)
metabolite_export = df_metabolites[want_columns]
metabolite_export.to_csv("../data/import/" + f, sep="\t", index=None)

In [None]:
# Build the node-creation query for the exported metabolite file;
# node names are taken from the NodeName column.
q = helpers.metabolite_node_query(f, label, n_name="line.NodeName")

In [None]:
# Show the generated query text before it is executed.
print(q)

In [None]:
# Run the import and verify that every metabolite row produced exactly one node.
qr = graph.run(q)
created = qr.stats()['nodes_created']
if created != df_metabolites.shape[0]:
    # Same exception type as before, now with the numbers needed to debug it.
    raise Exception(
        f"Metabolite import: expected {df_metabolites.shape[0]} nodes, created {created}"
    )

In [None]:
# Show metabolites whose Clade differs from their NodeID.
df_metabolites[df_metabolites['Clade'] != df_metabolites['NodeID']]

In [None]:
# From here on, `label` refers to the family-level metabolite nodes.
label = "MetaboliteFamily"

In [None]:
# Metabolites whose Family differs from their own NodeID, i.e. those that
# belong to a separate family entry.
# NOTE(review): a missing (NaN) Family also compares as "different" and is
# therefore kept here — confirm that no metabolite has an empty Family.
df_metabolites_has_family = df_metabolites[df_metabolites['Family'] != df_metabolites['NodeID']]

In [None]:
# Columns kept for the metabolite-family level (no NodeName).
want_columns = [
    'AddedBy',
    'Family',
    'external_links',
    'NodeDescription',
    'AdditionalInfo',
    'Process',
    'ModelV',
    'ModelStatus',
]

In [None]:
# Collapse the member rows of each family into one family-level record.
def _first_of_group(values):
    """Return the first value of a group."""
    return list(values)[0]


def _join_non_empty(values):
    """Join the non-empty strings of a group with commas."""
    return ','.join(s for s in values if s != '')


family_aggregations = {
    'Family': _first_of_group,
    'AddedBy': _first_of_group,
    'NodeDescription': lambda x: ','.join(x),
    'external_links': _join_non_empty,
    'AdditionalInfo': lambda x: helpers.list_to_string(x),
    'Process': _first_of_group,
    'ModelV': helpers.get_latest_model,
    'ModelStatus': helpers.get_model_status,
}

# Missing values become '' first so the string joins above do not fail on NaN.
members_by_family = df_metabolites_has_family.fillna('').groupby('Family')
df_metabolites_family = members_by_family.agg(family_aggregations)

In [None]:
# Display the aggregated family table for a visual check.
df_metabolites_family

In [None]:
# Write the family table to the import folder, without the index.
f = '{}-components.tsv'.format(label)
family_export_path = "../data/import/" + f
df_metabolites_family.to_csv(family_export_path, sep="\t", index=None)

In [None]:
# Build the node-creation query for the family file;
# node names are taken from the Family column.
q = helpers.metabolite_node_query(f, label, n_name="line.Family")

In [None]:
# Show the generated query text before it is executed.
print(q)

In [None]:
# Run the import and verify that every family row produced exactly one node.
qr = graph.run(q)
created = qr.stats()['nodes_created']
if created != df_metabolites_family.shape[0]:
    # Same exception type as before, now with the numbers needed to debug it.
    raise Exception(
        f"MetaboliteFamily import: expected {df_metabolites_family.shape[0]} nodes, created {created}"
    )

In [None]:
# Metabolite -> MetaboliteFamily edges: export one row per metabolite that has
# a family, carrying the family-level columns plus the metabolite's NodeName.
edge_type = 'TYPE_OF'
f = '{}-edges.tsv'.format(edge_type)
edge_columns = want_columns + ["NodeName"]
df_metabolites_has_family[edge_columns].to_csv("../data/import/" + f, sep="\t", index=None)

In [None]:
# Build the edge-creation query: source is the metabolite (NodeName), target
# is its family (Family).
q = helpers.make_create_type_of_edge_query(
    f,
    edge_type,
    source_label="Metabolite",
    target_label="MetaboliteFamily",
    source_name="line.NodeName",
    target_name="line.Family",
)

In [None]:
# Show the generated query text before it is executed.
print(q)

In [None]:
# Run the import and verify that every exported row produced one relationship.
qr = graph.run(q)
created = qr.stats()['relationships_created']
if created != df_metabolites_has_family.shape[0]:
    # Same exception type as before, now with the numbers needed to debug it.
    raise Exception(
        f"TYPE_OF import: expected {df_metabolites_has_family.shape[0]} relationships, created {created}"
    )

## Pathogens (Foreign)

In [None]:
# Independent copy of the rows whose label is one of the foreign node labels.
is_foreign = df_components["NodeLabel"].isin(helpers.foreign_node_labels)
df_external = df_components.loc[is_foreign].copy()

In [None]:
# Inspect the current column names before they are replaced below.
df_external.columns

In [None]:
# Re-name the columns for foreign entities, where the generic columns carry a
# different meaning:
#   Species --> '-'               (not used)
#   Family  --> 'classification'  (biological classification)
#   Clade   --> 'species'
# NOTE(review): the assignment is positional — it requires exactly 19 columns
# in exactly this order and raises on a length mismatch; a reordered input
# would be mislabelled silently. Confirm against the layout of components.tsv.
df_external.columns = ['identifier', 'AddedBy', '-', 'NodeLabel', 'NodeType', 
                       'classification', 'species', 'NodeID', 'NodeName', 
                       'external_links', 'NodeDescription', 'AdditionalInfo', 
                       'Process', 'ModelV', 'ModelStatus', 
                       'gmm_ocd', 'GMM_Description', 'GMM_ShortName', 'GMM_Synonyms']

In [None]:
# Manual fix: rows whose species is 'oomycete' also get 'oomycete' as their
# classification.
df_external.loc[df_external['species']=='oomycete', 'classification'] = 'oomycete'

In [None]:
# Preview the first rows after the rename and manual fix.
df_external.head()

In [None]:
# Columns exported to the per-label foreign node files.
want_columns = [
    'AddedBy',
    'classification',
    'species',
    'NodeName',
    'external_links',
    'NodeDescription',
    'AdditionalInfo',
    'Process',
    'ModelV',
    'ModelStatus',
]

In [None]:
# Show the type, classification, species and name of every foreign row.
df_external[['NodeType', 'classification', 'species', 'NodeName']]

In [None]:
# Export and import the foreign nodes one label at a time.
for label, subdf in df_external.groupby('NodeLabel'):
    print(label, end='\t')
    f = '%s-components.tsv'%label
    subdf[want_columns].to_csv("../data/import/" + f, sep="\t", index=None)
    
    q = helpers.foreign_node_query(f, label, 
                     n_name="line.NodeName"
                    )
    print(q)
    qr = graph.run(q)
    # Every exported row must have produced exactly one node.
    created = qr.stats()['nodes_created']
    if created != subdf.shape[0]:
        # Same exception type as before, now naming the label and the counts.
        raise Exception(
            f"{label} import: expected {subdf.shape[0]} nodes, created {created}"
        )

In [None]:
# One record per species: take the first classification/AddedBy/Process of the
# group and the model version chosen by the helper.
# NodeDescription, AdditionalInfo and ModelStatus are deliberately not
# aggregated here (they were disabled in the original cell).
def _first_in_group(values):
    """Return the first value of a group."""
    return list(values)[0]


species_aggregations = {
    'classification': _first_in_group,
    'AddedBy': _first_in_group,
    'Process': _first_in_group,
    'ModelV': helpers.get_latest_model,
}
external_entities = df_external.groupby('species').agg(species_aggregations).reset_index()

In [None]:
# Inspect the columns of the per-species table.
external_entities.columns

In [None]:
# Write the per-species table as the ForeignEntity import file, without index.
label = 'ForeignEntity'
want_columns = ['species', 'classification', 'AddedBy', 'Process', 'ModelV']
f = '{}-components.tsv'.format(label)
entity_export = external_entities[want_columns]
entity_export.to_csv("../data/import/" + f, sep="\t", index=None)

In [None]:
# Build the node-creation query for the ForeignEntity file (nodes are named
# after the species column) and show it for review.
q = helpers.foreign_node_query(f, label, n_name="line.species")
print(q)

In [None]:
# Run the import and verify that every species row produced exactly one node.
qr = graph.run(q)
created = qr.stats()['nodes_created']
if created != external_entities.shape[0]:
    # Same exception type as before, now with the numbers needed to debug it.
    raise Exception(
        f"ForeignEntity import: expected {external_entities.shape[0]} nodes, created {created}"
    )

In [None]:
# Edges from each foreign node to its ForeignEntity (species) node: export one
# row per foreign node.
want_columns = ['AddedBy', 'NodeName', 'species', 'ModelV']
edge_type = 'AGENT_OF'
f = '{}-edges.tsv'.format(edge_type)
agent_edges = df_external[want_columns]
agent_edges.to_csv("../data/import/" + f, sep="\t", index=None)

In [None]:
# Build the AGENT_OF edge query: source is the foreign node (NodeName), target
# is the ForeignEntity named by the species column.
# NOTE(review): source_label is empty, so the source is presumably matched by
# name only, across all labels — verify in helpers that a name shared between
# labels cannot create extra relationships.
q = helpers.make_create_type_of_edge_query(f, edge_type, 
                           source_label="", target_label="ForeignEntity",
                           source_name="line.NodeName", target_name="line.species")
print(q)

In [None]:
# Run the import and verify that every foreign row produced one relationship.
qr = graph.run(q)
created = qr.stats()['relationships_created']
if created != df_external.shape[0]:
    # Same exception type as before, now with the numbers needed to debug it.
    raise Exception(
        f"AGENT_OF import: expected {df_external.shape[0]} relationships, created {created}"
    )

## Process

In [None]:
# Independent copy of the rows labelled 'Process'.
is_process = df_components['NodeLabel'].isin(['Process'])
df_process = df_components.loc[is_process].copy()

In [None]:
# Preview the first process rows.
df_process.head()

In [None]:
# Columns exported to the Process import file.
want_columns = [
    'AddedBy',
    'NodeLabel',
    'NodeName',
    'external_links',
    'NodeDescription',
    'AdditionalInfo',
    'Process',
    'ModelV',
    'ModelStatus',
]


In [None]:
# Write the selected process columns to the import folder, without the index.
label = "Process"
f = "{}-components.tsv".format(label)
process_export = df_process[want_columns]
process_export.to_csv("../data/import/" + f, sep="\t", index=None)

In [None]:
# Build the node-creation query for the Process file (names from NodeName) and
# show it for review.
q = helpers.process_node_query(f, label, n_name="line.NodeName")
print(q)

In [None]:
# Run the import and verify that every process row produced exactly one node.
qr = graph.run(q)
created = qr.stats()['nodes_created']
if created != df_process.shape[0]:
    # Same exception type as before, now with the numbers needed to debug it.
    raise Exception(
        f"Process import: expected {df_process.shape[0]} nodes, created {created}"
    )

## Complexes

In [None]:
# Independent copy of the rows labelled 'Complex'.
label = 'Complex'
is_complex = df_components['NodeLabel'].eq(label)
df_complex = df_components.loc[is_complex].copy()

In [None]:
# Display the complex rows for a visual check.
df_complex

In [None]:
# Report how many complexes there are and write all of their columns to the
# import folder, without the index.
n_complexes = df_complex.shape[0]
print(label, "\t", n_complexes)
complex_export_path = f"../data/import/{label}-components.tsv"
df_complex.to_csv(complex_export_path, sep="\t", index=None)

In [None]:
# Build and run the node-creation query for the Complex file, then report how
# many nodes were created.
# NOTE(review): `label` is passed as a plain string here, whereas the family
# import further down passes a list of labels — confirm the helper accepts both.
# NOTE(review): unlike the other imports, the created count is only printed,
# not compared against df_complex.shape[0].
query = helpers.bioelement_node_query(f"{label}-components.tsv", label, n_name="line.Family")
qr = graph.run(query)
print(label, "\t", qr.stats()['nodes_created'])

## Bio-elements

These nodes have "Family", "Clade" (msa) and "FunctionalCluster".
At this point, only Families are imported.

In [None]:
# Independent copy of the rows whose label is one of the plant node labels.
is_plant_node = df_components['NodeLabel'].isin(helpers.plant_node_labels)
df_bioelements = df_components.loc[is_plant_node].copy()

In [None]:
# Show every row that shares its NodeName with another row (all occurrences),
# sorted by name, restricted to the identifying columns.
shares_node_name = df_bioelements['NodeName'].duplicated(keep=False)
identifying_columns = ['identifier', 'species', 'Family', 'Clade', 'NodeID', 'NodeName']
df_bioelements[shares_node_name].sort_values('NodeName')[identifying_columns]

In [None]:
# Number of bio-element rows per node label.
# Uses the Series method; the top-level `pd.value_counts` is deprecated.
df_bioelements['NodeLabel'].value_counts()

In [None]:
# Species present in the bio-element table. Missing values are dropped because
# each entry is later concatenated with '_homologues' to build a column name,
# which would fail on NaN.
all_species_listed = list(df_bioelements['species'].dropna().unique())
all_species = all_species_listed.copy()
all_species

In [None]:
def pick_the_set(x):
    """Return the first set found in ``x`` as a comma-separated string.

    Used as a groupby aggregator for the per-species ``*_homologues`` columns,
    where the cells of a group are either a set of node IDs or a missing value.

    Parameters
    ----------
    x : iterable
        Values of one group; anything that is not a ``set`` is ignored.

    Returns
    -------
    str
        The members of the first set, sorted and joined with ``","``, or
        ``''`` when the group contains no set.
    """
    for v in x:
        if isinstance(v, set):
            # Sort so the exported files are reproducible between runs; the
            # iteration order of a set of strings is not.
            return ",".join(sorted(v))
    return ''

def str_lists_to_one_list(x):
    """Merge comma-separated strings into one de-duplicated, comma-separated string.

    Parameters
    ----------
    x : iterable of str
        Each element is itself a comma-separated list (possibly empty).

    Returns
    -------
    str
        The distinct, whitespace-stripped, non-empty items of all inputs,
        sorted and joined with ``","``; ``''`` when there are none.
    """
    items = set()
    for sub in x:
        for item in sub.split(','):
            # Strip *before* testing for emptiness so that whitespace-only
            # entries (e.g. "a, ,b") do not survive as empty items.
            item = item.strip()
            if item:
                items.add(item)
    # Sorted for a reproducible result; set iteration order is not stable.
    return ','.join(sorted(items))

def get_species_homologues(level, meta=False, use=False):
    """Build a table with one row per ``level`` value listing the node IDs per species.

    Reads the notebook-level ``df_bioelements`` frame and ``all_species`` list.

    Parameters
    ----------
    level : str
        Column of ``df_bioelements`` to group by (the notebook uses
        ``"NodeName"``, ``"Clade"`` and ``"Family"``).
    meta : bool, default False
        If True, the result holds aggregated descriptive columns (AddedBy,
        NodeLabel, NodeDescription, ...) joined with the
        ``<species>_homologues`` columns. If False, it holds the
        ``<species>_homologues`` columns plus ``family``.
    use : bool, default False
        If True, only rows whose ``ModelStatus`` equals ``'use'`` are considered.

    Returns
    -------
    pandas.DataFrame
        One row per distinct ``level`` value, with ``level`` as a regular
        column. Each ``<species>_homologues`` cell is a comma-separated string
        of node IDs (``''`` when the species has none).
    """
    
    if use:
        df = df_bioelements[df_bioelements['ModelStatus'] == 'use'].copy()
    else:
        df = df_bioelements.copy()

    # Copy of 'Family' under a separate name; it is carried through both
    # aggregations below.
    df['family'] = df['Family']
    
    # One row per (level, species): the set of node IDs and one family name.
    df_level_species  = df.groupby([level, 'species']).agg({
      'NodeID':lambda x: set(x), 
      # If a group spans several families, which one is kept is arbitrary.
      'family':lambda x: set(x).pop(), 
    })
       
    # Move 'species' out of the index; rows stay indexed by `level`.
    df_level_species.reset_index(1, inplace=True)

    # Spread the node-ID sets into one column per species; rows of other
    # species are left without a value in that column.
    cols = []
    for specie in all_species:
        col = specie + '_homologues'
        cols.append(col)
        df_level_species.loc[df_level_species['species'] == specie, col] =\
                 df_level_species[df_level_species['species'] == specie]['NodeID']
    
    selector = {
        y:pick_the_set for y in cols
    }
    # Take the first family of the group by position. `x[0]` on this
    # label-indexed Series relied on the deprecated positional fallback of
    # Series.__getitem__.
    selector['family'] = lambda x: x.iloc[0]
        
    # Collapse the per-species rows into a single row per `level` value.
    df_level_species = df_level_species.groupby(level).agg(selector)
    
    if meta:
        # Missing values become '' so the string joins below do not fail.
        df_level = df.fillna('').groupby(level).agg({
              'AddedBy':lambda x:list(x)[0], 
              'NodeLabel':lambda x:list(x)[0], 
              'NodeDescription':lambda x: ', '.join(list(set(x))), 
              'AdditionalInfo':lambda x: helpers.list_to_string(x), 
              'Process':lambda x:list(x)[0], 
              'ModelV':helpers.get_latest_model, 
              'species':lambda x: ','.join(list(set(x))),
              'ModelStatus':helpers.get_model_status,
              'external_links':str_lists_to_one_list, 
              'gmm_ocd':str_lists_to_one_list,
              'GMM_Description':str_lists_to_one_list,
              'GMM_ShortName':str_lists_to_one_list,
              'synonyms':str_lists_to_one_list,
        })    

        # Only the homologue columns are joined; 'family' is not part of the
        # meta output.
        df = df_level.join(df_level_species[cols])
    else:
        df = df_level_species#[cols]
    
    # Turn `level` from the index back into a regular column.
    df.reset_index(inplace=True)
    return df

In [None]:
# Family-level table with descriptive columns plus per-species homologue lists.
df_families = get_species_homologues("Family", meta=True)

In [None]:
# Write one import file per node label and remember which labels were written.
family_node_labels = []
for t, subdf in df_families.groupby("NodeLabel"):
    print(t, "\t", len(subdf))
    label_export_path = "../data/import/{}-components.tsv".format(t)
    subdf.to_csv(label_export_path, sep="\t", index=None)
    family_node_labels.append(t)

In [None]:
# Import each per-label file; the helper receives the label list
# [<node label>, 'Family', 'Plant'] and takes node names from the Family column.
for t in family_node_labels:
    labels = [t, 'Family', 'Plant']
    components_file = "{}-components.tsv".format(t)
    query = helpers.bioelement_node_query(components_file, labels, n_name="line.Family")
    qr = graph.run(query)
    nodes_created = qr.stats()['nodes_created']
    print(t, "\t", nodes_created)

## Save files

In [None]:
# Save the bio-element rows to the parsed-data folder, without the index.
file_name = parsed_path.joinpath("bio_elements.tsv")
df_bioelements.to_csv(file_name, index=False, sep="\t")

In [None]:
# Build the homologue table at node, clade and family level, restricted to
# rows with ModelStatus == 'use'; each table is indexed by (name, level).
dfs = []
for col_name, level in (('NodeName', 'node'), ('Clade', 'clade'), ('Family', 'family')):
    print(col_name)
    df = (
        get_species_homologues(col_name, use=True)
        .rename(columns={col_name: 'name'})
        .assign(level=level)
        .set_index(['name', 'level'])
    )
    dfs.append(df)

In [None]:
# Stack the three levels into one table, sort by (name, level) and preview it.
translate_df = pd.concat(dfs).sort_index()
translate_df.head()

In [None]:
# Save the 'use'-only translation table, keeping the (name, level) index.
file_name = parsed_path.joinpath("level_translation.tsv")
translate_df.to_csv(file_name, sep="\t")

In [None]:
# Same as the 'use'-only build, but over all bio-element rows regardless of
# ModelStatus; each table is indexed by (name, level).
dfs = []
for col_name, level in (('NodeName', 'node'), ('Clade', 'clade'), ('Family', 'family')):
    print(col_name)
    df = (
        get_species_homologues(col_name, use=False)
        .rename(columns={col_name: 'name'})
        .assign(level=level)
        .set_index(['name', 'level'])
    )
    dfs.append(df)

In [None]:
# Stack the three levels into one table, sort by (name, level) and preview it.
translate_df = pd.concat(dfs).sort_index()
translate_df.head()

In [None]:
# Save the unfiltered translation table, keeping the (name, level) index.
file_name = parsed_path.joinpath("level_not_use_translation.tsv")
translate_df.to_csv(file_name, sep="\t")

# END 