In [1]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:80% !important; }</style>"))

# Import neo4j DB: 2/4

Code to translate v2.7.4_PIS-model.xlsx to neo4j database. 

## Setup

In [2]:
import pandas as pd
import re
import numpy as np
import os
from IPython.display import Image, display

In [3]:
from py2neo import Graph, Node, Relationship

In [4]:
import helpers

In [5]:
from importlib import reload

Connect to graph via docker-compose link. See http://localhost:7474/browser/

In [6]:
graph = Graph(host="neo4j")

In [7]:
q = graph.run("MATCH (n) RETURN n LIMIT 10")

In [8]:
q.stats()

{}

In [9]:
node_labels = helpers.node_labels

In [10]:
from pathlib import Path

base_path = Path("..")
parsed_path = base_path / "data" / "parsed"

In [14]:
# start from scratch
graph.delete_all()

for l in node_labels:
    try:
        graph.schema.drop_uniqueness_constraint(l, "name")
    except:
        pass    

## Constraints on node names

Constraint automatically adds an index. 

In [15]:
for label in node_labels:
    q = f'''CREATE CONSTRAINT unique_name_{label.lower()}
        ON (node:{label}) ASSERT node.name IS UNIQUE'''
    qr = graph.run(q)
    print(label, qr.stats()['constraints_added'])   

PlantCoding 1
PlantNonCoding 1
PlantAbstract 1
Complex 1
ExternalEntity 1
ExternalCoding 1
ExternalNonCoding 1
ExternalAbstract 1
Process 1
MetaboliteFamily 1
Metabolite 1
PseudoNode 1


## Read in

In [17]:
path =  parsed_path / "components.tsv"
df_components = pd.read_csv(path, sep="\t")

## metabolites

In [18]:
label = 'Metabolite'
df_metabolites = df_components[df_components['NodeLabel'] == label].copy()

In [19]:
df_metabolites.columns

Index(['AddedBy', 'species', 'NodeLabel', 'NodeType', 'Family', 'Clade',
       'NodeID', 'NodeName', 'external_links', 'NodeDescription',
       'AdditionalInfo', 'Process', 'ModelV', 'ModelStatus', 'gmm_ocd',
       'GMM_Description', 'GMM_ShortName', 'GMM_Synonyms'],
      dtype='object')

In [20]:
df_metabolites[df_metabolites['NodeName'].duplicated()]

Unnamed: 0,AddedBy,species,NodeLabel,NodeType,Family,Clade,NodeID,NodeName,external_links,NodeDescription,AdditionalInfo,Process,ModelV,ModelStatus,gmm_ocd,GMM_Description,GMM_ShortName,GMM_Synonyms


In [21]:
want_columns = ['AddedBy', 'Family', 'NodeName', 
                'external_links', 'NodeDescription', 'AdditionalInfo', 
                'Process', 'ModelV', 'ModelStatus',                 
               ]

In [22]:
f = '%s-components.tsv'%label
df_metabolites[want_columns].to_csv("../data/import/" + f, sep="\t", index=None)

In [23]:
q = helpers.metabolite_node_query(f, label, 
                     n_name="line.NodeName"
                    )

In [24]:
print(q)

USING PERIODIC COMMIT 500
           LOAD CSV WITH HEADERS FROM  'file:///Metabolite-components.tsv' AS line FIELDTERMINATOR '	'
           CREATE (p:Metabolite   { 
                name:line.NodeName, 
                added_by:line.AddedBy,
                description:line.NodeDescription, 
                additional_information: line.AdditionalInfo, 
                model_version:line.ModelV,
                model_status:line.ModelStatus, 
                pathway:line.Process,
                
                external_links:split(line.external_links, ",")
                
            })


In [25]:
qr = graph.run(q)
if not df_metabolites.shape[0] == qr.stats()['nodes_created']:
    raise Exception

In [26]:
df_metabolites[df_metabolites['Clade'] != df_metabolites['NodeID']]

Unnamed: 0,AddedBy,species,NodeLabel,NodeType,Family,Clade,NodeID,NodeName,external_links,NodeDescription,AdditionalInfo,Process,ModelV,ModelStatus,gmm_ocd,GMM_Description,GMM_ShortName,GMM_Synonyms


In [27]:
label = "MetaboliteFamily"

In [28]:
df_metabolites_has_family = df_metabolites[df_metabolites['Family'] != df_metabolites['NodeID']]

In [29]:
want_columns = ['AddedBy', 'Family',
                'external_links', 'NodeDescription', 'AdditionalInfo', 
                'Process', 'ModelV', 'ModelStatus']

In [30]:
df_metabolites_family = df_metabolites_has_family.fillna('').groupby('Family').agg({
                                                                  'Family':lambda x:list(x)[0],
                                                                  'AddedBy':lambda x:list(x)[0], 
                                                                  'NodeDescription':lambda x: ', '.join(x), 
                                                                  'external_links':lambda x: ', '.join([s for s in x if not s=='']), 
                                                                  'AdditionalInfo':lambda x: helpers.list_to_string(x), 
                                                                  'Process':lambda x:list(x)[0], 
                                                                  'ModelV':helpers.get_latest_model, 
                                                                  'ModelStatus':helpers.get_model_status})

In [31]:
df_metabolites_family

Unnamed: 0_level_0,Family,AddedBy,NodeDescription,external_links,AdditionalInfo,Process,ModelV,ModelStatus
Family,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
DAMP/HAMP,DAMP/HAMP,KG,oligogalacturonides,,damage/host associated molecular patterns,,v2.6,use
DHZ,DHZ,AG,dihydrozeatin,chebi:17874,,Hormone:CK,v0.0,use
DHZ7G,DHZ7G,AG,7-(&alpha;-D-glucosyl)dihydrozeatin,chebi:133475,,Hormone:CK,v0.0,use
DHZ9G,DHZ9G,AG,9-(&alpha;-D-glucosyl)dihydrozeatin,chebi:72612,,Hormone:CK,v0.0,use
DHZOG,DHZOG,AG,dihydrozeatin-O-glucoside,chebi:80499,,Hormone:CK,v0.0,use
DHZR,DHZR,AG,dihydrozeatin riboside,chebi:80498,,Hormone:CK,v0.0,use
DHZROG,DHZROG,AG,dihydrozeatin riboside-O-glucoside,,,Hormone:CK,v0.0,use
PostROS,PostROS,MZ,water,chebi:15377,,S:ROS,v2.7,ignore
PreROS,PreROS,MZ,ROS precursors; oxygen,chebi:25805,,S:ROS,v2.6,use
ROS,ROS,MZ,"hydrogen peroxide (reactive oxygen species), h...","chebi:16240, chebi:18421",,S:ROS,v2.6,use


In [32]:
f = '%s-components.tsv'%label
df_metabolites_family.to_csv("../data/import/" + f, sep="\t", index=None)

In [33]:
q = helpers.metabolite_node_query(f, label, 
                     n_name="line.Family"
                    )

In [34]:
print(q)

USING PERIODIC COMMIT 500
           LOAD CSV WITH HEADERS FROM  'file:///MetaboliteFamily-components.tsv' AS line FIELDTERMINATOR '	'
           CREATE (p:MetaboliteFamily   { 
                name:line.Family, 
                added_by:line.AddedBy,
                description:line.NodeDescription, 
                additional_information: line.AdditionalInfo, 
                model_version:line.ModelV,
                model_status:line.ModelStatus, 
                pathway:line.Process,
                
                external_links:split(line.external_links, ",")
                
            })


In [35]:
qr = graph.run(q)
if not df_metabolites_family.shape[0] == qr.stats()['nodes_created']:
    raise Exception 

In [36]:
# Metabolite to MetaboliteFamily edges
edge_type = 'TYPE_OF'
f = '%s-edges.tsv'%edge_type
df_metabolites_has_family[want_columns + ["NodeName"]].to_csv("../data/import/" + f, sep="\t", index=None)

In [37]:
q = helpers.make_create_type_of_edge_query(f, edge_type, 
                           source_label="Metabolite", target_label="MetaboliteFamily",
                           source_name="line.NodeName", target_name="line.Family")

In [38]:
print(q)

USING PERIODIC COMMIT 500
           LOAD CSV WITH HEADERS FROM  'file:///TYPE_OF-edges.tsv' AS line FIELDTERMINATOR '	'
           
           MATCH (source:Metabolite { name:line.NodeName}),
                 (target:MetaboliteFamily { name:line.Family})
           
           CREATE (source)-[:TYPE_OF {
                        added_by:line.AddedBy,
                        additional_information: line.AdditionalInfo, 
                        model_version:line.ModelV,
                        model_status:line.ModelStatus,
                        
                        pathway:line.Process
                        }]->(target)


In [39]:
qr = graph.run(q)
if not df_metabolites_has_family.shape[0] == qr.stats()['relationships_created']:
    raise Exception

## Pathogens (External)

In [40]:
df_external = df_components[df_components["NodeLabel"].isin(["ExternalCoding", "ExternalNonCoding"])].copy()

In [41]:
df_external.columns

Index(['AddedBy', 'species', 'NodeLabel', 'NodeType', 'Family', 'Clade',
       'NodeID', 'NodeName', 'external_links', 'NodeDescription',
       'AdditionalInfo', 'Process', 'ModelV', 'ModelStatus', 'gmm_ocd',
       'GMM_Description', 'GMM_ShortName', 'GMM_Synonyms'],
      dtype='object')

In [42]:
# Species --> '-'
# Family --> 'biological classification'
# Clade --> 'species'
df_external.columns = ['AddedBy', '-', 'NodeLabel', 'NodeType', 
                       'classification', 'species', 'NodeID', 'NodeName', 
                       'external_links', 'NodeDescription', 'AdditionalInfo', 
                       'Process', 'ModelV', 'ModelStatus', 
                       'gmm_ocd', 'GMM_Description', 'GMM_ShortName', 'GMM_Synonyms']

In [43]:
#manualfix
df_external.loc[df_external['species']=='oomycete', 'classification'] = 'oomycete'

In [44]:
df_external.head()

Unnamed: 0,AddedBy,-,NodeLabel,NodeType,classification,species,NodeID,NodeName,external_links,NodeDescription,AdditionalInfo,Process,ModelV,ModelStatus,gmm_ocd,GMM_Description,GMM_ShortName,GMM_Synonyms
3,KG,external,ExternalCoding,pathogen_coding,bacteria,trichous-bacteria,elf18,elf18,,EF-Tu fragment,"N terminus of elongation factor Tu (EF-Tu), th...",Pathogen_Effector,vNA,use,,,,
4,KG,external,ExternalCoding,pathogen_coding,bacteria,trichous-bacteria,flg22,flg22,,flagellin fragment,Flagellin is the structural protein that forms...,Pathogen_Effector,vNA,use,,,,
5,KG,external,ExternalCoding,pathogen_coding,oomycete,oomycete,ch,ch,,chitin,Chitin is a polymer of N-acetyl-d-glucosamine ...,Pathogen_Effector,vNA,use,,,,
6,KG,external,ExternalCoding,pathogen_coding,virus,potyvirus,6K1,6K1,,small peptide of unknown functions,,Pathogen,v2.6,use,,,,
7,KG,external,ExternalCoding,pathogen_coding,virus,potyvirus,6K2,6K2,,small peptide of unknown functions,,Pathogen,v2.6,use,,,,


In [45]:
want_columns = ['AddedBy',  
                'classification', 'species', 'NodeName', 
                'external_links', 'NodeDescription', 'AdditionalInfo', 
                'Process', 'ModelV', 'ModelStatus']

In [46]:
df_external[['NodeType', 'classification', 'species', 'NodeName']]

Unnamed: 0,NodeType,classification,species,NodeName
3,pathogen_coding,bacteria,trichous-bacteria,elf18
4,pathogen_coding,bacteria,trichous-bacteria,flg22
5,pathogen_coding,oomycete,oomycete,ch
6,pathogen_coding,virus,potyvirus,6K1
7,pathogen_coding,virus,potyvirus,6K2
8,pathogen_coding,virus,potyvirus,CI
9,pathogen_coding,virus,potyvirus,CP
10,pathogen_coding,virus,potyvirus,HC-Pro
11,pathogen_coding,virus,potyvirus,NIa-Pro
12,pathogen_coding,virus,potyvirus,NIb


In [47]:
for label, subdf in df_external.groupby('NodeLabel'):
    print(label, end='\t')
    f = '%s-components.tsv'%label
    subdf[want_columns].to_csv("../data/import/" + f, sep="\t", index=None)
    
    q = helpers.external_node_query(f, label, 
                     n_name="line.NodeName"
                    )
    print(q)
    qr = graph.run(q)
    if not subdf.shape[0] == qr.stats()['nodes_created']:
        raise Exception

ExternalCoding	USING PERIODIC COMMIT 500
           LOAD CSV WITH HEADERS FROM  'file:///ExternalCoding-components.tsv' AS line FIELDTERMINATOR '	'
           CREATE (p:ExternalCoding   { 
                name:line.NodeName, 
                added_by:line.AddedBy,
                description:line.NodeDescription, 
                additional_information: line.AdditionalInfo, 
                model_version:line.ModelV,
                pathway:line.Process,
                species:split(line.species, ","),
                external_links:split(line.external_links, ","),
                                             
                classification:line.classification
            })


In [48]:
external_entities = df_external.groupby('species').agg({
                                                      'classification':lambda x:list(x)[0],
                                                      'AddedBy':lambda x:list(x)[0], 
                                                      #'NodeDescription':lambda x: ', '.join(x), 
                                                      #'AdditionalInfo':lambda x: helpers.list_to_string(x), 
                                                      'Process':lambda x:list(x)[0], 
                                                      'ModelV':helpers.get_latest_model, 
                                                      #'ModelStatus':helpers.get_model_status
                                                }).reset_index()

In [49]:
external_entities.columns

Index(['species', 'classification', 'AddedBy', 'Process', 'ModelV'], dtype='object')

In [50]:
label = 'ExternalEntity'
want_columns = ['species', 'classification', 'AddedBy', 'Process', 'ModelV']
f = '%s-components.tsv'%label
external_entities[want_columns].to_csv("../data/import/" + f, sep="\t", index=None)

In [51]:
q = helpers.external_node_query(f, label, 
                     n_name="line.species", 
                    )
print(q)

USING PERIODIC COMMIT 500
           LOAD CSV WITH HEADERS FROM  'file:///ExternalEntity-components.tsv' AS line FIELDTERMINATOR '	'
           CREATE (p:ExternalEntity   { 
                name:line.species, 
                added_by:line.AddedBy,
                description:line.NodeDescription, 
                additional_information: line.AdditionalInfo, 
                model_version:line.ModelV,
                pathway:line.Process,
                species:split(line.species, ","),
                external_links:split(line.external_links, ","),
                                             
                classification:line.classification
            })


In [52]:
qr = graph.run(q)
if not external_entities.shape[0] == qr.stats()['nodes_created']:
    raise Exception

In [53]:
# ExternalXXX to external_entities edges
want_columns = ['AddedBy', 'NodeName', 'species', 'ModelV']
edge_type = 'AGENT_OF'
f = '%s-edges.tsv'%edge_type
df_external[want_columns].to_csv("../data/import/" + f, sep="\t", index=None)

In [54]:
q = helpers.make_create_type_of_edge_query(f, edge_type, 
                           source_label="", target_label="ExternalEntity",
                           source_name="line.NodeName", target_name="line.species")
print(q)

USING PERIODIC COMMIT 500
           LOAD CSV WITH HEADERS FROM  'file:///AGENT_OF-edges.tsv' AS line FIELDTERMINATOR '	'
           
           MATCH (source { name:line.NodeName}),
                 (target:ExternalEntity { name:line.species})
           
           CREATE (source)-[:AGENT_OF {
                        added_by:line.AddedBy,
                        additional_information: line.AdditionalInfo, 
                        model_version:line.ModelV,
                        model_status:line.ModelStatus,
                        
                        pathway:line.Process
                        }]->(target)


In [55]:
qr = graph.run(q)
if not df_external.shape[0] == qr.stats()['relationships_created']:
    raise Exception

## Bio-elements

In [56]:
df_bioelements = df_components[df_components['NodeLabel'].isin(['PlantCoding', 
                                                                 'PlantNonCoding',
                                                                 'PlantAbstract', 
                                                                 'Complex'
                                                                ])].copy()

In [57]:
df_bioelements['species'].unique()

array(['all', 'ath', 'osa', 'stu', 'sly'], dtype=object)

In [58]:
df_bioelements[df_bioelements['species'] == 'all']

Unnamed: 0,AddedBy,species,NodeLabel,NodeType,Family,Clade,NodeID,NodeName,external_links,NodeDescription,AdditionalInfo,Process,ModelV,ModelStatus,gmm_ocd,GMM_Description,GMM_ShortName,GMM_Synonyms
0,ZR,all,Complex,plant_complex,SCF,SCF,SCF,SCF,,SCF,,,v2.7,use,,,,
1,ZR,all,Complex,plant_complex,WD/bHLH/MYB,WD/bHLH/MYB,WD/bHLH/MYB,WD/bHLH/MYB,,WD/bHLH/MYB,,,v2.7,use,,,,
2,ZR,all,Complex,plant_complex,ribosome,ribosome,ribosome,ribosome,,ribosome,,,v2.6,use,,,,


In [59]:
pd.value_counts(df_bioelements['NodeLabel'])

PlantCoding       858
PlantNonCoding     12
PlantAbstract      12
Complex             3
Name: NodeLabel, dtype: int64

In [60]:
all_species_listed = list(df_bioelements['species'].unique()); all_species_listed

['all', 'ath', 'osa', 'stu', 'sly']

In [61]:
all_species = all_species_listed.copy()

In [62]:
try: all_species.remove('plant_all')
except ValueError: pass
try: all_species.remove('all')
except ValueError: pass    

In [63]:
all_species

['ath', 'osa', 'stu', 'sly']

In [64]:
def pick_the_set(x):
    for v in x:
        if type(v) == set:
            return ",".join(list(v))
    return ''


def get_species_homologues(level):
    df_level_species  = df_bioelements.groupby([level, 'species']).agg({
      'NodeID':lambda x: set(x), 
    })
    
    df_level_species.reset_index(1, inplace=True)

    cols = []
    for specie in all_species:
        col = specie + '_homologues'
        cols.append(col)
        df_level_species.loc[df_level_species['species'] == specie, col] =\
                 df_level_species[df_level_species['species'] == specie]['NodeID']
    
    df_level_species = df_level_species[cols].groupby(level).agg({
        y:pick_the_set for y in cols
    })
    
    df_level = df_bioelements.fillna('').groupby(level).agg({
          'AddedBy':lambda x:list(x)[0], 
          'NodeLabel':lambda x:list(x)[0], 
          'NodeDescription':lambda x: ', '.join(list(set(x))), 
          'AdditionalInfo':lambda x: helpers.list_to_string(x), 
          'Process':lambda x:list(x)[0], 
          'ModelV':helpers.get_latest_model, 
          'species':lambda x: ', '.join(list(set(x))),
          'ModelStatus':helpers.get_model_status,
          'external_links':lambda x: ', '.join([s for s in x if not s=='']), 
          'gmm_ocd':lambda x: ', '.join(list(set(x))),
          'GMM_Description':lambda x: ', '.join(list(set(x))),
          'GMM_ShortName':lambda x: ', '.join(list(set(x))),
          'GMM_Synonyms':lambda x: ', '.join(list(set(x))),
    })    

    df_level = df_level.join(df_level_species[cols])
    df_level.reset_index(inplace=True)
    
    return df_level

In [65]:
df_families = get_species_homologues("Family")

In [66]:
df_families.shape[0]

181

In [67]:
df_families.head()

Unnamed: 0,Family,AddedBy,NodeLabel,NodeDescription,AdditionalInfo,Process,ModelV,species,ModelStatus,external_links,gmm_ocd,GMM_Description,GMM_ShortName,GMM_Synonyms,ath_homologues,osa_homologues,stu_homologues,sly_homologues
0,4CLL,KG,PlantCoding,AMP-dependent synthetase and ligase family pro...,OPCL1: physiological role in JA biosynthesis w...,Hormone:JA,v2.5,ath,use,"gmm_ocd:ocd_all_000824, gmm_ocd:ocd_all_000824...",ocd_all_000824,"OPC-8:0 CoA ligase1, AMP-dependent synthetase ...","4CLL5, 4CLL2, 4CLL4, 4CLL3, 4CLL7, 4CLL8","4CL8,4CLL8, 4CLL2, 4CLL5,OPCL1, 4CLL4, 4CLL3, ...","AT4G05160,AT1G20490,AT1G20510,AT5G38120,AT1G20...",,,
1,AAO,KG,PlantCoding,"xanthine dehydrogenase, aldehyde oxidase","Encodes aldehyde oxidase AA01.,Encodes an alde...",Hormone:SA,v2.5,ath,use,"gmm_ocd:ocd_all_000158, gmm_ocd:ocd_all_000158...",ocd_all_000158,"aldehyde oxidase 1, xanthine dehydrogenase 1, ...","AAO1, XDH1, AAO4, AAO2, XDH2, AAO3","ATXDH1,XDH1, AAO3,AO4,AOdelta,At-AO3,AtAAO3, A...","AT1G04580,AT4G34900,AT4G34890,AT3G43600,AT2G27...",,,
2,ACH,KG,PlantCoding,acyl-coenzyme A thioesterase 9 [EC:3.1.2.-]; A...,,Hormone:JA,v2.5,ath,use,"gmm_ocd:ocd_all_002748,ec:3.1.2.-, gmm_ocd:ocd...",ocd_all_002748,Thioesterase/thiol ester dehydrase-isomerase s...,,,"AT2G30720,AT5G48370",,,
3,ACO,KG,PlantCoding,similar to 1-aminocyclopropane-1-carboxylate o...,ACOs belong to a large family of dioxygenases ...,Hormone:ET,v1.0,ath,use,"gmm_ocd:ocd_all_004694, gmm_ocd:ocd_all_013860...","ocd_all_004694, ocd_all_001045, ocd_all_013860...",2-oxoglutarate (2OG) and Fe(II)-dependent oxyg...,"ACO2, ACO1, ACO4, ACO5, [ORF]F12F1.12","ACO2,ATACO2,EI305, ACO1,ACO4,EAT1,EFE, ACO5,[O...","AT1G62380,AT1G77330,AT2G19590,AT1G12010,AT1G05010",,,
4,ACS,KG,PlantCoding,ACC synthase [EC:4.4.1.14],It does not act on branched chain amino acids ...,Hormone:ET,v2.5,ath,use,"ec:4.4.1.14,gmm_ocd:ocd_all_000133, gmm_ocd:oc...","ocd_all_002828, ocd_all_000133","1-aminocyclopropane-1-carboxylate synthase 4, ...","ACS9, ACS5, ACS6, ACS8, ACS10, ACS12, ACS7, AC...","ACS9,AtACS9,ETO3, ACC6,ACS6,ATACS6, ACC5,ACS5,...","AT4G11280,AT4G37770,AT1G01480,AT3G61510,AT5G65...",,,


In [68]:
# save node types 
family_node_labels = []
for t, subdf in df_families.groupby("NodeLabel"):
    print(t, "\t", subdf.shape[0])
    subdf.to_csv("../data/import/%s-components.tsv"%t, sep="\t", index=None)
    family_node_labels.append(t)

Complex 	 3
PlantAbstract 	 7
PlantCoding 	 162
PlantNonCoding 	 9


In [69]:
has_family = ["PlantCoding", "PlantNonCoding", "PlantAbstract"]


for t in family_node_labels:
    if t in has_family: 
        labels = [t, 'Family']
    else:
        labels = [t]
    query = helpers.bioelement_node_query("%s-components.tsv"%t, labels, n_name="line.Family")
    #query = f"MATCH (n:{t}) DELETE n"
    #print(query)
    qr = graph.run(query)
    print(t, "\t", qr.stats()['nodes_created'])

Complex 	 3
PlantAbstract 	 7
PlantCoding 	 162
PlantNonCoding 	 9


In [70]:
file_name = parsed_path / "bio_elements.tsv"
df_bioelements.to_csv(file_name, sep="\t", index=False)

## Process

In [71]:
df_process = df_components[df_components['NodeLabel'].isin(['Process'
                                                                ])].copy()

In [72]:
df_process.head()

Unnamed: 0,AddedBy,species,NodeLabel,NodeType,Family,Clade,NodeID,NodeName,external_links,NodeDescription,AdditionalInfo,Process,ModelV,ModelStatus,gmm_ocd,GMM_Description,GMM_ShortName,GMM_Synonyms
1001,ZR,all,Process,process,AUX-signalling,AUX-signalling,AUX-signalling,AUX-signalling,,Auxin signalling.,,Hormone:AUX,v2.6,use,,,,
1002,ZR,all,Process,process,Anthocyanin-accumulation,Anthocyanin-accumulation,Anthocyanin-accumulation,Anthocyanin-accumulation,,Accumulation of anthocynanins.,,P:Secondary-metabolism_Anthocyanins,v2.6,use,,,,
1003,ZR,all,Process,process,CO2-deficiency,CO2-deficiency,CO2-deficiency,CO2-deficiency,,ROS processes.,,S:ROS,v2.6,use,,,,
1004,ZR,all,Process,process,RNA-silencing,RNA-silencing,RNA-silencing,RNA-silencing,,RNA silencing.,,P:Silencing,v2.6,use,,,,
1005,ZR,all,Process,process,ROS-production,ROS-production,ROS-production,ROS-production,,ROS processes.,,S:ROS,v2.6,use,,,,


In [73]:
want_columns = ['AddedBy', 'NodeLabel',  
        'NodeName', 'external_links', 'NodeDescription',
       'AdditionalInfo', 'Process', 'ModelV', 'ModelStatus']


In [74]:
label = "Process"
f = "%s-components.tsv"%label
df_process[want_columns].to_csv("../data/import/" + f, sep="\t", index=None)

In [75]:
q = helpers.process_node_query(f, label, 
                     n_name="line.NodeName", 
                    )
print(q)

USING PERIODIC COMMIT 500
           LOAD CSV WITH HEADERS FROM  'file:///Process-components.tsv' AS line FIELDTERMINATOR '	'
           CREATE (p:Process   { 
                name:line.NodeName, 
                added_by:line.AddedBy,
                description:line.NodeDescription, 
                additional_information: line.AdditionalInfo, 
                model_version:line.ModelV,
                model_status:line.ModelStatus, 
                pathway:line.Process,
                
                external_links:split(line.external_links, ",")
                
            })


In [76]:
qr = graph.run(q)
if not df_process.shape[0] == qr.stats()['nodes_created']:
    raise Exception

# END 

## Protein, Clade, Family keys

In [None]:
input_path

In [None]:
file_name = os.path.join(input_path, "bio_elements.tsv")
df_bioelements.to_csv(file_name, sep="\t", index=False)

In [None]:
id_to_name = df_bioelements[['NodeID', 'NodeName']]

In [None]:
id_to_name[id_to_name['NodeName'].duplicated()]['NodeName'].unique()

In [None]:
df_nodes = get_species_homologues('NodeName')
df_nodes.set_index('NodeName', inplace=True)

In [None]:
# node name to node IDs
node_ids_key = {}
for species in all_species:
    col = species + '_homologues'
    node_ids_key[col] = df_nodes[col].to_dict()

In [None]:
df_clades = get_species_homologues('Clade')
df_clades.set_index('Clade', inplace=True)

In [None]:
# clade name to node IDs
clade_ids_key = {}
for species in all_species:
    col = species + '_homologues'
    clade_ids_key[col] = df_clades[col].to_dict() 

In [None]:
# family name to node IDs
df_families.set_index("Family", inplace=True)
family_ids_key = {}
for species in all_species:
    col = species + '_homologues'
    family_ids_key[col] = df_families[col].to_dict() 

In [None]:
node_to_family = df_bioelements[["NodeName", "Family"]].set_index("NodeName")["Family"].to_dict()

In [None]:
clade_to_family = df_bioelements[["Clade", "Family"]].set_index("Clade")["Family"].to_dict()

In [None]:
df_process['NodeName']

In [None]:
df_components["Process"].unique()

## Components summary

In [None]:
q = '''MATCH (n) RETURN DISTINCT n.name AS name, n.level AS level'''
nodes = graph.run(q).data()
all_nodes_in_components = set([(d["name"], d["level"]) for d in nodes])

In [None]:
len(all_nodes_in_components)

In [None]:
sorted(df_components['NodeLabel'].unique())

In [None]:
node_dict = {}
for label in node_labels:
    q = '''MATCH (n:%s) RETURN DISTINCT n.name'''%label
    s = set([d['n.name'] for d in graph.run(q).data()])
    print(label, len(s))
    node_dict[label] = s


### Reactions sheet

In [None]:
df_edges_orig = pd.read_excel(input_path, 
                         sheet_name="Reactions", 
                         header=[1], 
                         dtype=str, 
                         na_values=helpers.empty_strings)
df_edges_orig = df_edges_orig[df_edges_orig['Status'].isin(["forCB", "forCB_INVENTED"])]
df_edges_orig.drop(['Status', 'FOXMES', 'Legacy:Process', 'Legacy:ReactionMode'], axis=1, inplace=True)
df_edges_orig.columns = pd.Index(['AddedBy', 'ConnID', 'Species', 
          'input1_ID', 'input1_level', 'input1_localisation', 'input1_type', 
          'input2_ID', 'input2_level', 'input2_localisation', 'input2_type', 
          'input3_ID', 'input3_level', 'input3_localisation', 'input3_type', 
          'ReactionEffect', 'ReactionMode', 'Modifications',
          'output1_ID', 'output1_level', 'output1_localisation', 'output1_type', 
          'TrustLevel', 'Literature', 'AdditionalInfo', 'Comment', 'ModelV', 'kinetics'],
      dtype='object')

df_edges_new = pd.read_excel(input_path, 
                         sheet_name="Reactions_New", 
                         header=[1], 
                         dtype=str, 
                         na_values=helpers.empty_strings)
df_edges_new = df_edges_new[~df_edges_new['AddedBy'].isin(['-'])]
df_edges_new.drop(['Status'], axis=1, inplace=True)
df_edges_new.columns = pd.Index(['AddedBy', 'ConnID', 'Species', 
          'input1_ID', 'input1_level', 'input1_type', 
          'input2_ID', 'input2_level', 'input2_type', 
          'input3_ID', 'input3_level', 'input3_type',                                  
          'ReactionEffect', 'ReactionMode', 
          'output1_ID', 'output1_level', 'output1_type', 
          'TrustLevel', 'Literature', 'AdditionalInfo', 'Comment', 'ModelV'],
      dtype='object')


df_edges = pd.concat([df_edges_orig, df_edges_new], sort=False)

In [None]:
df_edges = df_edges[~df_edges["AddedBy"].isna()]

In [None]:
df_edges.tail()

In [None]:
x = df_edges[df_edges['AddedBy']=='x'].index
print(x)
df_edges.drop(x, inplace=True)

In [None]:
df_edges["TrustLevel"].unique()

In [None]:
df_edges['trust_level']  = df_edges["TrustLevel"].apply(lambda x: re.search( r"(R[1|2|3|4|x|y]|undefined)", x).groups()[0])
df_edges['observed_species'] = df_edges["Species"].apply(helpers.get_second_item)
df_edges['also_observed_in'] = df_edges["Species"].apply(helpers.rest_of_items)
df_edges['Comment'] = df_edges['Comment'].fillna("")
df_edges['AdditionalInfo'] = df_edges['AdditionalInfo'].fillna("")

In [None]:
df_edges['AddedBy'] = df_edges['AddedBy'].apply(lambda x: x.upper())
df_edges["AddedBy"].unique()

In [None]:
df_edges.loc[df_edges['ModelV'].isna(), 'ModelV'] = 'vNA'
df_edges['ModelV'].unique()

In [None]:
def only_asci(x):
    return "".join([character for character in x if character.isascii()])

def doi_list(x):
    x = only_asci(x.lower())
    match = re.findall("(?:doi)(?:\:|\/)\s*(.+?)(?:\s|$)", x)
    if not match is None:
        return ["doi:" + m.rstrip('.') for m in match]
    else:
        return []

def pubmed_list(x):
    x = only_asci(x.lower())
    match = re.findall("(?:pmid)\:\s*(.+?)(?:\s|$)", x)
    if not match is None:
        return ["pmid:" + m.rstrip('.') for m in match]
    else:
        return []

def list_to_string(x):
    return ",".join([str(i) for i in x])

In [None]:
# format literature sources
df_edges["Literature"].fillna(value="", inplace=True)
for i, row in df_edges.iterrows():
    s = row['Literature']
    source = doi_list(s)
    source += pubmed_list(s)
    for z in s.split("|"):
        key = z.lower()
        if ":" in key:
            if "aracyc" in key:
                aracyc_string = "aracyc:" + z.split(":")[1].strip()
                source.append(aracyc_string)
            elif "kegg" in key:
                kegg_string = "kegg:" + z.split(":")[1].strip()
                source.append(kegg_string)
            elif "doi" in key:
                # already fetched
                continue
            elif ("pubmed" in key) or ("pmid" in key):
                pmid_string = "pubmed:" + z.split(":")[1].strip()
        elif "invented" in key:
            source.append("invented")
        else:
            print("no/bad reference", row["ConnID"], z)
            source.append("other:" + only_asci(z.strip()))
    if len(source) > 0:
        df_edges.loc[i, "literature_sources"] = list_to_string(source)
    else:
        print(row["ConnID"], z)


In [None]:
df_edges[["ConnID", "Literature", "literature_sources"]].to_csv("lit-check.tsv", sep="\t", index=None)

In [None]:
df_edges.reset_index(inplace=True, drop=True)

In [None]:
save_df = df_edges.copy()
#df_edges = save_df.copy()

In [None]:
df_edges[df_edges['ConnID'].duplicated()]

In [None]:
df_edges.head()

In [None]:
for x in ['input1', 'input2', 'input3', 'output1']:
    df_edges.loc[:, x + "_ID"] = df_edges[x + "_ID"].apply(helpers.reorder_ids)

In [None]:
df_edges.head()

In [None]:
def convert_node_to_family(x):
    # pathogen proteins are listed as proteins, so cannot use dict
    id_, type_, level_ = x.values
      
    if id_ is np.nan:
        return np.nan, np.nan

    new_label = None
    family_id = None
    
    ########################
    # Simple Cases
    ########################
    if type_ in ['complex', 'complex [active]', 'complex [activated]', 'complex [inactive]', 'plant_complex']:
        if not (id_ in node_dict["Complex"]):
            complexes_to_add.append(id_)
        new_label = 'Complex'
        family_id = id_
    
    elif type_ in ["metabolite"]:
        for label in ["Metabolite", "MetaboliteFamily"]:
            if id_ in node_dict[label]:
                new_label = label
                break
        if not new_label:
            missing_in_components.update([id_])        
            print(id_, type_, level_, " | (label) not a listed metabolite")
        family_id = id_

    elif type_ in ['process']:
        if (id_ in node_dict["Process"]):
            family_id = id_
            new_label = "Process"
        else:
            print(id_, type_, level_, " | (label) process not a listed process")
        
    else:
        ########################
        # family ID
        ########################
        check_external = False
        if level_ == "family":    
            family_id = id_
        elif level_ in ["clade", "clade/orthologue"]:
            try:
                family_id = clade_to_family[id_]
                replace_w_family.update([id_])
            except KeyError:
                check_external = True
        elif level_ == "node":
            try:
                family_id = node_to_family[id_]
                replace_w_family.update([id_])
            except KeyError:
                check_external = True
        
        if check_external:
            for label in ["ExternalOrganism", "ExternalCoding", "ExternalNonCoding"]:
                if id_ in node_dict[label]:
                    new_label = label
                    family_id = id_
                    break
                    
        if not family_id:
            print(id_, type_, level_, " | (family id) could not convert to family/external")
                
        ########################
        # Label
        ########################
        if (family_id) and (not new_label):
            id_labels = [] #looping just in case an id occurs mutiple times
            for label in node_labels:
                if family_id in node_dict[label]:
                    id_labels.append(label)

            if len(id_labels) == 1:
                new_label = id_labels[0]
            elif len(id_labels) > 1:
                print(id_, type_, level_, " | (label) many labels fit")
                missing_in_components.update([id_])       

            else:
                print(id_, type_, level_, " | (label) could not find label")
                missing_in_components.update([id_])        

    return family_id, new_label
        

In [None]:
complexes_to_add = []
missing_in_components = set()
replace_w_family = set()

for prefix in ['input1', 'input2', 'input3', 'output1']:
    id_col, type_col, level_col, new_id, new_label_col  =\
            [prefix + x for x in ('_ID',  '_type',  '_level',  '_newID', '_label')]    
    
    df_edges[[new_id, new_label_col]] = df_edges[[id_col, type_col, level_col]].apply(convert_node_to_family, axis=1, result_type='expand')


In [None]:
df_edges.head()

In [None]:
def get_orthologues(x, prefix=""):
    id_, level_, label_ = x.values
    ########################
    # Specie specific homologues
    ########################
    
    return_D = {f"{prefix}_{specie}_homologues":"" for specie in all_species}
    
    if label_ in ['PlantCoding', 'PlantNonCoding',  'PlantAbstract']:

        for species in all_species:
            species = f"{species}_homologues"
            k = f"{prefix}_{species}"
            if level_ == 'node':
                return_D[k] = node_ids_key[species][id_]
            elif level_ == 'clade':
                return_D[k] =  clade_ids_key[species][id_]
            elif level_ == 'family':
                return_D[k] =  family_ids_key[species][id_]

    return_D = {x:helpers.list_to_string(list(return_D[x])) for x in return_D}
    return return_D

In [None]:
new_dfs = []
for prefix in ['input1', 'input2', 'input3', 'output1']:
    print(prefix)
    id_col, level_col, new_label_col  =\
                [prefix + x for x in ('_ID',  '_level',  '_label')]        
    
    new_df = df_edges[[id_col, level_col, new_label_col ]].apply(get_orthologues, axis=1, result_type='expand', prefix=prefix)
    new_dfs.append(new_df)
    #df_edges = df_edges.join(new_df, sort=False)

In [None]:
homologues_df = pd.concat(new_dfs, sort=False, axis=1)

In [None]:
homologues_df.loc[0]

In [None]:
df_edges = df_edges.join(homologues_df, sort=False)

In [None]:
df_edges.head()

In [None]:
node_type_to_node_form_dict = {
    "gene":"gene",
    
    "protein":"protein",
    "protein [activated]":"protein_active",
    'protein [active]': "protein_active",
    
    "ncRNA":"ncRNA",
    "plant_ncRNA":"ncRNA",
    'ta-siRNA':"ta-siRNA", 
        
    "complex":"complex", 
    "plant_complex":"complex",
    'complex [active]': "complex_active",
    
    "metabolite":"metabolite",
    
    "process":"process", 
    'process [active]':"process_active",

    np.nan:"", 
    "plant_coding":"unknown"
}

for prefix in ['input1', 'input2', 'input3', 'output1']:
    id_col, type_col, new_form_col  =\
        [prefix + x for x in ('_ID',  '_type',  '_form')]
    
    x = df_edges[[id_col, type_col]].dropna(how='all')
    
    print(prefix)
    df_edges[new_form_col] = df_edges[type_col].apply(lambda x: node_type_to_node_form_dict[x])

In [None]:
df_edges.head()

In [None]:
node_localisation_dict = {
    'nuc':'nucleus',
    'er':'endoplasmic reticulum',
    'golgi':'golgi apparatus'
}


good_localisations = set([
     'nucleus',
     'nucleolus',
     'cytoplasm',
     'vacuole',
     'endoplasmic reticulum',
     'chloroplast',
     'mitochondrion',
     'golgi apparatus',
     'peroxisome',
     'apoplast',
     'extracellular'
])


def node_localisation_std(x):
    if not type(x) == str:
        return ""
    
    x = x.lower()
    if x in node_localisation_dict.keys():
        x = node_localisation_dict[x]
    
    if x in good_localisations:
        return x
    else:
        return ""

node_localisations = set()
new_localisation = set()
for prefix in ['input1', 'input2', 'input3', 'output1']:
    id_col, type_col, localisation_col  =\
        [prefix + x for x in ('_ID',  '_type',  '_localisation')]
    
    x = df_edges[['ConnID', id_col, localisation_col]].dropna(how='all', subset=[ id_col, localisation_col])
    
    #for _, y in x.iterrows():
    #    if y[localisation_col] in ['mitochondria?', np.nan]:
    #        print(y['ConnID'], "\t", y[id_col], "\t", y[localisation_col])
    
    
    node_localisations.update(x[localisation_col])
    
    #print(prefix)
    df_edges[localisation_col] = df_edges[localisation_col].apply(node_localisation_std)
    
    new_localisation.update(df_edges[localisation_col])

In [None]:
node_localisations

In [None]:
new_localisation

In [None]:
df_edges.to_csv(os.path.join("..", "data", "raw", "edges-sheet.tsv"), sep="\t")

In [None]:
homologue_cols = [f"{x}_homologues" for x in all_species]

In [None]:
all_species

In [None]:
with open(os.path.join("..", "data", "raw", "complexes_to_add.tsv"), "w") as out:
    for c in complexes_to_add:
        out.write(f"{c}\n")