# Import neo4j DB: 2/4

Code to translate v2.7.4_PIS-model.xlsx to neo4j database. 

In [1]:
# check number families, nodes, clades

## Setup

In [1]:
import pandas as pd
import re
import numpy as np
import os
from IPython.display import Image

In [2]:
from py2neo import Graph, Node, Relationship

In [3]:
import helpers

In [4]:
from importlib import reload

Connect to graph via docker-compose link. See http://localhost:7474/browser/

In [5]:
graph = Graph(host="neo4j")

In [6]:
q = graph.run("MATCH (n) RETURN n LIMIT 10")

In [7]:
q.stats()

constraints_added: 0
constraints_removed: 0
contains_updates: False
indexes_added: 0
indexes_removed: 0
labels_added: 0
labels_removed: 0
nodes_created: 0
nodes_deleted: 0
properties_set: 0
relationships_created: 0
relationships_deleted: 0

In [8]:
node_labels = [
    'PlantCoding',
    'PlantNonCoding',
    'PlantAbstract',
    'Complex',
    'ExternalOrganism', 
    'ExternalCoding',
    'ExternalNonCoding',
    'Process', 
    'MetaboliteFamily',
    'Metabolite',
    'GeneExpression', 
    'PseudoNode'
]

In [9]:
# start from scratch
graph.delete_all()

for l in node_labels:
    try:
        graph.schema.drop_uniqueness_constraint(l, "name")
    except:
        pass    

## Constraints on node names

Constraint automatically adds an index. 

In [10]:
for label in node_labels:
    q = f'''CREATE CONSTRAINT unique_name_{label.lower()}
        ON (node:{label}) ASSERT node.name IS UNIQUE'''
    qr = graph.run(q)
    print(label, qr.stats()['constraints_added'], qr.stats()['indexes_added'])   

PlantCoding 1 0
PlantNonCoding 1 0
PlantAbstract 1 0
Complex 1 0
ExternalOrganism 1 0
ExternalCoding 1 0
ExternalNonCoding 1 0
Process 1 0
MetaboliteFamily 1 0
Metabolite 1 0
GeneExpression 1 0
PseudoNode 1 0


## Read in

In [11]:
input_path = os.path.join("..", "data", "raw")
path = os.path.join(input_path, 'parsed-components.tsv')

df_components = pd.read_csv(path, sep="\t")

## metabolites

In [12]:
label = 'Metabolite'
df_metabolites = df_components[df_components['NodeLabel'] == label].copy()

In [13]:
df_metabolites.columns

Index(['AddedBy', 'Species', 'NodeType', 'Family', 'Clade', 'NodeID',
       'NodeName', 'ModelStatus', 'NodeDescription', 'AdditionalInfo',
       'ExtDBlink', 'Process', 'ExternalDB', 'ModelV', 'GMM_OCD1', 'GMM_OCD',
       'GMM_Description', 'GMM_ShortName', 'GMM_Synonyms', 'NodeLabel',
       'observed_species', 'also_observed_in', 'chebi_identifier',
       'pubmed_identifier'],
      dtype='object')

In [14]:
df_metabolites[df_metabolites['NodeName'].duplicated()]

Unnamed: 0,AddedBy,Species,NodeType,Family,Clade,NodeID,NodeName,ModelStatus,NodeDescription,AdditionalInfo,...,GMM_OCD1,GMM_OCD,GMM_Description,GMM_ShortName,GMM_Synonyms,NodeLabel,observed_species,also_observed_in,chebi_identifier,pubmed_identifier


In [15]:
want_columns = ['AddedBy', 'Family',
       'NodeID', 'NodeName', 'NodeDescription',
       'AdditionalInfo', 'Process', 'ExternalDB', 'ModelV',
       'GMM_OCD1', 'GMM_OCD', 'GMM_Description', 'GMM_ShortName',
       'GMM_Synonyms', 'chebi_identifier', 'pubmed_identifier']

In [16]:
f = '%s-components.tsv'%label
df_metabolites[want_columns].to_csv("../data/import/" + f, sep="\t", index=None)

In [17]:
q = helpers.component_node_query(f, label, 
                     n_name="line.NodeName"
                    )

In [18]:
df_metabolites[df_metabolites['NodeName'].duplicated()]

Unnamed: 0,AddedBy,Species,NodeType,Family,Clade,NodeID,NodeName,ModelStatus,NodeDescription,AdditionalInfo,...,GMM_OCD1,GMM_OCD,GMM_Description,GMM_ShortName,GMM_Synonyms,NodeLabel,observed_species,also_observed_in,chebi_identifier,pubmed_identifier


In [19]:
qr = graph.run(q)
if not df_metabolites.shape[0] == qr.stats()['nodes_created']:
    raise Exception

In [20]:
df_metabolites[df_metabolites['Clade'] != df_metabolites['NodeID']]

Unnamed: 0,AddedBy,Species,NodeType,Family,Clade,NodeID,NodeName,ModelStatus,NodeDescription,AdditionalInfo,...,GMM_OCD1,GMM_OCD,GMM_Description,GMM_ShortName,GMM_Synonyms,NodeLabel,observed_species,also_observed_in,chebi_identifier,pubmed_identifier


In [21]:
label = "MetaboliteFamily"

In [22]:
df_metabolites

Unnamed: 0,AddedBy,Species,NodeType,Family,Clade,NodeID,NodeName,ModelStatus,NodeDescription,AdditionalInfo,...,GMM_OCD1,GMM_OCD,GMM_Description,GMM_ShortName,GMM_Synonyms,NodeLabel,observed_species,also_observed_in,chebi_identifier,pubmed_identifier
20,KG,plant_all,metabolite,L-Met,L-Met,L-Met,L-Met,use,L-methionine,,...,,,,,,Metabolite,plant_all,,CHEBI:16643,
21,KG,plant_all,metabolite,SAMe,SAMe,SAMe,SAMe,use,S-adenosyl-L-methionine,,...,,,,,,Metabolite,plant_all,,CHEBI:15414,
22,KG,plant_all,metabolite,ACC,ACC,ACC,ACC,use,1-aminocyclopropane-1-carboxylate,,...,,,,,,Metabolite,plant_all,,CHEBI:30526,
23,KG,plant_all,metabolite,ET,ET,ET,ET,use,ethylene,,...,,,,,,Metabolite,plant_all,,CHEBI:18153,
24,KG,plant_all,metabolite,Cu2+,Cu2+,Cu2+,Cu2+,use,copper,,...,,,,,,Metabolite,plant_all,,CHEBI:29036,
165,KG,plant_all,metabolite,PUFA:LA,PUFA:LA,PUFA:LA,PUFA:LA,use,polyunsaturated fatty acids: linoleic acid,HA | polyunsaturated fatty acids: hexadecatrie...,...,,,,,,Metabolite,plant_all,,CHEBI:17351,
166,KG,plant_all,metabolite,13-HPOT,13-HPOT,13-HPOT,13-HPOT,use,"13-hydroperoxy-9,11,15-octadecatrienoic acid",11-HPHT | 11(S)-hydroperoxy-hexadecatrienoic a...,...,,,,,,Metabolite,plant_all,,CHEBI:48905,
167,KG,plant_all,metabolite,"12,13-EOT","12,13-EOT","12,13-EOT","12,13-EOT",use,"12,13(S)-epoxylinolenic acid","10,11-EHT | parallel path to OPDA",...,,,,,,Metabolite,plant_all,,CHEBI:15653,
168,KG,plant_all,metabolite,OPDA,OPDA,OPDA,OPDA,use,12-oxophytodienoic acid in the peroxisome,dnOPDA | dinorOPDA | parallel path to OPDA,...,,,,,,Metabolite,plant_all,,CHEBI:34005,
169,KG,plant_all,metabolite,OPC8,OPC8,OPC8,OPC8,use,oxopentenyl-cyclopentane-octanoic acid,,...,,,,,,Metabolite,plant_all,,,PubChem:25244083


In [23]:
df_metabolites_has_family = df_metabolites[df_metabolites['Family'] != df_metabolites['NodeID']]

In [24]:
want_columns = ['AddedBy', 'Family',
        'NodeDescription',
       'AdditionalInfo', 'Process', 'ModelV']

In [25]:
df_metabolites_has_family[want_columns].drop_duplicates()

Unnamed: 0,AddedBy,Family,NodeDescription,AdditionalInfo,Process,ModelV
523,MZ,PreROS,ROS precursors; oxygen,,S:ROS,v2.6
524,MZ,ROS,hydrogen peroxide (reactive oxygen species),,S:ROS,v2.6
525,MZ,ROS,superoxide anion (reactive oxygen species),,S:ROS,v2.6
526,MZ,ROS,hydroxyl radical (reactive oxygen species),,S:ROS,v1.0
527,MZ,ROS,hydroperoxyl radical (reactive oxygen species),,S:ROS,v2.6
528,MZ,ROS,ozone (reactive oxygen species),,S:ROS,v2.6
529,MZ,PostROS,water,,S:ROS,v2.7
752,KG,DAMP/HAMP,oligogalacturonides,damage/host associated molecular patterns,,v2.6
933,AG,tZRMP,9-ribosyl-trans-zeatin-5'-monophosphate,,Hormone:CK,vNA
939,AG,DHZ,dihydrozeatin,,Hormone:CK,vNA


In [26]:
df_metabolites_family = df_metabolites_has_family.groupby('Family').agg({
                                                                  'Family':lambda x:list(x)[0],
                                                                  'AddedBy':lambda x:list(x)[0], 
                                                                  'NodeDescription':lambda x: ', '.join(x), 
                                                                  'AdditionalInfo':lambda x: helpers.list_to_string(x), 
                                                                  'Process':lambda x:list(x)[0], 
                                                                  'ModelV':helpers.get_latest_model})

In [27]:
df_metabolites_family

Unnamed: 0_level_0,Family,AddedBy,NodeDescription,AdditionalInfo,Process,ModelV
Family,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
DAMP/HAMP,DAMP/HAMP,KG,oligogalacturonides,damage/host associated molecular patterns,,v2.6
DHZ,DHZ,AG,dihydrozeatin,,Hormone:CK,v0.0
DHZ7G,DHZ7G,AG,7-(alpha-D-glucosyl)dihydrozeatin,,Hormone:CK,v0.0
DHZ9G,DHZ9G,AG,9-(alpha-D-glucosyl)dihydrozeatin,,Hormone:CK,v0.0
DHZOG,DHZOG,AG,dihydrozeatin-O-glucoside,,Hormone:CK,v0.0
DHZR,DHZR,AG,dihydrozeatin riboside,,Hormone:CK,v0.0
DHZROG,DHZROG,AG,dihydrozeatin riboside-O-glucoside,,Hormone:CK,v0.0
PostROS,PostROS,MZ,water,,S:ROS,v2.7
PreROS,PreROS,MZ,ROS precursors; oxygen,,S:ROS,v2.6
ROS,ROS,MZ,"hydrogen peroxide (reactive oxygen species), s...",,S:ROS,v2.6


In [28]:
f = '%s-components.tsv'%label
df_metabolites_family.to_csv("../data/import/" + f, sep="\t", index=None)

In [29]:
q = helpers.component_node_query(f, label, 
                     n_name="line.Family"
                    )

In [30]:
qr = graph.run(q)
if not df_metabolites_family.shape[0] == qr.stats()['nodes_created']:
    raise Exception 

In [31]:
# Metabolite to MetaboliteFamily edges
want_columns = ['AddedBy', 'Family', 'NodeName', 'ModelV']
edge_type = 'TYPE_OF'
f = '%s-edges.tsv'%edge_type
df_metabolites_has_family[want_columns].to_csv("../data/import/" + f, sep="\t", index=None)

In [32]:
q = helpers.make_create_edge_query(f, edge_type, 
                           source_label="Metabolite", target_label="MetaboliteFamily",
                           source_name="line.NodeName", target_name="line.Family")

In [33]:
qr = graph.run(q)
if not df_metabolites_has_family.shape[0] == qr.stats()['relationships_created']:
    raise Exception

## Pathogens (External)

In [34]:
df_external = df_components[df_components["NodeLabel"].isin(["ExternalCoding", "ExternalNonCoding"])].copy()

In [35]:
df_external.columns = ['AddedBy', 'O-Species', 'NodeType', 'classification', 'species',
       'NodeID', 'NodeName', 'ModelStatus', 'NodeDescription',
       'AdditionalInfo', 'ExtDBlink', 'Process', 'ExternalDB', 'ModelV',
       'GMM_OCD1', 'GMM_OCD', 'GMM_Description', 'GMM_ShortName',
       'GMM_Synonyms', 'NodeLabel', 'observed_species',
       'also_observed_in', 'chebi_identifier', 'pubmed_identifier']

In [36]:
want_columns = ['AddedBy',
       'NodeID', 'NodeName', 'NodeDescription',
       'AdditionalInfo', 'Process', 'ExternalDB', 'ModelV',
       'GMM_OCD1', 'GMM_OCD', 'GMM_Description', 'GMM_ShortName',
       'GMM_Synonyms', 'chebi_identifier', 'pubmed_identifier']

In [37]:
df_external[['O-Species', 'NodeType', 'classification', 'species', 'NodeID', 'NodeName', 'observed_species']]

Unnamed: 0,O-Species,NodeType,classification,species,NodeID,NodeName,observed_species
451,external,pathogen_coding,virus,potyvirus,6K1,6K1,external
452,external,pathogen_coding,virus,potyvirus,6K2,6K2,external
453,external,pathogen_coding,virus,potyvirus,CI,CI,external
454,external,pathogen_coding,virus,potyvirus,CP,CP,external
455,external,pathogen_coding,virus,potyvirus,HC-Pro,HC-Pro,external
456,external,pathogen_coding,virus,potyvirus,NIa-Pro,NIa-Pro,external
457,external,pathogen_coding,virus,potyvirus,NIb,NIb,external
458,external,pathogen_coding,virus,potyvirus,P1,P1,external
459,external,pathogen_coding,virus,potyvirus,P3,P3,external
460,external,pathogen_coding,virus,potyvirus,P3N-PIPO,P3N-PIPO,external


In [38]:
for label, subdf in df_external.groupby('NodeLabel'):
    print(label, end='\t')
    f = '%s-components.tsv'%label
    subdf[want_columns].to_csv("../data/import/" + f, sep="\t", index=None)
    
    q = helpers.component_node_query(f, label, 
                     n_name="line.NodeName"
                    )
    
    qr = graph.run(q)
    if not subdf.shape[0] == qr.stats()['nodes_created']:
        raise Exception

ExternalCoding	

In [39]:
external_organisms = df_external.groupby('species').agg({
                                                      'classification':lambda x:list(x)[0],
                                                      'AddedBy':lambda x:list(x)[0], 
                                                      #'NodeDescription':lambda x: ', '.join(x), 
                                                      #'AdditionalInfo':lambda x: ', '.join(x), 
                                                      #'Process':lambda x:list(x)[0], 
                                                      'ModelV':helpers.get_latest_model}).reset_index()

In [40]:
external_organisms

Unnamed: 0,species,classification,AddedBy,ModelV
0,oomycete,fungi,KG,v0.0
1,potyvirus,virus,KG,v2.6
2,trichous-bacteria,bacteria,KG,


In [41]:
label = 'ExternalOrganism'
f = '%s-components.tsv'%label
external_organisms.to_csv("../data/import/" + f, sep="\t", index=None)

In [42]:
q = helpers.component_node_query(f, label, 
                     n_name="line.species", 
                    )

In [43]:
qr = graph.run(q)
if not external_organisms.shape[0] == qr.stats()['nodes_created']:
    raise Exception

In [44]:
# External to ExternalOrganism edges
want_columns = ['NodeName', 'species', 'ModelV']
edge_type = 'AGENT_OF'
f = '%s-edges.tsv'%edge_type
df_external[want_columns].to_csv("../data/import/" + f, sep="\t", index=None)

In [45]:
q = helpers.make_create_edge_query(f, edge_type, 
                           source_label="", target_label="ExternalOrganism",
                           source_name="line.NodeName", target_name="line.species")

In [46]:
qr = graph.run(q)
if not df_external.shape[0] == qr.stats()['relationships_created']:
    raise Exception

## Bio-elements

In [47]:
df_bioelements = df_components[df_components['NodeLabel'].isin(['PlantCoding', 
                                                                 'PlantNonCoding',
                                                                 'PlantAbstract', 
                                                                 'Complex'
                                                                ])].copy()

In [48]:
df_bioelements['Species'].unique()

array(['ath', 'plant_all', 'stu', 'osa'], dtype=object)

In [49]:
pd.value_counts(df_bioelements['NodeLabel'])

PlantCoding       825
PlantAbstract      12
PlantNonCoding     12
Complex             1
Name: NodeLabel, dtype: int64

In [50]:
all_species_listed = list(df_bioelements['Species'].unique()); all_species_listed

['ath', 'plant_all', 'stu', 'osa']

In [51]:
all_species = all_species_listed.copy()

In [52]:
all_species.remove('plant_all')
all_species.remove('all')

ValueError: list.remove(x): x not in list

In [53]:
all_species

['ath', 'stu', 'osa']

In [54]:
def pick_the_set(x):
    for v in x:
        if type(v) == set:
            return v
    return {}


def get_species_homologues(level):
    df_level_species  = df_bioelements.groupby([level, 'Species']).agg({
      'NodeID':lambda x: set(x), 
    })
    df_level_species.reset_index(1, inplace=True)

    cols = []
    for specie in all_species:
        col = specie + '_homologues'
        cols.append(col)
        df_level_species.loc[df_level_species['Species'] == specie, col] =\
         df_level_species[df_level_species['Species'] == specie]['NodeID']
    
    df_level_species = df_level_species[cols].groupby(level).agg({
        y:pick_the_set for y in cols
    })
    
    df_level = df_bioelements.groupby(level).agg({
          'AddedBy':lambda x:list(x)[0], 
          'NodeLabel':lambda x:list(x)[0], 
          'NodeDescription':lambda x: ', '.join(list(set(x))), 
          'AdditionalInfo':lambda x: helpers.list_to_string(x), 
          'Process':lambda x:list(x)[0], 
          'ModelV':helpers.get_latest_model, 
          'Species':lambda x: set(x), 
    })    

    df_level = df_level.join(df_level_species[cols])
    df_level.reset_index(inplace=True)
    
    return df_level

In [55]:
df_families = get_species_homologues("Family")

In [56]:
# save node types 
family_node_labels = []
for t, subdf in df_families.groupby("NodeLabel"):
    print(t, "\t", subdf.shape[0])
    subdf.to_csv("../data/import/%s-components.tsv"%t, sep="\t", index=None)
    family_node_labels.append(t)

Complex 	 1
PlantAbstract 	 7
PlantCoding 	 161
PlantNonCoding 	 9


In [57]:
for t in family_node_labels:
    query = helpers.component_node_query("%s-components.tsv"%t, n_name="line.Family", labels=[t, 'Family'])
    #query = f"MATCH (n:{t}) DELETE n"
    qr = graph.run(query)
    print(t, "\t", qr.stats()['nodes_created'])

Complex 	 1
PlantAbstract 	 7
PlantCoding 	 161
PlantNonCoding 	 9


In [58]:
file_name = os.path.join(input_path, "bio_elements.tsv")
df_bioelements.to_csv(file_name, sep="\t", index=False)

## Process

In [59]:
df_process = df_components[df_components['NodeLabel'].isin(['Process'
                                                                ])].copy()

In [60]:
want_columns = ['AddedBy',
       'NodeName', 'NodeDescription', 'Process', 
       'AdditionalInfo', 'Process', 'ExternalDB', 'ModelV',
       'GMM_OCD1', 'GMM_OCD', 'GMM_Description', 'GMM_ShortName',
       'GMM_Synonyms', 'chebi_identifier', 'pubmed_identifier', 'observed_species']

In [61]:
label = "Process"
f = "%s-components.tsv"%label
df_process[want_columns].to_csv("../data/import/" + f, sep="\t", index=None)

In [62]:
q = helpers.component_node_query(f, label, 
                     n_name="line.NodeName", 
                    )

In [63]:
qr = graph.run(q)
if not df_process.shape[0] == qr.stats()['nodes_created']:
    raise Exception

In [64]:
qr.stats()

constraints_added: 0
constraints_removed: 0
contains_updates: True
indexes_added: 0
indexes_removed: 0
labels_added: 6
labels_removed: 0
nodes_created: 6
nodes_deleted: 0
properties_set: 29
relationships_created: 0
relationships_deleted: 0

# END 

## Protein, Clade, Family keys

In [19]:
input_path

'../data/raw'

In [20]:
file_name = os.path.join(input_path, "bio_elements.tsv")
df_bioelements.to_csv(file_name, sep="\t", index=False)

In [128]:
id_to_name = df_bioelements[['NodeID', 'NodeName']]

In [129]:
id_to_name[id_to_name['NodeName'].duplicated()]['NodeName'].unique()

array(['CPS', 'GA20ox3', 'GA20ox1', 'GA20ox4', 'MYB33'], dtype=object)

In [130]:
df_nodes = get_species_homologues('NodeName')
df_nodes.set_index('NodeName', inplace=True)

In [131]:
# node name to node IDs
node_ids_key = {}
for species in all_species:
    col = species + '_homologues'
    node_ids_key[col] = df_nodes[col].to_dict()

In [132]:
df_clades = get_species_homologues('Clade')
df_clades.set_index('Clade', inplace=True)

In [133]:
# clade name to node IDs
clade_ids_key = {}
for species in all_species:
    col = species + '_homologues'
    clade_ids_key[col] = df_clades[col].to_dict() 

In [134]:
# family name to node IDs
df_families.set_index("Family", inplace=True)
family_ids_key = {}
for species in all_species:
    col = species + '_homologues'
    family_ids_key[col] = df_families[col].to_dict() 

In [135]:
node_to_family = df_bioelements[["NodeName", "Family"]].set_index("NodeName")["Family"].to_dict()

In [136]:
clade_to_family = df_bioelements[["Clade", "Family"]].set_index("Clade")["Family"].to_dict()

In [143]:
df_process['NodeName']

780    Anthocyanin-accumulation
781         Trichome-initiation
782               RNA-silencing
783              ROS-production
784              AUX-signalling
785              CO2-deficiency
Name: NodeName, dtype: object

In [144]:
df_components["Process"].unique()

array(['Hormone:ET', 'Hormone:JA', 'Hormone:SA', 'Hormone:AUX',
       'Hormone:BS', 'Hormone:GA', 'S:Ca', 'P:Stress_HSPs', 'S:MAPKs',
       'Pathogen', 'Pathogen_Effector', 'S:ROS', 'S:TFs',
       'P:Primary-metabolism_Photosynthesis',
       'P:Primary-metabolism_Respiration',
       'P:Secondary-metabolism_Anthocyanins',
       'P:Secondary-metabolism_Isoprenoids',
       'P:Secondary-metabolism_Terpenoids', 'P:Ubiquitination',
       'P:Silencing', 'P:Translation', nan, 'S:Rgenes', 'Hormone:CK',
       'Hormone:SLs'], dtype=object)

## Components summary

In [145]:
q = '''MATCH (n) RETURN DISTINCT n.name AS name, n.level AS level'''
nodes = graph.run(q).data()
all_nodes_in_components = set([(d["name"], d["level"]) for d in nodes])

In [146]:
len(all_nodes_in_components)

313

In [147]:
sorted(df_components['NodeLabel'].unique())

['Complex',
 'ExternalCoding',
 'Metabolite',
 'PlantAbstract',
 'PlantCoding',
 'PlantNonCoding',
 'Process',
 'Undefined']

In [93]:
node_dict = {}
for label in node_labels:
    q = '''MATCH (n:%s) RETURN DISTINCT n.name'''%label
    s = set([d['n.name'] for d in graph.run(q).data()])
    print(label, len(s))
    node_dict[label] = s


PlantCoding 142
PlantNonCoding 9
PlantAbstract 7
Complex 1
ExternalOrganism 3
ExternalCoding 14
ExternalNonCoding 0
Process 6
MetaboliteFamily 4
Metabolite 59


### Reactions sheet

In [94]:
df_edges_orig = pd.read_excel(input_path, 
                         sheet_name="Reactions", 
                         header=[1], 
                         dtype=str, 
                         na_values=helpers.empty_strings)
df_edges_orig = df_edges_orig[df_edges_orig['Status'].isin(["forCB", "forCB_INVENTED"])]
df_edges_orig.drop(['Status', 'FOXMES', 'Legacy:Process', 'Legacy:ReactionMode'], axis=1, inplace=True)
df_edges_orig.columns = pd.Index(['AddedBy', 'ConnID', 'Species', 
          'input1_ID', 'input1_level', 'input1_localisation', 'input1_type', 
          'input2_ID', 'input2_level', 'input2_localisation', 'input2_type', 
          'input3_ID', 'input3_level', 'input3_localisation', 'input3_type', 
          'ReactionEffect', 'ReactionMode', 'Modifications',
          'output1_ID', 'output1_level', 'output1_localisation', 'output1_type', 
          'TrustLevel', 'Literature', 'AdditionalInfo', 'Comment', 'ModelV', 'kinetics'],
      dtype='object')

df_edges_new = pd.read_excel(input_path, 
                         sheet_name="Reactions_New", 
                         header=[1], 
                         dtype=str, 
                         na_values=helpers.empty_strings)
df_edges_new = df_edges_new[~df_edges_new['AddedBy'].isin(['-'])]
df_edges_new.drop(['Status'], axis=1, inplace=True)
df_edges_new.columns = pd.Index(['AddedBy', 'ConnID', 'Species', 
          'input1_ID', 'input1_level', 'input1_type', 
          'input2_ID', 'input2_level', 'input2_type', 
          'input3_ID', 'input3_level', 'input3_type',                                  
          'ReactionEffect', 'ReactionMode', 
          'output1_ID', 'output1_level', 'output1_type', 
          'TrustLevel', 'Literature', 'AdditionalInfo', 'Comment', 'ModelV'],
      dtype='object')


df_edges = pd.concat([df_edges_orig, df_edges_new], sort=False)

In [95]:
df_edges = df_edges[~df_edges["AddedBy"].isna()]

In [96]:
df_edges.tail()

Unnamed: 0,AddedBy,ConnID,Species,input1_ID,input1_level,input1_localisation,input1_type,input2_ID,input2_level,input2_localisation,...,output1_ID,output1_level,output1_localisation,output1_type,TrustLevel,Literature,AdditionalInfo,Comment,ModelV,kinetics
46,sb,Conn310,ath,CAT,family,,plant_coding,LSD1,node,,...,,,,,"[R1] targetted experiments (e.g. Y2H, BIFC)",DOI:10.1104/pp.113.225805,LSD1 interacted with all three catalases both ...,,vNA,
47,sb,Conn310,ath,SA,node,,metabolite,CAT2,node,,...,,,,,[R4] indirect reaction,DOI:10.1016/j.chom.2017.01.007,SA decreased CAT2 activity in a dose-dependent...,,vNA,
48,sb,Conn310,ath,CAT2,node,,plant_coding,ACX2,node,,...,,,,,"[R1] targetted experiments (e.g. Y2H, BIFC)",DOI:10.1016/j.chom.2017.01.007,CAT2 Promotes the Activityof ACX2/ACX3 ( (test...,,vNA,
49,sb,Conn310,ath,CAT2,node,,plant_coding,ACX3,node,,...,,,,,"[R1] targetted experiments (e.g. Y2H, BIFC)",DOI:10.1016/j.chom.2017.01.007,CAT2 Promotes the Activityof ACX2/ACX3 ( (test...,,vNA,
1737,x,x,x,x,x,,x,,x,,...,x,x,,x,x,x,x,x,x,


In [97]:
x = df_edges[df_edges['AddedBy']=='x'].index
print(x)
df_edges.drop(x, inplace=True)

Int64Index([1737], dtype='int64')


In [98]:
df_edges["TrustLevel"].unique()

array(['[R1] targetted experiments (e.g. Y2H, BIFC)',
       '[Ry] invented reaction', '[Rx] incomplete/unspecific reaction',
       '[R2] high-throughput experiment (e.g. ChIP-seq)',
       '[R3] in-silico prediction', '[R4] indirect reaction'],
      dtype=object)

In [99]:
df_edges['trust_level']  = df_edges["TrustLevel"].apply(lambda x: re.search( r"(R[1|2|3|4|x|y]|undefined)", x).groups()[0])
df_edges['observed_species'] = df_edges["Species"].apply(helpers.get_second_item)
df_edges['also_observed_in'] = df_edges["Species"].apply(helpers.rest_of_items)
df_edges['Comment'] = df_edges['Comment'].fillna("")
df_edges['AdditionalInfo'] = df_edges['AdditionalInfo'].fillna("")

In [100]:
df_edges['AddedBy'] = df_edges['AddedBy'].apply(lambda x: x.upper())
df_edges["AddedBy"].unique()

array(['KG', 'MZ', 'ZR', 'MPE', 'ACR', 'MAK', 'ŠT', 'SB'], dtype=object)

In [101]:
df_edges.loc[df_edges['ModelV'].isna(), 'ModelV'] = 'vNA'
df_edges['ModelV'].unique()

array(['v1.0', 'v2.5', 'v2.7', 'v2.6', 'vNA'], dtype=object)

In [102]:
def only_asci(x):
    return "".join([character for character in x if character.isascii()])

def doi_list(x):
    x = only_asci(x.lower())
    match = re.findall("(?:doi)(?:\:|\/)\s*(.+?)(?:\s|$)", x)
    if not match is None:
        return ["doi:" + m.rstrip('.') for m in match]
    else:
        return []

def pubmed_list(x):
    x = only_asci(x.lower())
    match = re.findall("(?:pmid)\:\s*(.+?)(?:\s|$)", x)
    if not match is None:
        return ["pmid:" + m.rstrip('.') for m in match]
    else:
        return []

def list_to_string(x):
    return ",".join([str(i) for i in x])

In [103]:
# format literature sources
df_edges["Literature"].fillna(value="", inplace=True)
for i, row in df_edges.iterrows():
    s = row['Literature']
    source = doi_list(s)
    source += pubmed_list(s)
    for z in s.split("|"):
        key = z.lower()
        if ":" in key:
            if "aracyc" in key:
                aracyc_string = "aracyc:" + z.split(":")[1].strip()
                source.append(aracyc_string)
            elif "kegg" in key:
                kegg_string = "kegg:" + z.split(":")[1].strip()
                source.append(kegg_string)
            elif "doi" in key:
                # already fetched
                continue
            elif ("pubmed" in key) or ("pmid" in key):
                pmid_string = "pubmed:" + z.split(":")[1].strip()
        elif "invented" in key:
            source.append("invented")
        else:
            print("no/bad reference", row["ConnID"], z)
            source.append("other:" + only_asci(z.strip()))
    if len(source) > 0:
        df_edges.loc[i, "literature_sources"] = list_to_string(source)
    else:
        print(row["ConnID"], z)


no/bad reference Conn040 
no/bad reference Conn118 
no/bad reference Conn120 
no/bad reference Conn122 
no/bad reference Conn182 
no/bad reference Conn183 
no/bad reference Conn199 ? Kg need to find reference
no/bad reference Conn273 KEGG 
no/bad reference Conn274 KEGG 
no/bad reference Conn275 KEGG 
no/bad reference Conn276 KEGG 
no/bad reference Conn276  10.1073/pnas.98.4.2065
no/bad reference Conn277 KEGG 
no/bad reference Conn278 KEGG 
no/bad reference Conn279 KEGG 
no/bad reference Conn280 KEGG 
no/bad reference Conn281 KEGG 
no/bad reference Conn282 KEGG 
no/bad reference Conn283 KEGG 
no/bad reference Conn309 


In [104]:
df_edges[["ConnID", "Literature", "literature_sources"]].to_csv("lit-check.tsv", sep="\t", index=None)

In [105]:
df_edges.reset_index(inplace=True, drop=True)

In [106]:
save_df = df_edges.copy()
#df_edges = save_df.copy()

In [107]:
df_edges[df_edges['ConnID'].duplicated()]

Unnamed: 0,AddedBy,ConnID,Species,input1_ID,input1_level,input1_localisation,input1_type,input2_ID,input2_level,input2_localisation,...,TrustLevel,Literature,AdditionalInfo,Comment,ModelV,kinetics,trust_level,observed_species,also_observed_in,literature_sources
74,KG,Conn080,ath,Trichome-initiation,family,,process,potyvirus,family,,...,[Ry] invented reaction,"DOI:10.1105/tpc.111.083261 (Qi, 2011: The jasm...",,,v2.7,,Ry,ath,,doi:10.1105/tpc.111.083261
260,MAK,,ath,"GID1a,b,c",clade/orthologue,,plant_coding,GA3,family,,...,"[R1] targetted experiments (e.g. Y2H, BIFC)",DOI:10.1105/tpc.106.047415,,,vNA,,R1,ath,,doi:10.1105/tpc.106.047415
261,MAK,,ath,"GID1a,b,c",clade/orthologue,,plant_coding,GA4,family,,...,"[R1] targetted experiments (e.g. Y2H, BIFC)",DOI:10.1105/tpc.106.047415,,,vNA,,R1,ath,,doi:10.1105/tpc.106.047415
268,MAK,,ath,GA1-GID1,family,,plant_complex,DELLA,family,,...,"[R1] targetted experiments (e.g. Y2H, BIFC)",DOI:10.1104/pp.112.200956,,,vNA,,R1,ath,,doi:10.1104/pp.112.200956
269,MAK,,ath,GA3-GID1,family,,plant_complex,DELLA,family,,...,"[R1] targetted experiments (e.g. Y2H, BIFC)",DOI:10.1104/pp.112.200956,,,vNA,,R1,ath,,doi:10.1104/pp.112.200956
270,MAK,,ath,GA4-GID1,family,,plant_complex,DELLA,family,,...,"[R1] targetted experiments (e.g. Y2H, BIFC)",DOI:10.1104/pp.112.200956,,,vNA,,R1,ath,,doi:10.1104/pp.112.200956
271,MAK,,ath,SLY1,node,,plant_coding,SCF,family,,...,"[R1] targetted experiments (e.g. Y2H, BIFC)",DOI:10.1105/tpc.010827,,,vNA,,R1,ath,,doi:10.1105/tpc.010827
272,MAK,,osa,GID2,node,,plant_coding,SCF,family,,...,"[R1] targetted experiments (e.g. Y2H, BIFC)",DOI:10.1126/science.1081077,,,vNA,,R1,osa,,doi:10.1126/science.1081077
273,MAK,,ath,GA1-GID1-DELLA,family,,plant_complex,SCF-SLY1,family,,...,"[R1] targetted experiments (e.g. Y2H, BIFC)",DOI:10.1104/pp.112.200956 | DOI:10.1105/tpc.01...,,,vNA,,R1,ath,,"doi:10.1104/pp.112.200956,doi:10.1105/tpc.010827"
274,MAK,,osa,GA1-GID1-SLR1,family,,plant_complex,SCF-GID2,family,,...,"[R1] targetted experiments (e.g. Y2H, BIFC)",DOI:10.1104/pp.112.200956 | https://doi.org/10...,,,vNA,,R1,osa,,doi:10.1104/pp.112.200956


In [108]:
df_edges.head()

Unnamed: 0,AddedBy,ConnID,Species,input1_ID,input1_level,input1_localisation,input1_type,input2_ID,input2_level,input2_localisation,...,TrustLevel,Literature,AdditionalInfo,Comment,ModelV,kinetics,trust_level,observed_species,also_observed_in,literature_sources
0,KG,Conn001,ath,L-Met,family,ER,metabolite,SAMS,clade,ER,...,"[R1] targetted experiments (e.g. Y2H, BIFC)",AraCyc:ETHYL-PWY,SAMS catalyse L-Met to SAMe reaction.,,v1.0,,R1,ath,,"doi:10.1042/bj20120245,other:KEGG"
1,KG,Conn002,ath,SAMe,family,ER,metabolite,ACS,family,ER,...,"[R1] targetted experiments (e.g. Y2H, BIFC)",AraCyc:ETHYL-PWY,ACS catalyse ACC to SAMe reaction.,,v1.0,,R1,ath,,"doi:10.1042/bj20120245,other:KEGG"
2,KG,Conn003,ath,ACC,family,ER,metabolite,ACO,family,ER,...,"[R1] targetted experiments (e.g. Y2H, BIFC)",AraCyc:ETHYL-PWY,ACO catalyse ACC to ET reaction.,,v1.0,,R1,ath,,"doi:10.1042/bj20120245,other:KEGG"
3,KG,Conn004,ath,Cu2+,family,cytoplasm,metabolite,HMA,family,ER,...,"[R1] targetted experiments (e.g. Y2H, BIFC)",DOI:10.1105/tpc.001768 (Ethylene Biosynthesis ...,Copper gets transported from the cytoplasm to ...,,v1.0,,R1,ath,,"doi:10.1042/bj20120245,other:KEGG,other:10.107..."
4,KG,Conn005,ath,ETR,family,ER,protein,Cu2+,family,ER,...,"[R1] targetted experiments (e.g. Y2H, BIFC)",DOI:10.1105/tpc.001768 (Ethylene Biosynthesis ...,Copper activates the membrane bound ethylene r...,,v1.0,,R1,ath,,"doi:10.1042/bj20120245,other:KEGG"


In [109]:
for x in ['input1', 'input2', 'input3', 'output1']:
    df_edges.loc[:, x + "_ID"] = df_edges[x + "_ID"].apply(helpers.reorder_ids)

In [110]:
df_edges.head()

Unnamed: 0,AddedBy,ConnID,Species,input1_ID,input1_level,input1_localisation,input1_type,input2_ID,input2_level,input2_localisation,...,TrustLevel,Literature,AdditionalInfo,Comment,ModelV,kinetics,trust_level,observed_species,also_observed_in,literature_sources
0,KG,Conn001,ath,L-Met,family,ER,metabolite,SAMS,clade,ER,...,"[R1] targetted experiments (e.g. Y2H, BIFC)",AraCyc:ETHYL-PWY,SAMS catalyse L-Met to SAMe reaction.,,v1.0,,R1,ath,,"doi:10.1042/bj20120245,other:KEGG"
1,KG,Conn002,ath,SAMe,family,ER,metabolite,ACS,family,ER,...,"[R1] targetted experiments (e.g. Y2H, BIFC)",AraCyc:ETHYL-PWY,ACS catalyse ACC to SAMe reaction.,,v1.0,,R1,ath,,"doi:10.1042/bj20120245,other:KEGG"
2,KG,Conn003,ath,ACC,family,ER,metabolite,ACO,family,ER,...,"[R1] targetted experiments (e.g. Y2H, BIFC)",AraCyc:ETHYL-PWY,ACO catalyse ACC to ET reaction.,,v1.0,,R1,ath,,"doi:10.1042/bj20120245,other:KEGG"
3,KG,Conn004,ath,Cu2+,family,cytoplasm,metabolite,HMA,family,ER,...,"[R1] targetted experiments (e.g. Y2H, BIFC)",DOI:10.1105/tpc.001768 (Ethylene Biosynthesis ...,Copper gets transported from the cytoplasm to ...,,v1.0,,R1,ath,,"doi:10.1042/bj20120245,other:KEGG,other:10.107..."
4,KG,Conn005,ath,ETR,family,ER,protein,Cu2+,family,ER,...,"[R1] targetted experiments (e.g. Y2H, BIFC)",DOI:10.1105/tpc.001768 (Ethylene Biosynthesis ...,Copper activates the membrane bound ethylene r...,,v1.0,,R1,ath,,"doi:10.1042/bj20120245,other:KEGG"


In [111]:
def convert_node_to_family(x):
    # pathogen proteins are listed as proteins, so cannot use dict
    id_, type_, level_ = x.values
      
    if id_ is np.nan:
        return np.nan, np.nan

    new_label = None
    family_id = None
    
    ########################
    # Simple Cases
    ########################
    if type_ in ['complex', 'complex [active]', 'complex [activated]', 'complex [inactive]', 'plant_complex']:
        if not (id_ in node_dict["Complex"]):
            complexes_to_add.append(id_)
        new_label = 'Complex'
        family_id = id_
    
    elif type_ in ["metabolite"]:
        for label in ["Metabolite", "MetaboliteFamily"]:
            if id_ in node_dict[label]:
                new_label = label
                break
        if not new_label:
            missing_in_components.update([id_])        
            print(id_, type_, level_, " | (label) not a listed metabolite")
        family_id = id_

    elif type_ in ['process']:
        if (id_ in node_dict["Process"]):
            family_id = id_
            new_label = "Process"
        else:
            print(id_, type_, level_, " | (label) process not a listed process")
        
    else:
        ########################
        # family ID
        ########################
        check_external = False
        if level_ == "family":    
            family_id = id_
        elif level_ in ["clade", "clade/orthologue"]:
            try:
                family_id = clade_to_family[id_]
                replace_w_family.update([id_])
            except KeyError:
                check_external = True
        elif level_ == "node":
            try:
                family_id = node_to_family[id_]
                replace_w_family.update([id_])
            except KeyError:
                check_external = True
        
        if check_external:
            for label in ["ExternalOrganism", "ExternalCoding", "ExternalNonCoding"]:
                if id_ in node_dict[label]:
                    new_label = label
                    family_id = id_
                    break
                    
        if not family_id:
            print(id_, type_, level_, " | (family id) could not convert to family/external")
                
        ########################
        # Label
        ########################
        if (family_id) and (not new_label):
            id_labels = [] #looping just in case an id occurs mutiple times
            for label in node_labels:
                if family_id in node_dict[label]:
                    id_labels.append(label)

            if len(id_labels) == 1:
                new_label = id_labels[0]
            elif len(id_labels) > 1:
                print(id_, type_, level_, " | (label) many labels fit")
                missing_in_components.update([id_])       

            else:
                print(id_, type_, level_, " | (label) could not find label")
                missing_in_components.update([id_])        

    return family_id, new_label
        

In [112]:
complexes_to_add = []
missing_in_components = set()
replace_w_family = set()

for prefix in ['input1', 'input2', 'input3', 'output1']:
    id_col, type_col, level_col, new_id, new_label_col  =\
            [prefix + x for x in ('_ID',  '_type',  '_level',  '_newID', '_label')]    
    
    df_edges[[new_id, new_label_col]] = df_edges[[id_col, type_col, level_col]].apply(convert_node_to_family, axis=1, result_type='expand')


miR6022 plant_ncRNA node  | (family id) could not convert to family/external
LSD1 plant_coding node  | (family id) could not convert to family/external


In [113]:
df_edges.head()

Unnamed: 0,AddedBy,ConnID,Species,input1_ID,input1_level,input1_localisation,input1_type,input2_ID,input2_level,input2_localisation,...,also_observed_in,literature_sources,input1_newID,input1_label,input2_newID,input2_label,input3_newID,input3_label,output1_newID,output1_label
0,KG,Conn001,ath,L-Met,family,ER,metabolite,SAMS,clade,ER,...,,"doi:10.1042/bj20120245,other:KEGG",L-Met,Metabolite,SAM,PlantCoding,,,SAMe,Metabolite
1,KG,Conn002,ath,SAMe,family,ER,metabolite,ACS,family,ER,...,,"doi:10.1042/bj20120245,other:KEGG",SAMe,Metabolite,ACS,PlantCoding,,,ACC,Metabolite
2,KG,Conn003,ath,ACC,family,ER,metabolite,ACO,family,ER,...,,"doi:10.1042/bj20120245,other:KEGG",ACC,Metabolite,ACO,PlantCoding,,,ET,Metabolite
3,KG,Conn004,ath,Cu2+,family,cytoplasm,metabolite,HMA,family,ER,...,,"doi:10.1042/bj20120245,other:KEGG,other:10.107...",Cu2+,Metabolite,HMA,PlantCoding,,,Cu2+,Metabolite
4,KG,Conn005,ath,ETR,family,ER,protein,Cu2+,family,ER,...,,"doi:10.1042/bj20120245,other:KEGG",ETR,PlantCoding,Cu2+,Metabolite,,,ETR,PlantCoding


In [114]:
def get_orthologues(x, prefix=""):
    id_, level_, label_ = x.values
    ########################
    # Specie specific homologues
    ########################
    
    return_D = {f"{prefix}_{specie}_homologues":"" for specie in all_species}
    
    if label_ in ['PlantCoding', 'PlantNonCoding',  'PlantAbstract']:

        for species in all_species:
            species = f"{species}_homologues"
            k = f"{prefix}_{species}"
            if level_ == 'node':
                return_D[k] = node_ids_key[species][id_]
            elif level_ == 'clade':
                return_D[k] =  clade_ids_key[species][id_]
            elif level_ == 'family':
                return_D[k] =  family_ids_key[species][id_]

    return_D = {x:helpers.list_to_string(list(return_D[x])) for x in return_D}
    return return_D

In [115]:
new_dfs = []
for prefix in ['input1', 'input2', 'input3', 'output1']:
    print(prefix)
    id_col, level_col, new_label_col  =\
                [prefix + x for x in ('_ID',  '_level',  '_label')]        
    
    new_df = df_edges[[id_col, level_col, new_label_col ]].apply(get_orthologues, axis=1, result_type='expand', prefix=prefix)
    new_dfs.append(new_df)
    #df_edges = df_edges.join(new_df, sort=False)

input1
input2
input3
output1


In [116]:
homologues_df = pd.concat(new_dfs, sort=False, axis=1)

In [117]:
homologues_df.loc[0]

input1_ath_homologues                                            
input1_osa_homologues                                            
input1_stu_homologues                                            
input2_ath_homologues     AT2G36880,AT1G02500,AT4G01850,AT3G17390
input2_osa_homologues                                            
input2_stu_homologues                                            
input3_ath_homologues                                            
input3_osa_homologues                                            
input3_stu_homologues                                            
output1_ath_homologues                                           
output1_osa_homologues                                           
output1_stu_homologues                                           
Name: 0, dtype: object

In [118]:
df_edges = df_edges.join(homologues_df, sort=False)

In [119]:
df_edges.head()

Unnamed: 0,AddedBy,ConnID,Species,input1_ID,input1_level,input1_localisation,input1_type,input2_ID,input2_level,input2_localisation,...,input1_stu_homologues,input2_ath_homologues,input2_osa_homologues,input2_stu_homologues,input3_ath_homologues,input3_osa_homologues,input3_stu_homologues,output1_ath_homologues,output1_osa_homologues,output1_stu_homologues
0,KG,Conn001,ath,L-Met,family,ER,metabolite,SAMS,clade,ER,...,,"AT2G36880,AT1G02500,AT4G01850,AT3G17390",,,,,,,,
1,KG,Conn002,ath,SAMe,family,ER,metabolite,ACS,family,ER,...,,"AT5G65800,AT5G51690,AT4G26200,AT3G49700,AT1G01...",,,,,,,,
2,KG,Conn003,ath,ACC,family,ER,metabolite,ACO,family,ER,...,,"AT1G62380,AT1G12010,AT1G77330,AT1G05010,AT2G19590",,,,,,,,
3,KG,Conn004,ath,Cu2+,family,cytoplasm,metabolite,HMA,family,ER,...,,"AT5G44790,AT5G21930,AT4G33520,AT1G63440",,,,,,,,
4,KG,Conn005,ath,ETR,family,ER,protein,Cu2+,family,ER,...,,,,,,,,"AT2G40940,AT1G66340,AT3G23150,AT1G04310,AT3G04580",,


In [120]:
node_type_to_node_form_dict = {
    "gene":"gene",
    
    "protein":"protein",
    "protein [activated]":"protein_active",
    'protein [active]': "protein_active",
    
    "ncRNA":"ncRNA",
    "plant_ncRNA":"ncRNA",
    'ta-siRNA':"ta-siRNA", 
        
    "complex":"complex", 
    "plant_complex":"complex",
    'complex [active]': "complex_active",
    
    "metabolite":"metabolite",
    
    "process":"process", 
    'process [active]':"process_active",

    np.nan:"", 
    "plant_coding":"unknown"
}

for prefix in ['input1', 'input2', 'input3', 'output1']:
    id_col, type_col, new_form_col  =\
        [prefix + x for x in ('_ID',  '_type',  '_form')]
    
    x = df_edges[[id_col, type_col]].dropna(how='all')
    
    print(prefix)
    df_edges[new_form_col] = df_edges[type_col].apply(lambda x: node_type_to_node_form_dict[x])

input1
input2
input3
output1


In [121]:
df_edges.head()

Unnamed: 0,AddedBy,ConnID,Species,input1_ID,input1_level,input1_localisation,input1_type,input2_ID,input2_level,input2_localisation,...,input3_ath_homologues,input3_osa_homologues,input3_stu_homologues,output1_ath_homologues,output1_osa_homologues,output1_stu_homologues,input1_form,input2_form,input3_form,output1_form
0,KG,Conn001,ath,L-Met,family,ER,metabolite,SAMS,clade,ER,...,,,,,,,metabolite,protein_active,,metabolite
1,KG,Conn002,ath,SAMe,family,ER,metabolite,ACS,family,ER,...,,,,,,,metabolite,protein_active,,metabolite
2,KG,Conn003,ath,ACC,family,ER,metabolite,ACO,family,ER,...,,,,,,,metabolite,protein_active,,metabolite
3,KG,Conn004,ath,Cu2+,family,cytoplasm,metabolite,HMA,family,ER,...,,,,,,,metabolite,protein,,metabolite
4,KG,Conn005,ath,ETR,family,ER,protein,Cu2+,family,ER,...,,,,"AT2G40940,AT1G66340,AT3G23150,AT1G04310,AT3G04580",,,protein,metabolite,,protein_active


In [163]:
node_localisation_dict = {
    'nuc':'nucleus',
    'er':'endoplasmic reticulum',
    'golgi':'golgi apparatus'
}


good_localisations = set([
     'nucleus',
     'nucleolus',
     'cytoplasm',
     'vacuole',
     'endoplasmic reticulum',
     'chloroplast',
     'mitochondrion',
     'golgi apparatus',
     'peroxisome',
     'apoplast',
     'extracellular'
])


def node_localisation_std(x):
    if not type(x) == str:
        return ""
    
    x = x.lower()
    if x in node_localisation_dict.keys():
        x = node_localisation_dict[x]
    
    if x in good_localisations:
        return x
    else:
        return ""

node_localisations = set()
new_localisation = set()
for prefix in ['input1', 'input2', 'input3', 'output1']:
    id_col, type_col, localisation_col  =\
        [prefix + x for x in ('_ID',  '_type',  '_localisation')]
    
    x = df_edges[['ConnID', id_col, localisation_col]].dropna(how='all', subset=[ id_col, localisation_col])
    
    #for _, y in x.iterrows():
    #    if y[localisation_col] in ['mitochondria?', np.nan]:
    #        print(y['ConnID'], "\t", y[id_col], "\t", y[localisation_col])
    
    
    node_localisations.update(x[localisation_col])
    
    #print(prefix)
    df_edges[localisation_col] = df_edges[localisation_col].apply(node_localisation_std)
    
    new_localisation.update(df_edges[localisation_col])

In [164]:
node_localisations

{'ER',
 'Golgi',
 'chloroplast',
 'cytoplasm',
 'extracellular',
 'mitochondria?',
 nan,
 'nuc',
 'nucleus',
 'peroxisome',
 'vacuole'}

In [165]:
new_localisation

{'',
 'chloroplast',
 'cytoplasm',
 'endoplasmic reticulum',
 'extracellular',
 'golgi apparatus',
 'nucleus',
 'peroxisome',
 'vacuole'}

In [166]:
df_edges.to_csv(os.path.join("..", "data", "raw", "edges-sheet.tsv"), sep="\t")

In [167]:
homologue_cols = [f"{x}_homologues" for x in all_species]

In [125]:
all_species

['ath', 'stu', 'osa']

In [124]:
with open(os.path.join("..", "data", "raw", "complexes_to_add.tsv"), "w") as out:
    for c in complexes_to_add:
        out.write(f"{c}\n")