In [1]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:80% !important; }</style>"))

# Import neo4j DB: 3/4

Code to translate v2.7.4_PIS-model.xlsx to neo4j database. 

## Setup

In [2]:
import pandas as pd
import re
import numpy as np
import os
from IPython.display import Image

from collections import defaultdict

In [3]:
import helpers

In [4]:
from importlib import reload

In [5]:
from pathlib import Path

base_path = Path("..")
input_path = base_path / "data" / "raw"
output_path = base_path / "data" / "parsed"

Connect to graph via docker-compose link. See http://localhost:7474/browser/

In [6]:
from py2neo import Graph, Node, Relationship

In [7]:
graph = Graph(host="neo4j")

## Read in Bio elements
bioelements = PlantCoding/PlantNonCoding/PlantAbstract

have family hierachy

In [8]:
file_name = output_path / "bio_elements.tsv"
df_bioelements = pd.read_csv(file_name, sep="\t")
df_bioelements.head()

Unnamed: 0,identifier,AddedBy,species,NodeLabel,NodeType,Family,Clade,NodeID,NodeName,external_links,NodeDescription,AdditionalInfo,Process,ModelV,ModelStatus,gmm_ocd,GMM_Description,GMM_ShortName,synonyms
0,node0278,KG,ath,PlantAbstract,plant_abstract,BA2H,BA2H,BA2H,BA2H,,BA2H,Benzoic acid 2-hydroxylase: Isolated from toba...,Hormone:SA,v1.0,ignore,,,,
1,node0255,KG,ath,PlantAbstract,plant_abstract,IPL,IPL,IPL,IPL,,Arabidopsis contains two ICS genes but has no ...,,Hormone:SA,v1.0,ignore,,,,
2,node0695,KG,ath,PlantAbstract,plant_abstract,R-gene,GPAphid2,GPAphid2,GPAphid2,invented:unidentified,green peach aphid 2; resistance against PVX (stu),,S:Rgenes,v2.6,ignore,,,,
3,node0692,KG,ath,PlantAbstract,plant_abstract,R-gene,HRT,HRT,HRT,invented:unidentified,HR against TCV (ath),,S:Rgenes,v2.5,ignore,,,,
4,node0691,KG,ath,PlantAbstract,plant_abstract,R-gene,Ny,Ny,Ny,invented:unidentified,HR against PVY (stu),,S:Rgenes,v2.5,ignore,,,,


In [9]:
#df_bioelements[df_bioelements['NodeName'].duplicated(keep=False)].sort_values('NodeName')[['identifier', 'species', 'Family', 'Clade', 'NodeID', 'NodeName']]
repeated_node_level = df_bioelements[df_bioelements['NodeName'].duplicated(keep='first')]['NodeName'].values

In [10]:
repeated_node_level

array(['CPS', 'CPS.x1', 'CPS.x2', 'RGA2', 'RGA2', 'GA20ox.x1',
       'GA20ox.x2', 'GA20ox1', 'GA20ox1', 'GA20ox2', 'GA20ox2', 'GA20ox3',
       'GA20ox3', 'GA20ox4', 'GA20ox4', 'GA3ox.x1', 'GA3ox.x2', 'GID1.x1',
       'GID1.x2', 'GID1.x3', 'MYB33-65.x1', 'MYB33-65.x2'], dtype=object)

In [11]:
file_name = output_path / "level_translation.tsv"
translate_df = pd.read_csv(file_name, sep="\t", index_col=[0, 1])
translate_df.fillna('', inplace=True)
translate_df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,ath_homologues,stu_homologues,sly_homologues,osa_homologues,family
name,level,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
&alpha;/&beta; hydroxylase,clade,AT3G03990,,,,&alpha;/&beta; hydroxylase
&alpha;/&beta; hydroxylase,family,AT3G03990,,,,&alpha;/&beta; hydroxylase
&beta;-carotene isomerase,clade,,,,OS11G0587000,&beta;-carotene isomerase
&beta;-carotene isomerase,family,,,,OS11G0587000,&beta;-carotene isomerase
4CLL,clade,"AT1G20500,AT5G38120,AT1G20480,AT1G20510",,,,4CLL


In [12]:
file_name = output_path / "level_not_use_translation.tsv"
translate_ignore_etc_df = pd.read_csv(file_name, sep="\t", index_col=[0, 1])
translate_ignore_etc_df.fillna('', inplace=True)
translate_ignore_etc_df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,ath_homologues,stu_homologues,sly_homologues,osa_homologues,family
name,level,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
&alpha;/&beta; hydroxylase,clade,AT3G03990,,,,&alpha;/&beta; hydroxylase
&alpha;/&beta; hydroxylase,family,AT3G03990,,,,&alpha;/&beta; hydroxylase
&beta;-carotene isomerase,clade,,,,OS11G0587000,&beta;-carotene isomerase
&beta;-carotene isomerase,family,,,,OS11G0587000,&beta;-carotene isomerase
4CLL,clade,"AT1G20500,AT5G38120,AT1G20480,AT1G20490,AT4G05...",,,,4CLL


## Read in parsed reactions

In [13]:
all_species = list(df_bioelements['species'].unique())
try: all_species.remove('plant_all')
except ValueError: pass
try: all_species.remove('all')
except ValueError: pass    
all_species

['ath', 'stu', 'sly', 'osa']

In [14]:
df_edges = pd.read_csv("parsed_reactions.tsv", sep="\t", index_col=0)
df_edges.head()

Unnamed: 0,Status,AddedBy,ConnID,Species,input1_ID,input1_level,input1_localisation,input1_type,input2_ID,input2_level,...,input1_form,input2_form,input3_form,output1_form,species,input1_location,input2_location,input3_location,output1_location,reaction_type
0,forCB,KG,Conn001,ath,L-Met,family,ER,metabolite,SAMS,clade,...,metabolite,protein_active,,metabolite,ath,endoplasmic reticulum,endoplasmic reticulum,putative:cytoplasm,endoplasmic reticulum,catalysis
1,forCB,KG,Conn002,ath,SAMe,family,ER,metabolite,ACS,family,...,metabolite,protein_active,,metabolite,ath,endoplasmic reticulum,endoplasmic reticulum,putative:cytoplasm,endoplasmic reticulum,catalysis
2,forCB,KG,Conn003,ath,ACC,family,ER,metabolite,ACO,family,...,metabolite,protein_active,,metabolite,ath,endoplasmic reticulum,endoplasmic reticulum,putative:cytoplasm,endoplasmic reticulum,catalysis
3,forCB,KG,Conn004,ath,Cu2+,family,cytoplasm,metabolite,HMA,family,...,metabolite,protein,,metabolite,ath,cytoplasm,endoplasmic reticulum,putative:cytoplasm,endoplasmic reticulum,translocation
4,forCB,KG,Conn005,ath,ETR,family,ER,protein,Cu2+,family,...,protein,metabolite,,protein_active,ath,endoplasmic reticulum,endoplasmic reticulum,putative:cytoplasm,endoplasmic reticulum,protein activation


In [15]:
len(df_edges['ConnID'].unique()) == df_edges.shape[0]

True

In [16]:
df_edges['reaction_id'] = df_edges.index.map(lambda x: f"rx{x+1:05}")
df_edges['reaction_id'].head()

0    rx00001
1    rx00002
2    rx00003
3    rx00004
4    rx00005
Name: reaction_id, dtype: object

In [17]:
node_dict = {}
for label in helpers.node_labels:
    if label == "Metabolite":
        q = '''MATCH (n:%s) WHERE NOT n:MetaboliteFamily RETURN DISTINCT n.name'''%label
    else:
        q = '''MATCH (n:%s) RETURN DISTINCT n.name'''%label
    s = set([d['n.name'] for d in graph.run(q).data()])
    print(label, len(s))
    node_dict[label] = s

FunctionalCluster 0
PlantCoding 162
PlantNonCoding 9
PlantAbstract 7
ForeignEntity 3
ForeignCoding 14
ForeignNonCoding 0
ForeignAbstract 0
Complex 3
Process 6
MetaboliteFamily 6
Metabolite 110
Reaction 0


In [18]:
translate_df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,ath_homologues,stu_homologues,sly_homologues,osa_homologues,family
name,level,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
&alpha;/&beta; hydroxylase,clade,AT3G03990,,,,&alpha;/&beta; hydroxylase
&alpha;/&beta; hydroxylase,family,AT3G03990,,,,&alpha;/&beta; hydroxylase
&beta;-carotene isomerase,clade,,,,OS11G0587000,&beta;-carotene isomerase
&beta;-carotene isomerase,family,,,,OS11G0587000,&beta;-carotene isomerase
4CLL,clade,"AT1G20500,AT5G38120,AT1G20480,AT1G20510",,,,4CLL


In [19]:
translate_ignore_etc_df.loc[('Rx1', 'node')]

ath_homologues       Rx1
stu_homologues          
sly_homologues          
osa_homologues          
family            R-gene
Name: (Rx1, node), dtype: object

In [20]:
def get_label(x):
    # pathogen proteins are listed as proteins, so cannot use dict
    id_, form_, level_, ConnID, origin = x.values
      
    if id_ in helpers.empty_strings:
        return np.nan

    new_label = None
    
    ########################
    # Simple Cases
    ########################
    if form_ in ['complex', 'complex_active']:
        if not (id_ in node_dict["Complex"]):
            complexes_to_add.append(id_)
        new_label = 'Complex'
    
    elif form_ in ["metabolite"]:
        for label in ["Metabolite", "MetaboliteFamily"]:
            if id_ in node_dict[label]:
                new_label = label
                break
        if not new_label:
            missing_in_components.update([id_])        
            print('ERROR', origin, "|", ConnID, "|", id_, "|", form_, "|", level_, " |a (label) not a listed Metabolite/MetaboliteFamily")

    elif form_ in ['process', 'process_active']:
        if (id_ in node_dict["Process"]):
            new_label = "Process"
        else:
            print('ERROR', origin, "|", ConnID, "|", id_, "|", form_, "|", level_,  " |b (label) process not a listed process")
    
    else:
        # somethng plant, something foreign
        family_id = get_family_id(id_, level_, translate_df)
        
        if not family_id:
            family_id = get_family_id(id_, level_, translate_ignore_etc_df)

                
        if (family_id):
            id_labels = [] #looping just in case an id occurs mutiple times
            for label in helpers.plant_node_labels:
                if family_id in node_dict[label]:
                    id_labels.append(label)

            if len(id_labels) == 1:
                new_label = id_labels[0]
            elif len(id_labels) > 1:
                print('ERROR', origin, "|", ConnID, "|", id_, "|", form_, "|", level_,  " |d (label) many labels fit")
                missing_in_components.update([id_])       

            else:
                print('ERROR', origin, "|", ConnID, "|", id_, "|", form_, "|", level_,  " |e (label) could not find label")
                missing_in_components.update([id_]) 
        else:
            # check if foreign
            for label in helpers.foreign_node_labels:
                if id_ in node_dict[label]:
                    new_label = label
                    break
        
    if not new_label:
        print('ERROR', origin, "|", ConnID, "|", id_, "|", form_, "|", level_,  " |c (family id) could not convert to label")
        
    return new_label


    
def get_family_id(id_, level_, df):
    family_id = None
    if id_ in repeated_node_level:
        print('INFO', id_, level_, 'node level has multiple identifiers')
    if level_ == "family":    
        try:
            family_id = df.loc[(id_, 'family')]['family']
        except KeyError:
            pass
    elif level_ in ["clade", "clade/orthologue"]:
        try:
            family_id = df.loc[(id_, 'clade')]['family']
        except KeyError:
            pass
    elif level_ == "node":
        try:
            family_id = df.loc[(id_, 'node')]['family']
        except KeyError:
            pass

    return family_id

def apply_get_family_id(x):
    id_, level_, label_, ConnID, origin = x.values
      
    if id_ in helpers.empty_strings:
        return np.nan
    
    family_id = np.nan
    if label_ in helpers.plant_node_labels:
        family_id = get_family_id(id_, level_)
    
    return family_id


In [21]:
complexes_to_add = []
missing_in_components = set()
replace_w_family = set()

for prefix in ['input1', 'input2', 'input3', 'output1']:
    print(prefix)
    id_col, form_col, level_col  =\
            [prefix + x for x in ('_ID',  '_form',  '_level')]

    label_col = prefix + '_label'
    family_col = prefix + '_family'    
    
    df_edges[label_col] = df_edges[[id_col, form_col, level_col, 'ConnID', 'origin']].apply(get_label, axis=1, result_type='expand')
    #df_edges[family_col] = df_edges[[id_col, level_col, label_col, 'ConnID', 'origin']].apply(apply_get_family_id, axis=1, result_type='expand')


input1
input2
INFO CPS node node level has multiple identifiers
INFO CPS node node level has multiple identifiers
INFO GA20ox3 node node level has multiple identifiers
INFO GA20ox3 node node level has multiple identifiers
INFO GA20ox1 node node level has multiple identifiers
INFO GA20ox4 node node level has multiple identifiers
input3
output1
INFO GA20ox3 node node level has multiple identifiers
INFO GA20ox1 node node level has multiple identifiers
INFO GA20ox4 node node level has multiple identifiers


## Functional groups
Node --> identifiers

Clade --> identifiers

Family --> identifiers

In [22]:
helpers.plant_node_labels

['PlantCoding', 'PlantNonCoding', 'PlantAbstract']

In [23]:
def create_entry():
    d = {'level':'', 'members':'', 'species':'', "family":''}      
    for c in [f"{specie}_homologues" for specie in all_species]:
        d[c] = '-'

    return d

def list_of_string_list_to_list(x, delim=","):
    '''e.g: ['AT5G26130,AT1G50060,AT2G14610,AT2G14580', '', '', '']
    to: ['AT4G33710', 'AT4G33720', 'AT5G26130', 'AT1G50060', 'AT2G14610', 'AT2G14580']
    '''
    new_list = []
    for sub_s in x:
        if not sub_s in helpers.empty_strings:
            for s in sub_s.split(delim):
                if not s in helpers.empty_strings:
                    new_list.append(s)
    new_list = sorted(new_list)
    return new_list

print("column\tname\trxid\tlevel\tspecies(as in excel)\tspecies(for identifiers)")
reaction_players = defaultdict(lambda : defaultdict(dict))  #lambda : defaultdict(create_entry))
for prefix in ['input1', 'input2', 'input3', 'output1']:
    id_col, from_col, level_col, new_id, label_col  =\
            [prefix + x for x in ('_ID',  '_form',  '_level',  '_newID', '_label')]    
    homologue_cols = [f"{prefix}_{specie}_homologues" for specie in all_species]
    
    this_players = df_edges[['reaction_id', 'species', id_col, level_col, label_col]]# + homologue_cols]
    
    for _, row in this_players.iterrows():
        if row[label_col] in helpers.plant_node_labels:
            name = row[id_col]
            level = row[level_col]
            
            members = []
            
            reaction_species = row['species']
            
            if reaction_species == 'all':
                tmp_reaction_species = all_species
            else:
                tmp_reaction_species = row['species'].split(',')
            
            try:
                r = translate_df.loc[(name, level)]
                no_members = False
            except KeyError:
                r = translate_ignore_etc_df.loc[(name, level)]
                no_members = True
                #print(name, '- no members')
            

            was_in_species = []
            for specie in all_species:
                c = f"{specie}_homologues"
                if (not no_members) and (specie in tmp_reaction_species):
                    reaction_players[row[id_col]][row['reaction_id']][c] = r[c]
                    members.append(r[c])
                    was_in_species.append(specie)
                else:
                    reaction_players[row[id_col]][row['reaction_id']][c] = ''
            
            members = list_of_string_list_to_list(members)
            
            if ( not set(was_in_species) == set(tmp_reaction_species) ) or ( len(members) == 0 ):
                print(f"{id_col}\t{row[id_col]}\t{row['reaction_id']}\t{level}\t{reaction_species}\t{tmp_reaction_species}\t{members}")
                
            reaction_players[row[id_col]][row['reaction_id']]['level'] =  level
            reaction_players[row[id_col]][row['reaction_id']]['family'] =  r['family']
            reaction_players[row[id_col]][row['reaction_id']]['members'] =  members
            reaction_players[row[id_col]][row['reaction_id']]['species'] =  reaction_species
        #else:
            #print('---', id_col, row[label_col], row['reaction_id'])
reaction_players = dict(reaction_players)

column	name	rxid	level	species(as in excel)	species(for identifiers)
input1_ID	COI1	rx00045	node	ath,nbe	['ath', 'nbe']	['AT2G39940']
input1_ID	Rx1	rx00057	node	ath,stu	['ath', 'stu']	[]
input1_ID	Rx2	rx00058	node	ath,stu	['ath', 'stu']	[]
input1_ID	GPAphid2	rx00059	node	ath,stu	['ath', 'stu']	[]
input1_ID	R-gene	rx00108	family	ath	['ath']	[]
input1_ID	R-gene	rx00109	family	ath	['ath']	[]
input1_ID	AOX	rx00133	family	ath,nbe	['ath', 'nbe']	[]
input1_ID	R-gene	rx00168	family	ath	['ath']	[]
input1_ID	RBCS-3B	rx00232	node	ath	['ath']	[]
input1_ID	RBCS-2B	rx00233	node	ath	['ath']	[]
input1_ID	NAC055	rx00255	node	ath	['ath']	[]
input1_ID	NAC072	rx00256	node	ath	['ath']	[]
input1_ID	NAC055	rx00258	node	ath	['ath']	[]
input1_ID	NAC072	rx00259	node	ath	['ath']	[]
input1_ID	GPX	rx00262	family	all	['ath', 'stu', 'sly', 'osa']	[]
input2_ID	OMR1	rx00020	node	ath,nbe	['ath', 'nbe']	['AT3G10050']
input2_ID	X4	rx00039	node	ath	['ath']	[]
input2_ID	OMR1	rx00045	node	ath,nbe	['ath', 'nbe']	['AT3G10050'

In [24]:
with open("FunctionalClusters_first-definition.tsv", "w") as out:
    c = '\t'.join([f'{specie}_homologues' for specie in all_species])
    out.write(f"node_name\tfamily\treaction_id\tlevel\tspecies\tmembers\t{c}\n")
    for node in list(reaction_players):
        for reaction in list(reaction_players[node]):
            members = reaction_players[node][reaction]['members']
            c =  '\t'.join([reaction_players[node][reaction][f'{specie}_homologues'] for specie in all_species])
            out.write(f"{node}\t{reaction_players[node][reaction]['family']}\t{reaction}\t{reaction_players[node][reaction]['level']}\t{reaction_players[node][reaction]['species']}\t{','.join(members)}\t{c}\n")

In [25]:
!head FunctionalClusters_first-definition.tsv

node_name	family	reaction_id	level	species	members	ath_homologues	stu_homologues	sly_homologues	osa_homologues
ETR	ETR	rx00005	family	ath	AT1G04310,AT1G66340,AT2G40940,AT3G04580,AT3G23150	AT2G40940,AT3G04580,AT3G23150,AT1G04310,AT1G66340			
ETR	ETR	rx00006	family	ath	AT1G04310,AT1G66340,AT2G40940,AT3G04580,AT3G23150	AT2G40940,AT3G04580,AT3G23150,AT1G04310,AT1G66340			
ETR	ETR	rx00008	family	ath	AT1G04310,AT1G66340,AT2G40940,AT3G04580,AT3G23150	AT2G40940,AT3G04580,AT3G23150,AT1G04310,AT1G66340			
CTR	CTR	rx00006	family	ath	AT5G03730	AT5G03730			
EIN2	EIN2	rx00009	node	ath	AT5G03280	AT5G03280			
EIN2	EIN2	rx00210	node	ath	AT5G03280	AT5G03280			
EIN2	EIN2	rx00010	node	ath	AT5G03280	AT5G03280			
EIN2	EIN2	rx00015	node	ath	AT5G03280	AT5G03280			
ETP	ETP	rx00011	family	ath	AT3G18910,AT3G18980	AT3G18910,AT3G18980			


In [26]:
functional_clusters_df = pd.read_csv("FunctionalClusters_first-definition.tsv", sep="\t")
functional_clusters_df = functional_clusters_df.fillna('')

In [27]:
functional_clusters_df[functional_clusters_df['level']=='node']

Unnamed: 0,node_name,family,reaction_id,level,species,members,ath_homologues,stu_homologues,sly_homologues,osa_homologues
4,EIN2,EIN2,rx00009,node,ath,AT5G03280,AT5G03280,,,
5,EIN2,EIN2,rx00210,node,ath,AT5G03280,AT5G03280,,,
6,EIN2,EIN2,rx00010,node,ath,AT5G03280,AT5G03280,,,
7,EIN2,EIN2,rx00015,node,ath,AT5G03280,AT5G03280,,,
9,EIN5,XRN,rx00012,node,ath,AT1G54490,AT1G54490,,,
...,...,...,...,...,...,...,...,...,...,...
462,MAX2,F-box/LRR-repeat,rx00319,node,"ath,osa",AT2G42620,AT2G42620,,,
463,D53,Class I Clp ATPase,rx00320,node,"ath,osa",OS11G0104300,,,,OS11G0104300
518,RTM3,TRAF,rx00060,node,ath,AT3G58350,AT3G58350,,,
521,NAC032,NAC,rx00157,node,ath,AT1G77450,AT1G77450,,,


In [28]:
print(functional_clusters_df.shape[0])
functional_clusters_df.head()

523


Unnamed: 0,node_name,family,reaction_id,level,species,members,ath_homologues,stu_homologues,sly_homologues,osa_homologues
0,ETR,ETR,rx00005,family,ath,"AT1G04310,AT1G66340,AT2G40940,AT3G04580,AT3G23150","AT2G40940,AT3G04580,AT3G23150,AT1G04310,AT1G66340",,,
1,ETR,ETR,rx00006,family,ath,"AT1G04310,AT1G66340,AT2G40940,AT3G04580,AT3G23150","AT2G40940,AT3G04580,AT3G23150,AT1G04310,AT1G66340",,,
2,ETR,ETR,rx00008,family,ath,"AT1G04310,AT1G66340,AT2G40940,AT3G04580,AT3G23150","AT2G40940,AT3G04580,AT3G23150,AT1G04310,AT1G66340",,,
3,CTR,CTR,rx00006,family,ath,AT5G03730,AT5G03730,,,
4,EIN2,EIN2,rx00009,node,ath,AT5G03280,AT5G03280,,,


In [29]:
print(functional_clusters_df.shape[0])
functional_clusters_df.head()

523


Unnamed: 0,node_name,family,reaction_id,level,species,members,ath_homologues,stu_homologues,sly_homologues,osa_homologues
0,ETR,ETR,rx00005,family,ath,"AT1G04310,AT1G66340,AT2G40940,AT3G04580,AT3G23150","AT2G40940,AT3G04580,AT3G23150,AT1G04310,AT1G66340",,,
1,ETR,ETR,rx00006,family,ath,"AT1G04310,AT1G66340,AT2G40940,AT3G04580,AT3G23150","AT2G40940,AT3G04580,AT3G23150,AT1G04310,AT1G66340",,,
2,ETR,ETR,rx00008,family,ath,"AT1G04310,AT1G66340,AT2G40940,AT3G04580,AT3G23150","AT2G40940,AT3G04580,AT3G23150,AT1G04310,AT1G66340",,,
3,CTR,CTR,rx00006,family,ath,AT5G03730,AT5G03730,,,
4,EIN2,EIN2,rx00009,node,ath,AT5G03280,AT5G03280,,,


In [30]:
functional_clusters_df.sort_values('node_name', inplace=True)
functional_clusters_df = functional_clusters_df.drop_duplicates(keep='first', subset=['node_name', 'members', 'level'])

In [31]:
functional_clusters_df[functional_clusters_df['node_name']=='CPS']

Unnamed: 0,node_name,family,reaction_id,level,species,members,ath_homologues,stu_homologues,sly_homologues,osa_homologues
424,CPS,CPS,rx00275,node,stu,SOTUB06G034690.1.1,,SOTUB06G034690.1.1,,
423,CPS,CPS,rx00264,node,ath,AT4G02780,AT4G02780,,,


In [32]:
x = functional_clusters_df[['node_name']].value_counts()
x = x[x>1].index[:]
x = [z[0] for z in x]
functional_clusters_df[functional_clusters_df['node_name'].isin(x)]

Unnamed: 0,node_name,family,reaction_id,level,species,members,ath_homologues,stu_homologues,sly_homologues,osa_homologues
45,COI1,COI1,rx00196,family,ath,AT2G39940,AT2G39940,,,
43,COI1,COI1,rx00045,node,"ath,nbe",AT2G39940,AT2G39940,,,
424,CPS,CPS,rx00275,node,stu,SOTUB06G034690.1.1,,SOTUB06G034690.1.1,,
423,CPS,CPS,rx00264,node,ath,AT4G02780,AT4G02780,,,
333,EDS5,EDS5,rx00116,family,ath,AT4G39030,AT4G39030,,,
330,EDS5,EDS5,rx00070,node,ath,AT4G39030,AT4G39030,,,
433,GA3ox,GA3ox,rx00282,clade,stu,SOTUB06G023360.1.1,,SOTUB06G023360.1.1,,
432,GA3ox,GA3ox,rx00272,family,ath,"AT1G15550,AT1G80330,AT1G80340,AT4G21690","AT1G80340,AT1G80330,AT1G15550,AT4G21690",,,
269,GID1,GID,rx00288,node,osa,LOC_OS05G33730,,,,LOC_OS05G33730
266,GID1,GID,rx00285,clade,ath,"AT3G05120,AT3G63010,AT5G27320","AT3G63010,AT3G05120,AT5G27320",,,


In [33]:
functional_clusters_df[functional_clusters_df['members'] == '']

Unnamed: 0,node_name,family,reaction_id,level,species,members,ath_homologues,stu_homologues,sly_homologues,osa_homologues
456,ACX2,ACX,rx00312,node,ath,,,,,
135,AOX,AOX,rx00133,family,"ath,nbe",,,,,
341,BA2H,BA2H,rx00078,family,ath,,,,,
53,GPAphid2,R-gene,rx00059,node,"ath,stu",,,,,
254,GPX,GPX,rx00262,family,all,,,,,
334,IPL,IPL,rx00071,family,ath,,,,,
453,MYB33,MYB,rx00308,node,stu,,,,,
245,NAC055,NAC,rx00255,node,ath,,,,,
250,NAC072,NAC,rx00253,node,ath,,,,,
114,R-gene,R-gene,rx00108,family,ath,,,,,


In [34]:
functional_clusters_df.loc[:, "functional_cluster_name"] = functional_clusters_df.apply(lambda row: f"{row['node_name']}[{row['members']}]", axis=1)
functional_clusters_df.head()

Unnamed: 0,node_name,family,reaction_id,level,species,members,ath_homologues,stu_homologues,sly_homologues,osa_homologues,functional_cluster_name
340,AAO,AAO,rx00081,family,ath,"AT1G04580,AT2G27150,AT5G20960","AT2G27150,AT5G20960,AT1G04580",,,,"AAO[AT1G04580,AT2G27150,AT5G20960]"
318,ACH,ACH,rx00038,family,ath,"AT2G30720,AT5G48370","AT2G30720,AT5G48370",,,,"ACH[AT2G30720,AT5G48370]"
301,ACO,ACO,rx00003,family,ath,"AT1G05010,AT1G12010,AT1G62380,AT1G77330,AT2G19590","AT1G62380,AT1G12010,AT2G19590,AT1G05010,AT1G77330",,,,"ACO[AT1G05010,AT1G12010,AT1G62380,AT1G77330,AT..."
300,ACS,ACS,rx00002,family,ath,"AT1G01480,AT2G22810,AT3G49700,AT3G61510,AT4G08...","AT4G26200,AT4G11280,AT3G49700,AT3G61510,AT4G37...",,,,"ACS[AT1G01480,AT2G22810,AT3G49700,AT3G61510,AT..."
148,ACS2,ACS,rx00248,node,ath,AT1G01480,AT1G01480,,,,ACS2[AT1G01480]


In [35]:
functional_clusters_df[functional_clusters_df['members'] =='']

Unnamed: 0,node_name,family,reaction_id,level,species,members,ath_homologues,stu_homologues,sly_homologues,osa_homologues,functional_cluster_name
456,ACX2,ACX,rx00312,node,ath,,,,,,ACX2[]
135,AOX,AOX,rx00133,family,"ath,nbe",,,,,,AOX[]
341,BA2H,BA2H,rx00078,family,ath,,,,,,BA2H[]
53,GPAphid2,R-gene,rx00059,node,"ath,stu",,,,,,GPAphid2[]
254,GPX,GPX,rx00262,family,all,,,,,,GPX[]
334,IPL,IPL,rx00071,family,ath,,,,,,IPL[]
453,MYB33,MYB,rx00308,node,stu,,,,,,MYB33[]
245,NAC055,NAC,rx00255,node,ath,,,,,,NAC055[]
250,NAC072,NAC,rx00253,node,ath,,,,,,NAC072[]
114,R-gene,R-gene,rx00108,family,ath,,,,,,R-gene[]


In [36]:
functional_clusters_df.head()

Unnamed: 0,node_name,family,reaction_id,level,species,members,ath_homologues,stu_homologues,sly_homologues,osa_homologues,functional_cluster_name
340,AAO,AAO,rx00081,family,ath,"AT1G04580,AT2G27150,AT5G20960","AT2G27150,AT5G20960,AT1G04580",,,,"AAO[AT1G04580,AT2G27150,AT5G20960]"
318,ACH,ACH,rx00038,family,ath,"AT2G30720,AT5G48370","AT2G30720,AT5G48370",,,,"ACH[AT2G30720,AT5G48370]"
301,ACO,ACO,rx00003,family,ath,"AT1G05010,AT1G12010,AT1G62380,AT1G77330,AT2G19590","AT1G62380,AT1G12010,AT2G19590,AT1G05010,AT1G77330",,,,"ACO[AT1G05010,AT1G12010,AT1G62380,AT1G77330,AT..."
300,ACS,ACS,rx00002,family,ath,"AT1G01480,AT2G22810,AT3G49700,AT3G61510,AT4G08...","AT4G26200,AT4G11280,AT3G49700,AT3G61510,AT4G37...",,,,"ACS[AT1G01480,AT2G22810,AT3G49700,AT3G61510,AT..."
148,ACS2,ACS,rx00248,node,ath,AT1G01480,AT1G01480,,,,ACS2[AT1G01480]


In [37]:
functional_clusters_df[functional_clusters_df['node_name'] == 'SAGT']

Unnamed: 0,node_name,family,reaction_id,level,species,members,ath_homologues,stu_homologues,sly_homologues,osa_homologues,functional_cluster_name
349,SAGT,SAGT,rx00259,clade,ath,"AT2G43820,AT2G43840","AT2G43820,AT2G43840",,,,"SAGT[AT2G43820,AT2G43840]"
345,SAGT,SAGT,rx00085,family,ath,"AT2G43820,AT2G43840","AT2G43820,AT2G43840",,,,"SAGT[AT2G43820,AT2G43840]"


In [38]:
functional_clusters_df[functional_clusters_df['node_name'] == 'COI1']

Unnamed: 0,node_name,family,reaction_id,level,species,members,ath_homologues,stu_homologues,sly_homologues,osa_homologues,functional_cluster_name
45,COI1,COI1,rx00196,family,ath,AT2G39940,AT2G39940,,,,COI1[AT2G39940]
43,COI1,COI1,rx00045,node,"ath,nbe",AT2G39940,AT2G39940,,,,COI1[AT2G39940]


In [39]:
functional_clusters_df[functional_clusters_df['node_name'] == 'EIN2']

Unnamed: 0,node_name,family,reaction_id,level,species,members,ath_homologues,stu_homologues,sly_homologues,osa_homologues,functional_cluster_name
5,EIN2,EIN2,rx00210,node,ath,AT5G03280,AT5G03280,,,,EIN2[AT5G03280]


In [40]:
functional_clusters_df[functional_clusters_df['node_name'] == 'miR6022']

Unnamed: 0,node_name,family,reaction_id,level,species,members,ath_homologues,stu_homologues,sly_homologues,osa_homologues,functional_cluster_name
275,miR6022,miR6022,rx00308,clade,stu,miR6022-3p,,miR6022-3p,,,miR6022[miR6022-3p]


In [41]:
functional_clusters_df[functional_clusters_df['node_name'] == 'miR159a']

Unnamed: 0,node_name,family,reaction_id,level,species,members,ath_homologues,stu_homologues,sly_homologues,osa_homologues,functional_cluster_name
272,miR159a,miR159,rx00305,node,ath,miR159a,miR159a,,,,miR159a[miR159a]


In [42]:
functional_clusters_df[functional_clusters_df['node_name'] == 'X4']

Unnamed: 0,node_name,family,reaction_id,level,species,members,ath_homologues,stu_homologues,sly_homologues,osa_homologues,functional_cluster_name
319,X4,X4,rx00039,node,ath,,,,,,X4[]


In [43]:
functional_clusters_df[functional_clusters_df['reaction_id'] == 'rx00039']

Unnamed: 0,node_name,family,reaction_id,level,species,members,ath_homologues,stu_homologues,sly_homologues,osa_homologues,functional_cluster_name
319,X4,X4,rx00039,node,ath,,,,,,X4[]


## Create functional cluster nodes

In [44]:
functional_clusters_import = functional_clusters_df.drop_duplicates(keep='first', subset=['functional_cluster_name'])

In [45]:
functional_clusters_import.shape

(247, 11)

In [46]:
def clean_labels(labels):
	for x in ['Family', 'Plant', 'Foreign', 'Node']:
		if x in labels:
			labels.remove(x)
	return labels[0]

def functional_cluster_query(file_name, 
                         labels, 
                         n_name="line.functional_cluster_name"
                        ):
    
    if type(labels) == list:
        node_label = ':' + ':'.join(labels)
    else:
        node_label = ':' + labels
    
    key = {"file_name":file_name, 
           "node_label":node_label, 
           "name":n_name}

    species_str = ""
    for specie in all_species:
        species_str += f"{specie}_homologues:split(line.{specie}_homologues, ','),\n                "

    
    q = '''USING PERIODIC COMMIT 500
           LOAD CSV WITH HEADERS FROM  'file:///{file_name}' AS line FIELDTERMINATOR '\t'
           CREATE (p{node_label}   {{ 
                name:{name}, 
                                
                {species_str}

                family:line.family
   
            }})'''.format(**key, species_str=species_str)
    
    return q



def make_create_type_of_edge_query(file_name, edge_type,
                           source_label="", target_label="",
                           source_name="line.source_name", target_name="line.target_name",
                          ):

    if not source_label == "":
        source_label = ':' + source_label
        
    if not target_label == "":
        target_label = ':' + target_label
                
    key ={"file_name":file_name, "edge_type":edge_type,
          "source_label":source_label, "target_label":target_label,
          "source_name":source_name, "target_name":target_name, 
         }
    
    q = '''USING PERIODIC COMMIT 500
           LOAD CSV WITH HEADERS FROM  'file:///{file_name}' AS line FIELDTERMINATOR '\t'
           
           MATCH (source{source_label} {{ name:{source_name}}}),
                 (target{target_label} {{ name:{target_name}}})
           
           CREATE (source)-[:{edge_type}]->(target)'''.format(**key)

    return q

In [47]:
label = "FunctionalCluster"
f = f"{label}-components.tsv"
functional_clusters_import.to_csv(f"../data/import/{f}", sep="\t", index=None)


q = '''LOAD CSV WITH HEADERS FROM 'file:///{file_name}' AS line  FIELDTERMINATOR '\t'
       MATCH (parent:Family {{ name:line.family }}) RETURN parent.name AS name, labels(parent)   AS labels
    '''.format(file_name=f)
print(q)
qr = graph.run(q)

df = pd.DataFrame(qr.data())
df['labels'] = df['labels'].apply(clean_labels)
df = df.drop_duplicates(keep='first')
df = functional_clusters_import.join(df.set_index('name'), on='family')

df.head()

LOAD CSV WITH HEADERS FROM 'file:///FunctionalCluster-components.tsv' AS line  FIELDTERMINATOR '	'
       MATCH (parent:Family { name:line.family }) RETURN parent.name AS name, labels(parent)   AS labels
    


Unnamed: 0,node_name,family,reaction_id,level,species,members,ath_homologues,stu_homologues,sly_homologues,osa_homologues,functional_cluster_name,labels
340,AAO,AAO,rx00081,family,ath,"AT1G04580,AT2G27150,AT5G20960","AT2G27150,AT5G20960,AT1G04580",,,,"AAO[AT1G04580,AT2G27150,AT5G20960]",PlantCoding
318,ACH,ACH,rx00038,family,ath,"AT2G30720,AT5G48370","AT2G30720,AT5G48370",,,,"ACH[AT2G30720,AT5G48370]",PlantCoding
301,ACO,ACO,rx00003,family,ath,"AT1G05010,AT1G12010,AT1G62380,AT1G77330,AT2G19590","AT1G62380,AT1G12010,AT2G19590,AT1G05010,AT1G77330",,,,"ACO[AT1G05010,AT1G12010,AT1G62380,AT1G77330,AT...",PlantCoding
300,ACS,ACS,rx00002,family,ath,"AT1G01480,AT2G22810,AT3G49700,AT3G61510,AT4G08...","AT4G26200,AT4G11280,AT3G49700,AT3G61510,AT4G37...",,,,"ACS[AT1G01480,AT2G22810,AT3G49700,AT3G61510,AT...",PlantCoding
148,ACS2,ACS,rx00248,node,ath,AT1G01480,AT1G01480,,,,ACS2[AT1G01480],PlantCoding


In [48]:
df['labels'].value_counts()

PlantCoding       229
PlantNonCoding     10
PlantAbstract       8
Name: labels, dtype: int64

In [49]:
# save node types
label = "FunctionalCluster"
print(label, "\t", df.shape[0])

for t, subdf in df.groupby('labels'):
    subdf.to_csv(f"../data/import/{label}-{t}-components.tsv", sep="\t", index=None)

    query = functional_cluster_query(f"{label}-{t}-components.tsv", [label, t], n_name="line.functional_cluster_name")
    qr = graph.run(query)
    print(t, "\t", subdf.shape[0], qr.stats()['nodes_created'])

FunctionalCluster 	 247
PlantAbstract 	 8 8
PlantCoding 	 229 229
PlantNonCoding 	 10 10


In [50]:
functional_clusters_import

Unnamed: 0,node_name,family,reaction_id,level,species,members,ath_homologues,stu_homologues,sly_homologues,osa_homologues,functional_cluster_name
340,AAO,AAO,rx00081,family,ath,"AT1G04580,AT2G27150,AT5G20960","AT2G27150,AT5G20960,AT1G04580",,,,"AAO[AT1G04580,AT2G27150,AT5G20960]"
318,ACH,ACH,rx00038,family,ath,"AT2G30720,AT5G48370","AT2G30720,AT5G48370",,,,"ACH[AT2G30720,AT5G48370]"
301,ACO,ACO,rx00003,family,ath,"AT1G05010,AT1G12010,AT1G62380,AT1G77330,AT2G19590","AT1G62380,AT1G12010,AT2G19590,AT1G05010,AT1G77330",,,,"ACO[AT1G05010,AT1G12010,AT1G62380,AT1G77330,AT..."
300,ACS,ACS,rx00002,family,ath,"AT1G01480,AT2G22810,AT3G49700,AT3G61510,AT4G08...","AT4G26200,AT4G11280,AT3G49700,AT3G61510,AT4G37...",,,,"ACS[AT1G01480,AT2G22810,AT3G49700,AT3G61510,AT..."
148,ACS2,ACS,rx00248,node,ath,AT1G01480,AT1G01480,,,,ACS2[AT1G01480]
...,...,...,...,...,...,...,...,...,...,...,...
274,miR319a-3p,miR319,rx00307,node,stu,miR319a-3p,,miR319a-3p,,,miR319a-3p[miR319a-3p]
275,miR6022,miR6022,rx00308,clade,stu,miR6022-3p,,miR6022-3p,,,miR6022[miR6022-3p]
263,phasiRNA931,phasiRNA931,rx00282,node,stu,phasiRNA931,,phasiRNA931,,,phasiRNA931[phasiRNA931]
257,vsiRNA12986,vsiRNA12986,rx00276,node,stu,vsiRNA12986,,vsiRNA12986,,,vsiRNA12986[vsiRNA12986]


In [51]:
# component to complex edges
edge_type = 'TAKES_PART'
want_cols = ['functional_cluster_name', 'family']

f = f'{edge_type}-{label}-edges.tsv'  
functional_clusters_import.to_csv(f"../data/import/{f}", index=None, sep="\t")

query = make_create_type_of_edge_query(f, edge_type, 
                       source_label='Family', target_label=label,
                       source_name="line.family", target_name="line.functional_cluster_name",
                      )
print(query)
qr = graph.run(query)

print(label, functional_clusters_df.shape[0], qr.stats()['relationships_created'])    
if not functional_clusters_df.shape[0] == qr.stats()['relationships_created']:
    print("\tnot all edges created")

USING PERIODIC COMMIT 500
           LOAD CSV WITH HEADERS FROM  'file:///TAKES_PART-FunctionalCluster-edges.tsv' AS line FIELDTERMINATOR '	'
           
           MATCH (source:Family { name:line.family}),
                 (target:FunctionalCluster { name:line.functional_cluster_name})
           
           CREATE (source)-[:TAKES_PART]->(target)
FunctionalCluster 253 247
	not all edges created


## Replace ids in edges table

In [52]:
functional_clusters_df = pd.read_csv("FunctionalClusters_first-definition.tsv", sep="\t")
functional_clusters_df = functional_clusters_df.fillna('')
functional_clusters_df.loc[:, "functional_cluster_name"] = functional_clusters_df.apply(lambda row: f"{row['node_name']}[{row['members']}]", axis=1)

functional_clusters_df = functional_clusters_df.drop_duplicates(keep='first', subset=['node_name', 'level', 'species', 'functional_cluster_name'])

functional_clusters_translate = functional_clusters_df[['node_name', 'level', 'species', 'functional_cluster_name']].set_index(['node_name', 'level', 'species']).to_dict('index')
functional_clusters_translate.keys()

dict_keys([('ETR', 'family', 'ath'), ('CTR', 'family', 'ath'), ('EIN2', 'node', 'ath'), ('ETP', 'family', 'ath'), ('EIN5', 'node', 'ath'), ('EIN3(like)', 'family', 'ath'), ('EBF', 'family', 'ath'), ('JAZ', 'family', 'ath'), ('MYC', 'family', 'ath'), ('PR3', 'family', 'ath'), ('PR4', 'family', 'ath'), ('ERF/EDF', 'family', 'ath'), ('LOX', 'family', 'ath'), ('COI1', 'node', 'ath,nbe'), ('COI1', 'node', 'ath'), ('COI1', 'family', 'ath'), ('JAM', 'family', 'ath'), ('SGT1', 'clade', 'ath'), ('NDR1', 'node', 'ath'), ('Rx1', 'node', 'ath,stu'), ('Rx2', 'node', 'ath,stu'), ('GPAphid2', 'node', 'ath,stu'), ('RTM1', 'node', 'ath'), ('CLH', 'family', 'ath'), ('JR1', 'node', 'ath'), ('PR13', 'family', 'ath'), ('VSP', 'family', 'ath'), ('ICS', 'family', 'ath'), ('EDS1', 'node', 'ath'), ('NPR1', 'node', 'ath'), ('MPK4', 'node', 'ath'), ('MPK3', 'node', 'ath'), ('MPK6', 'node', 'ath'), ('TRX-H', 'family', 'ath'), ('NIMIN', 'family', 'ath'), ('PR1', 'family', 'ath'), ('PR1', 'clade', 'ath'), ('PR2', '

In [53]:
def get_new_id(x):
    specie, id_, level_, label_ = x.values
    
    if label_ in helpers.plant_node_labels:
        try:
            return functional_clusters_translate[(id_, level_, specie)]['functional_cluster_name']
        except KeyError:
            pass
        
        # if reaction has same node on different levels, then one is removed
        return functional_clusters_translate[(id_, 'node', specie)]['functional_cluster_name']
        print(id_, level_, label_)
    else:
        return id_



for prefix in ['input1', 'input2', 'input3', 'output1']:
    print(prefix)
    id_col, form_col, level_col, label_col, new_id =\
            [prefix + x for x in ('_ID',  '_form',  '_level', '_label', '_newID')]

    df_edges[new_id] = df_edges[['species', id_col, level_col, label_col]].apply(get_new_id, axis=1, result_type='expand')


input1
input2
input3
output1


In [54]:
df_edges.head()

Unnamed: 0,Status,AddedBy,ConnID,Species,input1_ID,input1_level,input1_localisation,input1_type,input2_ID,input2_level,...,reaction_type,reaction_id,input1_label,input2_label,input3_label,output1_label,input1_newID,input2_newID,input3_newID,output1_newID
0,forCB,KG,Conn001,ath,L-Met,family,ER,metabolite,SAMS,clade,...,catalysis,rx00001,Metabolite,PlantCoding,,Metabolite,L-Met,"SAMS[AT1G02500,AT2G36880,AT3G17390,AT4G01850]",,SAMe
1,forCB,KG,Conn002,ath,SAMe,family,ER,metabolite,ACS,family,...,catalysis,rx00002,Metabolite,PlantCoding,,Metabolite,SAMe,"ACS[AT1G01480,AT2G22810,AT3G49700,AT3G61510,AT...",,ACC
2,forCB,KG,Conn003,ath,ACC,family,ER,metabolite,ACO,family,...,catalysis,rx00003,Metabolite,PlantCoding,,Metabolite,ACC,"ACO[AT1G05010,AT1G12010,AT1G62380,AT1G77330,AT...",,ET
3,forCB,KG,Conn004,ath,Cu2+,family,cytoplasm,metabolite,HMA,family,...,translocation,rx00004,Metabolite,PlantCoding,,Metabolite,Cu2+,"HMA[AT1G63440,AT5G44790]",,Cu2+
4,forCB,KG,Conn005,ath,ETR,family,ER,protein,Cu2+,family,...,protein activation,rx00005,PlantCoding,Metabolite,,PlantCoding,"ETR[AT1G04310,AT1G66340,AT2G40940,AT3G04580,AT...",Cu2+,,"ETR[AT1G04310,AT1G66340,AT2G40940,AT3G04580,AT..."


In [55]:
len(df_edges['reaction_id'].unique()) == df_edges.shape[0]

True

In [56]:
df_edges.to_csv(output_path / "edges-sheet.tsv", sep="\t")

In [57]:
df.to_csv(output_path / "functional_clusters.tsv", sep="\t", index=False)

In [58]:
with open(output_path / "complexes_to_add.tsv", "w") as out:
    for c in complexes_to_add:
        out.write(f"{c}\n")

# END