In [1]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:80% !important; }</style>"))

# Import neo4j DB: 3/4

Code to translate v2.7.4_PIS-model.xlsx to neo4j database. 

## Setup

In [2]:
import pandas as pd
import re
import numpy as np
import os
from IPython.display import Image

from collections import defaultdict

In [3]:
import helpers

In [4]:
from importlib import reload

In [5]:
from pathlib import Path

base_path = Path("..")
input_path = base_path / "data" / "raw"
output_path = base_path / "data" / "parsed"

Connect to graph via docker-compose link. See http://localhost:7474/browser/

In [6]:
from py2neo import Graph, Node, Relationship

In [7]:
graph = Graph(host="neo4j")

## Read in Bio elements
bioelements = PlantCoding/PlantNonCoding/PlantAbstract

have family hierachy

In [8]:
file_name = output_path / "bio_elements.tsv"
df_bioelements = pd.read_csv(file_name, sep="\t")
df_bioelements.head()

Unnamed: 0,identifier,AddedBy,species,NodeLabel,NodeType,Family,Clade,NodeID,NodeName,external_links,NodeDescription,AdditionalInfo,Process,ModelV,ModelStatus,gmm_ocd,GMM_Description,GMM_ShortName,synonyms,family
0,node0278,KG,ath,PlantAbstract,plant_abstract,BA2H,BA2H,BA2H,BA2H,,BA2H,Benzoic acid 2-hydroxylase: Isolated from toba...,Hormone:SA,v1.0,ignore,,,,,BA2H
1,node0255,KG,ath,PlantAbstract,plant_abstract,IPL,IPL,IPL,IPL,,Arabidopsis contains two ICS genes but has no ...,,Hormone:SA,v1.0,ignore,,,,,IPL
2,node0695,KG,ath,PlantAbstract,plant_abstract,R-gene,GPAphid2,GPAphid2,GPAphid2,invented:unidentified,green peach aphid 2; resistance against PVX (stu),,S:Rgenes,v2.6,ignore,,,,,R-gene
3,node0692,KG,ath,PlantAbstract,plant_abstract,R-gene,HRT,HRT,HRT,invented:unidentified,HR against TCV (ath),,S:Rgenes,v2.5,ignore,,,,,R-gene
4,node0691,KG,ath,PlantAbstract,plant_abstract,R-gene,Ny,Ny,Ny,invented:unidentified,HR against PVY (stu),,S:Rgenes,v2.5,ignore,,,,,R-gene


In [9]:
#df_bioelements[df_bioelements['NodeName'].duplicated(keep=False)].sort_values('NodeName')[['identifier', 'species', 'Family', 'Clade', 'NodeID', 'NodeName']]
repeated_node_level = df_bioelements[df_bioelements['NodeName'].duplicated(keep='first')]['NodeName'].values

In [10]:
file_name = output_path / "level_translation.tsv"
translate_df = pd.read_csv(file_name, sep="\t", index_col=[0, 1])
translate_df.fillna('', inplace=True)
translate_df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,ath_homologues,stu_homologues,sly_homologues,osa_homologues,family
name,level,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
&alpha;/&beta; hydroxylase,clade,AT3G03990,,,,&alpha;/&beta; hydroxylase
&alpha;/&beta; hydroxylase,family,AT3G03990,,,,&alpha;/&beta; hydroxylase
&beta;-carotene isomerase,clade,,,,OS11G0587000,&beta;-carotene isomerase
&beta;-carotene isomerase,family,,,,OS11G0587000,&beta;-carotene isomerase
4CLL,clade,"AT1G20490,AT1G20510,AT4G05160,AT1G20500,AT1G20...",,,,4CLL


## Read in parsed reactions

In [11]:
all_species = list(df_bioelements['species'].unique())
try: all_species.remove('plant_all')
except ValueError: pass
try: all_species.remove('all')
except ValueError: pass    
all_species

['ath', 'stu', 'sly', 'osa']

In [12]:
df_edges = pd.read_csv("parsed_reactions.tsv", sep="\t", index_col=0)
df_edges.head()

Unnamed: 0,Status,AddedBy,ConnID,Species,input1_ID,input1_level,input1_localisation,input1_type,input2_ID,input2_level,...,AdditionalInfo,ModelV,origin,trust_level,external_links,input1_form,input2_form,input3_form,output1_form,species
0,forCB,KG,Conn001,ath,L-Met,family,ER,metabolite,SAMS,clade,...,SAMS catalyse L-Met to SAMe reaction.,v1.0,forCB-v2.7.8_PIS-model-Reactions,R1,aracyc:ethyl-pwy,metabolite,protein_active,,metabolite,ath
1,forCB,KG,Conn002,ath,SAMe,family,ER,metabolite,ACS,family,...,ACS catalyse ACC to SAMe reaction.,v1.0,forCB-v2.7.8_PIS-model-Reactions,R1,aracyc:ethyl-pwy,metabolite,protein_active,,metabolite,ath
2,forCB,KG,Conn003,ath,ACC,family,ER,metabolite,ACO,family,...,ACO catalyse ACC to ET reaction.,v1.0,forCB-v2.7.8_PIS-model-Reactions,R1,aracyc:ethyl-pwy,metabolite,protein_active,,metabolite,ath
3,forCB,KG,Conn004,ath,Cu2+,family,cytoplasm,metabolite,HMA,family,...,Copper gets transported from the cytoplasm to ...,v1.0,forCB-v2.7.8_PIS-model-Reactions,R1,doi:10.1105/tpc.001768,metabolite,protein,,metabolite,ath
4,forCB,KG,Conn005,ath,ETR,family,ER,protein,Cu2+,family,...,Copper activates the membrane bound ethylene r...,v1.0,forCB-v2.7.8_PIS-model-Reactions,R1,doi:10.1105/tpc.001768,protein,metabolite,,protein_active,ath


In [13]:
len(df_edges['ConnID'].unique()) == df_edges.shape[0]

True

In [14]:
df_edges['reaction_id'] = df_edges.index.map(lambda x: f"rx{x+1:05}")
df_edges['reaction_id'].head()

0    rx00001
1    rx00002
2    rx00003
3    rx00004
4    rx00005
Name: reaction_id, dtype: object

In [15]:
node_dict = {}
for label in helpers.node_labels:
    if label == "Metabolite":
        q = '''MATCH (n:%s) WHERE NOT n:MetaboliteFamily RETURN DISTINCT n.name'''%label
    else:
        q = '''MATCH (n:%s) RETURN DISTINCT n.name'''%label
    s = set([d['n.name'] for d in graph.run(q).data()])
    print(label, len(s))
    node_dict[label] = s

FunctionalCluster 0
PlantCoding 162
PlantNonCoding 9
PlantAbstract 7
ForeignEntity 3
ForeignCoding 14
ForeignNonCoding 0
ForeignAbstract 0
Complex 3
Process 6
MetaboliteFamily 6
Metabolite 110
Reaction 0


In [16]:
def get_label(x):
    # pathogen proteins are listed as proteins, so cannot use dict
    id_, form_, level_, ConnID, origin = x.values
      
    if id_ in helpers.empty_strings:
        return np.nan

    new_label = None
    
    ########################
    # Simple Cases
    ########################
    if form_ in ['complex', 'complex_active']:
        if not (id_ in node_dict["Complex"]):
            complexes_to_add.append(id_)
        new_label = 'Complex'
    
    elif form_ in ["metabolite"]:
        for label in ["Metabolite", "MetaboliteFamily"]:
            if id_ in node_dict[label]:
                new_label = label
                break
        if not new_label:
            missing_in_components.update([id_])        
            print(origin, "|", ConnID, "|", id_, "|", form_, "|", level_, " |a (label) not a listed Metabolite/MetaboliteFamily")

    elif form_ in ['process', 'process_active']:
        if (id_ in node_dict["Process"]):
            new_label = "Process"
        else:
            print(origin, "|", ConnID, "|", id_, "|", form_, "|", level_,  " |b (label) process not a listed process")
    
    else:
        # somethng plant, something foreign
        family_id = get_family_id(id_, level_)
                
        if (family_id):
            id_labels = [] #looping just in case an id occurs mutiple times
            for label in helpers.plant_node_labels:
                if family_id in node_dict[label]:
                    id_labels.append(label)

            if len(id_labels) == 1:
                new_label = id_labels[0]
            elif len(id_labels) > 1:
                print(origin, "|", ConnID, "|", id_, "|", form_, "|", level_,  " |d (label) many labels fit")
                missing_in_components.update([id_])       

            else:
                print(origin, "|", ConnID, "|", id_, "|", form_, "|", level_,  " |e (label) could not find label")
                missing_in_components.update([id_]) 
        else:
            # check if foreign
            for label in helpers.foreign_node_labels:
                if id_ in node_dict[label]:
                    new_label = label
                    break
        
    if not new_label:
        print(origin, "|", ConnID, "|", id_, "|", form_, "|", level_,  " |c (family id) could not convert to label")
        
    return new_label


    
def get_family_id(id_, level_):
    family_id = None
    if id_ in repeated_node_level:
        print(id_, level_, 'node level has multiple identifiers')
    if level_ == "family":    
        try:
            family_id = translate_df.loc[(id_, 'family')]['family']
        except KeyError:
            pass
    elif level_ in ["clade", "clade/orthologue"]:
        try:
            family_id = translate_df.loc[(id_, 'clade')]['family']
        except KeyError:
            pass
    elif level_ == "node":
        try:
            family_id = translate_df.loc[(id_, 'node')]['family']
        except KeyError:
            pass

    return family_id

def apply_get_family_id(x):
    id_, level_, label_, ConnID, origin = x.values
      
    if id_ in helpers.empty_strings:
        return np.nan
    
    family_id = np.nan
    if label_ in helpers.plant_node_labels:
        family_id = get_family_id(id_, level_)
    
    return family_id


In [17]:
complexes_to_add = []
missing_in_components = set()
replace_w_family = set()

for prefix in ['input1', 'input2', 'input3', 'output1']:
    print(prefix)
    id_col, form_col, level_col  =\
            [prefix + x for x in ('_ID',  '_form',  '_level')]

    label_col = prefix + '_label'
    family_col = prefix + '_family'    
    
    df_edges[label_col] = df_edges[[id_col, form_col, level_col, 'ConnID', 'origin']].apply(get_label, axis=1, result_type='expand')
    df_edges[family_col] = df_edges[[id_col, level_col, label_col, 'ConnID', 'origin']].apply(apply_get_family_id, axis=1, result_type='expand')


input1
PR1 family node level has multiple identifiers
PR1 family node level has multiple identifiers
input2
PR1 family node level has multiple identifiers
PR1 family node level has multiple identifiers
CPS node node level has multiple identifiers
CPS node node level has multiple identifiers
GA20ox3 node node level has multiple identifiers
GA20ox3 node node level has multiple identifiers
GA20ox1 node node level has multiple identifiers
GA20ox4 node node level has multiple identifiers
PR1 clade node level has multiple identifiers
PR1 family node level has multiple identifiers
PR1 family node level has multiple identifiers
CPS node node level has multiple identifiers
CPS node node level has multiple identifiers
GA20ox3 node node level has multiple identifiers
GA20ox3 node node level has multiple identifiers
GA20ox1 node node level has multiple identifiers
GA20ox4 node node level has multiple identifiers
PR1 clade node level has multiple identifiers
input3
output1
PR1 family node level has

## Functional groups
Node --> identifiers

Clade --> identifiers

Family --> identifiers

In [18]:
def create_entry():
    d = {'level':'-', 'members':'-'}      
    for c in [f"{specie}_homologues" for specie in all_species]:
        d[c] = '-'

    return d

def list_of_string_list_to_list(x, delim=","):
    '''e.g: ['AT5G26130,AT1G50060,AT2G14610,AT2G14580', '', '', '']
    to: ['AT4G33710', 'AT4G33720', 'AT5G26130', 'AT1G50060', 'AT2G14610', 'AT2G14580']
    '''
    new_list = []
    for sub_s in x:
        if not sub_s in helpers.empty_strings:
            for s in sub_s.split(delim):
                if not s in helpers.empty_strings:
                    new_list.append(s)
    new_list = sorted(new_list)
    return new_list

reaction_players = defaultdict(lambda : defaultdict(create_entry))
for prefix in ['input1', 'input2', 'input3', 'output1']:
    id_col, from_col, level_col, new_id, label_col  =\
            [prefix + x for x in ('_ID',  '_form',  '_level',  '_newID', '_label')]    
    homologue_cols = [f"{prefix}_{specie}_homologues" for specie in all_species]
    
    this_players = df_edges[['reaction_id', id_col, level_col, label_col]]# + homologue_cols]
    
    for _, row in this_players.iterrows():
        if row[label_col] in helpers.plant_node_labels:
            name = row[id_col]
            level = row[level_col]
            members = []
            
            r = translate_df.loc[(name, level)]
            for specie in all_species:
                c = f"{specie}_homologues"
                members.append(r[c])
                reaction_players[row[id_col]][row['reaction_id']][c] = r[c]
            
            reaction_players[row[id_col]][row['reaction_id']]['level'] =  level
            reaction_players[row[id_col]][row['reaction_id']]['family'] =  r['family']
            reaction_players[row[id_col]][row['reaction_id']]['members'] =  list_of_string_list_to_list(members)

In [19]:
df_edges[df_edges['input2_ID'] == 'EIN2'].values

array([['forCB', 'KG', 'Conn010', 'ath', 'CTR|ETR', 'family', 'ER',
        'complex', 'EIN2', 'family', 'ER', 'protein [active]', nan, nan,
        nan, nan, 'inhibition', 'protein deactivation',
        'phosphorylation', 'EIN2', 'node', 'ER', 'protein',
        '[R1] targetted experiments (e.g. Y2H, BIFC)',
        'DOI:10.1073/pnas.0605528103 (ETHYLENE-INSENSITIVE5 encodes...) | DOI:10.1073/pnas.1214848109 (CTR1 phosphorylates the central regulator EIN2...) | DOI:10.1093/mp/ssq036 (New insight in ethylene signaling...) | DOI:10.1016/j.pbi.2008.06.011 (Ethylene signaling...) | DOI:10.1093/mp/ssr042 (Paradigms and paradox in the ethylene signaling...) | DOI:10.1042/BJ20091102 (EIN2, the central regulator of ethylene signalling...)',
        'ETR/CTR domain keeps EIN2 phosphorylated = inactive. Modelled as ETR(a)/CTR(a) catalysing the phosphorylation reaction that turns EIN2 inactive.',
        'v1.0', 'forCB-v2.7.8_PIS-model-Reactions', 'R1',
        'doi:10.1073/pnas.1214848109,doi:

In [20]:
with open("FunctionalClusters_first-definition.tsv", "w") as out:
    c = '\t'.join([f'{specie}_homologues' for specie in all_species])
    out.write(f"node_name\tfamily\treaction_id\tlevel\tmembers\t{c}\n")
    for node in list(reaction_players):
        for reaction in list(reaction_players[node]):
            c =  '\t'.join([reaction_players[node][reaction][f'{specie}_homologues'] for specie in all_species])
            out.write(f"{node}\t{reaction_players[node][reaction]['family']}\t{reaction}\t{reaction_players[node][reaction]['level']}\t{','.join(reaction_players[node][reaction]['members'])}\t{c}\n")

In [21]:
!head FunctionalClusters_first-definition.tsv

node_name	family	reaction_id	level	members	ath_homologues	stu_homologues	sly_homologues	osa_homologues
ETR	ETR	rx00005	family	AT1G04310,AT1G66340,AT2G40940,AT3G04580,AT3G23150	AT3G04580,AT1G04310,AT3G23150,AT1G66340,AT2G40940			
ETR	ETR	rx00006	family	AT1G04310,AT1G66340,AT2G40940,AT3G04580,AT3G23150	AT3G04580,AT1G04310,AT3G23150,AT1G66340,AT2G40940			
ETR	ETR	rx00008	family	AT1G04310,AT1G66340,AT2G40940,AT3G04580,AT3G23150	AT3G04580,AT1G04310,AT3G23150,AT1G66340,AT2G40940			
CTR	CTR	rx00006	family	AT4G24480,AT5G03730	AT5G03730,AT4G24480			
EIN2	EIN2	rx00009	node	AT5G03280	AT5G03280			
EIN2	EIN2	rx00210	node	AT5G03280	AT5G03280			
EIN2	EIN2	rx00010	node	AT5G03280	AT5G03280			
EIN2	EIN2	rx00015	node	AT5G03280	AT5G03280			
ETP	ETP	rx00011	family	AT2G04920,AT2G07140,AT2G24510,AT2G27520,AT3G17265,AT3G17280,AT3G17480,AT3G17490,AT3G17500,AT3G17530,AT3G17540,AT3G17560,AT3G17570,AT3G18320,AT3G18330,AT3G18900,AT3G18910,AT3G18980,AT3G19410,AT3G21120,AT3G21130,AT3G21170,AT3G22350,AT3G227

In [22]:
functional_clusters_df = pd.read_csv("FunctionalClusters_first-definition.tsv", sep="\t")

In [23]:
functional_clusters_df[functional_clusters_df['level']=='node']

Unnamed: 0,node_name,family,reaction_id,level,members,ath_homologues,stu_homologues,sly_homologues,osa_homologues
4,EIN2,EIN2,rx00009,node,AT5G03280,AT5G03280,,,
5,EIN2,EIN2,rx00210,node,AT5G03280,AT5G03280,,,
6,EIN2,EIN2,rx00010,node,AT5G03280,AT5G03280,,,
7,EIN2,EIN2,rx00015,node,AT5G03280,AT5G03280,,,
9,EIN5,XRN,rx00012,node,AT1G54490,AT1G54490,,,
...,...,...,...,...,...,...,...,...,...
462,MAX2,F-box/LRR-repeat,rx00319,node,AT2G42620,AT2G42620,,,
463,D53,Class I Clp ATPase,rx00320,node,OS11G0104300,,,,OS11G0104300
518,RTM3,TRAF,rx00060,node,AT3G58350,AT3G58350,,,
521,NAC032,NAC,rx00157,node,AT1G77450,AT1G77450,,,


In [24]:
print(functional_clusters_df.shape[0])
functional_clusters_df.head()

523


Unnamed: 0,node_name,family,reaction_id,level,members,ath_homologues,stu_homologues,sly_homologues,osa_homologues
0,ETR,ETR,rx00005,family,"AT1G04310,AT1G66340,AT2G40940,AT3G04580,AT3G23150","AT3G04580,AT1G04310,AT3G23150,AT1G66340,AT2G40940",,,
1,ETR,ETR,rx00006,family,"AT1G04310,AT1G66340,AT2G40940,AT3G04580,AT3G23150","AT3G04580,AT1G04310,AT3G23150,AT1G66340,AT2G40940",,,
2,ETR,ETR,rx00008,family,"AT1G04310,AT1G66340,AT2G40940,AT3G04580,AT3G23150","AT3G04580,AT1G04310,AT3G23150,AT1G66340,AT2G40940",,,
3,CTR,CTR,rx00006,family,"AT4G24480,AT5G03730","AT5G03730,AT4G24480",,,
4,EIN2,EIN2,rx00009,node,AT5G03280,AT5G03280,,,


In [25]:
functional_clusters_df.sort_values('node_name', inplace=True)
functional_clusters_df = functional_clusters_df.drop_duplicates(keep='first', subset=['node_name', 'members', 'level'])

In [26]:
functional_clusters_df[functional_clusters_df['node_name']=='CPS']

Unnamed: 0,node_name,family,reaction_id,level,members,ath_homologues,stu_homologues,sly_homologues,osa_homologues
424,CPS,CPS,rx00275,node,"AT4G02780,SOTUB06G034690.1.1",AT4G02780,SOTUB06G034690.1.1,,


In [27]:
x = functional_clusters_df[['node_name']].value_counts()
x = x[x>1].index[:]
x = [z[0] for z in x]
functional_clusters_df[functional_clusters_df['node_name'].isin(x)]

Unnamed: 0,node_name,family,reaction_id,level,members,ath_homologues,stu_homologues,sly_homologues,osa_homologues
45,COI1,COI1,rx00196,family,AT2G39940,AT2G39940,,,
43,COI1,COI1,rx00045,node,AT2G39940,AT2G39940,,,
333,EDS5,EDS5,rx00116,family,"AT2G21340,AT4G39030","AT4G39030,AT2G21340",,,
330,EDS5,EDS5,rx00070,node,AT4G39030,AT4G39030,,,
433,GA3ox,GA3ox,rx00282,clade,"AT1G15550,AT1G80330,AT1G80340,AT4G21690,SOLYC0...","AT4G21690,AT1G80330,AT1G15550,AT1G80340","SOTUB03G033480.1.1,SOTUB06G023360.1.1","SOLYC03G119910.3.1,SOLYC06G066820.3.1",
432,GA3ox,GA3ox,rx00272,family,"AT1G15550,AT1G80330,AT1G80340,AT4G21690,SOLYC0...","AT4G21690,AT1G80330,AT1G15550,AT1G80340","SOTUB03G033480.1.1,SOTUB06G023360.1.1","SOLYC03G119910.3.1,SOLYC06G066820.3.1",
269,GID1,GID,rx00288,node,LOC_OS05G33730,,,,LOC_OS05G33730
266,GID1,GID,rx00285,clade,"AT3G05120,AT3G63010,AT5G27320,LOC_OS05G33730,S...","AT3G05120,AT5G27320,AT3G63010","SOTUB06G008680.1.1,SOTUB09G021750.1.1,SOTUB01G...","SOLYC01G098390.3.1,SOLYC06G008870.2.1,SOLYC09G...",LOC_OS05G33730
213,NADPH-oxidase,NADPH-oxidase,rx00179,clade,"AT1G09090,AT1G19230,AT1G64060,AT3G45810,AT4G11...","AT1G09090,AT3G45810,AT5G07390,AT5G47910,AT5G51...",,,
215,NADPH-oxidase,NADPH-oxidase,rx00132,family,"AT1G09090,AT1G19230,AT1G64060,AT3G45810,AT4G11...","AT1G09090,AT3G45810,AT5G07390,AT5G47910,AT5G51...",,,


In [28]:
functional_clusters_df.loc[:, "functional_cluster_name"] = functional_clusters_df.apply(lambda row: f"{row['node_name']}[{row['members']}]", axis=1)

In [29]:
functional_clusters_df.head()

Unnamed: 0,node_name,family,reaction_id,level,members,ath_homologues,stu_homologues,sly_homologues,osa_homologues,functional_cluster_name
340,AAO,AAO,rx00081,family,"AT1G04580,AT2G27150,AT3G43600,AT4G34890,AT4G34...","AT2G27150,AT5G20960,AT3G43600,AT4G34900,AT4G34...",,,,"AAO[AT1G04580,AT2G27150,AT3G43600,AT4G34890,AT..."
318,ACH,ACH,rx00038,family,"AT2G30720,AT5G48370","AT2G30720,AT5G48370",,,,"ACH[AT2G30720,AT5G48370]"
301,ACO,ACO,rx00003,family,"AT1G05010,AT1G12010,AT1G62380,AT1G77330,AT2G19590","AT1G05010,AT1G62380,AT1G12010,AT2G19590,AT1G77330",,,,"ACO[AT1G05010,AT1G12010,AT1G62380,AT1G77330,AT..."
300,ACS,ACS,rx00002,family,"AT1G01480,AT1G62960,AT2G22810,AT3G49700,AT3G61...","AT4G37770,AT4G26200,AT3G61510,AT3G49700,AT1G01...",,,,"ACS[AT1G01480,AT1G62960,AT2G22810,AT3G49700,AT..."
148,ACS2,ACS,rx00248,node,AT1G01480,AT1G01480,,,,ACS2[AT1G01480]


In [30]:
functional_clusters_df[functional_clusters_df['node_name'] == 'SAGT']

Unnamed: 0,node_name,family,reaction_id,level,members,ath_homologues,stu_homologues,sly_homologues,osa_homologues,functional_cluster_name
349,SAGT,SAGT,rx00259,clade,"AT2G43820,AT2G43840,AT4G31570","AT4G31570,AT2G43820,AT2G43840",,,,"SAGT[AT2G43820,AT2G43840,AT4G31570]"
345,SAGT,SAGT,rx00085,family,"AT2G43820,AT2G43840,AT4G31570","AT4G31570,AT2G43820,AT2G43840",,,,"SAGT[AT2G43820,AT2G43840,AT4G31570]"


In [31]:
functional_clusters_df[functional_clusters_df['node_name'] == 'COI1']

Unnamed: 0,node_name,family,reaction_id,level,members,ath_homologues,stu_homologues,sly_homologues,osa_homologues,functional_cluster_name
45,COI1,COI1,rx00196,family,AT2G39940,AT2G39940,,,,COI1[AT2G39940]
43,COI1,COI1,rx00045,node,AT2G39940,AT2G39940,,,,COI1[AT2G39940]


In [32]:
functional_clusters_df[functional_clusters_df['node_name'] == 'EIN2']

Unnamed: 0,node_name,family,reaction_id,level,members,ath_homologues,stu_homologues,sly_homologues,osa_homologues,functional_cluster_name
5,EIN2,EIN2,rx00210,node,AT5G03280,AT5G03280,,,,EIN2[AT5G03280]


## Create functional cluster nodes

In [33]:
functional_clusters_import = functional_clusters_df.drop_duplicates(keep='first', subset=['node_name', 'members'])

In [34]:
def clean_labels(labels):
	for x in ['Family', 'Plant', 'Foreign', 'Node']:
		if x in labels:
			labels.remove(x)
	return labels[0]

def functional_cluster_query(file_name, 
                         labels, 
                         n_name="line.functional_cluster_name"
                        ):
    
    if type(labels) == list:
        node_label = ':' + ':'.join(labels)
    else:
        node_label = ':' + labels
    
    key = {"file_name":file_name, 
           "node_label":node_label, 
           "name":n_name}

    species_str = ""
    for specie in all_species:
        species_str += f"{specie}_homologues:split(line.{specie}_homologues, ','),\n                "

    
    q = '''USING PERIODIC COMMIT 500
           LOAD CSV WITH HEADERS FROM  'file:///{file_name}' AS line FIELDTERMINATOR '\t'
           CREATE (p{node_label}   {{ 
                name:{name}, 
                                
                {species_str}

                family:line.family
   
            }})'''.format(**key, species_str=species_str)
    
    return q



def make_create_type_of_edge_query(file_name, edge_type,
                           source_label="", target_label="",
                           source_name="line.source_name", target_name="line.target_name",
                          ):

    if not source_label == "":
        source_label = ':' + source_label
        
    if not target_label == "":
        target_label = ':' + target_label
                
    key ={"file_name":file_name, "edge_type":edge_type,
          "source_label":source_label, "target_label":target_label,
          "source_name":source_name, "target_name":target_name, 
         }
    
    q = '''USING PERIODIC COMMIT 500
           LOAD CSV WITH HEADERS FROM  'file:///{file_name}' AS line FIELDTERMINATOR '\t'
           
           MATCH (source{source_label} {{ name:{source_name}}}),
                 (target{target_label} {{ name:{target_name}}})
           
           CREATE (source)-[:{edge_type}]->(target)'''.format(**key)

    return q

In [35]:
label = "FunctionalCluster"
f = f"{label}-components.tsv"
functional_clusters_import.to_csv(f"../data/import/{f}", sep="\t", index=None)


q = '''LOAD CSV WITH HEADERS FROM 'file:///{file_name}' AS line  FIELDTERMINATOR '\t'
       MATCH (parent:Family {{ name:line.family }}) RETURN parent.name AS name, labels(parent)   AS labels
    '''.format(file_name=f)
print(q)
qr = graph.run(q)

df = pd.DataFrame(qr.data())
df['labels'] = df['labels'].apply(clean_labels)
df = df.drop_duplicates(keep='first')
df = functional_clusters_import.join(df.set_index('name'), on='family')

df.head()

LOAD CSV WITH HEADERS FROM 'file:///FunctionalCluster-components.tsv' AS line  FIELDTERMINATOR '	'
       MATCH (parent:Family { name:line.family }) RETURN parent.name AS name, labels(parent)   AS labels
    


Unnamed: 0,node_name,family,reaction_id,level,members,ath_homologues,stu_homologues,sly_homologues,osa_homologues,functional_cluster_name,labels
340,AAO,AAO,rx00081,family,"AT1G04580,AT2G27150,AT3G43600,AT4G34890,AT4G34...","AT2G27150,AT5G20960,AT3G43600,AT4G34900,AT4G34...",,,,"AAO[AT1G04580,AT2G27150,AT3G43600,AT4G34890,AT...",PlantCoding
318,ACH,ACH,rx00038,family,"AT2G30720,AT5G48370","AT2G30720,AT5G48370",,,,"ACH[AT2G30720,AT5G48370]",PlantCoding
301,ACO,ACO,rx00003,family,"AT1G05010,AT1G12010,AT1G62380,AT1G77330,AT2G19590","AT1G05010,AT1G62380,AT1G12010,AT2G19590,AT1G77330",,,,"ACO[AT1G05010,AT1G12010,AT1G62380,AT1G77330,AT...",PlantCoding
300,ACS,ACS,rx00002,family,"AT1G01480,AT1G62960,AT2G22810,AT3G49700,AT3G61...","AT4G37770,AT4G26200,AT3G61510,AT3G49700,AT1G01...",,,,"ACS[AT1G01480,AT1G62960,AT2G22810,AT3G49700,AT...",PlantCoding
148,ACS2,ACS,rx00248,node,AT1G01480,AT1G01480,,,,ACS2[AT1G01480],PlantCoding


In [36]:
df['labels'].value_counts()

PlantCoding       227
PlantNonCoding     10
PlantAbstract       8
Name: labels, dtype: int64

In [37]:
# save node types
label = "FunctionalCluster"
print(label, "\t", df.shape[0])

for t, subdf in df.groupby('labels'):
    subdf.to_csv(f"../data/import/{label}-{t}-components.tsv", sep="\t", index=None)

    query = functional_cluster_query(f"{label}-{t}-components.tsv", [label, t], n_name="line.functional_cluster_name")
    qr = graph.run(query)
    print(t, "\t", subdf.shape[0], qr.stats()['nodes_created'])

FunctionalCluster 	 245
PlantAbstract 	 8 8
PlantCoding 	 227 227
PlantNonCoding 	 10 10


In [38]:
functional_clusters_import

Unnamed: 0,node_name,family,reaction_id,level,members,ath_homologues,stu_homologues,sly_homologues,osa_homologues,functional_cluster_name
340,AAO,AAO,rx00081,family,"AT1G04580,AT2G27150,AT3G43600,AT4G34890,AT4G34...","AT2G27150,AT5G20960,AT3G43600,AT4G34900,AT4G34...",,,,"AAO[AT1G04580,AT2G27150,AT3G43600,AT4G34890,AT..."
318,ACH,ACH,rx00038,family,"AT2G30720,AT5G48370","AT2G30720,AT5G48370",,,,"ACH[AT2G30720,AT5G48370]"
301,ACO,ACO,rx00003,family,"AT1G05010,AT1G12010,AT1G62380,AT1G77330,AT2G19590","AT1G05010,AT1G62380,AT1G12010,AT2G19590,AT1G77330",,,,"ACO[AT1G05010,AT1G12010,AT1G62380,AT1G77330,AT..."
300,ACS,ACS,rx00002,family,"AT1G01480,AT1G62960,AT2G22810,AT3G49700,AT3G61...","AT4G37770,AT4G26200,AT3G61510,AT3G49700,AT1G01...",,,,"ACS[AT1G01480,AT1G62960,AT2G22810,AT3G49700,AT..."
148,ACS2,ACS,rx00248,node,AT1G01480,AT1G01480,,,,ACS2[AT1G01480]
...,...,...,...,...,...,...,...,...,...,...
274,miR319a-3p,miR319,rx00307,node,miR319a-3p,,miR319a-3p,,,miR319a-3p[miR319a-3p]
275,miR6022,miR6022,rx00308,clade,miR6022-3p,,miR6022-3p,,,miR6022[miR6022-3p]
263,phasiRNA931,phasiRNA931,rx00282,node,phasiRNA931,,phasiRNA931,,,phasiRNA931[phasiRNA931]
257,vsiRNA12986,vsiRNA12986,rx00276,node,vsiRNA12986,,vsiRNA12986,,,vsiRNA12986[vsiRNA12986]


In [39]:
# component to complex edges
edge_type = 'TAKES_PART'
want_cols = ['functional_cluster_name', 'family']

f = f'{edge_type}-{label}-edges.tsv'  
functional_clusters_import.to_csv(f"../data/import/{f}", index=None, sep="\t")

query = make_create_type_of_edge_query(f, edge_type, 
                       source_label='Family', target_label=label,
                       source_name="line.family", target_name="line.functional_cluster_name",
                      )
print(query)
qr = graph.run(query)

print(label, functional_clusters_df.shape[0], qr.stats()['relationships_created'])    
if not functional_clusters_df.shape[0] == qr.stats()['relationships_created']:
    print("\tnot all edges created")

USING PERIODIC COMMIT 500
           LOAD CSV WITH HEADERS FROM  'file:///TAKES_PART-FunctionalCluster-edges.tsv' AS line FIELDTERMINATOR '	'
           
           MATCH (source:Family { name:line.family}),
                 (target:FunctionalCluster { name:line.functional_cluster_name})
           
           CREATE (source)-[:TAKES_PART]->(target)
FunctionalCluster 251 245
	not all edges created


## Replace ids in edges table

In [40]:
functional_clusters_df.head()


Unnamed: 0,node_name,family,reaction_id,level,members,ath_homologues,stu_homologues,sly_homologues,osa_homologues,functional_cluster_name
340,AAO,AAO,rx00081,family,"AT1G04580,AT2G27150,AT3G43600,AT4G34890,AT4G34...","AT2G27150,AT5G20960,AT3G43600,AT4G34900,AT4G34...",,,,"AAO[AT1G04580,AT2G27150,AT3G43600,AT4G34890,AT..."
318,ACH,ACH,rx00038,family,"AT2G30720,AT5G48370","AT2G30720,AT5G48370",,,,"ACH[AT2G30720,AT5G48370]"
301,ACO,ACO,rx00003,family,"AT1G05010,AT1G12010,AT1G62380,AT1G77330,AT2G19590","AT1G05010,AT1G62380,AT1G12010,AT2G19590,AT1G77330",,,,"ACO[AT1G05010,AT1G12010,AT1G62380,AT1G77330,AT..."
300,ACS,ACS,rx00002,family,"AT1G01480,AT1G62960,AT2G22810,AT3G49700,AT3G61...","AT4G37770,AT4G26200,AT3G61510,AT3G49700,AT1G01...",,,,"ACS[AT1G01480,AT1G62960,AT2G22810,AT3G49700,AT..."
148,ACS2,ACS,rx00248,node,AT1G01480,AT1G01480,,,,ACS2[AT1G01480]


In [41]:
functional_clusters_translate = functional_clusters_df[['node_name', 'level', 'functional_cluster_name']].set_index(['node_name', 'level']).to_dict('index')

def get_new_id(x):
    id_, level_, label_ = x.values
    
    if label_ in helpers.plant_node_labels:
        try:
            return functional_clusters_translate[(id_, level_)]['functional_cluster_name']
        except KeyError:
            pass
        
        # if reaction has same node on different levels, then one is removed
        return functional_clusters_translate[(id_, 'node')]['functional_cluster_name']
        print(id_, level_, label_)
    else:
        return id_



for prefix in ['input1', 'input2', 'input3', 'output1']:
    print(prefix)
    id_col, form_col, level_col, label_col, new_id =\
            [prefix + x for x in ('_ID',  '_form',  '_level', '_label', '_newID')]

    df_edges[new_id] = df_edges[[id_col, level_col, label_col]].apply(get_new_id, axis=1, result_type='expand')


input1
input2
input3
output1


In [42]:
df_edges.head()

Unnamed: 0,Status,AddedBy,ConnID,Species,input1_ID,input1_level,input1_localisation,input1_type,input2_ID,input2_level,...,input2_label,input2_family,input3_label,input3_family,output1_label,output1_family,input1_newID,input2_newID,input3_newID,output1_newID
0,forCB,KG,Conn001,ath,L-Met,family,ER,metabolite,SAMS,clade,...,PlantCoding,SAM,,,Metabolite,,L-Met,"SAMS[AT1G02500,AT2G36880,AT3G17390,AT4G01850]",,SAMe
1,forCB,KG,Conn002,ath,SAMe,family,ER,metabolite,ACS,family,...,PlantCoding,ACS,,,Metabolite,,SAMe,"ACS[AT1G01480,AT1G62960,AT2G22810,AT3G49700,AT...",,ACC
2,forCB,KG,Conn003,ath,ACC,family,ER,metabolite,ACO,family,...,PlantCoding,ACO,,,Metabolite,,ACC,"ACO[AT1G05010,AT1G12010,AT1G62380,AT1G77330,AT...",,ET
3,forCB,KG,Conn004,ath,Cu2+,family,cytoplasm,metabolite,HMA,family,...,PlantCoding,HMA,,,Metabolite,,Cu2+,"HMA[AT1G63440,AT4G33520,AT5G21930,AT5G44790]",,Cu2+
4,forCB,KG,Conn005,ath,ETR,family,ER,protein,Cu2+,family,...,Metabolite,,,,PlantCoding,ETR,"ETR[AT1G04310,AT1G66340,AT2G40940,AT3G04580,AT...",Cu2+,,"ETR[AT1G04310,AT1G66340,AT2G40940,AT3G04580,AT..."


In [43]:
node_localisation_dict = {
    'nuc':'nucleus',
    'er':'endoplasmic reticulum',
    'golgi':'golgi apparatus', 
    'mitochondria?': 'putative:mitochondrion', 
    'cytoplasm?': 'putative:cytoplasm', 
}


good_localisations = set([
     'nucleus',
     'nucleolus',
     'cytoplasm',
     'vacuole',
     'endoplasmic reticulum',
     'chloroplast',
     'mitochondrion',
     'golgi apparatus',
     'peroxisome',
     'apoplast',
     'extracellular'
])

good_localisations.update(['putative:' + s for s in good_localisations])


def node_localisation_std(x):
    if not type(x) == str:
        return "putative:cytoplasm"
    
    x = x.lower()
    if x in node_localisation_dict.keys():
        x = node_localisation_dict[x]
    
    if x in good_localisations:
        return x
    else:
        print(x)
        return ""

node_localisations = set()
new_localisation = set()
for prefix in ['input1', 'input2', 'input3', 'output1']:
    id_col, type_col, localisation_col, location_col  =\
        [prefix + x for x in ('_ID',  '_type',  '_localisation', '_location')]
    
    x = df_edges[['ConnID', 'origin', id_col, localisation_col]].dropna(how='all', subset=[ id_col, localisation_col])
    for _, y in x.iterrows():
        if (not (y[id_col] in helpers.empty_strings)) and (y[localisation_col] in helpers.empty_strings + ['']):
            print(y['origin'], "|", y['ConnID'], "|", prefix, "|", y[id_col], "|", y[localisation_col])
    
    
    node_localisations.update(x[localisation_col])
    
    #print(prefix)
    df_edges[location_col] = df_edges[localisation_col].apply(node_localisation_std)
    
    new_localisation.update(df_edges[location_col])

In [44]:
node_localisations

{'ER',
 'Golgi',
 'chloroplast',
 'cytoplasm',
 'cytoplasm?',
 'extracellular',
 'mitochondria?',
 'nucleus',
 'peroxisome',
 'vacuole'}

In [45]:
new_localisation

{'chloroplast',
 'cytoplasm',
 'endoplasmic reticulum',
 'extracellular',
 'golgi apparatus',
 'nucleus',
 'peroxisome',
 'putative:cytoplasm',
 'putative:mitochondrion',
 'vacuole'}

In [46]:
good_localisations

{'apoplast',
 'chloroplast',
 'cytoplasm',
 'endoplasmic reticulum',
 'extracellular',
 'golgi apparatus',
 'mitochondrion',
 'nucleolus',
 'nucleus',
 'peroxisome',
 'putative:apoplast',
 'putative:chloroplast',
 'putative:cytoplasm',
 'putative:endoplasmic reticulum',
 'putative:extracellular',
 'putative:golgi apparatus',
 'putative:mitochondrion',
 'putative:nucleolus',
 'putative:nucleus',
 'putative:peroxisome',
 'putative:vacuole',
 'vacuole'}

In [47]:
print('reaction_mode_dict = {')
for s in df_edges['ReactionMode'].unique():
    print(f"\t'{s}':'{'/'.join([x.lower().strip() for x in str(s).split('/')])}',")
print('}')

reaction_mode_dict = {
	'catalysis / auto-catalysis':'catalysis/auto-catalysis',
	'translocation':'translocation',
	'protein activation':'protein activation',
	'binding / oligomerisation':'binding/oligomerisation',
	'protein deactivation':'protein deactivation',
	'transcriptional / translational repression':'transcriptional/translational repression',
	'transcriptional / translational induction':'transcriptional/translational induction',
	'degradation / secretion':'degradation/secretion',
	'nan':'nan',
	'dissociation':'dissociation',
	'cleavage / auto-cleavage':'cleavage/auto-cleavage',
}


In [48]:
# # Old version before edit
# reaction_mode_dict = {
# 	'catalysis / auto-catalysis':'catalysis/auto-catalysis',
# 	'translocation':'translocation',
# 	'protein activation':'protein activation',
# 	'binding / oligomerisation':'binding/oligomerisation',
# 	'protein deactivation':'protein deactivation',
# 	'transcription/translation repression':'transcription/translation repression',
# 	'transcription/translation induction':'transcription/translation induction',
# 	'degradation / secretion':'degradation/secretion',
# 	'dissociation':'dissociation',
# 	'translation':'translation',
# 	'transcription':'transcription',
# 	'by binding':'by binding',
# 	'nan':'nan',
# 	'cleavage / auto-cleavage':'cleavage/auto-cleavage',
# 	'binding/oligomerisation':'binding/oligomerisation',
# 	'protein phosphorylation':'protein phosphorylation',
# }

# Old version after edit
# reaction_mode_dict = {
# 	'catalysis / auto-catalysis':'catalysis/auto-catalysis',
# 	'translocation':'translocation',
# 	'protein activation':'protein activation',
# 	'binding / oligomerisation':'binding/oligomerisation',
# 	'protein deactivation':'protein deactivation',
# 	'transcription/translation repression':'transcription/translation repression',
# 	'transcription/translation induction':'transcription/translation induction',
# 	'degradation / secretion':'degradation/secretion',
# 	'dissociation':'dissociation',
# 	'translation':'bad',
# 	'transcription':'bad',
# 	'by binding':'bad',
# 	np.nan:'bad',
# 	'cleavage / auto-cleavage':'bad',
# 	'binding/oligomerisation':'binding/oligomerisation',
# 	'protein phosphorylation':'bad',
# }

reaction_mode_dict = {
	'catalysis / auto-catalysis':'catalysis/auto-catalysis',
	'translocation':'translocation',
	'protein activation':'protein activation',
	'binding / oligomerisation':'binding/oligomerisation',
	'protein deactivation':'protein deactivation',
	'transcriptional / translational repression':'transcriptional/translational repression',
	'transcriptional / translational induction':'transcriptional/translational induction',
	'degradation / secretion':'degradation/secretion',
	np.nan:'undefined',
	'dissociation':'dissociation',
	'cleavage / auto-cleavage':'cleavage/auto-cleavage',
}

In [49]:
df_edges['reaction_type'] = df_edges['ReactionMode'].apply(lambda x: reaction_mode_dict[x])

In [50]:
df_edges[df_edges['reaction_type']=='undefined'][['origin', 'ConnID', 'ReactionMode', 'ReactionEffect', 'reaction_type']]

Unnamed: 0,origin,ConnID,ReactionMode,ReactionEffect,reaction_type
31,_TBD-v2.7.8_PIS-model-Reactions,Conn032,,inhibition,undefined
60,_TBD-v2.7.8_PIS-model-Reactions,Conn061,,inhibition,undefined
87,_TBD-v2.7.8_PIS-model-Reactions,Conn088,,inhibition,undefined
88,_TBD-v2.7.8_PIS-model-Reactions,Conn089,,inhibition,undefined
91,_TBD-v2.7.8_PIS-model-Reactions,Conn092,,activation,undefined
120,_TBD-v2.7.8_PIS-model-Reactions,Conn121,,inhibition,undefined
150,_TBD-v2.7.8_PIS-model-Reactions,Conn151,,inhibition,undefined
209,_TBD-v2.7.8_PIS-model-Reactions,Conn210,,inhibition,undefined
310,_TBD-v2.7.8_PIS-model-Reactions,Conn311,,inhibition,undefined


In [51]:
len(df_edges['reaction_id'].unique()) == df_edges.shape[0]

True

In [52]:
df_edges.to_csv(output_path / "edges-sheet.tsv", sep="\t")

In [53]:
df.to_csv(output_path / "functional_clusters.tsv", sep="\t", index=False)

In [54]:
with open(output_path / "complexes_to_add.tsv", "w") as out:
    for c in complexes_to_add:
        out.write(f"{c}\n")

# END