In [1]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:80% !important; }</style>"))

# Import neo4j DB: 2/4

Code to translate v2.7.4_PIS-model.xlsx to neo4j database. 

## Setup

In [2]:
import pandas as pd
import re
import numpy as np
import os
from IPython.display import Image, display

In [3]:
from py2neo import Graph, Node, Relationship

In [4]:
import helpers

In [5]:
from importlib import reload

Connect to graph via docker-compose link. See http://localhost:7474/browser/

In [6]:
graph = Graph(host="neo4j")

In [7]:
q = graph.run("MATCH (n) RETURN n LIMIT 10")

In [8]:
q.stats()

{}

In [9]:
node_labels = helpers.node_labels

In [10]:
from pathlib import Path

base_path = Path("..")
parsed_path = base_path / "data" / "parsed"

In [11]:
# start from scratch
graph.delete_all()

for l in node_labels:
    try:
        graph.schema.drop_uniqueness_constraint(l, "name")
    except:
        pass    

## Constraints on node names

Constraint automatically adds an index. 

In [12]:
for label in node_labels:
    q = f'''CREATE CONSTRAINT unique_name_{label.lower()}
        ON (node:{label}) ASSERT node.name IS UNIQUE'''
    try: 
        qr = graph.run(q)
        print(label, qr.stats()['constraints_added'])
    except Exception as e:
        print(e)

FunctionalCluster 1
PlantCoding 1
PlantNonCoding 1
PlantAbstract 1
ForeignEntity 1
ForeignCoding 1
ForeignNonCoding 1
ForeignAbstract 1
Complex 1
Process 1
MetaboliteFamily 1
Metabolite 1
Reaction 1


## Read in

In [13]:
path =  parsed_path / "components.tsv"
df_components = pd.read_csv(path, sep="\t")

## metabolites

In [14]:
label = 'Metabolite'
df_metabolites = df_components[df_components['NodeLabel'] == label].copy()

In [15]:
df_metabolites.columns

Index(['identifier', 'AddedBy', 'species', 'NodeLabel', 'NodeType', 'Family',
       'Clade', 'NodeID', 'NodeName', 'external_links', 'NodeDescription',
       'AdditionalInfo', 'Process', 'ModelV', 'ModelStatus', 'gmm_ocd',
       'GMM_Description', 'GMM_ShortName', 'synonyms'],
      dtype='object')

In [16]:
df_metabolites[df_metabolites['NodeName'].duplicated()]

Unnamed: 0,identifier,AddedBy,species,NodeLabel,NodeType,Family,Clade,NodeID,NodeName,external_links,NodeDescription,AdditionalInfo,Process,ModelV,ModelStatus,gmm_ocd,GMM_Description,GMM_ShortName,synonyms


In [17]:
want_columns = ['AddedBy', 'Family', 'NodeName', 
                'external_links', 'NodeDescription', 'AdditionalInfo', 
                'Process', 'ModelV', 'ModelStatus',                 
               ]

In [18]:
f = '%s-components.tsv'%label
df_metabolites[want_columns].to_csv("../data/import/" + f, sep="\t", index=None)

In [19]:
q = helpers.metabolite_node_query(f, label, 
                     n_name="line.NodeName"
                    )

In [20]:
print(q)

USING PERIODIC COMMIT 500
           LOAD CSV WITH HEADERS FROM  'file:///Metabolite-components.tsv' AS line FIELDTERMINATOR '	'
           CREATE (p:Metabolite:Metabolite   { 
                name:line.NodeName, 
                added_by:line.AddedBy,
                description:line.NodeDescription, 
                additional_information: line.AdditionalInfo, 
                model_version:line.ModelV,
                model_status:line.ModelStatus, 
                pathway:line.Process,
                
                external_links:split(line.external_links, ",")
                
            })


In [21]:
qr = graph.run(q)
if not df_metabolites.shape[0] == qr.stats()['nodes_created']:
    raise Exception

In [22]:
df_metabolites[df_metabolites['Clade'] != df_metabolites['NodeID']]

Unnamed: 0,identifier,AddedBy,species,NodeLabel,NodeType,Family,Clade,NodeID,NodeName,external_links,NodeDescription,AdditionalInfo,Process,ModelV,ModelStatus,gmm_ocd,GMM_Description,GMM_ShortName,synonyms


In [23]:
label = "MetaboliteFamily"

In [24]:
df_metabolites_has_family = df_metabolites[df_metabolites['Family'] != df_metabolites['NodeID']]

In [25]:
want_columns = ['AddedBy', 'Family',
                'external_links', 'NodeDescription', 'AdditionalInfo', 
                'Process', 'ModelV', 'ModelStatus']

In [26]:
df_metabolites_family = df_metabolites_has_family.fillna('').groupby('Family').agg({
                                                                  'Family':lambda x:list(x)[0],
                                                                  'AddedBy':lambda x:list(x)[0], 
                                                                  'NodeDescription':lambda x: ','.join(x), 
                                                                  'external_links':lambda x: ','.join([s for s in x if not s=='']), 
                                                                  'AdditionalInfo':lambda x: helpers.list_to_string(x), 
                                                                  'Process':lambda x:list(x)[0], 
                                                                  'ModelV':helpers.get_latest_model, 
                                                                  'ModelStatus':helpers.get_model_status})

In [27]:
df_metabolites_family

Unnamed: 0_level_0,Family,AddedBy,NodeDescription,external_links,AdditionalInfo,Process,ModelV,ModelStatus
Family,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
DAMP/HAMP,DAMP/HAMP,KG,oligogalacturonides,chebi:62533,damage/host associated molecular patterns; Oli...,,v2.6,use
PostROS,PostROS,MZ,water,chebi:15377,,S:ROS,v2.7,ignore
PreROS,PreROS,MZ,ROS precursors; oxygen,chebi:25805,,S:ROS,v2.6,use
ROS,ROS,MZ,"hydrogen peroxide (reactive oxygen species),hy...","chebi:16240,chebi:25935,chebi:18421,chebi:2581...",,S:ROS,v2.6,use
SLs,SLs,MZ,StrigoLactone(s),"doi:10.1126/science.1218094,chebi:68487",Strigolactones derive from carotenoids via a p...,Hormone:SLs,v0.0,use
tZRMP,tZRMP,AG,9-ribosyl-trans-zeatin-5&prime;-monophosphate,chebi:71719,,Hormone:CK,v0.0,use


In [28]:
f = '%s-components.tsv'%label
df_metabolites_family.to_csv("../data/import/" + f, sep="\t", index=None)

In [29]:
q = helpers.metabolite_node_query(f, label, 
                     n_name="line.Family"
                    )

In [30]:
print(q)

USING PERIODIC COMMIT 500
           LOAD CSV WITH HEADERS FROM  'file:///MetaboliteFamily-components.tsv' AS line FIELDTERMINATOR '	'
           CREATE (p:Metabolite:MetaboliteFamily   { 
                name:line.Family, 
                added_by:line.AddedBy,
                description:line.NodeDescription, 
                additional_information: line.AdditionalInfo, 
                model_version:line.ModelV,
                model_status:line.ModelStatus, 
                pathway:line.Process,
                
                external_links:split(line.external_links, ",")
                
            })


In [31]:
qr = graph.run(q)
if not df_metabolites_family.shape[0] == qr.stats()['nodes_created']:
    raise Exception 

In [32]:
# Metabolite to MetaboliteFamily edges
edge_type = 'TYPE_OF'
f = '%s-edges.tsv'%edge_type
df_metabolites_has_family[want_columns + ["NodeName"]].to_csv("../data/import/" + f, sep="\t", index=None)

In [33]:
q = helpers.make_create_type_of_edge_query(f, edge_type, 
                           source_label="Metabolite", target_label="MetaboliteFamily",
                           source_name="line.NodeName", target_name="line.Family")

In [34]:
print(q)

USING PERIODIC COMMIT 500
           LOAD CSV WITH HEADERS FROM  'file:///TYPE_OF-edges.tsv' AS line FIELDTERMINATOR '	'
           
           MATCH (source:Metabolite { name:line.NodeName}),
                 (target:MetaboliteFamily { name:line.Family})
           
           CREATE (source)-[:TYPE_OF {
                        added_by:line.AddedBy,
                        additional_information: line.AdditionalInfo, 
                        model_version:line.ModelV,
                        model_status:line.ModelStatus,
                        
                        pathway:line.Process
                        }]->(target)


In [35]:
qr = graph.run(q)
if not df_metabolites_has_family.shape[0] == qr.stats()['relationships_created']:
    raise Exception

## Pathogens (Foreign)

In [36]:
df_external = df_components[df_components["NodeLabel"].isin(helpers.foreign_node_labels)].copy()

In [37]:
df_external.columns

Index(['identifier', 'AddedBy', 'species', 'NodeLabel', 'NodeType', 'Family',
       'Clade', 'NodeID', 'NodeName', 'external_links', 'NodeDescription',
       'AdditionalInfo', 'Process', 'ModelV', 'ModelStatus', 'gmm_ocd',
       'GMM_Description', 'GMM_ShortName', 'synonyms'],
      dtype='object')

In [38]:
# Species --> '-'
# Family --> 'biological classification'
# Clade --> 'species'
df_external.columns = ['identifier', 'AddedBy', '-', 'NodeLabel', 'NodeType', 
                       'classification', 'species', 'NodeID', 'NodeName', 
                       'external_links', 'NodeDescription', 'AdditionalInfo', 
                       'Process', 'ModelV', 'ModelStatus', 
                       'gmm_ocd', 'GMM_Description', 'GMM_ShortName', 'GMM_Synonyms']

In [39]:
#manualfix
df_external.loc[df_external['species']=='oomycete', 'classification'] = 'oomycete'

In [40]:
df_external.head()

Unnamed: 0,identifier,AddedBy,-,NodeLabel,NodeType,classification,species,NodeID,NodeName,external_links,NodeDescription,AdditionalInfo,Process,ModelV,ModelStatus,gmm_ocd,GMM_Description,GMM_ShortName,GMM_Synonyms
3,node0467,KG,external,ForeignCoding,pathogen_coding,bacteria,trichous-bacteria,elf18,elf18,chebi:73165,EF-Tu fragment,"N terminus of elongation factor Tu (EF-Tu), th...",Pathogen_Effector,vNA,use,,,,
4,node0466,KG,external,ForeignCoding,pathogen_coding,bacteria,trichous-bacteria,flg22,flg22,,flagellin fragment,Flagellin is the structural protein that forms...,Pathogen_Effector,vNA,use,,,,
5,node0468,KG,external,ForeignCoding,pathogen_coding,oomycete,oomycete,ch,ch,chebi:17029,chitin,Chitin is a polymer of N-acetyl-d-glucosamine ...,Pathogen_Effector,vNA,use,,,,
6,node0455,KG,external,ForeignCoding,pathogen_coding,virus,potyvirus,6K1,6K1,doi:10.1016/j.coviro.2012.09.004,small peptide of unknown functions,,Pathogen,v2.6,use,,,,
7,node0456,KG,external,ForeignCoding,pathogen_coding,virus,potyvirus,6K2,6K2,doi:10.1016/j.coviro.2012.09.004,small peptide of unknown functions,,Pathogen,v2.6,use,,,,


In [41]:
want_columns = ['AddedBy',  
                'classification', 'species', 'NodeName', 
                'external_links', 'NodeDescription', 'AdditionalInfo', 
                'Process', 'ModelV', 'ModelStatus']

In [42]:
df_external[['NodeType', 'classification', 'species', 'NodeName']]

Unnamed: 0,NodeType,classification,species,NodeName
3,pathogen_coding,bacteria,trichous-bacteria,elf18
4,pathogen_coding,bacteria,trichous-bacteria,flg22
5,pathogen_coding,oomycete,oomycete,ch
6,pathogen_coding,virus,potyvirus,6K1
7,pathogen_coding,virus,potyvirus,6K2
8,pathogen_coding,virus,potyvirus,CI
9,pathogen_coding,virus,potyvirus,CP
10,pathogen_coding,virus,potyvirus,HC-Pro
11,pathogen_coding,virus,potyvirus,NIa-Pro
12,pathogen_coding,virus,potyvirus,NIb


In [43]:
for label, subdf in df_external.groupby('NodeLabel'):
    print(label, end='\t')
    f = '%s-components.tsv'%label
    subdf[want_columns].to_csv("../data/import/" + f, sep="\t", index=None)
    
    q = helpers.foreign_node_query(f, label, 
                     n_name="line.NodeName"
                    )
    print(q)
    qr = graph.run(q)
    if not subdf.shape[0] == qr.stats()['nodes_created']:
        raise Exception

ForeignCoding	USING PERIODIC COMMIT 500
           LOAD CSV WITH HEADERS FROM  'file:///ForeignCoding-components.tsv' AS line FIELDTERMINATOR '	'
           CREATE (p:Foreign:ForeignCoding   { 
                name:line.NodeName, 
                added_by:line.AddedBy,
                description:line.NodeDescription, 
                additional_information: line.AdditionalInfo, 
                model_version:line.ModelV,
                pathway:line.Process,
                species:split(line.species, ","),
                external_links:split(line.external_links, ","),
                                             
                classification:line.classification
            })


In [44]:
external_entities = df_external.groupby('species').agg({
                                                      'classification':lambda x:list(x)[0],
                                                      'AddedBy':lambda x:list(x)[0], 
                                                      #'NodeDescription':lambda x: ', '.join(x), 
                                                      #'AdditionalInfo':lambda x: helpers.list_to_string(x), 
                                                      'Process':lambda x:list(x)[0], 
                                                      'ModelV':helpers.get_latest_model, 
                                                      #'ModelStatus':helpers.get_model_status
                                                }).reset_index()

In [45]:
external_entities.columns

Index(['species', 'classification', 'AddedBy', 'Process', 'ModelV'], dtype='object')

In [46]:
label = 'ForeignEntity'
want_columns = ['species', 'classification', 'AddedBy', 'Process', 'ModelV']
f = '%s-components.tsv'%label
external_entities[want_columns].to_csv("../data/import/" + f, sep="\t", index=None)

In [47]:
q = helpers.foreign_node_query(f, label, 
                     n_name="line.species", 
                    )
print(q)

USING PERIODIC COMMIT 500
           LOAD CSV WITH HEADERS FROM  'file:///ForeignEntity-components.tsv' AS line FIELDTERMINATOR '	'
           CREATE (p:Foreign:ForeignEntity   { 
                name:line.species, 
                added_by:line.AddedBy,
                description:line.NodeDescription, 
                additional_information: line.AdditionalInfo, 
                model_version:line.ModelV,
                pathway:line.Process,
                species:split(line.species, ","),
                external_links:split(line.external_links, ","),
                                             
                classification:line.classification
            })


In [48]:
qr = graph.run(q)
if not external_entities.shape[0] == qr.stats()['nodes_created']:
    raise Exception

In [49]:
# ExternalXXX to external_entities edges
want_columns = ['AddedBy', 'NodeName', 'species', 'ModelV']
edge_type = 'AGENT_OF'
f = '%s-edges.tsv'%edge_type
df_external[want_columns].to_csv("../data/import/" + f, sep="\t", index=None)

In [50]:
q = helpers.make_create_type_of_edge_query(f, edge_type, 
                           source_label="", target_label="ForeignEntity",
                           source_name="line.NodeName", target_name="line.species")
print(q)

USING PERIODIC COMMIT 500
           LOAD CSV WITH HEADERS FROM  'file:///AGENT_OF-edges.tsv' AS line FIELDTERMINATOR '	'
           
           MATCH (source { name:line.NodeName}),
                 (target:ForeignEntity { name:line.species})
           
           CREATE (source)-[:AGENT_OF {
                        added_by:line.AddedBy,
                        additional_information: line.AdditionalInfo, 
                        model_version:line.ModelV,
                        model_status:line.ModelStatus,
                        
                        pathway:line.Process
                        }]->(target)


In [51]:
qr = graph.run(q)
if not df_external.shape[0] == qr.stats()['relationships_created']:
    raise Exception

## Process

In [52]:
df_process = df_components[df_components['NodeLabel'].isin(['Process'
                                                                ])].copy()

In [53]:
df_process.head()

Unnamed: 0,identifier,AddedBy,species,NodeLabel,NodeType,Family,Clade,NodeID,NodeName,external_links,NodeDescription,AdditionalInfo,Process,ModelV,ModelStatus,gmm_ocd,GMM_Description,GMM_ShortName,synonyms
1009,node0792,ZR,all,Process,process,AUX-signalling,AUX-signalling,AUX-signalling,AUX-signalling,conceptual:process,Auxin signalling.,,Hormone:AUX,v2.6,use,,,,
1010,node0788,ZR,all,Process,process,Anthocyanin-accumulation,Anthocyanin-accumulation,Anthocyanin-accumulation,Anthocyanin-accumulation,conceptual:process,Accumulation of anthocynanins.,,P:Secondary-metabolism_Anthocyanins,v2.6,use,,,,
1011,node0793,ZR,all,Process,process,CO2-deficiency,CO2-deficiency,CO2-deficiency,CO2-deficiency,conceptual:process,ROS processes.,,S:ROS,v2.6,use,,,,
1012,node0790,ZR,all,Process,process,RNA-silencing,RNA-silencing,RNA-silencing,RNA-silencing,conceptual:process,RNA silencing.,,P:Silencing,v2.6,use,,,,
1013,node0791,ZR,all,Process,process,ROS-production,ROS-production,ROS-production,ROS-production,conceptual:process,ROS processes.,,S:ROS,v2.6,use,,,,


In [54]:
want_columns = ['AddedBy', 'NodeLabel',  
        'NodeName', 'external_links', 'NodeDescription',
       'AdditionalInfo', 'Process', 'ModelV', 'ModelStatus']


In [55]:
label = "Process"
f = "%s-components.tsv"%label
df_process[want_columns].to_csv("../data/import/" + f, sep="\t", index=None)

In [56]:
q = helpers.process_node_query(f, label, 
                     n_name="line.NodeName", 
                    )
print(q)

USING PERIODIC COMMIT 500
           LOAD CSV WITH HEADERS FROM  'file:///Process-components.tsv' AS line FIELDTERMINATOR '	'
           CREATE (p:Process   { 
                name:line.NodeName, 
                added_by:line.AddedBy,
                description:line.NodeDescription, 
                additional_information: line.AdditionalInfo, 
                model_version:line.ModelV,
                model_status:line.ModelStatus, 
                pathway:line.Process,
                
                external_links:split(line.external_links, ",")
                
            })


In [57]:
qr = graph.run(q)
if not df_process.shape[0] == qr.stats()['nodes_created']:
    raise Exception

## Complexes

In [58]:
label = 'Complex'
df_complex = df_components[df_components['NodeLabel'] == label].copy()

In [59]:
df_complex

Unnamed: 0,identifier,AddedBy,species,NodeLabel,NodeType,Family,Clade,NodeID,NodeName,external_links,NodeDescription,AdditionalInfo,Process,ModelV,ModelStatus,gmm_ocd,GMM_Description,GMM_ShortName,synonyms
0,node0887,ZR,all,Complex,plant_complex,SCF,SCF,SCF,SCF,go:0019005,SCF,,,v2.7,use,,,,
1,node0888,ZR,all,Complex,plant_complex,WD/bHLH/MYB,WD/bHLH/MYB,WD/bHLH/MYB,WD/bHLH/MYB,conceptual:complex,WD/bHLH/MYB,,,v2.7,use,,,,
2,node0787,ZR,all,Complex,plant_complex,ribosome,ribosome,ribosome,ribosome,go:0005840,ribosome,,,v2.6,use,,,,


In [60]:
# save node types 
print(label, "\t", df_complex.shape[0])
df_complex.to_csv(f"../data/import/{label}-components.tsv", sep="\t", index=None)

Complex 	 3


In [61]:
query = helpers.bioelement_node_query(f"{label}-components.tsv", label, n_name="line.Family")
qr = graph.run(query)
print(label, "\t", qr.stats()['nodes_created'])

Complex 	 3


## Bio-elements

these nodes have "Family", "Clade" (msa) and "FunctionalCluster"

In [62]:
df_bioelements = df_components[df_components['NodeLabel'].isin(helpers.plant_node_labels)].copy()

In [63]:
df_bioelements[df_bioelements['NodeName'].duplicated(keep=False)].sort_values('NodeName')[['identifier', 'species', 'Family', 'Clade', 'NodeID', 'NodeName']]

Unnamed: 0,identifier,species,Family,Clade,NodeID,NodeName
335,node0795,ath,CPS,CPS,AT4G02780,CPS
336,node0796,stu,CPS,CPS,SOTUB06G034690.1.1,CPS
337,node0797,stu,CPS,CPS,SOTUB08G006560.1.1,CPS.x1
338,node0800,sly,CPS,CPS,SOLYC06G084240.2.1,CPS.x1
339,node0798,stu,CPS,CPS,SOTUB08G020310.1.1,CPS.x2
340,node0801,sly,CPS,CPS,SOLYC08G005710.3.1,CPS.x2
477,node0823,stu,GA20ox,GA20ox,SOTUB06G023200.1.1,GA20ox.x1
478,node0828,sly,GA20ox,GA20ox,SOLYC06G050110.2.1,GA20ox.x1
479,node0824,stu,GA20ox,GA20ox,SOTUB09G017720.1.1,GA20ox.x2
480,node0829,sly,GA20ox,GA20ox,SOLYC09G009110.3.1,GA20ox.x2


In [64]:
pd.value_counts(df_bioelements['NodeLabel'])

PlantCoding       858
PlantNonCoding     12
PlantAbstract      12
Name: NodeLabel, dtype: int64

In [65]:
all_species_listed = list(df_bioelements['species'].unique()); all_species_listed
all_species = all_species_listed.copy()
all_species

['ath', 'stu', 'sly', 'osa']

In [66]:
def pick_the_set(x):
    for v in x:
        if type(v) == set:
            return ",".join(list(v))
    return ''

def str_lists_to_one_list(x):
    l = []
    for sub in x:
        l += sub.split(',')
    l = [s.strip() for s in l if not s=='']
    
    return ','.join(list(set(l)))

def get_species_homologues(level, meta=False, use=False):
    
    if use:
        df = df_bioelements[df_bioelements['ModelStatus'] == 'use'].copy()
    else:
        df = df_bioelements.copy()

    df['family'] = df['Family']
    
    df_level_species  = df.groupby([level, 'species']).agg({
      'NodeID':lambda x: set(x), 
      'family':lambda x: set(x).pop(), 
    })
       
    df_level_species.reset_index(1, inplace=True)

    cols = []
    for specie in all_species:
        col = specie + '_homologues'
        cols.append(col)
        df_level_species.loc[df_level_species['species'] == specie, col] =\
                 df_level_species[df_level_species['species'] == specie]['NodeID']
    
    selector = {
        y:pick_the_set for y in cols
    }
    selector['family'] = lambda x: x[0]
        
    df_level_species = df_level_species.groupby(level).agg(selector)
    
    if meta:
        df_level = df.fillna('').groupby(level).agg({
              'AddedBy':lambda x:list(x)[0], 
              'NodeLabel':lambda x:list(x)[0], 
              'NodeDescription':lambda x: ', '.join(list(set(x))), 
              'AdditionalInfo':lambda x: helpers.list_to_string(x), 
              'Process':lambda x:list(x)[0], 
              'ModelV':helpers.get_latest_model, 
              'species':lambda x: ','.join(list(set(x))),
              'ModelStatus':helpers.get_model_status,
              'external_links':str_lists_to_one_list, 
              'gmm_ocd':str_lists_to_one_list,
              'GMM_Description':str_lists_to_one_list,
              'GMM_ShortName':str_lists_to_one_list,
              'synonyms':str_lists_to_one_list,
        })    

        df = df_level.join(df_level_species[cols])
    else:
        df = df_level_species#[cols]
    
    df.reset_index(inplace=True)
    return df

In [67]:
df_families = get_species_homologues("Family", meta=True)

In [68]:
# save node types 
family_node_labels = []
for t, subdf in df_families.groupby("NodeLabel"):
    print(t, "\t", subdf.shape[0])
    subdf.to_csv("../data/import/%s-components.tsv"%t, sep="\t", index=None)
    family_node_labels.append(t)

PlantAbstract 	 7
PlantCoding 	 162
PlantNonCoding 	 9


In [69]:
for t in family_node_labels:
    labels = [t, 'Family', 'Plant']
    query = helpers.bioelement_node_query("%s-components.tsv"%t, labels, n_name="line.Family")
    #query = f"MATCH (n:{t}) DELETE n"
    #print(query)
    qr = graph.run(query)
    print(t, "\t", qr.stats()['nodes_created'])

PlantAbstract 	 7
PlantCoding 	 162
PlantNonCoding 	 9


## save files

In [70]:
file_name = parsed_path / "bio_elements.tsv"
df_bioelements.to_csv(file_name, sep="\t", index=False)

In [71]:
dfs = []
for col_name, level in [('NodeName', 'node'), ('Clade', 'clade'), ('Family', 'family')]:
    print(col_name)
    df = get_species_homologues(col_name, use=True)

    df.rename(columns={col_name:'name'}, inplace=True)
    df['level'] = level
    df.set_index(['name', 'level'], inplace=True)
    dfs.append(df)

NodeName
Clade
Family


In [72]:
translate_df = pd.concat(dfs)
translate_df = translate_df.sort_index()
translate_df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,ath_homologues,stu_homologues,sly_homologues,osa_homologues,family
name,level,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
&alpha;/&beta; hydroxylase,clade,AT3G03990,,,,&alpha;/&beta; hydroxylase
&alpha;/&beta; hydroxylase,family,AT3G03990,,,,&alpha;/&beta; hydroxylase
&beta;-carotene isomerase,clade,,,,OS11G0587000,&beta;-carotene isomerase
&beta;-carotene isomerase,family,,,,OS11G0587000,&beta;-carotene isomerase
4CLL,clade,"AT1G20500,AT5G38120,AT1G20480,AT1G20510",,,,4CLL


In [73]:
file_name = parsed_path / "level_translation.tsv"
translate_df.to_csv(file_name, sep="\t")

In [74]:
dfs = []
for col_name, level in [('NodeName', 'node'), ('Clade', 'clade'), ('Family', 'family')]:
    print(col_name)
    df = get_species_homologues(col_name, use=False)

    df.rename(columns={col_name:'name'}, inplace=True)
    df['level'] = level
    df.set_index(['name', 'level'], inplace=True)
    dfs.append(df)

NodeName
Clade
Family


In [75]:
translate_df = pd.concat(dfs)
translate_df = translate_df.sort_index()
translate_df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,ath_homologues,stu_homologues,sly_homologues,osa_homologues,family
name,level,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
&alpha;/&beta; hydroxylase,clade,AT3G03990,,,,&alpha;/&beta; hydroxylase
&alpha;/&beta; hydroxylase,family,AT3G03990,,,,&alpha;/&beta; hydroxylase
&beta;-carotene isomerase,clade,,,,OS11G0587000,&beta;-carotene isomerase
&beta;-carotene isomerase,family,,,,OS11G0587000,&beta;-carotene isomerase
4CLL,clade,"AT1G20500,AT5G38120,AT1G20480,AT1G20490,AT4G05...",,,,4CLL


In [76]:
file_name = parsed_path / "level_not_use_translation.tsv"
translate_df.to_csv(file_name, sep="\t")

# END 