In [1]:
# Widen the notebook container so that wide DataFrames stay readable.
# `display` and `HTML` come from the public `IPython.display` module; importing
# `display` from `IPython.core.display` is deprecated.
from IPython.display import display, HTML
display(HTML("<style>.container { width:80% !important; }</style>"))

# Import neo4j DB: 3/4

Code to translate v2.7.4_PIS-model.xlsx to neo4j database. 

## Setup

In [2]:
import pandas as pd
import re
import numpy as np
import os
from IPython.display import Image

from collections import defaultdict

In [3]:
import helpers

In [4]:
from importlib import reload

In [5]:
from pathlib import Path

# Data directories; all are resolved relative to the parent of the current working directory.
base_path = Path("..")
input_path = base_path / "data" / "raw"       # raw inputs (not referenced in the cells visible here)
output_path = base_path / "data" / "parsed"   # parsed tables read and written by this notebook

Connect to graph via docker-compose link. See http://localhost:7474/browser/

In [6]:
from py2neo import Graph, Node, Relationship

In [7]:
# Open the py2neo connection used by every Cypher query below; "neo4j" is the database host name.
graph = Graph(host="neo4j")

## Read in Bio elements
bioelements = PlantCoding/PlantNonCoding/PlantAbstract

These have a family hierarchy.

In [8]:
# Load the parsed bio-element table (tab-separated).
file_name = output_path / "bio_elements.tsv"
df_bioelements = pd.read_table(file_name)
df_bioelements.head()

Unnamed: 0,identifier,AddedBy,species,NodeLabel,NodeType,Family,Clade,NodeID,NodeName,external_links,NodeDescription,AdditionalInfo,Process,ModelV,ModelStatus,gmm_ocd,GMM_Description,GMM_ShortName,synonyms
0,node0278,KG,ath,PlantAbstract,plant_abstract,BA2H,BA2H,BA2H,BA2H,,BA2H,Benzoic acid 2-hydroxylase: Isolated from toba...,Hormone:SA,v1.0,ignore,,,,
1,node0255,KG,ath,PlantAbstract,plant_abstract,IPL,IPL,IPL,IPL,,Arabidopsis contains two ICS genes but has no ...,,Hormone:SA,v1.0,ignore,,,,
2,node0695,KG,ath,PlantAbstract,plant_abstract,R-gene,GPAphid2,GPAphid2,GPAphid2,invented:unidentified,green peach aphid 2; resistance against PVX (stu),,S:Rgenes,v2.6,ignore,,,,
3,node0692,KG,ath,PlantAbstract,plant_abstract,R-gene,HRT,HRT,HRT,invented:unidentified,HR against TCV (ath),,S:Rgenes,v2.5,ignore,,,,
4,node0691,KG,ath,PlantAbstract,plant_abstract,R-gene,Ny,Ny,Ny,invented:unidentified,HR against PVY (stu),,S:Rgenes,v2.5,ignore,,,,


In [9]:
# Node names that occur more than once: every occurrence after the first is collected,
# so a name that appears three times is listed twice.
repeated_node_level = df_bioelements.loc[
    df_bioelements['NodeName'].duplicated(keep='first'), 'NodeName'
].values

In [10]:
repeated_node_level

array(['CPS', 'CPS.x1', 'CPS.x2', 'RGA2', 'RGA2', 'GA20ox.x1',
       'GA20ox.x2', 'GA20ox1', 'GA20ox1', 'GA20ox2', 'GA20ox2', 'GA20ox3',
       'GA20ox3', 'GA20ox4', 'GA20ox4', 'GA3ox.x1', 'GA3ox.x2', 'GID1.x1',
       'GID1.x2', 'GID1.x3', 'MYB33', 'MYB33-65.x2'], dtype=object)

In [11]:
# Translation table indexed by (name, level); missing homologue cells become empty strings.
file_name = output_path / "level_translation.tsv"
translate_df = pd.read_table(file_name, index_col=[0, 1]).fillna('')
translate_df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,ath_homologues,stu_homologues,sly_homologues,osa_homologues,family
name,level,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
&alpha;/&beta; hydroxylase,clade,AT3G03990,,,,&alpha;/&beta; hydroxylase
&alpha;/&beta; hydroxylase,family,AT3G03990,,,,&alpha;/&beta; hydroxylase
&beta;-carotene isomerase,clade,,,,OS11G0587000,&beta;-carotene isomerase
&beta;-carotene isomerase,family,,,,OS11G0587000,&beta;-carotene isomerase
4CLL,clade,"AT1G20510,AT1G20480,AT5G38120,AT1G20500",,,,4CLL


In [None]:
# Second translation table, same (name, level) index; used as a fallback when a key
# is not found in `translate_df`.
file_name = output_path / "level_not_use_translation.tsv"
translate_ignore_etc_df = pd.read_table(file_name, index_col=[0, 1]).fillna('')
translate_ignore_etc_df.head()

## Read in parsed reactions

In [None]:
# Concrete species present in the bio-element table; the pseudo-species
# 'plant_all' and 'all' are left out.
all_species = [
    specie
    for specie in df_bioelements['species'].unique()
    if specie not in ('plant_all', 'all')
]
all_species

In [None]:
# NOTE(review): this file is read relative to the current working directory, unlike the
# other inputs, which are read from `output_path` -- confirm that this is intended.
df_edges = pd.read_table("parsed_reactions.tsv", index_col=0)
df_edges.head()

In [None]:
# True when every reaction row carries a distinct ConnID.
df_edges['ConnID'].is_unique

In [None]:
df_edges['species'].value_counts()

In [None]:
# Derive a zero-padded reaction identifier from the index value: 0 -> rx00001, 1 -> rx00002, ...
df_edges['reaction_id'] = [f"rx{idx+1:05}" for idx in df_edges.index]
df_edges['reaction_id'].head()

In [None]:
# For every node label, collect the set of node names currently stored in the graph.
node_dict = {}
for label in helpers.node_labels:
    # Plain "Metabolite" lookups must not pick up nodes that are also labelled MetaboliteFamily.
    family_filter = " WHERE NOT n:MetaboliteFamily" if label == "Metabolite" else ""
    q = f"MATCH (n:{label}){family_filter} RETURN DISTINCT n.name"
    s = {record['n.name'] for record in graph.run(q).data()}
    print(label, len(s))
    node_dict[label] = s

In [None]:
translate_df.head()

In [None]:
translate_ignore_etc_df.loc[('Rx1', 'node')]

In [None]:
def get_label(x):
    """Resolve the graph node label for one reaction participant.

    Parameters
    ----------
    x : row-like object whose ``.values`` unpack to exactly five items, in this order:
        participant id, form, level, ConnID, origin.

    Returns
    -------
    ``np.nan`` when the id is one of ``helpers.empty_strings``; the label string when
    one could be determined; ``None`` otherwise.

    Side effects
    ------------
    * appends the id to the module-level list ``complexes_to_add`` when a complex is
      not yet in ``node_dict["Complex"]`` (the same id can be appended repeatedly);
    * adds ids to the module-level set ``missing_in_components`` in error branches a, d and e;
    * prints ``ERROR`` lines. The final "c" message is printed whenever no label was
      found, so it also follows the a/b/d/e messages.

    Reads the module-level ``node_dict``, ``translate_df`` and ``translate_ignore_etc_df``.
    """
    # pathogen proteins are listed as proteins, so cannot use dict
    id_, form_, level_, ConnID, origin = x.values
      
    # Empty participant slot: nothing to label.
    if id_ in helpers.empty_strings:
        return np.nan

    new_label = None
    
    ########################
    # Simple Cases
    ########################
    if form_ in ['complex', 'complex_active']:
        # Complexes always get the label; unknown ones are remembered for later creation.
        if not (id_ in node_dict["Complex"]):
            complexes_to_add.append(id_)
        new_label = 'Complex'
    
    elif form_ in ["metabolite"]:
        # First matching label wins; "Metabolite" is checked before "MetaboliteFamily".
        for label in ["Metabolite", "MetaboliteFamily"]:
            if id_ in node_dict[label]:
                new_label = label
                break
        if not new_label:
            missing_in_components.update([id_])        
            print('ERROR', origin, "|", ConnID, "|", id_, "|", form_, "|", level_, " |a (label) not a listed Metabolite/MetaboliteFamily")

    elif form_ in ['process', 'process_active']:
        if (id_ in node_dict["Process"]):
            new_label = "Process"
        else:
            print('ERROR', origin, "|", ConnID, "|", id_, "|", form_, "|", level_,  " |b (label) process not a listed process")
    
    else:
        # something plant, something foreign
        # Look the id up in the main translation table, then in the fallback table.
        family_id = get_family_id(id_, level_, translate_df)
        
        if not family_id:
            family_id = get_family_id(id_, level_, translate_ignore_etc_df)

                
        if (family_id):
            id_labels = [] # looping just in case an id occurs multiple times
            for label in helpers.plant_node_labels:
                if family_id in node_dict[label]:
                    id_labels.append(label)

            if len(id_labels) == 1:
                new_label = id_labels[0]
            elif len(id_labels) > 1:
                print('ERROR', origin, "|", ConnID, "|", id_, "|", form_, "|", level_,  " |d (label) many labels fit")
                missing_in_components.update([id_])       

            else:
                print('ERROR', origin, "|", ConnID, "|", id_, "|", form_, "|", level_,  " |e (label) could not find label")
                missing_in_components.update([id_]) 
        else:
            # check if foreign
            # Note: foreign labels are matched on the raw id, not on a family id.
            for label in helpers.foreign_node_labels:
                if id_ in node_dict[label]:
                    new_label = label
                    break
        
    # Catch-all report for every participant that ended up without a label.
    if not new_label:
        print('ERROR', origin, "|", ConnID, "|", id_, "|", form_, "|", level_,  " |c (family id) could not convert to label")
        
    return new_label


    
def get_family_id(id_, level_, df):
    """Return the ``family`` value stored in ``df`` for ``id_`` at the given level.

    ``df`` is indexed by (name, level). The reaction level is mapped onto the index
    level as follows: "family" -> 'family', "clade" and "clade/orthologue" -> 'clade',
    "node" -> 'node'. Any other level, or a key that is missing from ``df``, yields
    ``None``. An INFO line is printed when ``id_`` is listed in the module-level
    ``repeated_node_level``.
    """
    if id_ in repeated_node_level:
        print('INFO', id_, level_, 'node level has multiple identifiers')

    index_level = {
        "family": "family",
        "clade": "clade",
        "clade/orthologue": "clade",
        "node": "node",
    }.get(level_)
    if index_level is None:
        return None

    try:
        return df.loc[(id_, index_level)]['family']
    except KeyError:
        return None

def apply_get_family_id(x):
    """Row-wise wrapper around ``get_family_id`` for plant participants.

    Parameters
    ----------
    x : row-like object whose ``.values`` unpack to exactly five items, in this order:
        participant id, level, label, ConnID, origin.

    Returns
    -------
    The family id of the participant, or ``np.nan`` when the id is empty, the label
    is not one of ``helpers.plant_node_labels``, or no family could be found.

    The lookup uses ``translate_df`` first and ``translate_ignore_etc_df`` as a
    fallback, the same two-step lookup that ``get_label`` performs. (The previous
    version called ``get_family_id`` without its required table argument and
    therefore raised ``TypeError`` for every plant participant.)
    """
    id_, level_, label_, ConnID, origin = x.values

    if id_ in helpers.empty_strings:
        return np.nan

    if label_ not in helpers.plant_node_labels:
        return np.nan

    family_id = get_family_id(id_, level_, translate_df)
    if not family_id:
        family_id = get_family_id(id_, level_, translate_ignore_etc_df)

    # Normalise "not found" (None or empty string) to NaN, the function's documented default.
    return family_id if family_id else np.nan


In [None]:
# Module-level collectors; `get_label` fills the first two while it runs.
complexes_to_add = []
missing_in_components = set()
replace_w_family = set()

for prefix in ['input1', 'input2', 'input3', 'output1']:
    print(prefix)
    id_col = f"{prefix}_ID"
    form_col = f"{prefix}_form"
    level_col = f"{prefix}_level"
    label_col = f"{prefix}_label"
    family_col = f"{prefix}_family"  # the family column itself is not populated in this cell

    label_inputs = df_edges[[id_col, form_col, level_col, 'ConnID', 'origin']]
    df_edges[label_col] = label_inputs.apply(get_label, axis=1, result_type='expand')


## Functional groups
Node --> identifiers

Clade --> identifiers

Family --> identifiers

In [None]:
helpers.plant_node_labels

In [None]:
def create_entry():
    """Build a blank reaction-player record.

    The record has empty-string 'level', 'members', 'species' and 'family' fields,
    followed by one '<species>_homologues' field per entry of the module-level
    ``all_species``, each initialised to '-'.
    """
    entry = dict.fromkeys(('level', 'members', 'species', 'family'), '')
    entry.update({f"{specie}_homologues": '-' for specie in all_species})
    return entry

def list_of_string_list_to_list(x, delim=","):
    '''Flatten a list of delimiter-joined strings into one sorted list.

    Entries of ``x`` and individual pieces that are listed in
    ``helpers.empty_strings`` are skipped.

    e.g: ['AT5G26130,AT1G50060', '', 'AT2G14610', '']
    to:  ['AT1G50060', 'AT2G14610', 'AT5G26130']
    '''
    return sorted(
        piece
        for joined in x
        if joined not in helpers.empty_strings
        for piece in joined.split(delim)
        if piece not in helpers.empty_strings
    )

# Collect, for every plant participant of every reaction, the homologue identifiers
# per species. Result shape: reaction_players[node name][reaction id] -> dict with
# '<species>_homologues', 'level', 'family', 'members' and 'species'.
#
# Rows printed below flag participants whose species coverage is incomplete or that
# have no members at all; the header now names all seven printed columns.
print("column\tname\trxid\tlevel\tspecies(as in excel)\tspecies(for identifiers)\tmembers")
reaction_players = defaultdict(lambda : defaultdict(dict))
for prefix in ['input1', 'input2', 'input3', 'output1']:
    id_col, level_col, label_col = [prefix + x for x in ('_ID', '_level', '_label')]

    this_players = df_edges[['reaction_id', 'species', id_col, level_col, label_col]]

    for _, row in this_players.iterrows():
        # Only plant nodes are grouped into functional clusters.
        if row[label_col] not in helpers.plant_node_labels:
            continue

        name = row[id_col]
        level = row[level_col]
        reaction_species = row['species']

        # 'all' expands to every known species; otherwise the cell is a comma-separated list.
        if reaction_species == 'all':
            tmp_reaction_species = all_species
        else:
            tmp_reaction_species = reaction_species.split(',')

        # NOTE(review): the level is used as-is here, whereas get_family_id maps
        # "clade/orthologue" onto the 'clade' index level -- confirm that no such
        # level reaches this lookup.
        try:
            r = translate_df.loc[(name, level)]
            no_members = False
        except KeyError:
            # Fallback table: the family is still recorded, but no members are taken from it.
            # A KeyError from this second lookup is deliberately not caught.
            r = translate_ignore_etc_df.loc[(name, level)]
            no_members = True

        # The same (name, reaction) pair seen again (e.g. in another input column)
        # overwrites the fields written earlier.
        entry = reaction_players[name][row['reaction_id']]

        members = []
        was_in_species = []
        for specie in all_species:
            c = f"{specie}_homologues"
            if (not no_members) and (specie in tmp_reaction_species):
                entry[c] = r[c]
                members.append(r[c])
                was_in_species.append(specie)
            else:
                entry[c] = ''

        members = list_of_string_list_to_list(members)

        if set(was_in_species) != set(tmp_reaction_species) or len(members) == 0:
            print(f"{id_col}\t{name}\t{row['reaction_id']}\t{level}\t{reaction_species}\t{tmp_reaction_species}\t{members}")

        entry['level'] = level
        entry['family'] = r['family']
        entry['members'] = members
        entry['species'] = reaction_species

# Freeze the outer defaultdict so later lookups of unknown names raise KeyError.
reaction_players = dict(reaction_players)

In [None]:
# Dump reaction_players as a flat TSV: one line per (node, reaction) pair.
with open("FunctionalClusters_first-definition.tsv", "w") as out:
    c = '\t'.join([f'{specie}_homologues' for specie in all_species])
    out.write(f"node_name\tfamily\treaction_id\tlevel\tspecies\tmembers\t{c}\n")
    for node, reactions in reaction_players.items():
        for reaction, entry in reactions.items():
            members = entry['members']
            c = '\t'.join([entry[f'{specie}_homologues'] for specie in all_species])
            out.write(f"{node}\t{entry['family']}\t{reaction}\t{entry['level']}\t{entry['species']}\t{','.join(members)}\t{c}\n")

In [None]:
!head FunctionalClusters_first-definition.tsv

In [None]:
# Read the table back in; empty cells become empty strings instead of NaN.
functional_clusters_df = pd.read_csv("FunctionalClusters_first-definition.tsv", sep="\t").fillna('')

In [None]:
functional_clusters_df[functional_clusters_df['level']=='node']

In [None]:
print(functional_clusters_df.shape[0])
functional_clusters_df.head()

In [None]:
print(functional_clusters_df.shape[0])
functional_clusters_df.head()

In [None]:
# Order by node name, then keep one row per (node_name, members, level) combination.
functional_clusters_df = (
    functional_clusters_df
    .sort_values('node_name')
    .drop_duplicates(keep='first', subset=['node_name', 'members', 'level'])
)

In [None]:
functional_clusters_df[functional_clusters_df['node_name']=='CPS']

In [None]:
functional_clusters_df[functional_clusters_df['node_name']=='MYB33']

In [None]:
# Show every row whose node_name still occurs more than once after de-duplication.
name_counts = functional_clusters_df[['node_name']].value_counts()
x = [key[0] for key in name_counts[name_counts > 1].index]
functional_clusters_df[functional_clusters_df['node_name'].isin(x)]

In [None]:
functional_clusters_df[functional_clusters_df['members'] == '']

In [None]:
# A functional cluster is named "<node_name>[<comma-joined members>]".
functional_clusters_df["functional_cluster_name"] = [
    f"{node_name}[{members}]"
    for node_name, members in zip(functional_clusters_df['node_name'], functional_clusters_df['members'])
]
functional_clusters_df.head()

In [None]:
functional_clusters_df[functional_clusters_df['members'] =='']

In [None]:
functional_clusters_df.head()

In [None]:
functional_clusters_df[functional_clusters_df['node_name'] == 'SAGT']

In [None]:
functional_clusters_df[functional_clusters_df['node_name'] == 'COI1']

In [None]:
functional_clusters_df[functional_clusters_df['node_name'] == 'EIN2']

In [None]:
functional_clusters_df[functional_clusters_df['node_name'] == 'miR6022']

In [None]:
functional_clusters_df[functional_clusters_df['node_name'] == 'miR159a']

In [None]:
functional_clusters_df[functional_clusters_df['node_name'] == 'X4']

In [None]:
functional_clusters_df[functional_clusters_df['reaction_id'] == 'rx00039']

## Create functional cluster nodes

In [None]:
functional_clusters_import = functional_clusters_df.drop_duplicates(keep='first', subset=['functional_cluster_name'])

In [None]:
functional_clusters_import.shape

In [None]:
def clean_labels(labels):
    """Return the first label that is not one of the generic structural labels.

    The labels 'Family', 'Plant', 'Foreign' and 'Node' are ignored. ``labels`` may be
    any sequence of strings and is left unmodified (the previous version removed
    entries from the caller's list in place).

    Raises
    ------
    IndexError
        If no label remains after the generic ones are ignored.
    """
    generic_labels = ('Family', 'Plant', 'Foreign', 'Node')
    specific_labels = [label for label in labels if label not in generic_labels]
    return specific_labels[0]

def functional_cluster_query(file_name, 
                         labels, 
                         n_name="line.functional_cluster_name",
                         species=None,
                        ):
    """Build the Cypher ``LOAD CSV`` statement that creates functional-cluster nodes.

    Parameters
    ----------
    file_name : name of the TSV file, inserted after ``file:///``.
    labels : a single label string, or a list/tuple of labels that are joined with ':'.
    n_name : Cypher expression used for the node's ``name`` property.
    species : species codes for which a ``<species>_homologues`` list property is
        written; defaults to the module-level ``all_species``.

    Returns
    -------
    The query string. Each created node gets ``name``, one ``<species>_homologues``
    property per species (the CSV cell split on ',') and ``family``.
    """
    # Accept tuples as well as lists; a bare string is used as a single label.
    if isinstance(labels, (list, tuple)):
        node_label = ':' + ':'.join(labels)
    else:
        node_label = ':' + labels

    if species is None:
        species = all_species

    species_str = "".join(
        f"{specie}_homologues:split(line.{specie}_homologues, ','),\n                "
        for specie in species
    )

    q = '''USING PERIODIC COMMIT 500
           LOAD CSV WITH HEADERS FROM  'file:///{file_name}' AS line FIELDTERMINATOR '\t'
           CREATE (p{node_label}   {{ 
                name:{name}, 
                                
                {species_str}

                family:line.family
   
            }})'''.format(file_name=file_name,
                          node_label=node_label,
                          name=n_name,
                          species_str=species_str)

    return q



def make_create_type_of_edge_query(file_name, edge_type,
                           source_label="", target_label="",
                           source_name="line.source_name", target_name="line.target_name",
                          ):
    """Build the Cypher ``LOAD CSV`` statement that creates one edge per TSV line.

    For every line, the source and target nodes are matched on their ``name``
    property (optionally restricted to ``source_label`` / ``target_label``) and a
    relationship of type ``edge_type`` is created from source to target.
    ``source_name`` and ``target_name`` are the Cypher expressions compared
    against ``name``.
    """
    # An empty label means "match any node"; otherwise prefix it with ':'.
    source_pattern = source_label if source_label == "" else ':' + source_label
    target_pattern = target_label if target_label == "" else ':' + target_label

    template = '''USING PERIODIC COMMIT 500
           LOAD CSV WITH HEADERS FROM  'file:///{file_name}' AS line FIELDTERMINATOR '\t'
           
           MATCH (source{source_label} {{ name:{source_name}}}),
                 (target{target_label} {{ name:{target_name}}})
           
           CREATE (source)-[:{edge_type}]->(target)'''

    return template.format(
        file_name=file_name,
        edge_type=edge_type,
        source_label=source_pattern,
        target_label=target_pattern,
        source_name=source_name,
        target_name=target_name,
    )

In [None]:
# Write the de-duplicated clusters to the import directory so that LOAD CSV can read them.
label = "FunctionalCluster"
f = f"{label}-components.tsv"
functional_clusters_import.to_csv(f"../data/import/{f}", sep="\t", index=None)


# For every line of the file, fetch the Family node with the same name, together with all of its labels.
q = '''LOAD CSV WITH HEADERS FROM 'file:///{file_name}' AS line  FIELDTERMINATOR '\t'
       MATCH (parent:Family {{ name:line.family }}) RETURN parent.name AS name, labels(parent)   AS labels
    '''.format(file_name=f)
print(q)
qr = graph.run(q)

# Reduce each label list to its first non-generic label, then attach that label to every
# cluster via its family name. Clusters whose family was not matched get NaN in 'labels'.
df = pd.DataFrame(qr.data())
df['labels'] = df['labels'].apply(clean_labels)
df = df.drop_duplicates(keep='first')
df = functional_clusters_import.join(df.set_index('name'), on='family')

df.head()

In [None]:
df['labels'].value_counts()

In [None]:
# Create the cluster nodes, one import file and one LOAD CSV query per family label.
# Rows without a label (NaN in 'labels') are not part of any group and are skipped.
label = "FunctionalCluster"
print(label, "\t", df.shape[0])

for t, subdf in df.groupby('labels'):
    import_file = f"{label}-{t}-components.tsv"
    subdf.to_csv(f"../data/import/{import_file}", sep="\t", index=None)

    query = functional_cluster_query(import_file, [label, t], n_name="line.functional_cluster_name")
    qr = graph.run(query)
    print(t, "\t", subdf.shape[0], qr.stats()['nodes_created'])

In [None]:
functional_clusters_import

In [None]:
# Family -> FunctionalCluster edges: (Family)-[:TAKES_PART]->(FunctionalCluster)
label = "FunctionalCluster"
edge_type = 'TAKES_PART'
want_cols = ['functional_cluster_name', 'family']

# Only the two columns referenced by the query are exported.
f = f'{edge_type}-{label}-edges.tsv'  
functional_clusters_import[want_cols].to_csv(f"../data/import/{f}", index=None, sep="\t")

query = make_create_type_of_edge_query(f, edge_type, 
                       source_label='Family', target_label=label,
                       source_name="line.family", target_name="line.functional_cluster_name",
                      )
print(query)
qr = graph.run(query)

# One edge is expected per line of the exported file, i.e. per row of
# functional_clusters_import (not functional_clusters_df, which can hold several
# rows for the same cluster name).
expected_edges = functional_clusters_import.shape[0]
created_edges = qr.stats()['relationships_created']
print(label, expected_edges, created_edges)
if not expected_edges == created_edges:
    print("\tnot all edges created")

## Replace ids in edges table

In [None]:
# Rebuild the cluster table from the TSV and derive a lookup
# (node_name, level, species) -> {'functional_cluster_name': ...}.
functional_clusters_df = pd.read_csv("FunctionalClusters_first-definition.tsv", sep="\t").fillna('')
functional_clusters_df["functional_cluster_name"] = [
    f"{node_name}[{members}]"
    for node_name, members in zip(functional_clusters_df['node_name'], functional_clusters_df['members'])
]

translate_cols = ['node_name', 'level', 'species', 'functional_cluster_name']
functional_clusters_df = functional_clusters_df.drop_duplicates(keep='first', subset=translate_cols)

functional_clusters_translate = (
    functional_clusters_df[translate_cols]
    .set_index(['node_name', 'level', 'species'])
    .to_dict('index')
)
functional_clusters_translate.keys()

In [None]:
def get_new_id(x):
    """Translate a reaction participant id into its functional-cluster name.

    Parameters
    ----------
    x : row-like object whose ``.values`` unpack to exactly four items, in this order:
        species, participant id, level, label.

    Returns
    -------
    For labels in ``helpers.plant_node_labels``: the ``functional_cluster_name``
    stored in ``functional_clusters_translate`` for (id, level, species), falling
    back to (id, 'node', species). For every other label the id is returned unchanged.

    Raises
    ------
    KeyError
        If neither key is present. The participant is printed first so that the
        failing row can be identified (this print used to sit after a ``return``
        and could never run).
    """
    specie, id_, level_, label_ = x.values

    if label_ not in helpers.plant_node_labels:
        return id_

    try:
        return functional_clusters_translate[(id_, level_, specie)]['functional_cluster_name']
    except KeyError:
        pass

    # if reaction has same node on different levels, then one is removed
    try:
        return functional_clusters_translate[(id_, 'node', specie)]['functional_cluster_name']
    except KeyError:
        print(id_, level_, label_)
        raise



# Add a '<prefix>_newID' column for every participant slot of the reactions table.
for prefix in ['input1', 'input2', 'input3', 'output1']:
    print(prefix)
    lookup_cols = ['species', f'{prefix}_ID', f'{prefix}_level', f'{prefix}_label']
    df_edges[f'{prefix}_newID'] = df_edges[lookup_cols].apply(get_new_id, axis=1, result_type='expand')


In [None]:
df_edges.head()

In [None]:
# True when every reaction row carries a distinct reaction_id.
df_edges['reaction_id'].is_unique

In [None]:
df_edges.to_csv(output_path / "edges-sheet.tsv", sep="\t")

In [None]:
df.to_csv(output_path / "functional_clusters.tsv", sep="\t", index=False)

In [None]:
# One complex id per line, in the order in which get_label collected them.
with open(output_path / "complexes_to_add.tsv", "w") as out:
    out.writelines(f"{complex_id}\n" for complex_id in complexes_to_add)

# END