# Import neo4j DB: 3/4

Code to translate the PIS-model reaction sheets (`v2.7.5_PIS-model.xlsx` and the companion workbooks listed under "Read in sheets") into the neo4j database. 

## Setup

In [1]:
import pandas as pd
import re
import numpy as np
import os
from IPython.display import Image

In [2]:
import helpers

In [3]:
from importlib import reload

Connect to graph via docker-compose link. See http://localhost:7474/browser/

In [4]:
from py2neo import Graph, Node, Relationship

In [5]:
graph = Graph(host="neo4j")

In [6]:
# Node labels known to this notebook. Each entry is used verbatim as a label
# in a `MATCH (n:<label>)` query when the existing node names are fetched
# into `node_dict`.
node_labels = [
    'PlantCoding',
    'PlantNonCoding',
    'PlantAbstract',
    'Complex',
    'ExternalOrganism', 
    'ExternalCoding',
    'ExternalNonCoding',
    'Process', 
    'MetaboliteFamily',
    'Metabolite',
    'Gene', 
    'Expression'
]

In [7]:
# Relationship type names for reactions.
# NOTE(review): not referenced anywhere in the visible part of this notebook;
# presumably consumed by a later import step -- confirm before removing.
reaction_relationships = [
    'ACTIVATE',
    'SUBSTRATE',
    'INHIBIT',
    'PRODUCT',
    'TRANSLOCATE_FROM',
    'TRANSLOCATE_TO'
]

## Read in sheets

In [8]:
input_path = os.path.join("..", "data", "raw")

In [9]:
# Workbooks to import, as (file, sheet_name) pairs; files are resolved
# relative to `input_path`.
sheets = [  # (file, sheet_name)
    ("v2.7.5_PIS-model.xlsx", "Reactions"),
    ("v2.7.5_PIS-model.xlsx", "Reactions_New"), 
    ("Model_CK.xlsx", "Reactions_new"), 
    ("v2.7.2_PIS-model-JALR.xlsx", "Reactions_New")
]

In [10]:
# resave xlsx as tsv

# Columns removed from every sheet when present (any "Unnamed*" columns are
# dropped as well by the export loop).
drops = ['Status', 'FOXMES', 'Legacy:Process', 'Legacy:ReactionMode']#, 'ConnID']

# Sheet header -> TSV column name. The sheets repeat the headers
# ID / level / localisation / type once per reaction participant; the
# ".1", ".2", ".3" suffixes are the de-duplicated names those repeats get when
# the sheet is loaded, mapped here to input1..input3 and output1.
# Every column that survives the drop step must have an entry here, because
# the export loop looks each one up with `col_rename[x]`.
col_rename = {
    'AddedBy':'AddedBy',
    'Species':'Species',
    'ID':'input1_ID',
    'level':'input1_level',
    'localisation':'input1_localisation',
    'type':'input1_type',
    'ID.1':'input2_ID',
    'level.1':'input2_level',
    'localisation.1':'input2_localisation',
    'type.1':'input2_type',
    'ID.2':'input3_ID',
    'level.2':'input3_level',
    'localisation.2':'input3_localisation',
    'type.2':'input3_type',
    'ReactionEffect':'ReactionEffect',
    'ReactionMode':'ReactionMode',
    'Modifications':'Modifications',
    'ID.3':'output1_ID',
    'level.3':'output1_level',
    'localisation.3':'output1_localisation',
    'type.3':'output1_type',
    'TrustLevel':'TrustLevel',
    'Literature':'Literature',
    'AdditionalInfo':'AdditionalInfo',
    'Comment':'Comment',
    'Model-v':'ModelV',
    'KINETICS':'kinetics', 
    'ConnID': 'ConnID'
}

# Export every (workbook, sheet) pair to "<workbook stem>-<sheet>.tsv" in
# `input_path`, keeping only usable rows and renaming columns via `col_rename`.
for file_name, sheet_name in sheets:

    print(file_name, sheet_name)

    file_path = os.path.join(input_path, file_name)

    base_name, _ = os.path.splitext(file_name)
    new_file_path = os.path.join(input_path, f'{base_name}-{sheet_name}.tsv')

    # An existing TSV is treated as a cache: delete it to force a re-export.
    if os.path.exists(new_file_path):
        continue

    df = pd.read_excel(file_path,
                       sheet_name=sheet_name,
                       header=[1],
                       dtype=str,
                       na_values=helpers.empty_strings)

    # Rows without an author are not reactions (blank / separator rows).
    df = df[~df["AddedBy"].isna()]

    # When a sheet tracks a status, keep only the accepted ones (or no status).
    if 'Status' in df.columns:
        df = df[df['Status'].isin(["forCB", "forCB_INVENTED", np.nan])]

    to_drop = list(set(drops) & set(df.columns)) + list(df.filter(regex=("Unnamed.*")).columns)
    # Non-inplace drop returns a fresh frame, so the column assignments below
    # never write into a filtered slice of the sheet.
    df = df.drop(columns=to_drop)

    # Fail with a readable message when a sheet has a header we do not know.
    unmapped = [column for column in df.columns if column not in col_rename]
    if unmapped:
        raise KeyError(f"{file_name} / {sheet_name}: no entry in col_rename for columns {unmapped}")
    df.columns = [col_rename[column] for column in df.columns]

    # Remember which workbook/sheet each reaction came from.
    df['origin'] = f'{base_name}-{sheet_name}'

    df.to_csv(new_file_path, sep="\t", index=False)

v2.7.5_PIS-model.xlsx Reactions
v2.7.5_PIS-model.xlsx Reactions_New
Model_CK.xlsx Reactions_new
v2.7.2_PIS-model-JALR.xlsx Reactions_New


In [11]:
# Load the TSV exported for each (workbook, sheet) pair, in the order of `sheets`.
dfs = []

for file_name, sheet_name in sheets:
    print(file_name, sheet_name)

    base_name, extension = os.path.splitext(file_name)
    file_path = os.path.join(input_path, base_name + '-' + sheet_name + '.tsv')

    df = pd.read_csv(file_path, sep="\t")
    dfs += [df]

v2.7.5_PIS-model.xlsx Reactions
v2.7.5_PIS-model.xlsx Reactions_New
Model_CK.xlsx Reactions_new
v2.7.2_PIS-model-JALR.xlsx Reactions_New


In [12]:
# Stack all sheets into one edge table with a fresh 0..n-1 index.
df_edges = pd.concat(dfs, sort=False).reset_index(drop=True)

In [13]:
df_edges.head()

Unnamed: 0,AddedBy,ConnID,Species,input1_ID,input1_level,input1_localisation,input1_type,input2_ID,input2_level,input2_localisation,...,output1_level,output1_localisation,output1_type,TrustLevel,Literature,AdditionalInfo,Comment,ModelV,kinetics,origin
0,KG,Conn001,ath,L-Met,family,ER,metabolite,SAMS,clade,ER,...,family,ER,metabolite,"[R1] targetted experiments (e.g. Y2H, BIFC)",AraCyc:ETHYL-PWY,SAMS catalyse L-Met to SAMe reaction.,,v1.0,,v2.7.5_PIS-model-Reactions
1,KG,Conn002,ath,SAMe,family,ER,metabolite,ACS,family,ER,...,family,ER,metabolite,"[R1] targetted experiments (e.g. Y2H, BIFC)",AraCyc:ETHYL-PWY,ACS catalyse ACC to SAMe reaction.,,v1.0,,v2.7.5_PIS-model-Reactions
2,KG,Conn003,ath,ACC,family,ER,metabolite,ACO,family,ER,...,family,ER,metabolite,"[R1] targetted experiments (e.g. Y2H, BIFC)",AraCyc:ETHYL-PWY,ACO catalyse ACC to ET reaction.,,v1.0,,v2.7.5_PIS-model-Reactions
3,KG,Conn004,ath,Cu2+,family,cytoplasm,metabolite,HMA,family,ER,...,family,ER,metabolite,"[R1] targetted experiments (e.g. Y2H, BIFC)",DOI:10.1105/tpc.001768 (Ethylene Biosynthesis ...,Copper gets transported from the cytoplasm to ...,,v1.0,,v2.7.5_PIS-model-Reactions
4,KG,Conn005,ath,ETR,family,ER,protein,Cu2+,family,ER,...,family,ER,protein [active],"[R1] targetted experiments (e.g. Y2H, BIFC)",DOI:10.1105/tpc.001768 (Ethylene Biosynthesis ...,Copper activates the membrane bound ethylene r...,,v1.0,,v2.7.5_PIS-model-Reactions


In [14]:
# Remove rows whose AddedBy cell is the literal 'x'.
x = df_edges.index[(df_edges['AddedBy'] == 'x').to_numpy()]
print(x)
df_edges.drop(index=x, inplace=True)

Int64Index([], dtype='int64')


In [15]:
df_edges["TrustLevel"].unique()

array(['[R1] targetted experiments (e.g. Y2H, BIFC)',
       '[Ry] invented reaction', '[Rx] incomplete/unspecific reaction',
       '[R2] high-throughput experiment (e.g. ChIP-seq)',
       '[R3] in-silico prediction', '[R4] indirect reaction'],
      dtype=object)

In [60]:
# Short trust code taken from the bracketed prefix of TrustLevel, e.g.
# "[R1] targetted experiments ..." -> "R1". The character class is [1-4xy];
# the previous "[1|2|3|4|x|y]" also accepted a literal "|" after the R.
df_edges['trust_level'] = df_edges["TrustLevel"].apply(lambda x: re.search(r"(R[1-4xy]|undefined)", x).groups()[0])
# Species handling is delegated to helpers (lower_string / rest_of_items).
df_edges['observed_species'] = df_edges['Species'].apply(helpers.lower_string)
df_edges['species_also_observed_in'] = df_edges["Species"].apply(helpers.rest_of_items)
# Free-text columns: use empty strings instead of NaN.
df_edges['Comment'] = df_edges['Comment'].fillna("")
df_edges['AdditionalInfo'] = df_edges['AdditionalInfo'].fillna("")

In [61]:
# Normalise author initials to upper case, then list the distinct authors.
df_edges['AddedBy'] = [initials.upper() for initials in df_edges['AddedBy']]
df_edges["AddedBy"].unique()

array(['KG', 'MZ', 'ZR', 'MPE', 'ACR', 'MAK', 'ST', 'SB', 'AG', 'JALR'],
      dtype=object)

In [20]:
# Reactions without a model version get the placeholder 'vNA'.
df_edges['ModelV'] = df_edges['ModelV'].fillna('vNA')
df_edges['ModelV'].unique()

array(['v1.0', 'v2.5', 'v2.7', 'v2.6', 'vNA'], dtype=object)

In [21]:
def get_non_ascii(x):
    """Report whether ``str(x)`` contains any non-ASCII character.

    For each offending character a diagnostic line is printed (the character
    between pipes, its code point and its encoded bytes); if there was at
    least one, the full text is printed afterwards.

    Returns:
        True when at least one non-ASCII character was found, else False.
    """
    text = str(x)
    offenders = [ch for ch in text if not ch.isascii()]

    for ch in offenders:
        print(f"|{ch}|", ord(ch), ch.encode())

    if not offenders:
        return False

    print(text)
    return True

# UTF-8 byte sequences that are replaced by an ASCII stand-in.
ascii_replacers = {
    b'\xc2\xa0' : b" ",           # funky WIN whitespace
    b'\xe2\x80\xa6': b'...',      # …
    b'\xe2\x80\x8b' : b'',        # code point 8203 (U+200B, zero-width space)
    b'\xe2\x80\x93' : b'-',       # –
    b'\xce\xb1' : b"alpha",       # α
    b'\xc3\x9f' : b"beta",        # ß (treated as a beta look-alike)
    b'\xce\xb2' : b"beta",        # β
    b'\xe2\x80\x98' : b"prime",   # ‘
    b'\xe2\x80\x99' : b"prime",   # ’
    b'\xc2\xb4': b'prime',        # ´
    # Sorry accents :(
    b'\xc5\xa0' : b"S",           # Š
    b'\xc5\xa1' : b's',           # š
    b'\xc5\xbd' : b'Z',           # Ž
    b'\xc4\x8d' : b'c'            # č
}


def replacer(x):
    """Replace the byte sequences listed in ``ascii_replacers`` inside ``x``.

    Args:
        x: a cell value; normally a string, possibly a missing value.

    Returns:
        The text with every listed sequence substituted. Missing values
        (``None`` or any float NaN) and the strings ``''`` / ``' '`` give ``''``.
    """
    # Detect NaN by value: a list-membership test against np.nan only matches
    # that exact object, so a different NaN object would reach .encode below
    # and raise AttributeError.
    if x is None or (isinstance(x, float) and x != x):
        return ''
    if x in (' ', ''):
        return ''

    # Work on the encoded bytes so the table can be keyed by raw UTF-8 sequences.
    encoded = x.encode('utf-8')
    for old, new in ascii_replacers.items():
        encoded = encoded.replace(old, new)
    return encoded.decode('utf-8')

In [22]:
# Scan every column and remember the ones holding non-ASCII text.
bad_cols = []
for c in df_edges.columns:
    print(c, "\n-------------")
    flagged = df_edges[c].apply(get_non_ascii)
    if any(flagged):
        bad_cols += [c]
    print()

AddedBy 
-------------
|Š| 352 b'\xc5\xa0'
ŠT

ConnID 
-------------

Species 
-------------

input1_ID 
-------------
|´| 180 b'\xc2\xb4'
9-cis-b-apo-10´-carotenal

input1_level 
-------------

input1_localisation 
-------------

input1_type 
-------------

input2_ID 
-------------

input2_level 
-------------

input2_localisation 
-------------

input2_type 
-------------

input3_ID 
-------------

input3_level 
-------------

input3_localisation 
-------------

input3_type 
-------------

ReactionEffect 
-------------

ReactionMode 
-------------

Modifications 
-------------

output1_ID 
-------------
|´| 180 b'\xc2\xb4'
9-cis-b-apo-10´-carotenal

output1_level 
-------------

output1_localisation 
-------------

output1_type 
-------------

TrustLevel 
-------------

Literature 
-------------
|…| 8230 b'\xe2\x80\xa6'
|​| 8203 b'\xe2\x80\x8b'
|​| 8203 b'\xe2\x80\x8b'
|​| 8203 b'\xe2\x80\x8b'
|​| 8203 b'\xe2\x80\x8b'
|…| 8230 b'\xe2\x80\xa6'
DOI:10.1073/pnas.0605528103 (ETHYLENE-INS

In [23]:
# Clean each flagged column value-by-value with `replacer`.
for c in bad_cols:
    cleaned = df_edges[c].apply(replacer)
    df_edges[c] = cleaned

In [24]:
def doi_list(x):
    """Extract all DOIs mentioned in a literature string.

    A DOI is recognised as the text following ``doi:`` or ``doi/`` (case
    insensitive, optional whitespace after the separator) up to the next
    whitespace or the end of the string. Trailing dots are stripped.

    Args:
        x: the literature string.

    Returns:
        List of lower-cased ``"doi:<identifier>"`` strings, empty if none found.
    """
    # re.findall always returns a list, so no None check is needed.
    matches = re.findall(r"doi[:/]\s*(.+?)(?:\s|$)", x.lower())
    return ["doi:" + m.rstrip('.') for m in matches]

def pubmed_list(x):
    """Extract all PubMed ids written as ``pmid:<id>`` in a literature string.

    Matching is case insensitive; the id is the text after ``pmid:`` (optional
    whitespace allowed) up to the next whitespace or the end of the string.
    Trailing dots are stripped.

    Args:
        x: the literature string.

    Returns:
        List of lower-cased ``"pmid:<id>"`` strings, empty if none found.
    """
    # re.findall always returns a list, so no None check is needed.
    matches = re.findall(r"pmid:\s*(.+?)(?:\s|$)", x.lower())
    return ["pmid:" + m.rstrip('.') for m in matches]

In [25]:
# format literature sources
# Assign the filled column back instead of calling fillna(inplace=True) on a
# column selection, which is not guaranteed to modify df_edges itself.
df_edges["Literature"] = df_edges["Literature"].fillna("")
for i, row in df_edges.iterrows():
    s = row['Literature']
    # DOIs and "pmid:" ids are collected from the whole cell up front.
    source = doi_list(s)
    source += pubmed_list(s)
    # The remaining reference kinds are parsed per "|"-separated entry.
    for z in s.split("|"):
        key = z.lower()
        if ":" in key:
            if "aracyc" in key:
                aracyc_string = "aracyc:" + z.split(":")[1].strip()
                source.append(aracyc_string)
            elif "kegg" in key:
                kegg_string = "kegg:" + z.split(":")[1].strip()
                source.append(kegg_string)
            elif ("doi" in key) or ("pmid" in key):
                # already fetched
                continue
            elif "pubmed" in key:
                # "pubmed:<id>" entries are not matched by pubmed_list, so they
                # have to be recorded here (previously built but never stored).
                pmid_string = "pubmed:" + z.split(":")[1].strip()
                source.append(pmid_string)
        elif "invented" in key:
            source.append("invented")
        else:
            print("no/bad reference", row['ConnID'], row['origin'], z)
            source.append("other:" + z.strip())
    if len(source) > 0:
        df_edges.loc[i, "literature_sources"] = helpers.list_to_string(source)
    else:
        print(row['ConnID'], row['origin'], z)


no/bad reference Conn040 v2.7.5_PIS-model-Reactions 
no/bad reference Conn118 v2.7.5_PIS-model-Reactions 
no/bad reference Conn120 v2.7.5_PIS-model-Reactions 
no/bad reference Conn122 v2.7.5_PIS-model-Reactions 
no/bad reference Conn182 v2.7.5_PIS-model-Reactions 
no/bad reference Conn183 v2.7.5_PIS-model-Reactions 
no/bad reference Conn199 v2.7.5_PIS-model-Reactions ? Kg need to find reference
no/bad reference Conn273 v2.7.5_PIS-model-Reactions_New KEGG 
no/bad reference Conn274 v2.7.5_PIS-model-Reactions_New KEGG 
no/bad reference Conn275 v2.7.5_PIS-model-Reactions_New KEGG 
no/bad reference Conn276 v2.7.5_PIS-model-Reactions_New KEGG 
no/bad reference Conn276 v2.7.5_PIS-model-Reactions_New  10.1073/pnas.98.4.2065
no/bad reference Conn277 v2.7.5_PIS-model-Reactions_New KEGG 
no/bad reference Conn278 v2.7.5_PIS-model-Reactions_New KEGG 
no/bad reference Conn279 v2.7.5_PIS-model-Reactions_New KEGG 
no/bad reference Conn280 v2.7.5_PIS-model-Reactions_New KEGG 
no/bad reference Conn281 v

In [26]:
df_edges[["origin", "ConnID", "Literature", "literature_sources"]].to_csv("lit-check.tsv", sep="\t", index=None)

In [27]:
df_edges.reset_index(inplace=True, drop=True)

In [28]:
save_df = df_edges.copy()
#df_edges = save_df.copy()

In [29]:
df_edges.head()

Unnamed: 0,AddedBy,ConnID,Species,input1_ID,input1_level,input1_localisation,input1_type,input2_ID,input2_level,input2_localisation,...,TrustLevel,Literature,AdditionalInfo,Comment,ModelV,kinetics,origin,trust_level,also_observed_in,literature_sources
0,KG,Conn001,ath,L-Met,family,ER,metabolite,SAMS,clade,ER,...,"[R1] targetted experiments (e.g. Y2H, BIFC)",AraCyc:ETHYL-PWY,SAMS catalyse L-Met to SAMe reaction.,,v1.0,,v2.7.5_PIS-model-Reactions,R1,,aracyc:ETHYL-PWY
1,KG,Conn002,ath,SAMe,family,ER,metabolite,ACS,family,ER,...,"[R1] targetted experiments (e.g. Y2H, BIFC)",AraCyc:ETHYL-PWY,ACS catalyse ACC to SAMe reaction.,,v1.0,,v2.7.5_PIS-model-Reactions,R1,,aracyc:ETHYL-PWY
2,KG,Conn003,ath,ACC,family,ER,metabolite,ACO,family,ER,...,"[R1] targetted experiments (e.g. Y2H, BIFC)",AraCyc:ETHYL-PWY,ACO catalyse ACC to ET reaction.,,v1.0,,v2.7.5_PIS-model-Reactions,R1,,aracyc:ETHYL-PWY
3,KG,Conn004,ath,Cu2+,family,cytoplasm,metabolite,HMA,family,ER,...,"[R1] targetted experiments (e.g. Y2H, BIFC)",DOI:10.1105/tpc.001768 (Ethylene Biosynthesis ...,Copper gets transported from the cytoplasm to ...,,v1.0,,v2.7.5_PIS-model-Reactions,R1,,doi:10.1105/tpc.001768
4,KG,Conn005,ath,ETR,family,ER,protein,Cu2+,family,ER,...,"[R1] targetted experiments (e.g. Y2H, BIFC)",DOI:10.1105/tpc.001768 (Ethylene Biosynthesis ...,Copper activates the membrane bound ethylene r...,,v1.0,,v2.7.5_PIS-model-Reactions,R1,,doi:10.1105/tpc.001768


In [30]:
# Pass every participant id column through helpers.reorder_ids.
for x in ('input1', 'input2', 'input3', 'output1'):
    id_column = f"{x}_ID"
    reordered = df_edges[id_column].apply(helpers.reorder_ids)
    df_edges.loc[:, id_column] = reordered

## Protein, Clade, Family keys

In [31]:
# label -> set of node names currently stored in the graph under that label
node_dict = {}
for label in node_labels:
    q = f'MATCH (n:{label}) RETURN DISTINCT n.name'
    s = {record['n.name'] for record in graph.run(q).data()}
    print(label, len(s))
    node_dict[label] = s

PlantCoding 161
PlantNonCoding 9
PlantAbstract 7
Complex 1
ExternalOrganism 3
ExternalCoding 14
ExternalNonCoding 0
Process 6
MetaboliteFamily 11
Metabolite 102
Gene 0
Expression 0


In [32]:
all_species = ['ath', 'stu', 'osa']

In [33]:
# Table of biological elements, one row per node.
file_name = os.path.join(input_path, "bio_elements.tsv")
df_bioelements = pd.read_csv(filepath_or_buffer=file_name, sep="\t")

In [34]:
df_bioelements.head()

Unnamed: 0,AddedBy,Species,NodeType,Family,Clade,NodeID,NodeName,ModelStatus,NodeDescription,AdditionalInfo,...,GMM_OCD1,GMM_OCD,GMM_Description,GMM_ShortName,GMM_Synonyms,NodeLabel,observed_species,also_observed_in,chebi_identifier,pubmed_identifier
0,KG,ath,plant_coding,SAM,SAMS,AT1G02500,SAM1,use,SAM synthetase [EC:2.5.1.6],,...,OCD_all_000621,http://www.gomapman.org/ortholog/OCD_all/OCD_a...,S-adenosylmethionine synthetase 1,SAM1,"AtSAM1,MAT1,METK1,SAM-1,SAM1",PlantCoding,ath,,,
1,KG,ath,plant_coding,SAM,SAMS,AT4G01850,SAM2,use,SAM synthetase [EC:2.5.1.6],,...,OCD_all_000621,http://www.gomapman.org/ortholog/OCD_all/OCD_a...,S-adenosylmethionine synthetase 2,SAM2,"AtSAM2,MAT2,SAM-2,SAM2",PlantCoding,ath,,,
2,KG,ath,plant_coding,SAM,SAMS,AT2G36880,SAM3,use,SAM synthetase [EC:2.5.1.6],,...,OCD_all_000621,http://www.gomapman.org/ortholog/OCD_all/OCD_a...,methionine adenosyltransferase 3,METK3,"MAT3,METK3",PlantCoding,ath,,,
3,KG,ath,plant_coding,SAM,SAMS,AT3G17390,SAM4,use,SAM synthetase [EC:2.5.1.6],,...,OCD_all_000621,http://www.gomapman.org/ortholog/OCD_all/OCD_a...,S-adenosylmethionine synthetase family protein,METK4,"MAT4,METK4,MTO3,SAMS3",PlantCoding,ath,,,
4,KG,ath,plant_coding,ACS,"ACS1,2,6",AT3G61510,ACS1,use,ACC synthase [EC:4.4.1.14],,...,OCD_all_000133,http://www.gomapman.org/ortholog/OCD_all/OCD_a...,ACC synthase 1,ACS1,"ACC2,ACS1,AT-ACS1",PlantCoding,ath,,,


In [35]:
def pick_the_set(x):
    """Return the first ``set`` found in ``x``, or an empty set if there is none.

    Used as a groupby aggregator over columns where at most some of the
    grouped values are sets and the rest are placeholders.

    Args:
        x: an iterable of values.

    Returns:
        The first element that is a ``set``; ``set()`` when no element is one
        (previously an empty dict, which is not the same type as the hit case).
    """
    for v in x:
        if isinstance(v, set):
            return v
    return set()


def get_species_homologues(level):
    """Aggregate `df_bioelements` to one row per distinct value of `level`.

    Args:
        level: name of the `df_bioelements` column to group by (called with
            'NodeName', 'Family' and 'Clade' in this notebook).

    Returns:
        A frame with `level` as a regular column, the aggregated descriptive
        columns (AddedBy, NodeLabel, NodeDescription, AdditionalInfo, Process,
        ModelV, Species) and one `<species>_homologues` column per entry of
        `all_species` holding the NodeIDs of that species for the level value.
    """
    # NodeID sets per (level value, species) pair.
    df_level_species  = df_bioelements.groupby([level, 'Species']).agg({
      'NodeID':lambda x: set(x), 
    })
    # Move 'Species' out of the index so it can be filtered on as a column.
    df_level_species.reset_index(1, inplace=True)

    cols = []
    for specie in all_species:
        col = specie + '_homologues'
        cols.append(col)
        # Fill the species' own column only on the rows of that species;
        # every other row of the column stays empty.
        df_level_species.loc[df_level_species['Species'] == specie, col] =\
         df_level_species[df_level_species['Species'] == specie]['NodeID']
    
    # Collapse the per-species rows into one row per level value, keeping
    # whichever entry of each column is a set (see pick_the_set).
    df_level_species = df_level_species[cols].groupby(level).agg({
        y:pick_the_set for y in cols
    })
    
    # Descriptive columns: first value, or a combination of all values.
    df_level = df_bioelements.groupby(level).agg({
          'AddedBy':lambda x:list(x)[0], 
          'NodeLabel':lambda x:list(x)[0], 
          'NodeDescription':lambda x: ', '.join(list(set(x))), 
          'AdditionalInfo':lambda x: helpers.list_to_string(x), 
          'Process':lambda x:list(x)[0], 
          'ModelV':helpers.get_latest_model, 
          'Species':lambda x: set(x), 
    })    

    df_level = df_level.join(df_level_species[cols])
    df_level.reset_index(inplace=True)
    
    return df_level

In [36]:
# Node names that occur on more than one row of the bio-elements table.
id_to_name = df_bioelements[['NodeID', 'NodeName']]
id_to_name.loc[id_to_name['NodeName'].duplicated(), 'NodeName'].unique()

array(['CPS', 'GA20ox3', 'GA20ox1', 'GA20ox4', 'MYB33'], dtype=object)

In [37]:
# One aggregated frame per level, indexed by the level's own name.
df_nodes = get_species_homologues('NodeName').set_index('NodeName')
df_families = get_species_homologues("Family").set_index("Family")
df_clades = get_species_homologues('Clade').set_index('Clade')

# "<species>_homologues" -> {node / clade / family name -> node IDs}
node_ids_key = {}
clade_ids_key = {}
family_ids_key = {}
for species in all_species:
    col = f"{species}_homologues"
    node_ids_key[col] = df_nodes[col].to_dict()
    clade_ids_key[col] = df_clades[col].to_dict()
    family_ids_key[col] = df_families[col].to_dict()

# Direct name -> family translations taken from the bio-elements table.
node_to_family = df_bioelements.set_index("NodeName")["Family"].to_dict()
clade_to_family = df_bioelements.set_index("Clade")["Family"].to_dict()

In [38]:
def convert_node_to_family(x):
    """Resolve one reaction participant to a (family-level id, node label) pair.

    Args:
        x: a row holding exactly five values, in this order: participant id,
            participant type, participant level, ConnID, origin. The last two
            are only used in the printed diagnostics.

    Returns:
        ``(np.nan, np.nan)`` when the id is one of ``helpers.empty_strings``;
        otherwise ``(family_id, new_label)``, where either element may be
        ``None`` if it could not be resolved.

    Side effects:
        Appends to ``complexes_to_add``, updates ``missing_in_components`` and
        ``replace_w_family`` (all module-level), and prints one diagnostic
        line (tagged a-e) for every participant that could not be resolved.
    """
    # pathogen proteins are listed as proteins, so cannot use dict
    id_, type_, level_, ConnID, origin = x.values
      
    if id_ in helpers.empty_strings:
        return np.nan, np.nan

    new_label = None
    family_id = None
    
    ########################
    # Simple Cases
    ########################
    # Complexes keep their own id; unknown ones are queued for creation.
    if type_ in ['complex', 'complex [active]', 'complex [activated]', 'complex [inactive]', 'plant_complex']:
        if not (id_ in node_dict["Complex"]):
            complexes_to_add.append(id_)
        new_label = 'Complex'
        family_id = id_
    
    # Metabolites keep their own id; the label is whichever metabolite label
    # already lists the id ("Metabolite" is checked first).
    elif type_ in ["metabolite"]:
        for label in ["Metabolite", "MetaboliteFamily"]:
            if id_ in node_dict[label]:
                new_label = label
                break
        if not new_label:
            missing_in_components.update([id_])        
            print(origin, "|", ConnID, "|", id_, "|", type_, "|", level_, " |a (label) not a listed metabolite")
        family_id = id_

    # Processes are only accepted when already listed in the graph.
    elif type_ in ['process']:
        if (id_ in node_dict["Process"]):
            family_id = id_
            new_label = "Process"
        else:
            print(origin, "|", ConnID, "|", id_, "|", type_, "|", level_,  " |b (label) process not a listed process")
        
    else:
        ########################
        # family ID
        ########################
        # Translate clade / node ids to their family; ids missing from the
        # translation tables are checked against the external labels below.
        check_external = False
        if level_ == "family":    
            family_id = id_
        elif level_ in ["clade", "clade/orthologue"]:
            try:
                family_id = clade_to_family[id_]
                replace_w_family.update([id_])
            except KeyError:
                check_external = True
        elif level_ == "node":
            try:
                family_id = node_to_family[id_]
                replace_w_family.update([id_])
            except KeyError:
                check_external = True
        
        if check_external:
            for label in ["ExternalOrganism", "ExternalCoding", "ExternalNonCoding"]:
                if id_ in node_dict[label]:
                    new_label = label
                    family_id = id_
                    break
                    
        if not family_id:
            print(origin, "|", ConnID, "|", id_, "|", type_, "|", level_,  " |c (family id) could not convert to family/external")
                
        ########################
        # Label
        ########################
        # Only reached with a family id that has no label yet: the label is
        # accepted when exactly one known label lists the family id.
        if (family_id) and (not new_label):
            id_labels = [] #looping just in case an id occurs mutiple times
            for label in node_labels:
                if family_id in node_dict[label]:
                    id_labels.append(label)

            if len(id_labels) == 1:
                new_label = id_labels[0]
            elif len(id_labels) > 1:
                print(origin, "|", ConnID, "|", id_, "|", type_, "|", level_,  " |d (label) many labels fit")
                missing_in_components.update([id_])       

            else:
                print(origin, "|", ConnID, "|", id_, "|", type_, "|", level_,  " |e (label) could not find label")
                missing_in_components.update([id_])        

    return family_id, new_label
        

In [40]:
# Collectors filled as a side effect of convert_node_to_family.
complexes_to_add = []
missing_in_components = set()
replace_w_family = set()

# Add "<prefix>_newID" / "<prefix>_label" columns for every participant slot.
for prefix in ['input1', 'input2', 'input3', 'output1']:
    id_col = prefix + '_ID'
    type_col = prefix + '_type'
    level_col = prefix + '_level'
    new_id = prefix + '_newID'
    new_label_col = prefix + '_label'

    participant = df_edges[[id_col, type_col, level_col, 'ConnID', 'origin']]
    df_edges[[new_id, new_label_col]] = participant.apply(
        convert_node_to_family, axis=1, result_type='expand')


Model_CK-Reactions_new | nan | DHZRMP | metabolite | node  |a (label) not a listed metabolite
Model_CK-Reactions_new | nan | DHZRMP | metabolite | node  |a (label) not a listed metabolite
Model_CK-Reactions_new | nan | AHK2,3,4(a) | metabolite | clade  |a (label) not a listed metabolite
Model_CK-Reactions_new | nan | AHK2,3,4(a) | metabolite | clade  |a (label) not a listed metabolite
Model_CK-Reactions_new | nan | AHK2,3,4(a) | metabolite | clade  |a (label) not a listed metabolite
Model_CK-Reactions_new | nan | AHK2,3,4(a) | protein | clade  |c (family id) could not convert to family/external
Model_CK-Reactions_new | nan | ARR-B(a)(p) | protein | clade  |c (family id) could not convert to family/external
v2.7.2_PIS-model-JALR-Reactions_New | Conn273 | All-trans-b-carotene | metabolite | family  |a (label) not a listed metabolite
v2.7.2_PIS-model-JALR-Reactions_New | Conn274 | 9-cis-b-carotene | metabolite | node  |a (label) not a listed metabolite
v2.7.2_PIS-model-JALR-Reactions_New 

In [41]:
df_edges.head()

Unnamed: 0,AddedBy,ConnID,Species,input1_ID,input1_level,input1_localisation,input1_type,input2_ID,input2_level,input2_localisation,...,also_observed_in,literature_sources,input1_newID,input1_label,input2_newID,input2_label,input3_newID,input3_label,output1_newID,output1_label
0,KG,Conn001,ath,L-Met,family,ER,metabolite,SAMS,clade,ER,...,,aracyc:ETHYL-PWY,L-Met,Metabolite,SAM,PlantCoding,,,SAMe,Metabolite
1,KG,Conn002,ath,SAMe,family,ER,metabolite,ACS,family,ER,...,,aracyc:ETHYL-PWY,SAMe,Metabolite,ACS,PlantCoding,,,ACC,Metabolite
2,KG,Conn003,ath,ACC,family,ER,metabolite,ACO,family,ER,...,,aracyc:ETHYL-PWY,ACC,Metabolite,ACO,PlantCoding,,,ET,Metabolite
3,KG,Conn004,ath,Cu2+,family,cytoplasm,metabolite,HMA,family,ER,...,,doi:10.1105/tpc.001768,Cu2+,Metabolite,HMA,PlantCoding,,,Cu2+,Metabolite
4,KG,Conn005,ath,ETR,family,ER,protein,Cu2+,family,ER,...,,doi:10.1105/tpc.001768,ETR,PlantCoding,Cu2+,Metabolite,,,ETR,PlantCoding


In [42]:
def get_orthologues(x, prefix=""):
    """Collect the per-species homologue ids for one reaction participant.

    Args:
        x: a row holding exactly three values, in order: participant id,
            participant level, participant label.
        prefix: participant slot name, prepended to every returned key.

    Returns:
        Dict with one ``"<prefix>_<species>_homologues"`` key per entry of
        ``all_species``; each value is the homologue collection passed through
        ``helpers.list_to_string``. Participants whose label is not one of the
        plant labels, or whose level is not node / clade / family, get the
        result for an empty collection under every key.
    """
    id_, level_, label_ = x.values

    homologues = {f"{prefix}_{specie}_homologues": "" for specie in all_species}

    if label_ in ('PlantCoding', 'PlantNonCoding', 'PlantAbstract'):
        # Lookup table matching the participant's level.
        tables_by_level = {
            'node': node_ids_key,
            'clade': clade_ids_key,
            'family': family_ids_key,
        }
        if level_ in tables_by_level:
            table = tables_by_level[level_]
            for specie in all_species:
                column = f"{specie}_homologues"
                homologues[f"{prefix}_{column}"] = table[column][id_]

    return {key: helpers.list_to_string(list(value)) for key, value in homologues.items()}

In [43]:
# One homologue frame per participant slot; they are combined column-wise later.
new_dfs = []
for prefix in ['input1', 'input2', 'input3', 'output1']:
    print(prefix)
    id_col = prefix + '_ID'
    level_col = prefix + '_level'
    new_label_col = prefix + '_label'

    participant = df_edges[[id_col, level_col, new_label_col]]
    new_df = participant.apply(get_orthologues, axis=1, result_type='expand', prefix=prefix)
    new_dfs += [new_df]

input1
input2
input3
output1


In [44]:
homologues_df = pd.concat(new_dfs, sort=False, axis=1)

In [45]:
homologues_df.loc[0]

input1_ath_homologues                                            
input1_osa_homologues                                            
input1_stu_homologues                                            
input2_ath_homologues     AT4G01850,AT3G17390,AT1G02500,AT2G36880
input2_osa_homologues                                            
input2_stu_homologues                                            
input3_ath_homologues                                            
input3_osa_homologues                                            
input3_stu_homologues                                            
output1_ath_homologues                                           
output1_osa_homologues                                           
output1_stu_homologues                                           
Name: 0, dtype: object

In [46]:
df_edges = df_edges.join(homologues_df, sort=False)

In [47]:
df_edges.head()

Unnamed: 0,AddedBy,ConnID,Species,input1_ID,input1_level,input1_localisation,input1_type,input2_ID,input2_level,input2_localisation,...,input1_stu_homologues,input2_ath_homologues,input2_osa_homologues,input2_stu_homologues,input3_ath_homologues,input3_osa_homologues,input3_stu_homologues,output1_ath_homologues,output1_osa_homologues,output1_stu_homologues
0,KG,Conn001,ath,L-Met,family,ER,metabolite,SAMS,clade,ER,...,,"AT4G01850,AT3G17390,AT1G02500,AT2G36880",,,,,,,,
1,KG,Conn002,ath,SAMe,family,ER,metabolite,ACS,family,ER,...,,"AT3G61510,AT4G08040,AT2G22810,AT1G62960,AT5G51...",,,,,,,,
2,KG,Conn003,ath,ACC,family,ER,metabolite,ACO,family,ER,...,,"AT2G19590,AT1G05010,AT1G77330,AT1G62380,AT1G12010",,,,,,,,
3,KG,Conn004,ath,Cu2+,family,cytoplasm,metabolite,HMA,family,ER,...,,"AT1G63440,AT4G33520,AT5G44790,AT5G21930",,,,,,,,
4,KG,Conn005,ath,ETR,family,ER,protein,Cu2+,family,ER,...,,,,,,,,"AT1G66340,AT3G04580,AT1G04310,AT2G40940,AT3G23150",,


In [48]:
# Raw "<slot>_type" value -> node form stored on the edge.
# NOTE(review): 'complex [activated]' and 'complex [inactive]' are accepted as
# complex types by convert_node_to_family but have no entry here, so such a
# row would raise KeyError below -- confirm the intended forms and add them.
node_type_to_node_form_dict = {
    "gene":"gene",
    
    "protein":"protein",
    "protein [inactivated]":"protein", 
    "protein [activated]":"protein_active",
    'protein [active]': "protein_active",
    
    "ncRNA":"ncRNA",
    "plant_ncRNA":"ncRNA",
    'ta-siRNA':"ta-siRNA", 
        
    "complex":"complex", 
    "plant_complex":"complex",
    'complex [active]': "complex_active",
    
    "metabolite":"metabolite",
    
    "process":"process", 
    'process [active]':"process_active",

    np.nan:"", 
    "plant_coding":"unknown"
}


def _node_form(node_type):
    """Look up the node form for a raw type value; unknown types raise KeyError."""
    # Detect NaN by value: a dict lookup only finds the np.nan key when handed
    # that exact object, so a NaN created elsewhere (e.g. float('nan')) would
    # otherwise raise KeyError.
    if isinstance(node_type, float) and np.isnan(node_type):
        return node_type_to_node_form_dict[np.nan]
    return node_type_to_node_form_dict[node_type]


# Add a "<slot>_form" column for every participant slot.
for prefix in ['input1', 'input2', 'input3', 'output1']:
    id_col, type_col, new_form_col  =\
        [prefix + x for x in ('_ID',  '_type',  '_form')]

    print(prefix)
    df_edges[new_form_col] = df_edges[type_col].apply(_node_form)

input1
input2
input3
output1


In [49]:
df_edges.head()

Unnamed: 0,AddedBy,ConnID,Species,input1_ID,input1_level,input1_localisation,input1_type,input2_ID,input2_level,input2_localisation,...,input3_ath_homologues,input3_osa_homologues,input3_stu_homologues,output1_ath_homologues,output1_osa_homologues,output1_stu_homologues,input1_form,input2_form,input3_form,output1_form
0,KG,Conn001,ath,L-Met,family,ER,metabolite,SAMS,clade,ER,...,,,,,,,metabolite,protein_active,,metabolite
1,KG,Conn002,ath,SAMe,family,ER,metabolite,ACS,family,ER,...,,,,,,,metabolite,protein_active,,metabolite
2,KG,Conn003,ath,ACC,family,ER,metabolite,ACO,family,ER,...,,,,,,,metabolite,protein_active,,metabolite
3,KG,Conn004,ath,Cu2+,family,cytoplasm,metabolite,HMA,family,ER,...,,,,,,,metabolite,protein,,metabolite
4,KG,Conn005,ath,ETR,family,ER,protein,Cu2+,family,ER,...,,,,"AT1G66340,AT3G04580,AT1G04310,AT2G40940,AT3G23150",,,protein,metabolite,,protein_active


In [50]:
# Lower-cased spellings seen in the sheets -> canonical localisation name.
node_localisation_dict = {
    'nuc': 'nucleus',
    'er': 'endoplasmic reticulum',
    'golgi': 'golgi apparatus',
    'mitochondria?': 'putative:mitochondrion',
}

# Canonical localisation names, plus a "putative:" variant of each.
good_localisations = {
    'nucleus',
    'nucleolus',
    'cytoplasm',
    'vacuole',
    'endoplasmic reticulum',
    'chloroplast',
    'mitochondrion',
    'golgi apparatus',
    'peroxisome',
    'apoplast',
    'extracellular',
}
good_localisations |= {'putative:' + s for s in good_localisations}


def node_localisation_std(x):
    """Standardise a raw localisation cell.

    The value is lower-cased, translated through ``node_localisation_dict``
    when listed there, and returned only if the result is one of
    ``good_localisations``.

    Returns:
        The canonical localisation, or ``""`` for non-string input and for
        any value that is not recognised.
    """
    if type(x) is not str:
        return ""

    lowered = x.lower()
    canonical = node_localisation_dict.get(lowered, lowered)
    return canonical if canonical in good_localisations else ""

# Raw localisation values seen in the sheets / values left after standardising.
node_localisations = set()
new_localisation = set()
for prefix in ['input1', 'input2', 'input3', 'output1']:
    id_col = prefix + '_ID'
    type_col = prefix + '_type'
    localisation_col = prefix + '_localisation'

    # Rows where the slot has an id and/or a localisation.
    x = df_edges[['ConnID', 'origin', id_col, localisation_col]].dropna(
        how='all', subset=[id_col, localisation_col])

    # Report participants that have an id but no localisation.
    for _, y in x.iterrows():
        if y[id_col] in helpers.empty_strings:
            continue
        if y[localisation_col] in helpers.empty_strings:
            print(y['origin'], y['ConnID'], prefix, y[id_col], y[localisation_col])

    node_localisations.update(x[localisation_col])

    standardised = df_edges[localisation_col].apply(node_localisation_std)
    df_edges[localisation_col] = standardised

    new_localisation.update(df_edges[localisation_col])

v2.7.5_PIS-model-Reactions Conn024 input1 PR3 nan
v2.7.5_PIS-model-Reactions Conn025 input1 PR4 nan
v2.7.5_PIS-model-Reactions Conn066 input1 CLH nan
v2.7.5_PIS-model-Reactions Conn067 input1 JR1 nan
v2.7.5_PIS-model-Reactions Conn068 input1 PR13 nan
v2.7.5_PIS-model-Reactions Conn069 input1 VSP nan
v2.7.5_PIS-model-Reactions Conn080 input1 Anthocyanin-accumulation nan
v2.7.5_PIS-model-Reactions Conn080 input1 Trichome-initiation nan
v2.7.5_PIS-model-Reactions Conn104 input1 PR1 nan
v2.7.5_PIS-model-Reactions Conn105 input1 PR2 nan
v2.7.5_PIS-model-Reactions Conn106 input1 PR5 nan
v2.7.5_PIS-model-Reactions Conn126 input1 WRKY53 nan
v2.7.5_PIS-model-Reactions Conn172 input1 R-gene nan
v2.7.5_PIS-model-Reactions_New Conn273 input1 Geranylgeranyl-PP  nan
v2.7.5_PIS-model-Reactions_New Conn274 input1 ent-Copalyl-PP nan
v2.7.5_PIS-model-Reactions_New Conn275 input1 ent-Kaurene nan
v2.7.5_PIS-model-Reactions_New Conn276 input1 ent-Kaurenoic acid nan
v2.7.5_PIS-model-Reactions_New Conn277 in

v2.7.5_PIS-model-Reactions_New nan input2 DELLA nan
v2.7.5_PIS-model-Reactions_New nan input2 DELLA nan
v2.7.5_PIS-model-Reactions_New nan input2 SCF nan
v2.7.5_PIS-model-Reactions_New nan input2 SCF nan
v2.7.5_PIS-model-Reactions_New nan input2 SCF-SLY1 nan
v2.7.5_PIS-model-Reactions_New nan input2 SCF-GID2 nan
v2.7.5_PIS-model-Reactions_New Conn299 input2 GAMT1,2 nan
v2.7.5_PIS-model-Reactions_New Conn300 input2 GAMT1,2 nan
v2.7.5_PIS-model-Reactions_New Conn301 input2 GAMT1,2 nan
v2.7.5_PIS-model-Reactions_New Conn302 input2 GAMYB nan
v2.7.5_PIS-model-Reactions_New Conn303 input2 GAMYB nan
v2.7.5_PIS-model-Reactions_New Conn304 input2 GAMYB nan
v2.7.5_PIS-model-Reactions_New Conn305 input2 MYB33 nan
v2.7.5_PIS-model-Reactions_New Conn306 input2 MYB65 nan
v2.7.5_PIS-model-Reactions_New Conn307 input2 MYB33 nan
v2.7.5_PIS-model-Reactions_New Conn308 input2 miR6022 nan
v2.7.5_PIS-model-Reactions_New Conn309 input2 NPR1 nan
v2.7.5_PIS-model-Reactions_New Conn310 input2 LSD1 nan
v2.7.5_P

In [51]:
node_localisations

{'ER',
 'Golgi',
 'chloroplast',
 'cytoplasm',
 'extracellular',
 'mitochondria?',
 nan,
 'nuc',
 'nucleus',
 'peroxisome',
 'vacuole'}

In [52]:
new_localisation

{'',
 'chloroplast',
 'cytoplasm',
 'endoplasmic reticulum',
 'extracellular',
 'golgi apparatus',
 'nucleus',
 'peroxisome',
 'putative:mitochondrion',
 'vacuole'}

In [53]:
len(df_edges['ConnID'].unique()) == df_edges.shape[0]

False

In [54]:
df_edges['RxID'] = df_edges.index.map(lambda x: "RxID-" + str(x))

In [55]:
len(df_edges['RxID'].unique()) == df_edges.shape[0]

True

In [62]:
df_edges.to_csv(os.path.join("..", "data", "raw", "edges-sheet.tsv"), sep="\t")

In [57]:
homologue_cols = [f"{x}_homologues" for x in all_species]

In [58]:
all_species

['ath', 'stu', 'osa']

In [59]:
# Write the complexes missing from the graph, one id per line.
with open(os.path.join("..", "data", "raw", "complexes_to_add.tsv"), "w") as out:
    out.writelines(f"{complex_id}\n" for complex_id in complexes_to_add)

# END