# Import neo4j DB: 1/4

Code to translate v2.7.4_PIS-model.xlsx to neo4j database. 

## Setup

In [1]:
import pandas as pd
import re
import numpy as np
import os
from IPython.display import Image

In [2]:
import helpers

In [3]:
from importlib import reload

Connect to graph via docker-compose link. See http://localhost:7474/browser/

In [4]:
node_labels = [
    'PlantCoding',
    'PlantNonCoding',
    'PlantAbstract',
    'Complex',
    'ExternalOrganism', 
    'ExternalCoding',
    'ExternalNonCoding',
    'Process', 
    'MetaboliteFamily',
    'Metabolite',
    'GeneExpression', 
    'PseudoNode'
]

## Read in sheets

In [5]:
input_path = os.path.join("..", "data", "raw")

### Components sheet

In [6]:
sheets = [#(file, sheet_name)]
    ("v2.7.5_PIS-model.xlsx", "Components"),
    ("v2.7.5_PIS-model.xlsx", "Components_New"), 
    ("Model_CK.xlsx", "Components_new"), 
    ("v2.7.2_PIS-model-JALR.xlsx", "Components_New")
]

In [7]:
# resave xlsx as tsv
drops = ['Legacy:Process', 'mID', 'Notes', 'Unnamed: 21']
col_rename = {
    'mID':'mID', 
    'Notes':'Notes', 
    'AddedBy':'AddedBy', 
    'Species':'Species', 
    'NodeType':'NodeType', 
    'Family':'Family', 
    'Clade':'Clade', 
    'NodeID':'NodeID', 
    'NodeName':'NodeName', 
    'ModelStatus':'ModelStatus', 
    'NodeDescription':'NodeDescription', 
    'AdditionalInfo':'AdditionalInfo', 
    'ExtDBlink':'ExtDBlink', 
    'Process':'Process', 
    'ExternalDB':'ExternalDB', 
    'ModelV':'ModelV', 
    'GMM_OCD1':'GMM_OCD1', 
    'GMM_OCD':'GMM_OCD', 
    'GMM:Description':'GMM_Description', 
    'GMM:ShortName':'GMM_ShortName', 
    'GMM:Synonyms':'GMM_Synonyms', 

    'Node':'NodeName'
}

for file_name, sheet_name in sheets:
    
    print(file_name, sheet_name)
    
    file_path = os.path.join(input_path, file_name)
    
    base_name, extension = os.path.splitext(file_name)
    new_file_path = os.path.join(input_path, f'{base_name}-{sheet_name}.tsv')
    
    if os.path.exists(new_file_path):
        continue
        
    df = pd.read_excel(file_path, 
                    sheet_name=sheet_name, 
                    header=[1], 
                    dtype=str, 
                    na_values=helpers.empty_strings)
    df = df[~df["AddedBy"].isna()]
    
    to_drop = list(set(drops) & set(df.columns)) + list(df.filter(regex=("Unnamed.*")).columns)
    df.drop(to_drop, axis=1, inplace=True)
    
    new_cols = [col_rename[x] for x in df.columns]
    df.columns = new_cols
    
    df.to_csv(new_file_path, sep="\t", index=None)

v2.7.5_PIS-model.xlsx Components
v2.7.5_PIS-model.xlsx Components_New
Model_CK.xlsx Components_new
v2.7.2_PIS-model-JALR.xlsx Components_New


In [8]:
dfs = []

for file_name, sheet_name in sheets:
    
    print(file_name, sheet_name)
    base_name, extension = os.path.splitext(file_name)

    file_path = os.path.join(input_path, f'{base_name}-{sheet_name}.tsv')
    
    df = pd.read_csv(file_path, sep="\t")
    
    dfs.append(df)
    

v2.7.5_PIS-model.xlsx Components
v2.7.5_PIS-model.xlsx Components_New
Model_CK.xlsx Components_new
v2.7.2_PIS-model-JALR.xlsx Components_New


In [9]:
df_components = pd.concat(dfs, sort=False)
df_components.reset_index(drop=True, inplace=True)

In [10]:
pd.value_counts(df_components['NodeType'])

plant_coding       756
metabolite         104
plant coding        69
pathogen_coding     14
plant_abstract      12
plant_ncRNA          8
process              6
plant_noncoding      4
x                    2
plant_complex        1
Name: NodeType, dtype: int64

In [11]:
df_components[df_components['NodeType']=='x']

Unnamed: 0,AddedBy,Species,NodeType,Family,Clade,NodeID,NodeName,ModelStatus,NodeDescription,AdditionalInfo,ExtDBlink,Process,ExternalDB,ModelV,GMM_OCD1,GMM_OCD,GMM_Description,GMM_ShortName,GMM_Synonyms
854,x,x,x,x,x,x,x,x,x,x,x,,x,x,x,x,x,x,x
976,x,x,x,x,x,x,x,x,x,x,x,,x,x,x,x,x,x,x


In [12]:
x = df_components[df_components['NodeType']=='x'].index; x

Int64Index([854, 976], dtype='int64')

In [13]:
df_components.drop(x, inplace=True)

In [14]:
df_components.loc[df_components["NodeName"].isna(), 'NodeName']

830    NaN
831    NaN
Name: NodeName, dtype: object

In [15]:
df_components.loc[df_components["NodeName"].isna(), 'NodeName'] =  df_components.loc[df_components["NodeName"].isna(), 'NodeID']

In [16]:
components_node_type_to_node_label = {
    "plant coding":"PlantCoding",
    "plant_coding":"PlantCoding",
    "plant_noncoding":"PlantNonCoding",
    "plant_ncRNA":"PlantNonCoding",

    "plant_complex":"Complex", 

    "metabolite":"Metabolite",

    "pathogen_coding":"ExternalCoding",
    "pathogen_noncoding":"ExternalNonCoding",
    
    "plant_abstract":"PlantAbstract",
    
    "process":"Process", 

    np.nan:"Undefined"
}

In [17]:
# update node labels
df_components["NodeLabel"] = df_components["NodeType"].apply(lambda x: components_node_type_to_node_label[x])
pd.value_counts(df_components['NodeLabel'])

PlantCoding       825
Metabolite        104
ExternalCoding     14
PlantNonCoding     12
PlantAbstract      12
Process             6
Undefined           1
Complex             1
Name: NodeLabel, dtype: int64

In [18]:
df_components[df_components["NodeLabel"] == "Undefined"]

Unnamed: 0,AddedBy,Species,NodeType,Family,Clade,NodeID,NodeName,ModelStatus,NodeDescription,AdditionalInfo,ExtDBlink,Process,ExternalDB,ModelV,GMM_OCD1,GMM_OCD,GMM_Description,GMM_ShortName,GMM_Synonyms,NodeLabel
975,JALR,osa,,Class I Clp ATPase,Class I Clp ATPase,Os11g0104300,D53,use,D53,D53 binds to the complex S-D14-MAX2 to be degr...,,,GMM-OCDs,vNA,,,,,,Undefined


In [19]:
def only_asci(x):
    return "".join([character for character in x if character.isascii()])

In [20]:
df_components['Species'] = df_components["Species"].apply(lambda x: x.lower().strip())
df_components['observed_species'] = df_components["Species"].apply(helpers.get_second_item)
df_components['also_observed_in'] = df_components["Species"].apply(helpers.rest_of_items)


df_components["GMM_Synonyms"] = df_components["GMM_Synonyms"].apply(helpers.string_to_nice_string)
df_components['chebi_identifier'] = df_components[df_components['ExternalDB'] == "ChEBI"]["ExtDBlink"]
df_components['pubmed_identifier'] = df_components[df_components['ExternalDB'] == "PubChem"]["ExtDBlink"]
df_components['AdditionalInfo'].fillna('', inplace=True)

In [21]:
df_components.loc[df_components['ModelV'].isna(), 'ModelV'] = 'vNA'

In [22]:
df_components.loc[df_components['AddedBy'].isna(), 'AddedBy'] = 'NAN'

In [23]:
df_components['AddedBy'] = df_components['AddedBy'].apply(lambda x: x.upper())

In [24]:
df_components.loc[df_components['AddedBy']=='ZR/MZ', 'AddedBy'] = 'MZ' 

In [25]:
df_components['AddedBy'].unique()

array(['KG', 'ZR', 'MZ', 'MAK', 'ŠT', 'AG', 'JALR'], dtype=object)

In [26]:
def only_asci(x):
    return "".join([character for character in x if character.isascii()])

def get_non_ascii(x):
    x = str(x)
    has_nonascii = False
    for character in x:
        if not character.isascii():
            has_nonascii = True
            print(character, ord(character), character.encode())
    if has_nonascii:
        print(x)
        return True
    else:
        return False

ascii_replacers = {
    b'\xc2\xa0' : b" ",           # funky WIN whitespace
    b'\xe2\x80\xa6': b'...',      # …
    b'\xe2\x80\x8b' : b'',        # have no figging clue
    b'\xe2\x80\x93' : b'-',       # –
    b'\xce\xb1' : b"alpha",       # α
    b'\xc3\x9f' : b"beta",        # ß
    b'\xce\xb2' : b"beta",        # β
    b'\xe2\x80\x98' : b"prime",   # ‘
    b'\xe2\x80\x99' : b"prime",   # ’
    b'\xc2\xb4': b'prime',        # ´
    # Sorry accents :(
    b'\xc5\xa0' : b"S",           # Š
    b'\xc5\xa1' : b's',           # š
    b'\xc5\xbd' : b'Z',           # Ž
    b'\xc4\x8d' : b'c'             # č
}


def replacer(x):
    x = x.encode('utf-8')
    for old, new in ascii_replacers.items():
        x = x.replace(old, new)
    return x.decode('utf-8')


In [27]:
bad_cols = []
for c in df_components.columns:
    print(c, "\n-------------")
    if any(df_components[c].apply(get_non_ascii)):
        bad_cols.append(c)
    print()

AddedBy 
-------------
Š 352 b'\xc5\xa0'
ŠT

Species 
-------------

NodeType 
-------------

Family 
-------------

Clade 
-------------

NodeID 
-------------

NodeName 
-------------

ModelStatus 
-------------

NodeDescription 
-------------
ß 223 b'\xc3\x9f'
12-hydroxyjasmonic acid 12-O-ß-D-glucoside
  160 b'\xc2\xa0'
gibberellin 3-oxidase 
‘ 8216 b'\xe2\x80\x98'
N6-(dimethyallyl)adenosine 5‘-monophosphate
‘ 8216 b'\xe2\x80\x98'
N6-(dimethyallyl)adenosine 5‘-diphosphate
‘ 8216 b'\xe2\x80\x98'
N6-(dimethyallyl)adenosine 5‘-triphosphate
α 945 b'\xce\xb1'
7-(α-D-glucosyl)-N6-isopentenyladenine 
α 945 b'\xce\xb1'
9-(α-D-glucosyl)-N6-isopentenyladenine 
β 946 b'\xce\xb2'
O-β-D-glucosyl-trans-zeatin 
α 945 b'\xce\xb1'
9-(α-D-glucosyl)-trans-zeatin
α 945 b'\xce\xb1'
7-(α-D-glucosyl)dihydrozeatin 
α 945 b'\xce\xb1'
9-(α-D-glucosyl)dihydrozeatin 
β 946 b'\xce\xb2'
O-β-D-glucosyl-cis-zeatin 
β 946 b'\xce\xb2'
7-(β-D-glucosyl)-cis-zeatin 
β 946 b'\xce\xb2'
9-(β-D-glucosyl)-cis-zeatin 
α 945 

In [28]:
for c in bad_cols:
    df_components[c] = df_components[c].apply(replacer)

In [29]:
from IPython.display import display

In [30]:
for label, subdf in df_components.groupby('NodeLabel'):
    dups =  subdf[subdf.duplicated(['NodeName'], keep=False)]
    if dups.shape[0] > 0:
        print(label)
        display(dups.sort_values('NodeName'))

Metabolite


Unnamed: 0,AddedBy,Species,NodeType,Family,Clade,NodeID,NodeName,ModelStatus,NodeDescription,AdditionalInfo,...,GMM_OCD1,GMM_OCD,GMM_Description,GMM_ShortName,GMM_Synonyms,NodeLabel,observed_species,also_observed_in,chebi_identifier,pubmed_identifier
529,MZ,plant_all,metabolite,PostROS,H2O,H2O,H2O,ignore,water,,...,,,,,,Metabolite,plant_all,,,
939,AG,all,metabolite,H2O,H2O,H2O,H2O,use,water,,...,,,,,,Metabolite,all,,,
523,MZ,plant_all,metabolite,PreROS,O2,O2,O2,use,ROS precursors; oxygen,,...,,,,,,Metabolite,plant_all,,CHEBI:25805,
940,AG,all,metabolite,O2,O2,O2,O2,use,dioxygen,,...,,,,,,Metabolite,all,,,


PlantCoding


Unnamed: 0,AddedBy,Species,NodeType,Family,Clade,NodeID,NodeName,ModelStatus,NodeDescription,AdditionalInfo,...,GMM_OCD1,GMM_OCD,GMM_Description,GMM_ShortName,GMM_Synonyms,NodeLabel,observed_species,also_observed_in,chebi_identifier,pubmed_identifier
803,MAK,ath,plant_coding,CPS,CPS,AT4G02780,CPS,use,ent-copalyl diphosphate synthase,also known as GA REQUIRING 1 (GA1),...,OCD_all_001702,http://www.gomapman.org/ortholog/OCD_all/OCD_a...,Terpenoid cyclases/Protein prenyltransferases ...,GA1,"ABC33,ATCPS1,CPS,CPS1,GA1,TPSGA1",PlantCoding,ath,,,
826,MAK,stu,plant_coding,CPS,CPS,Sotub06g034690.1.1,CPS,use,ent-copalyl diphosphate synthase,also known as GA REQUIRING 1 (GA1),...,,,,,,PlantCoding,stu,,,
808,MAK,ath,plant_coding,GA20ox,GA20ox,AT4G25420,GA20ox1,use,gibberellin 20-oxidase,,...,OCD_all_000842,http://www.gomapman.org/ortholog/OCD_all/OCD_a...,2-oxoglutarate (2OG) and Fe(II)-dependent oxyg...,GA20OX1,"20ox1,At2301,GA20OX1,GA5",PlantCoding,ath,,,
828,MAK,stu,plant_coding,GA20ox,GA20ox,Sotub03g007160.1.1,GA20ox1,use,gibberellin 20-oxidase,,...,,,,,,PlantCoding,stu,,,
810,MAK,ath,plant_coding,GA20ox,GA20ox,AT5G07200,GA20ox3,use,gibberellin 20-oxidase,,...,OCD_all_023006,http://www.gomapman.org/ortholog/OCD_all/OCD_a...,gibberellin 20-oxidase 3,GA20OX3,"20ox3,ATGA20OX3,GA20OX3,YAP169",PlantCoding,ath,,,
827,MAK,stu,plant_coding,GA20ox,GA20ox,Sotub11g029030.1.1,GA20ox3,use,gibberellin 20-oxidase,,...,,,,,,PlantCoding,stu,,,
811,MAK,ath,plant_coding,GA20ox,GA20ox,AT1G60980,GA20ox4,use,gibberellin 20-oxidase,,...,OCD_all_023006,http://www.gomapman.org/ortholog/OCD_all/OCD_a...,gibberellin 20-oxidase 4,GA20OX4,"ATGA20OX4,GA20OX4",PlantCoding,ath,,,
829,MAK,stu,plant_coding,GA20ox,GA20ox,Sotub01g031210.1.1,GA20ox4,use,gibberellin 20-oxidase,,...,,,,,,PlantCoding,stu,,,
840,MAK,ath,plant_coding,MYB,GAMYB,AT5G06100,MYB33,use,gibberellin induced TF,,...,OCD_all_003344,http://www.gomapman.org/ortholog/OCD_all/OCD_a...,myb domain protein 33,MYB33,"ATMYB33,MYB33",PlantCoding,ath,,,
846,MAK,stu,plant_coding,MYB,GAMYB,Sotub06g030530.1.1,MYB33,use,gibberellin induced TF,,...,,,,,,PlantCoding,stu,,,


In [31]:
# remove duplicated metabolites
df_components.drop([939, 940], inplace=True)

In [32]:
path = os.path.join(input_path, 'parsed-components.tsv')
df_components.to_csv(path, sep="\t", index=None)
print(path)

../data/raw/parsed-components.tsv


In [33]:
!head $path

AddedBy	Species	NodeType	Family	Clade	NodeID	NodeName	ModelStatus	NodeDescription	AdditionalInfo	ExtDBlink	Process	ExternalDB	ModelV	GMM_OCD1	GMM_OCD	GMM_Description	GMM_ShortName	GMM_Synonyms	NodeLabel	observed_species	also_observed_in	chebi_identifier	pubmed_identifier
KG	ath	plant_coding	SAM	SAMS	AT1G02500	SAM1	use	SAM synthetase [EC:2.5.1.6]		OCD_all_000621	Hormone:ET	GMM-OCDs	v1.0	OCD_all_000621	http://www.gomapman.org/ortholog/OCD_all/OCD_all_000621	S-adenosylmethionine synthetase 1	SAM1	AtSAM1,MAT1,METK1,SAM-1,SAM1	PlantCoding	ath			
KG	ath	plant_coding	SAM	SAMS	AT4G01850	SAM2	use	SAM synthetase [EC:2.5.1.6]		OCD_all_000621	Hormone:ET	GMM-OCDs	v1.0	OCD_all_000621	http://www.gomapman.org/ortholog/OCD_all/OCD_all_000621	S-adenosylmethionine synthetase 2	SAM2	AtSAM2,MAT2,SAM-2,SAM2	PlantCoding	ath			
KG	ath	plant_coding	SAM	SAMS	AT2G36880	SAM3	use	SAM synthetase [EC:2.5.1.6]		OCD_all_000621	Hormone:ET	GMM-OCDs	v1.0	OCD_all_000621	http://www.gomapman.org/ortholog/OCD_all/OCD_all_

# END

## Components summary

In [89]:
q = '''MATCH (n) RETURN DISTINCT n.name AS name, n.level AS level'''
nodes = graph.run(q).data()
all_nodes_in_components = set([(d["name"], d["level"]) for d in nodes])

In [90]:
len(all_nodes_in_components)

245

In [92]:
sorted(df_components['NodeLabel'].unique())

['Complex',
 'ExternalCoding',
 'Metabolite',
 'PlantAbstract',
 'PlantCoding',
 'PlantNonCoding',
 'Process']

In [93]:
node_dict = {}
for label in node_labels:
    q = '''MATCH (n:%s) RETURN DISTINCT n.name'''%label
    s = set([d['n.name'] for d in graph.run(q).data()])
    print(label, len(s))
    node_dict[label] = s


PlantCoding 142
PlantNonCoding 9
PlantAbstract 7
Complex 1
ExternalOrganism 3
ExternalCoding 14
ExternalNonCoding 0
Process 6
MetaboliteFamily 4
Metabolite 59


### Reactions sheet

In [94]:
df_edges_orig = pd.read_excel(input_path, 
                         sheet_name="Reactions", 
                         header=[1], 
                         dtype=str, 
                         na_values=helpers.empty_strings)
df_edges_orig = df_edges_orig[df_edges_orig['Status'].isin(["forCB", "forCB_INVENTED"])]
df_edges_orig.drop(['Status', 'FOXMES', 'Legacy:Process', 'Legacy:ReactionMode'], axis=1, inplace=True)
df_edges_orig.columns = pd.Index(['AddedBy', 'ConnID', 'Species', 
          'input1_ID', 'input1_level', 'input1_localisation', 'input1_type', 
          'input2_ID', 'input2_level', 'input2_localisation', 'input2_type', 
          'input3_ID', 'input3_level', 'input3_localisation', 'input3_type', 
          'ReactionEffect', 'ReactionMode', 'Modifications',
          'output1_ID', 'output1_level', 'output1_localisation', 'output1_type', 
          'TrustLevel', 'Literature', 'AdditionalInfo', 'Comment', 'ModelV', 'kinetics'],
      dtype='object')

df_edges_new = pd.read_excel(input_path, 
                         sheet_name="Reactions_New", 
                         header=[1], 
                         dtype=str, 
                         na_values=helpers.empty_strings)
df_edges_new = df_edges_new[~df_edges_new['AddedBy'].isin(['-'])]
df_edges_new.drop(['Status'], axis=1, inplace=True)
df_edges_new.columns = pd.Index(['AddedBy', 'ConnID', 'Species', 
          'input1_ID', 'input1_level', 'input1_type', 
          'input2_ID', 'input2_level', 'input2_type', 
          'input3_ID', 'input3_level', 'input3_type',                                  
          'ReactionEffect', 'ReactionMode', 
          'output1_ID', 'output1_level', 'output1_type', 
          'TrustLevel', 'Literature', 'AdditionalInfo', 'Comment', 'ModelV'],
      dtype='object')


df_edges = pd.concat([df_edges_orig, df_edges_new], sort=False)

In [95]:
df_edges = df_edges[~df_edges["AddedBy"].isna()]

In [96]:
df_edges.tail()

Unnamed: 0,AddedBy,ConnID,Species,input1_ID,input1_level,input1_localisation,input1_type,input2_ID,input2_level,input2_localisation,...,output1_ID,output1_level,output1_localisation,output1_type,TrustLevel,Literature,AdditionalInfo,Comment,ModelV,kinetics
46,sb,Conn310,ath,CAT,family,,plant_coding,LSD1,node,,...,,,,,"[R1] targetted experiments (e.g. Y2H, BIFC)",DOI:10.1104/pp.113.225805,LSD1 interacted with all three catalases both ...,,vNA,
47,sb,Conn310,ath,SA,node,,metabolite,CAT2,node,,...,,,,,[R4] indirect reaction,DOI:10.1016/j.chom.2017.01.007,SA decreased CAT2 activity in a dose-dependent...,,vNA,
48,sb,Conn310,ath,CAT2,node,,plant_coding,ACX2,node,,...,,,,,"[R1] targetted experiments (e.g. Y2H, BIFC)",DOI:10.1016/j.chom.2017.01.007,CAT2 Promotes the Activityof ACX2/ACX3 ( (test...,,vNA,
49,sb,Conn310,ath,CAT2,node,,plant_coding,ACX3,node,,...,,,,,"[R1] targetted experiments (e.g. Y2H, BIFC)",DOI:10.1016/j.chom.2017.01.007,CAT2 Promotes the Activityof ACX2/ACX3 ( (test...,,vNA,
1737,x,x,x,x,x,,x,,x,,...,x,x,,x,x,x,x,x,x,


In [97]:
x = df_edges[df_edges['AddedBy']=='x'].index
print(x)
df_edges.drop(x, inplace=True)

Int64Index([1737], dtype='int64')


In [98]:
df_edges["TrustLevel"].unique()

array(['[R1] targetted experiments (e.g. Y2H, BIFC)',
       '[Ry] invented reaction', '[Rx] incomplete/unspecific reaction',
       '[R2] high-throughput experiment (e.g. ChIP-seq)',
       '[R3] in-silico prediction', '[R4] indirect reaction'],
      dtype=object)

In [99]:
df_edges['trust_level']  = df_edges["TrustLevel"].apply(lambda x: re.search( r"(R[1|2|3|4|x|y]|undefined)", x).groups()[0])
df_edges['observed_species'] = df_edges["Species"].apply(helpers.get_second_item)
df_edges['also_observed_in'] = df_edges["Species"].apply(helpers.rest_of_items)
df_edges['Comment'] = df_edges['Comment'].fillna("")
df_edges['AdditionalInfo'] = df_edges['AdditionalInfo'].fillna("")

In [100]:
df_edges['AddedBy'] = df_edges['AddedBy'].apply(lambda x: x.upper())
df_edges["AddedBy"].unique()

array(['KG', 'MZ', 'ZR', 'MPE', 'ACR', 'MAK', 'ŠT', 'SB'], dtype=object)

In [101]:
df_edges.loc[df_edges['ModelV'].isna(), 'ModelV'] = 'vNA'
df_edges['ModelV'].unique()

array(['v1.0', 'v2.5', 'v2.7', 'v2.6', 'vNA'], dtype=object)

In [102]:

def doi_list(x):
    x = only_asci(x.lower())
    match = re.findall("(?:doi)(?:\:|\/)\s*(.+?)(?:\s|$)", x)
    if not match is None:
        return ["doi:" + m.rstrip('.') for m in match]
    else:
        return []

def pubmed_list(x):
    x = only_asci(x.lower())
    match = re.findall("(?:pmid)\:\s*(.+?)(?:\s|$)", x)
    if not match is None:
        return ["pmid:" + m.rstrip('.') for m in match]
    else:
        return []

def list_to_string(x):
    return ",".join([str(i) for i in x])

In [103]:
# format literature sources
df_edges["Literature"].fillna(value="", inplace=True)
for i, row in df_edges.iterrows():
    s = row['Literature']
    source = doi_list(s)
    source += pubmed_list(s)
    for z in s.split("|"):
        key = z.lower()
        if ":" in key:
            if "aracyc" in key:
                aracyc_string = "aracyc:" + z.split(":")[1].strip()
                source.append(aracyc_string)
            elif "kegg" in key:
                kegg_string = "kegg:" + z.split(":")[1].strip()
                source.append(kegg_string)
            elif "doi" in key:
                # already fetched
                continue
            elif ("pubmed" in key) or ("pmid" in key):
                pmid_string = "pubmed:" + z.split(":")[1].strip()
        elif "invented" in key:
            source.append("invented")
        else:
            print("no/bad reference", row["ConnID"], z)
            source.append("other:" + only_asci(z.strip()))
    if len(source) > 0:
        df_edges.loc[i, "literature_sources"] = list_to_string(source)
    else:
        print(row["ConnID"], z)


no/bad reference Conn040 
no/bad reference Conn118 
no/bad reference Conn120 
no/bad reference Conn122 
no/bad reference Conn182 
no/bad reference Conn183 
no/bad reference Conn199 ? Kg need to find reference
no/bad reference Conn273 KEGG 
no/bad reference Conn274 KEGG 
no/bad reference Conn275 KEGG 
no/bad reference Conn276 KEGG 
no/bad reference Conn276  10.1073/pnas.98.4.2065
no/bad reference Conn277 KEGG 
no/bad reference Conn278 KEGG 
no/bad reference Conn279 KEGG 
no/bad reference Conn280 KEGG 
no/bad reference Conn281 KEGG 
no/bad reference Conn282 KEGG 
no/bad reference Conn283 KEGG 
no/bad reference Conn309 


In [104]:
df_edges[["ConnID", "Literature", "literature_sources"]].to_csv("lit-check.tsv", sep="\t", index=None)

In [105]:
df_edges.reset_index(inplace=True, drop=True)

In [106]:
save_df = df_edges.copy()
#df_edges = save_df.copy()

In [107]:
df_edges[df_edges['ConnID'].duplicated()]

Unnamed: 0,AddedBy,ConnID,Species,input1_ID,input1_level,input1_localisation,input1_type,input2_ID,input2_level,input2_localisation,...,TrustLevel,Literature,AdditionalInfo,Comment,ModelV,kinetics,trust_level,observed_species,also_observed_in,literature_sources
74,KG,Conn080,ath,Trichome-initiation,family,,process,potyvirus,family,,...,[Ry] invented reaction,"DOI:10.1105/tpc.111.083261 (Qi, 2011: The jasm...",,,v2.7,,Ry,ath,,doi:10.1105/tpc.111.083261
260,MAK,,ath,"GID1a,b,c",clade/orthologue,,plant_coding,GA3,family,,...,"[R1] targetted experiments (e.g. Y2H, BIFC)",DOI:10.1105/tpc.106.047415,,,vNA,,R1,ath,,doi:10.1105/tpc.106.047415
261,MAK,,ath,"GID1a,b,c",clade/orthologue,,plant_coding,GA4,family,,...,"[R1] targetted experiments (e.g. Y2H, BIFC)",DOI:10.1105/tpc.106.047415,,,vNA,,R1,ath,,doi:10.1105/tpc.106.047415
268,MAK,,ath,GA1-GID1,family,,plant_complex,DELLA,family,,...,"[R1] targetted experiments (e.g. Y2H, BIFC)",DOI:10.1104/pp.112.200956,,,vNA,,R1,ath,,doi:10.1104/pp.112.200956
269,MAK,,ath,GA3-GID1,family,,plant_complex,DELLA,family,,...,"[R1] targetted experiments (e.g. Y2H, BIFC)",DOI:10.1104/pp.112.200956,,,vNA,,R1,ath,,doi:10.1104/pp.112.200956
270,MAK,,ath,GA4-GID1,family,,plant_complex,DELLA,family,,...,"[R1] targetted experiments (e.g. Y2H, BIFC)",DOI:10.1104/pp.112.200956,,,vNA,,R1,ath,,doi:10.1104/pp.112.200956
271,MAK,,ath,SLY1,node,,plant_coding,SCF,family,,...,"[R1] targetted experiments (e.g. Y2H, BIFC)",DOI:10.1105/tpc.010827,,,vNA,,R1,ath,,doi:10.1105/tpc.010827
272,MAK,,osa,GID2,node,,plant_coding,SCF,family,,...,"[R1] targetted experiments (e.g. Y2H, BIFC)",DOI:10.1126/science.1081077,,,vNA,,R1,osa,,doi:10.1126/science.1081077
273,MAK,,ath,GA1-GID1-DELLA,family,,plant_complex,SCF-SLY1,family,,...,"[R1] targetted experiments (e.g. Y2H, BIFC)",DOI:10.1104/pp.112.200956 | DOI:10.1105/tpc.01...,,,vNA,,R1,ath,,"doi:10.1104/pp.112.200956,doi:10.1105/tpc.010827"
274,MAK,,osa,GA1-GID1-SLR1,family,,plant_complex,SCF-GID2,family,,...,"[R1] targetted experiments (e.g. Y2H, BIFC)",DOI:10.1104/pp.112.200956 | https://doi.org/10...,,,vNA,,R1,osa,,doi:10.1104/pp.112.200956


In [108]:
df_edges.head()

Unnamed: 0,AddedBy,ConnID,Species,input1_ID,input1_level,input1_localisation,input1_type,input2_ID,input2_level,input2_localisation,...,TrustLevel,Literature,AdditionalInfo,Comment,ModelV,kinetics,trust_level,observed_species,also_observed_in,literature_sources
0,KG,Conn001,ath,L-Met,family,ER,metabolite,SAMS,clade,ER,...,"[R1] targetted experiments (e.g. Y2H, BIFC)",AraCyc:ETHYL-PWY,SAMS catalyse L-Met to SAMe reaction.,,v1.0,,R1,ath,,"doi:10.1042/bj20120245,other:KEGG"
1,KG,Conn002,ath,SAMe,family,ER,metabolite,ACS,family,ER,...,"[R1] targetted experiments (e.g. Y2H, BIFC)",AraCyc:ETHYL-PWY,ACS catalyse ACC to SAMe reaction.,,v1.0,,R1,ath,,"doi:10.1042/bj20120245,other:KEGG"
2,KG,Conn003,ath,ACC,family,ER,metabolite,ACO,family,ER,...,"[R1] targetted experiments (e.g. Y2H, BIFC)",AraCyc:ETHYL-PWY,ACO catalyse ACC to ET reaction.,,v1.0,,R1,ath,,"doi:10.1042/bj20120245,other:KEGG"
3,KG,Conn004,ath,Cu2+,family,cytoplasm,metabolite,HMA,family,ER,...,"[R1] targetted experiments (e.g. Y2H, BIFC)",DOI:10.1105/tpc.001768 (Ethylene Biosynthesis ...,Copper gets transported from the cytoplasm to ...,,v1.0,,R1,ath,,"doi:10.1042/bj20120245,other:KEGG,other:10.107..."
4,KG,Conn005,ath,ETR,family,ER,protein,Cu2+,family,ER,...,"[R1] targetted experiments (e.g. Y2H, BIFC)",DOI:10.1105/tpc.001768 (Ethylene Biosynthesis ...,Copper activates the membrane bound ethylene r...,,v1.0,,R1,ath,,"doi:10.1042/bj20120245,other:KEGG"


In [109]:
for x in ['input1', 'input2', 'input3', 'output1']:
    df_edges.loc[:, x + "_ID"] = df_edges[x + "_ID"].apply(helpers.reorder_ids)

In [110]:
df_edges.head()

Unnamed: 0,AddedBy,ConnID,Species,input1_ID,input1_level,input1_localisation,input1_type,input2_ID,input2_level,input2_localisation,...,TrustLevel,Literature,AdditionalInfo,Comment,ModelV,kinetics,trust_level,observed_species,also_observed_in,literature_sources
0,KG,Conn001,ath,L-Met,family,ER,metabolite,SAMS,clade,ER,...,"[R1] targetted experiments (e.g. Y2H, BIFC)",AraCyc:ETHYL-PWY,SAMS catalyse L-Met to SAMe reaction.,,v1.0,,R1,ath,,"doi:10.1042/bj20120245,other:KEGG"
1,KG,Conn002,ath,SAMe,family,ER,metabolite,ACS,family,ER,...,"[R1] targetted experiments (e.g. Y2H, BIFC)",AraCyc:ETHYL-PWY,ACS catalyse ACC to SAMe reaction.,,v1.0,,R1,ath,,"doi:10.1042/bj20120245,other:KEGG"
2,KG,Conn003,ath,ACC,family,ER,metabolite,ACO,family,ER,...,"[R1] targetted experiments (e.g. Y2H, BIFC)",AraCyc:ETHYL-PWY,ACO catalyse ACC to ET reaction.,,v1.0,,R1,ath,,"doi:10.1042/bj20120245,other:KEGG"
3,KG,Conn004,ath,Cu2+,family,cytoplasm,metabolite,HMA,family,ER,...,"[R1] targetted experiments (e.g. Y2H, BIFC)",DOI:10.1105/tpc.001768 (Ethylene Biosynthesis ...,Copper gets transported from the cytoplasm to ...,,v1.0,,R1,ath,,"doi:10.1042/bj20120245,other:KEGG,other:10.107..."
4,KG,Conn005,ath,ETR,family,ER,protein,Cu2+,family,ER,...,"[R1] targetted experiments (e.g. Y2H, BIFC)",DOI:10.1105/tpc.001768 (Ethylene Biosynthesis ...,Copper activates the membrane bound ethylene r...,,v1.0,,R1,ath,,"doi:10.1042/bj20120245,other:KEGG"


In [111]:
def convert_node_to_family(x):
    # pathogen proteins are listed as proteins, so cannot use dict
    id_, type_, level_ = x.values
      
    if id_ is np.nan:
        return np.nan, np.nan

    new_label = None
    family_id = None
    
    ########################
    # Simple Cases
    ########################
    if type_ in ['complex', 'complex [active]', 'complex [activated]', 'complex [inactive]', 'plant_complex']:
        if not (id_ in node_dict["Complex"]):
            complexes_to_add.append(id_)
        new_label = 'Complex'
        family_id = id_
    
    elif type_ in ["metabolite"]:
        for label in ["Metabolite", "MetaboliteFamily"]:
            if id_ in node_dict[label]:
                new_label = label
                break
        if not new_label:
            missing_in_components.update([id_])        
            print(id_, type_, level_, " | (label) not a listed metabolite")
        family_id = id_

    elif type_ in ['process']:
        if (id_ in node_dict["Process"]):
            family_id = id_
            new_label = "Process"
        else:
            print(id_, type_, level_, " | (label) process not a listed process")
        
    else:
        ########################
        # family ID
        ########################
        check_external = False
        if level_ == "family":    
            family_id = id_
        elif level_ in ["clade", "clade/orthologue"]:
            try:
                family_id = clade_to_family[id_]
                replace_w_family.update([id_])
            except KeyError:
                check_external = True
        elif level_ == "node":
            try:
                family_id = node_to_family[id_]
                replace_w_family.update([id_])
            except KeyError:
                check_external = True
        
        if check_external:
            for label in ["ExternalOrganism", "ExternalCoding", "ExternalNonCoding"]:
                if id_ in node_dict[label]:
                    new_label = label
                    family_id = id_
                    break
                    
        if not family_id:
            print(id_, type_, level_, " | (family id) could not convert to family/external")
                
        ########################
        # Label
        ########################
        if (family_id) and (not new_label):
            id_labels = [] #looping just in case an id occurs mutiple times
            for label in node_labels:
                if family_id in node_dict[label]:
                    id_labels.append(label)

            if len(id_labels) == 1:
                new_label = id_labels[0]
            elif len(id_labels) > 1:
                print(id_, type_, level_, " | (label) many labels fit")
                missing_in_components.update([id_])       

            else:
                print(id_, type_, level_, " | (label) could not find label")
                missing_in_components.update([id_])        

    return family_id, new_label
        

In [112]:
complexes_to_add = []
missing_in_components = set()
replace_w_family = set()

for prefix in ['input1', 'input2', 'input3', 'output1']:
    id_col, type_col, level_col, new_id, new_label_col  =\
            [prefix + x for x in ('_ID',  '_type',  '_level',  '_newID', '_label')]    
    
    df_edges[[new_id, new_label_col]] = df_edges[[id_col, type_col, level_col]].apply(convert_node_to_family, axis=1, result_type='expand')


miR6022 plant_ncRNA node  | (family id) could not convert to family/external
LSD1 plant_coding node  | (family id) could not convert to family/external


In [113]:
df_edges.head()

Unnamed: 0,AddedBy,ConnID,Species,input1_ID,input1_level,input1_localisation,input1_type,input2_ID,input2_level,input2_localisation,...,also_observed_in,literature_sources,input1_newID,input1_label,input2_newID,input2_label,input3_newID,input3_label,output1_newID,output1_label
0,KG,Conn001,ath,L-Met,family,ER,metabolite,SAMS,clade,ER,...,,"doi:10.1042/bj20120245,other:KEGG",L-Met,Metabolite,SAM,PlantCoding,,,SAMe,Metabolite
1,KG,Conn002,ath,SAMe,family,ER,metabolite,ACS,family,ER,...,,"doi:10.1042/bj20120245,other:KEGG",SAMe,Metabolite,ACS,PlantCoding,,,ACC,Metabolite
2,KG,Conn003,ath,ACC,family,ER,metabolite,ACO,family,ER,...,,"doi:10.1042/bj20120245,other:KEGG",ACC,Metabolite,ACO,PlantCoding,,,ET,Metabolite
3,KG,Conn004,ath,Cu2+,family,cytoplasm,metabolite,HMA,family,ER,...,,"doi:10.1042/bj20120245,other:KEGG,other:10.107...",Cu2+,Metabolite,HMA,PlantCoding,,,Cu2+,Metabolite
4,KG,Conn005,ath,ETR,family,ER,protein,Cu2+,family,ER,...,,"doi:10.1042/bj20120245,other:KEGG",ETR,PlantCoding,Cu2+,Metabolite,,,ETR,PlantCoding


In [114]:
def get_orthologues(x, prefix=""):
    id_, level_, label_ = x.values
    ########################
    # Specie specific homologues
    ########################
    
    return_D = {f"{prefix}_{specie}_homologues":"" for specie in all_species}
    
    if label_ in ['PlantCoding', 'PlantNonCoding',  'PlantAbstract']:

        for species in all_species:
            species = f"{species}_homologues"
            k = f"{prefix}_{species}"
            if level_ == 'node':
                return_D[k] = node_ids_key[species][id_]
            elif level_ == 'clade':
                return_D[k] =  clade_ids_key[species][id_]
            elif level_ == 'family':
                return_D[k] =  family_ids_key[species][id_]

    return_D = {x:helpers.list_to_string(list(return_D[x])) for x in return_D}
    return return_D

In [115]:
new_dfs = []
for prefix in ['input1', 'input2', 'input3', 'output1']:
    print(prefix)
    id_col, level_col, new_label_col  =\
                [prefix + x for x in ('_ID',  '_level',  '_label')]        
    
    new_df = df_edges[[id_col, level_col, new_label_col ]].apply(get_orthologues, axis=1, result_type='expand', prefix=prefix)
    new_dfs.append(new_df)
    #df_edges = df_edges.join(new_df, sort=False)

input1
input2
input3
output1


In [116]:
homologues_df = pd.concat(new_dfs, sort=False, axis=1)

In [117]:
homologues_df.loc[0]

input1_ath_homologues                                            
input1_osa_homologues                                            
input1_stu_homologues                                            
input2_ath_homologues     AT2G36880,AT1G02500,AT4G01850,AT3G17390
input2_osa_homologues                                            
input2_stu_homologues                                            
input3_ath_homologues                                            
input3_osa_homologues                                            
input3_stu_homologues                                            
output1_ath_homologues                                           
output1_osa_homologues                                           
output1_stu_homologues                                           
Name: 0, dtype: object

In [118]:
df_edges = df_edges.join(homologues_df, sort=False)

In [119]:
df_edges.head()

Unnamed: 0,AddedBy,ConnID,Species,input1_ID,input1_level,input1_localisation,input1_type,input2_ID,input2_level,input2_localisation,...,input1_stu_homologues,input2_ath_homologues,input2_osa_homologues,input2_stu_homologues,input3_ath_homologues,input3_osa_homologues,input3_stu_homologues,output1_ath_homologues,output1_osa_homologues,output1_stu_homologues
0,KG,Conn001,ath,L-Met,family,ER,metabolite,SAMS,clade,ER,...,,"AT2G36880,AT1G02500,AT4G01850,AT3G17390",,,,,,,,
1,KG,Conn002,ath,SAMe,family,ER,metabolite,ACS,family,ER,...,,"AT5G65800,AT5G51690,AT4G26200,AT3G49700,AT1G01...",,,,,,,,
2,KG,Conn003,ath,ACC,family,ER,metabolite,ACO,family,ER,...,,"AT1G62380,AT1G12010,AT1G77330,AT1G05010,AT2G19590",,,,,,,,
3,KG,Conn004,ath,Cu2+,family,cytoplasm,metabolite,HMA,family,ER,...,,"AT5G44790,AT5G21930,AT4G33520,AT1G63440",,,,,,,,
4,KG,Conn005,ath,ETR,family,ER,protein,Cu2+,family,ER,...,,,,,,,,"AT2G40940,AT1G66340,AT3G23150,AT1G04310,AT3G04580",,


In [120]:
node_type_to_node_form_dict = {
    "gene":"gene",
    
    "protein":"protein",
    "protein [activated]":"protein_active",
    'protein [active]': "protein_active",
    
    "ncRNA":"ncRNA",
    "plant_ncRNA":"ncRNA",
    'ta-siRNA':"ta-siRNA", 
        
    "complex":"complex", 
    "plant_complex":"complex",
    'complex [active]': "complex_active",
    
    "metabolite":"metabolite",
    
    "process":"process", 
    'process [active]':"process_active",

    np.nan:"", 
    "plant_coding":"unknown"
}

for prefix in ['input1', 'input2', 'input3', 'output1']:
    id_col, type_col, new_form_col  =\
        [prefix + x for x in ('_ID',  '_type',  '_form')]
    
    x = df_edges[[id_col, type_col]].dropna(how='all')
    
    print(prefix)
    df_edges[new_form_col] = df_edges[type_col].apply(lambda x: node_type_to_node_form_dict[x])

input1
input2
input3
output1


In [121]:
df_edges.head()

Unnamed: 0,AddedBy,ConnID,Species,input1_ID,input1_level,input1_localisation,input1_type,input2_ID,input2_level,input2_localisation,...,input3_ath_homologues,input3_osa_homologues,input3_stu_homologues,output1_ath_homologues,output1_osa_homologues,output1_stu_homologues,input1_form,input2_form,input3_form,output1_form
0,KG,Conn001,ath,L-Met,family,ER,metabolite,SAMS,clade,ER,...,,,,,,,metabolite,protein_active,,metabolite
1,KG,Conn002,ath,SAMe,family,ER,metabolite,ACS,family,ER,...,,,,,,,metabolite,protein_active,,metabolite
2,KG,Conn003,ath,ACC,family,ER,metabolite,ACO,family,ER,...,,,,,,,metabolite,protein_active,,metabolite
3,KG,Conn004,ath,Cu2+,family,cytoplasm,metabolite,HMA,family,ER,...,,,,,,,metabolite,protein,,metabolite
4,KG,Conn005,ath,ETR,family,ER,protein,Cu2+,family,ER,...,,,,"AT2G40940,AT1G66340,AT3G23150,AT1G04310,AT3G04580",,,protein,metabolite,,protein_active


In [163]:
node_localisation_dict = {
    'nuc':'nucleus',
    'er':'endoplasmic reticulum',
    'golgi':'golgi apparatus'
}


good_localisations = set([
     'nucleus',
     'nucleolus',
     'cytoplasm',
     'vacuole',
     'endoplasmic reticulum',
     'chloroplast',
     'mitochondrion',
     'golgi apparatus',
     'peroxisome',
     'apoplast',
     'extracellular'
])


def node_localisation_std(x):
    if not type(x) == str:
        return ""
    
    x = x.lower()
    if x in node_localisation_dict.keys():
        x = node_localisation_dict[x]
    
    if x in good_localisations:
        return x
    else:
        return ""

node_localisations = set()
new_localisation = set()
for prefix in ['input1', 'input2', 'input3', 'output1']:
    id_col, type_col, localisation_col  =\
        [prefix + x for x in ('_ID',  '_type',  '_localisation')]
    
    x = df_edges[['ConnID', id_col, localisation_col]].dropna(how='all', subset=[ id_col, localisation_col])
    
    #for _, y in x.iterrows():
    #    if y[localisation_col] in ['mitochondria?', np.nan]:
    #        print(y['ConnID'], "\t", y[id_col], "\t", y[localisation_col])
    
    
    node_localisations.update(x[localisation_col])
    
    #print(prefix)
    df_edges[localisation_col] = df_edges[localisation_col].apply(node_localisation_std)
    
    new_localisation.update(df_edges[localisation_col])

In [164]:
node_localisations

{'ER',
 'Golgi',
 'chloroplast',
 'cytoplasm',
 'extracellular',
 'mitochondria?',
 nan,
 'nuc',
 'nucleus',
 'peroxisome',
 'vacuole'}

In [165]:
new_localisation

{'',
 'chloroplast',
 'cytoplasm',
 'endoplasmic reticulum',
 'extracellular',
 'golgi apparatus',
 'nucleus',
 'peroxisome',
 'vacuole'}

In [166]:
df_edges.to_csv(os.path.join("..", "data", "raw", "edges-sheet.tsv"), sep="\t")

In [167]:
homologue_cols = [f"{x}_homologues" for x in all_species]

In [125]:
all_species

['ath', 'stu', 'osa']

In [124]:
with open(os.path.join("..", "data", "raw", "complexes_to_add.tsv"), "w") as out:
    for c in complexes_to_add:
        out.write(f"{c}\n")