In [None]:
# Widen the notebook container to 80% of the browser window.
# `display` and `HTML` come from the public `IPython.display` module;
# importing `display` from `IPython.core.display` is deprecated.
from IPython.display import display, HTML
display(HTML("<style>.container { width:80% !important; }</style>"))

# Import neo4j DB: 3/4

Code to translate the PIS-model spreadsheet (currently `v2.7.9_PIS-model.xlsx`; see the `sheets` list below) to the neo4j database.

## Setup

In [None]:
import pandas as pd
import re
import numpy as np
import os
from IPython.display import Image

In [None]:
import helpers

In [None]:
from importlib import reload

In [None]:
from pathlib import Path

# Data directories, expressed relative to the parent of the working directory.
base_path = Path("..")
input_path = base_path.joinpath("data", "raw")       # spreadsheets are read from here
output_path = base_path.joinpath("data", "parsed")   # TSV files are written here

Connect to graph via docker-compose link. See http://localhost:7474/browser/

In [None]:
from py2neo import Graph, Node, Relationship

In [None]:
# Open a py2neo connection to the database server reachable under the
# host name "neo4j" (the docker-compose link mentioned above).
graph = Graph(host="neo4j")

## Read in sheets

In [None]:
# Workbooks and sheets to process, as (file name, sheet name) pairs.
# File names are resolved against `input_path`; commented-out entries are skipped.
sheets = [#(file, sheet_name)
    ("v2.7.9_PIS-model.xlsx", "Reactions"),
#     ("v2.7.5_PIS-model.xlsx", "Reactions"),
#     ("v2.7.5_PIS-model.xlsx", "Reactions_New"), 
#     ("Model_CK.xlsx", "Reactions_new"), 
#     ("v2.7.2_PIS-model-JALR.xlsx", "Reactions_New")
]

In [None]:
# Configuration for re-saving the xlsx sheets as tsv.
#
# `drops`: spreadsheet columns removed (when present) before renaming.
# 'Comment' is dropped, so its `col_rename` entry below is not looked up
# for sheets that contain it.
drops = ['FOXMES', 'Legacy:Process', 'Legacy:ReactionMode', "Comment"]#, 'ConnID']
# `col_rename`: spreadsheet header -> column name used in the rest of the
# notebook. The ".1", ".2", ".3" variants are presumably the suffixes pandas
# appends to repeated header names (ID/level/localisation/type occur once per
# input and once for the output) -- TODO confirm against the workbook layout.
col_rename = {
    'Status':'Status',
    'AddedBy':'AddedBy',
    'Species':'Species',
    'ID':'input1_ID',
    'level':'input1_level',
    'localisation':'input1_localisation',
    'type':'input1_type',
    'ID.1':'input2_ID',
    'level.1':'input2_level',
    'localisation.1':'input2_localisation',
    'type.1':'input2_type',
    'ID.2':'input3_ID',
    'level.2':'input3_level',
    'localisation.2':'input3_localisation',
    'type.2':'input3_type',
    'ReactionEffect':'ReactionEffect',
    'ReactionMode':'ReactionMode',
    'Modifications':'Modifications',
    'ID.3':'output1_ID',
    'level.3':'output1_level',
    'localisation.3':'output1_localisation',
    'type.3':'output1_type',
    'TrustLevel':'TrustLevel',
    'Literature':'Literature',
    'AdditionalInfo':'AdditionalInfo',
    'Comment':'Comment',
    'Model-v':'ModelV',
    'KINETICS':'kinetics', 
    'ConnID': 'ConnID'
}

# Convert every configured (workbook, sheet) pair into
# "<workbook stem>-<sheet name>.tsv" inside `output_path`.

# Make sure the target directory exists before anything is written to it.
output_path.mkdir(parents=True, exist_ok=True)

for file_name, sheet_name in sheets:

    print(file_name, sheet_name)

    file_path = input_path / file_name

    # Only the stem of the workbook name is needed for the output file name.
    base_name, _ = os.path.splitext(file_name)
    new_file_path = output_path / f'{base_name}-{sheet_name}.tsv'

    #if os.path.exists(new_file_path):
    #    continue

    # Every cell is read as text; values listed in helpers.empty_strings are
    # treated as missing. The column names come from the sheet row at index 1.
    df = pd.read_excel(file_path,
                       sheet_name=sheet_name,
                       header=[1],
                       dtype=str,
                       na_values=helpers.empty_strings)

    #df = df[~df["AddedBy"].isna()]
    #if 'Status' in df.columns:
    #    df = df[df['Status'].isin(["forCB", "forCB_INVENTED", np.nan])]

    # Remove the unwanted columns plus the "Unnamed..." placeholder columns.
    to_drop = list(set(drops) & set(df.columns)) + list(df.filter(regex=("Unnamed.*")).columns)
    df = df.drop(columns=to_drop)

    # Fail with a message that names the sheet and the offending headers,
    # instead of a bare KeyError for the first unknown column.
    unmapped = [col for col in df.columns if col not in col_rename]
    if unmapped:
        raise KeyError(
            f"{file_name} / {sheet_name}: no col_rename entry for column(s) {unmapped}"
        )
    df.columns = [col_rename[col] for col in df.columns]

    # Provenance tag: "<Status>-<workbook stem>-<sheet name>".
    df['origin'] = df['Status'] + f'-{base_name}-{sheet_name}'

    df.to_csv(new_file_path, sep="\t", index=False)

In [None]:
# Read the TSV belonging to each (workbook, sheet) pair back into memory,
# one DataFrame per pair, in the order given by `sheets`.
dfs = []

for file_name, sheet_name in sheets:
    print(file_name, sheet_name)

    base_name = os.path.splitext(file_name)[0]
    file_path = output_path / f'{base_name}-{sheet_name}.tsv'
    dfs.append(pd.read_csv(file_path, sep="\t"))

In [None]:
# Stack all sheets into a single table with a fresh 0..n-1 index.
df_edges = pd.concat(dfs, sort=False).reset_index(drop=True)

In [None]:
# Preview the first rows of the combined table.
df_edges.head()

In [None]:
# Number of rows per distinct ReactionMode value.
df_edges['ReactionMode'].value_counts()

In [None]:
# Trim surrounding whitespace from every text column.
# Columns that pandas parsed as numeric (this includes columns that are
# completely empty, which are read back as float NaN) have no `.str`
# accessor; they are skipped instead of raising AttributeError.
for c in df_edges.columns:
    if pd.api.types.is_numeric_dtype(df_edges[c]):
        continue
    df_edges[c] = df_edges[c].str.strip()

In [None]:
# Index labels of the rows whose AddedBy value is exactly 'x'.
# They are only printed; the drop below is disabled.
x = df_edges[df_edges['AddedBy']=='x'].index
print(x)
#df_edges.drop(x, inplace=True)

In [None]:
# List every distinct raw TrustLevel value, one per line.
for x in df_edges["TrustLevel"].unique():
    print(x)

In [None]:
# A trust level is the first occurrence of "R1".."R4", "Rx", "Ry" or
# "undefined" inside the raw TrustLevel text.
# The previous character class "[1|2|3|4|x|y]" also accepted a literal "|".
TRUST_LEVEL_RE = re.compile(r"(R[1-4xy]|undefined)")


def extract_trust_level(value):
    """Return the trust level code contained in a raw TrustLevel cell.

    Parameters
    ----------
    value : str
        Raw cell content.

    Returns
    -------
    str
        One of "R1", "R2", "R3", "R4", "Rx", "Ry" or "undefined".

    Raises
    ------
    ValueError
        If `value` is not a string (e.g. a missing cell) or contains no
        recognised code; the message shows the offending value.
    """
    match = TRUST_LEVEL_RE.search(value) if isinstance(value, str) else None
    if match is None:
        raise ValueError(f"Cannot extract a trust level from TrustLevel value {value!r}")
    return match.group(1)


df_edges['trust_level'] = df_edges["TrustLevel"].apply(extract_trust_level)
#df_edges['observed_species'] = df_edges['Species'].apply(helpers.lower_string)
#df_edges['species_also_observed_in'] = df_edges["Species"].apply(helpers.rest_of_items)
#df_edges['Comment'] = df_edges['Comment'].fillna("")
# AdditionalInfo is searched with regular expressions later on, so missing
# cells become empty strings.
df_edges['AdditionalInfo'] = df_edges['AdditionalInfo'].fillna("")

In [None]:
# Upper-case the AddedBy values. The vectorised string method leaves missing
# cells as NaN, whereas calling x.upper() on each element fails on them.
df_edges['AddedBy'] = df_edges['AddedBy'].str.upper()
df_edges["AddedBy"].unique()

In [None]:
# Rows without a model version get the placeholder 'vNA'.
df_edges['ModelV'] = df_edges['ModelV'].fillna('vNA')
df_edges['ModelV'].unique()

In [None]:
# See here for keys https://www.utf8-chartable.de/unicode-utf8-table.pl 

def only_asci(x):
    """Return `x` with every non-ASCII character removed."""
    return "".join(filter(str.isascii, x))

def find_non_ascii(x):
    """Report the non-ASCII characters contained in ``str(x)``.

    For each non-ASCII character one line is printed with the character, its
    code point and its encoded bytes; if there was at least one, the whole
    text is printed afterwards.

    Returns
    -------
    bool
        True if ``str(x)`` contains a non-ASCII character, else False.
    """
    text = str(x)
    offenders = [ch for ch in text if not ch.isascii()]
    for ch in offenders:
        print('    ', ch, ord(ch), ch.encode())
    if not offenders:
        return False
    print(text)
    return True

# UTF-8 byte sequences of the non-ASCII characters found in the sheets,
# mapped to their ASCII (or HTML entity) replacement.
ascii_replacers = {
    b'\xc2\xa0'         : b' ',            # U+00A0 no-break space
    b'\xe2\x80\xa6'     : b'...',          # … horizontal ellipsis
    b'\xe2\x80\x8b'     : b'',             # U+200B zero width space (invisible)
    b'\xe2\x80\x93'     : b'-',            # – en dash
    
    b'\xce\xb1'         : b'&alpha;',      # α
    b'\xc3\x9f'         : b'&beta;',       # ß (sharp s, treated as beta)
    b'\xce\xb2'         : b'&beta;',       # β
    
    # characters used in place of a prime
    b'\xe2\x80\x98'     : b'&prime;',      # ‘ Left Single Quotation Mark
    b'\xe2\x80\x99'     : b'&prime;',      # ’ Right Single Quotation Mark
    b'\xc2\xb4'         : b'&prime;',      # ´ Acute Accent
    # the real prime character
    b'\xe2\x80\xb2'     : b'&prime;',      # ′ Prime
    
    # accented letters lose their accent
    b'\xc5\xa0'         : b'S',            # Š
    b'\xc5\xa1'         : b's',            # š
    b'\xc5\xbd'         : b'Z',            # Ž
    b'\xc4\x8d'         : b'c'             # č
}

def replacer(x, verbose=False):
    """Apply every `ascii_replacers` substitution to the text `x`.

    Parameters
    ----------
    x : str or float
        Cell value; floats (missing cells) are returned unchanged.
    verbose : bool
        If True, print "'old' : 'new'" whenever a substitution changed the text.

    Returns
    -------
    str or float
        The substituted text with surrounding whitespace stripped, or `x`
        itself when it is a float.
    """
    if type(x) is float:
        return x

    encoded = x.encode('utf-8')
    for needle, substitute in ascii_replacers.items():
        encoded = encoded.replace(needle, substitute)
    cleaned = encoded.decode('utf-8')

    # The comparison happens before stripping, as whitespace-only differences
    # are not reported.
    if verbose and cleaned != x:
        print(f"'{x}' : '{cleaned}'")

    return cleaned.strip()



In [None]:
# Collect the columns that contain at least one non-ASCII character
# (find_non_ascii prints the offending characters and values as it goes).
bad_cols = []
for c in df_edges.columns:
    print(c, "\n-------------")
    if df_edges[c].apply(find_non_ascii).any():
        bad_cols.append(c)
    print()

In [None]:
# Apply the ascii_replacers substitutions to every flagged column,
# printing each value that changes.
for c in bad_cols:
    print(c, "\n-------------")
    df_edges[c] = df_edges[c].apply(replacer, verbose=True)
    print()

In [None]:
# Re-run the check on the flagged columns; anything still printed here has
# no entry in ascii_replacers yet.
for c in bad_cols:
    print(c, "\n-------------")
    if df_edges[c].apply(find_non_ascii).any():
        print()

In [None]:
# Plain quote characters -- ' (single quote) and " (double quote) -- are also
# used instead of a prime and may cause string issues downstream.
# They are detected here; 5' and 3' are later replaced with 5&prime; and 3&prime;.
def find_quotes(x):
    """Return True (and print the text) if ``str(x)`` contains ' or "."""
    text = str(x)
    if '"' in text or "'" in text:
        print(text)
        return True
    return False
        

def quote_replacer(x, verbose=False):
    """Replace ``5'`` and ``3'`` with ``5&prime;`` and ``3&prime;``.

    Parameters
    ----------
    x : str or float
        Cell value; a float (missing cell) is treated as the empty string.
    verbose : bool
        If True, print "'old' : 'new'" whenever the text changed.

    Returns
    -------
    str
        The text with both replacements applied.
    """
    text = "" if type(x) is float else x
    fixed = text.replace("5'", "5&prime;").replace("3'", "3&prime;")

    if verbose and fixed != text:
        print(f"'{text}' : '{fixed}'")

    return fixed



In [None]:
# Collect the columns in which at least one value contains ' or "
# (find_quotes prints each such value as it goes).
bad_cols = []
for c in df_edges.columns:
    print(c, "\n-------------")
    if df_edges[c].apply(find_quotes).any():
        bad_cols.append(c)
    print()

In [None]:
# Replace 5' and 3' in every flagged column, printing each value that changes.
# Note: quote_replacer turns float (missing) cells in these columns into "".
for c in bad_cols:
    print(c, "\n-------------")
    df_edges[c] = df_edges[c].apply(quote_replacer, verbose=True)
    print()

In [None]:
# Re-run the quote check on the flagged columns to show what is left.
for c in bad_cols:
    print(c, "\n-------------")
    if df_edges[c].apply(find_quotes).any():
        print()

In [None]:
# Missing Literature cells become empty strings (format_literature calls
# .lower() on them). The result is assigned back to the column: an in-place
# fillna on the selected column acts on a temporary object and does not
# update df_edges when pandas copy-on-write is active.
df_edges["Literature"] = df_edges["Literature"].fillna("")

In [None]:
# format literature sources

# Pattern for EC numbers in lower-cased text, e.g. "ec:3.3.3.-" or
# "ec 1.2.3.n4"; the single capture group holds the number itself.
# Written as a raw string because the backslash sequences are regex syntax,
# not valid Python string escapes.
re_ec = r"ec(?:\:|\s)?(\d+(?:\.(?:\-|\d+)){1,3}(?:\.n\d+)?)(?:\s|$|\]|,|\.)"


def doi_list(x):
    """Return every DOI mentioned in `x` as lower-case ``"doi:<id>"`` strings.

    A DOI is the text following "doi:" or "doi/" (optional whitespace in
    between) up to the next whitespace or the end of the text; trailing dots
    are removed. Returns an empty list when nothing matches.
    """
    # re.findall always returns a list, so no None check is needed.
    matches = re.findall(r"(?:doi)(?:\:|\/)\s*(.+?)(?:\s|$)", x.lower())
    return ["doi:" + m.rstrip('.') for m in matches]

def pubmed_list(x):
    """Return every PubMed id mentioned in `x` as ``"pmid:<id>"`` strings.

    An id is the text following "pmid:" (optional whitespace in between) up
    to the next whitespace or the end of the text, matched on the lower-cased
    input; trailing dots are removed. Returns an empty list when nothing matches.
    """
    # re.findall always returns a list, so no None check is needed.
    matches = re.findall(r"(?:pmid)\:\s*(.+?)(?:\s|$)", x.lower())
    return ["pmid:" + m.rstrip('.') for m in matches]


def _shared_reference_ids(text):
    """Collect DOI, PubMed and PubMed Central ids from lower-cased `text`."""
    found = [f"doi:{idf.strip().rstrip('.')}"
             for idf in re.findall(r"(?:doi)(?:\:|\/)\s*(.+?)(?:\s|$)", text)]
    found += [f"pmid:{idf}" for idf in re.findall(r"pmid(?:\:|)\s*(\d+)", text)]
    found += [f"pmcid:{idf}" for idf in re.findall(r"pmcid(?:\:|)\s*(pmc\d+)", text)]
    return found


def format_literature(row):
    """Build the external-links string for one reaction row.

    Identifiers are collected from the lower-cased 'Literature' and
    'AdditionalInfo' values of `row` and returned as "<db>:<id>" items.
    Problems with the "|"-separated Literature entries are printed together
    with the row's 'ConnID'.

    Parameters
    ----------
    row : mapping
        Must provide 'Literature' and 'AdditionalInfo' (both str) and 'ConnID'.

    Returns
    -------
    str
        The distinct identifiers, sorted and joined with ",". Sorting makes
        the result reproducible between runs (plain set order is not).
    """
    issued = False
    dbs_list = []

    # ---- Literature column ----
    x = row['Literature'].lower().strip()
    dbs_list += _shared_reference_ids(x)

    for key in x.split("|"):
        if key == "":
            print(f"BLANK\\{row['ConnID']}")
            issued = True

        elif ":" in key:
            if "aracyc" in key:
                dbs_list.append("aracyc:" + key.split(":")[1].strip())
            elif "kegg" in key:
                dbs_list.append("kegg:" + key.split(":")[1].strip())
            elif "doi" in key:
                # already collected by the regex search above
                continue
            elif ("pmcid" in key) or ("pmid" in key):
                # already collected by the regex search above
                continue
            else:
                print(f"UNKOWN DB\\{row['ConnID']}\\{key}")
                issued = True
        elif "invented" in key:
            dbs_list.append("invented:reason")
        else:
            print(f"NOVALUE\\{row['ConnID']}\\{key}")
            issued = True

    # Nothing usable in Literature and no more specific message printed yet.
    if (len(dbs_list) == 0) and not issued:
        print(f"BADorMISSING\\{row['ConnID']}\\{x}")

    # ---- AdditionalInfo column ----
    x = row['AdditionalInfo'].lower()

    # EC numbers, e.g. EC:3.3.3.-
    dbs_list += [f"ec:{idf}" for idf in re.findall(re_ec, x)]

    dbs_list += _shared_reference_ids(x)

    kegg_match = re.findall(r"((?:k|map|ko|ec|rn)\d{5})", x)
    dbs_list += [f"kegg:{idf}" for idf in kegg_match]

    # The text was lower-cased above, so the marker has to be matched in
    # lower case as well; an upper-case "NCBI ID: " pattern can never match.
    ncbi_nuccore_match = re.findall(r"ncbi id: (.+)", x)
    dbs_list += [f"ncbi_nuccore:{idf}" for idf in ncbi_nuccore_match]

    return ','.join(sorted(set(dbs_list)))

In [None]:
# One external-links string per row, built from Literature and AdditionalInfo.
df_edges['external_links'] = df_edges.apply(format_literature, axis=1)

In [None]:
# Rows for which no external link could be extracted.
df_edges[df_edges['external_links']==''][['ConnID', 'origin', 'Literature', 'AdditionalInfo']]

In [None]:
# Write the extracted links next to their source columns (no index column)
# to reactions-lit-check.tsv in output_path.
df_edges[["ConnID",  "external_links", "Literature", "AdditionalInfo"]].to_csv(output_path / "reactions-lit-check.tsv", sep="\t", index=None)

In [None]:
# Renumber the rows 0..n-1, discarding the old index.
df_edges = df_edges.reset_index(drop=True)

In [None]:
# Keep a copy of the table at this stage; the commented line restores it.
save_df = df_edges.copy()
#df_edges = save_df.copy()

In [None]:
# Preview the first rows after literature formatting.
df_edges.head()

In [None]:
# Pass every ID column through helpers.reorder_ids.
# NOTE(review): presumably this normalises the component order of complex
# IDs -- confirm against the helpers module.
for x in ['input1', 'input2', 'input3', 'output1']:
    df_edges.loc[:, x + "_ID"] = df_edges[x + "_ID"].apply(helpers.reorder_ids)

In [None]:
# Spreadsheet "type" value -> node form. A missing type (NaN) maps to "".
node_type_to_node_form_dict = {
    "gene":"gene",
    
    "protein":"protein",
    "protein [inactivated]":"protein", 
    "protein [activated]":"protein_active",
    'protein [active]': "protein_active",
    'protein_active': 'protein_active',
    
    "ncRNA":"ncRNA",
    "plant_ncRNA":"ncRNA",
    'ta-siRNA':"ta-siRNA", 
        
    "complex":"complex", 
    "plant_complex":"complex",
    'complex [active]': "complex_active",
    'complex_active': "complex_active",
    
    "metabolite":"metabolite",
    
    "process":"process", 
    "process_active":"process_active", 
    'process [active]':"process_active",

    np.nan:"", 
}

# Spreadsheet "type" value -> active node form. Looked up by get_node_form
# when the ID carries an "(a)" or "[active]" marker; types that are not
# listed here raise KeyError in that case.
node_type_to_active_form_dict = {
    "protein [active]":"protein_active",
    "protein":"protein_active",
    "protein_active":"protein_active",
    
    "complex":"complex_active", 
    "complex_active": "complex_active",
    
    "process":"process_active", 
    'process_active':"process_active",
}

# Spreadsheet "type" value -> inactive node form.
# Not referenced by the code visible in this notebook.
node_type_to_inactive_form_dict = {
    "protein":"protein",
    "protein_active":"protein",
    "protein [inactivated]":"protein",
    
    "complex":"complex", 
    "complex_active": "complex",
    
    "process":"process", 
    'process_active':"process",
}



def get_node_form(row, prefix="input1"):
    """Determine the node form for one (ID, type) pair.

    Parameters
    ----------
    row : pandas.Series
        Position 0 holds the node ID, position 1 the node type.
    prefix : str
        Accepted for the caller's convenience; not used.

    Returns
    -------
    str or float
        NaN when the ID is missing. Otherwise the form from
        `node_type_to_node_form_dict`, replaced by the entry of
        `node_type_to_active_form_dict` (and printed) when the ID contains
        "(a)" or "[active]".

    Raises
    ------
    KeyError
        If the type is not a key of the dictionary being consulted.
    """
    id_ = row.iloc[0]
    # isinstance also recognises numpy float scalars, which is what a row
    # yields when both columns are entirely empty (float dtype); an exact
    # `type(...) == float` comparison misses those.
    if isinstance(id_, float):
        return np.nan

    type_ = row.iloc[1]
    form = node_type_to_node_form_dict[type_]
    if re.search(r"\(a\)|\[active\]", id_):
        form = node_type_to_active_form_dict[type_]
        print(id_, type_, "-->", form)

    return form

# Derive "<prefix>_form" from the "<prefix>_ID" and "<prefix>_type" columns
# for each of the three inputs and the output.
for prefix in ['input1', 'input2', 'input3', 'output1']:
    id_col, type_col, new_form_col  =\
        [prefix + x for x in ('_ID',  '_type',  '_form')]
    
    #x = df_edges[[id_col, type_col]].dropna(how='all')
    
    print(prefix)
    # The two-column selection fixes the positions get_node_form reads:
    # 0 = ID, 1 = type.
    df_edges[new_form_col] = df_edges[[id_col, type_col]].apply(get_node_form, prefix=prefix, axis=1)

In [None]:
def remove_brackets(x):
    """Remove the "(a)" and "(p)" markers from a node ID.

    Parameters
    ----------
    x : str or float
        Node ID; a float stands for a missing value.

    Returns
    -------
    str or float
        The ID without the markers (printed next to the original when it
        changed), or NaN for a missing value.
    """
    # `np.float` no longer exists in current NumPy releases; it was only an
    # alias of the builtin float.
    if isinstance(x, float):
        return np.nan

    s = x.replace("(a)", "").replace("(p)", "")
    if s != x:
        print(x, s)
    return s

In [None]:
# Strip the "(a)" / "(p)" markers from every ID column. This runs after the
# node forms were derived, since get_node_form looks for "(a)" in the ID.
for x in ['input1', 'input2', 'input3', 'output1']:
    df_edges.loc[:, x + "_ID"] = df_edges[x + "_ID"].apply(remove_brackets)

In [None]:
# Lower-case the species and turn the "/"-separated list into a
# ","-separated one. The vectorised string methods keep missing cells as
# NaN, whereas calling x.lower() on each element fails on them.
df_edges['species'] = df_edges['Species'].str.lower().str.replace('/', ',', regex=False)

In [None]:
# Distinct normalised species strings.
df_edges['species'].unique()

In [None]:
# Lower-cased spreadsheet spelling -> standard localisation name.
# A trailing "?" in the spreadsheet becomes a "putative:" prefix.
node_localisation_dict = {
    'nuc':'nucleus',
    'er':'endoplasmic reticulum',
    'golgi':'golgi apparatus', 
    'mitochondria?': 'putative:mitochondrion', 
    'cytoplasm?': 'putative:cytoplasm', 
}


# Localisation names accepted by node_localisation_std.
good_localisations = set([
     'nucleus',
     'nucleolus',
     'cytoplasm',
     'vacuole',
     'endoplasmic reticulum',
     'chloroplast',
     'mitochondrion',
     'golgi apparatus',
     'peroxisome',
     'apoplast',
     'extracellular'
])

# Each accepted name is also accepted with a "putative:" prefix.
good_localisations.update(['putative:' + s for s in good_localisations])


def node_localisation_std(x):
    """Map a raw localisation value to a standard localisation name.

    Non-string values (missing cells) become "putative:cytoplasm". Strings
    are lower-cased and translated through `node_localisation_dict`; the
    result is returned if it is in `good_localisations`, otherwise it is
    printed and "" is returned.
    """
    if type(x) is not str:
        return "putative:cytoplasm"

    lowered = x.lower()
    standardised = node_localisation_dict.get(lowered, lowered)

    if standardised not in good_localisations:
        print(standardised)
        return ""
    return standardised

# Standardise the localisation of every input/output node.
# node_localisations collects the raw values seen, new_localisation the
# standardised values written to the "<prefix>_location" columns.
node_localisations = set()
new_localisation = set()
for prefix in ['input1', 'input2', 'input3', 'output1']:
    # type_col is built for symmetry with the other loops but is not used here.
    id_col, type_col, localisation_col, location_col  =\
        [prefix + x for x in ('_ID',  '_type',  '_localisation', '_location')]
    
    # Keep the rows where the ID or the localisation (or both) is present.
    x = df_edges[['ConnID', 'origin', id_col, localisation_col]].dropna(how='all', subset=[ id_col, localisation_col])
    for _, y in x.iterrows():
        # Report rows whose ID is not one of helpers.empty_strings while the
        # localisation is one of them (or '').
        # NOTE(review): a NaN localisation is only caught here if
        # helpers.empty_strings holds a matching NaN entry -- TODO confirm.
        if (not (y[id_col] in helpers.empty_strings)) and (y[localisation_col] in helpers.empty_strings + ['']):
            print(y['origin'], "|", y['ConnID'], "|", prefix, "|", y[id_col], "|", y[localisation_col])
    
    
    node_localisations.update(x[localisation_col])
    
    #print(prefix)
    # node_localisation_std prints every value it cannot standardise.
    df_edges[location_col] = df_edges[localisation_col].apply(node_localisation_std)
    
    new_localisation.update(df_edges[location_col])

In [None]:
# Raw localisation values found in the sheets.
node_localisations

In [None]:
# Standardised localisation values that were written ("" marks rejected values).
new_localisation

In [None]:
# The full set of accepted localisation names, for comparison.
good_localisations

In [None]:
# Print a ready-to-paste `reaction_mode_dict` literal: every distinct
# ReactionMode value mapped to itself with each "/"-separated part
# lower-cased and stripped.
print('reaction_mode_dict = {')
for s in df_edges['ReactionMode'].unique():
    normalised = '/'.join(part.lower().strip() for part in str(s).split('/'))
    print(f"\t'{s}':'{normalised}',")
print('}')

In [None]:
# # Old version before edit
# reaction_mode_dict = {
# 	'catalysis / auto-catalysis':'catalysis/auto-catalysis',
# 	'translocation':'translocation',
# 	'protein activation':'protein activation',
# 	'binding / oligomerisation':'binding/oligomerisation',
# 	'protein deactivation':'protein deactivation',
# 	'transcription/translation repression':'transcription/translation repression',
# 	'transcription/translation induction':'transcription/translation induction',
# 	'degradation / secretion':'degradation/secretion',
# 	'dissociation':'dissociation',
# 	'translation':'translation',
# 	'transcription':'transcription',
# 	'by binding':'by binding',
# 	'nan':'nan',
# 	'cleavage / auto-cleavage':'cleavage/auto-cleavage',
# 	'binding/oligomerisation':'binding/oligomerisation',
# 	'protein phosphorylation':'protein phosphorylation',
# }

# Old version after edit
# reaction_mode_dict = {
# 	'catalysis / auto-catalysis':'catalysis/auto-catalysis',
# 	'translocation':'translocation',
# 	'protein activation':'protein activation',
# 	'binding / oligomerisation':'binding/oligomerisation',
# 	'protein deactivation':'protein deactivation',
# 	'transcription/translation repression':'transcription/translation repression',
# 	'transcription/translation induction':'transcription/translation induction',
# 	'degradation / secretion':'degradation/secretion',
# 	'dissociation':'dissociation',
# 	'translation':'bad',
# 	'transcription':'bad',
# 	'by binding':'bad',
# 	np.nan:'bad',
# 	'cleavage / auto-cleavage':'bad',
# 	'binding/oligomerisation':'binding/oligomerisation',
# 	'protein phosphorylation':'bad',
# }

# Current mapping: raw ReactionMode value -> reaction type.
# A missing ReactionMode (NaN) maps to 'undefined'. Values that are not
# listed here raise KeyError when the mapping is applied.
reaction_mode_dict = {
	'catalysis / auto-catalysis':'catalysis',
	'translocation':'translocation',
	'protein activation':'protein activation',
	'binding / oligomerisation':'binding/oligomerisation',
	'protein deactivation':'protein deactivation',
	'transcriptional / translational repression':'transcriptional/translational repression',
	'transcriptional / translational induction':'transcriptional/translational activation',
	'degradation / secretion':'degradation/secretion',
	np.nan:'undefined',
	'dissociation':'dissociation',
	'cleavage / auto-cleavage':'cleavage/auto-cleavage',
}

In [None]:
# Translate each raw ReactionMode through reaction_mode_dict; a value
# without an entry raises KeyError.
df_edges['reaction_type'] = df_edges['ReactionMode'].apply(reaction_mode_dict.__getitem__)

In [None]:
# Rows whose reaction type ended up as 'undefined'.
df_edges[df_edges['reaction_type']=='undefined'][['origin', 'ConnID', 'ReactionMode', 'ReactionEffect', 'reaction_type']]

In [None]:
# Write the final table, including the index column, as a TSV.
# NOTE(review): unlike the other outputs this goes to the current working
# directory rather than output_path -- confirm this is what the next
# notebook expects.
df_edges.to_csv("parsed_reactions.tsv", sep="\t")

# END