In [None]:
# Version label of this notebook; no other visible cell reads it.
version = "v0.1.0"

In [None]:
# Widen the notebook container to 80% of the browser width.
# `display` and `HTML` come from the public `IPython.display` module; importing
# them from `IPython.core.display` is deprecated.
from IPython.display import display, HTML
display(HTML("<style>.container { width:80% !important; }</style>"))

# Import neo4j DB: 7/?

Code to translate v2.7.9_PIS-model.xlsx into a neo4j database.

## Setup

In [None]:
from collections import defaultdict

In [None]:
import pandas as pd
import re
import numpy as np
import os

In [None]:
from py2neo import Graph, Node, Relationship

In [None]:
import helpers

In [None]:
from importlib import reload

Connect to graph via docker-compose link. See http://localhost:7474/browser/

In [None]:
# py2neo handle to the database at host "neo4j"; every query cell below runs through it.
graph = Graph(host="neo4j")

In [None]:
from pathlib import Path

# All data paths are built relative to the parent of the working directory.
base_path = Path("..")
# Directory the parsed sheet exports (e.g. edges-sheet.tsv) are read from.
parsed_path = base_path / "data" / "parsed"

## MK identifier

In [None]:
# FIXED in sheet
# cy = """
#     MATCH p=(r {name:"rx00308"})--(n {family:"MYB"})
#     SET n.name = 'MYB33[Sotub06g030530.1.1]'
#     SET n.stu_homologues = ['Sotub06g030530.1.1']
#     SET n._identifiers = 'Sotub06g030530.1.1'
# """
# graph.run(cy)

In [None]:
# Fetch every node with family "MYB" that is directly connected (in either
# direction) to the node named "rx00308".
cy = """
    MATCH p=(r {name:"rx00308"})--(n {family:"MYB"})
    RETURN n
"""

graph.run(cy)

## Pathways

In [None]:
# Tab-separated pathway assignments, one row per node.
pathways_file = base_path / "data" / "raw" / "pss-pathways - neo4j-for-CB.tsv"
df = pd.read_csv(pathways_file, sep="\t")

In [None]:
# Keep only rows that have a pathway. `.copy()` makes the result an independent
# frame, so adding the `label` column afterwards does not raise
# SettingWithCopyWarning.
df = df[df["pathway"].notna()].copy()

In [None]:
# Each `node_type` cell is parsed as a Python literal (presumably a list of
# label strings -- confirm against the TSV) and joined with ":" to form the
# Cypher label expression. `ast.literal_eval` replaces `eval` so that content
# of the TSV can never be executed as code.
import ast

df['label'] = df["node_type"].apply(lambda x: ":".join(ast.literal_eval(x)))

In [None]:
# Number of rows per joined label string.
df["label"].value_counts()

In [None]:
# Preview the first rows of the pathway table, including the new `label` column.
df.head()

In [None]:
# One import file and one LOAD CSV run per label combination: nodes are matched
# by label and name, and their `pathway` property is overwritten from the file.
pathway_query_template = '''
    LOAD CSV WITH HEADERS FROM  'file:///{file}' AS line FIELDTERMINATOR '\t'
           MATCH (n:{label}  {{ name:line.name}})
           SET n.pathway = line.pathway
           RETURN n.name, n.pathway, line.pathway
    '''
import_dir = base_path / "data" / "import"

for label, subdf in df.groupby("label"):
    f = f"pathways-{label}.tsv"
    subdf.to_csv(import_dir / f, sep="\t", index=None)

    cy = pathway_query_template.format(label=label, file=f)
    qr = graph.run(cy)

    # True when the query returned exactly one record per exported row.
    returned_rows = len(qr.data())
    print(label, "\t\t\t", returned_rows==subdf.shape[0])

## Add species to reactions

In [None]:
# Parsed edges sheet; the first column of the file becomes the index.
edges_file = parsed_path / "edges-sheet.tsv"
df_edges = pd.read_csv(edges_file, sep="\t", index_col=0)

In [None]:
# Export the (reaction_id, species) pairs for the LOAD CSV query that follows.
f = "reaction_species.tsv"
reaction_species = df_edges.loc[:, ['reaction_id', 'species']]
reaction_species.to_csv(base_path / "data" / "import" / f, sep="\t", index=None)

In [None]:
# Frequency of each `species` value. The Series method replaces the deprecated
# top-level `pd.value_counts` and returns the same counts.
df_edges['species'].value_counts()

In [None]:
# For every row of the exported file, find the Reaction node whose name equals
# `reaction_id` and store the comma-split `species` value as a list property.
# `f` is the file name set by the export cell above.
cy = '''
LOAD CSV WITH HEADERS FROM  'file:///{file}' AS line FIELDTERMINATOR '\t'
       MATCH (n:{label}  {{ name:line.reaction_id}})
       SET n.species = split(line.species, ',')
       RETURN n.name, n.species, line.species
'''.format(label="Reaction", file=f)

graph.run(cy)

# FunctionalCluster annotations from Ziva file

### metabolites

In [None]:
# Show the path of the metabolite annotation file. It includes the "GMM"
# sub-directory so that it matches the path actually read in the next cell.
base_path / "data" / "raw" / "GMM" / "gmmmeta_20211011.tsv"

In [None]:
# Header-less TSV: only columns 2 and 8 are loaded, as `name` and `chebi`.
gmm_metabolites_file = base_path / "data" / "raw" / "GMM" / "gmmmeta_20211011.tsv"
df = pd.read_csv(
    gmm_metabolites_file,
    sep="\t",
    header=None,
    usecols=[2, 8],
    names=["name", "chebi"],
)

In [None]:
# Drop metabolites without a ChEBI entry. `.copy()` detaches the result from
# the unfiltered frame, so rewriting the `chebi` column afterwards does not
# raise SettingWithCopyWarning.
df = df[df["chebi"].notna()].copy()

In [None]:
def _normalise_chebi(cell):
    """Turn a '|'-separated cell into a comma-separated, lower-cased, trimmed list."""
    entries = [entry.strip().lower() for entry in cell.split("|")]
    return ','.join(entries)


df["chebi"] = df["chebi"].apply(_normalise_chebi)

In [None]:
# Preview the metabolite names with their normalised ChEBI lists.
df.head()

In [None]:
# Write the metabolite annotations where the LOAD CSV query below reads them.
f = "gmm_annot_Metabolite.tsv"
metabolite_import_file = base_path / "data" / "import" / f
df.to_csv(metabolite_import_file, sep="\t", index=None)

In [None]:
# Append the ChEBI identifiers of each file row to `external_links` of every
# Metabolite node whose description or name matches the row's name
# (case-insensitive regular-expression match), de-duplicating the result.
#
# `coalesce(n.external_links, [])` matters for nodes that have no
# `external_links` yet: in Cypher `null + list` evaluates to null, so without
# it those nodes would never receive the new links.
#
# NOTE(review): `line.name` is used as a regular expression without escaping;
# names containing regex metacharacters may match differently than intended.
cy = '''
LOAD CSV WITH HEADERS FROM  'file:///{file}' AS line FIELDTERMINATOR '\t'
       MATCH (n:{label})
       WHERE n.description =~ "(?i)"+line.name
       OR n.name =~ "(?i)"+line.name
       WITH  coalesce(n.external_links, []) + split(line.chebi, ",") as new, n, line
       SET n.external_links = apoc.coll.toSet(new)
       RETURN n.name, n.external_links
'''.format(label="Metabolite", file=f)
qr = graph.run(cy)

In [None]:
# Display the result of the metabolite annotation query.
qr

### srna

In [None]:
# Header-less sRNA table; every column is loaded and keeps its positional name.
gmm_srna_file = base_path / "data" / "raw" / "GMM" / "gmmsrna_20211011.tsv"
df = pd.read_csv(gmm_srna_file, sep="\t", header=None)

In [None]:
# Print each column label, followed by the value frequencies of that column.
for column in df.columns:
    print(column)
    column_counts = df[column].value_counts()
    display(column_counts)

### prot

In [None]:
# Show the values that the reads below pass as `na_values` (treated as missing).
helpers.empty_strings

In [None]:
def _pipe_to_comma(raw):
    """Turn a '|'-separated cell into a comma-separated string; 'NULL' becomes ''."""
    if raw == 'NULL':
        return ''
    return ','.join([part.strip() for part in raw.split("|")])


# Header-less protein table: columns 2, 4, 7 and 8 are loaded under the names below.
gmm_protein_file = base_path / "data" / "raw" / "GMM" / "gmmprot_20211011.tsv"
df1 = pd.read_csv(
    gmm_protein_file,
    sep="\t",
    header=None,
    usecols=[2, 4, 7, 8],
    names=["identifier", "description", "short_name", "synonyms"],
    na_values=helpers.empty_strings,
    converters={"synonyms": _pipe_to_comma},
)

# Trim whitespace in every column, blank out missing values, index by identifier.
df1 = df1.apply(lambda column: column.str.strip())
df1 = df1.fillna('').set_index("identifier")

# "description" becomes "gmm_description1" so it stays distinguishable after the joins.
df1.columns = ["gmm_description1", "short_name", "synonyms"]

df1.head()

In [None]:
# First six columns of the "defGMM" sheet, read as strings; '|'-separated
# synonyms are converted to a comma-separated string ('NULL' becomes '').
pis_model_file = base_path / "data" / "raw" / "v2.7.9_PIS-model.xlsx"
split_gmm_synonyms = lambda x: '' if x =='NULL' else ','.join([s.strip() for s in x.split("|")])

df2 = pd.read_excel(
    pis_model_file,
    sheet_name="defGMM",
    header=[0],
    dtype=str,
    usecols=[0, 1, 2, 3, 4, 5],
    converters={"GMM:Synonyms": split_gmm_synonyms},
)

# Trim whitespace in every column, blank out missing values, index by GeneID.
df2 = df2.apply(lambda column: column.str.strip())
df2 = df2.fillna('').set_index("GeneID")

# Positional rename of the five remaining columns.
df2.columns = [
    'gmm_ocd_all',
    'gmm_ocd_plaza',
    'gmm_description',
    'GMM:ShortName',
    'GMM:Synonyms',
]

df2.head()

In [None]:
# Selected columns of the "Components" sheet (its header is on the second
# row), read as strings.
df3 = pd.read_excel(base_path / "data" /"raw" / "v2.7.9_PIS-model.xlsx",
                    sheet_name="Components",
                    header=[1],
                    dtype=str,
                    na_values=helpers.empty_strings,
                    usecols=[4, 7, 8, 9, 11, 12],
)
# Trim whitespace in every column and blank out missing values.
for c in df3.columns:
    df3[c] = df3[c].str.strip()
df3.fillna('', inplace=True)

# Keep only plant_coding components. `.copy()` makes the filtered frame
# independent, so the column assignment below does not raise
# SettingWithCopyWarning.
df3 = df3[df3['NodeType']=='plant_coding'].copy()

# '|'-separated synonyms -> comma-separated string ('NULL' becomes '').
df3["GMM:Synonyms"] = df3["GMM:Synonyms"].apply(lambda x: '' if x =='NULL' else ','.join([s.strip() for s in x.split("|")]))

# Select the columns used downstream and give them pis_* names.
df3 = df3[["NodeID", "NodeName",  "GMM:Synonyms", "NodeDescription", "AdditionalInfo"]]
df3.columns = ["NodeID", "pis_shortname", "pis_synonyms", "pis_description",  "additional_information"]

df3.set_index("NodeID", inplace=True)

df3.head()

In [None]:
# Outer-join the three annotation sources on their identifier index; cells
# missing from a source become empty strings.
gmm_and_sheet = df1.join(df2, how='outer')
df = gmm_and_sheet.join(df3, how='outer').fillna('')

In [None]:
# Size of the joined annotation table (rows, columns).
df.shape

In [None]:
# Preview the joined annotation table.
df.head()

In [None]:
# Spot check: joined annotations for identifier AT3G63110.
df.loc['AT3G63110']

In [None]:
def get_gmm_description_one(x):
    """Pick the GMM description for one row.

    The description from the Ziva file (``gmm_description1``) wins whenever it
    is not the empty string; otherwise the description from the extra sheet in
    PIS (``gmm_description``) is returned, even if that one is empty too.
    """
    ziva_description = x["gmm_description1"]
    sheet_description = x["gmm_description"]
    return ziva_description if ziva_description != '' else sheet_description



# Row-wise choice of the GMM description from the candidate description columns.
description_sources = df[["gmm_description1", "gmm_description", "pis_description"]]
df["new_gmm_description"] = description_sources.apply(get_gmm_description_one, axis=1)

In [None]:
def get_description_one(x):
    """Pick the node description for one row.

    Sources are tried in priority order and the first non-empty one is
    returned: the PIS NodeDescription (``pis_description``), then the extra
    sheet in PIS (``gmm_description``). If both are empty, the Ziva file
    description (``gmm_description1``) is returned as is.
    """
    for source in ("pis_description", "gmm_description"):
        candidate = x[source]
        if candidate != '':
            return candidate
    return x["gmm_description1"]


# Row-wise choice of the node description from the three description columns.
description_columns = ["gmm_description1", "gmm_description", "pis_description"]
df["new_description"] = df[description_columns].apply(get_description_one, axis=1)

In [None]:
def get_set(x):
    """Merge every synonym source of one row into a comma-separated string.

    The comma-separated lists in ``synonyms``, ``GMM:Synonyms`` and
    ``pis_synonyms`` are combined with the two short names
    (``pis_shortname``, ``short_name``); duplicates and empty strings are
    dropped.

    Parameters
    ----------
    x : mapping
        Row providing the five string fields named above.

    Returns
    -------
    str
        The unique entries in sorted order, joined with ",".
    """
    candidates = (
        x["synonyms"].split(",")
        + x["GMM:Synonyms"].split(",")
        + x["pis_synonyms"].split(",")
        + [x["pis_shortname"], x["short_name"]]
    )

    unique = set(candidates)
    unique.discard('')

    # Sorted so the exported value is reproducible: the iteration order of a
    # set of strings can differ from one interpreter run to the next.
    return ','.join(sorted(unique))

# Row-wise merge of the short names and the three synonym lists.
synonym_columns = ["pis_shortname", "short_name", "synonyms", "GMM:Synonyms", "pis_synonyms"]
df["new_synonyms"] = df[synonym_columns].apply(get_set, axis=1)

In [None]:
# Inspect the last rows, including the derived new_* columns.
df.tail()

In [None]:
# Keep only the merged columns and drop the "new_" prefix from their names.
final_names = {
    "new_description": "description",
    "new_gmm_description": "gmm_description",
    "new_synonyms": "synonyms",
    "additional_information": "additional_information",
}
df = df[list(final_names)].rename(columns=final_names)

In [None]:
# Spot check: merged annotations for identifier AT5G38450.
df.loc['AT5G38450']

In [None]:
# Spot check: merged annotations for identifier Os11g0104300.
df.loc['Os11g0104300']

In [None]:
# Spot check: merged annotations for identifier LOC_Os02g36974.
df.loc['LOC_Os02g36974']

In [None]:
# Look up FunctionalCluster nodes whose ath_homologues list has exactly one
# entry and contains AT3G03990.
cy = """
    MATCH (n:FunctionalCluster)
    WHERE size(n.ath_homologues) = 1
    AND "AT3G03990" IN n.ath_homologues
    RETURN n, size(n.ath_homologues)
"""

qr = graph.run(cy)
qr

In [None]:
# Count FunctionalCluster:PlantCoding nodes with exactly one ath homologue and
# no sly, stu or osa homologue lists.
cy = """
    MATCH (n:FunctionalCluster:PlantCoding)
    WHERE size(n.ath_homologues) = 1
    AND n.sly_homologues IS NULL
    AND n.stu_homologues IS NULL
    AND n.osa_homologues IS NULL
    RETURN count(*)
"""

qr = graph.run(cy)
qr

In [None]:
# Count FunctionalCluster:PlantCoding nodes with exactly one sly homologue and
# no ath, stu or osa homologue lists.
cy = """
    MATCH (n:FunctionalCluster:PlantCoding)
    WHERE size(n.sly_homologues) = 1
    AND n.ath_homologues IS NULL
    AND n.stu_homologues IS NULL
    AND n.osa_homologues IS NULL    
    RETURN count(*)
"""

qr = graph.run(cy)
qr

In [None]:
# Count FunctionalCluster:PlantCoding nodes with exactly one stu homologue and
# no sly, ath or osa homologue lists.
cy = """
    MATCH (n:FunctionalCluster:PlantCoding)
    WHERE size(n.stu_homologues) = 1
    AND n.sly_homologues IS NULL
    AND n.ath_homologues IS NULL
    AND n.osa_homologues IS NULL    
    RETURN count(*)
"""

qr = graph.run(cy)
qr

In [None]:
# Count FunctionalCluster:PlantCoding nodes with exactly one osa homologue and
# no sly, stu or ath homologue lists.
cy = """
    MATCH (n:FunctionalCluster:PlantCoding)
    WHERE size(n.osa_homologues) = 1
    AND n.sly_homologues IS NULL
    AND n.stu_homologues IS NULL
    AND n.ath_homologues IS NULL    
    RETURN count(*)
"""

qr = graph.run(cy)
qr

In [None]:
# Name the index so that reset_index() yields an "identifier" column in the export.
df.index = df.index.rename('identifier')

In [None]:
# Species key -> identifier prefix used to select its rows.
# 'osa' is not exported: its identifiers in the table look like LOC_Os????.
species_prefixes = [
    ('ath', "AT"),
    ('sly', "Soly"),
    ('stu', 'Sotub'),
]
import_dir = base_path / "data" / "import"

# Species key -> name of the import file written for it.
per_species = {}
for species, prefix in species_prefixes:
    has_prefix = df.index.str.startswith(prefix)
    tdf = df[has_prefix]
    print(species, tdf.shape[0])

    f = f"gmm_annot_Prot-{species}.tsv"
    tdf.reset_index().to_csv(import_dir / f, sep="\t", index=None)
    per_species[species]  = f

In [None]:
#x_found['new_synonyms'] = x_found['n.name'].apply(lambda x : x.split("[")[0])
def get_synonyms(x):
    """Build the synonym string for one matched node.

    Starts from ``short_name``, adds ``line.short_name`` and the
    comma-separated entries of ``line.synonyms`` when those are truthy, then
    trims and upper-cases every entry, removes duplicates and empty strings,
    and returns the entries sorted and joined with ",".
    """
    collected = [x['short_name']]

    file_short_name = x['line.short_name']
    if file_short_name:
        collected.append(file_short_name)

    file_synonyms = x['line.synonyms']
    if file_synonyms:
        collected.extend(file_synonyms.split(','))

    unique = {entry.strip().upper() for entry in collected}
    unique.discard('')

    return ','.join(sorted(unique))
    
def combine_exists_and_found(species):
    """Collect GMM annotations for single-homologue FunctionalCluster nodes.

    Runs a LOAD CSV query over the import file registered for ``species`` in
    the notebook-level ``per_species`` dict. A node is kept when its
    ``<species>_homologues`` list has exactly one entry, its homologue lists
    for the other three species are NULL, and an entry of the list matches
    ``line.identifier`` (case-insensitive regular-expression match).

    Parameters
    ----------
    species : str
        Species key; must be present in ``per_species`` (KeyError otherwise).

    Returns
    -------
    pandas.DataFrame
        Columns ``name``, ``short_name``, ``synonyms``, ``description`` and
        ``gmm_description``. When the query matches nothing, an empty frame
        with these columns is returned instead of raising KeyError.
    """
    result_columns = ['name', 'short_name', 'synonyms', 'description', 'gmm_description']
    f = per_species[species]

    # One "IS NULL" condition per other species, spliced into the WHERE clause.
    s_exc = ''
    for sp2 in ["ath", "osa", "sly", "stu"]:
        if sp2 != species:
            s_exc += f"AND n.{sp2}_homologues IS NULL\n"

    cy = '''
    LOAD CSV WITH HEADERS FROM  'file:///{file}' AS line FIELDTERMINATOR '\t'
           MATCH (n:{label})
           WHERE size(n.{species}_homologues) = 1
           {s_exc}
           AND ANY ( item IN n.{species}_homologues WHERE item =~ "(?i)" + line.identifier )
           RETURN n.name, line.short_name, n.description, line.description, n.synonyms, line.synonyms, line.gmm_description 
    '''.format(label="FunctionalCluster", file=f, species=species, s_exc=s_exc)
    print(cy)
    qr = graph.run(cy)
    x_found = pd.DataFrame(qr.data())
    print(len(x_found))

    if x_found.empty:
        # A frame built from zero records has no columns at all, so the
        # column look-ups below would fail; hand back the expected schema.
        return pd.DataFrame(columns=result_columns)

    # The short name is the part of the node name before the first "[".
    x_found['short_name'] = x_found['n.name'].apply(lambda x : x.split("[")[0])
    x_found['synonyms'] = x_found[['line.short_name', 'line.synonyms', 'short_name']].apply(get_synonyms, axis=1)

    x_found['name'] = x_found['n.name']
    x_found['description'] = x_found['line.description']
    x_found['gmm_description'] = x_found['line.gmm_description']

    x_found = x_found[result_columns]

    return x_found

def add_properties(species, df):
    """Write the combined annotations of one species back to the graph.

    ``df`` is saved as ``gmm_annot_Prot-<species>-combined.tsv`` in the import
    directory. A LOAD CSV query then matches FunctionalCluster nodes by
    ``name`` and overwrites their ``short_name``, ``synonyms`` (split on ","),
    ``description`` and ``gmm_description`` properties.

    Parameters
    ----------
    species : str
        Species key; only used to build the import file name.
    df : pandas.DataFrame
        Frame with the columns ``name``, ``short_name``, ``synonyms``,
        ``description`` and ``gmm_description``.

    Returns
    -------
    The object returned by ``graph.run``; its records hold the names of the
    updated nodes.
    """
    f = f"gmm_annot_Prot-{species}-combined.tsv"

    df.to_csv(base_path / "data" / "import" / f, sep="\t", index=None)

    # Only {file} and {label} occur in the template. The former `species` and
    # `s_exc` format arguments were unused, and `s_exc` is not defined in this
    # function, so evaluating it raised NameError on a fresh kernel.
    cy = '''
    LOAD CSV WITH HEADERS FROM  'file:///{file}' AS line FIELDTERMINATOR '\t'
           MATCH (n:{label} {{name:line.name}})
           SET n.short_name = line.short_name
           SET n.synonyms = split(line.synonyms, ',')
           SET n.description = line.description
           SET n.gmm_description = line.gmm_description
           RETURN n.name 
    '''.format(label="FunctionalCluster", file=f)
    print(cy)
    qr = graph.run(cy)

    return qr

In [None]:
# Match the 'ath' annotations against the graph, write them back, and print
# how many records the update query returned.
x_found = combine_exists_and_found("ath")
qr = add_properties('ath', x_found)
print(len(qr.data()))

In [None]:
# List the names of FunctionalCluster:PlantCoding nodes with exactly one stu
# homologue and no sly, ath or osa homologue lists.
cy = """
    MATCH (n:FunctionalCluster:PlantCoding)
    WHERE size(n.stu_homologues) = 1
    AND n.sly_homologues IS NULL
    AND n.ath_homologues IS NULL
    AND n.osa_homologues IS NULL    
    RETURN n.name
"""

qr = graph.run(cy)
qr.data()

In [None]:
# Match the 'stu' annotations against the graph, write them back, and print
# how many records the update query returned.
x_found = combine_exists_and_found("stu")
qr = add_properties('stu', x_found)
print(len(qr.data()))

In [None]:
# Only the matching step runs for 'sly'; writing the properties back is
# disabled in the commented-out lines below.
x_found = combine_exists_and_found("sly")
# #qr = add_properties('sly', x_found)
# #print(len(qr.data()))

In [None]:
# Names of FunctionalCluster nodes that have no homologue list for any of the
# four species.
cy = """
    MATCH (n:FunctionalCluster)
    WHERE n.stu_homologues IS NULL
    AND n.sly_homologues IS NULL
    AND n.ath_homologues IS NULL
    AND n.osa_homologues IS NULL    
    RETURN n.name
"""

qr = graph.run(cy)
qr

In [None]:
# Load the query records into a frame; this rebinds `df`, replacing the
# annotation table built earlier.
df = pd.DataFrame(qr.data())

In [None]:
def _text_before_bracket(node_name):
    """Return the part of ``node_name`` that precedes the first "["."""
    return node_name.split("[")[0]


df['short_name'] = df['n.name'].map(_text_before_bracket)

In [None]:
# Positional rename: "n.name" -> "name"; "short_name" keeps its name.
df.columns = ["name", "short_name"]

In [None]:
# Preview the node names with their derived short names.
df.head()

In [None]:
# The frame holds FunctionalCluster nodes without homologues in any species,
# so the import file gets a fixed name. It previously borrowed `species`, a
# loop variable left over from the per-species export cell, which tied the
# file name to whichever species that loop happened to process last.
f = "gmm_annot_Prot-nonname.tsv"

df.to_csv(base_path / "data" / "import" / f, sep="\t", index=False)

# Match FunctionalCluster nodes by name and set their short_name from the file.
cy = '''
LOAD CSV WITH HEADERS FROM  'file:///{file}' AS line FIELDTERMINATOR '\t'
       MATCH (n:{label} {{name:line.name}})
       SET n.short_name = line.short_name
       RETURN n.name 
'''.format(label="FunctionalCluster", file=f)
print(cy)
qr = graph.run(cy)


In [None]:
# Display the result of the short_name update query.
qr