In [1]:
version = "v0.1.0"

In [2]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:80% !important; }</style>"))

# Import neo4j DB: 7/?

Code to translate v2.7.9_PIS-model.xlsx to neo4j database. 

## Setup

In [3]:
from collections import defaultdict

In [4]:
import pandas as pd
import re
import numpy as np
import os

In [5]:
from py2neo import Graph, Node, Relationship

In [6]:
import helpers

In [7]:
from importlib import reload

Connect to graph via docker-compose link. See http://localhost:7474/browser/

In [8]:
graph = Graph(host="neo4j")

In [9]:
from pathlib import Path

base_path = Path("..")
parsed_path = base_path / "data" / "parsed"

## MK identifier

In [10]:
# FIXED in sheet
# cy = """
#     MATCH p=(r {name:"rx00308"})--(n {family:"MYB"})
#     SET n.name = 'MYB33[Sotub06g030530.1.1]'
#     SET n.stu_homolugues = ['Sotub06g030530.1.1']
#     SET n._identifiers = 'Sotub06g030530.1.1'
# """
# graph.run(cy)

In [11]:
cy = """
    MATCH p=(r {name:"rx00308"})--(n {family:"MYB"})
    RETURN n
"""

graph.run(cy)

 n                                                                                                                               
---------------------------------------------------------------------------------------------------------------------------------
 (_471:FunctionalCluster:PlantCoding {family: 'MYB', name: 'MYB33[SOTUB06G030530.1.1]', stu_homologues: ['SOTUB06G030530.1.1']}) 
 (_471:FunctionalCluster:PlantCoding {family: 'MYB', name: 'MYB33[SOTUB06G030530.1.1]', stu_homologues: ['SOTUB06G030530.1.1']}) 

## Pathways

In [18]:
df = pd.read_csv(base_path / "data" / "raw" / "pss-pathways - neo4j-for-CB.tsv", sep="\t")

In [19]:
df = df[~df["pathway"].isna()]

In [20]:
df['label'] = df["node_type"].apply(lambda x: ":".join(eval(x)))

In [21]:
df.label.value_counts()

FunctionalCluster:PlantCoding       229
Metabolite                          116
Complex                              83
ForeignCoding                        14
FunctionalCluster:PlantNonCoding     10
FunctionalCluster:PlantAbstract       8
Process                               6
ForeignEntity                         3
Name: label, dtype: int64

In [22]:
df.head()

Unnamed: 0,name,node_type,pathway,label
2,"AGO1,5,7,10|CI","[""Complex""]",P:Silencing,Complex
3,"AGO1,5,7,10|HC-Pro","[""Complex""]",P:Silencing,Complex
4,ATPB|HC-Pro,"[""Complex""]",P:Secondary-metabolism_Terpenoids,Complex
5,BAK1|FLS2|flg22,"[""Complex""]",Hormone:BS,Complex
6,BIK1|Ca2+,"[""Complex""]",S:Ca,Complex


In [24]:
for label, subdf in df.groupby("label"):
    f = f"pathways-{label}.tsv"
    subdf.to_csv(base_path / "data" / "import" / f, sep="\t", index=None)
    cy = '''
    LOAD CSV WITH HEADERS FROM  'file:///{file}' AS line FIELDTERMINATOR '\t'
           MATCH (n:{label}  {{ name:line.name}})
           SET n.pathway = line.pathway
           RETURN n.name, n.pathway, line.pathway
    '''.format(label=label, file=f)
#     print(cy)
    qr = graph.run(cy)
#     print(qr)
    print(label, "\t\t\t", len(qr.data())==subdf.shape[0])

Complex 			 True
ForeignCoding 			 True
ForeignEntity 			 True
FunctionalCluster:PlantAbstract 			 True
FunctionalCluster:PlantCoding 			 True
FunctionalCluster:PlantNonCoding 			 True
Metabolite 			 True
Process 			 True


## Add species to reactions

In [166]:
df_edges = pd.read_csv(parsed_path / "edges-sheet.tsv", sep="\t", index_col=0)

In [174]:
f = "reaction_species.tsv"
df_edges[['reaction_id', 'species']].to_csv(base_path / "data" / "import" / f, sep="\t", index=None)

In [178]:
pd.value_counts(df_edges['species'])

ath                342
ath,nta             11
stu                 10
osa                  8
all                  7
ath,nbe              5
ath,stu              4
ath,osa              3
ath,osa,psa          2
ath,osa,phy          1
ath,osa,sly,zma      1
Name: species, dtype: int64

In [179]:
cy = '''
LOAD CSV WITH HEADERS FROM  'file:///{file}' AS line FIELDTERMINATOR '\t'
       MATCH (n:{label}  {{ name:line.reaction_id}})
       SET n.species = split(line.species, ',')
       RETURN n.name, n.species, line.species
'''.format(label="Reaction", file=f)

graph.run(cy)

 n.name  | n.species | line.species 
---------|-----------|--------------
 rx00001 | ['ath']   | ath          
 rx00002 | ['ath']   | ath          
 rx00003 | ['ath']   | ath          

# FunctionalCluster annotations from Ziva file

### metabolites

In [25]:
base_path / "data" /"raw" / "gmmmeta_20211011.tsv"

PosixPath('../data/raw/gmmmeta_20211011.tsv')

In [26]:
df = pd.read_csv(base_path / "data" /"raw" / "GMM" / "gmmmeta_20211011.tsv", sep="\t", header=None, 
                 usecols=[2, 8],
                names=["name", "chebi"])

In [27]:
df = df[~df["chebi"].isna()]

In [28]:
df["chebi"] = df["chebi"].apply(lambda x: ','.join([s.strip().lower() for s in x.split("|")]))

In [29]:
df.head()

Unnamed: 0,name,chebi
2,1-aminocyclopropanecarboxylate,chebi:30526
4,1-deoxy-d-xylulose 5-phosphate,"chebi:16493,chebi:57792,chebi:57792"
12,15-cis-phytoene,chebi:27787
18,2-aminoadipic acid,chebi:37024
19,2-aminobenzoate,chebi:16567


In [30]:
f = "gmm_annot_Metabolite.tsv"
df.to_csv(base_path / "data" / "import" / f, sep="\t", index=None)

In [33]:
cy = '''
LOAD CSV WITH HEADERS FROM  'file:///{file}' AS line FIELDTERMINATOR '\t'
       MATCH (n:{label})
       WHERE n.description =~ "(?i)"+line.name
       OR n.name =~ "(?i)"+line.name
       WITH  n.external_links + split(line.chebi, ",") as new, n, line
       SET n.external_links = apoc.coll.toSet(new)
       RETURN n.name, n.external_links
'''.format(label="Metabolite", file=f)
#     print(cy)
qr = graph.run(cy)

In [34]:
qr

 n.name | n.external_links                               
--------|------------------------------------------------
 ADP    | ['chebi:16761', 'chebi:456216']                
 AMP    | ['chebi:16027', 'chebi:28971', 'chebi:456215'] 
 ATP    | ['chebi:15422', 'chebi:30616']                 

### srna

In [35]:
df = pd.read_csv(base_path / "data" /"raw" / "GMM" / "gmmsrna_20211011.tsv", sep="\t", header=None, 
                 #usecols=[2, 8],
                #names=["name", "chebi"]
                )

In [36]:
for c in df.columns:
    print(c)
    display(df[c].value_counts())

0


2       1
675     1
671     1
669     1
667     1
       ..
1304    1
1302    1
1300    1
1298    1
1965    1
Name: 0, Length: 1966, dtype: int64

1


2    1966
Name: 1, dtype: int64

2


novel-miR8157-5p    1
phasiRNA2241        1
phasiRNA1042        1
phasiRNA1208        1
novel-miR8244-5p    1
                   ..
phasiRNA1204        1
phasiRNA2091        1
phasiRNA2101        1
phasiRNA335         1
phasiRNA1664        1
Name: 2, Length: 1966, dtype: int64

3


2017-10-13 00:05:00    1966
Name: 3, dtype: int64

4


novel-miR8157-5p    1
novel-miR8260-3p    1
phasiRNA772         1
phasiRNA2336        1
phasiRNA2241        1
                   ..
phasiRNA1204        1
phasiRNA2091        1
phasiRNA2101        1
phasiRNA335         1
phasiRNA1664        1
Name: 4, Length: 1966, dtype: int64

5


1    1966
Name: 5, dtype: int64

6


Series([], Name: 6, dtype: int64)

7


Series([], Name: 7, dtype: int64)

8


Series([], Name: 8, dtype: int64)

### prot

In [280]:
helpers.empty_strings

['-', '?', '[empty]', 'nan', 'n.a.', nan, '[undefined]', 'NULL', '']

In [344]:
df1 = pd.read_csv(base_path / "data" /"raw" / "GMM" / "gmmprot_20211011.tsv", sep="\t", header=None, 
                 usecols=[2, 4, 7, 8], 
                 names=["identifier", "description", "short_name", "synonyms"], 
                 na_values=helpers.empty_strings, 
                 converters={"synonyms": lambda x: '' if x =='NULL' else ','.join([s.strip() for s in x.split("|")])}
                )
for c in df1.columns:
    df1[c] = df1[c].str.strip()
df1.fillna('', inplace=True)
df1.set_index("identifier", inplace=True)


df1.columns = ["gmm_description1", "short_name", "synonyms"]

df1.head()

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


Unnamed: 0_level_0,gmm_description1,short_name,synonyms
identifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
AT1G01010,NAC domain containing protein 1,NAC001,"ANAC001,NAC001,NTL10"
AT1G01020,ARV1 family protein,ARV1,ARV1
AT1G01030,AP2/B3-like transcriptional factor family protein,NGA3,NGA3
AT1G01040,dicer-like 1,DCL1,"ASU1,ATDCL1,CAF,DCL1,EMB60,EMB76,SIN1,SUS1"
AT1G01046,microRNA 838A,MIR838A,


In [349]:
df2 = pd.read_excel(base_path / "data" /"raw" / "v2.7.9_PIS-model.xlsx", 
                    sheet_name="defGMM", 
                    header=[0], 
                    dtype=str, 
                    usecols=[0, 1, 2, 3, 4, 5],
                    converters={"GMM:Synonyms": lambda x: '' if x =='NULL' else ','.join([s.strip() for s in x.split("|")])}
)
for c in df2.columns:
    df2[c] = df2[c].str.strip()
df2.fillna('', inplace=True)
df2.set_index("GeneID", inplace=True)

df2.columns = ['gmm_ocd_all', 'gmm_ocd_plaza', 'gmm_description', 'GMM:ShortName',
       'GMM:Synonyms']


df2.head()

  data = io.parse(


Unnamed: 0_level_0,gmm_ocd_all,gmm_ocd_plaza,gmm_description,GMM:ShortName,GMM:Synonyms
GeneID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
AT1G01010,OCD_all_012402,OCD_PLAZA_011642,NAC domain containing protein 1,NAC001,"ANAC001,NAC001,NTL10"
AT1G01020,OCD_all_006935,OCD_PLAZA_005240,ARV1 family protein,ARV1,ARV1
AT1G01030,OCD_all_001050,OCD_PLAZA_024590,AP2/B3-like transcriptional factor family protein,NGA3,NGA3
AT1G01040,OCD_all_000310,OCD_PLAZA_001847,dicer-like 1,DCL1,"ASU1,ATDCL1,CAF,DCL1,EMB60,EMB76,SIN1,SUS1"
AT1G01046,OCD_all_146545,OCD_PLAZA_162139,microRNA 838A,MIR838A,


In [369]:
df3 = pd.read_excel(base_path / "data" /"raw" / "v2.7.9_PIS-model.xlsx", 
                    sheet_name="Components", 
                    header=[1], 
                    dtype=str, 
                    na_values=helpers.empty_strings,
                    usecols=[4, 7, 8, 9, 11, 12],
)
for c in df3.columns:
    df3[c] = df3[c].str.strip()
df3.fillna('', inplace=True)

df3 = df3[df3['NodeType']=='plant_coding']

df3["GMM:Synonyms"] = df3["GMM:Synonyms"].apply(lambda x: '' if x =='NULL' else ','.join([s.strip() for s in x.split("|")]))

df3 = df3[["NodeID", "NodeName",  "GMM:Synonyms", "NodeDescription", "AdditionalInfo"]]
df3.columns = ["NodeID", "pis_shortname", "pis_synonyms", "pis_description",  "additional_information"]

df3.set_index("NodeID", inplace=True)

df3.head()

Unnamed: 0_level_0,pis_shortname,pis_synonyms,pis_description,additional_information
NodeID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
AT1G02500,SAM1,"AtSAM1,MAT1,METK1,SAM-1,SAM1",SAM synthetase [EC:2.5.1.6],
AT4G01850,SAM2,"AtSAM2,MAT2,SAM-2,SAM2",SAM synthetase [EC:2.5.1.6],
AT2G36880,SAM3,"MAT3,METK3",SAM synthetase [EC:2.5.1.6],
AT3G17390,SAM4,"MAT4,METK4,MTO3,SAMS3,SAM4",SAM synthetase [EC:2.5.1.6],
AT3G61510,ACS1,"ACC2,ACS1,AT-ACS1",ACC synthase [EC:4.4.1.14],


In [386]:
df = df1.join(df2, how='outer').join(df3, how='outer')
df.fillna('', inplace=True)

In [387]:
df.shape

(488956, 12)

In [388]:
df.head()

Unnamed: 0,gmm_description1,short_name,synonyms,gmm_ocd_all,gmm_ocd_plaza,gmm_description,GMM:ShortName,GMM:Synonyms,pis_shortname,pis_synonyms,pis_description,additional_information
007A03AF.esd,,,,,,,,,,,,
027A04AF.esd,hypothetical protein [Neurospora crassa] gb_EA...,,,,,,,,,,,
027D07AF.esd,,,,,,,,,,,,
028C01AF.esd,,,,,,,,,,,,
032H03AF.esd,,,,,,,,,,,,


In [389]:
df.loc['AT3G63110']

gmm_description1                                   isopentenyltransferase 3
short_name                                                             IPT3
synonyms                                                        ATIPT3,IPT3
gmm_ocd_all                                                  OCD_all_001709
gmm_ocd_plaza                                              OCD_PLAZA_010676
gmm_description                                    isopentenyltransferase 3
GMM:ShortName                                                          IPT3
GMM:Synonyms                                                    ATIPT3,IPT3
pis_shortname                                                          IPT3
pis_synonyms                                                    ATIPT3,IPT3
pis_description           adenylate dimethylallyltransferase EC:2.5.1.27...
additional_information                             isopentenyltransferase 3
Name: AT3G63110, dtype: object

In [390]:
def get_gmm_description_one(x):
    zd = x["gmm_description1"] #ziva file
    fd = x["gmm_description"]  #xtra sheet in PIS
    
    if zd != '':
        return zd
    else:
        return fd



df["new_gmm_description"] = df[["gmm_description1", "gmm_description", "pis_description"]].apply(get_gmm_description_one, axis=1)

In [391]:
def get_description_one(x):
    zd = x["gmm_description1"] #ziva file
    fd = x["gmm_description"]  #xtra sheet in PIS
    lr = x["pis_description"]  #pis NodeDescription
    
    if lr != '':
        return lr
    elif fd != '':
        return fd
    else:
        return zd


df["new_description"] = df[["gmm_description1", "gmm_description", "pis_description"]].apply(get_description_one, axis=1)

In [392]:
def get_set(x):
    zd = x["synonyms"]
    fd = x["GMM:Synonyms"]
    lr = x["pis_synonyms"]

    s = set(zd.split(",") + fd.split(",") + lr.split(",") + [x["pis_shortname"], x["short_name"]])
    s.discard('')
    
    return ','.join(list(s))

df["new_synonyms"] = df[["pis_shortname", "short_name", "synonyms", "GMM:Synonyms", "pis_synonyms"]].apply(get_set, axis=1)

In [393]:
df.tail()

Unnamed: 0,gmm_description1,short_name,synonyms,gmm_ocd_all,gmm_ocd_plaza,gmm_description,GMM:ShortName,GMM:Synonyms,pis_shortname,pis_synonyms,pis_description,additional_information,new_gmm_description,new_description,new_synonyms
tr|W5IC80|W5IC80_WHEAT,Uncharacterized protein,,,,,,,,,,,,Uncharacterized protein,Uncharacterized protein,
tr|W5ICC7|W5ICC7_WHEAT,Uncharacterized protein,,,,,,,,,,,,Uncharacterized protein,Uncharacterized protein,
tr|W5QKY9|W5QKY9_WHEAT,Chalcone-flavonone isomerase family protein,,,,,,,,,,,,Chalcone-flavonone isomerase family protein,Chalcone-flavonone isomerase family protein,
tr|W6EK34|W6EK34_WHEAT,S-adenosylmethionine decarboxylase proenzyme,,,,,,,,,,,,S-adenosylmethionine decarboxylase proenzyme,S-adenosylmethionine decarboxylase proenzyme,
tr|X5DA31|X5DA31_WHEAT,Squamosa promoter-binding-like protein 21,,,,,,,,,,,,Squamosa promoter-binding-like protein 21,Squamosa promoter-binding-like protein 21,


In [394]:
df = df[["new_description", "new_gmm_description", "new_synonyms", "additional_information"]]
df.columns = ["description", "gmm_description", "synonyms", "additional_information"]

In [395]:
df.loc['AT5G38450']

description                              cytokinin hydroxylase EC:1.14.13.-
gmm_description           cytochrome P450%2C family 735%2C subfamily A%2...
synonyms                                                           CYP735A1
additional_information                                 Cytochrome P450 35A1
Name: AT5G38450, dtype: object

In [396]:
df.loc['Os11g0104300']

description                                           D53; Protein DWARF 53
gmm_description                                                            
synonyms                                                                D53
additional_information    D53 binds to the complex S-D14-MAX2 to be degr...
Name: Os11g0104300, dtype: object

In [397]:
df.loc['LOC_Os02g36974']

description                                F-box protein GID2
gmm_description           14-3-3 protein, putative, expressed
synonyms                                                 GID2
additional_information                                       
Name: LOC_Os02g36974, dtype: object

In [398]:
cy = """
    MATCH (n:FunctionalCluster)
    WHERE size(n.ath_homologues) = 1
    AND "AT3G03990" IN n.ath_homologues
    RETURN n, size(n.ath_homologues)
"""

qr = graph.run(cy)
qr

 n                                                                                                                                                          | size(n.ath_homologues) 
------------------------------------------------------------------------------------------------------------------------------------------------------------|------------------------
 (_391:FunctionalCluster:PlantCoding {ath_homologues: ['AT3G03990'], family: '&alpha;/&beta; hydroxylase', name: 'D14[AT3G03990]', pathway: 'Hormone:SLs'}) |                      1 

In [399]:
cy = """
    MATCH (n:FunctionalCluster:PlantCoding)
    WHERE size(n.ath_homologues) = 1
    AND n.sly_homologues IS NULL
    AND n.stu_homologues IS NULL
    AND n.osa_homologues IS NULL
    RETURN count(*)
"""

qr = graph.run(cy)
qr

 count(*) 
----------
      134 

In [400]:
cy = """
    MATCH (n:FunctionalCluster:PlantCoding)
    WHERE size(n.sly_homologues) = 1
    AND n.ath_homologues IS NULL
    AND n.stu_homologues IS NULL
    AND n.osa_homologues IS NULL    
    RETURN count(*)
"""

qr = graph.run(cy)
qr

 count(*) 
----------
        0 

In [401]:
cy = """
    MATCH (n:FunctionalCluster:PlantCoding)
    WHERE size(n.stu_homologues) = 1
    AND n.sly_homologues IS NULL
    AND n.ath_homologues IS NULL
    AND n.osa_homologues IS NULL    
    RETURN count(*)
"""

qr = graph.run(cy)
qr

 count(*) 
----------
        8 

In [402]:
cy = """
    MATCH (n:FunctionalCluster:PlantCoding)
    WHERE size(n.osa_homologues) = 1
    AND n.sly_homologues IS NULL
    AND n.stu_homologues IS NULL
    AND n.ath_homologues IS NULL    
    RETURN count(*)
"""

qr = graph.run(cy)
qr

 count(*) 
----------
        5 

In [403]:
df.index.name = 'identifier'

In [404]:
per_species = {}
for species, prefix in [
    ('ath', "AT"),
    #('osa', ), os in table are LOC_Os????
    ('sly', "Soly"), 
    ('stu', 'Sotub', )
     ]:
    tdf = df[df.index.str.startswith(prefix)]
    print(species, tdf.shape[0])
    f = f"gmm_annot_Prot-{species}.tsv"
    tdf.reset_index().to_csv(base_path / "data" / "import" / f, sep="\t", index=None)
    per_species[species]  = f

ath 33341
sly 35768
stu 35004


In [441]:
#x_found['new_synonyms'] = x_found['n.name'].apply(lambda x : x.split("[")[0])
def get_synonyms(x):
    synonyms = [x['short_name']]

    if x['line.short_name']:
        synonyms.append(x['line.short_name'])
    
    if x['line.synonyms']:
        synonyms += x['line.synonyms'].split(',')
    
    synonyms = [x.strip().upper() for x in synonyms]
    
    synonyms = set(synonyms)
    synonyms.discard('')
    
    return ','.join(sorted(list(synonyms)))
    
def combine_exists_and_found(species):
    f = per_species[species]

    s_exc = ''
    for sp2 in ["ath", "osa", "sly", "stu"]:
        if sp2 != species:
            s_exc += f"AND n.{sp2}_homologues IS NULL\n"
    
    
    cy = '''
    LOAD CSV WITH HEADERS FROM  'file:///{file}' AS line FIELDTERMINATOR '\t'
           MATCH (n:{label})
           WHERE size(n.{species}_homologues) = 1
           {s_exc}
           AND ANY ( item IN n.{species}_homologues WHERE item =~ "(?i)" + line.identifier )
           RETURN n.name, line.short_name, n.description, line.description, n.synonyms, line.synonyms, line.gmm_description 
    '''.format(label="FunctionalCluster", file=f, species=species, s_exc=s_exc)
    print(cy)
    qr = graph.run(cy)    
    x_found = pd.DataFrame(qr.data())
    print(len(x_found))
    
    x_found['short_name'] = x_found['n.name'].apply(lambda x : x.split("[")[0])
    x_found['synonyms'] = x_found[['line.short_name', 'line.synonyms', 'short_name']].apply(get_synonyms, axis=1)

    x_found['name'] = x_found['n.name']
    x_found['description'] = x_found['line.description']
    x_found['gmm_description'] = x_found['line.gmm_description']

    
    x_found = x_found[['name', 'short_name', 'synonyms', 'description', 'gmm_description']]
    
    return x_found

def add_properties(species, df):
    f = f"gmm_annot_Prot-{species}-combined.tsv"

    df.to_csv(base_path / "data" / "import" / f, sep="\t", index=None)
    
    cy = '''
    LOAD CSV WITH HEADERS FROM  'file:///{file}' AS line FIELDTERMINATOR '\t'
           MATCH (n:{label} {{name:line.name}})
           SET n.short_name = line.short_name
           SET n.synonyms = split(line.synonyms, ',')
           SET n.description = line.description
           SET n.gmm_description = line.gmm_description
           RETURN n.name 
    '''.format(label="FunctionalCluster", file=f, species=species, s_exc=s_exc)
    print(cy)
    qr = graph.run(cy)    
    
    return qr

In [443]:
x_found = combine_exists_and_found("ath")
qr = add_properties('ath', x_found)
print(len(qr.data()))


    LOAD CSV WITH HEADERS FROM  'file:///gmm_annot_Prot-ath.tsv' AS line FIELDTERMINATOR '	'
           MATCH (n:FunctionalCluster)
           WHERE size(n.ath_homologues) = 1
           AND n.osa_homologues IS NULL
AND n.sly_homologues IS NULL
AND n.stu_homologues IS NULL

           AND ANY ( item IN n.ath_homologues WHERE item =~ "(?i)" + line.identifier )
           RETURN n.name, line.short_name, n.description, line.description, n.synonyms, line.synonyms, line.gmm_description 
    
134

    LOAD CSV WITH HEADERS FROM  'file:///gmm_annot_Prot-ath-combined.tsv' AS line FIELDTERMINATOR '	'
           MATCH (n:FunctionalCluster {name:line.name})
           SET n.short_name = line.short_name
           SET n.synonyms = split(line.synonyms, ',')
           SET n.description = line.description
           SET n.gmm_description = line.gmm_description
           RETURN n.name 
    
134


0


In [438]:
cy = """
    MATCH (n:FunctionalCluster:PlantCoding)
    WHERE size(n.stu_homologues) = 1
    AND n.sly_homologues IS NULL
    AND n.ath_homologues IS NULL
    AND n.osa_homologues IS NULL    
    RETURN n.name
"""

qr = graph.run(cy)
qr.data()

[{'n.name': 'CPS[SOTUB06G034690.1.1]'},
 {'n.name': 'GA20ox.x3[SOTUB09G017710.1.1]'},
 {'n.name': 'GA20ox.x5[SOTUB10G011620.1.1]'},
 {'n.name': 'GA20ox1[SOTUB03G007160.1.1]'},
 {'n.name': 'GA20ox3[SOTUB11G029030.1.1]'},
 {'n.name': 'GA20ox4[SOTUB01G031210.1.1]'},
 {'n.name': 'GA3ox[SOTUB06G023360.1.1]'},
 {'n.name': 'MYB33[SOTUB06G030530.1.1]'}]

In [444]:
x_found = combine_exists_and_found("stu")
qr = add_properties('stu', x_found)
print(len(qr.data()))


    LOAD CSV WITH HEADERS FROM  'file:///gmm_annot_Prot-stu.tsv' AS line FIELDTERMINATOR '	'
           MATCH (n:FunctionalCluster)
           WHERE size(n.stu_homologues) = 1
           AND n.ath_homologues IS NULL
AND n.osa_homologues IS NULL
AND n.sly_homologues IS NULL

           AND ANY ( item IN n.stu_homologues WHERE item =~ "(?i)" + line.identifier )
           RETURN n.name, line.short_name, n.description, line.description, n.synonyms, line.synonyms, line.gmm_description 
    
8

    LOAD CSV WITH HEADERS FROM  'file:///gmm_annot_Prot-stu-combined.tsv' AS line FIELDTERMINATOR '	'
           MATCH (n:FunctionalCluster {name:line.name})
           SET n.short_name = line.short_name
           SET n.synonyms = split(line.synonyms, ',')
           SET n.description = line.description
           SET n.gmm_description = line.gmm_description
           RETURN n.name 
    
8


In [445]:
x_found = combine_exists_and_found("sly")
# #qr = add_properties('sly', x_found)
# #print(len(qr.data()))


    LOAD CSV WITH HEADERS FROM  'file:///gmm_annot_Prot-sly.tsv' AS line FIELDTERMINATOR '	'
           MATCH (n:FunctionalCluster)
           WHERE size(n.sly_homologues) = 1
           AND n.ath_homologues IS NULL
AND n.osa_homologues IS NULL
AND n.stu_homologues IS NULL

           AND ANY ( item IN n.sly_homologues WHERE item =~ "(?i)" + line.identifier )
           RETURN n.name, line.short_name, n.description, line.description, n.synonyms, line.synonyms, line.gmm_description 
    
0


KeyError: 'n.name'

In [450]:
cy = """
    MATCH (n:FunctionalCluster)
    WHERE n.stu_homologues IS NULL
    AND n.sly_homologues IS NULL
    AND n.ath_homologues IS NULL
    AND n.osa_homologues IS NULL    
    RETURN n.name
"""

qr = graph.run(cy)
qr

 n.name     
------------
 BA2H[]     
 GPAphid2[] 
 IPL[]      

In [451]:
df = pd.DataFrame(qr.data())

In [453]:
df['short_name'] = df['n.name'].apply(lambda x : x.split("[")[0])

In [455]:
df.columns = ["name", "short_name"]

In [456]:
df.head()

Unnamed: 0,name,short_name
0,BA2H[],BA2H
1,GPAphid2[],GPAphid2
2,IPL[],IPL
3,R-gene[],R-gene
4,Rx1[],Rx1


In [459]:
f = f"gmm_annot_Prot-{species}-nonname.tsv"

df.to_csv(base_path / "data" / "import" / f, sep="\t", index=None)

cy = '''
LOAD CSV WITH HEADERS FROM  'file:///{file}' AS line FIELDTERMINATOR '\t'
       MATCH (n:{label} {{name:line.name}})
       SET n.short_name = line.short_name
       RETURN n.name 
'''.format(label="FunctionalCluster", file=f)
print(cy)
qr = graph.run(cy)    



LOAD CSV WITH HEADERS FROM  'file:///gmm_annot_Prot-sly-nonname.tsv' AS line FIELDTERMINATOR '	'
       MATCH (n:FunctionalCluster {name:line.name})
       SET n.short_name = line.short_name
       RETURN n.name 



In [460]:
qr

 n.name     
------------
 BA2H[]     
 GPAphid2[] 
 IPL[]      