In [1]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:80% !important; }</style>"))

# Import neo4j DB: 1/4

Code to translate v2.7.4_PIS-model.xlsx to neo4j database. 

## Setup

In [2]:
import pandas as pd
import re
import numpy as np
import os
from IPython.display import Image, display

In [3]:
import helpers

In [4]:
from importlib import reload

In [5]:
reload(helpers)

<module 'helpers' from '/home/jovyan/work/helpers.py'>

In [6]:
node_labels = helpers.node_labels

## Read in sheets

In [7]:
from pathlib import Path

base_path = Path("..")
input_path = base_path / "data" / "raw"
output_path = base_path / "data" / "parsed"

### Components sheet

In [8]:
sheets = [#(file, sheet_name)]
    ("v2.7.6_PIS-model.xlsx", "Components"),
#    ("v2.7.5_PIS-model.xlsx", "Components_New"), 
#    ("Model_CK.xlsx", "Components_new"), 
#    ("v2.7.2_PIS-model-JALR.xlsx", "Components_New")
]

In [10]:
# resave xlsx as tsv
drops = ['Legacy:Process', 'mID', 'Notes', 'Unnamed: 21', 'GMM_OCD', 'ExternalDB', 'Notes']
col_rename = {
    'AddedBy':'AddedBy', 
    'Species':'Species', 
    'NodeType':'NodeType', 
    'Family':'Family', 
    'Clade':'Clade', 
    'NodeID':'NodeID', 
    'NodeName':'NodeName', 
    'ModelStatus':'ModelStatus', 
    'NodeDescription':'NodeDescription', 
    'AdditionalInfo':'AdditionalInfo', 
    'Process':'Process', 
    'ModelV':'ModelV', 

    'ExtDBlink':'ExtDBlink', 

    'GMM_OCD1':'GMM_OCD', 
    'GMM:Description':'GMM_Description', 
    'GMM:ShortName':'GMM_ShortName', 
    'GMM:Synonyms':'GMM_Synonyms', 

    'Node':'NodeName'
}

for file_name, sheet_name in sheets:
    
    print(file_name, sheet_name)
    
    file_path = input_path / file_name
    
    base_name = file_path.stem
    new_file_path = output_path / f'{base_name}-{sheet_name}.tsv'
    
    print(new_file_path)
#    if os.path.exists(new_file_path):
#        continue
        
    df = pd.read_excel(file_path, 
                    sheet_name=sheet_name, 
                    header=[1], 
                    dtype=str, 
                    na_values=helpers.empty_strings)
    
    to_drop = list(set(drops) & set(df.columns)) + list(df.filter(regex=("Unnamed.*")).columns)
    df.drop(to_drop, axis=1, inplace=True)
    
    new_cols = [col_rename[x] for x in df.columns]
    df.columns = new_cols
    
    df.to_csv(new_file_path, sep="\t", index=None)

v2.7.6_PIS-model.xlsx Components
../data/parsed/v2.7.6_PIS-model-Components.tsv


In [11]:
dfs = []

for file_name, sheet_name in sheets:
    print(file_name, sheet_name)
    base_name, extension = os.path.splitext(file_name)
    file_path = output_path / f'{base_name}-{sheet_name}.tsv'
    df = pd.read_csv(file_path, sep="\t")
    dfs.append(df)

v2.7.6_PIS-model.xlsx Components


In [12]:
df_components = pd.concat(dfs, sort=False)
df_components.reset_index(drop=True, inplace=True)

In [13]:
pd.value_counts(df_components['NodeType'])

plant_coding       858
metabolite         102
pathogen_coding     14
plant_abstract      12
plant_noncoding     12
process              6
plant_complex        3
Name: NodeType, dtype: int64

In [14]:
df_components.shape[0]

1007

In [14]:
# df_components[df_components['NodeType']=='x']
# x = df_components[df_components['NodeType']=='x'].index; display(x)
# df_components.drop(x, inplace=True)

In [15]:
#df_components.loc[df_components["NodeName"].isna(), 'NodeName']
#df_components.loc[df_components["NodeName"].isna(), 'NodeName'] =  df_components.loc[df_components["NodeName"].isna(), 'NodeID']

In [15]:
components_node_type_to_node_label = {
    "plant coding":"PlantCoding",
    "plant_coding":"PlantCoding",
    "plant_noncoding":"PlantNonCoding",
    "plant_ncRNA":"PlantNonCoding",

    "plant_complex":"Complex", 

    "metabolite":"Metabolite",

    "pathogen_coding":"ExternalCoding",
    "pathogen_noncoding":"ExternalNonCoding",
    
    "plant_abstract":"PlantAbstract",
    
    "process":"Process", 

    np.nan:"Undefined"
}

In [16]:
# update node labels
df_components["NodeLabel"] = df_components["NodeType"].apply(lambda x: components_node_type_to_node_label[x])
pd.value_counts(df_components['NodeLabel'])

PlantCoding       858
Metabolite        102
ExternalCoding     14
PlantNonCoding     12
PlantAbstract      12
Process             6
Complex             3
Name: NodeLabel, dtype: int64

In [17]:
# Prev run 
# PlantCoding       825
# Metabolite        104
# ExternalCoding     14
# PlantNonCoding     12
# PlantAbstract      12
# Process             6
# Undefined           1
# Complex             1

In [18]:
df_components[df_components["NodeLabel"] == "Undefined"]

Unnamed: 0,AddedBy,Species,NodeType,Family,Clade,NodeID,NodeName,ModelStatus,NodeDescription,AdditionalInfo,ExtDBlink,Process,ModelV,GMM_OCD,GMM_Description,GMM_ShortName,GMM_Synonyms,NodeLabel


In [19]:
df_components.sort_values(["NodeLabel", "Family", "NodeName"], inplace=True)

In [20]:
def only_asci(x):
    return "".join([character for character in x if character.isascii()])

In [21]:
df_components['species'] = df_components["Species"].apply(lambda x: x.lower().strip())
#df_components['observed_species'] = df_components["Species"].apply(helpers.get_second_item)
#df_components['also_observed_in'] = df_components["Species"].apply(helpers.rest_of_items)

df_components["GMM_Synonyms"] = df_components["GMM_Synonyms"].apply(helpers.list_string_to_nice_string)
df_components['AdditionalInfo'].fillna('', inplace=True)
df_components.loc[df_components['ModelV'].isna(), 'ModelV'] = 'vNA'

df_components['GMM_OCD'].fillna('', inplace=True)
df_components['ExtDBlink'].fillna('', inplace=True)

In [22]:
df_components['species'].unique()

array(['all', 'external', 'ath', 'osa', 'stu', 'sly'], dtype=object)

In [23]:
df_components.loc[df_components['AddedBy'].isna(), 'AddedBy']

Series([], Name: AddedBy, dtype: object)

In [24]:
df_components['AddedBy'] = df_components['AddedBy'].apply(lambda x: x.upper())
df_components.loc[df_components['AddedBy']=='ZR/MZ', 'AddedBy'] = 'MZ' 
df_components['AddedBy'].unique()

array(['ZR', 'KG', 'AG', 'MAK', 'MZ', 'JALR', 'ST'], dtype=object)

In [25]:
re_ec = "ec(?:\:|\s)?(\d+(?:\.(?:\-|\d+)){1,3}(?:\.n\d+)?)(?:\s|$|\]|,|\.)"

# also use "AdditionalInfo", "NodeDescription"
def get_external_links(row):
    dbs_list = []
    
    ################
    if row['Family'] == "R-gene":
        dbs_list.append("invented:unidentified")
    
    ################
    if row["NodeName"] in ["X1", "X2", "X3", "X4"]:
        dbs_list.append("invented:unidentified")
    
    ################
    ocd_id = row['GMM_OCD']
    if ocd_id:
        dbs_list.append(f"gmm_ocd:{ocd_id.lower()}")

    ################
    x = row['ExtDBlink']
    x = x.lower()
    pubchem_match = re.findall("(?:pubchem:)\s*(.+?)(?:\s|$)", x)
    dbs_list += [f"pubchem:{idf}" for idf in pubchem_match]
    chebi_match = re.findall("(?:chebi:)\s*(.+?)(?:\s|$)", x)
    dbs_list += [f"chebi:{idf}" for idf in chebi_match]

    # regex ocd from x as well
    ocd_match = re.findall("(ocd_all_.+?(?:\s|$))", x)
    dbs_list += [f"gmm_ocd:{idf}" for idf in ocd_match]

    ################
    x = row['AdditionalInfo']
    x = x.lower()
    
    # EC:3.3.3.- EC 2.2.1.7
    ec_match = re.findall(re_ec, x)
    dbs_list += [f"ec:{idf}" for idf in ec_match]
    
    pubmed_match = re.findall("pmid(?:\:|)\s*(\d+)", x)
    dbs_list += [f"pmid:{idf}" for idf in pubmed_match]
    
    pubmedc_match =  re.findall("pmcid(?:\:|)\s*(pmc\d+)", x)
    dbs_list += [f"pmcid:{idf}" for idf in pubmedc_match]
    
    doi_match = re.findall("doi(?:\:|\/)\s*(.+?)(?:\s|$|:)", x)
    dbs_list += [f"doi:{idf.strip().rstrip(',.')}" for idf in doi_match]    
    
    kegg_match = re.findall(r"((?:k|map|ko|ec|rn|ath)\d{5})", x)
    dbs_list += [f"kegg:{idf}" for idf in kegg_match]              
    
    ncbi_nuccore_match = re.findall("NCBI ID: (.+)", x)
    dbs_list += [f"ncbi_nuccore:{idf}" for idf in ncbi_nuccore_match]
                 
    ################                 
    x = row['NodeDescription']
    x = x.lower()
    
    # EC:3.3.3.-
    ec_match = re.findall(re_ec, x)
    dbs_list += [f"ec:{idf}" for idf in ec_match]
     
    kegg_match = re.findall(r"((?:k|map|ko|ec|rn)\d{5})", x)
    dbs_list += [f"kegg:{idf}" for idf in kegg_match]              
    
    return ','.join(list(set(dbs_list)))
    


In [26]:
x = df_components.apply(get_external_links, axis=1)

In [27]:
# checks
for v in [x[df_components['NodeName']=='ETR1(EIN1)'], \
          x[df_components['NodeName']=='GST1'], \
          x[df_components['NodeName']=='NDB3'], \
          x[df_components['NodeName']=='NPH3(SR1IP-SR1)'], \
          x[df_components['NodeName']=='RBX'], \
          x[df_components['NodeName']=='DXPS3'], \
          x[df_components['NodeName']=='ACX4']\
          ]:
    print(v.values)

['gmm_ocd:ocd_all_000289,ec:2.7.13.-,pmid:12045274']
['pmid:20198573,pmid:14576289,pmid:8090746,pmid:12897257,gmm_ocd:ocd_all_000927']
['doi:10.1104/pp.103.024208,gmm_ocd:ocd_all_000721']
['gmm_ocd:ocd_all_000424,doi:10.1111/tpj.12473']
['gmm_ocd:ocd_all_003304,pmid:12172031']
['kegg:ec00900,gmm_ocd:ocd_all_000638,ec:2.2.1.7']
['ec:1.3.3.6,kegg:k00232,gmm_ocd:ocd_all_004282']


In [28]:
df_components['external_links'] = x

In [35]:
def get_db(external_links, search_db="kegg"):
    if external_links != "":
        for dbval in external_links.split(","):
            try:
                db, val = dbval.split(":")
                if db == search_db:
                    return val
            except ValueError:
                print("issue", dbval)
                return ""
    return ""

In [36]:
df_components['gmm_ocd'] = x.apply(get_db, search_db="gmm_ocd")

In [37]:
df_components[["NodeLabel", "Family", "Clade", "NodeName",  "NodeID", \
               "external_links", "gmm_ocd", "ExtDBlink", "GMM_OCD", \
               "AdditionalInfo", "NodeDescription"]].to_csv(os.path.join("..", "data", "parsed", "components-lit-check.tsv"), sep="\t", index=None)

In [38]:
# See here for keys https://www.utf8-chartable.de/unicode-utf8-table.pl 

def only_asci(x):
    return "".join([character for character in x if character.isascii()])

def find_non_ascii(x):
    x = str(x)
    has_nonascii = False
    for character in x:
        if not character.isascii():
            has_nonascii = True
            print(character, ord(character), character.encode())
    if has_nonascii:
        print(x)
        return True
    else:
        return False

ascii_replacers = {
    b'\xc2\xa0'         : b' ',            # funky WIN whitespace
    b'\xe2\x80\xa6'     : b'...',          # …
    b'\xe2\x80\x8b'     : b'',             # have no figging clue
    b'\xe2\x80\x93'     : b'-',            # –
    b'\xce\xb1'         : b'&alpha;',       # α
    b'\xc3\x9f'         : b'&beta;',        # ß
    b'\xce\xb2'         : b'&beta;',        # β
    b'\xe2\x80\x98'     : b'&prime;',       # ‘
    b'\xe2\x80\x99'     : b'&prime;',       # ’
    b'\xc2\xb4'         : b'&prime;',       # ´
    # Sorry accents :(
    b'\xc5\xa0'         : b'S',            # Š
    b'\xc5\xa1'         : b's',            # š
    b'\xc5\xbd'         : b'Z',            # Ž
    b'\xc4\x8d'         : b'c'             # č
}


def replacer(x, verbose=False):
    if type(x) == float:
        return x
    y = x.encode('utf-8')
    for old, new in ascii_replacers.items():
        y = y.replace(old, new)
    y = y.decode('utf-8')
    
    if verbose and ( y != x):
        print(f"'{x}' : '{y}'")
    
    return y.strip()


In [39]:
bad_cols = []
for c in df_components.columns:
    print(c, "\n-------------")
    if any(df_components[c].apply(find_non_ascii)):
        bad_cols.append(c)
    print()

AddedBy 
-------------

Species 
-------------

NodeType 
-------------

Family 
-------------

Clade 
-------------

NodeID 
-------------

NodeName 
-------------

ModelStatus 
-------------

NodeDescription 
-------------
ß 223 b'\xc3\x9f'
12-hydroxyjasmonic acid 12-O-ß-D-glucoside
α 945 b'\xce\xb1'
7-(α-D-glucosyl)dihydrozeatin 
α 945 b'\xce\xb1'
9-(α-D-glucosyl)dihydrozeatin 
α 945 b'\xce\xb1'
UDP-α-D-glucose
β 946 b'\xce\xb2'
7-(β-D-glucosyl)-cis-zeatin 
β 946 b'\xce\xb2'
9-(β-D-glucosyl)-cis-zeatin 
β 946 b'\xce\xb2'
O-β-D-glucosyl-cis-zeatin 
α 945 b'\xce\xb1'
7-(α-D-glucosyl)-N6-isopentenyladenine 
α 945 b'\xce\xb1'
9-(α-D-glucosyl)-N6-isopentenyladenine 
‘ 8216 b'\xe2\x80\x98'
N6-(dimethyallyl)adenosine 5‘-diphosphate
‘ 8216 b'\xe2\x80\x98'
N6-(dimethyallyl)adenosine 5‘-monophosphate
‘ 8216 b'\xe2\x80\x98'
N6-(dimethyallyl)adenosine 5‘-triphosphate
α 945 b'\xce\xb1'
9-(α-D-glucosyl)-trans-zeatin
β 946 b'\xce\xb2'
O-β-D-glucosyl-trans-zeatin 
  160 b'\xc2\xa0'
gibberellin 3-ox

In [40]:
for c in bad_cols:
    print(c, "\n-------------")
    df_components[c] = df_components[c].apply(replacer, verbose=True)
    print()

NodeDescription 
-------------
'12-hydroxyjasmonic acid 12-O-ß-D-glucoside' : '12-hydroxyjasmonic acid 12-O-&beta;-D-glucoside'
'7-(α-D-glucosyl)dihydrozeatin ' : '7-(&alpha;-D-glucosyl)dihydrozeatin '
'9-(α-D-glucosyl)dihydrozeatin ' : '9-(&alpha;-D-glucosyl)dihydrozeatin '
'UDP-α-D-glucose' : 'UDP-&alpha;-D-glucose'
'7-(β-D-glucosyl)-cis-zeatin ' : '7-(&beta;-D-glucosyl)-cis-zeatin '
'9-(β-D-glucosyl)-cis-zeatin ' : '9-(&beta;-D-glucosyl)-cis-zeatin '
'O-β-D-glucosyl-cis-zeatin ' : 'O-&beta;-D-glucosyl-cis-zeatin '
'7-(α-D-glucosyl)-N6-isopentenyladenine ' : '7-(&alpha;-D-glucosyl)-N6-isopentenyladenine '
'9-(α-D-glucosyl)-N6-isopentenyladenine ' : '9-(&alpha;-D-glucosyl)-N6-isopentenyladenine '
'N6-(dimethyallyl)adenosine 5‘-diphosphate' : 'N6-(dimethyallyl)adenosine 5&prime;-diphosphate'
'N6-(dimethyallyl)adenosine 5‘-monophosphate' : 'N6-(dimethyallyl)adenosine 5&prime;-monophosphate'
'N6-(dimethyallyl)adenosine 5‘-triphosphate' : 'N6-(dimethyallyl)adenosine 5&prime;-triphosphate'

In [41]:
for c in bad_cols:
    print(c, "\n-------------")
    if any(df_components[c].apply(find_non_ascii)):
        print()

NodeDescription 
-------------
AdditionalInfo 
-------------


In [42]:
# also need to look for ' (quote) and " (doube; quote)
# which are used instead of prime, and may cause string issues

# replacing 5' with 5&prime; and 3' with 3&prime;
def find_quotes(x):
    x = str(x)
    if x.find('"') != -1: 
        print(x)
        return True
    elif x.find("'") != -1:
        print(x)
        return True
    else:
        return False
        

def quote_replacer(x, verbose=False):
    if type(x) == float:
        x = ""
    y = x.replace("5'", "5&prime;")
    y = y.replace("3'", "3&prime;")

    if verbose and ( y != x):
        print(f"'{x}' : '{y}'")
        
    return y

In [43]:
bad_cols = []
for c in df_components.columns:
    print(c, "\n-------------")
    if any(df_components[c].apply(find_quotes)):
        bad_cols.append(c)
    print()

AddedBy 
-------------

Species 
-------------

NodeType 
-------------

Family 
-------------

Clade 
-------------

NodeID 
-------------

NodeName 
-------------

ModelStatus 
-------------

NodeDescription 
-------------
adenosine 5'-diphosphate
adenosine 5'-monophosphate
adenosine 5'-triphosphate
9-ribosyl-trans-zeatin-5'-diphosphate
9-ribosyl-trans-zeatin-5'-monophosphate
9-ribosyl-trans-zeatin-5'-triphosphate
cytokinin riboside 5'-monophosphate phosphoribohydrolase EC:3.2.2.n1
cytokinin riboside 5'-monophosphate phosphoribohydrolase LOG3 (EC:3.2.2.n1
cytokinin riboside 5'-monophosphate phosphoribohydrolase LOG3 (EC:3.2.2.n1
cytokinin riboside 5'-monophosphate phosphoribohydrolase LOG34(EC:3.2.2.n1
cytokinin riboside 5'-monophosphate phosphoribohydrolase LOG5 (EC:3.2.2.n2
cytokinin riboside 5'-monophosphate phosphoribohydrolase LOG7 (EC:3.2.2.n3
cytokinin riboside 5'-monophosphate phosphoribohydrolase LOG8 (EC:3.2.2.n4
cytokinin riboside 5'-monophosphate phosphoribohydrolase LOG8

In [44]:
for c in bad_cols:
    print(c, "\n-------------")
    df_components[c] = df_components[c].apply(quote_replacer, verbose=True)
    print()

NodeDescription 
-------------
'adenosine 5'-diphosphate' : 'adenosine 5&prime;-diphosphate'
'adenosine 5'-monophosphate' : 'adenosine 5&prime;-monophosphate'
'adenosine 5'-triphosphate' : 'adenosine 5&prime;-triphosphate'
'9-ribosyl-trans-zeatin-5'-diphosphate' : '9-ribosyl-trans-zeatin-5&prime;-diphosphate'
'9-ribosyl-trans-zeatin-5'-monophosphate' : '9-ribosyl-trans-zeatin-5&prime;-monophosphate'
'9-ribosyl-trans-zeatin-5'-triphosphate' : '9-ribosyl-trans-zeatin-5&prime;-triphosphate'
'cytokinin riboside 5'-monophosphate phosphoribohydrolase EC:3.2.2.n1' : 'cytokinin riboside 5&prime;-monophosphate phosphoribohydrolase EC:3.2.2.n1'
'cytokinin riboside 5'-monophosphate phosphoribohydrolase LOG3 (EC:3.2.2.n1' : 'cytokinin riboside 5&prime;-monophosphate phosphoribohydrolase LOG3 (EC:3.2.2.n1'
'cytokinin riboside 5'-monophosphate phosphoribohydrolase LOG3 (EC:3.2.2.n1' : 'cytokinin riboside 5&prime;-monophosphate phosphoribohydrolase LOG3 (EC:3.2.2.n1'
'cytokinin riboside 5'-monophosph

In [45]:
for c in bad_cols:
    print(c, "\n-------------")
    if any(df_components[c].apply(find_quotes)):
        print()

NodeDescription 
-------------
AdditionalInfo 
-------------
Benzoic acid 2-hydroxylase: Isolated from tobacco, can't find in ath.
Encodes a protein showing similarities to zinc finger transcription factors, involved in regulation of flowering under long days. Acts upstream of FT and SOC1.; Plays a role in the regulation of flowering time by acting on 'SUPPRESSOR OF OVEREXPRESSION OF CO1', 'TERMINAL FLOWER 1' and 'FLOWERING LOCUS T'. Also regulates P5CS2 and ACS10 (involved in proline and ethylene biosynthesis, respectively).
Rav2 is part of a complex that has been named `regulator of the (H+)-ATPase of the vacuolar and endosomal membranes' (RAVE) | KEGG09287
Encodes a MYC-related transcriptional activator with a typical DNA binding domain of a basic helix-loop-helix leucine zipper motif. Binds to an extended G-Box promoter motif and interacts with Jasmonate ZIM-domain proteins. | MYC2, MYC3, and MYC4, three basic helix-loop-helix transcription factors that are known to additively cont

In [46]:
# duplicated node names
for label, subdf in df_components.groupby('NodeLabel'):
    dups =  subdf[subdf.duplicated(['NodeName'], keep=False)]
    if dups.shape[0] > 0:
        print(label)
        display(dups.sort_values('NodeName'))

PlantCoding


Unnamed: 0,AddedBy,Species,NodeType,Family,Clade,NodeID,NodeName,ModelStatus,NodeDescription,AdditionalInfo,...,Process,ModelV,GMM_OCD,GMM_Description,GMM_ShortName,GMM_Synonyms,NodeLabel,species,external_links,gmm_ocd
794,MAK,ath,plant_coding,CPS,CPS,AT4G02780,CPS,use,ent-copalyl diphosphate synthase,also known as GA REQUIRING 1 (GA1),...,Hormone:GA,v2.7,OCD_all_001702,Terpenoid cyclases/Protein prenyltransferases ...,GA1,"ABC33,ATCPS1,CPS,CPS1,GA1,TPSGA1",PlantCoding,ath,gmm_ocd:ocd_all_001702,ocd_all_001702
795,MAK,stu,plant_coding,CPS,CPS,Sotub06g034690.1.1,CPS,use,ent-copalyl diphosphate synthase,also known as GA REQUIRING 1 (GA1),...,Hormone:GA,v2.7,,,,,PlantCoding,stu,gmm_ocd:ocd_all_001702,ocd_all_001702
796,ZR,stu,plant_coding,CPS,CPS,Sotub08g006560.1.1,CPS.x1,use,Ent-copalyl diphosphate synthase (Fragment),,...,Hormone:GA,v2.7,,,,,PlantCoding,stu,gmm_ocd:ocd_all_001702,ocd_all_001702
799,ZR,sly,plant_coding,CPS,CPS,Solyc06g084240.2.1,CPS.x1,use,copalyl diphosphate synthase,,...,Hormone:GA,v2.7,,,,,PlantCoding,sly,gmm_ocd:ocd_all_001702,ocd_all_001702
797,ZR,stu,plant_coding,CPS,CPS,Sotub08g020310.1.1,CPS.x2,use,Ent-copalyl diphosphate synthase (Fragment),,...,Hormone:GA,v2.7,,,,,PlantCoding,stu,gmm_ocd:ocd_all_001702,ocd_all_001702
800,ZR,sly,plant_coding,CPS,CPS,Solyc08g005710.3.1,CPS.x2,orthology,Terpene synthase 41,,...,Hormone:GA,v2.7,,,,,PlantCoding,sly,gmm_ocd:ocd_all_001702,ocd_all_001702
822,ZR,stu,plant_coding,GA20ox,GA20ox,Sotub06g023200.1.1,GA20ox.x1,orthology,gibberellin 20-oxidase,,...,Hormone:GA,v2.7,,,,,PlantCoding,stu,gmm_ocd:ocd_all_000842,ocd_all_000842
827,ZR,sly,plant_coding,GA20ox,GA20ox,Solyc06g050110.2.1,GA20ox.x1,orthology,gibberellin 20-oxidase,,...,Hormone:GA,v2.7,,,,,PlantCoding,sly,gmm_ocd:ocd_all_000842,ocd_all_000842
823,ZR,stu,plant_coding,GA20ox,GA20ox,Sotub09g017720.1.1,GA20ox.x2,orthology,gibberellin 20-oxidase,,...,Hormone:GA,v2.7,,,,,PlantCoding,stu,gmm_ocd:ocd_all_000842,ocd_all_000842
828,ZR,sly,plant_coding,GA20ox,GA20ox,Solyc09g009110.3.1,GA20ox.x2,orthology,gibberellin 20-oxidase,,...,Hormone:GA,v2.7,,,,,PlantCoding,sly,gmm_ocd:ocd_all_000842,ocd_all_000842


In [47]:
df_components.columns

Index(['AddedBy', 'Species', 'NodeType', 'Family', 'Clade', 'NodeID',
       'NodeName', 'ModelStatus', 'NodeDescription', 'AdditionalInfo',
       'ExtDBlink', 'Process', 'ModelV', 'GMM_OCD', 'GMM_Description',
       'GMM_ShortName', 'GMM_Synonyms', 'NodeLabel', 'species',
       'external_links', 'gmm_ocd'],
      dtype='object')

In [48]:
want_columns = ['AddedBy', 'species', 'NodeLabel', 'NodeType', 
                'Family', 'Clade', 'NodeID', 'NodeName', 
                'external_links', 'NodeDescription', 'AdditionalInfo', 
                'Process', 'ModelV', 'ModelStatus',                 
                'gmm_ocd', 'GMM_Description', 'GMM_ShortName', 'GMM_Synonyms'
               ]

In [49]:
path =  output_path / "components.tsv"
df_components[want_columns].fillna('').to_csv(path, sep="\t", index=None)
print(path)

../data/parsed/components.tsv


In [50]:
!head $path

AddedBy	species	NodeLabel	NodeType	Family	Clade	NodeID	NodeName	external_links	NodeDescription	AdditionalInfo	Process	ModelV	ModelStatus	gmm_ocd	GMM_Description	GMM_ShortName	GMM_Synonyms
ZR	all	Complex	plant_complex	SCF	SCF	SCF	SCF		SCF			v2.7	use				
ZR	all	Complex	plant_complex	WD/bHLH/MYB	WD/bHLH/MYB	WD/bHLH/MYB	WD/bHLH/MYB		WD/bHLH/MYB			v2.7	use				
ZR	all	Complex	plant_complex	ribosome	ribosome	ribosome	ribosome		ribosome			v2.6	use				
KG	external	ExternalCoding	pathogen_coding	bacteria	trichous-bacteria	elf18	elf18		EF-Tu fragment	N terminus of elongation factor Tu (EF-Tu), the most abundant bacterial protein.	Pathogen_Effector	vNA	use				
KG	external	ExternalCoding	pathogen_coding	bacteria	trichous-bacteria	flg22	flg22		flagellin fragment	Flagellin is the structural protein that forms the major portion of flagellar filaments; this is the 22 amino acids flagellin peptide known as flg22 (spans 22 amino acids in the core of the conserved domain). Flagellins from different b

# END