# Parser for PAS-X Data File
This notebook focuses on parsing data from PAS-X software to extract and model/organize high-cell density fermentation data.  

PAS-X file contains data about one project which contains several runs, and each run contains data of several features.   

The <b>primary objective</b> is to parse the PAS-X data file, guided by the config files, in order to generate the document structures (JSON) for our graph: node documents and edge documents such as project, run, strain, species, etc.   

### 0. Install dependencies, import modules and define file paths

In [51]:
#!pip install pandas
#!pip install pyarango

In [1]:
import os
import yaml
import json
import hashlib
from functools import reduce
import operator
import pandas as pd
import random

In [2]:
# Input paths
# Every input below lives under `data_dir`, except the variable-description
# JSON, which is read from the notebooks folder.
data_dir = '../data'
data_path = os.path.join(data_dir, "20240328_dataset for ambrDB_DDBproject.json")
strain_mapping_path = os.path.join(data_dir, "strain_metadata_benchling.csv")
taxa_mapping_path = os.path.join(data_dir, "organism_metadata_benchling.csv")
project_metadata_mapping_path = os.path.join(data_dir, "FermentDB_metadata_project_metadata.csv")
medium_mapping_path = os.path.join(data_dir, "medium_metadata_benchling.csv")
ingredient_mapping_path = os.path.join(data_dir, "FermentDB_metadata_medium_concentrations.csv")
experiment_mapping_path = os.path.join(data_dir, "experiment_metadata_benchling.csv")
var_description_mapping_path = os.path.join("../notebooks/bioprocess_ontology/bioprocess_variables_description.json")
# iModulon tables, nested as organism -> dataset -> CSV path
imodulon_files = {'e_coli': {'precise1k': os.path.join(data_dir, 'iM_table.csv')}}

# Parser configuration
config_dir = '../config'
main_config_path = os.path.join(config_dir, 'main_parser_config.yaml')
#var_config_path = os.path.join(config_dir, 'var_parser_config.yaml')


# Output paths
# Nodes and edges are written to two separate JSON files under `output_dir`.
output_dir = '../output'
nodes_path = os.path.join(output_dir, "fermentdb_nodes_full.json")
edges_path = os.path.join(output_dir, "fermentdb_edges_full.json")

### 1. Load data and config files

In [3]:
# Load PASX data file
# The export is one JSON document; it is parsed into `data`.
with open(data_path, 'r') as dbfile:
    data = json.load(dbfile)

# Display the top-level keys of the export
data.keys()

dict_keys(['end', 'created_by', 'editable', 'unit_operations', 'workflow', 'creation_time', 'project', 'description', 'status', 'name', 'event_names', 'key_variable', 'id', 'type', 'result_notes', 'progress', 'country', 'manager', 'trending_settings', 'deletable', 'modification_time', 'batch_phase_names', 'tags', 'start', 'departments', 'editable_plots', 'sites', 'batches'])

In [4]:
# Load main_parser_config.yaml
# SafeLoader restricts parsing to plain YAML types.
with open(main_config_path, 'r') as cfile:
    main_config = yaml.load(cfile, Loader=yaml.SafeLoader)

# Display the parsed mapping (node collection -> target field -> source field)
main_config

{'Project': {'_key': 'project',
  'id': 'project',
  'creation_time': 'creation_time'},
 'Country': {'_key': 'country', 'id': 'country', 'name': 'country'},
 'User': {'_key': 'manager',
  'id': 'manager',
  'name': 'manager',
  'surname': 'manager'},
 'Experiment': {'batches': {'variables': {'_key': 'Experiment',
    'name': 'Experiment'}}},
 'Run': {'batches': {'_key': 'id',
   'id': 'id',
   'name': 'name',
   'run_start': 'batch_start',
   'run_end': 'batch_end',
   'run_date': 'creation_time',
   'variables': {'is_control': 'Control?',
    'replicate_number': 'Replicate #'}}},
 'Fermenter': {'batches': {'variables': {'_key': 'Container Type',
    'id': 'Container Type',
    'name': 'Container Type'}}},
 'Medium': {'batches': {'variables': {'_key': 'Base Medium',
    'name': 'Base Medium'}}},
 'Phase': {'batches': {'phases': {'_key': 'name',
    'id': 'name',
    'name': 'name'}}},
 'Event': {'batches': {'events': {'_key': 'name',
    'id': 'name',
    'name': 'name'}}}}

### 2. Create node collections dictionary structure
Create a dictionary structure for nodes collections with main_parser_config file and PASX data file. 

In [36]:
# Function and Variable Definitions

# Name lists for compounds, substrates and products.
# NOTE(review): "succnic acid" looks like a misspelling of "succinic acid" —
# confirm against the names used in the source data before changing it.
compound = ["acetic acid", "citric acid", "d-glucose", "ethanol", "lactate", "pyruvic acid", "succnic acid", "tryptophan", "melatonin"]
substrate = ["d-glucose"]
product = ["tryptophan", "melatonin", "biomass"]


def get_from_nested_dict(data_dict:dict, map_list:list):
    '''
    Extracts value from a nested dictionary given a list of keys (different levels).

    parameters:
        data_dict (dict): dictionary structure 
        map_list (list): list of nested keys, applied from the outermost level inwards

    return:
        nested_value: nested value given the provided list of keys, or None
        when the path cannot be followed (missing key, out-of-range index, or
        an intermediate value that cannot be subscripted)

    example:
        >>> data_dict = {'a': {'b': {'c': 5}}}
        >>> value = get_from_nested_dict(data_dict=data_dict, map_list=['a', 'b', 'c'])
        >>> print(value)
        5
    '''
    try:
        return reduce(operator.getitem, map_list, data_dict)
    except (KeyError, IndexError, TypeError):
        # Only lookup failures mean "path not present"; any other exception
        # is a genuine error and is allowed to propagate.
        return None

def get_hash(key, prefix=""):
    '''
    Build a short, deterministic identifier from a string.

    The key is encoded as UTF-8 and hashed with SHA-1. The hexadecimal digest
    is read as an integer and reduced modulo 10**8, so the numeric part has at
    most 8 decimal digits (it is not zero-padded). The prefix is placed in
    front of that number.

    parameters:
        key (str): Input string to be hashed.
        prefix (str, optional): Prefix to prepend to the hash value. Defaults to "".

    return:
        str: prefix followed by the decimal hash value.

    example:
        >>> hashed = get_hash("example_key", prefix="HASH_")
        >>> hashed.startswith("HASH_"), hashed[5:].isdigit()
        (True, True)
    '''
    digest = hashlib.sha1(key.encode("utf-8")).hexdigest()
    number = int(digest, 16) % (10 ** 8)

    return f"{prefix}{number}"

def get_unique_json_from_list_of_dicts(dict_list, unique_key='id'):
    '''
    Collapse a list of dictionaries so that each value of `unique_key` appears once.

    When several dictionaries share the same `unique_key` value, the LAST one
    in the list is kept, at the position where that value first appeared.
    
    parameters:
        dict_list (list): list of dictionaries; each must contain `unique_key`.
        unique_key (str, optional): Key to determine uniqueness. Default to 'id'.
    
    return: 
        list: list of unique dictionaries. 
    
    example:
        >>> example_list = [
        ...     {"id": 1, "name": "Albert"},
        ...     {"id": 2, "name": "Chris"},
        ...     {"id": 1, "name": "Charlie"},  # Duplicate id, replaces "Albert"
        ...     {"id": 3, "name": "Albert"}     # Duplicate name, different id
        ... ]
        >>> get_unique_json_from_list_of_dicts(example_list)
        [{'id': 1, 'name': 'Charlie'}, {'id': 2, 'name': 'Chris'}, {'id': 3, 'name': 'Albert'}]
    '''
    by_key = {}
    for entry in dict_list:
        by_key[entry[unique_key]] = entry

    return list(by_key.values())

In [45]:
# Helper functions that add or update node collections; they are called from the main node-generation function.

def get_institution_node(nodes_collection:dict) -> dict:
    '''
    Set the 'Institution' collection to a single hard-coded institution node.

    Any existing 'Institution' entry is replaced. The node key is the hash of
    "NNFCB" with the "IN" prefix.

    parameters:
        nodes_collection (dict): node collections, updated in place

    return:
        nodes_collection (dict): the same dictionary that was passed in
    '''
    institution = {
        '_key': get_hash("NNFCB", prefix="IN"),
        'name': 'NNFCB - Novo Nordisk Foundation Center for Biosustainability (DTU Biosustain)',
        'address': 'Building 220, Kemitorvet. 2800 Kgs. Lyngby',
        'email': 'biosustain@biosustain.dtu.dk',
        'phone_number':'+45 45 25 80 00',
    }
    nodes_collection['Institution'] = [institution]
    return nodes_collection

def get_taxa_nodes(nodes_collection:dict,taxa_mapping_path:str):
    '''
    Build the 'Taxa' node collection from the strains already present in
    `nodes_collection` and a taxa mapping CSV.

    A taxa node is created for every strain whose 'parent' value appears in
    the first column of the CSV. Nodes are de-duplicated by '_key'.

    parameters:
        nodes_collection (dict): node collections, updated in place. Strains are
            read from the 'Strain' entry; a missing entry is treated as empty.
        taxa_mapping_path (str): path to the taxa mapping CSV. Columns are read
            by position: 0 = identifier matched against the strain parent,
            1 = name, 2 = synonym, 3 = taxid.

    return:
        nodes_collection (dict): the same dictionary, with 'Taxa' set
    '''
    with open(taxa_mapping_path, 'r') as taxa_file:
        taxa_table = pd.read_csv(taxa_file, sep=',')

    # Plain tuples give true positional access; integer indexing on a labelled
    # row Series is deprecated in pandas.
    taxa_mapping = {row[0]: {'name': row[1], 'synonym': row[2], 'taxid': row[3]}
                    for row in taxa_table.itertuples(index=False, name=None)}

    taxas = []

    for strain in nodes_collection.get('Strain', []):
        parent_strain = strain.get('parent')
        if parent_strain in taxa_mapping:
            taxa_data = taxa_mapping[parent_strain]
            taxa_id = str(taxa_data['taxid'])
            synonym = taxa_data['synonym']
            # list(<str>) would split the text into single characters and fail
            # on an empty (NaN) cell, so the cell is kept as one entry.
            # NOTE(review): if a cell can hold several synonyms, split it here
            # on the delimiter used in the CSV — confirm the file format.
            synonyms = [] if pd.isna(synonym) else [str(synonym)]
            taxas.append({
                '_key': get_hash(taxa_id, prefix="TA"),
                'id': taxa_id,
                'source_id': parent_strain,
                'name': str(taxa_data['name']),
                'synonyms': synonyms,
                'link': f'https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?id={taxa_id}'
            })

    nodes_collection['Taxa'] = get_unique_json_from_list_of_dicts(dict_list=taxas,
                                                            unique_key='_key')
    return nodes_collection

def get_strain_nodes(nodes_collection:dict, strain_mapping_path:str):
    '''
    Build the 'Strain' node collection from a strain metadata CSV.

    Rows are first collapsed on the value of column 1 (the last row with a
    given value wins), then turned into nodes keyed by the hash of the strain
    id with the "ST" prefix, and finally de-duplicated by '_key'. All node
    attributes are stored as strings, so an empty cell becomes 'nan'.

    parameters:
        nodes_collection (dict): node collections, updated in place
        strain_mapping_path (str): path to the strain metadata CSV. Columns are
            read by position: 0 = id, 2 = name, 3 = parent, 4 = host,
            5 = genotype, 6 = genotype_change, 11 = notes.

    return:
        nodes_collection (dict): the same dictionary, with 'Strain' set
    '''
    with open(strain_mapping_path, 'r') as strain_file:
        strain_table = pd.read_csv(strain_file, sep=',')

    # Plain tuples give true positional access; integer indexing on a labelled
    # row Series is deprecated in pandas.
    strain_mapping = {
        row[1]: {'id': row[0], 'name': row[2], 'parent': row[3], 'host': row[4],
                 'genotype': row[5], 'genotype_change': row[6], 'notes': row[11]}
        for row in strain_table.itertuples(index=False, name=None)
    }

    strains = []
    for strain_data in strain_mapping.values():
        strain_id = str(strain_data['id'])
        strains.append({
            '_key': get_hash(strain_id, prefix='ST'),
            'id': strain_id,
            'name':  str(strain_data['name']),
            'parent': str(strain_data['parent']),
            'host': str(strain_data['host']),
            'genotype': str(strain_data['genotype']),
            'genotype_change': str(strain_data['genotype_change']),
            'notes': str(strain_data['notes'])
        })

    nodes_collection['Strain'] = get_unique_json_from_list_of_dicts(dict_list=strains, unique_key='_key')

    return nodes_collection

In [None]:
# Function that generates all nodes

def generate_nodes_collection(data:dict, config:dict, taxa_mapping_path:str) -> dict:
    '''
    This function creates a dictionary with the node collections expected
    in FermentDB. It requires a dictionary with the data exported from PASX
    in json format, and a configuration file that specifies the mapping between
    PASX and FermentDB structure. See example in '/config/main_parser_config.yaml'

    The config is walked at up to three levels:
      * top-level string entries map a node field to a '/'-separated path in
        `data` and produce one node for the collection;
      * a dict entry names a list in `data` (e.g. 'batches') and produces one
        node per list item;
      * a dict nested inside that names a list inside each item (e.g.
        'variables'); node fields are filled from the entries whose 'name'
        equals the configured value.

    After the config-driven collections, the 'Institution', 'Strain' and
    'Taxa' collections are added. Strains are built before taxa because the
    taxa nodes are derived from the strain nodes. The strain CSV path is read
    from the module-level `strain_mapping_path`.

    Project, Phase and Event keys are salted with `random.random()` before
    hashing, so they differ between executions.

    parameters:
        data (dict): dictionary with the data read from PASX in json format
        config (dict): mapping configuration to adapt to FermentDB structure
        taxa_mapping_path (str): path to the taxa mapping CSV

    return:
        collections (dict): dictionary with the expected node objects in FermentDB

    example:
        >>> generate_nodes_collection(data=pasx_json_dict, config=main_config, taxa_mapping_path=taxa_mapping_path)
    '''
    # Prefix used when hashing '_key' values taken from a batch variable.
    variable_key_prefixes = {'Fermenter': "F", 'Experiment': "EX", 'Medium': "ME"}
    # Collections whose '_key' is salted with a random number before hashing.
    salted_key_prefixes = {'Phase': "PH", 'Event': "EV"}

    nodes_collection = {}
    for key1, node_config in config.items():
        nodes_collection[key1] = []
        flat_node = {}
        for key2, mapping in node_config.items():
            if not isinstance(mapping, dict):
                value = get_from_nested_dict(data, mapping.split('/'))
                if key2 == '_key':
                    value = str(value) # keys must be str in ArangoDB
                    if key1 == 'Project':
                        value += str(random.random())
                        value = get_hash(value, prefix="P")
                    elif key1 == 'Country':
                        value = get_hash(value, prefix="C")
                    elif key1 == 'User':
                        value = get_hash(value, prefix="U")
                flat_node[key2] = value
                continue

            for batch in data[key2]:
                node = {}
                for key3, sub_mapping in mapping.items():
                    if not isinstance(sub_mapping, dict):
                        value = get_from_nested_dict(batch, sub_mapping.split('/'))
                        if key3 == '_key':
                            value = str(value)
                            if key1 == 'Run':
                                value = get_hash(value, prefix="R")
                        node[key3] = value
                        continue

                    # A batch without this list contributes nothing.
                    # NOTE(review): every entry of the list writes into the
                    # same per-batch node, so for Phase/Event only the last
                    # entry of each batch is kept — confirm this is intended.
                    for variable in batch.get(key3) or []:
                        for key4, source_name in sub_mapping.items():
                            if source_name == variable['name']:
                                value = str(variable['data'])
                                if key4 == '_key' and key1 in variable_key_prefixes:
                                    value = get_hash(value, prefix=variable_key_prefixes[key1])
                                node[key4] = value
                            elif key1 in salted_key_prefixes:
                                value = str(get_from_nested_dict(variable, source_name.split('/')))
                                if key4 == '_key':
                                    value += str(random.random())
                                    value = get_hash(value, prefix=salted_key_prefixes[key1])
                                node[key4] = value
                nodes_collection[key1].append(node)

        # Per-batch nodes were appended above; only a node built from
        # top-level fields is added here, so the last batch node is not
        # appended a second time.
        if flat_node:
            nodes_collection[key1].append(flat_node)

    # Static and CSV-derived collections are built once, after the config loop.
    get_institution_node(nodes_collection)
    get_strain_nodes(nodes_collection, strain_mapping_path)
    get_taxa_nodes(nodes_collection, taxa_mapping_path)
    return nodes_collection

In [None]:
# Create node collections
# Builds all node collections from the loaded PAS-X export and parser config.
nodes_collection = generate_nodes_collection(data, main_config, taxa_mapping_path)

# print(node_collections['Institution'])

In [None]:
# Double-check the number of nodes and _keys in each collection
def check_nodes_keys(name_node_collection_list:list):
    '''
    Print, for each named collection, the number of nodes, the number of nodes
    that carry a `_key`, and the number of distinct `_key` values.

    The collections are read from the module-level `nodes_collection`.

    parameters:
        name_node_collection_list (list): names of the collections to report on
    '''
    for collection in name_node_collection_list:
        nodes = nodes_collection[collection]
        keys = [node['_key'] for node in nodes if '_key' in node]
        print(f"Number of nodes in {collection}: {len(nodes)}")
        print(f"Number of total _keys in {collection}: {len(keys)}") 
        print(f"Number of unique _keys in {collection}: {len(set(keys))}")


# Collections to report on
nodes_list = ['Project', 'User', 'Country', 'Fermenter', 'Medium', 'Experiment', 'Run', 'Phase', 'Event', 'Institution']

# Print node and _key counts for each of them
check_nodes_keys (nodes_list)

### 3. Create edges collection



In [None]:
# Edge Function Definitions
def get_institution_static_edges(nodes_collection:dict, edges_collection:dict) -> dict:
    '''
    Add the edges that link projects, users and countries to the institution.

    Three edge collections are set (replacing any existing entry of the same name):
      * 'created_at': Project -> Institution
      * 'from':       Institution -> Country
      * 'works_at':   User -> Institution

    One edge is created for every pair of nodes from the two collections
    involved. Nodes without a '_key' are skipped.

    parameters:
        nodes_collection (dict): node collections; 'Project', 'User', 'Country'
            and 'Institution' must be present, each as a list of node dicts
        edges_collection (dict): edge collections, updated in place

    return:
        edges_collection (dict): the same dictionary that was passed in
    '''
    def keys_of(collection_name):
        # Each collection is a list of node dicts; edges reference the '_key'.
        return [node['_key'] for node in nodes_collection[collection_name] if '_key' in node]

    project_keys = keys_of('Project')
    user_keys = keys_of('User')
    country_keys = keys_of('Country')
    institution_keys = keys_of('Institution')

    edges_collection.update({
        'created_at': {'edges': [{'_from': f"Project/{project_key}",
                                  '_to': f"Institution/{institution_key}"}
                                 for project_key in project_keys
                                 for institution_key in institution_keys],
                       'from_collection': ['Project'],
                       'to_collection': ['Institution']},
        'from': {'edges': [{'_from': f"Institution/{institution_key}",
                            '_to': f"Country/{country_key}"}
                           for institution_key in institution_keys
                           for country_key in country_keys],
                 'from_collection': ['Institution'],
                 'to_collection': ['Country']},
        'works_at': {'edges': [{'_from': f"User/{user_key}",
                                '_to': f"Institution/{institution_key}"}
                               for user_key in user_keys
                               for institution_key in institution_keys],
                     'from_collection': ['User'],
                     'to_collection': ['Institution']}
    })
    return edges_collection

def get_project_static_edges(nodes_collection:dict, edges_collection:dict) -> dict:
    '''
    Add the 'created_by' edge collection (Project -> User).

    An edge is created for every (user, project) pair; projects without a
    '_key' are skipped. Any existing 'created_by' entry is replaced.

    parameters:
        nodes_collection (dict): node collections; 'Project' and 'User' must be present
        edges_collection (dict): edge collections, updated in place

    return:
        edges_collection (dict): the same dictionary that was passed in
    '''
    projects = nodes_collection['Project']
    users = nodes_collection['User']

    created_by = {'edges': [],
                  'from_collection': ['Project'],
                  'to_collection': ['User']}
    edges_collection.update({'created_by': created_by})

    created_by['edges'].extend(
        {'_from': f"Project/{project['_key']}", '_to': f"User/{user['_key']}"}
        for user in users
        for project in projects
        if '_key' in project
    )
    return edges_collection

def get_has_experiment_edge(nodes_collection:dict, edges_collection:dict) -> dict:
    '''
    Add the 'has_experiment' edge collection (Project -> Experiment).

    An edge is created for every (project, experiment) pair; experiments
    without a '_key' are skipped. Any existing 'has_experiment' entry is replaced.

    parameters:
        nodes_collection (dict): node collections; 'Project' and 'Experiment' must be present
        edges_collection (dict): edge collections, updated in place

    return:
        edges_collection (dict): the same dictionary that was passed in
    '''
    projects = nodes_collection['Project']
    experiments = nodes_collection['Experiment']

    has_experiment = {'edges': [],
                      'from_collection': ['Project'],
                      'to_collection': ['Experiment']}
    edges_collection.update({'has_experiment': has_experiment})

    has_experiment['edges'].extend(
        {'_from': f"Project/{project['_key']}", '_to': f"Experiment/{experiment['_key']}"}
        for project in projects
        for experiment in experiments
        if '_key' in experiment
    )

    return edges_collection


def get_experiment_edges(nodes_collection:dict, edges_collection:dict) -> dict:
    '''
    Placeholder for experiment edges: currently adds nothing.

    parameters:
        nodes_collection (dict): node collections (not used yet)
        edges_collection (dict): edge collections

    return:
        edges_collection (dict): the same dictionary that was passed in, unchanged
    '''
    # No experiment edges are defined yet, so the update is a no-op.
    edges_collection.update({})
    return edges_collection

def get_belongsto_edges(nodes_collection:dict, edges_collection:dict, taxa_mapping_path:str):
    '''
    Add the 'belongs_to' edge collection (Strain -> Taxa).

    An edge is created for every strain whose 'parent' value appears in the
    first column of the taxa mapping CSV. The target key is the hash of the
    taxid with the "TA" prefix, which is how `get_taxa_nodes` keys Taxa nodes.
    Any existing 'belongs_to' entry is replaced.

    parameters:
        nodes_collection (dict): node collections; 'Strain' must be present
        edges_collection (dict): edge collections, updated in place
        taxa_mapping_path (str): path to the taxa mapping CSV. Columns are read
            by position: 0 = identifier matched against the strain parent,
            3 = taxid.

    return:
        edges_collection (dict): the same dictionary that was passed in
    '''
    with open(taxa_mapping_path, 'r') as taxa_file:
        taxa_table = pd.read_csv(taxa_file, sep=',')

    # Plain tuples give true positional access; integer indexing on a labelled
    # row Series is deprecated in pandas.
    taxid_by_source = {row[0]: row[3]
                       for row in taxa_table.itertuples(index=False, name=None)}

    edges_collection.update({
                'belongs_to':{'edges':[],
                        'from_collection': ['Strain'],
                        'to_collection': ['Taxa']}
    })

    for strain in nodes_collection['Strain']:
        parent_strain = strain.get('parent')
        if parent_strain in taxid_by_source:
            taxa_id = str(taxid_by_source[parent_strain])
            # The edge must reference the Taxa node's '_key', not the raw taxid.
            taxa_key = get_hash(taxa_id, prefix="TA")
            edges_collection['belongs_to']['edges'].append({
                '_from': f"Strain/{strain['_key']}",
                '_to': f"Taxa/{taxa_key}"
            })
    return edges_collection

def get_isparentof_edges(nodes_collection:dict, edges_collection:dict):
    '''
    Add the 'is_parent_of' edge collection (Strain -> Strain).

    An edge is created for every strain whose 'parent' value equals the 'id'
    of a strain in the collection. The edge runs from the parent strain to the
    child strain. Any existing 'is_parent_of' entry is replaced.

    NOTE(review): the previous condition compared a strain's parent with its
    own id and the edge ran from child to parent; both were changed to match
    the edge name — confirm this is the intended model.

    parameters:
        nodes_collection (dict): node collections; 'Strain' must be present and
            each strain must have '_key', 'id' and 'parent'
        edges_collection (dict): edge collections, updated in place

    return:
        edges_collection (dict): the same dictionary that was passed in
    '''
    edges_collection.update({
                'is_parent_of':{'edges':[],
                        'from_collection': ['Strain'],
                        'to_collection': ['Strain']},
    })

    strains = nodes_collection['Strain']
    # Resolve parents through the keys that actually exist, so an edge can
    # never point at a missing node.
    key_by_id = {strain['id']: strain['_key'] for strain in strains}

    for strain in strains:
        parent_key = key_by_id.get(strain['parent'])
        if parent_key is None:
            continue
        edges_collection['is_parent_of']['edges'].append({
            '_from': f"Strain/{parent_key}",
            '_to': f"Strain/{strain['_key']}"
        })

    return edges_collection

In [None]:
# Function for generating all edges

def generate_edges_collection(nodes_collection:dict) -> dict:
    '''
    This function creates a dictionary with the edge collections backbone.

    Every edge collection starts with an empty 'edges' list, its
    'from_collection' / 'to_collection' names and, where applicable, empty
    lists for its edge attributes. The helper functions then add the
    institution, project, experiment and strain edges in place. The taxa CSV
    path is read from the module-level `taxa_mapping_path`.
    
    parameters:
        nodes_collection (dict): dictionary with the node objects in FermentDB
    
    return:
        edges_collection (dict): dictionary with the expected edge objects in FermentDB
    '''
    

    edges_collection = {
        'has_run':{'edges':[],
                   'from_collection': ['Experiment'],
                   'to_collection': ['Run']}, 
        'has_product':{'edges':[],
                       'from_collection': ['Run'],
                       'to_collection': ['Compound']},
        'has_substrate':{'edges':[],
                         'from_collection': ['Run'],
                         'to_collection': ['Compound']},
        'uses_fermenter':{'edges':[],
                            'from_collection': ['Run'],
                            'to_collection': ['Fermenter'],
                            'reactor_id': []},
        'cultures_strain':{'edges':[],
                           'from_collection': ['Run'],
                           'to_collection': ['Strain'],
                           'strain_batch':[]},
        'has_medium':{'edges':[],
                      'from_collection': ['Run'],
                      'to_collection': ['Medium'],
                      'reference_volume':[],
                      'unit':[],
                      'ph':[]},
        'derived_from':{'edges':[],
                        'from_collection': ['Medium'],
                        'to_collection': ['Medium']},
        'has_ingredient':{'edges':[],
                          'from_collection': ['Medium'],
                          'to_collection': ['Compound'],
                          'concentration':[],
                          'unit':[]},
        'has_measured_cultivation_cond':{'edges':[],
                                         'from_collection': ['Run'],
                                         'to_collection': ['Cultivation_cond'],
                                         'data':[],
                                         'is_categorical':[],
                                         'unit':[]},
        'has_measured_process_var':{'edges':[],
                                    'from_collection': ['Run'],
                                    'to_collection': ['Process_var'],
                                    'data':[],
                                    'data_format':[],
                                    'unit':[],
                                    'timestamp':[]},
        'has_measured_compound': {'edges': [],
                                  'from_collection': ['Run'],
                                  'to_collection': ['Compound'],
                                  'data':[],
                                  'data_format':[],
                                  'unit':[],
                                  'timestamp':[]},
        'associated_with': {'edges':[],
                            'from_collection': ['Compound'],
                            'to_collection':['Calculated_var']},
        'has_calculated_var': {'edges':[],
                               'from_collection': ['Run'],
                               'to_collection': ['Calculated_var'],
                               'data':[],
                                'data_format':[],
                                'unit':[],
                                'timestamp':[]},
        'has_calculated_imodulon': {'edges':[],
                                    'from_collection': ['Run'],
                                    'to_collection': ['iModulon'],
                                    'data':[],
                                    'data_format':[],
                                    'timestamp':[]},
        'has_calculated_phase': {'edges':[],
                                'from_collection': ['Run'],
                                'to_collection': ['Phase'],
                                'calculated_start':[],
                                'calculated_end':[]},
        'is_event_of': {'edges':[],
                        'from_collection': ['Event'],
                        'to_collection': ['Phase'],
                        'is_start':[]}           
    }
    
    # Each helper takes the edge dictionary and updates it in place, so it is
    # passed to every call and the return values are not needed.
    get_institution_static_edges(nodes_collection, edges_collection)
    get_project_static_edges(nodes_collection, edges_collection)
    get_experiment_edges(nodes_collection, edges_collection)
    get_has_experiment_edge(nodes_collection, edges_collection)
    get_belongsto_edges(nodes_collection, edges_collection, taxa_mapping_path)
    get_isparentof_edges(nodes_collection, edges_collection)

    return edges_collection



In [None]:
# Create all edges
# Builds the edge collections from the node collections created above.
edges_collection = generate_edges_collection(nodes_collection)

In [122]:
# Function Definitions

def get_run_conditions(collections, rconfig):
    '''
    Update node collections structure with nodes: initial condition, process conditions, fermenter, and strain.
    Generate edges structure with collections: has_initial_condition, has_condition, has_measured_imodulon,
    cultures_strain, and uses_fermenter.

    Each run gets a '_key' hashed from its name and id, and its 'variables'
    list is consumed: every variable is routed by name to exactly one of the
    branches below, and the list is removed from the run afterwards. The
    variable dictionaries are modified in place (their data fields are popped).
    
    parameter:
    - collections (dict): A dictionary containing collections of data. 'Run' must be present;
      'Initial_condition', 'Process_condition', 'Fermenter' and 'Strain' are set by this function.
    - rconfig (dict): Variable names per target; the 'Run', 'Fermenter' and 'Initial_condition'
      entries are used. The 'Run' entry maps a variable name to the run attribute it is stored under.

    returns:
    - edges (dict): A dictionary containing edges data representing the relationships between runs, conditions, and strain.
    '''
    iconditions_collection = []
    pconditions_collection = []
    fermenter_collection = []
    strain_collection = []
    edges = {'has_initial_condition': {'edges':[],
                                        'from_collection': ['Run'],
                                        'to_collection': ['Initial_condition']},
                'has_condition': {'edges': [],
                                'from_collection': ['Run'],
                                'to_collection': ['Process_condition']},
                'has_measured_imodulon': {'edges':[],
                              'from_collection': ['Run'],
                              'to_collection':['iModulon']},
                'cultures_strain': {'edges': [],
                                'from_collection': ['Run'],
                                'to_collection': ['Strain']},
                'uses_fermenter': {'edges': [],
                                'from_collection': ['Run'],
                                'to_collection': ['Fermenter']},
            }
    
    for run in collections['Run']:
        run["_key"] = run["name"]+"_"+str(run['id'])
        run["_key"] = get_hash(run["_key"], prefix="R")
        for variable in run['variables']:
            variable["_key"] = str(variable['name'])
            variable["_key"] = get_hash(variable["_key"], prefix="C")
            # The values go onto the edges; the remaining fields stay on the
            # variable, which may become a node below.
            data = variable.pop('data')
            timestamps = variable.pop('timestamps')
            unit = variable.pop('unit')
            _ = variable.pop('categorical_data')
            _ = variable.pop('raw_data')
            _ = variable.pop('datetime_data')
            _ = variable.pop('errors')
            
            if variable['name'] in rconfig['Run']:
                if isinstance(data, list):
                    data = data[0][0]
                if variable['name'] == "Strain Batch":
                    # The strain name is the text before the first '-'.
                    strain = '_'.join(data.split('-')[:1])
                    strain_key = get_hash(strain, prefix="S")
                    strain_collection.append({'_key': strain_key,
                                            'name': strain,
                                            'rank': 'strain'})
                    edges['cultures_strain']['edges'].append({'_from': f"Run/{run['_key']}",
                                                            '_to': f'Strain/{strain_key}',
                                                            'strain_batch': data})
                run.update({rconfig['Run'][variable['name']]: data}) # Add to edge cultures strain and delete?
            elif variable['name'] in rconfig['Fermenter']:
                variable["_key"] = get_hash(data, prefix="F")
                fermenter_collection.append({'_key': variable["_key"],
                                             'name': data})
                edges['uses_fermenter']['edges'].append({'_from': f"Run/{run['_key']}",
                           '_to': f"Fermenter/{variable['_key']}"})
            elif variable['name'] in rconfig['Initial_condition']:
                iconditions_collection.append(variable)
                edges['has_initial_condition']['edges'].append({'_from': f"Run/{run['_key']}",
                           '_to': f"Initial_condition/{variable['_key']}",
                           'data': data,
                           'unit': unit})
            elif '_RNAseq' in variable['name']:
                # All-zero series produce no edge.
                if not all(v == 0 for v in data):
                    variable['name'] = ' '.join(variable['name'].replace('_RNAseq', '').split('_'))
                    variable["_key"] = get_hash(variable["name"], prefix="iM")
                    edges['has_measured_imodulon']['edges'].append({'_from': f"Run/{run['_key']}",
                                                           '_to': f"iModulon/{variable['_key']}",
                                                           'data': data,
                                                           'timestamps': timestamps})
            else:
                pconditions_collection.append(variable)
                # The node is always recorded; the edge only for non-zero data.
                if not all(v == 0 for v in data):
                    edges['has_condition']['edges'].append({'_from': f"Run/{run['_key']}",
                            '_to': f"Process_condition/{variable['_key']}",
                            'data': data,
                            'unit': unit,
                            'timestamps': timestamps})
        del run['variables']
    
    # The helper's parameter is named `dict_list`.
    iconditions_collection = get_unique_json_from_list_of_dicts(dict_list=iconditions_collection, 
                                                                    unique_key='_key')
    pconditions_collection = get_unique_json_from_list_of_dicts(dict_list=pconditions_collection, 
                                                                    unique_key='_key')
    fermenter_collection = get_unique_json_from_list_of_dicts(dict_list=fermenter_collection, 
                                                                    unique_key='_key')
    strain_collection = get_unique_json_from_list_of_dicts(dict_list=strain_collection, 
                                                                    unique_key='_key')
    collections['Initial_condition'] = iconditions_collection
    collections['Process_condition'] = pconditions_collection
    collections['Fermenter'] = fermenter_collection
    collections['Strain'] = strain_collection
    
    return edges

def get_run_phases(collections):
    '''
    Retrieves phases data from Run collection and create Phase_event node collection and has_phase edge collection.  

    Each phase gets a '_key' hashed from its name with the "PH" prefix. Its
    timing and authorship fields are moved from the phase onto the edge, the
    'phases' list is removed from the run, and the remaining phase
    dictionaries are de-duplicated by '_key'. Runs must already have a '_key'.

    parameter:
    - collections (dict): A dictionary containing collections of data. 'Run' must be present;
      'Phase_event' is set by this function.

    returns:
    - dict: A dictionary containing edges data representing the relationships between runs and phases.
    '''
    phases_collection = []
    edges = {'has_phase': {'edges':[],
                           'from_collection': ['Run'],
                           'to_collection':['Phase_event']}}

    for run in collections['Run']:
        for phase in run['phases']:
            phase["_key"] = phase['name']
            phase["_key"] = get_hash(phase["_key"], prefix="PH")
            attributes = {"event_start": phase.pop("start"),
                          "event_end": phase.pop("end"),
                          "comment": phase.pop("comment"),
                          "created_by": phase.pop("created_by"),
                          "start": phase.pop("relative_start"),
                          "end": phase.pop("relative_end")}
            attributes.update({'_from': f"Run/{run['_key']}",
                           '_to': f"Phase_event/{phase['_key']}"})
            phases_collection.append(phase)
            edges['has_phase']['edges'].append(attributes)
        del run['phases']
    
    # De-duplicate once at the end (same result as doing it after every
    # append); the helper's parameter is named `dict_list`.
    phases_collection = get_unique_json_from_list_of_dicts(dict_list=phases_collection, 
                                                            unique_key='_key')
    collections['Phase_event'] = phases_collection

    return edges

### 4. Create iModulon node and edges collection

In [10]:
#  Define iModulon function 
def get_imodulon_collection(organism, dataset, table_path):
    '''
    Turn an iModulon CSV table into a list of node dictionaries.

    - The table is read with its first row as header.
    - A '_key' column is added by hashing the 'name' column with the "iM" prefix.
    - A 'linkout' column is added by appending the 'k' column value to the
      iModulonDB URL for the given organism and dataset.
    - Missing values are replaced with the string 'NaN' and the 'k' column is
      dropped (its value is already part of 'linkout').

    parameters:
        organism (str): The name of the organism (e.g., 'e_coli').
        dataset (str): The name of the dataset (e.g., 'precise1k').
        table_path (str): The file path to the CSV file containing the iModulon data (e.g., '/path/to/iM_table.csv').

    return:
        data_dict: A list of dictionaries, one per table row
    '''
    base_url = f'https://imodulondb.org/iModulon.html?organism={organism}&dataset={dataset}&k='
    table = pd.read_csv(table_path, sep=',', header=0)

    def to_key(name):
        return get_hash(name, prefix="iM")

    def to_link(k):
        return base_url + str(k)

    table['_key'] = table['name'].apply(to_key)
    table['linkout'] = table['k'].apply(to_link)

    filled = table.fillna('NaN')
    without_k = filled.drop('k', axis=1)
    return without_k.to_dict(orient='records')

In [11]:
# Generate iModulon collection
# The node collections built earlier in the notebook are held in
# `nodes_collection`; `collections` is not defined anywhere.
# `imodulon_files` is nested as organism -> dataset -> CSV path.
nodes_collection['iModulon'] = []
for organism, datasets in imodulon_files.items():
    for dataset, imodulon_path in datasets.items():
        nodes_collection['iModulon'].extend(get_imodulon_collection(organism=organism, dataset=dataset, table_path=imodulon_path))

### 5. Output nodes and edges collections

In [17]:
# Output nodes collections 
# The node collections are held in `nodes_collection`; `node_collections`
# is not defined anywhere in the notebook.
os.makedirs(os.path.dirname(nodes_path), exist_ok = True)
with open(nodes_path, 'w') as out:
    json.dump(nodes_collection, out)

In [18]:
# Output edges collections
# The edge collections are held in `edges_collection`; `edge_collections`
# is not defined anywhere in the notebook.
os.makedirs(os.path.dirname(edges_path), exist_ok = True)
with open(edges_path, 'w') as out:
    json.dump(edges_collection, out)