In [19]:
#!pip install pandas
#! pip install pyarango

In [186]:
import os
import yaml
import json
from functools import reduce
import operator

In [187]:
# Input locations, relative to the directory the notebook is run from:
# the mapping configuration and the JSON export to be parsed.
config_path = 'parser_config.yaml'
data_dir = '../data'
data_path = os.path.join(
    data_dir,
    "CrashMS_high resolution HD experiment.json",
)

In [188]:
# JSON interchange files are UTF-8; naming the encoding keeps the read from
# depending on the platform's default locale encoding.
with open(data_path, 'r', encoding='utf-8') as dbfile:
    data = json.load(dbfile)

In [189]:
# Show the top-level fields present in the loaded export.
data.keys()

dict_keys(['modification_time', 'result_notes', 'tags', 'departments', 'event_names', 'start', 'key_variable', 'editable', 'sites', 'creation_time', 'end', 'workflow', 'created_by', 'manager', 'trending_settings', 'editable_plots', 'country', 'unit_operations', 'name', 'description', 'deletable', 'type', 'batch_phase_names', 'project', 'progress', 'id', 'status', 'batches'])

In [190]:
# safe_load is the SafeLoader shortcut: it only builds plain Python objects.
# The encoding is named so the read does not depend on the platform locale.
with open(config_path, 'r', encoding='utf-8') as cfile:
    config = yaml.safe_load(cfile)

In [191]:
# Show the mapping configuration that was just loaded.
config

{'Project': {'id': 'project',
  'name': 'name',
  'description': 'description',
  'progress': 'progress',
  'status': 'status',
  'deletable': 'deletable',
  'editable': 'editable',
  'type': 'type',
  'creation_time': 'creation_time',
  'tags': 'tags',
  'modification_time': 'modification_time',
  'start': 'start',
  'end': 'end'},
 'Country': {'id': 'country', 'name': 'country'},
 'User': {'id': 'manager/username',
  'name': 'manager/first_name',
  'surname': 'manager/last_name'},
 'Batch_cell_culture': {'id': 'id', 'name': 'name'},
 'Run': {'batches': {'id': 'id',
   'name': 'name',
   'description': 'description',
   'creation_time': 'creation_time',
   'modification_time': 'modification_time',
   'batch_start': 'batch_start',
   'batch_end': 'batch_end',
   'first_timestamp': 'first_timestamp',
   'last_timestamp': 'last_timestamp'}}}

In [192]:
def get_from_nested_dict(data_dict:dict, map_list:list):
    '''
    Walks down a nested dictionary following a sequence of keys and returns
    whatever is stored at the end of that path.

    parameters:
        data_dict (dict): nested dictionary to look into
        map_list (list): keys to follow, outermost level first

    return:
        nested_value: value reached after applying every key in order
            (data_dict itself when map_list is empty)

    example:
        data_dict = {'a': {'b': {'c': 5}}}
        value = get_from_nested_dict(data_dict=data_dict, map_list=['a', 'b', 'c'])
        print(value)
        5
    '''
    nested_value = data_dict
    for level_key in map_list:
        # Plain subscription; lookup errors propagate to the caller unchanged.
        nested_value = nested_value[level_key]

    return nested_value

In [193]:
def get_collections_fom_pasx(data: dict, config: dict) -> dict:
    '''
    This function creates a dictionary with the collections expected
    in FermentDB. It requires a dictionary with the data exported from PASEX
    in json format, and a configuration that specifies the mapping between
    PASEX and FermentDB structure. See example in '/config/parser_config.yaml'

    Each entry of config describes one collection and takes one of two forms:
        - {attribute: 'path/to/value', ...}: every path is split on '/' and
          looked up in data; the collection is a single dictionary.
        - {source_key: {attribute: 'path/to/value', ...}}: data[source_key]
          is iterated and every path is looked up in each of its items; the
          collection is a list with one dictionary per item.

    parameters:
        data (dict): dictionary with the data read from PASEX in json format
        config (dict): mapping configuration to adapt to FermentDB structure

    return:
        collections (dict): dictionary with the expected objects in FermentDB,
            keyed by collection name

    raises:
        ValueError: if a collection combines a nested mapping with any other
            attribute. The two forms produce different shapes (dict vs list)
            and cannot be merged, so the configuration is rejected instead of
            silently dropping part of it.
        Lookup errors raised while following a path (e.g. KeyError for a
        missing key) are propagated unchanged.

    Example:
        get_collections_fom_pasx(data=pasex_json_dict, config=parser_config)
    '''
    collections = {}
    for collection_name, mapping in config.items():
        nested_sources = [
            attribute for attribute, path in mapping.items()
            if isinstance(path, dict)
        ]

        if not nested_sources:
            # Flat collection: one dictionary built from paths into data.
            collections[collection_name] = {
                attribute: get_from_nested_dict(data, path.split('/'))
                for attribute, path in mapping.items()
            }
            continue

        if len(mapping) > 1:
            # A nested mapping turns the collection into a list, so it cannot
            # share the collection with scalar attributes or a second nested
            # mapping without losing data.
            raise ValueError(
                "Collection '{}' mixes a nested mapping with other attributes: {}".format(
                    collection_name, sorted(mapping, key=str)
                )
            )

        # List collection: apply the nested mapping to every item of the source list.
        source_key = nested_sources[0]
        nested_mapping = mapping[source_key]
        collections[collection_name] = [
            {
                attribute: get_from_nested_dict(nested_data, path.split('/'))
                for attribute, path in nested_mapping.items()
            }
            for nested_data in data[source_key]
        ]

    return collections

In [194]:
# Apply the mapping configuration to the loaded export.
collections = get_collections_fom_pasx(data=data, config=config)

In [195]:
# Serialize the extracted collections to a JSON string.
collections_str = json.dumps(collections)

In [196]:
# Write the serialized collections next to the notebook.
with open('fermentdb.json', 'w') as output_file:
    output_file.write(collections_str)