In [2]:
import os
import json

In [3]:
# set parameters and paths
data_dir = "../data"
out_dir = "../outputs"

db_path = os.path.join(data_dir, "20240328_dataset for ambrDB_DDBproject.json")

In [4]:
with open(db_path, 'r') as dbfile:
    db = json.load(dbfile)

In [5]:
print(db.keys())
len(db.keys())

dict_keys(['end', 'created_by', 'editable', 'unit_operations', 'workflow', 'creation_time', 'project', 'description', 'status', 'name', 'event_names', 'key_variable', 'id', 'type', 'result_notes', 'progress', 'country', 'manager', 'trending_settings', 'deletable', 'modification_time', 'batch_phase_names', 'tags', 'start', 'departments', 'editable_plots', 'sites', 'batches'])


28

In [6]:
project_number = db['project']
runs = db["batches"]

print(f"""
      Project number: {project_number}
      Dataset name: {db["name"]}
      Whole number of project batches/runs: {len(runs)}
      """)


      Project number: 15
      Dataset name: dataset for ambrDB
      Whole number of project batches/runs: 517
      


## Run variables exploration:
A batch is a run in our data model. 

In [7]:
# Run 0 variables exploration:
print("Run 0 variables exploration:")

variables = runs[0]['variables']

for var in variables:
    print(var["name"], "--",var["unit"])

Run 0 variables exploration:
Accumulated sample volume -- mL
Acetic acid formation rate -- g/h
Acetic acid formation rate molar -- mol/h
Acetic acid formed -- g
Acetic acid formed c-molar -- cmol
Acetic acid formed molar -- mol
Acetic acid lost in sample -- g
Acetic acid specific formation rate -- g/g(DW)*h
Acetic acid specific formation rate molar -- mol/cmol(X)*h
Acetic acid yield (S) -- g/g
Acetic acid yield (X) -- g/g(DW)
Acetic acid yield c-molar (S) -- c-mol/c-mol(S)
Acetic acid yield c-molar (X) -- c-mol/c-mol(X)
Acetic acid yield molar (S) -- mol/mol(S)
Acetic acid yield molar (X) -- mol/c-mol(X)
Acid volume -- L
Aeration Gas Type -- dimensionless
Air flow -- lpm
Base Medium -- dimensionless
Base rate -- L/h
Base volume -- L
Biomass amount -- g
Biomass amount molar -- mol
Biomass concentration -- g/L
Biomass formation rate -- g/h
Biomass formation rate molar -- c-mol/h
Biomass formed -- g
Biomass formed c-molar -- cmol
Biomass formed molar -- mol
Biomass lost in sample -- g
Bio

In [15]:
# Unique variables set

for i,run in enumerate(runs):

    variables = run['variables']
    unique_variables = set()

    for var in variables:
        unique_variables.add(var['name'])
        # if '_RNAseq' in var["name"]:
        #     print(var["name"], "--",var["unit"])
        # else:
        #     continue

print(unique_variables)
print(len(unique_variables))

{'Formed CO2', 'Offgas CO2', 'pH Control Base Solution', 'DO Control Cascade Level 1', 'Reflectance_baselined', 'Mn_Biomass', 'Zn_Extracellular', 'Cu_Biomass', 'Cu_Extracellular', 'Base rate', 'Ca_Extracellular', 'Air flow', 'Aeration Gas Type', 'D-glucose', 'Maximum Aeration (slpm)', 'DO Control Cascade Level 3', 'CTR', 'Mg_Biomass', 'Pressure', 'Base volume', 'Fe_Biomass', 'Starting OD', 'Co_Biomass', 'Feed Medium #1', 'Inlet air O2', 'Culture Volume Unit', 'Na_Extracellular', 'Mo_Biomass', 'Replicate #', 'Seed', 'Temperature Setpoint (C)', 'RQ', 'Sample ID', 'Mo_Extracellular', 'Initial Culture VolumeNone', 'Ni_Biomass', 'Ni_Extracellular', 'Liquid volume', 'DO Control Setpoint (%)', 'Mn_Extracellular', 'Zn_Biomass', 'Fe_Extracellular', 'Container Type', 'DO', 'Temperature', 'Sample volume', 'OTR', 'Control?', 'Container ID (calculated)', 'Feed 1 rate volumetric', 'K_Extracellular', 'Reactor/Plate/Flask Number', 'Maximum Stirring or Shaking Speed (rpm)', 'Offgas O2', 'Consumed O2', 

## Project exploration

In [90]:
def get_run_names(runs):
    names = []
    for i in range(len(runs)):
        names.append(runs[i]["name"])
    return names

In [117]:
# db[batches][run]["variables"][var]["target_key"] --> return target_value
def get_unique_values(runs, target_variable_names):
    values = []

    for run in runs:
        for variable in run["variables"]:
            if variable["name"] in target_variable_names:
                values.append(variable["data"])

    return list(set(values))

def get_medium_names(runs):
    target_variable_names = ['Base Medium', 'Feed Medium #1']
    
    return get_unique_values(runs, target_variable_names)

def get_experiment_names(runs):
    target_variable_names = ['Experiment']
    return get_unique_values(runs, target_variable_names)

In [121]:
print(get_run_names(runs))
print(get_medium_names(runs))
print(get_experiment_names(runs))

['DDB_PD_023_AMBR_1', 'DDB_PD_023_AMBR_5', 'DDB_PD_023_AMBR_6', 'DDB_PD_023_AMBR_2', 'DDB_PD_023_AMBR_4', 'DDB_PD_023_AMBR_3', 'DDB_PD_026_AMBR_11', 'DDB_PD_026_AMBR_12', 'DDB_PD_026_AMBR_5', 'DDB_PD_026_AMBR_6', 'DDB_PD_026_AMBR_7', 'DDB_PD_026_AMBR_8', 'DDB_PD_026_AMBR_10', 'DDB_PD_026_AMBR_9', 'DDB_PD_029_AMBR_9', 'DDB_PD_029_AMBR_11', 'DDB_PD_029_AMBR_10', 'DDB_PD_029_AMBR_12', 'DDB_PD_041_AMBR_1', 'DDB_PD_041_AMBR_2', 'DDB_PD_041_AMBR_4', 'DDB_PD_041_AMBR_3', 'DDB_PD_036_AMBR_18', 'DDB_PD_036_AMBR_23', 'DDB_PD_036_AMBR_17', 'DDB_PD_036_AMBR_22', 'DDB_PD_036_AMBR_21', 'DDB_PD_036_AMBR_19', 'DDB_PD_036_AMBR_15', 'DDB_PD_036_AMBR_13', 'DDB_PD_036_AMBR_14', 'DDB_PD_036_AMBR_20', 'DDB_PD_036_AMBR_24', 'DDB_PD_036_AMBR_16', 'DDB_PD_038_AMBR_7', 'DDB_PD_038_AMBR_11', 'DDB_PD_038_AMBR_12', 'DDB_PD_038_AMBR_5', 'DDB_PD_038_AMBR_6', 'DDB_PD_038_AMBR_9', 'DDB_PD_038_AMBR_10', 'DDB_PD_038_AMBR_8', 'DDB_PD_043_AMBR_9', 'DDB_PD_043_AMBR_10', 'DDB_PD_043_AMBR_12', 'DDB_PD_043_AMBR_11', 'DDB_PD_0

In [122]:
print(f"""

    Total project experiments: {len(get_experiment_names(runs))}
    Total project runs: {len(get_run_names(runs))}
    Total of {len(get_medium_names(runs))} different Base, seed and feed mediums.
      """)






    Total project experiments: 71
    Total project runs: 517
    Total of 32 different Base, seed and feed mediums.
      
