# Scaling Analysis under constant oxygen environments

This scaling script applies a single analysis script to a large number of image sequences organized in OMERO `project` or `dataset` structures. Your custom analyses can therefore scale to large image volumes without any changes to their code.

## 1. Setup

Define the `omero_id` and `omero_type` of the image data you would like to process. The `omero_id` is the number you can find in the top right corner when selecting an OMERO `project`, `dataset` or `image` in the `OMERO Web` application. The `omero_type` must be `project` or `dataset` when the OMERO id points to a project or dataset, and `image` if it is just a single image! Please note that if you define the wrong `omero_type` you will get an error later on!

Also provide your credentials for the OMERO server!

In [None]:
import logging
import os

# OMERO resource that you want to analyze
omero_type = "dataset" # can be "image", "project" or "dataset"
omero_id = 2948 # change the id if you want to apply the analysis to a different omero resource

# your omero credentials
username = "<your username>"
password = "<your password>"

# do not change the lines below
# Fail early while the placeholders are still in place or the resource type is misspelled.
assert username != "<your username>", "Please replace '<your username>' with your OMERO username"
assert password != "<your password>", "Please replace '<your password>' with your OMERO password"
assert omero_type in ("image", "project", "dataset"), "omero_type must be 'image', 'project' or 'dataset'"

# Only warn when the environment does not provide the server addresses: the
# notebook continues with the fallback host name 'omero'.
if "OMERO_SERVER" not in os.environ:
    logging.warning("No 'OMERO_SERVER' defined. Fallback to default OMERO_SERVER address 'omero'! This can lead to connection faults!")
if "OMERO_WEB" not in os.environ:
    logging.warning("No 'OMERO_WEB' defined. Links to view OMERO data in web viewer might not work!")

# Generic credential record (server address + login)
credentials = dict(
    serverUrl=os.environ.get('OMERO_SERVER', 'omero'),
    username=username,
    password=password,
)

# The same credentials under the keyword names used for BlitzGateway(**omero_cred)
omero_cred = dict(
    host=credentials['serverUrl'],
    username=credentials['username'],
    passwd=credentials['password'],
)

## 1.2 Specify the analysis script

Now you have to specify the name of the analysis script you want to apply to the image data. Ideally, copy the script to the same location as this script; then you only have to specify its name!

**Note:** If the analysis script is not located in the same folder you need to specify the path to it.

In [None]:
# Notebook that is applied to every image sequence. A bare file name is enough
# when it lies next to this notebook; otherwise give the path to it.
analysis_script = "GrowthRate_constant_oxygen.ipynb"

# 2. Information about the underlying data

We summarize the amount of underlying data

In [None]:
from acia.segm.omero.utils import list_image_ids_in
from omero.gateway import BlitzGateway

# Look up the image ids behind the selected OMERO resource (omero_id / omero_type).
# The gateway is used as a context manager, so it is only held for this lookup.
with BlitzGateway(**omero_cred) as conn:
    image_ids = list_image_ids_in(omero_id, omero_type, conn)

## TODO: give an overview about the data
# For now the raw ids are printed as the only overview.
print(image_ids)

# 3. Scale the analysis script to all image sequences

Now we apply the analysis script to every image sequence individually 🚀! You can lean back and enjoy the working computer 😎 🥂

**Note:** For heavy analysis scripts or for large `datasets` or `projects` this process may take a while (from minutes to hours or days). The top-level progress bar will indicate the total progress and give you an indication of how long this will take. For large image data volumes we recommend running the execution overnight 🌔!

In [None]:
from datetime import datetime
from pathlib import Path
from acia.analysis import scale

# Each run writes into its own folder:
#   ./automated_executions/<analysis script stem>/<ISO timestamp of this run>
stem = Path(analysis_script).stem
run_stamp = datetime.today().isoformat()
output_path = Path("./automated_executions", stem, run_stamp)

print(f"Results are stored in: {output_path.absolute()}")

# Hand the analysis script and all collected image ids over to acia's scale()
result = scale(output_path, analysis_script=analysis_script, image_ids=image_ids)

# 4. Inspect your analysis results


In [None]:
import urllib.parse
from IPython.display import Video, Markdown, display

# NOTE(review): JUPYTERHUB_SERVICE_PREFIX is presumably only set when running
# under JupyterHub - without it a plain file:// link is shown instead.
base_url = os.environ.get("JUPYTERHUB_SERVICE_PREFIX")

if base_url is not None:
    # link into the JupyterLab file browser; the path is percent-encoded for the URL
    url = f"{base_url}lab/tree/{urllib.parse.quote(str(output_path))}"
else:
    url = f"file://{output_path.absolute()}"

# Markdown snippet with a clickable link to the result folder
output = (
    "# Inspect your analyses\n"
    f'You can find all the individual analysis scripts here: <a href="{url}">{url}</a>'
)

display(Markdown(output))

# 5. Generate Summary Statistics

In this section you can generate your custom summary statistics that combine the results of all experiment analyses. Just design the analysis script that you scaled above such that it outputs its results into local files. Here, these results can be loaded, merged together and further processed or visualized!

First, "result_growth-rate.csv" files are collected from all the analyzed chambers, then mean and std of growth rate are calculated

In [None]:
from pathlib import Path
import pandas as pd
import os, glob
import numpy as np

# Folder that holds one sub-folder per scaling run of the analysis script
directory = Path("./automated_executions") / stem

# Find latest folder (the run folder with the most recent modification time)
run_folders = glob.glob(os.path.join(directory, '*/'))
if not run_folders:
    # max() on an empty list would only report "empty sequence" - say what is missing instead
    raise FileNotFoundError(
        f"No execution folders found in '{directory.absolute()}'. "
        "Please run the scaling step (section 3) first!"
    )

latest_folder = Path(max(run_folders, key=os.path.getmtime))
print(latest_folder)

In [None]:
dfs = []

# Collect the growth rates of every execution folder. Folders without a
# "result_growth-rate.csv" (or without a "tmp" folder at all) are reported and skipped.
for sub_folder in latest_folder.glob("execution*"):
    data_folder = sub_folder / "tmp"
    growth_rate_file = data_folder / "result_growth-rate.csv"

    if not growth_rate_file.is_file():
        print(sub_folder.name, 'was not analyzed')
        continue

    sub_df = pd.read_csv(growth_rate_file, delimiter = ';')
    sub_df.loc[len(sub_df)] = {'m': sub_folder.name, 'b': sub_folder.name} # adding a new row (ImageID) to sub_df
    # keep only the 'm' column as a single row: [<row 0>, <row 1>, ImageID]
    # NOTE(review): assumes the csv holds exactly two rows (cell number, cell area) - confirm
    dfs.append(sub_df[['m']].T)

if not dfs:
    # pd.concat would fail with an unspecific "No objects to concatenate"
    raise ValueError(f"No 'result_growth-rate.csv' found in the executions of '{latest_folder}'")

joint_df = pd.concat(dfs, ignore_index=True)
joint_df.columns = ['Cell number', 'Cell area', 'ImageID']
print(joint_df)

# calculate mean and std of growth rate (np.std uses ddof=0, i.e. the population std)
mean = [np.mean(joint_df['Cell number']), np.mean(joint_df['Cell area'])]
std = [np.std(joint_df['Cell number']), np.std(joint_df['Cell area'])]

statistics_df = pd.DataFrame({'': ['mean', 'std'],
                              'Cell number': [mean[0], std[0]],
                              'Cell area': [mean[1], std[1]]})

# file names no longer start with a stray blank ('./ growth-rate...' created ' growth-rate...')
joint_df.to_csv('./growth-rate_summary.csv', decimal='.', sep=';')
statistics_df.to_csv('./growth-rate_mean-std.csv', decimal='.', sep=';')
print(statistics_df)

In [None]:
# Growth rate summary in the box plot

import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

# Box plot of the collected growth rates, drawn on an explicitly created 5x5 inch axes
fig, ax = plt.subplots(figsize=(5, 5))
sns.boxplot(data=joint_df, ax=ax)
ax.set_ylabel('Growth rate [h$^{-1}$]')
ax.grid()

Next, "result.csv" files are collected from all the analyzed chambers, then growth is characterized by population area, mean and std of single cell area

In [None]:
# Change parameters here
dt_image = 10/60   # image acquisition interval (hour)
t_start = 1   # analysis start (hour)
t_end = 2.5   # analysis end (hour)

from pathlib import Path
import pandas as pd
import os, glob
import numpy as np
import matplotlib.pyplot as plt

# if the result.csv file exists, take data
growth_df = pd.DataFrame()   # not filled in this cell
data_list_t = []             # not filled in this cell
pop_area_df = pd.DataFrame()
cell_area_mean_df = pd.DataFrame()
cell_area_std_df = pd.DataFrame()
time_column = None   # time axis, taken from the first analyzed chamber

for sub_folder in latest_folder.glob("execution*"):
    data_folder = sub_folder / "tmp"
    result_file = data_folder / "result.csv"

    # also covers executions that never created their "tmp" folder
    if not result_file.is_file():
        print(sub_folder.name, 'was not analyzed')
        continue

    sub_df = pd.read_csv(result_file, delimiter = ';')

    # The first assigned column defines the row index of the collecting data
    # frames, so the time axis is taken from that same chamber.
    if time_column is None:
        time_column = sub_df['time']

    # collect 'area_sum' & 'area_mean' & 'area_std' from all the analyzed chambers
    pop_area_df[sub_folder.name] = sub_df['area_sum']
    cell_area_mean_df[sub_folder.name] = sub_df['area_mean']
    cell_area_std_df[sub_folder.name] = sub_df['area_std']

if time_column is None:
    raise ValueError(f"No 'result.csv' found in the executions of '{latest_folder}'")

# Row-wise statistics across chambers. They are computed BEFORE the 'time' and
# 'mean' columns are added, otherwise those columns would be averaged in as well.
population_mean = pop_area_df.mean(axis=1)
population_std = pop_area_df.std(axis=1)

# save collected area_sum
pop_area_df.insert(0, 'time', time_column)
pop_area_df['mean'] = population_mean
pop_area_df['std'] = population_std
# file name no longer starts with a stray blank
pop_area_df.to_csv('./population-area_summary.csv', decimal='.', sep=';')

print('Analysis done!')

Lastly, "allcells.csv" files are collected from all the analyzed chambers

In [None]:
# Change parameters here
dt_image = 10/60   # image acquisition interval (hour)
t_start = 1   # analysis start (hour)
t_end = 2.5   # analysis end (hour)
t_snapshot = 2   # time point (hour) whose single-cell areas are exported

from pathlib import Path
import pandas as pd
import os, glob
import numpy as np
import matplotlib.pyplot as plt

# if the allcells.csv file exists, take single-cell area data
cell_area_df = pd.DataFrame()
cell_area_columns = {}   # execution name -> areas within [t_start, t_end]
data = []

for sub_folder in latest_folder.glob("execution*"):
    data_folder = sub_folder / "tmp"
    allcells_file = data_folder / "allcells.csv"

    # also covers executions that never created their "tmp" folder
    if not allcells_file.is_file():
        print(sub_folder.name, 'was not analyzed')
        continue

    sub_allcells_df = pd.read_csv(allcells_file, delimiter = ';')
    cell_time = sub_allcells_df['time']

    timed_sub_allcells_df = sub_allcells_df[(cell_time >= t_start) & (cell_time <= t_end)]
    # tolerant comparison: time stamps are floats and may miss the exact value by rounding
    filtered_allcells_df = sub_allcells_df[np.isclose(cell_time, t_snapshot)]

    # Drop the original row labels: they are positions inside this chamber's csv
    # and must not be used to align the cells of different chambers.
    cell_area_columns[sub_folder.name] = timed_sub_allcells_df['area'].reset_index(drop=True)

    data.extend({'image ID': sub_folder.name, 'area': area} for area in filtered_allcells_df['area'])

# one column per chamber; shorter chambers are padded with NaN instead of truncating longer ones
if cell_area_columns:
    cell_area_df = pd.concat(cell_area_columns, axis=1)

# explicit columns keep the csv header even when no cell matched
summary_df = pd.DataFrame(data, columns=['image ID', 'area'])
# file name follows t_snapshot ('single-cell_area_2h.csv' for 2) and no longer starts with a stray blank
summary_df.to_csv(f'./single-cell_area_{t_snapshot:g}h.csv', decimal='.', sep=';')