# Aggregate the single-cell profiles to the well level
This notebook is not executed end-to-end because aggregating the single-cell profiles requires a large amount of RAM. It is provided for reference only.

In [1]:
import pathlib

import pandas as pd
import pycytominer

In [2]:
# Root directory where the combined parquet files are located
data_dir = pathlib.Path("../data")
# Aggregated profiles are written to a subdirectory of the data root.
# Derived from data_dir so the two locations cannot drift apart.
aggregate_dir = data_dir / "aggregated"
# Create the output directory up front; a no-op when it already exists
aggregate_dir.mkdir(parents=True, exist_ok=True)

In [3]:
# dictionary with each run for the cell type
# Maps each profile name to its four paths:
#   "normalized"           - input: normalized single-cell profiles
#   "selected"             - input: feature-selected single-cell profiles
#   "aggregate_normalized" - output: aggregated normalized profiles
#   "aggregate_selected"   - output: aggregated feature-selected profiles
# Input paths are resolved with strict=True, so building this dictionary raises
# FileNotFoundError right away if any input file is missing. Output paths are
# resolved non-strictly because the files do not have to exist yet.
# The file names differ only by the profile name, so a single template is used
# for every profile instead of repeating the block once per profile.
dict_of_inputs = {
    profile: {
        "normalized": pathlib.Path(
            f"{data_dir}/normalized_data/live_cell_pyroptosis_wave1_sc_{profile}_norm.parquet"
        ).resolve(strict=True),
        "selected": pathlib.Path(
            f"{data_dir}/feature_selected_data/live_cell_pyroptosis_wave1_sc_{profile}_norm_fs.parquet"
        ).resolve(strict=True),
        "aggregate_normalized": pathlib.Path(
            f"{aggregate_dir}/live_cell_pyroptosis_wave1_{profile}_norm_agg.parquet"
        ).resolve(),
        "aggregate_selected": pathlib.Path(
            f"{aggregate_dir}/live_cell_pyroptosis_wave1_{profile}_norm_fs_agg.parquet"
        ).resolve(),
    }
    for profile in ("first_time", "pan_time", "within_time")
}

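The aggregation loop further below (the cell without an execution count) must be run as a script on an HPC cluster with sufficient memory. The next two cells only load and inspect a small, already-aggregated testing subset.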

In [5]:
# Load an already-aggregated testing subset to inspect what the aggregated
# output looks like.
path = pathlib.Path(
    "../data/preprocessed/live_cell_pyroptosis_wave1_first_time_norm_agg_subset_testing_data.parquet"
)
# strict=True raises FileNotFoundError here if the subset file is missing
path = path.resolve(strict=True)
df = pd.read_parquet(path)
# Bare last expression so the notebook renders the frame
df

Unnamed: 0,Metadata_Well,Metadata_Plate,Cytoplasm_AreaShape_Area,Cytoplasm_AreaShape_BoundingBoxArea,Cytoplasm_AreaShape_BoundingBoxMaximum_X,Cytoplasm_AreaShape_BoundingBoxMaximum_Y,Cytoplasm_AreaShape_BoundingBoxMinimum_X,Cytoplasm_AreaShape_BoundingBoxMinimum_Y,Cytoplasm_AreaShape_Center_X,Cytoplasm_AreaShape_Center_Y,...,Nuclei_AreaShape_SpatialMoment_0_2,Nuclei_AreaShape_SpatialMoment_0_3,Nuclei_AreaShape_SpatialMoment_1_0,Nuclei_AreaShape_SpatialMoment_1_1,Nuclei_AreaShape_SpatialMoment_1_2,Nuclei_AreaShape_SpatialMoment_1_3,Nuclei_AreaShape_SpatialMoment_2_0,Nuclei_AreaShape_SpatialMoment_2_1,Nuclei_AreaShape_SpatialMoment_2_2,Nuclei_AreaShape_SpatialMoment_2_3
0,C04,20241024T194653,-0.322354,-0.317583,-0.020861,-0.023846,-0.021816,-0.036300,-0.022619,-0.031397,...,,,,,,,,,,
1,C04,20241024T224849,-0.300212,-0.307219,-0.009216,-0.044054,-0.003894,-0.035388,-0.001874,-0.038150,...,,,,,,,,,,
2,C04,20241025T015040,-0.277588,-0.268199,-0.012799,-0.009149,-0.004790,-0.016236,-0.008646,-0.011762,...,,,,,,,,,,
3,C04,20241025T045229,-0.298768,-0.326729,-0.004737,-0.173571,0.002378,-0.162153,-0.001047,-0.167759,...,,,,,,,,,,
4,C04,20241025T075429,-0.307432,-0.315754,-0.010111,-0.083552,0.005963,-0.081899,-0.002286,-0.077519,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1803,N11,20241026T134545,-0.370008,-0.429460,-0.004737,0.043209,0.002378,0.048514,-0.004462,0.044706,...,,,,,,,,,,
1804,N11,20241026T164425,-0.341608,-0.413303,0.011387,-0.010067,0.016716,-0.007117,0.008981,-0.008165,...,,,,,,,,,,
1805,N11,20241026T194305,-0.357974,-0.409645,0.031990,0.012897,0.031950,0.025715,0.027359,0.018160,...,,,,,,,,,,
1806,N11,20241028T111908,1.080795,0.864893,0.131421,-0.019253,0.074066,-0.091018,0.106226,-0.055648,...,,,,,,,,,,


In [6]:
# List the distinct plate identifiers present in the testing subset.
# NOTE(review): the recorded output includes the value 'pyroptosis' next to the
# timestamp-style plate names — confirm whether that entry is expected.
pd.unique(df["Metadata_Plate"])

array(['20241024T194653', '20241024T224849', '20241025T015040',
       '20241025T045229', '20241025T075429', '20241025T105738',
       '20241025T135621', '20241025T165502', '20241025T195339',
       '20241025T225218', '20241026T015058', '20241026T044935',
       '20241026T074814', '20241026T104653', '20241026T134545',
       '20241026T164425', '20241026T194305', '20241028T111908',
       'pyroptosis'], dtype=object)

In [None]:
# Aggregate every profile in dict_of_inputs, once for the normalized
# single-cell data and once for the feature-selected data.
# Each entry: (input key, output key, label used in the printed messages).
aggregation_steps = (
    ("normalized", "aggregate_normalized", "normalized"),
    ("selected", "aggregate_selected", "selected"),
)

for profile_paths in dict_of_inputs.values():
    for input_key, output_key, label in aggregation_steps:
        # Load the single-cell data for this step
        sc_df = pd.read_parquet(profile_paths[input_key])

        # Median of the inferred feature columns, grouped by the
        # Metadata_Well / Metadata_Plate strata
        aggregate_df = pycytominer.aggregate(
            population_df=sc_df,
            strata=["Metadata_Well", "Metadata_Plate"],
            features="infer",
            operation="median",
        )
        print(f"{label.capitalize()} data shape: {sc_df.shape}")
        print(f"Aggregated {label} data shape: {aggregate_df.shape}")

        # Save the aggregated data
        aggregate_df.to_parquet(profile_paths[output_key])

        # Drop both frames before the next file is read; otherwise the old
        # frame stays referenced while the new one loads, so both would be in
        # memory at the same time.
        del sc_df, aggregate_df