In [1]:
import pathlib

import pandas as pd

In [2]:
def _read_tracks(track_file):
    """Read one track parquet file and tag its rows with a well/FOV key.

    The key is built from the first two underscore-separated pieces of the
    file name, joined with "_" (e.g. "C-02_F0001").
    """
    tracks = pd.read_parquet(track_file)
    name_parts = track_file.name.split("_")
    return tracks.assign(well_fov=name_parts[0] + "_" + name_parts[1])


# Input directory must already exist (strict=True raises otherwise).
stats_path = pathlib.Path("../../data/cell_tracks_data").resolve(strict=True)

# Output location; its parent directory is created on demand.
output_combined_stats_file_path = pathlib.Path(
    "../data/combined_stats.parquet"
).resolve()
output_combined_stats_file_path.parent.mkdir(parents=True, exist_ok=True)

# Every parquet file in the input directory, in sorted order.
files = sorted(stats_path.glob("*.parquet"))

# Stack all per-file track tables into a single frame with a fresh index.
stats_df = pd.concat([_read_tracks(f) for f in files], ignore_index=True)
print(stats_df.shape)
stats_df.head()

(237578, 8)


Unnamed: 0,track_id,t,y,x,id,parent_track_id,parent_id,well_fov
0,1,0.0,17.0,1273.0,1000005.0,-1,-1.0,C-02_F0001
1,1,1.0,41.0,1308.0,2000010.0,-1,1000005.0,C-02_F0001
2,1,2.0,38.0,1292.0,3000009.0,-1,2000010.0,C-02_F0001
3,1,3.0,36.0,1292.0,4000009.0,-1,3000009.0,C-02_F0001
4,1,4.0,34.0,1289.0,5000009.0,-1,4000009.0,C-02_F0001


In [3]:
# Attach the dose of each well/FOV to the tracking table.
profile = pd.read_parquet(
    pathlib.Path(
        "../../data/CP_feature_select/profiles/features_selected_profile.parquet"
    ).resolve(strict=True)
)
# Build the same "<well>_F<fov>" key that the tracking table carries in
# its well_fov column.
profile["well_fov"] = profile["Metadata_Well"] + "_F" + profile["Metadata_FOV"]

# One row per distinct (well_fov, dose) pair.
dose_df = profile[["well_fov", "Metadata_dose"]].drop_duplicates()

# Left merge keeps every track row; rows with no matching well_fov get a
# missing dose.
# - Dropping any existing Metadata_dose first keeps this cell re-runnable:
#   without it a second run would produce Metadata_dose_x / Metadata_dose_y.
# - validate="many_to_one" raises MergeError if a well_fov maps to more than
#   one dose, instead of silently duplicating track rows.
stats_df = stats_df.drop(columns=["Metadata_dose"], errors="ignore").merge(
    dose_df, on="well_fov", how="left", validate="many_to_one"
)
print(stats_df.shape)
stats_df.head()

(237578, 9)


Unnamed: 0,track_id,t,y,x,id,parent_track_id,parent_id,well_fov,Metadata_dose
0,1,0.0,17.0,1273.0,1000005.0,-1,-1.0,C-02_F0001,0.0
1,1,1.0,41.0,1308.0,2000010.0,-1,1000005.0,C-02_F0001,0.0
2,1,2.0,38.0,1292.0,3000009.0,-1,2000010.0,C-02_F0001,0.0
3,1,3.0,36.0,1292.0,4000009.0,-1,3000009.0,C-02_F0001,0.0
4,1,4.0,34.0,1289.0,5000009.0,-1,4000009.0,C-02_F0001,0.0


In [4]:
# For every (well_fov, dose, timepoint) group, count how many distinct
# track_ids are present and store it as num_tracked_cells.
group_keys = ["well_fov", "Metadata_dose", "t"]
stats_summary = (
    stats_df.groupby(group_keys)
    .agg(num_tracked_cells=("track_id", "nunique"))
    .reset_index()
)
stats_summary.head()

Unnamed: 0,well_fov,Metadata_dose,t,num_tracked_cells
0,C-02_F0001,0.0,0.0,36
1,C-02_F0001,0.0,1.0,135
2,C-02_F0001,0.0,2.0,146
3,C-02_F0001,0.0,3.0,146
4,C-02_F0001,0.0,4.0,148


In [5]:
# Number of profile rows per (well_fov, dose, timepoint) group.
# .size() counts every row in the group, whereas counting a specific column
# (previously Metadata_plate) skips rows where that column is null.
profile_stats = (
    profile.groupby(["well_fov", "Metadata_dose", "Metadata_Time"])
    .size()
    .reset_index(name="total_CP_cells")
)
profile_stats.head()

Unnamed: 0,well_fov,Metadata_dose,Metadata_Time,total_CP_cells
0,C-02_F0001,0.0,0.0,36
1,C-02_F0001,0.0,1.0,132
2,C-02_F0001,0.0,2.0,141
3,C-02_F0001,0.0,3.0,141
4,C-02_F0001,0.0,4.0,142


In [6]:
# Pair the per-timepoint track counts with the per-timepoint profile counts.
# The inner join keeps only (well_fov, dose, time) combinations present in
# both tables; "t" is dropped afterwards because Metadata_Time holds the same
# value for every joined row.
# cells_not_tracked = total_CP_cells - num_tracked_cells, so it is negative
# wherever there are more tracks than profile rows.
final_stats = (
    stats_summary.merge(
        profile_stats,
        left_on=["well_fov", "Metadata_dose", "t"],
        right_on=["well_fov", "Metadata_dose", "Metadata_Time"],
        how="inner",
    )
    .drop(columns=["t"])
    .assign(
        cells_not_tracked=lambda merged: (
            merged["total_CP_cells"] - merged["num_tracked_cells"]
        )
    )
)
# Persist the combined table without the positional index.
final_stats.to_parquet(output_combined_stats_file_path, index=False)
final_stats.head()

Unnamed: 0,well_fov,Metadata_dose,num_tracked_cells,Metadata_Time,total_CP_cells,cells_not_tracked
0,C-02_F0001,0.0,36,0.0,36,0
1,C-02_F0001,0.0,135,1.0,132,-3
2,C-02_F0001,0.0,146,2.0,141,-5
3,C-02_F0001,0.0,146,3.0,141,-5
4,C-02_F0001,0.0,148,4.0,142,-6
