In [1]:
import argparse
import pathlib
import random

import numpy as np
import pandas as pd
from copairs import map
from copairs.matching import assign_reference_index

# check if in a jupyter notebook
try:
    cfg = get_ipython().config
    in_notebook = True
except NameError:
    in_notebook = False
import warnings

import pycytominer.aggregate
import tqdm

# Suppress all RuntimeWarnings
warnings.filterwarnings("ignore", category=RuntimeWarning)

#

In [2]:
if not in_notebook:
    # setup the argument parser
    parser = argparse.ArgumentParser(
        description="Generate a map for differing cell counts"
    )

    parser.add_argument(
        "--percentage", type=float, help="Percentage of wells to use for the map file"
    )
    parser.add_argument("--seed", type=int, help="Seed for the random number generator")
    parser.add_argument(
        "--shuffle", action="store_true", help="Shuffle the order of the wells"
    )
    # parse the arguments
    args = parser.parse_args()
    percentage = args.percentage
    set_seed = args.seed
    shuffle = args.shuffle
else:
    percentage = 0.4
    set_seed = 0
    shuffle = False

output_file = pathlib.Path(
    f"../results/mAP_cell_percentages/{percentage}_{set_seed}_{shuffle}.parquet"
)
output_file.parent.mkdir(exist_ok=True, parents=True)

In [3]:
def run_mAP_across_time(
    df: pd.DataFrame,
):
    """
    Run mAP across timepoints specifies and hardcoded columns for this data

    Parameters
    ----------
    df : pd.DataFrame
        An aggregated dataframe with metadata and features and temporal information

    Returns
    -------
    dict
        A dictionary of dataframes with the mAP results for each
        timepoint.
    """
    unique_timepoints = df.Metadata_Time.unique()
    dict_of_map_dfs = {}
    for timepoint in unique_timepoints:
        single_time_df = df.loc[df.Metadata_Time == timepoint]
        reference_col = "Metadata_reference_index"
        df_activity = assign_reference_index(
            single_time_df,
            "Metadata_treatment == 'DMSO CTL'",
            reference_col=reference_col,
            default_value=-1,
        )
        pos_sameby = ["Metadata_treatment", reference_col]
        pos_diffby = []
        neg_sameby = []
        neg_diffby = ["Metadata_treatment", reference_col]
        metadata = df_activity.filter(regex="Metadata")
        profiles = df_activity.filter(regex="^(?!Metadata)").values

        activity_ap = map.average_precision(
            metadata, profiles, pos_sameby, pos_diffby, neg_sameby, neg_diffby
        )

        activity_ap = activity_ap.query("Metadata_treatment != 'DMSO CTL'")
        activity_map = map.mean_average_precision(
            activity_ap, pos_sameby, null_size=1000000, threshold=0.05, seed=0
        )
        activity_map["-log10(p-value)"] = -activity_map["corrected_p_value"].apply(
            np.log10
        )
        # flatten the multi-index columns to make it easier to work with
        dict_of_map_dfs[timepoint] = activity_map
    return dict_of_map_dfs

In [4]:
sc_metadata_cols_to_drop = [
    "Metadata_ImageNumber",
    "Metadata_Cells_Number_Object_Number",
    "Metadata_Cytoplasm_Parent_Cells",
    "Metadata_Cytoplasm_Parent_Nuclei",
    "Metadata_ImageNumber_1",
    "Metadata_ImageNumber_2",
    "Metadata_ImageNumber_3",
    "Metadata_Nuclei_Number_Object_Number",
    "Metadata_Image_FileName_BF",
    "Metadata_Image_FileName_CL488",
    "Metadata_Image_FileName_CL561",
    "Metadata_Image_FileName_DNA",
    "Metadata_Image_FileName_GSDM",
    "Metadata_Image_PathName_BF",
    "Metadata_Image_PathName_CL488",
    "Metadata_Image_PathName_CL561",
    "Metadata_Image_PathName_DNA",
    "Metadata_Image_PathName_GSDM",
    "Metadata_Nuclei_Location_Center_X",
    "Metadata_Nuclei_Location_Center_Y",
    "Metadata_number_of_singlecells",
    "Metadata_FOV",
]

In [5]:
data_file_path = pathlib.Path(
    "../../4.processing_profiled_features/data/preprocessed_data/live_cell_pyroptosis_wave1_sc_first_time_norm_fs.parquet"
).resolve(strict=True)
df = pd.read_parquet(data_file_path)
df.reset_index(drop=True, inplace=True)
df = df[~df.Metadata_serum.str.contains("NuSerum")]

df.head()

Unnamed: 0,Metadata_treatment,Metadata_Well,Metadata_number_of_singlecells,Metadata_FOV,Metadata_treatment1,Metadata_Time,Metadata_treatment2,Metadata_treatment1_dose,Metadata_treatment1_unit,Metadata_treatment2_dose,...,Nuclei_Texture_InfoMeas2_BF_3_03_256,Nuclei_Texture_InverseDifferenceMoment_CL488_3_00_256,Nuclei_Texture_InverseDifferenceMoment_CL561_3_01_256,Nuclei_Texture_InverseDifferenceMoment_DNA_3_00_256,Nuclei_Texture_InverseDifferenceMoment_GSDM_3_01_256,Nuclei_Texture_SumAverage_BF_3_03_256,Nuclei_Texture_SumAverage_CL488_3_01_256,Nuclei_Texture_SumAverage_CL561_3_03_256,Nuclei_Texture_SumAverage_GSDM_3_00_256,Metadata_cells_per_well
14515,DMSO CTL,C05,417,5,DMSO,0,,CTL,,,...,0.889628,0.518516,-1.509714,0.972317,0.92065,-0.175937,-0.249682,-0.970753,0.598953,51378
14516,DMSO CTL,C05,417,5,DMSO,0,,CTL,,,...,0.968803,0.509346,0.212119,-0.719591,-1.168241,0.910783,-0.310844,0.528903,-0.675315,51378
14517,DMSO CTL,C05,417,5,DMSO,0,,CTL,,,...,-0.887092,0.274846,0.810185,-0.215096,-1.168241,-0.199008,-0.222628,0.186214,-0.675315,51378
14518,DMSO CTL,C05,417,5,DMSO,0,,CTL,,,...,-0.209219,1.141047,-1.509714,-0.791934,0.706648,-0.051787,-0.391319,-0.970753,0.032265,51378
14519,DMSO CTL,C05,417,5,DMSO,0,,CTL,,,...,-0.448472,0.759349,0.573115,0.089354,-1.168241,0.72226,-0.351976,0.281935,-0.675315,51378


In [6]:
random.seed(set_seed)
subset_df = df.groupby(["Metadata_Time", "Metadata_treatment"]).apply(
    lambda x: x.sample(frac=percentage, random_state=set_seed),
    include_groups=False,
)
if shuffle:
    random.seed(0)
    # permutate the data
    for col in subset_df.columns:
        subset_df[col] = np.random.permutation(subset_df[col])
metadata_cols = [cols for cols in df.columns if "Metadata" in cols]
features_cols = [cols for cols in df.columns if "Metadata" not in cols]
features_cols = features_cols + ["Metadata_number_of_singlecells"]
aggregate_df = pycytominer.aggregate(
    population_df=df,
    strata=["Metadata_Well", "Metadata_Time"],
    features=features_cols,
    operation="median",
)
# Drop metadata columns
metadata_cols = [x for x in metadata_cols if x not in sc_metadata_cols_to_drop]

metadata_df = df[metadata_cols]
metadata_df = metadata_df.drop_duplicates()
aggregate_df = pd.merge(
    metadata_df, aggregate_df, on=["Metadata_Well", "Metadata_Time"]
)
dict_of_map_dfs = run_mAP_across_time(aggregate_df)
output_df = pd.concat(dict_of_map_dfs.values(), keys=dict_of_map_dfs.keys())
output_df.reset_index(inplace=True)
output_df.rename(columns={"level_0": "Metadata_Time"}, inplace=True)
# add the percentage of cells to the keys
output_df["percentage_of_cells"] = percentage
output_df["seed"] = set_seed
output_df["shuffle"] = shuffle
output_df.reset_index(drop=True, inplace=True)
output_df.to_parquet(output_file)
output_df.head()

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/22 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/22 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/22 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/22 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/22 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/22 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/22 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/22 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/22 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/22 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/22 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/22 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/22 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/22 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/22 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/22 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/22 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/22 [00:00<?, ?it/s]

Unnamed: 0,Metadata_Time,Metadata_treatment,Metadata_reference_index,mean_average_precision,indices,p_value,corrected_p_value,below_p,below_corrected_p,-log10(p-value),percentage_of_cells,seed,shuffle
0,0,Ab1-42 0.4 uM,-1,1.0,"[14, 43, 47]",9.99999e-07,1e-06,True,True,5.833669,0.4,0,False
1,0,Ab1-42 10 uM,-1,1.0,"[18, 31, 62]",9.99999e-07,1e-06,True,True,5.833669,0.4,0,False
2,0,Ab1-42 2 uM,-1,1.0,"[17, 33, 57]",9.99999e-07,1e-06,True,True,5.833669,0.4,0,False
3,0,Flagellin 0.1 ug/ml,-1,0.861111,"[1, 3, 9]",0.0998809,0.122077,False,False,0.913367,0.4,0,False
4,0,Flagellin 1 ug/ml,-1,0.619444,"[4, 20, 30]",0.3995696,0.418597,False,False,0.378204,0.4,0,False
