In [1]:
import pathlib

import pandas as pd
import statsmodels.stats
from statsmodels.formula.api import ols
from statsmodels.stats.anova import anova_lm
from statsmodels.stats.multicomp import pairwise_tukeyhsd
from statsmodels.stats.multitest import multipletests

In [None]:
# Input: the feature-selected whole-image endpoint profiles. strict=True makes
# the notebook fail here if the file is missing.
endpoint_path = pathlib.Path(
    "../../data/CP_feature_select/endpoint_whole_image/feature_selected_whole_image.parquet"
).resolve(strict=True)

# Outputs: the intensity feature table and the Tukey HSD results, both written
# further down in this notebook.
intensity_feature_path = pathlib.Path(
    "../data/0.ground_truth/annexinv_intensity_features_df.parquet"
).resolve()
tukey_results_path = pathlib.Path(
    "../data/0.ground_truth/tukey_results.parquet"
).resolve()

# Make sure the parent directory of every output exists before anything is saved.
for output_path in (intensity_feature_path, tukey_results_path):
    output_path.parent.mkdir(parents=True, exist_ok=True)

endpoint_df = pd.read_parquet(endpoint_path)
endpoint_df.head()

Unnamed: 0,Metadata_plate,Metadata_compound,Metadata_dose,Metadata_control,Metadata_Channel,Metadata_FOV,Metadata_FileLocation,Metadata_Time,Metadata_Well,Metadata_Z_slice,...,Texture_Correlation_DNA_3_02_256,Texture_DifferenceEntropy_DNA_3_02_256,Texture_DifferenceVariance_AnnexinV_3_02_256,Texture_DifferenceVariance_DNA_3_00_256,Texture_InfoMeas1_AnnexinV_3_03_256,Texture_InfoMeas1_DNA_3_01_256,Texture_InfoMeas2_AnnexinV_3_03_256,Texture_InverseDifferenceMoment_AnnexinV_3_00_256,Texture_SumVariance_AnnexinV_3_03_256,Texture_SumVariance_DNA_3_02_256
0,1,Staurosporine,1.22,test,,4,,14,C-04,1,...,1.337356,0.368165,0.593027,-0.134154,-1.445089,-1.062122,0.221441,2.019266,-0.663868,1.129909
1,1,Staurosporine,1.22,test,,2,,14,C-04,1,...,0.139861,1.193812,1.66816,-1.100615,0.093066,0.978298,-0.947193,1.953388,-1.935278,1.637002
2,1,Staurosporine,1.22,test,,3,,14,C-04,1,...,0.910705,0.849906,0.840446,-0.660441,-0.424044,-0.519855,-0.109288,1.308669,-0.873052,1.537121
3,1,Staurosporine,1.22,test,,1,,14,C-04,1,...,-0.095736,0.794491,0.059917,-0.659922,-1.294545,1.519666,0.094697,2.002679,-1.055458,1.022017
4,1,Staurosporine,2.44,test,,4,,14,E-05,1,...,2.716246,0.178535,-0.713072,-0.61322,-0.987307,-2.79086,0.875835,-0.011052,0.687659,1.741973


In [3]:
# Build the table used for the dose statistics: the dose column (grouping
# variable) plus the intensity measurements.
metadata_columns = [x for x in endpoint_df.columns if "Metadata_dose" in x]
# Select every column whose name contains "Intensity".
# NOTE(review): the filter does not restrict by channel, so intensity features
# from channels other than AnnexinV (e.g. DNA) would be kept as well, despite
# the variable name - confirm this is intended.
annexinV_columns = [x for x in endpoint_df.columns if "Intensity" in x]
annexinv_df = endpoint_df[metadata_columns + annexinV_columns]

# save the intensity feature df
annexinv_df.to_parquet(intensity_feature_path)

# Preview goes last: only the final expression of a cell is rendered.
annexinv_df.head()

Interesting result here; it should be faceted by channel.
I am interested in determining which dose is the most effective.

In [4]:
# Per-feature dose statistics: a one-way ANOVA followed by a Tukey HSD post hoc
# test, each using Metadata_dose as the grouping factor. The pairwise Tukey
# results of all features are stacked, corrected across features and saved.
feature_columns = [
    column for column in annexinv_df.columns if column != "Metadata_dose"
]

list_of_anova_results = []
list_of_tukey_results = []
for column in feature_columns:
    # omnibus test: fit feature ~ dose (dose treated as categorical)
    model = ols(f"{column} ~ C(Metadata_dose)", data=annexinv_df).fit()
    anova_results = anova_lm(model, typ=2).reset_index()
    anova_results["feature"] = column
    list_of_anova_results.append(anova_results)

    # post hoc test: pairwise comparison of the dose groups
    tukey = pairwise_tukeyhsd(
        endog=annexinv_df[column], groups=annexinv_df["Metadata_dose"], alpha=0.05
    )
    # summary() is the public accessor for the results table; its first row
    # holds the column headers and the remaining rows the pairwise comparisons
    header, *rows = tukey.summary().data
    tukey_results = pd.DataFrame(data=rows, columns=header)
    tukey_results["feature"] = column
    list_of_tukey_results.append(tukey_results)

# Omnibus ANOVA tables, one block of rows per feature. Kept in memory for
# inspection only; nothing below writes them to disk.
anova_df = pd.concat(list_of_anova_results, ignore_index=True)

# ignore_index gives every row a unique label instead of repeating 0..k for
# each feature in the saved table
df = pd.concat(list_of_tukey_results, ignore_index=True)

# correct for multiple testing across all features and dose pairs
# NOTE(review): "p-adj" is already the Tukey-adjusted p-value within each
# feature, so this applies Benjamini-Hochberg on top of it - confirm that the
# double adjustment is intended.
df["p-adj_bh"] = multipletests(df["p-adj"], method="fdr_bh")[1]

df.to_parquet(tukey_results_path)
df.head()

Unnamed: 0,group1,group2,meandiff,p-adj,lower,upper,reject,feature,p-adj_bh
0,0.0,0.61,0.4855,0.9945,-1.1743,2.1452,False,Intensity_LowerQuartileIntensity_AnnexinV,1.0
1,0.0,1.22,0.6877,0.9421,-0.972,2.3475,False,Intensity_LowerQuartileIntensity_AnnexinV,1.0
2,0.0,2.44,0.5774,0.9838,-1.1196,2.2744,False,Intensity_LowerQuartileIntensity_AnnexinV,1.0
3,0.0,4.88,0.1214,1.0,-1.5384,1.7811,False,Intensity_LowerQuartileIntensity_AnnexinV,1.0
4,0.0,9.77,1.3755,0.1961,-0.2842,3.0352,False,Intensity_LowerQuartileIntensity_AnnexinV,0.901367
