In [1]:
import pathlib
import warnings

import pandas as pd
import statsmodels.formula.api as smf

# Silence every warning category for the remainder of the session.
# Both calls install a blanket "ignore" filter.
warnings.filterwarnings("ignore")
warnings.simplefilter("ignore")

# `get_ipython` is only defined when the code runs under IPython; a plain
# Python interpreter raises NameError on the lookup instead.
try:
    cfg = get_ipython().config
except NameError:
    in_notebook = False
else:
    in_notebook = True

# Choose the progress-bar implementation that matches the front end.
if not in_notebook:
    from tqdm import tqdm
else:
    from tqdm.notebook import tqdm
# Start the search from the directory the notebook is executed in.
cwd = pathlib.Path.cwd()

# Walk from the working directory up through its ancestors and keep the first
# directory that contains a `.git` folder; fall back to None if there is none.
root_dir = next(
    (
        candidate
        for candidate in (cwd, *cwd.parents)
        if (candidate / ".git").is_dir()
    ),
    None,
)

# Abort early: every path below is built relative to the repository root.
if root_dir is None:
    raise FileNotFoundError("No Git root directory found.")

In [2]:
# Metadata columns that both profile types have in common; each profile adds
# one extra column of its own below.
_shared_metadata_columns = [
    "patient",
    "object_id",
    "unit",
    "dose",
    "treatment",
    "Target",
    "Class",
    "image_set",
    "Well",
    "Therapeutic Categories",
]

# Maps each profile name to the parquet file to read (resolved against the
# Git root) and the list of columns treated as metadata for that profile.
profile_dict = {
    "organoid_fs": {
        "input_profile_path": pathlib.Path(
            root_dir, "5.EDA/results/linear_modeling/organoid_fs.parquet"
        ),
        "metadata_columns": [*_shared_metadata_columns, "single_cell_count"],
    },
    "single_cell_fs": {
        "input_profile_path": pathlib.Path(
            root_dir, "5.EDA/results/linear_modeling/sc_fs.parquet"
        ),
        "metadata_columns": [*_shared_metadata_columns, "parent_organoid"],
    },
}

## Filter significant features
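The p-value threshold is set to 0.05 - only statistically significant features are kept    
The R-squared threshold is set to 0.4 - the model must explain more than 40% of the total variance    
The adjusted R-squared threshold is set to 0 - only positive values are kept, i.e. the model performs better than predicting the mean    
The absolute coefficient threshold is set to 1 - only features whose coefficient magnitude is greater than 1 are kept    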


In [53]:
# Read the table stored at the configured single-cell input path and report
# its (rows, columns) shape.
single_cell_profile_path = profile_dict["single_cell_fs"]["input_profile_path"]
df = pd.read_parquet(single_cell_profile_path)
print(df.shape)

(33143, 14)


In [59]:
# Cut-offs used by the feature filter in the next cell. The filter applies
# strict comparisons, so a value equal to a threshold does not pass.
coefficient_threshold_min = 1  # lower bound on the absolute coefficient
rsquared_adj_threshold_min = 0  # lower bound on adjusted R-squared
rsquared_threshold_min = 0.4  # lower bound on R-squared
pvalue_threshold_max = 0.05  # upper bound on the p-value

In [60]:
# filter significant features
# One boolean mask per criterion; a row is kept only if it passes all four.
passes_pvalue = df["pvalue"].lt(pvalue_threshold_max)
passes_rsquared = df["rsquared"].gt(rsquared_threshold_min)
passes_rsquared_adj = df["rsquared_adj"].gt(rsquared_adj_threshold_min)
passes_coefficient = df["coefficient"].abs().gt(coefficient_threshold_min)

# Copy the selection so later edits to df_filtered never touch df.
df_filtered = df.loc[
    passes_pvalue & passes_rsquared & passes_rsquared_adj & passes_coefficient
].copy()
print(df_filtered.shape)
df_filtered.head()

(24, 14)


Unnamed: 0,patient,treatment,feature,rsquared,rsquared_adj,fvalue,pvalue,coefficient,intercept,Feature_type,Compartment,Channel,Measurement,Extra_info
7094,NF0016,Ketotifen,Colocalization_Nuclei_AGPMito_MEANOVERLAPCOEFF,0.419794,0.412719,59.329163,2.705052e-11,3.260447,-3.989864e-16,Colocalization,Nuclei,AGPMito,MEANOVERLAPCOEFF,
7295,NF0016,Ketotifen,Intensity_Cytoplasm_Mito_MININTENSITY,0.493712,0.487538,79.963279,9.377955e-14,2.996913,-2.255141e-16,Intensity,Cytoplasm,Mito,MININTENSITY,
10148,NF0018,Copanlisib,Colocalization_Nuclei_DNAMito_MINK2,0.495091,0.491585,141.199938,4.00885e-23,47.083873,1.595946e-16,Colocalization,Nuclei,DNAMito,MINK2,
10224,NF0018,Copanlisib,Colocalization_Cell_DNAMito_MINK2,0.504153,0.50071,146.412102,1.0786620000000001e-23,59.369253,-1.05211e-15,Colocalization,Cell,DNAMito,MINK2,
10226,NF0018,Copanlisib,Intensity_Cell_AGP_CMIX,0.569113,0.566121,190.19458,4.1335560000000005e-28,80.03613,2.706169e-16,Intensity,Cell,AGP,CMIX,


In [61]:
# Distinct treatments that still have at least one feature after filtering.
pd.unique(df_filtered["treatment"])

array(['Ketotifen', 'Copanlisib', 'Selumetinib'], dtype=object)

In [62]:
# Distinct patients that still have at least one feature after filtering.
pd.unique(df_filtered["patient"])

array(['NF0016', 'NF0018', 'NF0030'], dtype=object)

In [63]:
# Distinct feature names that passed every filter criterion.
pd.unique(df_filtered["feature"])

array(['Colocalization_Nuclei_AGPMito_MEANOVERLAPCOEFF',
       'Intensity_Cytoplasm_Mito_MININTENSITY',
       'Colocalization_Nuclei_DNAMito_MINK2',
       'Colocalization_Cell_DNAMito_MINK2', 'Intensity_Cell_AGP_CMIX',
       'Intensity_Cell_AGP_CMIY', 'Intensity_Cell_ER_CMIX',
       'Granularity_Cell_AGP_GRANULARITY1',
       'Granularity_Cell_AGP_GRANULARITY2',
       'Granularity_Cell_Mito_GRANULARITY1',
       'AreaSizeShape_Cytoplasm_VOLUME',
       'Colocalization_Cytoplasm_DNAER_MEANK2',
       'Colocalization_Cytoplasm_ERMito_MAXMANDERSCOEFFCOSTESM2',
       'Intensity_Cytoplasm_AGP_CMIX', 'Intensity_Cytoplasm_Mito_CMIZ',
       'Granularity_Cytoplasm_AGP_GRANULARITY1',
       'Granularity_Cytoplasm_Mito_GRANULARITY1',
       'Intensity_Nuclei_Mito_MEANINTENSITYEDGE',
       'Intensity_Nuclei_Mito_MININTENSITY',
       'Colocalization_Cytoplasm_AGPMito_MEANCORRELATIONCOEFF',
       'Colocalization_Cytoplasm_AGPMito_MINK2',
       'Colocalization_Cytoplasm_ERMito_MINK2'], dt