In [1]:
import pathlib
import warnings

import pandas as pd
import statsmodels.formula.api as smf

# Silence every warning for the rest of the session so cell outputs stay
# readable. One catch-all filter is enough: warnings.filterwarnings("ignore")
# and warnings.simplefilter("ignore") register the same filter entry.
# NOTE(review): this also hides deprecation/numerical warnings from the
# imported libraries — narrow the filter if those should stay visible.
warnings.filterwarnings("ignore")

# Probe for an IPython session: a NameError means `get_ipython` is not
# defined, which is treated here as "not running inside a notebook".
try:
    cfg = get_ipython().config
    in_notebook = True
except NameError:
    in_notebook = False

# Import the progress-bar implementation that matches the detected front end.
if in_notebook:
    from tqdm.notebook import tqdm
else:
    from tqdm import tqdm
# Locate the repository root: the closest directory, starting at the current
# working directory and walking upwards, that contains a ".git" directory.
cwd = pathlib.Path.cwd()
root_dir = next(
    (
        candidate
        for candidate in (cwd, *cwd.parents)
        if (candidate / ".git").is_dir()
    ),
    None,
)

# Stop immediately when no enclosing Git checkout exists, because every
# path built from root_dir would otherwise be invalid.
if root_dir is None:
    raise FileNotFoundError("No Git root directory found.")

In [2]:
# Metadata columns that both profile levels have in common; each level
# appends its own extra column below.
_shared_metadata_columns = [
    "patient",
    "object_id",
    "unit",
    "dose",
    "treatment",
    "Target",
    "Class",
    "image_set",
    "Well",
    "Therapeutic Categories",
]

# Per-profile configuration: where the parquet file lives (relative to the
# repository root) and which columns are metadata.
profile_dict = {
    "organoid_fs": {
        "input_profile_path": pathlib.Path(
            root_dir, "5.EDA/results/linear_modeling/organoid_fs.parquet"
        ),
        "metadata_columns": [*_shared_metadata_columns, "single_cell_count"],
    },
    "single_cell_fs": {
        "input_profile_path": pathlib.Path(
            root_dir, "5.EDA/results/linear_modeling/sc_fs.parquet"
        ),
        "metadata_columns": [*_shared_metadata_columns, "parent_organoid"],
    },
}

## Filter significant features
A feature is kept only when it passes all of the thresholds defined in the code cells below:

- p-value below 0.05 - the fit is statistically significant
- R-squared above 0.4 - the model explains more than 40% of the total variance
- adjusted R-squared above 0 - the model performs better than the mean
- absolute value of the coefficient above 1


### Single Cell

In [3]:
# Read the single-cell results table from the path registered in
# profile_dict, then report its (rows, columns) shape.
single_cell_results_path = profile_dict["single_cell_fs"]["input_profile_path"]
df = pd.read_parquet(single_cell_results_path)
print(df.shape)

(45981, 14)


In [4]:
# Cut-offs used by the filtering cell that follows (all comparisons there
# are strict).

# Upper bound on the p-value
pvalue_threshold_max = 0.05

# Lower bound on the absolute value of the coefficient
coefficient_threshold_min = 1

# Lower bounds on R-squared and adjusted R-squared
rsquared_threshold_min = 0.4
rsquared_adj_threshold_min = 0

In [5]:
# Build one boolean mask per criterion, then keep the rows passing all of them.
passes_pvalue = df["pvalue"].lt(pvalue_threshold_max)
passes_rsquared = df["rsquared"].gt(rsquared_threshold_min)
passes_rsquared_adj = df["rsquared_adj"].gt(rsquared_adj_threshold_min)
passes_coefficient = df["coefficient"].abs().gt(coefficient_threshold_min)

keep_rows = (
    passes_pvalue & passes_rsquared & passes_rsquared_adj & passes_coefficient
)

# .copy() detaches the selection from `df`
df_filtered = df.loc[keep_rows].copy()
print(df_filtered.shape)
df_filtered.head()

(35, 14)


Unnamed: 0,patient,treatment,feature,rsquared,rsquared_adj,fvalue,pvalue,coefficient,intercept,Feature_type,Compartment,Channel,Measurement,Extra_info
9840,NF0016,Ketotifen,Colocalization_Nuclei_AGPMito_MEANOVERLAPCOEFF,0.419794,0.412719,59.329163,2.705052e-11,3.260447,-3.989864e-16,Colocalization,Nuclei,AGPMito,MEANOVERLAPCOEFF,
10117,NF0016,Ketotifen,Intensity_Cytoplasm_Mito_MININTENSITY,0.493712,0.487538,79.963279,9.377955e-14,2.996913,-2.255141e-16,Intensity,Cytoplasm,Mito,MININTENSITY,
14072,NF0018,Copanlisib,Colocalization_Nuclei_DNAMito_MINK2,0.495091,0.491585,141.199938,4.00885e-23,47.083873,1.595946e-16,Colocalization,Nuclei,DNAMito,MINK2,
14081,NF0018,Copanlisib,Intensity_Nuclei_AGP_CMIX,0.459907,0.456156,122.620633,5.308751e-21,16.842723,4.683753e-17,Intensity,Nuclei,AGP,CMIX,
14177,NF0018,Copanlisib,Colocalization_Cell_DNAMito_MINK2,0.504153,0.50071,146.412102,1.0786620000000001e-23,59.369253,-1.05211e-15,Colocalization,Cell,DNAMito,MINK2,


In [6]:
# Distinct treatments that still have at least one row after filtering
df_filtered["treatment"].unique()

array(['Ketotifen', 'Copanlisib', 'Selumetinib'], dtype=object)

In [7]:
# Distinct patients that still have at least one row after filtering
df_filtered["patient"].unique()

array(['NF0016', 'NF0018', 'NF0030'], dtype=object)

In [8]:
# Distinct feature names that passed the filter
df_filtered["feature"].unique()

array(['Colocalization_Nuclei_AGPMito_MEANOVERLAPCOEFF',
       'Intensity_Cytoplasm_Mito_MININTENSITY',
       'Colocalization_Nuclei_DNAMito_MINK2', 'Intensity_Nuclei_AGP_CMIX',
       'Colocalization_Cell_DNAMito_MINK2', 'Intensity_Cell_AGP_CMIX',
       'Intensity_Cell_AGP_CMIY', 'Intensity_Cell_AGP_CMIZ',
       'Intensity_Cell_AGP_IX', 'Intensity_Cell_ER_CMIX',
       'Intensity_Cell_Mito_CMIZ', 'Granularity_Cell_AGP_GRANULARITY1',
       'Granularity_Cell_AGP_GRANULARITY2',
       'Granularity_Cell_Mito_GRANULARITY1',
       'AreaSizeShape_Cytoplasm_VOLUME',
       'Colocalization_Cytoplasm_DNAER_MEANK2',
       'Colocalization_Cytoplasm_ERMito_MAXMANDERSCOEFFCOSTESM2',
       'Intensity_Cytoplasm_AGP_CMIX', 'Intensity_Cytoplasm_AGP_CMIZ',
       'Intensity_Cytoplasm_DNA_INTEGRATEDINTENSITY',
       'Intensity_Cytoplasm_Mito_CMIX', 'Intensity_Cytoplasm_Mito_CMIY',
       'Intensity_Cytoplasm_Mito_CMIZ',
       'Granularity_Cytoplasm_AGP_GRANULARITY1',
       'Granularity_Cytopla

### Organoid

In [9]:
# Read the organoid results table from the path registered in profile_dict,
# then report its (rows, columns) shape.
# Note: this rebinds `df`, replacing the single-cell table loaded earlier.
organoid_results_path = profile_dict["organoid_fs"]["input_profile_path"]
df = pd.read_parquet(organoid_results_path)
print(df.shape)

(18876, 14)


In [10]:
# Cut-offs for the organoid filter; the values are the same as those used
# in the single-cell section.

# Upper bound on the p-value
pvalue_threshold_max = 0.05

# Lower bound on the absolute value of the coefficient
coefficient_threshold_min = 1

# Lower bounds on R-squared and adjusted R-squared
rsquared_threshold_min = 0.4
rsquared_adj_threshold_min = 0

In [11]:
# Build one boolean mask per criterion, then keep the rows passing all of them.
passes_pvalue = df["pvalue"].lt(pvalue_threshold_max)
passes_rsquared = df["rsquared"].gt(rsquared_threshold_min)
passes_rsquared_adj = df["rsquared_adj"].gt(rsquared_adj_threshold_min)
passes_coefficient = df["coefficient"].abs().gt(coefficient_threshold_min)

keep_rows = (
    passes_pvalue & passes_rsquared & passes_rsquared_adj & passes_coefficient
)

# .copy() detaches the selection from `df`
df_filtered = df.loc[keep_rows].copy()
print(df_filtered.shape)
df_filtered.head()

(133, 14)


Unnamed: 0,patient,treatment,feature,rsquared,rsquared_adj,fvalue,pvalue,coefficient,intercept,Feature_type,Compartment,Channel,Measurement,Extra_info
89,NF0014,Mirdametinib,Intensity_Organoid_ER_STDINTENSITY,0.447425,0.4234,18.623301,0.000256,-1.745595,1.536424e-16,Intensity,Organoid,ER,STDINTENSITY,
90,NF0014,Mirdametinib,Intensity_Organoid_ER_STDINTENSITYEDGE,0.402125,0.37613,15.469554,0.000664,-1.665961,2.317867e-17,Intensity,Organoid,ER,STDINTENSITYEDGE,
143,NF0014,Fimepinostat,AreaSizeShape_Organoid_VOLUME,0.456506,0.427901,15.958957,0.000775,-2.108811,5.228555e-16,AreaSizeShape,Organoid,,VOLUME,
146,NF0014,Fimepinostat,AreaSizeShape_Organoid_EQUIVALENTDIAMETER,0.62819,0.608621,32.101402,1.8e-05,-3.092539,6.538228000000001e-17,AreaSizeShape,Organoid,,EQUIVALENTDIAMETER,
154,NF0014,Fimepinostat,Colocalization_Organoid_AGPER_MAXMANDERSCOEFFC...,0.403456,0.372059,12.850147,0.001976,-2.665425,4.232982e-14,Colocalization,Organoid,AGPER,MAXMANDERSCOEFFCOSTESM1,


In [12]:
# Distinct treatments that still have at least one row after filtering
df_filtered["treatment"].unique()

array(['Mirdametinib', 'Fimepinostat', 'Staurosporine', 'Binimetinib',
       'Copanlisib', 'Cabozantinib', 'Onalespib', 'Rapamycin', 'Digoxin',
       'Ketotifen', 'Trametinib', 'Linsitinib', 'Imatinib', 'Everolimus',
       'Selumetinib'], dtype=object)

In [13]:
# Distinct patients that still have at least one row after filtering
df_filtered["patient"].unique()

array(['NF0014', 'NF0016', 'NF0018', 'NF0021', 'NF0030', 'SARCO219'],
      dtype=object)

In [14]:
# Distinct feature names that passed the filter
df_filtered["feature"].unique()

array(['Intensity_Organoid_ER_STDINTENSITY',
       'Intensity_Organoid_ER_STDINTENSITYEDGE',
       'AreaSizeShape_Organoid_VOLUME',
       'AreaSizeShape_Organoid_EQUIVALENTDIAMETER',
       'Colocalization_Organoid_AGPER_MAXMANDERSCOEFFCOSTESM1',
       'Colocalization_Organoid_DNAAGP_MEANCORRELATIONCOEFF',
       'Colocalization_Organoid_DNAAGP_MINMANDERSCOEFFCOSTESM1',
       'Colocalization_Organoid_DNAER_MEDIANCORRELATIONCOEFF',
       'Colocalization_Organoid_DNAER_MEDIANMANDERSCOEFFCOSTESM1',
       'Intensity_Organoid_DNA_CMIX',
       'Intensity_Organoid_DNA_INTEGRATEDINTENSITYEDGE',
       'Intensity_Organoid_DNA_LOWERQUARTILEINTENSITY',
       'Intensity_Organoid_DNA_MEANINTENSITYEDGE',
       'Intensity_Organoid_ER_DIFFZ',
       'Intensity_Organoid_ER_INTEGRATEDINTENSITY',
       'Intensity_Organoid_ER_INTEGRATEDINTENSITYEDGE',
       'Intensity_Organoid_ER_LOWERQUARTILEINTENSITY',
       'Intensity_Organoid_ER_MEANINTENSITYEDGE',
       'Intensity_Organoid_Mito_CMZ',
  