This notebook performs profile feature selection.

In [1]:
import argparse
import os
import pathlib
import sys

import pandas as pd
from arg_parsing_utils import parse_args
from notebook_init_utils import bandicoot_check, init_notebook
from pycytominer import feature_select

root_dir, in_notebook = init_notebook()

profile_base_dir = bandicoot_check(
    pathlib.Path(os.path.expanduser("~/mnt/bandicoot/NF1_organoid_data")).resolve(),
    root_dir,
)

In [2]:
if not in_notebook:
    args = parse_args()
    patient = args["patient"]
    image_based_profiles_subparent_name = args["image_based_profiles_subparent_name"]

else:
    patient = "NF0014_T1"
    image_based_profiles_subparent_name = "image_based_profiles"

In [None]:
# pathing
sc_normalized_path = pathlib.Path(
    f"{profile_base_dir}/data/{patient}/{image_based_profiles_subparent_name}/4.normalized_profiles/sc_norm.parquet"
).resolve(strict=True)
organoid_normalized_path = pathlib.Path(
    f"{profile_base_dir}/data/{patient}/{image_based_profiles_subparent_name}/4.normalized_profiles/organoid_norm.parquet"
).resolve(strict=True)


# output path
sc_fs_output_path = pathlib.Path(
    f"{profile_base_dir}/data/{patient}/{image_based_profiles_subparent_name}/5.feature_selected_profiles/sc_fs.parquet"
).resolve()
organoid_fs_output_path = pathlib.Path(
    f"{profile_base_dir}/data/{patient}/{image_based_profiles_subparent_name}/5.feature_selected_profiles/organoid_fs.parquet"
).resolve()

organoid_fs_output_path.parent.mkdir(parents=True, exist_ok=True)

In [4]:
# read in the data
sc_normalized = pd.read_parquet(sc_normalized_path)
organoid_normalized = pd.read_parquet(organoid_normalized_path)

In [5]:
feature_select_ops = [
    "drop_na_columns",
    "blocklist",
    # "correlation_threshold", # comment out to remove correlation thresholding
    # "variance_threshold", # comment out to remove variance thresholding
]

In [6]:
na_cutoff = 0.05
corr_threshold = 0.95
freq_cut = 0.01
unique_cut = 0.01

### Feature select the single-cell profiles

In [7]:
sc_normalized.head()

Unnamed: 0,Metadata_patient_tumor,Metadata_object_id,Metadata_unit,Metadata_dose,Metadata_treatment,Metadata_image_set,Metadata_Well,Metadata_parent_organoid,Metadata_Target,Metadata_Class,...,Texture_Cytoplasm_Mito_Difference.Entropy_256.3,Texture_Cytoplasm_Mito_Difference.Variance_256.3,Texture_Cytoplasm_Mito_Entropy_256.3,Texture_Cytoplasm_Mito_Information.Measure.of.Correlation.1_256.3,Texture_Cytoplasm_Mito_Information.Measure.of.Correlation.2_256.3,Texture_Cytoplasm_Mito_Inverse.Difference.Moment_256.3,Texture_Cytoplasm_Mito_Sum.Average_256.3,Texture_Cytoplasm_Mito_Sum.Entropy_256.3,Texture_Cytoplasm_Mito_Sum.Variance_256.3,Texture_Cytoplasm_Mito_Variance_256.3
0,NF0014_T1,61,uM,1,Fimepinostat,E5-2,E5,-1,PI3K and HDAC inhibitor,Small Molecule,...,0.06443,0.03635,0.009193,0.541606,-0.013471,0.0273,-0.179517,-0.003906,-0.281164,-0.2534
1,NF0014_T1,114,uM,1,Fimepinostat,E5-2,E5,-1,PI3K and HDAC inhibitor,Small Molecule,...,1.387787,-0.987652,1.137201,0.83367,0.808809,-1.139383,0.929826,1.155166,0.478639,0.580791
2,NF0014_T1,184,uM,1,Fimepinostat,E5-2,E5,-1,PI3K and HDAC inhibitor,Small Molecule,...,1.635019,-1.390974,1.422826,0.874011,0.970971,-1.587359,0.460939,1.453986,-0.415846,-0.386394
3,NF0014_T1,219,uM,1,Fimepinostat,E5-2,E5,-1,PI3K and HDAC inhibitor,Small Molecule,...,-0.071752,0.14621,-0.093037,-0.152675,0.059486,0.159556,-0.587803,-0.106811,-0.80297,-0.831941
4,NF0014_T1,255,uM,1,Fimepinostat,E5-2,E5,-1,PI3K and HDAC inhibitor,Small Molecule,...,0.03729,0.053939,-0.022213,0.395098,-0.011712,0.04618,-0.026482,-0.031981,-0.016887,0.045138


In [8]:
sc_blocklist = [
    x
    for x in sc_normalized.columns
    if "Area" in x and ("MAX" in x or "MIN" in x or "BBOX" in x or "CENTER" in x)
]
sc_blocklist += [
    x
    for x in sc_normalized.columns
    if "Intensity" in x
    and (
        "MIN.X" in x
        or "MAX.X" in x
        or "MIN.Y" in x
        or "MAX.Y" in x
        or "MIN.Z" in x
        or "MAX.Z" in x
    )
]
# write the blocklist to a file
# add "blocklist" the beginning of the list
sc_blocklist = ["blocklist"] + sc_blocklist
sc_blocklist_path = pathlib.Path(
    f"{root_dir}/4.processing_image_based_profiles/data/blocklist/sc_blocklist.txt"
).resolve()
sc_blocklist_path.parent.mkdir(parents=True, exist_ok=True)
with open(sc_blocklist_path, "w") as f:
    for item in sc_blocklist:
        f.write(f"{item}\n")

In [9]:
sc_metadata_columns = [x for x in sc_normalized.columns if "Metadata" in x]

sc_metadata_columns += [
    "Area.Size.Shape_Cell_CENTER.X",
    "Area.Size.Shape_Cell_CENTER.Y",
    "Area.Size.Shape_Cell_CENTER.Z",
]
sc_features_columns = [
    col for col in sc_normalized.columns if col not in sc_metadata_columns
]
all_trt_df = sc_normalized.copy()

sc_normalized = sc_normalized.loc[
    sc_normalized["Metadata_treatment"].isin(["DMSO", "Staurosporine"])
]

sc_features_columns = [
    col for col in sc_normalized.columns if col not in sc_metadata_columns
]
sc_features_df = sc_normalized.drop(columns=sc_metadata_columns, errors="ignore")

In [10]:
# fs the data
sc_fs_profiles = feature_select(
    sc_features_df,
    operation=feature_select_ops,
    features=sc_features_columns,
    blocklist_file=sc_blocklist_path,
    na_cutoff=na_cutoff,
    corr_threshold=corr_threshold,
    freq_cut=freq_cut,
    unique_cut=unique_cut,
)
fs_profiles = all_trt_df[
    [col for col in all_trt_df.columns if col in sc_fs_profiles.columns]
]

original_data_shape = sc_normalized.shape
sc_fs_profiles = pd.concat(
    [
        all_trt_df[sc_metadata_columns].reset_index(drop=True),
        sc_fs_profiles.reset_index(drop=True),
    ],
    axis=1,
)
print("The number features before feature selection:", original_data_shape[1])
print("The number features after feature selection:", sc_fs_profiles.shape[1])
sc_fs_profiles.to_parquet(sc_fs_output_path, index=False)
sc_fs_profiles.head()

The number features before feature selection: 5980
The number features after feature selection: 5889


Unnamed: 0,Metadata_patient_tumor,Metadata_object_id,Metadata_unit,Metadata_dose,Metadata_treatment,Metadata_image_set,Metadata_Well,Metadata_parent_organoid,Metadata_Target,Metadata_Class,...,Texture_Cytoplasm_Mito_Difference.Entropy_256.3,Texture_Cytoplasm_Mito_Difference.Variance_256.3,Texture_Cytoplasm_Mito_Entropy_256.3,Texture_Cytoplasm_Mito_Information.Measure.of.Correlation.1_256.3,Texture_Cytoplasm_Mito_Information.Measure.of.Correlation.2_256.3,Texture_Cytoplasm_Mito_Inverse.Difference.Moment_256.3,Texture_Cytoplasm_Mito_Sum.Average_256.3,Texture_Cytoplasm_Mito_Sum.Entropy_256.3,Texture_Cytoplasm_Mito_Sum.Variance_256.3,Texture_Cytoplasm_Mito_Variance_256.3
0,NF0014_T1,61,uM,1,Fimepinostat,E5-2,E5,-1,PI3K and HDAC inhibitor,Small Molecule,...,-0.853288,0.666296,-0.744992,-1.727884,-0.795478,0.744107,-0.887714,-0.778888,-0.756731,-0.807663
1,NF0014_T1,114,uM,1,Fimepinostat,E5-2,E5,-1,PI3K and HDAC inhibitor,Small Molecule,...,0.234114,-0.146387,0.204914,-0.566253,0.5074,-0.180214,0.69977,0.221479,1.005875,1.00938
2,NF0014_T1,184,uM,1,Fimepinostat,E5-2,E5,-1,PI3K and HDAC inhibitor,Small Molecule,...,-0.292831,0.26572,-0.284591,-0.33757,-0.143982,0.291757,-0.415339,-0.290942,-0.489394,-0.508018
3,NF0014_T1,219,uM,1,Fimepinostat,E5-2,E5,-1,PI3K and HDAC inhibitor,Small Molecule,...,-0.292831,0.26572,-0.284591,-0.33757,-0.143982,0.291757,-0.415339,-0.290942,-0.489394,-0.508018
4,NF0014_T1,255,uM,1,Fimepinostat,E5-2,E5,-1,PI3K and HDAC inhibitor,Small Molecule,...,-0.341044,0.297917,-0.323744,-0.28141,-0.210875,0.324488,-0.010976,-0.334375,0.466926,0.503383


### Normalize the organoid profiles

In [11]:
organoid_normalized.head()

Unnamed: 0,Metadata_patient_tumor,Metadata_object_id,Metadata_unit,Metadata_dose,Metadata_treatment,Metadata_image_set,Metadata_Well,Metadata_single_cell_count,Metadata_Target,Metadata_Class,...,Texture_Organoid_Mito_Difference.Entropy_256.3,Texture_Organoid_Mito_Difference.Variance_256.3,Texture_Organoid_Mito_Entropy_256.3,Texture_Organoid_Mito_Information.Measure.of.Correlation.1_256.3,Texture_Organoid_Mito_Information.Measure.of.Correlation.2_256.3,Texture_Organoid_Mito_Inverse.Difference.Moment_256.3,Texture_Organoid_Mito_Sum.Average_256.3,Texture_Organoid_Mito_Sum.Entropy_256.3,Texture_Organoid_Mito_Sum.Variance_256.3,Texture_Organoid_Mito_Variance_256.3
0,NF0014_T1,2,uM,1,Fimepinostat,E5-2,E5,,PI3K and HDAC inhibitor,Small Molecule,...,-0.7997,0.727493,-0.707564,1.408761,-0.908726,0.723078,-0.593903,-0.73303,-0.528032,-0.561465
1,NF0014_T1,4,uM,1,Fimepinostat,E5-2,E5,,PI3K and HDAC inhibitor,Small Molecule,...,-0.790156,0.72277,-0.701173,-0.081791,-0.791176,0.718448,-0.589805,-0.725724,-0.523691,-0.555467
2,NF0014_T1,7,uM,1,Fimepinostat,E5-2,E5,,PI3K and HDAC inhibitor,Small Molecule,...,-0.799328,0.727338,-0.707337,1.40876,-0.908707,0.722927,-0.593751,-0.732765,-0.527863,-0.561134
3,NF0014_T1,19,uM,1,Fimepinostat,E5-2,E5,,PI3K and HDAC inhibitor,Small Molecule,...,-0.797577,0.726521,-0.706253,1.309233,-0.902352,0.722125,-0.59309,-0.731511,-0.527244,-0.559971
4,NF0014_T1,23,uM,1,Fimepinostat,E5-2,E5,,PI3K and HDAC inhibitor,Small Molecule,...,0.990027,-0.917486,0.84717,-1.158874,1.342547,-0.94117,0.691929,0.914296,0.408973,0.434582


In [12]:
organoid_blocklist = [
    x
    for x in organoid_normalized.columns
    if "Area" in x and ("MAX" in x or "MIN" in x or "BBOX" in x or "CENTER" in x)
]
organoid_blocklist += [
    x
    for x in organoid_normalized.columns
    if "Intensity" in x
    and (
        "MIN.X" in x
        or "MAX.X" in x
        or "MIN.Y" in x
        or "MAX.Y" in x
        or "MIN.Z" in x
        or "MAX.Z" in x
    )
]
# write the blocklist to a file
# add "blocklist" the beginning of the list
organoid_blocklist = ["blocklist"] + organoid_blocklist
organoid_blocklist_path = pathlib.Path(
    "../data/blocklist/organoid_blocklist.txt"
).resolve()
organoid_blocklist_path.parent.mkdir(parents=True, exist_ok=True)
with open(organoid_blocklist_path, "w") as f:
    for item in organoid_blocklist:
        f.write(f"{item}\n")

In [13]:
organoid_metadata_columns = [x for x in organoid_normalized.columns if "Metadata" in x]
organoid_metadata_columns += [
    "Area.Size.Shape_Organoid_CENTER.X",
    "Area.Size.Shape_Organoid_CENTER.Y",
    "Area.Size.Shape_Organoid_CENTER.Z",
]
organoid_features_columns = [
    col for col in organoid_normalized.columns if col not in organoid_metadata_columns
]
all_trt_df = organoid_normalized.copy()
organoid_normalized = organoid_normalized.loc[
    organoid_normalized["Metadata_treatment"].isin(["DMSO", "Staurosporine"])
]
organoid_features_columns = [
    col for col in organoid_normalized.columns if col not in organoid_metadata_columns
]
organoid_features_df = organoid_normalized.drop(
    columns=organoid_metadata_columns, errors="ignore"
)

In [14]:
# normalize the data
organoid_fs_profiles = feature_select(
    organoid_features_df,
    operation=feature_select_ops,
    features=organoid_features_columns,
    blocklist_file=organoid_blocklist_path,
    na_cutoff=na_cutoff,
    corr_threshold=corr_threshold,
    freq_cut=freq_cut,
    unique_cut=unique_cut,
)
# apply feature selection to all profiles
organoid_fs_profiles = all_trt_df[
    [col for col in all_trt_df.columns if col in organoid_fs_profiles.columns]
]
# concatenate the metadata and the feature selected profiles
original_data_shape = organoid_normalized.shape
organoid_fs_profiles = pd.concat(
    [
        all_trt_df[organoid_metadata_columns].reset_index(drop=True),
        organoid_fs_profiles.reset_index(drop=True),
    ],
    axis=1,
)

print("The number features before feature selection:", original_data_shape[1])
print("The number features after feature selection:", organoid_fs_profiles.shape[1])
organoid_fs_profiles.to_parquet(organoid_fs_output_path, index=False)
organoid_fs_profiles.head()

The number features before feature selection: 2000
The number features after feature selection: 1980


Unnamed: 0,Metadata_patient_tumor,Metadata_object_id,Metadata_unit,Metadata_dose,Metadata_treatment,Metadata_image_set,Metadata_Well,Metadata_single_cell_count,Metadata_Target,Metadata_Class,...,Texture_Organoid_Mito_Difference.Entropy_256.3,Texture_Organoid_Mito_Difference.Variance_256.3,Texture_Organoid_Mito_Entropy_256.3,Texture_Organoid_Mito_Information.Measure.of.Correlation.1_256.3,Texture_Organoid_Mito_Information.Measure.of.Correlation.2_256.3,Texture_Organoid_Mito_Inverse.Difference.Moment_256.3,Texture_Organoid_Mito_Sum.Average_256.3,Texture_Organoid_Mito_Sum.Entropy_256.3,Texture_Organoid_Mito_Sum.Variance_256.3,Texture_Organoid_Mito_Variance_256.3
0,NF0014_T1,2,uM,1,Fimepinostat,E5-2,E5,,PI3K and HDAC inhibitor,Small Molecule,...,-0.7997,0.727493,-0.707564,1.408761,-0.908726,0.723078,-0.593903,-0.73303,-0.528032,-0.561465
1,NF0014_T1,4,uM,1,Fimepinostat,E5-2,E5,,PI3K and HDAC inhibitor,Small Molecule,...,-0.790156,0.72277,-0.701173,-0.081791,-0.791176,0.718448,-0.589805,-0.725724,-0.523691,-0.555467
2,NF0014_T1,7,uM,1,Fimepinostat,E5-2,E5,,PI3K and HDAC inhibitor,Small Molecule,...,-0.799328,0.727338,-0.707337,1.40876,-0.908707,0.722927,-0.593751,-0.732765,-0.527863,-0.561134
3,NF0014_T1,19,uM,1,Fimepinostat,E5-2,E5,,PI3K and HDAC inhibitor,Small Molecule,...,-0.797577,0.726521,-0.706253,1.309233,-0.902352,0.722125,-0.59309,-0.731511,-0.527244,-0.559971
4,NF0014_T1,23,uM,1,Fimepinostat,E5-2,E5,,PI3K and HDAC inhibitor,Small Molecule,...,0.990027,-0.917486,0.84717,-1.158874,1.342547,-0.94117,0.691929,0.914296,0.408973,0.434582
