This notebook performs profile feature selection.

In [1]:
import argparse
import pathlib

import pandas as pd
from pycytominer import feature_select

# Find the repository root: the closest directory, starting from the current
# working directory and moving upward, that contains a ".git" directory.
cwd = pathlib.Path.cwd()

root_dir = next(
    (candidate for candidate in (cwd, *cwd.parents) if (candidate / ".git").is_dir()),
    None,
)

# Stop immediately when no enclosing Git checkout exists
if root_dir is None:
    raise FileNotFoundError("No Git root directory found.")
# get_ipython is only defined when running under IPython/Jupyter; a NameError
# therefore means this file is being executed as a plain Python script.
try:
    cfg = get_ipython().config
except NameError:
    in_notebook = False
else:
    in_notebook = True

In [2]:
# Decide which patient to process: a fixed example patient when running
# interactively, otherwise the required --patient command-line argument.
if in_notebook:
    patient = "SARCO361"

else:
    argparser = argparse.ArgumentParser()
    argparser.add_argument(
        "--patient",
        required=True,
        type=str,
        help="Patient ID to process, e.g. 'P01'",
    )
    args = argparser.parse_args()
    patient = args.patient

In [3]:
# Normalized input profiles. resolve(strict=True) raises if a file is missing,
# so the notebook fails here rather than later at read time.
sc_normalized_path = pathlib.Path(
    f"{root_dir}/data/{patient}/image_based_profiles/3.normalized_profiles/sc_norm.parquet"
)
sc_normalized_path = sc_normalized_path.resolve(strict=True)

organoid_normalized_path = pathlib.Path(
    f"{root_dir}/data/{patient}/image_based_profiles/3.normalized_profiles/organoid_norm.parquet"
)
organoid_normalized_path = organoid_normalized_path.resolve(strict=True)


# Feature-selected outputs; these files need not exist yet, so no strict check
sc_fs_output_path = pathlib.Path(
    f"{root_dir}/data/{patient}/image_based_profiles/4.feature_selected_profiles/sc_fs.parquet"
)
sc_fs_output_path = sc_fs_output_path.resolve()

organoid_fs_output_path = pathlib.Path(
    f"{root_dir}/data/{patient}/image_based_profiles/4.feature_selected_profiles/organoid_fs.parquet"
)
organoid_fs_output_path = organoid_fs_output_path.resolve()

# Both outputs share one directory, so creating it once covers both files
organoid_fs_output_path.parent.mkdir(parents=True, exist_ok=True)

In [4]:
# Load the normalized single-cell and organoid profiles into DataFrames
sc_normalized, organoid_normalized = (
    pd.read_parquet(profile_path)
    for profile_path in (sc_normalized_path, organoid_normalized_path)
)

In [5]:
# Feature-selection operations handed to pycytominer's feature_select through
# its `operation` argument, for both the single-cell and organoid profiles.
feature_select_ops = [
    "variance_threshold",
    "drop_na_columns",
    "correlation_threshold",
    "blocklist",
]

In [6]:
# Thresholds forwarded unchanged to feature_select (as the keyword arguments
# of the same names) for both the single-cell and the organoid profiles.
# NOTE(review): the per-value meanings below are taken from the pycytominer
# documentation, not from this notebook — confirm against the installed version.
na_cutoff = 0.05  # presumably the max tolerated fraction of NA values per feature
corr_threshold = 0.95  # presumably the correlation above which a feature is redundant
freq_cut = 0.01  # presumably the frequency-ratio cutoff of "variance_threshold"
unique_cut = 0.01  # presumably the unique-value-ratio cutoff of "variance_threshold"

### Feature select the single-cell profiles

In [7]:
# Preview the first rows of the normalized single-cell profiles
sc_normalized.head()

Unnamed: 0,patient,object_id,unit,dose,treatment,Target,Class,Therapeutic Categories,image_set,Well,...,Area.Size.Shape_Cytoplasm_MIN.X,Area.Size.Shape_Cytoplasm_MAX.X,Area.Size.Shape_Cytoplasm_MIN.Y,Area.Size.Shape_Cytoplasm_MAX.Y,Area.Size.Shape_Cytoplasm_MIN.Z,Area.Size.Shape_Cytoplasm_MAX.Z,Area.Size.Shape_Cytoplasm_EXTENT,Area.Size.Shape_Cytoplasm_EULER.NUMBER,Area.Size.Shape_Cytoplasm_EQUIVALENT.DIAMETER,Area.Size.Shape_Cytoplasm_SURFACE.AREA
0,SARCO361,63,uM,1,Nilotinib,tyrosine kinase inhibitor,Small Molecule,Investigational,F6-4,F6,...,-1.563725,-1.596488,0.795866,0.709358,-0.390267,-0.776504,1.447654,0.029134,0.083745,-0.517441
1,SARCO361,127,uM,1,Nilotinib,tyrosine kinase inhibitor,Small Molecule,Investigational,F6-4,F6,...,0.263549,0.292244,-0.943569,-0.941791,-0.390267,-0.776504,1.797869,-0.810362,0.584273,-0.246756
2,SARCO361,191,uM,1,Nilotinib,tyrosine kinase inhibitor,Small Molecule,Investigational,F6-4,F6,...,-1.868271,-1.95752,1.121788,1.033182,-0.390267,-0.776504,0.303446,-0.250698,-0.393543,-0.46411
3,SARCO361,19,uM,1,Everolimus,mTOR inhibitor,Small Molecule,Kinase Inhibitor,C5-4,C5,...,0.544408,0.690048,0.243215,0.193374,-0.390267,1.737238,-1.005302,0.308966,0.21466,0.941926
4,SARCO361,39,uM,1,Everolimus,mTOR inhibitor,Small Molecule,Kinase Inhibitor,C5-4,C5,...,-0.057916,-0.078817,0.466401,0.403326,0.895252,7.344817,-1.477648,-3.888512,0.223567,3.051615


In [8]:
# Substrings that mark a column for blocklisting, grouped by the measurement
# family ("Area" or "Intensity") that the column name must also contain.
sc_area_tokens = ("MAX", "MIN", "BBOX", "CENTER")
sc_intensity_tokens = ("MIN.X", "MAX.X", "MIN.Y", "MAX.Y", "MIN.Z", "MAX.Z")

sc_area_hits = [
    col
    for col in sc_normalized.columns
    if "Area" in col and any(token in col for token in sc_area_tokens)
]
sc_intensity_hits = [
    col
    for col in sc_normalized.columns
    if "Intensity" in col and any(token in col for token in sc_intensity_tokens)
]

# "blocklist" goes first so that it becomes the first line of the written file
# (NOTE(review): presumably the header pycytominer expects — confirm).
sc_blocklist = ["blocklist", *sc_area_hits, *sc_intensity_hits]

# Write the blocklist to disk, one entry per line
sc_blocklist_path = pathlib.Path(
    f"{root_dir}/4.processing_image_based_profiles/data/blocklist/sc_blocklist.txt"
).resolve()
sc_blocklist_path.parent.mkdir(parents=True, exist_ok=True)
with open(sc_blocklist_path, "w") as f:
    f.writelines(f"{entry}\n" for entry in sc_blocklist)

In [9]:
# Columns treated as metadata for the single-cell profiles; they are excluded
# from the feature list and from the frame given to feature selection.
sc_metadata_columns = [
    "patient",
    "object_id",
    "unit",
    "dose",
    "treatment",
    "image_set",
    "Target",
    "Class",
    "Therapeutic Categories",
    "Well",
    "parent_organoid",
]

# Every column that is not listed as metadata counts as a feature; the
# original column order is kept.
sc_is_feature = ~sc_normalized.columns.isin(sc_metadata_columns)
sc_features_columns = sc_normalized.columns[sc_is_feature].tolist()

# errors="ignore": metadata columns absent from the frame are skipped silently
sc_features_df = sc_normalized.drop(columns=sc_metadata_columns, errors="ignore")

In [10]:
# Run pycytominer feature selection on the single-cell feature columns
sc_fs_profiles = feature_select(
    sc_features_df,
    operation=feature_select_ops,
    features=sc_features_columns,
    blocklist_file=sc_blocklist_path,
    na_cutoff=na_cutoff,
    corr_threshold=corr_threshold,
    freq_cut=freq_cut,
    unique_cut=unique_cut,
)
original_data_shape = sc_normalized.shape

# Re-attach only the metadata columns that actually exist in the input. The
# feature frame was built with drop(..., errors="ignore"), which tolerates a
# missing metadata column; indexing with the full list here would instead
# raise a KeyError for that same column.
sc_metadata_present = [
    col for col in sc_metadata_columns if col in sc_normalized.columns
]
# Indexes are reset on both sides so the rows are aligned by position
sc_fs_profiles = pd.concat(
    [
        sc_normalized[sc_metadata_present].reset_index(drop=True),
        sc_fs_profiles.reset_index(drop=True),
    ],
    axis=1,
)
# NOTE: both printed counts are total column counts, metadata columns included
print("The number features before feature selection:", original_data_shape[1])
print("The number features after feature selection:", sc_fs_profiles.shape[1])
sc_fs_profiles.to_parquet(sc_fs_output_path, index=False)
sc_fs_profiles.head()

The number features before feature selection: 1366
The number features after feature selection: 247


Unnamed: 0,patient,object_id,unit,dose,treatment,image_set,Target,Class,Therapeutic Categories,Well,...,Texture_Cytoplasm_ER_Contrast_256.3,Texture_Cytoplasm_ER_Sum.Average_256.3,Texture_Cytoplasm_Mito_Angular.Second.Moment_256.3,Texture_Cytoplasm_Mito_Correlation_256.3,Texture_Cytoplasm_Mito_Entropy_256.3,Texture_Cytoplasm_Mito_Sum.Average_256.3,Area.Size.Shape_Cytoplasm_VOLUME,Area.Size.Shape_Cytoplasm_EXTENT,Area.Size.Shape_Cytoplasm_EULER.NUMBER,Area.Size.Shape_Cytoplasm_SURFACE.AREA
0,SARCO361,63,uM,1,Nilotinib,F6-4,tyrosine kinase inhibitor,Small Molecule,Investigational,F6,...,-0.480502,-0.185199,-0.23843,1.106339,0.462662,0.027847,-0.172863,1.447654,0.029134,-0.517441
1,SARCO361,127,uM,1,Nilotinib,F6-4,tyrosine kinase inhibitor,Small Molecule,Investigational,F6,...,0.796857,1.439825,-0.81506,1.509295,0.932089,-0.245931,0.323172,1.797869,-0.810362,-0.246756
2,SARCO361,191,uM,1,Nilotinib,F6-4,tyrosine kinase inhibitor,Small Molecule,Investigational,F6,...,-0.933175,-0.799619,0.152032,-0.950358,-0.172071,0.150518,-0.528759,0.303446,-0.250698,-0.46411
3,SARCO361,19,uM,1,Everolimus,C5-4,mTOR inhibitor,Small Molecule,Kinase Inhibitor,C5,...,-0.521002,-0.654759,0.778598,0.209751,-0.780719,-1.056773,-0.05598,-1.005302,0.308966,0.941926
4,SARCO361,39,uM,1,Everolimus,C5-4,mTOR inhibitor,Small Molecule,Kinase Inhibitor,C5,...,0.880093,-0.227139,0.761749,-0.051743,-0.798949,-0.329617,-0.047708,-1.477648,-3.888512,3.051615


### Feature select the organoid profiles

In [11]:
# Preview the first rows of the normalized organoid profiles
organoid_normalized.head()

Unnamed: 0,patient,object_id,unit,dose,treatment,Target,Class,Therapeutic Categories,image_set,Well,...,Texture_Organoid_Mito_Difference.Entropy_256.3,Texture_Organoid_Mito_Difference.Variance_256.3,Texture_Organoid_Mito_Entropy_256.3,Texture_Organoid_Mito_Information.Measure.of.Correlation.1_256.3,Texture_Organoid_Mito_Information.Measure.of.Correlation.2_256.3,Texture_Organoid_Mito_Inverse.Difference.Moment_256.3,Texture_Organoid_Mito_Sum.Average_256.3,Texture_Organoid_Mito_Sum.Entropy_256.3,Texture_Organoid_Mito_Sum.Variance_256.3,Texture_Organoid_Mito_Variance_256.3
0,SARCO361,3,uM,1,Selumetinib,MEK1/2 inhibitor,Small Molecule,Kinase Inhibitor,G10-7,G10,...,-1.342294,1.199485,-1.189122,2.465973,-1.960858,1.16403,-1.127175,-1.245957,-0.951031,-0.957166
1,SARCO361,12,uM,1,Nilotinib,tyrosine kinase inhibitor,Small Molecule,Investigational,F6-4,F6,...,-1.341064,1.198943,-1.188376,2.500432,-1.955204,1.163486,-1.12575,-1.245059,-0.948053,-0.952059
2,SARCO361,3,uM,10,Binimetinib,MEK1/2 inhibitor,Small Molecule,Kinase Inhibitor,C8-7,C8,...,-1.341968,1.199362,-1.188926,2.445245,-1.957968,1.163907,-1.126907,-1.245719,-0.950578,-0.956399
3,SARCO361,45,uM,1,Everolimus,mTOR inhibitor,Small Molecule,Kinase Inhibitor,C5-4,C5,...,0.239618,-0.117426,0.122834,-0.364435,0.503755,-0.155096,0.588238,0.196549,0.66399,0.648497
4,SARCO361,5,uM,10,Trametinib,MEK1/2 inhibitor,Small Molecule,Kinase Inhibitor,F10-3,F10,...,-1.204943,1.112503,-1.08809,-1.053402,-1.077854,1.076885,-0.906514,-1.129538,-0.527033,-0.523087


In [12]:
# Collect the organoid columns to exclude from feature selection.
# First: "Area" columns whose name also contains MAX, MIN, BBOX or CENTER.
organoid_blocklist = [
    x
    for x in organoid_normalized.columns
    if "Area" in x and ("MAX" in x or "MIN" in x or "BBOX" in x or "CENTER" in x)
]
# Second: "Intensity" columns whose name contains a MIN/MAX X, Y or Z token.
organoid_blocklist += [
    x
    for x in organoid_normalized.columns
    if "Intensity" in x
    and (
        "MIN.X" in x
        or "MAX.X" in x
        or "MIN.Y" in x
        or "MAX.Y" in x
        or "MIN.Z" in x
        or "MAX.Z" in x
    )
]
# Write the blocklist to a file.
# Add "blocklist" to the beginning of the list so it is the file's first line.
organoid_blocklist = ["blocklist"] + organoid_blocklist
# Anchor the path at the repository root (as the single-cell blocklist does)
# rather than at the current working directory, so the file lands in the same
# place no matter where the notebook or script is launched from.
organoid_blocklist_path = pathlib.Path(
    f"{root_dir}/4.processing_image_based_profiles/data/blocklist/organoid_blocklist.txt"
).resolve()
organoid_blocklist_path.parent.mkdir(parents=True, exist_ok=True)
with open(organoid_blocklist_path, "w") as f:
    for item in organoid_blocklist:
        f.write(f"{item}\n")

In [13]:
# Columns treated as metadata for the organoid profiles; they are excluded
# from the feature list and from the frame given to feature selection.
organoid_metadata_columns = [
    "patient",
    "object_id",
    "unit",
    "dose",
    "treatment",
    "image_set",
    "Target",
    "Class",
    "Therapeutic Categories",
    "Well",
    "single_cell_count",
]

# Every column that is not listed as metadata counts as a feature; the
# original column order is kept.
organoid_is_feature = ~organoid_normalized.columns.isin(organoid_metadata_columns)
organoid_features_columns = organoid_normalized.columns[organoid_is_feature].tolist()

# errors="ignore": metadata columns absent from the frame are skipped silently
organoid_features_df = organoid_normalized.drop(
    columns=organoid_metadata_columns, errors="ignore"
)

In [14]:
# Run pycytominer feature selection on the organoid feature columns
organoid_fs_profiles = feature_select(
    organoid_features_df,
    operation=feature_select_ops,
    features=organoid_features_columns,
    blocklist_file=organoid_blocklist_path,
    na_cutoff=na_cutoff,
    corr_threshold=corr_threshold,
    freq_cut=freq_cut,
    unique_cut=unique_cut,
)
original_data_shape = organoid_normalized.shape

# Re-attach only the metadata columns that actually exist in the input. The
# feature frame was built with drop(..., errors="ignore"), which tolerates a
# missing metadata column; indexing with the full list here would instead
# raise a KeyError for that same column.
organoid_metadata_present = [
    col for col in organoid_metadata_columns if col in organoid_normalized.columns
]
# Indexes are reset on both sides so the rows are aligned by position
organoid_fs_profiles = pd.concat(
    [
        organoid_normalized[organoid_metadata_present].reset_index(drop=True),
        organoid_fs_profiles.reset_index(drop=True),
    ],
    axis=1,
)

# NOTE: both printed counts are total column counts, metadata columns included
print("The number features before feature selection:", original_data_shape[1])
print("The number features after feature selection:", organoid_fs_profiles.shape[1])
organoid_fs_profiles.to_parquet(organoid_fs_output_path, index=False)
organoid_fs_profiles.head()

The number features before feature selection: 462
The number features after feature selection: 138


Unnamed: 0,patient,object_id,unit,dose,treatment,image_set,Target,Class,Therapeutic Categories,Well,...,Texture_Organoid_DNA_Sum.Variance_256.3,Texture_Organoid_ER_Contrast_256.3,Texture_Organoid_ER_Information.Measure.of.Correlation.2_256.3,Texture_Organoid_ER_Sum.Average_256.3,Texture_Organoid_ER_Variance_256.3,Texture_Organoid_Mito_Contrast_256.3,Texture_Organoid_Mito_Correlation_256.3,Texture_Organoid_Mito_Entropy_256.3,Texture_Organoid_Mito_Sum.Average_256.3,Texture_Organoid_Mito_Variance_256.3
0,SARCO361,3,uM,1,Selumetinib,G10-7,MEK1/2 inhibitor,Small Molecule,Kinase Inhibitor,G10,...,-1.109763,-0.947271,-1.955246,-1.076803,-0.786205,-0.851938,-2.453619,-1.189122,-1.127175,-0.957166
1,SARCO361,12,uM,1,Nilotinib,F6-4,tyrosine kinase inhibitor,Small Molecule,Investigational,F6,...,-1.107423,-0.912819,-1.949752,-1.075108,-0.780288,-0.833558,-2.431424,-1.188376,-1.12575,-0.952059
2,SARCO361,3,uM,10,Binimetinib,C8-7,MEK1/2 inhibitor,Small Molecule,Kinase Inhibitor,C8,...,-1.109387,-0.942641,-1.952621,-1.076499,-0.785398,-0.849212,-2.415649,-1.188926,-1.126907,-0.956399
3,SARCO361,45,uM,1,Everolimus,C5-4,mTOR inhibitor,Small Molecule,Kinase Inhibitor,C5,...,0.651113,-0.215303,0.487508,0.0427,-0.209479,0.447558,0.540493,0.122834,0.588238,0.648497
4,SARCO361,5,uM,10,Trametinib,F10-3,MEK1/2 inhibitor,Small Molecule,Kinase Inhibitor,F10,...,-0.935827,-0.537656,-1.079954,-0.892315,-0.535576,-0.417425,0.300017,-1.08809,-0.906514,-0.523087
