This notebook performs profile feature selection.

In [1]:
import argparse
import pathlib

import pandas as pd
from pycytominer import feature_select

# Get the current working directory
cwd = pathlib.Path.cwd()

if (cwd / ".git").is_dir():
    root_dir = cwd

else:
    root_dir = None
    for parent in cwd.parents:
        if (parent / ".git").is_dir():
            root_dir = parent
            break

# Check if a Git root directory was found
if root_dir is None:
    raise FileNotFoundError("No Git root directory found.")
try:
    cfg = get_ipython().config
    in_notebook = True
except NameError:
    in_notebook = False

In [2]:
if not in_notebook:
    argparser = argparse.ArgumentParser()
    argparser.add_argument(
        "--patient",
        type=str,
        required=True,
        help="Patient ID to process, e.g. 'P01'",
    )
    args = argparser.parse_args()
    patient = args.patient

else:
    patient = "SARCO361"

In [3]:
# pathing
sc_normalized_path = pathlib.Path(
    f"{root_dir}/data/{patient}/image_based_profiles/3.normalized_profiles/sc_norm.parquet"
).resolve(strict=True)
organoid_normalized_path = pathlib.Path(
    f"{root_dir}/data/{patient}/image_based_profiles/3.normalized_profiles/organoid_norm.parquet"
).resolve(strict=True)


# output path
sc_fs_output_path = pathlib.Path(
    f"{root_dir}/data/{patient}/image_based_profiles/4.feature_selected_profiles/sc_fs.parquet"
).resolve()
organoid_fs_output_path = pathlib.Path(
    f"{root_dir}/data/{patient}/image_based_profiles/4.feature_selected_profiles/organoid_fs.parquet"
).resolve()

organoid_fs_output_path.parent.mkdir(parents=True, exist_ok=True)

In [4]:
# read in the data
sc_normalized = pd.read_parquet(sc_normalized_path)
organoid_normalized = pd.read_parquet(organoid_normalized_path)

In [5]:
feature_select_ops = [
    "variance_threshold",
    "drop_na_columns",
    "correlation_threshold",
    "blocklist",
]

### Feature select the single-cell profiles

In [6]:
sc_normalized.head()

Unnamed: 0,patient,object_id,unit,dose,treatment,Target,Class,Therapeutic Categories,image_set,Well,...,Area.Size.Shape_Cytoplasm_MIN.X,Area.Size.Shape_Cytoplasm_MAX.X,Area.Size.Shape_Cytoplasm_MIN.Y,Area.Size.Shape_Cytoplasm_MAX.Y,Area.Size.Shape_Cytoplasm_MIN.Z,Area.Size.Shape_Cytoplasm_MAX.Z,Area.Size.Shape_Cytoplasm_EXTENT,Area.Size.Shape_Cytoplasm_EULER.NUMBER,Area.Size.Shape_Cytoplasm_EQUIVALENT.DIAMETER,Area.Size.Shape_Cytoplasm_SURFACE.AREA
0,SARCO361,63,uM,1,Nilotinib,tyrosine kinase inhibitor,Small Molecule,Investigational,F6-4,F6,...,-1.563725,-1.596488,0.795866,0.709358,-0.390267,-0.776504,1.447654,0.029134,0.083745,-0.517441
1,SARCO361,127,uM,1,Nilotinib,tyrosine kinase inhibitor,Small Molecule,Investigational,F6-4,F6,...,0.263549,0.292244,-0.943569,-0.941791,-0.390267,-0.776504,1.797869,-0.810362,0.584273,-0.246756
2,SARCO361,191,uM,1,Nilotinib,tyrosine kinase inhibitor,Small Molecule,Investigational,F6-4,F6,...,-1.868271,-1.95752,1.121788,1.033182,-0.390267,-0.776504,0.303446,-0.250698,-0.393543,-0.46411
3,SARCO361,19,uM,1,Everolimus,mTOR inhibitor,Small Molecule,Kinase Inhibitor,C5-4,C5,...,0.544408,0.690048,0.243215,0.193374,-0.390267,1.737238,-1.005302,0.308966,0.21466,0.941926
4,SARCO361,39,uM,1,Everolimus,mTOR inhibitor,Small Molecule,Kinase Inhibitor,C5-4,C5,...,-0.057916,-0.078817,0.466401,0.403326,0.895252,7.344817,-1.477648,-3.888512,0.223567,3.051615


In [7]:
sc_blocklist = [
    x
    for x in sc_normalized.columns
    if "Area" in x and ("MAX" in x or "MIN" in x or "BBOX" in x or "CENTER" in x)
]
# write the blocklist to a file
# add "blocklist" the beginning of the list
sc_blocklist = ["blocklist"] + sc_blocklist
sc_blocklist_path = pathlib.Path(
    f"{root_dir}/4.processing_image_based_profiles/data/blocklist/sc_blocklist.txt"
).resolve()
sc_blocklist_path.parent.mkdir(parents=True, exist_ok=True)
with open(sc_blocklist_path, "w") as f:
    for item in sc_blocklist:
        f.write(f"{item}\n")

In [8]:
sc_metadata_columns = [
    "patient",
    "object_id",
    "unit",
    "dose",
    "treatment",
    "image_set",
    "Well",
    "parent_organoid",
]
sc_features_columns = [
    col for col in sc_normalized.columns if col not in sc_metadata_columns
]
sc_features_df = sc_normalized.drop(columns=sc_metadata_columns, errors="ignore")

In [9]:
# fs the data
sc_fs_profiles = feature_select(
    sc_features_df,
    operation=feature_select_ops,
    features=sc_features_columns,
    blocklist_file=sc_blocklist_path,
)
original_data_shape = sc_normalized.shape
sc_fs_profiles = pd.concat(
    [
        sc_normalized[sc_metadata_columns].reset_index(drop=True),
        sc_fs_profiles.reset_index(drop=True),
    ],
    axis=1,
)
print("The number features before feature selection:", original_data_shape[1])
print("The number features after feature selection:", sc_fs_profiles.shape[1])
sc_fs_profiles.to_parquet(sc_fs_output_path, index=False)
sc_fs_profiles.head()

The number features before feature selection: 1933
The number features after feature selection: 201


Unnamed: 0,patient,object_id,unit,dose,treatment,image_set,Well,parent_organoid,Colocalization_Nuclei_AGP.BF_MEDIAN.CORRELATION.COEFF,Colocalization_Nuclei_AGP.BF_MAX.OVERLAP.COEFF,...,Granularity_Cytoplasm_BF_GRANULARITY.2,Texture_Cytoplasm_AGP_Angular.Second.Moment_256.3,Texture_Cytoplasm_AGP_Sum.Variance_256.3,Texture_Cytoplasm_BF_Information.Measure.of.Correlation.1_256.3,Texture_Cytoplasm_BF_Sum.Variance_256.3,Texture_Cytoplasm_DNA_Contrast_256.3,Texture_Cytoplasm_DNA_Correlation_256.3,Texture_Cytoplasm_DNA_Sum.Variance_256.3,Area.Size.Shape_Cytoplasm_EXTENT,Area.Size.Shape_Cytoplasm_EULER.NUMBER
0,SARCO361,63,uM,1,Nilotinib,F6-4,F6,-1,1.128208,-0.181426,...,0.177022,-0.238446,-0.244723,-0.902567,0.672952,0.00228,0.034022,-0.210443,1.447654,0.029134
1,SARCO361,127,uM,1,Nilotinib,F6-4,F6,12,1.024314,-0.175396,...,0.181449,-0.815101,0.477238,-1.425577,1.158644,0.226669,1.210875,0.752661,1.797869,-0.810362
2,SARCO361,191,uM,1,Nilotinib,F6-4,F6,-1,-0.338469,-0.156752,...,0.166467,0.152044,-0.789297,1.194696,-0.133684,1.315882,-1.100122,-0.027451,0.303446,-0.250698
3,SARCO361,19,uM,1,Everolimus,C5-4,C5,45,1.394724,-0.183581,...,0.186102,0.778623,-0.414778,-0.551235,-0.906984,-0.89662,0.324211,-0.662453,-1.005302,0.308966
4,SARCO361,39,uM,1,Everolimus,C5-4,C5,45,-1.291259,-0.16832,...,0.192541,0.761778,-0.081625,-0.278053,-0.662951,-0.77001,0.343953,-0.571636,-1.477648,-3.888512


### Normalize the organoid profiles

In [10]:
organoid_normalized.head()

Unnamed: 0,patient,object_id,unit,dose,treatment,Target,Class,Therapeutic Categories,image_set,Well,...,Area.Size.Shape_Organoid_MIN.X,Area.Size.Shape_Organoid_MAX.X,Area.Size.Shape_Organoid_MIN.Y,Area.Size.Shape_Organoid_MAX.Y,Area.Size.Shape_Organoid_MIN.Z,Area.Size.Shape_Organoid_MAX.Z,Area.Size.Shape_Organoid_EXTENT,Area.Size.Shape_Organoid_EULER.NUMBER,Area.Size.Shape_Organoid_EQUIVALENT.DIAMETER,Area.Size.Shape_Organoid_SURFACE.AREA
0,SARCO361,3,uM,1,Selumetinib,MEK1/2 inhibitor,Small Molecule,Kinase Inhibitor,G10-7,G10,...,0.940022,-2.848617,1.933864,-1.101632,-0.500376,-2.090429,-0.848919,-0.187277,-2.115236,-1.168251
1,SARCO361,12,uM,1,Nilotinib,tyrosine kinase inhibitor,Small Molecule,Investigational,F6-4,F6,...,0.420956,0.918519,-0.178544,-0.569424,0.00538,-0.54925,-1.24354,3.975833,-2.047617,-1.145283
2,SARCO361,3,uM,10,Binimetinib,MEK1/2 inhibitor,Small Molecule,Kinase Inhibitor,C8-7,C8,...,1.146001,-1.669552,0.830897,-1.981549,-0.500376,-2.090429,-1.214684,2.7269,-2.066649,-1.155802
3,SARCO361,45,uM,1,Everolimus,mTOR inhibitor,Small Molecule,Kinase Inhibitor,C5-4,C5,...,-0.161277,-0.593228,0.131061,0.871084,-0.500376,4.973309,-0.180141,-0.365696,1.333158,2.178146
4,SARCO361,5,uM,10,Trametinib,MEK1/2 inhibitor,Small Molecule,Kinase Inhibitor,F10-3,F10,...,1.755697,0.125953,3.333536,0.991718,3.292798,0.092908,2.951274,-0.544115,-1.172637,-1.131081


In [11]:
organoid_blocklist = [
    x
    for x in organoid_normalized.columns
    if "Area" in x and ("MAX" in x or "MIN" in x or "BBOX" in x or "CENTER" in x)
]
# write the blocklist to a file
# add "blocklist" the beginning of the list
organoid_blocklist = ["blocklist"] + organoid_blocklist
organoid_blocklist_path = pathlib.Path(
    "../data/blocklist/organoid_blocklist.txt"
).resolve()
organoid_blocklist_path.parent.mkdir(parents=True, exist_ok=True)
with open(organoid_blocklist_path, "w") as f:
    for item in organoid_blocklist:
        f.write(f"{item}\n")

In [12]:
organoid_metadata_columns = [
    "patient",
    "object_id",
    "unit",
    "dose",
    "treatment",
    "image_set",
    "Well",
    "single_cell_count",
]
organoid_features_columns = [
    col for col in organoid_normalized.columns if col not in organoid_metadata_columns
]
organoid_features_df = organoid_normalized.drop(
    columns=organoid_metadata_columns, errors="ignore"
)

In [13]:
# normalize the data
organoid_fs_profiles = feature_select(
    organoid_features_df,
    operation=feature_select_ops,
    features=organoid_features_columns,
    blocklist_file=organoid_blocklist_path,
)
original_data_shape = organoid_normalized.shape
organoid_fs_profiles = pd.concat(
    [
        organoid_normalized[organoid_metadata_columns].reset_index(drop=True),
        organoid_fs_profiles.reset_index(drop=True),
    ],
    axis=1,
)

print("The number features before feature selection:", original_data_shape[1])
print("The number features after feature selection:", organoid_fs_profiles.shape[1])
organoid_fs_profiles.to_parquet(organoid_fs_output_path, index=False)
organoid_fs_profiles.head()

The number features before feature selection: 651
The number features after feature selection: 151


Unnamed: 0,patient,object_id,unit,dose,treatment,image_set,Well,single_cell_count,Colocalization_Organoid_AGP.BF_MAX.CORRELATION.COEFF,Colocalization_Organoid_AGP.BF_MIN.MANDERS.COEFF.M2,...,Texture_Organoid_DNA_Sum.Variance_256.3,Texture_Organoid_ER_Contrast_256.3,Texture_Organoid_ER_Sum.Average_256.3,Texture_Organoid_ER_Variance_256.3,Texture_Organoid_Mito_Contrast_256.3,Texture_Organoid_Mito_Sum.Average_256.3,Texture_Organoid_Mito_Variance_256.3,Area.Size.Shape_Organoid_EXTENT,Area.Size.Shape_Organoid_EULER.NUMBER,Area.Size.Shape_Organoid_SURFACE.AREA
0,SARCO361,3,uM,1,Selumetinib,G10-7,G10,,-0.558121,3.086785,...,-1.109763,-0.947271,-1.076803,-0.786205,-0.851938,-1.127175,-0.957166,-0.848919,-0.187277,-1.168251
1,SARCO361,12,uM,1,Nilotinib,F6-4,F6,1.0,-0.76801,-0.571096,...,-1.107423,-0.912819,-1.075108,-0.780288,-0.833558,-1.12575,-0.952059,-1.24354,3.975833,-1.145283
2,SARCO361,3,uM,10,Binimetinib,C8-7,C8,,0.144085,4.74967,...,-1.109387,-0.942641,-1.076499,-0.785398,-0.849212,-1.126907,-0.956399,-1.214684,2.7269,-1.155802
3,SARCO361,45,uM,1,Everolimus,C5-4,C5,12.0,-0.924303,-0.370312,...,0.651113,-0.215303,0.0427,-0.209479,0.447558,0.588238,0.648497,-0.180141,-0.365696,2.178146
4,SARCO361,5,uM,10,Trametinib,F10-3,F10,,1.227839,5.178467,...,-0.935827,-0.537656,-0.892315,-0.535576,-0.417425,-0.906514,-0.523087,2.951274,-0.544115,-1.131081
