This notebook performs profile feature selection.

In [1]:
import argparse
import pathlib

import pandas as pd
from pycytominer import feature_select

# Locate the repository root: the nearest directory, starting at the current
# working directory and walking upwards, that contains a ".git" entry.
cwd = pathlib.Path.cwd()

# ".git" is a directory in an ordinary clone but a plain file in a linked
# worktree or a submodule checkout, so test for existence rather than is_dir().
root_dir = next(
    (
        candidate
        for candidate in (cwd, *cwd.parents)
        if (candidate / ".git").exists()
    ),
    None,
)

# Check if a Git root directory was found
if root_dir is None:
    raise FileNotFoundError("No Git root directory found.")

# get_ipython is only defined inside an IPython/Jupyter session; a NameError
# here means the code is being executed as a plain Python script.
try:
    cfg = get_ipython().config
    in_notebook = True
except NameError:
    in_notebook = False

In [2]:
# Decide which patient to process.
if in_notebook:
    # Interactive notebook run: use a fixed example patient.
    patient = "SARCO361"
else:
    # Script run: the patient ID is a required command-line argument.
    argparser = argparse.ArgumentParser()
    argparser.add_argument(
        "--patient", type=str, required=True, help="Patient ID to process, e.g. 'P01'"
    )
    args = argparser.parse_args()
    patient = args.patient

In [3]:
# Per-patient directory that holds every stage of the image-based profiles.
profiles_dir = pathlib.Path(f"{root_dir}/data/{patient}/image_based_profiles")
normalized_dir = profiles_dir / "3.normalized_profiles"
feature_selected_dir = profiles_dir / "4.feature_selected_profiles"

# Inputs: strict resolution raises immediately if a normalized profile is missing.
sc_normalized_path = (normalized_dir / "sc_norm.parquet").resolve(strict=True)
organoid_normalized_path = (normalized_dir / "organoid_norm.parquet").resolve(
    strict=True
)

# Outputs: these files may not exist yet, so resolve without the strict check.
sc_fs_output_path = (feature_selected_dir / "sc_fs.parquet").resolve()
organoid_fs_output_path = (feature_selected_dir / "organoid_fs.parquet").resolve()

# Both outputs live in the same directory; make sure it exists before writing.
organoid_fs_output_path.parent.mkdir(parents=True, exist_ok=True)

In [4]:
# Load the normalized single-cell and organoid profiles from parquet.
sc_normalized, organoid_normalized = (
    pd.read_parquet(profile_path)
    for profile_path in (sc_normalized_path, organoid_normalized_path)
)

In [5]:
# Operations handed to pycytominer's feature_select; the same list is used for
# both the single-cell and the organoid profiles further down.
feature_select_ops = [
    "variance_threshold",
    "drop_na_columns",
    "correlation_threshold",
    # relies on the blocklist file that is written out before each call
    "blocklist",
]

### Feature select the single-cell profiles

In [6]:
# Preview the first rows of the normalized single-cell profiles.
sc_normalized.head()

Unnamed: 0,patient,object_id,unit,dose,treatment,Target,Class,Therapeutic Categories,image_set,Well,...,Area.Size.Shape_Cytoplasm_MIN.X,Area.Size.Shape_Cytoplasm_MAX.X,Area.Size.Shape_Cytoplasm_MIN.Y,Area.Size.Shape_Cytoplasm_MAX.Y,Area.Size.Shape_Cytoplasm_MIN.Z,Area.Size.Shape_Cytoplasm_MAX.Z,Area.Size.Shape_Cytoplasm_EXTENT,Area.Size.Shape_Cytoplasm_EULER.NUMBER,Area.Size.Shape_Cytoplasm_EQUIVALENT.DIAMETER,Area.Size.Shape_Cytoplasm_SURFACE.AREA
0,SARCO361,19,uM,1,Everolimus,mTOR inhibitor,Small Molecule,Kinase Inhibitor,C5-4,C5,...,0.399277,0.627346,0.285328,0.184579,-0.375,2.139832,-0.836526,0.12712,0.358714,1.609508
1,SARCO361,39,uM,1,Everolimus,mTOR inhibitor,Small Molecule,Kinase Inhibitor,C5-4,C5,...,-0.264814,-0.259309,0.50509,0.375293,2.25,8.295097,-1.300594,-4.544532,0.367507,4.539921
2,SARCO361,58,uM,1,Everolimus,mTOR inhibitor,Small Molecule,Kinase Inhibitor,C5-4,C5,...,0.925326,0.700591,0.846943,0.456104,5.75,2.564333,0.529536,0.230934,-0.775283,-0.768679
3,SARCO361,78,uM,1,Everolimus,mTOR inhibitor,Small Molecule,Kinase Inhibitor,C5-4,C5,...,-0.656553,-0.679506,0.198121,0.226601,1.375,5.32359,-0.478178,-1.430098,0.950878,2.45018
4,SARCO361,98,uM,1,Everolimus,mTOR inhibitor,Small Molecule,Kinase Inhibitor,C5-4,C5,...,0.738784,0.604216,0.655087,0.430244,4.875,3.625585,-0.710484,-1.533912,-0.398202,0.754258


In [7]:
# Collect every column whose name contains "Area" together with at least one
# of MAX, MIN, BBOX or CENTER; these are the features to blocklist.
sc_blocklist = [
    column
    for column in sc_normalized.columns
    if "Area" in column
    and any(token in column for token in ("MAX", "MIN", "BBOX", "CENTER"))
]

# Prepend the literal header "blocklist" so it becomes the first line of the file.
sc_blocklist = ["blocklist", *sc_blocklist]

# Write the blocklist to disk, one entry per line.
sc_blocklist_path = pathlib.Path(
    f"{root_dir}/4.processing_image_based_profiles/data/blocklist/sc_blocklist.txt"
).resolve()
sc_blocklist_path.parent.mkdir(parents=True, exist_ok=True)
with open(sc_blocklist_path, "w") as f:
    f.writelines(f"{item}\n" for item in sc_blocklist)

In [8]:
# Columns treated as metadata (not as features) for the single-cell profiles.
sc_metadata_columns = [
    "patient",
    "object_id",
    "unit",
    "dose",
    "treatment",
    "image_set",
    "Target",
    "Class",
    "Therapeutic Categories",
    "Well",
    "parent_organoid",
]

# Feature-only view of the table; errors="ignore" tolerates metadata columns
# that are not present in the frame.
sc_features_df = sc_normalized.drop(columns=sc_metadata_columns, errors="ignore")

# Every remaining column is a feature, kept in its original order.
sc_features_columns = sc_features_df.columns.tolist()

In [9]:
# Run feature selection on the feature-only single-cell table.
sc_fs_profiles = feature_select(
    sc_features_df,
    operation=feature_select_ops,
    features=sc_features_columns,
    blocklist_file=sc_blocklist_path,
)
original_data_shape = sc_normalized.shape

# The feature table was built with drop(..., errors="ignore"), so a listed
# metadata column may be absent. Re-attach only the ones that exist instead of
# raising a KeyError on the missing label.
sc_metadata_present = [
    col for col in sc_metadata_columns if col in sc_normalized.columns
]

# Indexes are reset on both sides so rows are aligned purely by position.
sc_fs_profiles = pd.concat(
    [
        sc_normalized[sc_metadata_present].reset_index(drop=True),
        sc_fs_profiles.reset_index(drop=True),
    ],
    axis=1,
)

# NOTE: both counts are total column counts, i.e. metadata columns are included.
print("The number features before feature selection:", original_data_shape[1])
print("The number features after feature selection:", sc_fs_profiles.shape[1])

# Persist the feature-selected profiles and preview the first rows.
sc_fs_profiles.to_parquet(sc_fs_output_path, index=False)
sc_fs_profiles.head()

The number features before feature selection: 1933
The number features after feature selection: 226


Unnamed: 0,patient,object_id,unit,dose,treatment,image_set,Target,Class,Therapeutic Categories,Well,...,Granularity_Cytoplasm_Mito_GRANULARITY.1,Texture_Cytoplasm_BF_Angular.Second.Moment_256.3,Texture_Cytoplasm_BF_Information.Measure.of.Correlation.1_256.3,Texture_Cytoplasm_BF_Sum.Variance_256.3,Texture_Cytoplasm_DNA_Contrast_256.3,Texture_Cytoplasm_ER_Contrast_256.3,Texture_Cytoplasm_Mito_Correlation_256.3,Texture_Cytoplasm_Mito_Variance_256.3,Area.Size.Shape_Cytoplasm_EXTENT,Area.Size.Shape_Cytoplasm_EULER.NUMBER
0,SARCO361,19,uM,1,Everolimus,C5-4,mTOR inhibitor,Small Molecule,Kinase Inhibitor,C5,...,0.506677,0.777007,-0.712073,-0.915372,-0.719147,-0.463082,0.34481,-0.91479,-0.836526,0.12712
1,SARCO361,39,uM,1,Everolimus,C5-4,mTOR inhibitor,Small Molecule,Kinase Inhibitor,C5,...,0.005694,0.757187,-0.436066,-0.608477,-0.596133,1.034804,0.086968,0.654268,-1.300594,-4.544532
2,SARCO361,58,uM,1,Everolimus,C5-4,mTOR inhibitor,Small Molecule,Kinase Inhibitor,C5,...,0.331001,1.170315,-1.135125,-0.903262,-1.096134,-0.902594,0.29377,-0.425628,0.529536,0.230934
3,SARCO361,78,uM,1,Everolimus,C5-4,mTOR inhibitor,Small Molecule,Kinase Inhibitor,C5,...,0.438188,0.471413,-1.288118,-0.705702,-0.295453,0.036339,0.796643,0.908627,-0.478178,-1.430098
4,SARCO361,98,uM,1,Everolimus,C5-4,mTOR inhibitor,Small Molecule,Kinase Inhibitor,C5,...,0.002442,1.051855,0.016516,-0.883929,-1.14358,-0.443873,-0.303698,-0.099879,-0.710484,-1.533912


### Feature select the organoid profiles

In [10]:
# Preview the first rows of the normalized organoid profiles.
organoid_normalized.head()

Unnamed: 0,patient,object_id,unit,dose,treatment,Target,Class,Therapeutic Categories,image_set,Well,...,Area.Size.Shape_Organoid_MIN.X,Area.Size.Shape_Organoid_MAX.X,Area.Size.Shape_Organoid_MIN.Y,Area.Size.Shape_Organoid_MAX.Y,Area.Size.Shape_Organoid_MIN.Z,Area.Size.Shape_Organoid_MAX.Z,Area.Size.Shape_Organoid_EXTENT,Area.Size.Shape_Organoid_EULER.NUMBER,Area.Size.Shape_Organoid_EQUIVALENT.DIAMETER,Area.Size.Shape_Organoid_SURFACE.AREA
0,SARCO361,45,uM,1,Everolimus,mTOR inhibitor,Small Molecule,Kinase Inhibitor,C5-4,C5,...,-0.823303,-0.661559,-0.199632,0.854865,-0.933257,6.044266,-0.274677,0.356235,2.630958,5.281308
1,SARCO361,4,nM,10,Staurosporine,Apoptosis induction,Small Molecule,Experimental,C11-2,C11,...,-1.697752,-0.692855,0.042959,0.987888,-0.933257,-2.470265,2.066893,0.356235,1.062641,-0.624914
2,SARCO361,27,uM,1,Everolimus,mTOR inhibitor,Small Molecule,Kinase Inhibitor,C5-6,C5,...,-0.003168,-1.282259,0.240064,-1.594314,0.933257,1.629324,2.453575,-0.926212,0.93008,-0.401658
3,SARCO361,5,uM,1,Onalespib,HSP90 inhibitor,Small Molecule,Investigational,C3-3,C3,...,-1.431616,-2.033359,0.224902,0.987888,-0.933257,-2.470265,-0.451312,-0.071247,-0.809746,-0.410041
4,SARCO361,28,uM,1,Onalespib,HSP90 inhibitor,Small Molecule,Investigational,C3-4,C3,...,-1.697752,1.132735,-2.024115,0.987888,-0.933257,0.683265,-0.671532,5.058541,1.741469,4.203173


In [11]:
# Collect every column whose name contains "Area" together with at least one
# of MAX, MIN, BBOX or CENTER; these are the features to blocklist.
organoid_blocklist = [
    x
    for x in organoid_normalized.columns
    if "Area" in x and ("MAX" in x or "MIN" in x or "BBOX" in x or "CENTER" in x)
]

# Prepend the literal header "blocklist" so it becomes the first line of the file.
organoid_blocklist = ["blocklist"] + organoid_blocklist

# Anchor the path on the repository root (as the single-cell blocklist does)
# rather than on the current working directory, so the file is written to the
# same location regardless of where the notebook or script is launched from.
organoid_blocklist_path = pathlib.Path(
    f"{root_dir}/4.processing_image_based_profiles/data/blocklist/organoid_blocklist.txt"
).resolve()
organoid_blocklist_path.parent.mkdir(parents=True, exist_ok=True)

# Write the blocklist to disk, one entry per line.
with open(organoid_blocklist_path, "w") as f:
    for item in organoid_blocklist:
        f.write(f"{item}\n")

In [12]:
# Columns treated as metadata (not as features) for the organoid profiles.
organoid_metadata_columns = [
    "patient",
    "object_id",
    "unit",
    "dose",
    "treatment",
    "image_set",
    "Target",
    "Class",
    "Therapeutic Categories",
    "Well",
    "single_cell_count",
]

# Feature-only view of the table; errors="ignore" tolerates metadata columns
# that are not present in the frame.
organoid_features_df = organoid_normalized.drop(
    columns=organoid_metadata_columns, errors="ignore"
)

# Every remaining column is a feature, kept in its original order.
organoid_features_columns = organoid_features_df.columns.tolist()

In [13]:
# Run feature selection on the feature-only organoid table.
organoid_fs_profiles = feature_select(
    organoid_features_df,
    operation=feature_select_ops,
    features=organoid_features_columns,
    blocklist_file=organoid_blocklist_path,
)
original_data_shape = organoid_normalized.shape

# The feature table was built with drop(..., errors="ignore"), so a listed
# metadata column may be absent. Re-attach only the ones that exist instead of
# raising a KeyError on the missing label.
organoid_metadata_present = [
    col for col in organoid_metadata_columns if col in organoid_normalized.columns
]

# Indexes are reset on both sides so rows are aligned purely by position.
organoid_fs_profiles = pd.concat(
    [
        organoid_normalized[organoid_metadata_present].reset_index(drop=True),
        organoid_fs_profiles.reset_index(drop=True),
    ],
    axis=1,
)

# NOTE: both counts are total column counts, i.e. metadata columns are included.
print("The number features before feature selection:", original_data_shape[1])
print("The number features after feature selection:", organoid_fs_profiles.shape[1])

# Persist the feature-selected profiles and preview the first rows.
organoid_fs_profiles.to_parquet(organoid_fs_output_path, index=False)
organoid_fs_profiles.head()

The number features before feature selection: 651
The number features after feature selection: 151


Unnamed: 0,patient,object_id,unit,dose,treatment,image_set,Target,Class,Therapeutic Categories,Well,...,Texture_Organoid_DNA_Sum.Variance_256.3,Texture_Organoid_ER_Contrast_256.3,Texture_Organoid_ER_Sum.Average_256.3,Texture_Organoid_ER_Variance_256.3,Texture_Organoid_Mito_Contrast_256.3,Texture_Organoid_Mito_Information.Measure.of.Correlation.2_256.3,Texture_Organoid_Mito_Sum.Average_256.3,Texture_Organoid_Mito_Variance_256.3,Area.Size.Shape_Organoid_EXTENT,Area.Size.Shape_Organoid_EULER.NUMBER
0,SARCO361,45,uM,1,Everolimus,C5-4,mTOR inhibitor,Small Molecule,Kinase Inhibitor,C5,...,1.545744,-0.675457,0.390916,-0.513617,6.944647,1.094299,4.284505,8.313099,-0.274677,0.356235
1,SARCO361,4,nM,10,Staurosporine,C11-2,Apoptosis induction,Small Molecule,Experimental,C11,...,0.701343,-0.817489,0.321596,-1.000235,3.471792,1.12111,3.747953,1.082282,2.066893,0.356235
2,SARCO361,27,uM,1,Everolimus,C5-6,mTOR inhibitor,Small Molecule,Kinase Inhibitor,C5,...,3.267712,-1.38443,-0.226361,-0.824801,-1.341468,1.439365,1.430394,2.258404,2.453575,-0.926212
3,SARCO361,5,uM,1,Onalespib,C3-3,HSP90 inhibitor,Small Molecule,Investigational,C3,...,-1.263533,-0.932483,-0.992392,-1.107921,12.843089,-1.438886,-0.32673,3.720659,-0.451312,-0.071247
4,SARCO361,28,uM,1,Onalespib,C3-4,HSP90 inhibitor,Small Molecule,Investigational,C3,...,-0.706097,-1.540967,-0.888877,-1.305535,9.024406,0.128921,3.23974,3.342695,-0.671532,5.058541
