This notebook performs profile feature selection.

In [1]:
import argparse
import pathlib
import warnings

import pandas as pd
from pycytominer import feature_select

# Detect the execution context: `get_ipython` only exists when running under
# IPython/Jupyter, so a NameError means this file is being run as a script.
try:
    cfg = get_ipython().config
except NameError:
    in_notebook = False
else:
    in_notebook = True

In [2]:
# Choose the patient to process.
# - As a script: the patient ID is a required command-line argument.
# - In a notebook: fall back to a fixed example patient.
if not in_notebook:
    argparser = argparse.ArgumentParser(
        description="Feature select the normalized profiles of one patient."
    )
    argparser.add_argument(
        "--patient",
        type=str,
        required=True,
        # the example matches the ID format used for the data directories
        help="Patient ID to process, e.g. 'NF0014'",
    )
    args = argparser.parse_args()
    patient = args.patient

else:
    patient = "NF0014"

In [3]:
# Directory holding every image-based profile file of the selected patient.
profiles_dir = pathlib.Path(f"../../data/{patient}/image_based_profiles")

# input paths: strict resolution fails right here if a file is missing
sc_normalized_path = (profiles_dir / "2.sc_normalized_profiles.parquet").resolve(
    strict=True
)
organoid_normalized_path = (
    profiles_dir / "2.organoid_normalized_profiles.parquet"
).resolve(strict=True)

# output paths: resolved non-strictly because the files may not exist yet
sc_fs_output_path = (profiles_dir / "3.sc_fs_profiles.parquet").resolve()
organoid_fs_output_path = (profiles_dir / "3.organoid_fs_profiles.parquet").resolve()

In [4]:
# load the normalized single-cell and organoid profiles (in that order)
sc_normalized, organoid_normalized = (
    pd.read_parquet(profile_path)
    for profile_path in (sc_normalized_path, organoid_normalized_path)
)

In [5]:
# Feature-selection operations passed as `operation=` to pycytominer's
# `feature_select`; the same list is used for both the single-cell and the
# organoid profiles.
feature_select_ops = [
    "variance_threshold",
    "drop_na_columns",
    "correlation_threshold",
    # uses the blocklist file written further down for each profile type
    "blocklist",
]

### Feature select the single-cell profiles

In [6]:
# preview the first rows of the normalized single-cell profiles
sc_normalized.head()

Unnamed: 0,patient,object_id,unit,dose,treatment,image_set,Well,parent_organoid,Area.Size.Shape_Nuclei_VOLUME,Area.Size.Shape_Nuclei_CENTER.X,...,Texture_Cytoplasm_Mito_Difference.Entropy_256.1,Texture_Cytoplasm_Mito_Difference.Variance_256.1,Texture_Cytoplasm_Mito_Entropy_256.1,Texture_Cytoplasm_Mito_Information.Measure.of.Correlation.1_256.1,Texture_Cytoplasm_Mito_Information.Measure.of.Correlation.2_256.1,Texture_Cytoplasm_Mito_Inverse.Difference.Moment_256.1,Texture_Cytoplasm_Mito_Sum.Average_256.1,Texture_Cytoplasm_Mito_Sum.Entropy_256.1,Texture_Cytoplasm_Mito_Sum.Variance_256.1,Texture_Cytoplasm_Mito_Variance_256.1
0,NF0014,15,%,1,DMSO,C4-2,C4,32,-0.097616,-1.074317,...,0.04961,0.011873,0.026107,-0.144187,0.17534,-0.039107,0.110616,0.052515,0.028664,0.0371
1,NF0014,26,%,1,DMSO,C4-2,C4,32,1.180799,0.092335,...,0.434451,-0.436822,0.513649,-0.123169,0.59728,-0.503675,0.146859,0.466631,-0.192587,-0.195145
2,NF0014,37,%,1,DMSO,C4-2,C4,32,-0.440505,-0.930837,...,-0.692588,0.701364,-0.648313,-0.31915,-0.51301,0.832248,-1.207311,-0.664273,-1.168725,-1.184747
3,NF0014,43,%,1,DMSO,C4-2,C4,32,0.609896,-0.012714,...,0.055561,-0.05009,0.03203,0.132546,0.137178,-0.109043,0.459081,0.003973,0.574085,0.599102
4,NF0014,51,%,1,DMSO,C4-2,C4,32,-0.787903,-1.431078,...,-0.232026,0.254619,-0.200967,-0.591741,0.02794,0.226455,-0.248335,-0.214032,-0.362201,-0.369626


In [7]:
# Columns are blocklisted when their name contains "Area" together with any of
# these tokens.
sc_blocklist_tokens = ("MAX", "MIN", "BBOX", "CENTER")

# The file starts with the literal header line "blocklist", followed by one
# blocklisted column name per line.
sc_blocklist = ["blocklist"]
sc_blocklist.extend(
    column
    for column in sc_normalized.columns
    if "Area" in column and any(token in column for token in sc_blocklist_tokens)
)

# NOTE(review): this path is rooted at ../data while the profiles are read from
# ../../data -- confirm the blocklist is meant to live in a different tree.
sc_blocklist_path = pathlib.Path("../data/blocklist/sc_blocklist.txt").resolve()
sc_blocklist_path.parent.mkdir(parents=True, exist_ok=True)

# write the blocklist to a file
with open(sc_blocklist_path, "w") as f:
    f.writelines(f"{item}\n" for item in sc_blocklist)

In [8]:
# columns that describe a cell rather than measure it
sc_metadata_columns = [
    "patient",
    "object_id",
    "unit",
    "dose",
    "treatment",
    "image_set",
    "Well",
    "parent_organoid",
]

# feature-only view of the profiles; metadata columns that are absent are
# silently skipped by errors="ignore"
sc_features_df = sc_normalized.drop(columns=sc_metadata_columns, errors="ignore")

# every remaining column is treated as a feature, in its original order
sc_features_columns = sc_features_df.columns.tolist()

In [9]:
# feature select the single-cell profiles (feature columns only)
sc_selected_features = feature_select(
    sc_features_df,
    features=sc_features_columns,
    operation=feature_select_ops,
    blocklist_file=sc_blocklist_path,
)

# re-attach the metadata; both indexes are reset so rows are aligned by position
sc_metadata_df = sc_normalized[sc_metadata_columns].reset_index(drop=True)
sc_fs_profiles = pd.concat(
    [sc_metadata_df, sc_selected_features.reset_index(drop=True)],
    axis="columns",
)

# save the feature-selected profiles and preview them
sc_fs_profiles.to_parquet(sc_fs_output_path, index=False)
sc_fs_profiles.head()

Unnamed: 0,patient,object_id,unit,dose,treatment,image_set,Well,parent_organoid,Area.Size.Shape_Nuclei_EXTENT,Area.Size.Shape_Nuclei_EULER.NUMBER,...,Granularity_Cytoplasm_BF_GRANULARITY.4,Granularity_Cytoplasm_DNA_GRANULARITY.15,Granularity_Cytoplasm_ER_GRANULARITY.16,Granularity_Cytoplasm_ER_GRANULARITY.8,Texture_Cytoplasm_AGP_Correlation_256.1,Texture_Cytoplasm_AGP_Sum.Variance_256.1,Texture_Cytoplasm_DNA_Contrast_256.1,Texture_Cytoplasm_ER_Inverse.Difference.Moment_256.1,Texture_Cytoplasm_Mito_Contrast_256.1,Texture_Cytoplasm_Mito_Information.Measure.of.Correlation.1_256.1
0,NF0014,15,%,1,DMSO,C4-2,C4,32,0.836761,-0.392232,...,-1.236246,-1.309578,0.0,-1.303511,0.442841,-0.229443,0.09705,0.020495,0.203806,-0.144187
1,NF0014,26,%,1,DMSO,C4-2,C4,32,0.214518,2.54951,...,0.158438,-0.988114,0.0,0.999034,0.345999,-0.401708,-0.286432,-0.417305,-0.236065,-0.123169
2,NF0014,37,%,1,DMSO,C4-2,C4,32,0.459903,-0.392232,...,0.022219,-0.974631,0.0,-1.201527,0.147025,-0.953462,-1.084089,0.727439,-1.443039,-0.31915
3,NF0014,43,%,1,DMSO,C4-2,C4,32,-1.2739,2.54951,...,-1.129586,0.435303,1.421085e-14,0.952505,-0.183131,0.153477,0.997786,-0.052029,1.067864,0.132546
4,NF0014,51,%,1,DMSO,C4-2,C4,32,0.505727,-0.392232,...,-1.257155,-1.801191,0.0,0.096698,-0.030948,-1.0673,-0.954839,0.243587,-0.498713,-0.591741


### Feature select the organoid profiles

In [10]:
# preview the first rows of the normalized organoid profiles
organoid_normalized.head()

Unnamed: 0,patient,object_id,unit,dose,treatment,image_set,Well,single_cell_count,Area.Size.Shape_Organoid_VOLUME,Area.Size.Shape_Organoid_CENTER.X,...,Texture_Organoid_Mito_Difference.Entropy_256.1,Texture_Organoid_Mito_Difference.Variance_256.1,Texture_Organoid_Mito_Entropy_256.1,Texture_Organoid_Mito_Information.Measure.of.Correlation.1_256.1,Texture_Organoid_Mito_Information.Measure.of.Correlation.2_256.1,Texture_Organoid_Mito_Inverse.Difference.Moment_256.1,Texture_Organoid_Mito_Sum.Average_256.1,Texture_Organoid_Mito_Sum.Entropy_256.1,Texture_Organoid_Mito_Sum.Variance_256.1,Texture_Organoid_Mito_Variance_256.1
0,NF0014,32,%,1,DMSO,C4-2,C4,29,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [11]:
# Columns are blocklisted when their name contains "Area" together with any of
# these tokens.
organoid_blocklist_tokens = ("MAX", "MIN", "BBOX", "CENTER")

# The file starts with the literal header line "blocklist", followed by one
# blocklisted column name per line.
organoid_blocklist = ["blocklist"]
organoid_blocklist.extend(
    column
    for column in organoid_normalized.columns
    if "Area" in column
    and any(token in column for token in organoid_blocklist_tokens)
)

# NOTE(review): this path is rooted at ../data while the profiles are read from
# ../../data -- confirm the blocklist is meant to live in a different tree.
organoid_blocklist_path = pathlib.Path(
    "../data/blocklist/organoid_blocklist.txt"
).resolve()
organoid_blocklist_path.parent.mkdir(parents=True, exist_ok=True)

# write the blocklist to a file
with open(organoid_blocklist_path, "w") as f:
    f.writelines(f"{item}\n" for item in organoid_blocklist)

In [12]:
# columns that describe an organoid rather than measure it
organoid_metadata_columns = [
    "patient",
    "object_id",
    "unit",
    "dose",
    "treatment",
    "image_set",
    "Well",
    "single_cell_count",
]

# feature-only view of the profiles; metadata columns that are absent are
# silently skipped by errors="ignore"
organoid_features_df = organoid_normalized.drop(
    columns=organoid_metadata_columns, errors="ignore"
)

# every remaining column is treated as a feature, in its original order
organoid_features_columns = organoid_features_df.columns.tolist()

In [13]:
# feature select the organoid profiles (feature columns only)
organoid_selected_features = feature_select(
    organoid_features_df,
    operation=feature_select_ops,
    features=organoid_features_columns,
    blocklist_file=organoid_blocklist_path,
)

# Feature selection can remove every column (for example when the table holds a
# single organoid). Warn instead of silently writing a metadata-only profile.
if organoid_selected_features.shape[1] == 0:
    warnings.warn(
        "Feature selection removed every organoid feature; "
        f"{organoid_fs_output_path} will contain metadata columns only.",
        stacklevel=2,
    )

# re-attach the metadata; both indexes are reset so rows are aligned by position
organoid_fs_profiles = pd.concat(
    [
        organoid_normalized[organoid_metadata_columns].reset_index(drop=True),
        organoid_selected_features.reset_index(drop=True),
    ],
    axis=1,
)

# save the feature-selected profiles and preview them
organoid_fs_profiles.to_parquet(organoid_fs_output_path, index=False)
organoid_fs_profiles.head()

Unnamed: 0,patient,object_id,unit,dose,treatment,image_set,Well,single_cell_count
0,NF0014,32,%,1,DMSO,C4-2,C4,29
