This notebook performs profile feature selection.

In [1]:
import argparse
import pathlib

import pandas as pd
from pycytominer import feature_select

# Detect the execution context: `get_ipython` only exists as a name when the
# code runs under IPython/Jupyter, so a NameError means a plain Python script.
try:
    cfg = get_ipython().config
except NameError:
    in_notebook = False
else:
    in_notebook = True

In [2]:
# Choose the patient to process: a required command-line flag when executed as
# a script, or a fixed default when the cells are run interactively.
if not in_notebook:
    argparser = argparse.ArgumentParser()
    argparser.add_argument(
        "--patient",
        type=str,
        required=True,
        # The example mirrors the ID format used for the data directories.
        help="Patient ID to process, e.g. 'NF0014'",
    )
    args = argparser.parse_args()
    patient = args.patient
else:
    # No command line is available in a notebook; use the default patient.
    patient = "NF0014"

In [3]:
# Input profiles for the selected patient. `strict=True` makes resolution raise
# FileNotFoundError right away if an input file is missing.
sc_normalized_path = pathlib.Path(
    f"../../data/{patient}/image_based_profiles/2.sc_normalized_profiles.parquet"
)
sc_normalized_path = sc_normalized_path.resolve(strict=True)

organoid_normalized_path = pathlib.Path(
    f"../../data/{patient}/image_based_profiles/2.organoid_normalized_profiles.parquet"
)
organoid_normalized_path = organoid_normalized_path.resolve(strict=True)

# Output profiles. These are resolved non-strictly because the files are
# created later in this notebook and need not exist yet.
sc_fs_output_path = pathlib.Path(
    f"../../data/{patient}/image_based_profiles/3.sc_fs_profiles.parquet"
)
sc_fs_output_path = sc_fs_output_path.resolve()

organoid_fs_output_path = pathlib.Path(
    f"../../data/{patient}/image_based_profiles/3.organoid_fs_profiles.parquet"
)
organoid_fs_output_path = organoid_fs_output_path.resolve()

In [4]:
# Load the normalized single-cell and organoid profiles (in that order).
sc_normalized, organoid_normalized = map(
    pd.read_parquet,
    (sc_normalized_path, organoid_normalized_path),
)

In [5]:
# pycytominer feature-selection operations, shared by the single-cell and
# organoid runs below.
feature_select_ops = [
    # drop features with too little variance
    "variance_threshold",
    # drop features with too many missing values
    "drop_na_columns",
    # drop features that are highly correlated with another feature
    "correlation_threshold",
    # drop features listed in the blocklist file passed to feature_select
    "blocklist",
]

### Feature select the single-cell profiles

In [6]:
# Preview the first rows of the normalized single-cell profiles.
sc_normalized.head()

Unnamed: 0,patient,object_id,unit,dose,treatment,image_set,Well,parent_organoid,Area.Size.Shape_Nuclei_VOLUME,Area.Size.Shape_Nuclei_CENTER.X,...,Texture_Cytoplasm_Mito_Difference.Entropy_256.1,Texture_Cytoplasm_Mito_Difference.Variance_256.1,Texture_Cytoplasm_Mito_Entropy_256.1,Texture_Cytoplasm_Mito_Information.Measure.of.Correlation.1_256.1,Texture_Cytoplasm_Mito_Information.Measure.of.Correlation.2_256.1,Texture_Cytoplasm_Mito_Inverse.Difference.Moment_256.1,Texture_Cytoplasm_Mito_Sum.Average_256.1,Texture_Cytoplasm_Mito_Sum.Entropy_256.1,Texture_Cytoplasm_Mito_Sum.Variance_256.1,Texture_Cytoplasm_Mito_Variance_256.1
0,NF0014,70,uM,10,Mirdametinib,G8-1,G8,40,-0.68873,-0.320369,...,0.07147,0.014529,0.043398,0.553207,0.433568,0.020056,0.042939,0.044455,0.210545,0.217315
1,NF0014,122,uM,10,Mirdametinib,G8-1,G8,40,-0.389751,-0.493357,...,0.125826,-0.02077,0.094458,0.160591,0.654296,-0.011324,0.098997,0.100074,0.240877,0.241641
2,NF0014,162,uM,10,Mirdametinib,G8-1,G8,40,-0.331708,0.049285,...,-0.002328,0.054434,-0.009989,0.284718,0.278696,0.0581,-0.059797,-0.012521,-0.003908,-0.003837
3,NF0014,165,uM,10,Mirdametinib,G8-1,G8,40,-0.472822,-0.120799,...,-0.220983,0.168983,-0.187017,0.70227,-0.54307,0.160654,-0.318593,-0.203146,-0.489233,-0.489208
4,NF0014,182,uM,10,Mirdametinib,G8-1,G8,40,-0.808064,-0.351828,...,-0.108694,0.111325,-0.100026,0.171216,-0.070353,0.109511,-0.17167,-0.106965,-0.206362,-0.211112


In [7]:
# Build the single-cell blocklist. The first entry is the literal header
# "blocklist" (pycytominer reads the file as a table and looks for a column
# with that name); it is followed by every "Area" feature whose name contains
# MAX, MIN, BBOX, or CENTER.
# NOTE(review): these look like bounding-box/position measurements rather than
# morphology - confirm that is why they are excluded.
sc_blocklist = ["blocklist"]
for column_name in sc_normalized.columns:
    if "Area" in column_name and any(
        token in column_name for token in ("MAX", "MIN", "BBOX", "CENTER")
    ):
        sc_blocklist.append(column_name)

# Write the blocklist to disk, one entry per line, creating the directory if
# it does not exist yet.
sc_blocklist_path = pathlib.Path("../data/blocklist/sc_blocklist.txt").resolve()
sc_blocklist_path.parent.mkdir(parents=True, exist_ok=True)
with open(sc_blocklist_path, "w") as f:
    f.writelines(f"{entry}\n" for entry in sc_blocklist)

In [8]:
# Columns that describe each single cell rather than measure it; they are kept
# out of feature selection and re-attached afterwards.
sc_metadata_columns = [
    "patient",
    "object_id",
    "unit",
    "dose",
    "treatment",
    "image_set",
    "Well",
    "parent_organoid",
]

# Every remaining column is treated as a feature, in its original order.
sc_features_columns = sc_normalized.columns[
    ~sc_normalized.columns.isin(sc_metadata_columns)
].tolist()

# Feature-only frame; metadata columns absent from the data are ignored.
sc_features_df = sc_normalized.drop(columns=sc_metadata_columns, errors="ignore")

In [9]:
# Run feature selection on the single-cell feature columns only.
sc_fs_profiles = feature_select(
    sc_features_df,
    features=sc_features_columns,
    operation=feature_select_ops,
    blocklist_file=sc_blocklist_path,
)

# Re-attach the metadata. Both frames get a fresh 0..n-1 index so the
# column-wise concat lines rows up by position.
sc_metadata_df = sc_normalized[sc_metadata_columns].reset_index(drop=True)
sc_fs_profiles = pd.concat(
    objs=[sc_metadata_df, sc_fs_profiles.reset_index(drop=True)],
    axis="columns",
)

# Save the feature-selected profiles and preview them.
sc_fs_profiles.to_parquet(sc_fs_output_path, index=False)
sc_fs_profiles.head()

Unnamed: 0,patient,object_id,unit,dose,treatment,image_set,Well,parent_organoid,Area.Size.Shape_Nuclei_EXTENT,Colocalization_Nuclei_AGP.BF_MIN.CORRELATION.COEFF,...,Texture_Cytoplasm_AGP_Information.Measure.of.Correlation.2_256.1,Texture_Cytoplasm_BF_Correlation_256.1,Texture_Cytoplasm_BF_Inverse.Difference.Moment_256.1,Texture_Cytoplasm_BF_Sum.Average_256.1,Texture_Cytoplasm_DNA_Contrast_256.1,Texture_Cytoplasm_DNA_Sum.Variance_256.1,Texture_Cytoplasm_ER_Information.Measure.of.Correlation.1_256.1,Texture_Cytoplasm_ER_Sum.Variance_256.1,Texture_Cytoplasm_Mito_Contrast_256.1,Texture_Cytoplasm_Mito_Sum.Variance_256.1
0,NF0014,70,uM,10,Mirdametinib,G8-1,G8,40,0.59041,-0.937207,...,0.390195,0.292325,0.028686,-0.034991,-0.743879,-0.644247,-0.132689,-0.616014,0.387111,0.210545
1,NF0014,122,uM,10,Mirdametinib,G8-1,G8,40,0.242355,0.366156,...,0.550148,0.502795,0.003052,-0.009418,-0.51247,-0.388735,-0.743222,-0.410225,0.233132,0.240877
2,NF0014,162,uM,10,Mirdametinib,G8-1,G8,40,0.427268,-0.703361,...,0.207164,0.237268,0.064626,-0.070724,0.097954,0.0426,-0.598672,-0.553485,-0.001264,-0.003908
3,NF0014,165,uM,10,Mirdametinib,G8-1,G8,40,-1.756083,-0.418309,...,-0.550337,-0.142304,0.154835,-0.154227,-0.878099,-0.726517,-0.326628,-0.580438,-0.426149,-0.489233
4,NF0014,182,uM,10,Mirdametinib,G8-1,G8,40,0.570591,1.021355,...,-0.124153,0.383889,0.111983,-0.101184,-0.646486,-0.469616,-1.041207,-0.500639,-0.322758,-0.206362


### Feature select the organoid profiles

In [10]:
# Preview the first rows of the normalized organoid profiles.
organoid_normalized.head()

Unnamed: 0,patient,object_id,unit,dose,treatment,image_set,Well,single_cell_count,Area.Size.Shape_Organoid_VOLUME,Area.Size.Shape_Organoid_CENTER.X,...,Texture_Organoid_Mito_Difference.Entropy_256.1,Texture_Organoid_Mito_Difference.Variance_256.1,Texture_Organoid_Mito_Entropy_256.1,Texture_Organoid_Mito_Information.Measure.of.Correlation.1_256.1,Texture_Organoid_Mito_Information.Measure.of.Correlation.2_256.1,Texture_Organoid_Mito_Inverse.Difference.Moment_256.1,Texture_Organoid_Mito_Sum.Average_256.1,Texture_Organoid_Mito_Sum.Entropy_256.1,Texture_Organoid_Mito_Sum.Variance_256.1,Texture_Organoid_Mito_Variance_256.1
0,NF0014,40,uM,10,Mirdametinib,G8-1,G8,8,-1.718171,-0.204121,...,-2.133984,2.118421,-1.889329,-1.389515,-2.146901,2.142268,-1.376009,-1.968139,-0.980562,-0.987831
1,NF0014,24,uM,1,Fimepinostat,D5-1,D5,5,-2.085979,1.325346,...,0.595326,-0.249176,-0.130507,3.504391,-4.467319,-0.035009,-1.498039,-0.153214,-1.127113,-1.093515
2,NF0014,85,uM,1,Mirdametinib,F8-1,F8,9,0.130074,-2.027178,...,2.021954,-2.551488,2.543025,1.116702,1.4841,-1.863943,-0.923582,2.403406,-1.18946,-1.193731
3,NF0014,18,nM,10,STAURO,G11-1,G11,1,-2.223655,2.247724,...,-2.453815,2.30287,-2.233725,-1.75627,-3.008386,2.194833,-1.196517,-2.305849,-0.726536,-0.730073
4,NF0014,24,uM,1,Binimetinib,G7-1,G7,6,-1.728123,1.564666,...,-1.087216,1.297326,-0.942457,-1.000208,-0.322736,1.26196,-0.360881,-0.943562,0.087396,0.078821


In [11]:
# Build the organoid blocklist. The first entry is the literal header
# "blocklist" (pycytominer reads the file as a table and looks for a column
# with that name); it is followed by every "Area" feature whose name contains
# MAX, MIN, BBOX, or CENTER.
# NOTE(review): these look like bounding-box/position measurements rather than
# morphology - confirm that is why they are excluded.
organoid_blocklist = ["blocklist"]
for column_name in organoid_normalized.columns:
    if "Area" in column_name and any(
        token in column_name for token in ("MAX", "MIN", "BBOX", "CENTER")
    ):
        organoid_blocklist.append(column_name)

# Write the blocklist to disk, one entry per line, creating the directory if
# it does not exist yet.
organoid_blocklist_path = pathlib.Path(
    "../data/blocklist/organoid_blocklist.txt"
).resolve()
organoid_blocklist_path.parent.mkdir(parents=True, exist_ok=True)
with open(organoid_blocklist_path, "w") as f:
    f.writelines(f"{entry}\n" for entry in organoid_blocklist)

In [12]:
# Columns that describe each organoid rather than measure it; they are kept
# out of feature selection and re-attached afterwards.
organoid_metadata_columns = [
    "patient",
    "object_id",
    "unit",
    "dose",
    "treatment",
    "image_set",
    "Well",
    "single_cell_count",
]

# Every remaining column is treated as a feature, in its original order.
organoid_features_columns = organoid_normalized.columns[
    ~organoid_normalized.columns.isin(organoid_metadata_columns)
].tolist()

# Feature-only frame; metadata columns absent from the data are ignored.
organoid_features_df = organoid_normalized.drop(
    columns=organoid_metadata_columns, errors="ignore"
)

In [13]:
# Run feature selection on the organoid feature columns only.
organoid_fs_profiles = feature_select(
    organoid_features_df,
    features=organoid_features_columns,
    operation=feature_select_ops,
    blocklist_file=organoid_blocklist_path,
)

# Re-attach the metadata. Both frames get a fresh 0..n-1 index so the
# column-wise concat lines rows up by position.
organoid_metadata_df = organoid_normalized[organoid_metadata_columns].reset_index(
    drop=True
)
organoid_fs_profiles = pd.concat(
    objs=[organoid_metadata_df, organoid_fs_profiles.reset_index(drop=True)],
    axis="columns",
)

# Save the feature-selected profiles and preview them.
organoid_fs_profiles.to_parquet(organoid_fs_output_path, index=False)
organoid_fs_profiles.head()

Unnamed: 0,patient,object_id,unit,dose,treatment,image_set,Well,single_cell_count,Area.Size.Shape_Organoid_VOLUME,Area.Size.Shape_Organoid_EXTENT,...,Texture_Organoid_BF_Variance_256.1,Texture_Organoid_DNA_Contrast_256.1,Texture_Organoid_DNA_Variance_256.1,Texture_Organoid_ER_Contrast_256.1,Texture_Organoid_ER_Information.Measure.of.Correlation.1_256.1,Texture_Organoid_ER_Sum.Average_256.1,Texture_Organoid_ER_Variance_256.1,Texture_Organoid_Mito_Contrast_256.1,Texture_Organoid_Mito_Sum.Average_256.1,Texture_Organoid_Mito_Variance_256.1
0,NF0014,40,uM,10,Mirdametinib,G8-1,G8,8,-1.718171,-1.664739,...,-1.334719,-1.068404,-1.112068,-0.635342,-2.238332,-1.169085,-0.882218,-0.972143,-1.376009,-0.987831
1,NF0014,24,uM,1,Fimepinostat,D5-1,D5,5,-2.085979,-2.484067,...,-0.427715,-0.455481,-1.543011,0.074815,5.133408,-1.179782,-0.99543,1.584246,-1.498039,-1.093515
2,NF0014,85,uM,1,Mirdametinib,F8-1,F8,9,0.130074,-0.663134,...,0.068257,-0.455344,-1.153488,0.648688,1.518773,0.786162,-0.364094,-0.88644,-0.923582,-1.193731
3,NF0014,18,nM,10,STAURO,G11-1,G11,1,-2.223655,0.265362,...,-1.677969,-1.018121,-1.357958,-0.786237,-2.838483,-1.520782,-1.059963,-0.601197,-1.196517,-0.730073
4,NF0014,24,uM,1,Binimetinib,G7-1,G7,6,-1.728123,-0.369563,...,-0.549648,-0.69948,-0.59302,-0.760946,-0.497278,-1.27012,-1.007393,-0.50722,-0.360881,0.078821
