# Perform feature selection on normalized data

## Import libraries

In [1]:
import gc
import pathlib
import sys

import pandas as pd
from pycytominer import feature_select
from pycytominer.cyto_utils import output

## Set paths and variables

In [2]:
# path to the normalized profile (parquet) that feature selection reads
input_file_path = pathlib.Path("../data/4.normalized_data/normalized_profile.parquet")

# path the feature selected profile (parquet) is written to
output_file_path = pathlib.Path(
    "../data/5.feature_selected_data/features_selected_profile.parquet"
)
# create the output directory if needed; parents=True keeps mkdir from raising
# FileNotFoundError when an intermediate directory is missing
output_file_path.parent.mkdir(parents=True, exist_ok=True)

## Perform feature selection

In [3]:
# names of the pycytominer feature selection operations to run;
# this list is handed to feature_select through its `operation` argument
feature_select_ops = [
    "variance_threshold",
    "blocklist",
    "drop_na_columns",
    "correlation_threshold",
]

In [4]:
# columns that get a "Metadata_" prefix added to their name once feature
# selection has run (only columns still present at that point are renamed)
manual_block_list = [
    "Nuclei_AreaShape_BoundingBoxArea",
    "Nuclei_AreaShape_BoundingBoxMinimum_X",
    "Cells_AreaShape_BoundingBoxArea",
]

In [5]:
# announce the start of the feature selection step
print("Performing feature selection on normalized annotated merged single cells!")

# load the normalized profile and apply the operations in feature_select_ops
normalized_df = pd.read_parquet(input_file_path)
feature_select_df = feature_select(normalized_df, operation=feature_select_ops)

# prefix every remaining column named in manual_block_list with "Metadata_";
# all other column names are kept as they are
# NOTE(review): the rename runs after feature selection, so a listed column that
# was already removed is simply absent here -- confirm this ordering is intended
metadata_renames = {name: "Metadata_" + name for name in manual_block_list}
feature_select_df.columns = [
    metadata_renames.get(name, name) for name in feature_select_df.columns
]

print("Feature selection complete, saving to parquet file!")
# write the feature selected profile to output_file_path as a parquet file
output(
    df=feature_select_df,
    output_filename=output_file_path,
    output_type="parquet",
)

# report the resulting shape (compare with the input to see what was dropped)
# and preview the first rows
print(feature_select_df.shape)
feature_select_df.head()

Performing feature selection on normalized annotated merged single cells!
Feature selection complete, saving to parquet file!
(1700, 882)


Unnamed: 0,NMF_plate,NMF_Well,NMF_number_of_singlecells,NMF_compound,NMF_dose,NMF_control,NMF_ImageNumber,NMF_FOV,NMF_Time,NMF_Cells_Number_Object_Number,...,Nuclei_Texture_Correlation_CL_561_3_00_256,Nuclei_Texture_Correlation_CL_561_3_01_256,Nuclei_Texture_Correlation_CL_561_3_02_256,Nuclei_Texture_Correlation_CL_561_3_03_256,Nuclei_Texture_DifferenceVariance_CL_488_1_3_01_256,Nuclei_Texture_DifferenceVariance_CL_488_2_3_03_256,Nuclei_Texture_InverseDifferenceMoment_CL_561_3_03_256,Nuclei_Texture_SumAverage_CL_488_1_3_01_256,Nuclei_Texture_SumAverage_CL_488_2_3_00_256,Nuclei_Texture_SumAverage_CL_561_3_01_256
0,1,C-02,180,Staurosporine,0.0,negative,1,1,0.0,6,...,1.627355,1.501131,-0.845063,-0.60843,1.292035,-1.456748,0.557835,-0.390739,-1.476686,0.096642
1,1,C-02,180,Staurosporine,0.0,negative,1,1,0.0,7,...,-0.924724,1.501131,1.56223,-0.70188,0.886759,0.827159,0.436105,-0.390739,0.36755,0.096642
2,1,C-02,180,Staurosporine,0.0,negative,1,1,0.0,9,...,-0.833578,-0.793186,-0.782806,-0.572324,1.292035,-1.456748,-2.479875,-0.390739,-1.476686,-2.420632
3,1,C-02,180,Staurosporine,0.0,negative,1,1,0.0,11,...,0.168607,0.280239,-0.001959,-0.330995,-1.316472,0.573392,0.12474,0.094856,0.36755,0.881523
4,1,C-02,180,Staurosporine,0.0,negative,1,1,0.0,12,...,-0.912043,-0.880947,-0.864179,-0.630945,-0.924705,0.375017,0.527587,-0.077278,0.394278,0.189384
