In [1]:
import pandas as pd
import pathlib

from pycytominer import feature_select
from pycytominer.cyto_utils import output

import sys
sys.path.append("../../utils")
import sc_extraction_utils as sc_utils

In [2]:
# directory that the feature-selected parquet files are written to
output_dir = pathlib.Path("./data/")

# normalized single-cell parquet file for each run, keyed by run name
_normalized_parquet_by_run = {
    "SHSY5Y_first_run": "./data/SHSY5Y_first_run_sc_norm.parquet",
    "SHSY5Y_second_run": "./data/SHSY5Y_second_run_sc_norm.parquet",
}

# dictionary with each run for the cell type; every entry stores the
# pathlib-normalized string path under "normalized_path"
run_info_dictionary = {
    run_name: {"normalized_path": str(pathlib.Path(parquet_path))}
    for run_name, parquet_path in _normalized_parquet_by_run.items()
}

In [3]:
# list of operations for feature select function to use on input profile
feature_select_ops = [
    "variance_threshold",
    "correlation_threshold",
    "blocklist",
]

# make sure the destination exists up front so a missing directory is not
# only discovered when the first run is written out
output_dir.mkdir(parents=True, exist_ok=True)

# process each run
for SHSY5Y_run, info in run_info_dictionary.items():
    normalized_df = pd.read_parquet(info["normalized_path"])
    # join with the Path operator instead of interpolating the directory into a string
    output_file = str(output_dir / f"{SHSY5Y_run}_sc_norm_fs.parquet")
    print(f"Performing feature selection on normalized annotated merged single cells for {SHSY5Y_run}!")

    # perform feature selection with the operations specified.
    # output_file is left at its default so the selected dataframe is returned:
    # the "none" string sentinel is only understood by older pycytominer
    # releases, while newer ones use None and would treat "none" as a file name
    # NOTE(review): confirm against the pinned pycytominer version
    feature_select_df = feature_select(
        normalized_df,
        operation=feature_select_ops,
    )

    # save features selected df as parquet file
    output(
        df=feature_select_df,
        output_filename=output_file,
        output_type="parquet",
    )
    print(f"Features have been selected for {SHSY5Y_run} and saved!")

Performing feature selection on normalized annotated merged single cells for SHSY5Y_first_run!
Features have been selected for SHSY5Y_first_run and saved!
Performing feature selection on normalized annotated merged single cells for SHSY5Y_second_run!
Features have been selected for SHSY5Y_second_run and saved!


In [4]:
# feature_select_df is whatever the final iteration of the processing loop
# assigned, so this previews only the last run that was processed:
# dimensions first, then the leading rows
n_rows, n_cols = feature_select_df.shape
print((n_rows, n_cols))
feature_select_df.head()

(290878, 1276)


Unnamed: 0,Metadata_cell_type,Metadata_Well,Metadata_number_of_singlecells,Metadata_incubation inducer (h),Metadata_inhibitor,Metadata_inhibitor_concentration,Metadata_inhibitor_concentration_unit,Metadata_inducer1,Metadata_inducer1_concentration,Metadata_inducer1_concentration_unit,...,Nuclei_Texture_InverseDifferenceMoment_CorrER_3_01_256,Nuclei_Texture_InverseDifferenceMoment_CorrER_3_02_256,Nuclei_Texture_InverseDifferenceMoment_CorrER_3_03_256,Nuclei_Texture_InverseDifferenceMoment_CorrMito_3_00_256,Nuclei_Texture_InverseDifferenceMoment_CorrPM_3_00_256,Nuclei_Texture_InverseDifferenceMoment_CorrPM_3_02_256,Nuclei_Texture_SumEntropy_CorrPM_3_01_256,Nuclei_Texture_SumVariance_CorrGasdermin_3_01_256,Nuclei_Texture_SumVariance_CorrMito_3_03_256,Nuclei_Texture_SumVariance_CorrPM_3_01_256
0,SH-SY5Y,I13,3803,6,Media ctr,,,media ctr,,,...,-0.514091,-0.055462,1.15719,-0.28404,1.149262,1.113963,-1.208402,-0.071781,-0.049587,-0.055033
1,SH-SY5Y,I13,3803,6,Media ctr,,,media ctr,,,...,-2.321946,-2.637265,-1.375906,-0.987115,-0.62753,-0.91302,0.884619,-0.032483,0.074814,0.011987
2,SH-SY5Y,I13,3803,6,Media ctr,,,media ctr,,,...,0.514732,-0.482626,-0.540586,0.783146,0.538427,0.402433,-1.195798,-0.065013,-0.124266,-0.058691
3,SH-SY5Y,I13,3803,6,Media ctr,,,media ctr,,,...,1.222402,0.576237,0.625213,0.576041,0.238163,0.212912,-0.704956,-0.061256,-0.107076,-0.051076
4,SH-SY5Y,I13,3803,6,Media ctr,,,media ctr,,,...,0.383106,0.331319,0.363506,0.334869,-0.951015,-0.352459,0.272006,-0.059324,-0.084269,-0.024785
