# Perform data preprocessing with pycytominer on each compartment file

## Import libraries

In [1]:
import pathlib

import pandas as pd
from pycytominer import normalize, feature_select
from pycytominer.cyto_utils import infer_cp_features

## Set paths and variables

In [2]:
# operations to perform for feature selection
feature_select_ops = [
    "variance_threshold",
    "correlation_threshold",
    "blocklist",
    "drop_na_columns"
]

# Output directory for processed profiles
output_dir = pathlib.Path("./profiles/processed_profiles")
output_dir.mkdir(parents=True, exist_ok=True)

# Path to CytoTable converted profiles for each object
converted_profiles_dir = pathlib.Path("./profiles/converted_profiles/").resolve(strict=True)

# List all the converted files in the directory
converted_files = [file for file in converted_profiles_dir.iterdir() if file.is_file()]

# Print the converted files that will be processed
for file in converted_files:
    print(file)

profiles/converted_profiles/per_tubular.parquet
profiles/converted_profiles/per_lamellar.parquet
profiles/converted_profiles/per_image.parquet
profiles/converted_profiles/per_mito.parquet


## Perform preprocessing on single cell features

In [3]:
for file in converted_files:
    print(f"Performing pycytominer pipeline for {file.stem}")
    output_normalized_file = str(
        pathlib.Path(f"{output_dir}/{file.stem}_normalized.parquet")
    )
    output_feature_select_file = str(
        pathlib.Path(f"{output_dir}/{file.stem}_feature_selected.parquet")
    )

    # Load in file to process
    df = pd.read_parquet(file)

    # Set the compartment name to find in the data frame (must be capitalized)
    compartment_name = file.stem.split("_")[1].capitalize()

    # Find the cp features based on the mask name or image
    cp_features = infer_cp_features(population_df=df, compartments=[compartment_name])

    # Find the metadata features
    meta_features = infer_cp_features(population_df=df, compartments=[compartment_name], metadata=True)

    # Step 2: Normalization
    normalize(
        profiles=df,
        method="standardize",
        features=cp_features,
        meta_features=meta_features,
        output_file=output_normalized_file,
        output_type="parquet",
    )

    # Step 3: Feature selection
    feature_select(
        output_normalized_file,
        operation=feature_select_ops,
        na_cutoff=0,
        features=cp_features,
        output_file=output_feature_select_file,
        output_type="parquet",
    )

    print(
        f"Normalization and feature selection have been performed for {file.stem}"
    )

Performing pycytominer pipeline for per_tubular
Normalization and feature selection have been performed for per_tubular
Performing pycytominer pipeline for per_lamellar
Normalization and feature selection have been performed for per_lamellar
Performing pycytominer pipeline for per_image


  updated_mean = (last_sum + new_sum) / updated_sample_count
  T = new_sum / new_sample_count
  new_unnormalized_variance -= correction**2 / new_sample_count


Normalization and feature selection have been performed for per_image
Performing pycytominer pipeline for per_mito
Normalization and feature selection have been performed for per_mito


## Check example output file to confirm that the process worked

In [4]:
# Check output file
test_df = pd.read_parquet(output_feature_select_file)

print(test_df.shape)
test_df.head(2)

(459, 97)


Unnamed: 0,Metadata_ImageNumber,Metadata_Cell_ID,Metadata_Condition,Metadata_Mito_ID,Metadata_Mito_Number_Object_Number,Mito_AreaShape_BoundingBoxMaximum_X,Mito_AreaShape_BoundingBoxMaximum_Z,Mito_AreaShape_BoundingBoxVolume,Mito_AreaShape_Center_Y,Mito_AreaShape_EulerNumber,...,Mito_Texture_InverseDifferenceMoment_Mitochondria_3_04_256,Mito_Texture_InverseDifferenceMoment_Mitochondria_3_05_256,Mito_Texture_InverseDifferenceMoment_Mitochondria_3_06_256,Mito_Texture_InverseDifferenceMoment_Mitochondria_3_07_256,Mito_Texture_InverseDifferenceMoment_Mitochondria_3_08_256,Mito_Texture_InverseDifferenceMoment_Mitochondria_3_09_256,Mito_Texture_InverseDifferenceMoment_Mitochondria_3_10_256,Mito_Texture_InverseDifferenceMoment_Mitochondria_3_11_256,Mito_Texture_InverseDifferenceMoment_Mitochondria_3_12_256,Mito_Texture_SumVariance_Mitochondria_3_08_256
0,1,cell_001,shCtrl,1,1,0.42829,0.1591,0.000491,0.624221,2.419534,...,0.556266,1.141136,-0.50403,-0.250619,0.119015,0.023412,-0.334441,-0.737432,-0.054884,0.130932
1,2,cell_001,shCtrl,2,1,0.348931,-0.150598,-0.057103,0.186061,-0.100661,...,-0.263411,-0.97873,0.211009,-0.854785,-0.936465,0.346005,-0.406445,0.23298,0.260963,-0.335407
