In [1]:
import os
import pandas as pd
from pycytominer import feature_select,aggregate

In [2]:
# Input folder holding the profile CSVs.
# Before running this notebook, download the files described in README.md into it.
input_folder = "Profiles/"

# Output folder for every file this notebook writes.
# exist_ok=True already makes the call a no-op when the directory exists, so no
# separate existence check is needed; if a non-directory is in the way this
# fails here instead of at the first write.
output_folder = "outputs"
os.makedirs(output_folder, exist_ok=True)

In [3]:
# Read the barcode library used in the experiment and collect its sgRNA column
# into a plain list; the cell output is the number of entries in that list.
saber_library = pd.read_csv("SABER_Library_ngt_Included_Oligo_Sequences_Assiged.csv")
guide_list = saber_library["sgRNA"].tolist()
len(guide_list)

2400

In [6]:
# Load the normalized plate-level profiles, drop rows whose matched barcode is
# not in the guide library ("ghost" guides), and merge the plates into one frame.
plates = ['CP_Plate_1','CP_Plate_2']

pre_df_list = []
for plate in plates:
    filename = f'20240202_6W_CP498_SABER_Pilot_HeLa_guide_normalized_ALLBATCHES___{plate}___ALLWELLS.csv.gz'
    plate_df = pd.read_csv(os.path.join(input_folder, filename))
    # Keep only rows whose matched barcode belongs to the library loaded above.
    in_library = plate_df["Metadata_Foci_Barcode_MatchedTo_Barcode"].isin(guide_list)
    pre_df_list.append(plate_df[in_library])

# Each plate frame carries its own row labels from read_csv, so a plain concat
# would repeat labels across plates; ignore_index=True assigns a fresh 0..n-1
# index to the merged frame.
profile_df = pd.concat(pre_df_list, ignore_index=True)

profile_df.head()

Unnamed: 0,Metadata_Foci_Barcode_MatchedTo_GeneCode,Metadata_Foci_Barcode_MatchedTo_Barcode,Cells_AreaShape_Center_X_x,Cells_AreaShape_Center_Y_x,Cytoplasm_AreaShape_Center_X_x,Cytoplasm_AreaShape_Center_Y_x,Nuclei_AreaShape_Center_X_x,Nuclei_AreaShape_Center_Y_x,Cells_AreaShape_Area,Cells_AreaShape_BoundingBoxArea,...,Nuclei_Texture_Variance_WGA_10_02_256,Nuclei_Texture_Variance_WGA_10_03_256,Nuclei_Texture_Variance_WGA_20_00_256,Nuclei_Texture_Variance_WGA_20_01_256,Nuclei_Texture_Variance_WGA_20_02_256,Nuclei_Texture_Variance_WGA_20_03_256,Nuclei_Texture_Variance_WGA_5_00_256,Nuclei_Texture_Variance_WGA_5_01_256,Nuclei_Texture_Variance_WGA_5_02_256,Nuclei_Texture_Variance_WGA_5_03_256
0,AARS2,AAAGGCGGCCCTCACGGCCG,2671.4,3523.5,2674.2,3527.6,2671.4,3527.7,-0.51156,-0.51212,...,-0.35477,-0.36219,0,0,0,0,-0.40761,-0.37515,-0.36112,-0.35993
1,AARS2,AGCAAACTGGGGTCGCCGCG,2582.0,3378.9,2582.6,3377.8,2581.1,3375.9,0.084136,-0.065571,...,0.000376,-0.048306,0,0,0,0,0.020732,0.008269,0.023519,0.034127
2,AARS2,CCAACTTCTACGCAGAACAG,2751.4,3508.4,2750.6,3508.6,2754.3,3508.1,-0.22247,-0.24945,...,0.10417,0.033998,0,0,0,0,0.025522,0.035306,0.046367,0.047433
3,AARS2,GCTGAGCCAGTTCAGAAGCA,2727.4,3488.2,2728.4,3488.2,2736.2,3482.1,-0.58164,-0.66097,...,-0.061569,-0.16679,0,0,0,0,-0.11992,-0.089091,-0.080681,-0.10323
4,AARSD1,ACCTCCGCTCCCAATCTACC,2829.1,3379.1,2827.8,3380.1,2835.6,3376.3,0.99958,1.1778,...,-0.69561,-0.81238,0,0,0,0,-0.60749,-0.56959,-0.63268,-0.57116


In [8]:
# Run pycytominer feature selection on the merged profiles and save the result.
# Feature columns are inferred by pycytominer and every row is used as a sample;
# the listed operations and cutoffs are passed through unchanged.
profile_feature_selected_df = feature_select(
    profiles=profile_df,
    features='infer',
    samples='all',
    operation=[
        'variance_threshold',
        'correlation_threshold',
        'drop_na_columns',
        'blocklist',
        'drop_outliers',
    ],
    na_cutoff=0,
    corr_threshold=0.9,
    outlier_cutoff=100,
)

# index=False keeps the pandas row index out of the file, matching the other
# CSV exports in this notebook; otherwise it is written as an extra unnamed
# first column.
profile_feature_selected_df.to_csv(
    os.path.join(output_folder,'20240202_6W_CP498_SABER_Pilot_HeLa_guide_normalized_merged_feature_select_ALLWELLS_cp.csv.gz'),
    index=False,
)


In [10]:
# Guide-level aggregation: take the median of the feature-selected profiles
# within each (gene code, guide barcode) group, then write the table to the
# output folder without the row index.
profile_feature_selected_median_df = aggregate(
    population_df=profile_feature_selected_df,
    strata=[
        'Metadata_Foci_Barcode_MatchedTo_GeneCode',
        'Metadata_Foci_Barcode_MatchedTo_Barcode',
    ],
    features='infer',
    operation='median',
)

guide_median_path = os.path.join(
    output_folder,
    '20240202_6W_CP498_SABER_Pilot_HeLa_guide_normalized_merged_feature_select_median_ALLWELLS_cp.csv.gz',
)
profile_feature_selected_median_df.to_csv(guide_median_path, index=False)


**Gene-level aggregation**

In [11]:
# Guide-level profile file(s) that the gene-level aggregation cell reads from
# the output folder.
file_list = [
    "20240202_6W_CP498_SABER_Pilot_HeLa_guide_normalized_merged_feature_select_median_ALLWELLS_cp.csv.gz",
]

# Gene-level aggregation settings: group rows by gene code, let pycytominer
# infer the feature columns, and combine each group with the median.
aggregate_columns = ['Metadata_Foci_Barcode_MatchedTo_GeneCode']
aggregate_features, aggregate_operation = 'infer', 'median'

In [12]:
# Gene-level aggregation: for every file in file_list, load it from the output
# folder, keep rows whose barcode is in the guide library, aggregate with the
# settings defined above, and save the result alongside the input file.
for profile_file in file_list:
    print(f"Now loading {profile_file}")
    df = pd.read_csv(os.path.join(output_folder, profile_file))
    df = df.loc[df["Metadata_Foci_Barcode_MatchedTo_Barcode"].isin(guide_list)]

    print("Now aggregating.")
    gene_df = aggregate(
        population_df=df,
        strata=aggregate_columns,
        features=aggregate_features,
        operation=aggregate_operation,
    )

    print("Now saving aggregated file.")
    # Split at the first dot only, so the suffix is inserted before the whole
    # extension (e.g. ".csv.gz") rather than before just the last part.
    name_parts = profile_file.split('.', 1)
    agg_file_name = f"{name_parts[0]}_gene_aggregated.{name_parts[1]}"
    gene_df.to_csv(os.path.join(output_folder, agg_file_name), index=False)

Now loading 20240202_6W_CP498_SABER_Pilot_HeLa_guide_normalized_merged_feature_select_median_ALLWELLS_cp.csv.gz
Now aggregating.
Now saving aggregated file.
