In [1]:
import os
import pandas as pd
from pycytominer import feature_select,aggregate

In [2]:
# Set data input folder
# Before running the notebook, download the files described in README.md into this folder
input_folder = "Profiles/"

# Set output folder.
# exist_ok=True already makes creation a no-op when the directory exists, so no
# separate os.path.exists() check is needed. If the path exists but is not a
# directory, this now fails here rather than later at the first write.
output_folder = "outputs"
os.makedirs(output_folder, exist_ok=True)

In [3]:
# Read the library table and collect its sgRNA column: the barcodes used in the experiment
saber_library = pd.read_csv("SABER_Library_ngt_Included_Oligo_Sequences_Assiged.csv")
guide_list = saber_library["sgRNA"].tolist()
len(guide_list)

2400

In [12]:
# Only the column names are used from this file, so read just the header row
# (nrows=0) instead of loading every row of the single-cell profile table.
# `df` is therefore an empty frame that carries the file's columns.
# NOTE(review): this file is read from the working directory rather than from
# input_folder like the plate-level profiles — confirm that is intended.
df = pd.read_csv(
    '20240202_6W_CP498_SABER_Pilot_HeLa_single_cell_normalized_ALLBATCHES__AAAAGGATCAAGACGCTCTG_SF3B1.csv.gz',
    nrows=0,
)
column_list = list(df.columns)

In [4]:
# Load normalized plate-level profiles, remove ghost guides, and merge
plates = ['SABER_Plate_1', 'SABER_Plate_2']

pre_df_list = []
for plate in plates:
    filename = f'20240202_6W_CP498_SABER_Pilot_HeLa_guide_normalized_ALLBATCHES___{plate}___ALLWELLS.csv.gz'
    pre_profile_df = pd.read_csv(os.path.join(input_folder, filename))
    # Keep only rows whose matched barcode appears in the library guide list
    in_library = pre_profile_df["Metadata_Foci_Barcode_MatchedTo_Barcode"].isin(guide_list)
    pre_profile_df = pre_profile_df[in_library]
    pre_df_list.append(pre_profile_df)

# ignore_index=True gives the merged frame a fresh 0..n-1 index. Without it each
# plate keeps its own row labels, so the same label can appear once per plate.
profile_df = pd.concat(pre_df_list, ignore_index=True)

profile_df.head()

Unnamed: 0,Metadata_Foci_Barcode_MatchedTo_GeneCode,Metadata_Foci_Barcode_MatchedTo_Barcode,Cells_AreaShape_Center_X_x,Cells_AreaShape_Center_Y_x,Cytoplasm_AreaShape_Center_X_x,Cytoplasm_AreaShape_Center_Y_x,Nuclei_AreaShape_Center_X_x,Nuclei_AreaShape_Center_Y_x,Cells_AreaShape_Area,Cells_AreaShape_BoundingBoxArea,...,Nuclei_Texture_Variance_WGA_10_02_256,Nuclei_Texture_Variance_WGA_10_03_256,Nuclei_Texture_Variance_WGA_20_00_256,Nuclei_Texture_Variance_WGA_20_01_256,Nuclei_Texture_Variance_WGA_20_02_256,Nuclei_Texture_Variance_WGA_20_03_256,Nuclei_Texture_Variance_WGA_5_00_256,Nuclei_Texture_Variance_WGA_5_01_256,Nuclei_Texture_Variance_WGA_5_02_256,Nuclei_Texture_Variance_WGA_5_03_256
0,AARS2,AAAGGCGGCCCTCACGGCCG,2691.9,3291.4,2690.9,3291.5,2688.2,3289.7,-0.4252,-0.42568,...,-0.30745,-0.53185,0,0,0,0,-0.39053,-0.40637,-0.40695,-0.42023
1,AARS2,AGCAAACTGGGGTCGCCGCG,2810.5,3382.1,2811.3,3382.6,2820.5,3379.8,-0.04679,0.002498,...,-0.31084,-0.3922,0,0,0,0,-0.32955,-0.27388,-0.24364,-0.27513
2,AARS2,CCAACTTCTACGCAGAACAG,2835.8,3356.3,2833.8,3354.9,2832.5,3353.4,-0.5198,-0.33393,...,-0.022518,-0.17368,0,0,0,0,-0.078611,-0.079739,-0.089532,-0.13237
3,AARS2,GCTGAGCCAGTTCAGAAGCA,2923.9,3503.3,2922.3,3501.2,2933.7,3506.3,-0.037781,-0.072786,...,0.25517,0.36629,0,0,0,0,0.085265,0.12268,0.085219,0.11226
4,AARSD1,ACCTCCGCTCCCAATCTACC,2770.8,3461.8,2769.8,3461.8,2767.2,3458.9,0.64695,0.52948,...,-0.58076,-0.48669,0,0,0,0,-0.42524,-0.48565,-0.42515,-0.48657


In [13]:
# Display the column names collected from the single-cell profile file
column_list

['Metadata_Foci_Parent_Cells',
 'Metadata_Foci_Cell_Quality_Index',
 'Metadata_Foci_Barcode_MatchedTo_GeneCode',
 'Metadata_Foci_Barcode_MatchedTo_Barcode',
 'Metadata_Foci_Barcode_MatchedTo_Score_mean',
 'Metadata_Foci_Barcode_MatchedTo_Score_count',
 'Metadata_Foci_cell_quality_method',
 'Metadata_Foci_ImageNumber',
 'Metadata_Foci_site',
 'Metadata_Foci_plate',
 'Metadata_Foci_well',
 'Metadata_Foci_site_location',
 'Metadata_Foci_Metadata_dataset_split',
 'Metadata_Cells_ImageNumber_x',
 'Metadata_Cells_ObjectNumber',
 'Metadata_Cells_Parent_Nuclei_x',
 'Metadata_Cytoplasm_ImageNumber_x',
 'Metadata_Cytoplasm_ObjectNumber_x',
 'Metadata_Cytoplasm_Parent_Cells_x',
 'Metadata_Cytoplasm_Parent_Nuclei_x',
 'Metadata_Nuclei_ImageNumber_x',
 'Metadata_Nuclei_ObjectNumber_x',
 'Metadata_Foci_Cell_Quality',
 'Metadata_Cells_ImageNumber_y',
 'Metadata_Cells_Parent_Nuclei_y',
 'Metadata_Cytoplasm_ImageNumber_y',
 'Metadata_Cytoplasm_ObjectNumber_y',
 'Metadata_Cytoplasm_Parent_Cells_y',
 'Me

In [9]:
# Display the merged plate-level profiles (already filtered to library guides)
profile_df

Unnamed: 0,Metadata_Foci_Barcode_MatchedTo_GeneCode,Metadata_Foci_Barcode_MatchedTo_Barcode,Cells_AreaShape_Center_X_x,Cells_AreaShape_Center_Y_x,Cytoplasm_AreaShape_Center_X_x,Cytoplasm_AreaShape_Center_Y_x,Nuclei_AreaShape_Center_X_x,Nuclei_AreaShape_Center_Y_x,Cells_AreaShape_Area,Cells_AreaShape_BoundingBoxArea,...,Nuclei_Texture_Variance_WGA_10_02_256,Nuclei_Texture_Variance_WGA_10_03_256,Nuclei_Texture_Variance_WGA_20_00_256,Nuclei_Texture_Variance_WGA_20_01_256,Nuclei_Texture_Variance_WGA_20_02_256,Nuclei_Texture_Variance_WGA_20_03_256,Nuclei_Texture_Variance_WGA_5_00_256,Nuclei_Texture_Variance_WGA_5_01_256,Nuclei_Texture_Variance_WGA_5_02_256,Nuclei_Texture_Variance_WGA_5_03_256
0,AARS2,AAAGGCGGCCCTCACGGCCG,2691.9,3291.4,2690.9,3291.5,2688.2,3289.7,-0.425200,-0.425680,...,-0.307450,-0.531850,0,0,0,0,-0.390530,-0.406370,-0.406950,-0.420230
1,AARS2,AGCAAACTGGGGTCGCCGCG,2810.5,3382.1,2811.3,3382.6,2820.5,3379.8,-0.046790,0.002498,...,-0.310840,-0.392200,0,0,0,0,-0.329550,-0.273880,-0.243640,-0.275130
2,AARS2,CCAACTTCTACGCAGAACAG,2835.8,3356.3,2833.8,3354.9,2832.5,3353.4,-0.519800,-0.333930,...,-0.022518,-0.173680,0,0,0,0,-0.078611,-0.079739,-0.089532,-0.132370
3,AARS2,GCTGAGCCAGTTCAGAAGCA,2923.9,3503.3,2922.3,3501.2,2933.7,3506.3,-0.037781,-0.072786,...,0.255170,0.366290,0,0,0,0,0.085265,0.122680,0.085219,0.112260
4,AARSD1,ACCTCCGCTCCCAATCTACC,2770.8,3461.8,2769.8,3461.8,2767.2,3458.9,0.646950,0.529480,...,-0.580760,-0.486690,0,0,0,0,-0.425240,-0.485650,-0.425150,-0.486570
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2395,nontargeting,TAAGATCCGCGGGTGGCAAC,2952.8,3272.8,2952.1,3271.8,2954.6,3274.1,0.762960,0.847240,...,0.706340,0.711020,0,0,0,0,0.636750,0.496870,0.597250,0.487800
2396,nontargeting,TCCCGGTTGGTGAACGATAC,2614.7,3211.4,2615.1,3210.9,2617.6,3209.8,-0.149030,-0.154530,...,-0.089812,0.225030,0,0,0,0,0.050906,0.060512,0.061396,-0.011746
2397,nontargeting,TGCCGTGAAAAGACGCTGCG,2799.3,3285.6,2797.8,3286.1,2797.4,3283.9,-0.290330,-0.206790,...,-0.363760,-0.537200,0,0,0,0,-0.330100,-0.375490,-0.307660,-0.362080
2398,nontargeting,TGGCCACGAATTCCGCCGCC,2830.7,3510.9,2830.1,3509.4,2828.4,3512.5,-0.153320,-0.254700,...,-0.204040,-0.211900,0,0,0,0,-0.278330,-0.279600,-0.275190,-0.283150


In [5]:
# Perform feature selection on merged profiles
profile_feature_selected_df = feature_select(
    profiles=profile_df,
    features='infer',
    samples='all',
    operation=['variance_threshold', 'correlation_threshold', 'drop_na_columns', 'blocklist', 'drop_outliers'],
    na_cutoff=0,
    corr_threshold=0.9,
    outlier_cutoff=100,
)

# index=False matches the other CSV exports in this notebook and keeps the
# frame's row labels from being written as an extra unnamed first column.
profile_feature_selected_df.to_csv(
    os.path.join(output_folder, '20240202_6W_CP498_SABER_Pilot_HeLa_guide_normalized_merged_feature_select_ALLWELLS_saber_1_2.csv.gz'),
    index=False,
)


In [6]:
# Median-aggregate the feature-selected profiles, grouped by gene code and barcode
guide_strata = [
    'Metadata_Foci_Barcode_MatchedTo_GeneCode',
    'Metadata_Foci_Barcode_MatchedTo_Barcode',
]
profile_feature_selected_median_df = aggregate(
    population_df=profile_feature_selected_df,
    strata=guide_strata,
    features='infer',
    operation='median',
)

# Save the guide-level medians without the row index
median_profile_path = os.path.join(
    output_folder,
    '20240202_6W_CP498_SABER_Pilot_HeLa_guide_normalized_merged_feature_select_median_ALLWELLS_saber_1_2.csv.gz',
)
profile_feature_selected_median_df.to_csv(median_profile_path, index=False)


**Gene-level aggregation**

In [7]:
# Files (read from output_folder) that will be aggregated to gene level
file_list = [
    "20240202_6W_CP498_SABER_Pilot_HeLa_guide_normalized_merged_feature_select_median_ALLWELLS_saber_1_2.csv.gz",
]

# Gene-level aggregation settings: group on the gene code column, take the median
aggregate_columns = ["Metadata_Foci_Barcode_MatchedTo_GeneCode"]
aggregate_features = "infer"
aggregate_operation = "median"

In [8]:
# Aggregate every listed profile file to gene level and write the result
for profile_file in file_list:
    print(f"Now loading {profile_file}")
    df = pd.read_csv(os.path.join(output_folder, profile_file))
    # Keep only rows whose barcode is in the library guide list
    is_library_guide = df["Metadata_Foci_Barcode_MatchedTo_Barcode"].isin(guide_list)
    df = df[is_library_guide]

    print("Now aggregating.")
    gene_df = aggregate(
        population_df=df,
        strata=aggregate_columns,
        features=aggregate_features,
        operation=aggregate_operation,
    )

    print("Now saving aggregated file.")
    # Split at the first dot only, so a compound extension such as "csv.gz"
    # is carried over intact after the "_gene_aggregated" suffix
    name_parts = profile_file.split('.', 1)
    agg_file_name = f"{name_parts[0]}_gene_aggregated.{name_parts[1]}"
    gene_df.to_csv(os.path.join(output_folder, agg_file_name), index=False)

Now loading 20240202_6W_CP498_SABER_Pilot_HeLa_guide_normalized_merged_feature_select_median_ALLWELLS_saber_1_2.csv.gz
Now aggregating.
Now saving aggregated file.
