In [1]:
import os
import pandas as pd
from pycytominer import feature_select,aggregate

In [2]:
# Set data input folder
# Before running notebook, download files described in README.md to this folder
input_folder = "../SABER_profiles/"

# Set output folder
output_folder = "outputs"
if not os.path.exists(output_folder):
    os.makedirs(output_folder, exist_ok=True)

In [3]:
# Load barcodes used in experiment
saber_library = pd.read_csv("../common_files/SABER_Library_ngt_Included_Oligo_Sequences_Assiged.csv")
guide_list = list(saber_library['sgRNA'])
len(guide_list)

2400

In [4]:
# Load normalized plate_level profiles, remove ghost guides, and merge 
plates = ['SABER_Plate_1','SABER_Plate_2','SABER_Plate_4']

pre_df_list = []
for plate in plates:
    filename = f'20240202_6W_CP498_SABER_Pilot_HeLa_SABER_only_guide_normalized_ALLBATCHES___{plate}___ALLWELLS.csv.gz'
    pre_profile_df = pd.read_csv(os.path.join(input_folder, filename))
    pre_profile_df = pre_profile_df[pre_profile_df["Metadata_Foci_Barcode_MatchedTo_Barcode"].isin(guide_list)]
    pre_df_list.append(pre_profile_df)

profile_df = pd.concat(pre_df_list)


profile_df

Unnamed: 0,Metadata_Foci_Barcode_MatchedTo_GeneCode,Metadata_Foci_Barcode_MatchedTo_Barcode,Cells_AreaShape_Center_X_x,Cells_AreaShape_Center_Y_x,Cytoplasm_AreaShape_Center_X_x,Cytoplasm_AreaShape_Center_Y_x,Nuclei_AreaShape_Center_X_x,Nuclei_AreaShape_Center_Y_x,Cells_AreaShape_Area,Cells_AreaShape_BoundingBoxArea,...,Nuclei_Texture_Variance_btubulin_10_02_256,Nuclei_Texture_Variance_btubulin_10_03_256,Nuclei_Texture_Variance_btubulin_20_00_256,Nuclei_Texture_Variance_btubulin_20_01_256,Nuclei_Texture_Variance_btubulin_20_02_256,Nuclei_Texture_Variance_btubulin_20_03_256,Nuclei_Texture_Variance_btubulin_5_00_256,Nuclei_Texture_Variance_btubulin_5_01_256,Nuclei_Texture_Variance_btubulin_5_02_256,Nuclei_Texture_Variance_btubulin_5_03_256
0,AARS2,AAAGGCGGCCCTCACGGCCG,2691.9,3291.4,2690.9,3291.5,2688.2,3289.7,-0.425200,-0.425680,...,-0.27762,-0.27347,0,0,0,0,-0.347620,-0.33345,-0.33496,-0.294040
1,AARS2,AGCAAACTGGGGTCGCCGCG,2810.5,3382.1,2811.3,3382.6,2820.5,3379.8,-0.046790,0.002498,...,-0.60813,-0.56665,0,0,0,0,-0.584880,-0.57599,-0.55988,-0.555760
2,AARS2,CCAACTTCTACGCAGAACAG,2835.8,3356.3,2833.8,3354.9,2832.5,3353.4,-0.519800,-0.333930,...,-0.32481,-0.41165,0,0,0,0,-0.373230,-0.32645,-0.28932,-0.335730
3,AARS2,GCTGAGCCAGTTCAGAAGCA,2923.9,3503.3,2922.3,3501.2,2933.7,3506.3,-0.037781,-0.072786,...,-0.64861,-0.61902,0,0,0,0,-0.681330,-0.64998,-0.66432,-0.667570
4,AARSD1,ACCTCCGCTCCCAATCTACC,2770.8,3461.8,2769.8,3461.8,2767.2,3458.9,0.646950,0.529480,...,0.30998,0.38566,0,0,0,0,0.330150,0.24726,0.30005,0.280160
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2395,nontargeting,TAAGATCCGCGGGTGGCAAC,2806.1,3660.7,2805.8,3660.5,2806.9,3658.2,0.726790,0.530340,...,0.27448,0.25705,0,0,0,0,0.203040,0.18065,0.19713,0.186450
2396,nontargeting,TCCCGGTTGGTGAACGATAC,2628.8,3507.5,2630.9,3504.9,2624.1,3508.0,0.409350,0.312160,...,-0.25260,-0.25986,0,0,0,0,-0.183360,-0.15042,-0.19423,-0.201860
2397,nontargeting,TGCCGTGAAAAGACGCTGCG,2832.0,3331.5,2832.2,3331.7,2845.5,3331.9,-0.287260,-0.164680,...,-0.49103,-0.56060,0,0,0,0,-0.534040,-0.56032,-0.52216,-0.522740
2398,nontargeting,TGGCCACGAATTCCGCCGCC,2540.4,3259.2,2539.9,3260.3,2576.6,3257.2,-0.080036,-0.214170,...,-0.47413,-0.52804,0,0,0,0,-0.471600,-0.49416,-0.45333,-0.471900


In [5]:
# Perform feature selection on merged profiles
profile_feature_selected_df = feature_select(
            profiles=profile_df,
            features='infer',
            samples='all',
            operation=['variance_threshold','correlation_threshold','drop_na_columns','blocklist','drop_outliers'],
            na_cutoff= 0,
            corr_threshold=0.9,
            outlier_cutoff=100
        )
profile_feature_selected_df.to_csv(os.path.join(output_folder,'20240202_6W_CP498_SABER_Pilot_HeLa_SABER_only_guide_normalized_merged_feature_select_ALLWELLS.csv.gz'))


In [6]:
# Perform median aggregation on profiles
profile_feature_selected_median_df= aggregate(
                    population_df=profile_feature_selected_df, 
                    strata=['Metadata_Foci_Barcode_MatchedTo_GeneCode' ,'Metadata_Foci_Barcode_MatchedTo_Barcode'], 
                    features='infer', 
                    operation='median' 
                    )

profile_feature_selected_median_df.to_csv(os.path.join(output_folder,'20240202_6W_CP498_SABER_Pilot_HeLa_SABER_only_guide_normalized_merged_feature_select_median_ALLWELLS.csv.gz'),index = False)


**Gene level aggregation**

In [7]:
# List files to undergo gene level aggregation
file_list = ["20240202_6W_CP498_SABER_Pilot_HeLa_SABER_only_guide_normalized_merged_feature_select_median_ALLWELLS.csv.gz"]

# Set aggregation parameters for gene level aggregation
aggregate_columns = ['Metadata_Foci_Barcode_MatchedTo_GeneCode']
aggregate_features = 'infer'
aggregate_operation = 'median'

In [8]:
# Perform profile aggregation
for profile_file in file_list:
    print (f"Now loading {profile_file}")
    df = pd.read_csv(os.path.join(output_folder,profile_file))
    df = df[df["Metadata_Foci_Barcode_MatchedTo_Barcode"].isin(guide_list)]

    print (f"Now aggregating.")
    gene_df = aggregate(
            population_df=df,
            strata=aggregate_columns,
            features=aggregate_features,
            operation=aggregate_operation,
            )

    print (f"Now saving aggregated file.")
    agg_file_name = f"{profile_file.split('.',1)[0]}_gene_aggregated.{profile_file.split('.',1)[1]}"
    gene_df.to_csv(os.path.join(output_folder, agg_file_name), index=False)

Now loading 20240202_6W_CP498_SABER_Pilot_HeLa_SABER_only_guide_normalized_merged_feature_select_median_ALLWELLS.csv.gz
Now aggregating.
Now saving aggregated file.


In [9]:
gene_df

Unnamed: 0,Metadata_Foci_Barcode_MatchedTo_GeneCode,Cells_AreaShape_CentralMoment_0_1,Cells_AreaShape_CentralMoment_0_3,Cells_AreaShape_CentralMoment_1_0,Cells_AreaShape_CentralMoment_1_1,Cells_AreaShape_CentralMoment_1_2,Cells_AreaShape_CentralMoment_1_3,Cells_AreaShape_CentralMoment_2_1,Cells_AreaShape_CentralMoment_2_3,Cells_AreaShape_Compactness,...,Nuclei_Texture_SumEntropy_PRSP6_10_01_256,Nuclei_Texture_SumVariance_NfKb_10_01_256,Nuclei_Texture_SumVariance_Syto9_10_00_256,Nuclei_Texture_SumVariance_Syto9_10_01_256,Nuclei_Texture_SumVariance_Syto9_10_02_256,Nuclei_Texture_SumVariance_Syto9_10_03_256,Nuclei_Texture_SumVariance_TDP43_10_01_256,Nuclei_Texture_SumVariance_TDP43_10_02_256,Nuclei_Texture_SumVariance_TDP43_10_03_256,Nuclei_Texture_Variance_Syto9_10_01_256
0,AARS2,0.010109,-0.087648,0.161311,-0.185825,-0.011616,-0.004846,0.011557,0.014249,-0.185814,...,-0.463325,-0.313945,-0.315505,-0.422010,-0.414045,-0.416745,0.153090,0.084152,0.135430,-0.297145
1,AARSD1,0.306288,-0.072139,0.331056,0.320015,0.330335,0.114930,-0.025301,0.000489,0.046690,...,0.189640,-0.249210,0.142986,-0.192595,-0.136617,0.004501,0.094970,-0.188770,0.160150,0.113047
2,ABCF1,-0.060312,0.147953,-0.499430,-0.480239,0.015687,0.076740,-0.687840,-0.130226,0.293780,...,1.211825,-0.960585,0.192730,-0.032408,-0.107133,-0.381875,0.785110,1.032315,1.076095,0.045420
3,ABLIM1,0.045400,-0.039003,0.239105,-0.298120,0.099097,-0.051053,-0.013485,0.002024,-0.091853,...,-0.381365,-0.099255,0.232015,0.081376,0.274015,0.117540,-0.136871,0.037682,0.270835,0.083502
4,ACTR2,0.010109,-0.033853,0.150085,0.098020,-0.049430,-0.000416,0.047962,0.080113,-2.250000,...,1.502300,3.948900,1.080285,0.507495,1.133240,0.378950,-0.444100,1.053490,-0.458410,1.284000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
586,ZC3H8,0.001845,-0.423155,0.490555,0.052490,0.049880,0.138930,-0.180165,-0.146645,-0.493300,...,-0.125215,-0.912500,0.199215,0.765830,0.017160,0.398660,0.531960,0.321214,0.604615,0.453625
587,ZC3HC1,0.340573,-0.089564,0.282530,-0.401705,0.165095,-0.124319,0.176982,-0.058772,-0.153682,...,-0.206965,0.616870,-0.430110,0.061355,-0.302654,-0.038280,0.324660,0.301820,0.579965,-0.401915
588,ZNF131,-0.029233,0.157988,-0.082408,-0.239695,-0.109162,-0.067107,-0.034260,0.014058,-0.301460,...,-0.237825,-0.112827,-0.102336,-0.125061,-0.166290,0.186080,-0.040850,-0.203575,-0.030845,0.026090
589,ZNHIT6,-0.110182,0.194335,0.542920,0.031361,0.314445,-0.093444,0.110662,0.167762,-0.070929,...,0.660150,0.272020,0.063960,0.229974,0.082414,0.209111,-0.090215,0.268950,-0.101148,0.100381
