In [1]:
import os
import pandas as pd
from pycytominer import feature_select,aggregate

In [5]:
# Set data input folder
# Before running notebook, download files described in README.md to this folder
input_folder = "../SABER_profiles/"

# Set output folder
output_folder = "outputs"
if not os.path.exists(output_folder):
    os.makedirs(output_folder, exist_ok=True)

In [6]:
# Load barcodes used in the experiment
saber_library = pd.read_csv("../common_files/SABER_Library_ngt_Included_Oligo_Sequences_Assiged.csv")
guide_list = list(saber_library['sgRNA'])
len(guide_list)

2400

In [7]:
# Load normalized plate_level profiles, remove ghost guides, and merge 
plates = ['SABER_Plate_1','SABER_Plate_2']

pre_df_list = []
for plate in plates:
    filename = f'20240202_6W_CP498_SABER_Pilot_HeLa_SABER_only_guide_normalized_ALLBATCHES___{plate}___ALLWELLS.csv.gz'
    pre_profile_df = pd.read_csv(os.path.join(input_folder, filename))
    pre_profile_df = pre_profile_df[pre_profile_df["Metadata_Foci_Barcode_MatchedTo_Barcode"].isin(guide_list)]
    pre_df_list.append(pre_profile_df)

profile_df = pd.concat(pre_df_list)


profile_df.head()

Unnamed: 0,Metadata_Foci_Barcode_MatchedTo_GeneCode,Metadata_Foci_Barcode_MatchedTo_Barcode,Cells_AreaShape_Center_X_x,Cells_AreaShape_Center_Y_x,Cytoplasm_AreaShape_Center_X_x,Cytoplasm_AreaShape_Center_Y_x,Nuclei_AreaShape_Center_X_x,Nuclei_AreaShape_Center_Y_x,Cells_AreaShape_Area,Cells_AreaShape_BoundingBoxArea,...,Nuclei_Texture_Variance_btubulin_10_02_256,Nuclei_Texture_Variance_btubulin_10_03_256,Nuclei_Texture_Variance_btubulin_20_00_256,Nuclei_Texture_Variance_btubulin_20_01_256,Nuclei_Texture_Variance_btubulin_20_02_256,Nuclei_Texture_Variance_btubulin_20_03_256,Nuclei_Texture_Variance_btubulin_5_00_256,Nuclei_Texture_Variance_btubulin_5_01_256,Nuclei_Texture_Variance_btubulin_5_02_256,Nuclei_Texture_Variance_btubulin_5_03_256
0,AARS2,AAAGGCGGCCCTCACGGCCG,2691.9,3291.4,2690.9,3291.5,2688.2,3289.7,-0.4252,-0.42568,...,-0.27762,-0.27347,0,0,0,0,-0.34762,-0.33345,-0.33496,-0.29404
1,AARS2,AGCAAACTGGGGTCGCCGCG,2810.5,3382.1,2811.3,3382.6,2820.5,3379.8,-0.04679,0.002498,...,-0.60813,-0.56665,0,0,0,0,-0.58488,-0.57599,-0.55988,-0.55576
2,AARS2,CCAACTTCTACGCAGAACAG,2835.8,3356.3,2833.8,3354.9,2832.5,3353.4,-0.5198,-0.33393,...,-0.32481,-0.41165,0,0,0,0,-0.37323,-0.32645,-0.28932,-0.33573
3,AARS2,GCTGAGCCAGTTCAGAAGCA,2923.9,3503.3,2922.3,3501.2,2933.7,3506.3,-0.037781,-0.072786,...,-0.64861,-0.61902,0,0,0,0,-0.68133,-0.64998,-0.66432,-0.66757
4,AARSD1,ACCTCCGCTCCCAATCTACC,2770.8,3461.8,2769.8,3461.8,2767.2,3458.9,0.64695,0.52948,...,0.30998,0.38566,0,0,0,0,0.33015,0.24726,0.30005,0.28016


In [8]:
profile_df

Unnamed: 0,Metadata_Foci_Barcode_MatchedTo_GeneCode,Metadata_Foci_Barcode_MatchedTo_Barcode,Cells_AreaShape_Center_X_x,Cells_AreaShape_Center_Y_x,Cytoplasm_AreaShape_Center_X_x,Cytoplasm_AreaShape_Center_Y_x,Nuclei_AreaShape_Center_X_x,Nuclei_AreaShape_Center_Y_x,Cells_AreaShape_Area,Cells_AreaShape_BoundingBoxArea,...,Nuclei_Texture_Variance_btubulin_10_02_256,Nuclei_Texture_Variance_btubulin_10_03_256,Nuclei_Texture_Variance_btubulin_20_00_256,Nuclei_Texture_Variance_btubulin_20_01_256,Nuclei_Texture_Variance_btubulin_20_02_256,Nuclei_Texture_Variance_btubulin_20_03_256,Nuclei_Texture_Variance_btubulin_5_00_256,Nuclei_Texture_Variance_btubulin_5_01_256,Nuclei_Texture_Variance_btubulin_5_02_256,Nuclei_Texture_Variance_btubulin_5_03_256
0,AARS2,AAAGGCGGCCCTCACGGCCG,2691.9,3291.4,2690.9,3291.5,2688.2,3289.7,-0.425200,-0.425680,...,-0.277620,-0.273470,0,0,0,0,-0.347620,-0.333450,-0.334960,-0.294040
1,AARS2,AGCAAACTGGGGTCGCCGCG,2810.5,3382.1,2811.3,3382.6,2820.5,3379.8,-0.046790,0.002498,...,-0.608130,-0.566650,0,0,0,0,-0.584880,-0.575990,-0.559880,-0.555760
2,AARS2,CCAACTTCTACGCAGAACAG,2835.8,3356.3,2833.8,3354.9,2832.5,3353.4,-0.519800,-0.333930,...,-0.324810,-0.411650,0,0,0,0,-0.373230,-0.326450,-0.289320,-0.335730
3,AARS2,GCTGAGCCAGTTCAGAAGCA,2923.9,3503.3,2922.3,3501.2,2933.7,3506.3,-0.037781,-0.072786,...,-0.648610,-0.619020,0,0,0,0,-0.681330,-0.649980,-0.664320,-0.667570
4,AARSD1,ACCTCCGCTCCCAATCTACC,2770.8,3461.8,2769.8,3461.8,2767.2,3458.9,0.646950,0.529480,...,0.309980,0.385660,0,0,0,0,0.330150,0.247260,0.300050,0.280160
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2395,nontargeting,TAAGATCCGCGGGTGGCAAC,2952.8,3272.8,2952.1,3271.8,2954.6,3274.1,0.763000,0.847270,...,-0.097319,-0.087962,0,0,0,0,0.036283,-0.033155,0.026341,-0.063860
2396,nontargeting,TCCCGGTTGGTGAACGATAC,2614.7,3211.4,2615.1,3210.9,2617.6,3209.8,-0.148980,-0.154530,...,-0.004332,0.016687,0,0,0,0,0.031787,-0.003908,0.023189,-0.062283
2397,nontargeting,TGCCGTGAAAAGACGCTGCG,2799.3,3285.6,2797.8,3286.1,2797.4,3283.9,-0.290270,-0.206800,...,-0.321470,-0.315640,0,0,0,0,-0.318210,-0.284810,-0.289590,-0.331970
2398,nontargeting,TGGCCACGAATTCCGCCGCC,2830.7,3510.9,2830.1,3509.4,2828.4,3512.5,-0.153260,-0.254710,...,-0.373990,-0.422790,0,0,0,0,-0.431640,-0.427470,-0.399830,-0.446480


In [9]:
# Remove all features from GM130 and Calnexin for comparison to Meraj's SABER profiles without ConnA and WGA
features = list(profile_df.columns)
feature_no_GM130_no_Calnexin = [feature for feature in features if 'GM130' not in feature and 'Calnexin' not in feature]
profile_df = profile_df[feature_no_GM130_no_Calnexin]
profile_df


Unnamed: 0,Metadata_Foci_Barcode_MatchedTo_GeneCode,Metadata_Foci_Barcode_MatchedTo_Barcode,Cells_AreaShape_Center_X_x,Cells_AreaShape_Center_Y_x,Cytoplasm_AreaShape_Center_X_x,Cytoplasm_AreaShape_Center_Y_x,Nuclei_AreaShape_Center_X_x,Nuclei_AreaShape_Center_Y_x,Cells_AreaShape_Area,Cells_AreaShape_BoundingBoxArea,...,Nuclei_Texture_Variance_btubulin_10_02_256,Nuclei_Texture_Variance_btubulin_10_03_256,Nuclei_Texture_Variance_btubulin_20_00_256,Nuclei_Texture_Variance_btubulin_20_01_256,Nuclei_Texture_Variance_btubulin_20_02_256,Nuclei_Texture_Variance_btubulin_20_03_256,Nuclei_Texture_Variance_btubulin_5_00_256,Nuclei_Texture_Variance_btubulin_5_01_256,Nuclei_Texture_Variance_btubulin_5_02_256,Nuclei_Texture_Variance_btubulin_5_03_256
0,AARS2,AAAGGCGGCCCTCACGGCCG,2691.9,3291.4,2690.9,3291.5,2688.2,3289.7,-0.425200,-0.425680,...,-0.277620,-0.273470,0,0,0,0,-0.347620,-0.333450,-0.334960,-0.294040
1,AARS2,AGCAAACTGGGGTCGCCGCG,2810.5,3382.1,2811.3,3382.6,2820.5,3379.8,-0.046790,0.002498,...,-0.608130,-0.566650,0,0,0,0,-0.584880,-0.575990,-0.559880,-0.555760
2,AARS2,CCAACTTCTACGCAGAACAG,2835.8,3356.3,2833.8,3354.9,2832.5,3353.4,-0.519800,-0.333930,...,-0.324810,-0.411650,0,0,0,0,-0.373230,-0.326450,-0.289320,-0.335730
3,AARS2,GCTGAGCCAGTTCAGAAGCA,2923.9,3503.3,2922.3,3501.2,2933.7,3506.3,-0.037781,-0.072786,...,-0.648610,-0.619020,0,0,0,0,-0.681330,-0.649980,-0.664320,-0.667570
4,AARSD1,ACCTCCGCTCCCAATCTACC,2770.8,3461.8,2769.8,3461.8,2767.2,3458.9,0.646950,0.529480,...,0.309980,0.385660,0,0,0,0,0.330150,0.247260,0.300050,0.280160
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2395,nontargeting,TAAGATCCGCGGGTGGCAAC,2952.8,3272.8,2952.1,3271.8,2954.6,3274.1,0.763000,0.847270,...,-0.097319,-0.087962,0,0,0,0,0.036283,-0.033155,0.026341,-0.063860
2396,nontargeting,TCCCGGTTGGTGAACGATAC,2614.7,3211.4,2615.1,3210.9,2617.6,3209.8,-0.148980,-0.154530,...,-0.004332,0.016687,0,0,0,0,0.031787,-0.003908,0.023189,-0.062283
2397,nontargeting,TGCCGTGAAAAGACGCTGCG,2799.3,3285.6,2797.8,3286.1,2797.4,3283.9,-0.290270,-0.206800,...,-0.321470,-0.315640,0,0,0,0,-0.318210,-0.284810,-0.289590,-0.331970
2398,nontargeting,TGGCCACGAATTCCGCCGCC,2830.7,3510.9,2830.1,3509.4,2828.4,3512.5,-0.153260,-0.254710,...,-0.373990,-0.422790,0,0,0,0,-0.431640,-0.427470,-0.399830,-0.446480


In [10]:
# Perform feature selection on merged profiles
profile_feature_selected_df = feature_select(
            profiles=profile_df,
            features='infer',
            samples='all',
            operation=['variance_threshold','correlation_threshold','drop_na_columns','blocklist','drop_outliers'],
            na_cutoff= 0,
            corr_threshold=0.9,
            outlier_cutoff=100
        )
profile_feature_selected_df.to_csv(os.path.join(output_folder,'20240202_6W_CP498_SABER_Pilot_HeLa_SABER_only_guide_normalized_merged_feature_select_ALLWELLS_1_2.csv.gz'))


In [17]:
profile_feature_selected_df

Unnamed: 0,Metadata_Foci_Barcode_MatchedTo_GeneCode,Metadata_Foci_Barcode_MatchedTo_Barcode,Cells_AreaShape_BoundingBoxMaximum_Y,Cells_AreaShape_CentralMoment_0_1,Cells_AreaShape_CentralMoment_0_3,Cells_AreaShape_CentralMoment_1_0,Cells_AreaShape_CentralMoment_1_1,Cells_AreaShape_CentralMoment_1_2,Cells_AreaShape_CentralMoment_1_3,Cells_AreaShape_CentralMoment_2_1,...,Nuclei_Texture_SumEntropy_Syto9_10_01_256,Nuclei_Texture_SumEntropy_Syto9_10_03_256,Nuclei_Texture_SumEntropy_btubulin_10_01_256,Nuclei_Texture_SumVariance_NfKb_10_01_256,Nuclei_Texture_SumVariance_Syto9_10_00_256,Nuclei_Texture_SumVariance_Syto9_10_02_256,Nuclei_Texture_SumVariance_TDP43_10_01_256,Nuclei_Texture_SumVariance_TDP43_10_02_256,Nuclei_Texture_SumVariance_TDP43_10_03_256,Nuclei_Texture_Variance_Syto9_10_01_256
0,AARS2,AAAGGCGGCCCTCACGGCCG,-0.72350,0.086527,-0.345100,-0.901010,0.141280,0.577780,0.211680,-0.475640,...,0.10111,-0.353220,-0.015718,-0.481120,-0.551010,-0.45509,0.236280,0.276310,0.283320,-0.255000
1,AARS2,AGCAAACTGGGGTCGCCGCG,-0.17152,0.001845,-0.106880,-0.000361,-0.394080,-0.051405,-0.183930,0.122460,...,-0.43897,-0.176290,-0.812680,-0.114570,-0.223580,-0.48465,-0.030054,0.135890,0.160160,-0.299660
2,AARS2,CCAACTTCTACGCAGAACAG,-0.33226,0.404090,0.034043,-0.194050,0.171450,-0.041717,0.218850,0.276470,...,-0.09725,-0.407500,-0.435290,-0.270650,0.095793,-0.26024,0.122420,0.044213,0.154610,-0.131970
3,AARS2,GCTGAGCCAGTTCAGAAGCA,0.58669,0.128870,-0.325360,1.452300,-0.228630,-0.197630,-0.028427,0.494710,...,-0.68643,-0.126020,-0.714230,-0.290430,-0.407430,-0.39597,-0.361090,-0.100230,0.116250,-0.752190
4,AARSD1,ACCTCCGCTCCCAATCTACC,0.29554,-0.421570,-0.201250,0.367640,0.324740,0.334370,0.362300,0.062095,...,-0.56663,-0.429620,0.428470,-0.343730,-1.084800,-0.61097,-0.317450,-0.148970,0.166720,-0.341260
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2395,nontargeting,TAAGATCCGCGGGTGGCAAC,-0.60567,-0.060312,0.315410,-0.359710,-1.032300,-0.590450,-0.650870,-0.060749,...,1.00870,0.899630,0.212820,0.448720,0.165270,0.48795,0.498550,0.161680,0.110460,0.664950
2396,nontargeting,TCCCGGTTGGTGAACGATAC,-0.93950,-0.132260,0.094504,-0.597480,-0.332500,0.015295,-0.011430,0.092524,...,-0.26082,-0.017144,0.114110,-0.088028,0.209160,-0.51843,0.438950,0.130860,0.222850,-0.273710
2397,nontargeting,TGCCGTGAAAAGACGCTGCG,-0.54274,0.227500,-0.003356,-0.165970,0.785560,0.183510,0.539330,-0.295280,...,-0.27662,-0.483390,-0.479350,-0.496000,-0.483020,-0.33478,0.004608,-0.114520,-0.006739,-0.551740
2398,nontargeting,TGGCCACGAATTCCGCCGCC,0.71047,-0.348130,0.092173,-0.764790,0.073275,0.376920,0.229720,-0.689830,...,0.26244,-0.144620,-0.521650,-0.260630,0.221040,0.17208,0.298700,0.091638,0.440270,0.179850


In [11]:
# Perform median aggregation on profiles
profile_feature_selected_median_df= aggregate(
                    population_df=profile_feature_selected_df, 
                    strata=['Metadata_Foci_Barcode_MatchedTo_GeneCode' ,'Metadata_Foci_Barcode_MatchedTo_Barcode'], 
                    features='infer', 
                    operation='median' 
                    )

profile_feature_selected_median_df.to_csv(os.path.join(output_folder,'20240202_6W_CP498_SABER_Pilot_HeLa_SABER_only_guide_normalized_merged_feature_select_median_ALLWELLS_1_2.csv.gz'),index = False)


**Gene level aggregation**

In [12]:
# List files to undergo gene level aggregation
file_list = ["20240202_6W_CP498_SABER_Pilot_HeLa_SABER_only_guide_normalized_merged_feature_select_median_ALLWELLS_1_2.csv.gz"]

# Set aggregation parameters for gene level aggregation
aggregate_columns = ['Metadata_Foci_Barcode_MatchedTo_GeneCode']
aggregate_features = 'infer'
aggregate_operation = 'median'

In [13]:
# Perform profile aggregation
for profile_file in file_list:
    print (f"Now loading {profile_file}")
    df = pd.read_csv(os.path.join(output_folder,profile_file))
    df = df[df["Metadata_Foci_Barcode_MatchedTo_Barcode"].isin(guide_list)]

    print (f"Now aggregating.")
    gene_df = aggregate(
            population_df=df,
            strata=aggregate_columns,
            features=aggregate_features,
            operation=aggregate_operation,
            )

    print (f"Now saving aggregated file.")
    agg_file_name = f"{profile_file.split('.',1)[0]}_gene_aggregated.{profile_file.split('.',1)[1]}"
    gene_df.to_csv(os.path.join(output_folder, agg_file_name), index=False)

Now loading 20240202_6W_CP498_SABER_Pilot_HeLa_SABER_only_guide_normalized_merged_feature_select_median_ALLWELLS_1_2.csv.gz
Now aggregating.
Now saving aggregated file.


In [14]:
gene_df

Unnamed: 0,Metadata_Foci_Barcode_MatchedTo_GeneCode,Cells_AreaShape_BoundingBoxMaximum_Y,Cells_AreaShape_CentralMoment_0_1,Cells_AreaShape_CentralMoment_0_3,Cells_AreaShape_CentralMoment_1_0,Cells_AreaShape_CentralMoment_1_1,Cells_AreaShape_CentralMoment_1_2,Cells_AreaShape_CentralMoment_1_3,Cells_AreaShape_CentralMoment_2_1,Cells_AreaShape_CentralMoment_2_3,...,Nuclei_Texture_SumEntropy_Syto9_10_01_256,Nuclei_Texture_SumEntropy_Syto9_10_03_256,Nuclei_Texture_SumEntropy_btubulin_10_01_256,Nuclei_Texture_SumVariance_NfKb_10_01_256,Nuclei_Texture_SumVariance_Syto9_10_00_256,Nuclei_Texture_SumVariance_Syto9_10_02_256,Nuclei_Texture_SumVariance_TDP43_10_01_256,Nuclei_Texture_SumVariance_TDP43_10_02_256,Nuclei_Texture_SumVariance_TDP43_10_03_256,Nuclei_Texture_Variance_Syto9_10_01_256
0,AARS2,-0.205588,0.023693,-0.080699,-0.057343,0.027962,-0.033402,0.076849,0.083817,0.019874,...,-0.374615,-0.396335,-0.524118,-0.329375,-0.350120,-0.381850,0.041402,0.030863,0.048960,-0.370085
1,AARSD1,0.140995,0.272625,0.081797,-0.126535,0.398628,0.091985,0.182374,-0.087431,0.093671,...,0.012913,0.236525,0.168630,-0.124790,0.316453,-0.026575,0.181426,-0.096967,0.064950,0.160856
2,ABCF1,0.003170,-0.481500,0.561900,-0.173627,-0.255603,-0.225560,0.038623,-0.593172,-0.541509,...,0.028843,0.196578,0.911600,-0.900122,0.418750,0.236627,0.870753,1.086597,0.983365,0.154630
3,ABLIM1,-0.558244,0.146947,0.343826,0.439982,-0.362965,0.042743,0.052130,0.000361,0.019048,...,-0.229766,-0.057305,-0.513901,-0.058773,0.061827,0.064997,-0.202675,-0.077290,0.115830,-0.101946
4,ACTR2,0.169513,0.220674,-0.029678,-0.171895,-0.167338,0.040010,0.099809,0.027234,0.055947,...,0.811419,0.775320,1.795975,3.684625,1.246983,1.286035,-0.034865,1.250970,0.133793,1.596575
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
586,ZC3H8,-0.022036,-0.113226,-0.795568,0.568145,0.334003,0.077486,0.393525,-0.132623,-0.094868,...,0.498880,0.412530,-0.134758,-0.626308,-0.189309,0.330240,0.609622,0.347368,0.625757,0.307787
587,ZC3HC1,-0.615254,0.141876,0.142357,0.162268,-0.515610,-0.132690,-0.507639,0.166184,-0.064098,...,-0.169582,0.031808,-0.082787,0.600818,-0.468210,-0.218002,0.324425,0.425944,0.546220,-0.371775
588,ZNF131,-0.843817,-0.274877,0.262886,-0.040244,-0.003717,-0.224979,-0.078116,-0.134581,0.162277,...,0.551580,0.147048,-0.225332,-0.133487,0.067398,0.159290,-0.020956,-0.197158,-0.020906,0.112614
589,ZNHIT6,0.876942,-0.282140,0.327792,0.742264,-0.056139,0.279110,-0.118152,-0.053264,0.059448,...,-0.042642,0.202350,0.214329,0.318047,0.205272,0.238812,-0.006842,0.274765,0.162147,0.281501
