In [5]:
import os
import pandas as pd
from pycytominer import normalize,feature_select,aggregate

In [2]:
# Set data input folder
# Before running notebook, download files described in README.md to this folder
input_folder = "../SABER_profiles/"

# Set output folder
output_folder = "outputs"
if not os.path.exists(output_folder):
    os.makedirs(output_folder, exist_ok=True)

In [3]:
# Load barcodes used in experiment
saber_library = pd.read_csv("../common_files/SABER_Library_ngt_Included_Oligo_Sequences_Assiged.csv")
guide_list = list(saber_library['sgRNA'])
len(guide_list)

2400

In [41]:
# Load normalized plate_level profiles, remove ghost guides, and merge 
plates = ['SABER_Plate_1','SABER_Plate_2','SABER_Plate_4']

pre_df_list = []
for plate in plates:
    filename = f'20240202_6W_CP498_SABER_Pilot_HeLa_SABER_only_guide_normalized_ALLBATCHES___{plate}___ALLWELLS.csv.gz'
    pre_profile_df = pd.read_csv(os.path.join(input_folder, filename))
    pre_profile_df = pre_profile_df[pre_profile_df["Metadata_Foci_Barcode_MatchedTo_Barcode"].isin(guide_list)]
    remove_list = list(pre_profile_df.columns)
    remove_list = [feat for feat in remove_list if "AreaShape_Center_X_x" in feat or "AreaShape_Center_Y_x" in feat ]
    pre_profile_df.drop(remove_list,axis=1,inplace=True)
    pre_df_list.append(pre_profile_df)

profile_df = pd.concat(pre_df_list)


profile_df

Unnamed: 0,Metadata_Foci_Barcode_MatchedTo_GeneCode,Metadata_Foci_Barcode_MatchedTo_Barcode,Cells_AreaShape_Area,Cells_AreaShape_BoundingBoxArea,Cells_AreaShape_BoundingBoxMaximum_X,Cells_AreaShape_BoundingBoxMaximum_Y,Cells_AreaShape_BoundingBoxMinimum_X,Cells_AreaShape_BoundingBoxMinimum_Y,Cells_AreaShape_Center_X_y,Cells_AreaShape_Center_Y_y,...,Nuclei_Texture_Variance_btubulin_10_02_256,Nuclei_Texture_Variance_btubulin_10_03_256,Nuclei_Texture_Variance_btubulin_20_00_256,Nuclei_Texture_Variance_btubulin_20_01_256,Nuclei_Texture_Variance_btubulin_20_02_256,Nuclei_Texture_Variance_btubulin_20_03_256,Nuclei_Texture_Variance_btubulin_5_00_256,Nuclei_Texture_Variance_btubulin_5_01_256,Nuclei_Texture_Variance_btubulin_5_02_256,Nuclei_Texture_Variance_btubulin_5_03_256
0,AARS2,AAAGGCGGCCCTCACGGCCG,-0.425200,-0.425680,-0.27427,-0.72350,-0.27040,-0.71436,-0.26086,-0.72221,...,-0.27762,-0.27347,0,0,0,0,-0.347620,-0.33345,-0.33496,-0.294040
1,AARS2,AGCAAACTGGGGTCGCCGCG,-0.046790,0.002498,0.26750,-0.17152,0.30749,-0.17571,0.28776,-0.17266,...,-0.60813,-0.56665,0,0,0,0,-0.584880,-0.57599,-0.55988,-0.555760
2,AARS2,CCAACTTCTACGCAGAACAG,-0.519800,-0.333930,0.41105,-0.33226,0.41151,-0.35425,0.40479,-0.32898,...,-0.32481,-0.41165,0,0,0,0,-0.373230,-0.32645,-0.28932,-0.335730
3,AARS2,GCTGAGCCAGTTCAGAAGCA,-0.037781,-0.072786,0.80927,0.58669,0.79523,0.51425,0.81233,0.56169,...,-0.64861,-0.61902,0,0,0,0,-0.681330,-0.64998,-0.66432,-0.667570
4,AARSD1,ACCTCCGCTCCCAATCTACC,0.646950,0.529480,0.11933,0.29554,0.10870,0.26610,0.10411,0.31024,...,0.30998,0.38566,0,0,0,0,0.330150,0.24726,0.30005,0.280160
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2395,nontargeting,TAAGATCCGCGGGTGGCAAC,0.726790,0.530340,0.15324,1.43920,0.21468,1.49590,0.18311,1.48900,...,0.27448,0.25705,0,0,0,0,0.203040,0.18065,0.19713,0.186450
2396,nontargeting,TCCCGGTTGGTGAACGATAC,0.409350,0.312160,-0.58249,0.64331,-0.52943,0.67616,-0.57115,0.66213,...,-0.25260,-0.25986,0,0,0,0,-0.183360,-0.15042,-0.19423,-0.201860
2397,nontargeting,TGCCGTGAAAAGACGCTGCG,-0.287260,-0.164680,0.29358,-0.29556,0.28484,-0.28384,0.29329,-0.28778,...,-0.49103,-0.56060,0,0,0,0,-0.534040,-0.56032,-0.52216,-0.522740
2398,nontargeting,TGGCCACGAATTCCGCCGCC,-0.080036,-0.214170,-0.92909,-0.66247,-0.96314,-0.64519,-0.94721,-0.67799,...,-0.47413,-0.52804,0,0,0,0,-0.471600,-0.49416,-0.45333,-0.471900


In [42]:
# Perform normalization om merged profiles
profile_normalized_df = normalize(
            profiles=profile_df,
            features='infer',
            meta_features='infer',
            samples="Metadata_Foci_Barcode_MatchedTo_GeneCode == 'nontargeting'",
            method='standardize'
        )
profile_normalized_df.drop_duplicates(inplace=True)

  updated_mean = (last_sum + new_sum) / updated_sample_count
  T = new_sum / new_sample_count
  new_unnormalized_variance -= correction**2 / new_sample_count


In [43]:
profile_normalized_df

Unnamed: 0,Metadata_Foci_Barcode_MatchedTo_GeneCode,Metadata_Foci_Barcode_MatchedTo_Barcode,Cells_AreaShape_Area,Cells_AreaShape_BoundingBoxArea,Cells_AreaShape_BoundingBoxMaximum_X,Cells_AreaShape_BoundingBoxMaximum_Y,Cells_AreaShape_BoundingBoxMinimum_X,Cells_AreaShape_BoundingBoxMinimum_Y,Cells_AreaShape_Center_X_y,Cells_AreaShape_Center_Y_y,...,Nuclei_Texture_Variance_btubulin_10_02_256,Nuclei_Texture_Variance_btubulin_10_03_256,Nuclei_Texture_Variance_btubulin_20_00_256,Nuclei_Texture_Variance_btubulin_20_01_256,Nuclei_Texture_Variance_btubulin_20_02_256,Nuclei_Texture_Variance_btubulin_20_03_256,Nuclei_Texture_Variance_btubulin_5_00_256,Nuclei_Texture_Variance_btubulin_5_01_256,Nuclei_Texture_Variance_btubulin_5_02_256,Nuclei_Texture_Variance_btubulin_5_03_256
0,AARS2,AAAGGCGGCCCTCACGGCCG,-0.509654,-0.545016,-0.324111,-1.125756,-0.321657,-1.104525,-0.303519,-1.119602,...,-0.615685,-0.566528,0.0,0.0,0.0,0.0,-0.779368,-0.756977,-0.767705,-0.648634
0,AARS2,AAAGGCGGCCCTCACGGCCG,-0.009285,-0.190891,0.286167,0.509036,0.283057,0.565377,0.286108,0.575224,...,-0.467428,-0.478090,0.0,0.0,0.0,0.0,-0.569323,-0.611772,-0.767705,-0.645845
0,AARS2,AAAGGCGGCCCTCACGGCCG,1.253534,1.285028,0.374451,-0.647258,0.228550,-0.682731,0.275341,-0.609381,...,0.741029,0.827752,0.0,0.0,0.0,0.0,0.653145,0.794236,0.779676,0.659075
1,AARS2,AGCAAACTGGGGTCGCCGCG,-0.126817,-0.079312,0.420435,-0.327373,0.468972,-0.321861,0.447035,-0.322717,...,-1.469794,-1.243151,0.0,0.0,0.0,0.0,-1.372502,-1.387106,-1.351610,-1.312237
1,AARS2,AGCAAACTGGGGTCGCCGCG,-0.529089,-0.567508,-1.026411,-1.161135,-1.039898,-1.168531,-1.035822,-1.153867,...,-1.133071,-1.376061,0.0,0.0,0.0,0.0,-1.152933,-1.030551,-1.151142,-1.165125
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2398,nontargeting,TGGCCACGAATTCCGCCGCC,-0.234533,-0.359062,0.382628,0.948336,0.414192,0.999842,0.413640,0.953634,...,-0.864726,-0.911140,0.0,0.0,0.0,0.0,-0.989412,-1.001245,-0.936111,-1.035153
2398,nontargeting,TGGCCACGAATTCCGCCGCC,-0.160452,-0.314969,-1.224019,-1.037482,-1.269415,-1.004021,-1.242497,-1.055480,...,-1.123509,-1.154044,0.0,0.0,0.0,0.0,-1.089309,-1.174508,-1.075000,-1.099606
2399,nontargeting,TTCAGCGAGTGTGACTAAGC,0.192212,0.345447,-0.400466,-0.467746,-0.467130,-0.524120,-0.446551,-0.499320,...,-0.131596,-0.569920,0.0,0.0,0.0,0.0,-0.177433,0.290154,-0.282943,-0.237546
2399,nontargeting,TTCAGCGAGTGTGACTAAGC,0.722766,1.028997,-2.262991,0.524859,-2.240459,0.481669,-2.237211,0.514800,...,1.352506,1.108574,0.0,0.0,0.0,0.0,0.551147,0.923827,0.881130,0.815898


In [44]:
# Perform feature selection on merged profiles
profile_feature_selected_df = feature_select(
            profiles=profile_normalized_df,
            features='infer',
            samples='all',
            operation=['variance_threshold','correlation_threshold','drop_na_columns','blocklist','drop_outliers'],
            na_cutoff= 0,
            corr_threshold=0.9,
            outlier_cutoff=100
        )
profile_feature_selected_df.to_csv(os.path.join(output_folder,'20240202_6W_CP498_SABER_Pilot_HeLa_SABER_only_guide_normalized_merged_feature_select_ALLWELLS.csv.gz'))


In [47]:
profile_feature_selected_df

Unnamed: 0,Metadata_Foci_Barcode_MatchedTo_GeneCode,Metadata_Foci_Barcode_MatchedTo_Barcode,Cells_AreaShape_BoundingBoxMaximum_X,Cells_AreaShape_CentralMoment_0_1,Cells_AreaShape_CentralMoment_0_3,Cells_AreaShape_CentralMoment_1_0,Cells_AreaShape_CentralMoment_1_1,Cells_AreaShape_CentralMoment_1_2,Cells_AreaShape_CentralMoment_1_3,Cells_AreaShape_CentralMoment_2_1,...,Nuclei_Texture_SumEntropy_PRSP6_10_01_256,Nuclei_Texture_SumVariance_NfKb_10_01_256,Nuclei_Texture_SumVariance_Syto9_10_00_256,Nuclei_Texture_SumVariance_Syto9_10_01_256,Nuclei_Texture_SumVariance_Syto9_10_02_256,Nuclei_Texture_SumVariance_Syto9_10_03_256,Nuclei_Texture_SumVariance_TDP43_10_01_256,Nuclei_Texture_SumVariance_TDP43_10_02_256,Nuclei_Texture_SumVariance_TDP43_10_03_256,Nuclei_Texture_Variance_Syto9_10_01_256
0,AARS2,AAAGGCGGCCCTCACGGCCG,-0.324111,0.028607,-0.697494,-1.327128,0.248283,1.109521,0.470483,-0.813341,...,0.047652,-1.015432,-0.812984,-0.336496,-0.605291,-0.311832,-0.176106,0.143655,-0.136156,-0.358509
0,AARS2,AAAGGCGGCCCTCACGGCCG,0.286167,-0.167492,0.388079,0.559662,0.383528,-0.155219,0.082665,-0.135980,...,0.350978,-0.837491,-0.080167,-0.151859,-0.079167,-0.340174,-0.428218,-0.602523,-0.531093,-0.474004
0,AARS2,AAAGGCGGCCCTCACGGCCG,0.374451,0.641482,-0.288900,1.199932,-0.294911,-1.608526,-0.652843,0.091975,...,-0.069085,-0.952504,-0.990831,-1.166128,-1.156829,0.107908,-0.915448,-1.190307,1.131739,-0.568920
1,AARS2,AGCAAACTGGGGTCGCCGCG,0.420435,-0.084483,-0.167836,-0.174848,-0.502180,-0.250670,-0.423590,0.213223,...,-1.028130,-0.291443,-0.274541,-0.715289,-0.653819,-0.241750,-0.786580,-0.215661,-0.422714,-0.426638
1,AARS2,AGCAAACTGGGGTCGCCGCG,-1.026411,-0.816097,0.038750,-1.192242,0.831905,-0.174085,0.663712,-0.068827,...,-1.371378,-0.731683,-0.762828,-0.893350,-0.567582,-0.862682,-0.996851,-0.852844,-0.927627,-0.877130
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2398,nontargeting,TGGCCACGAATTCCGCCGCC,0.382628,-0.551862,0.274737,-1.152850,0.152954,0.675296,0.511253,-1.180972,...,-0.270745,-0.579933,0.456615,0.577642,0.424319,0.064891,-0.033031,-0.328896,0.229021,0.304850
2398,nontargeting,TGGCCACGAATTCCGCCGCC,-1.224019,-1.030266,-0.086078,-0.159482,0.664966,0.500101,0.164917,-0.056855,...,-1.226263,-0.999591,-0.859061,0.081586,-0.568534,-0.147928,-0.635104,-0.867097,-0.666205,-0.223976
2399,nontargeting,TTCAGCGAGTGTGACTAAGC,-0.400466,0.367885,-1.330806,0.853538,1.000807,-1.295644,0.598217,-0.150545,...,-0.807510,0.968737,0.504419,0.053922,0.144757,-0.254328,1.586364,-0.282038,0.217341,0.508580
2399,nontargeting,TTCAGCGAGTGTGACTAAGC,-2.262991,-0.551862,0.649017,-0.882477,-0.451477,1.289558,-0.121701,-3.594317,...,1.602898,0.488757,0.079209,-0.009168,-0.137955,1.002994,-0.503184,-1.028510,0.218481,-0.081790


In [45]:
# Perform median aggregation on profiles
profile_feature_selected_median_df= aggregate(
                    population_df=profile_feature_selected_df, 
                    strata=['Metadata_Foci_Barcode_MatchedTo_GeneCode' ,'Metadata_Foci_Barcode_MatchedTo_Barcode'], 
                    features='infer', 
                    operation='median' 
                    )

profile_feature_selected_median_df.to_csv(os.path.join(output_folder,'20240202_6W_CP498_SABER_Pilot_HeLa_SABER_only_guide_normalized_merged_feature_select_median_ALLWELLS.csv.gz'),index = False)


In [46]:
profile_feature_selected_median_df

Unnamed: 0,Metadata_Foci_Barcode_MatchedTo_GeneCode,Metadata_Foci_Barcode_MatchedTo_Barcode,Cells_AreaShape_BoundingBoxMaximum_X,Cells_AreaShape_CentralMoment_0_1,Cells_AreaShape_CentralMoment_0_3,Cells_AreaShape_CentralMoment_1_0,Cells_AreaShape_CentralMoment_1_1,Cells_AreaShape_CentralMoment_1_2,Cells_AreaShape_CentralMoment_1_3,Cells_AreaShape_CentralMoment_2_1,...,Nuclei_Texture_SumEntropy_PRSP6_10_01_256,Nuclei_Texture_SumVariance_NfKb_10_01_256,Nuclei_Texture_SumVariance_Syto9_10_00_256,Nuclei_Texture_SumVariance_Syto9_10_01_256,Nuclei_Texture_SumVariance_Syto9_10_02_256,Nuclei_Texture_SumVariance_Syto9_10_03_256,Nuclei_Texture_SumVariance_TDP43_10_01_256,Nuclei_Texture_SumVariance_TDP43_10_02_256,Nuclei_Texture_SumVariance_TDP43_10_03_256,Nuclei_Texture_Variance_Syto9_10_01_256
0,AARS2,AAAGGCGGCCCTCACGGCCG,0.286167,0.028607,-0.288900,0.559662,0.248283,-0.155219,0.082665,-0.135980,...,0.047652,-0.952504,-0.812984,-0.336496,-0.605291,-0.311832,-0.428218,-0.602523,-0.136156,-0.474004
1,AARS2,AGCAAACTGGGGTCGCCGCG,-0.376567,-0.084483,0.038750,-0.537643,-0.188543,-0.174085,-0.008743,0.080111,...,-1.028130,-0.731683,-0.274541,-0.715289,-0.567582,-0.355411,-0.786580,-0.215661,-0.539864,-0.426638
2,AARS2,CCAACTTCTACGCAGAACAG,0.374451,-0.062411,0.216489,-0.121187,-0.231957,-0.229726,-0.028984,-0.034369,...,-1.012095,-0.599724,-0.122199,-0.256789,-0.375769,-0.716810,-0.305359,-0.450250,-0.435628,-0.374634
3,AARS2,GCTGAGCCAGTTCAGAAGCA,-0.994253,-0.167492,-0.342195,0.185172,-0.270254,-0.012194,-0.072155,0.644617,...,-0.749566,-0.638792,-0.576873,-0.734981,-0.508235,-0.734689,-0.268112,-0.245855,-0.524880,-0.418964
4,AARSD1,ACCTCCGCTCCCAATCTACC,0.216807,0.024692,-0.377658,-0.252822,0.505456,0.583310,0.810882,0.055234,...,0.098922,-0.744067,-0.836499,-0.520663,-0.861196,-0.630555,-1.184898,-0.944579,-0.316360,-0.291860
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2395,nontargeting,TAAGATCCGCGGGTGGCAAC,0.263409,-0.167492,0.585517,0.371861,-1.396831,-1.415993,-1.458483,-0.101232,...,1.349904,0.821134,0.335353,0.594888,0.942876,0.437077,0.425053,0.555710,0.573956,0.326756
2396,nontargeting,TCCCGGTTGGTGAACGATAC,-0.747693,-0.263576,-0.145198,-0.197777,0.351259,-0.106476,0.077146,0.600798,...,-0.291884,-1.081500,-0.644822,-0.419891,-0.709275,-1.131810,0.026129,-0.228532,-0.276853,-0.792633
2397,nontargeting,TGCCGTGAAAAGACGCTGCG,0.456276,-0.062411,0.062339,-0.137677,0.250134,0.257176,0.114678,0.102850,...,-0.038258,-0.929296,-0.701177,-0.327030,-0.407781,-0.614023,-0.707128,-0.856426,-0.496192,-0.737731
2398,nontargeting,TGGCCACGAATTCCGCCGCC,0.382628,-0.551862,-0.086078,-0.159482,0.168746,0.500101,0.383496,-0.056855,...,-1.000795,-0.700653,-0.131112,0.081586,-0.472923,-0.098026,-0.343959,-0.328896,-0.504917,-0.223976


**Gene level aggregation**

In [7]:
# List files to undergo gene level aggregation
file_list = ["20240202_6W_CP498_SABER_Pilot_HeLa_SABER_only_guide_normalized_merged_feature_select_median_ALLWELLS.csv.gz"]

# Set aggregation parameters for gene level aggregation
aggregate_columns = ['Metadata_Foci_Barcode_MatchedTo_GeneCode']
aggregate_features = 'infer'
aggregate_operation = 'median'

In [8]:
# Perform profile aggregation
for profile_file in file_list:
    print (f"Now loading {profile_file}")
    df = pd.read_csv(os.path.join(output_folder,profile_file))
    df = df[df["Metadata_Foci_Barcode_MatchedTo_Barcode"].isin(guide_list)]

    print (f"Now aggregating.")
    gene_df = aggregate(
            population_df=df,
            strata=aggregate_columns,
            features=aggregate_features,
            operation=aggregate_operation,
            )

    print (f"Now saving aggregated file.")
    agg_file_name = f"{profile_file.split('.',1)[0]}_gene_aggregated.{profile_file.split('.',1)[1]}"
    gene_df.to_csv(os.path.join(output_folder, agg_file_name), index=False)

Now loading 20240202_6W_CP498_SABER_Pilot_HeLa_SABER_only_guide_normalized_merged_feature_select_median_ALLWELLS.csv.gz
Now aggregating.
Now saving aggregated file.


In [9]:
gene_df

Unnamed: 0,Metadata_Foci_Barcode_MatchedTo_GeneCode,Cells_AreaShape_CentralMoment_0_1,Cells_AreaShape_CentralMoment_0_3,Cells_AreaShape_CentralMoment_1_0,Cells_AreaShape_CentralMoment_1_1,Cells_AreaShape_CentralMoment_1_2,Cells_AreaShape_CentralMoment_1_3,Cells_AreaShape_CentralMoment_2_1,Cells_AreaShape_CentralMoment_2_3,Cells_AreaShape_Compactness,...,Nuclei_Texture_SumEntropy_PRSP6_10_01_256,Nuclei_Texture_SumVariance_NfKb_10_01_256,Nuclei_Texture_SumVariance_Syto9_10_00_256,Nuclei_Texture_SumVariance_Syto9_10_01_256,Nuclei_Texture_SumVariance_Syto9_10_02_256,Nuclei_Texture_SumVariance_Syto9_10_03_256,Nuclei_Texture_SumVariance_TDP43_10_01_256,Nuclei_Texture_SumVariance_TDP43_10_02_256,Nuclei_Texture_SumVariance_TDP43_10_03_256,Nuclei_Texture_Variance_Syto9_10_01_256
0,AARS2,0.010109,-0.087648,0.161311,-0.185825,-0.011616,-0.004846,0.011557,0.014249,-0.185814,...,-0.463325,-0.313945,-0.315505,-0.422010,-0.414045,-0.416745,0.153090,0.084152,0.135430,-0.297145
1,AARSD1,0.306288,-0.072139,0.331056,0.320015,0.330335,0.114930,-0.025301,0.000489,0.046690,...,0.189640,-0.249210,0.142986,-0.192595,-0.136617,0.004501,0.094970,-0.188770,0.160150,0.113047
2,ABCF1,-0.060312,0.147953,-0.499430,-0.480239,0.015687,0.076740,-0.687840,-0.130226,0.293780,...,1.211825,-0.960585,0.192730,-0.032408,-0.107133,-0.381875,0.785110,1.032315,1.076095,0.045420
3,ABLIM1,0.045400,-0.039003,0.239105,-0.298120,0.099097,-0.051053,-0.013485,0.002024,-0.091853,...,-0.381365,-0.099255,0.232015,0.081376,0.274015,0.117540,-0.136871,0.037682,0.270835,0.083502
4,ACTR2,0.010109,-0.033853,0.150085,0.098020,-0.049430,-0.000416,0.047962,0.080113,-2.250000,...,1.502300,3.948900,1.080285,0.507495,1.133240,0.378950,-0.444100,1.053490,-0.458410,1.284000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
586,ZC3H8,0.001845,-0.423155,0.490555,0.052490,0.049880,0.138930,-0.180165,-0.146645,-0.493300,...,-0.125215,-0.912500,0.199215,0.765830,0.017160,0.398660,0.531960,0.321214,0.604615,0.453625
587,ZC3HC1,0.340573,-0.089564,0.282530,-0.401705,0.165095,-0.124319,0.176982,-0.058772,-0.153682,...,-0.206965,0.616870,-0.430110,0.061355,-0.302654,-0.038280,0.324660,0.301820,0.579965,-0.401915
588,ZNF131,-0.029233,0.157988,-0.082408,-0.239695,-0.109162,-0.067107,-0.034260,0.014058,-0.301460,...,-0.237825,-0.112827,-0.102336,-0.125061,-0.166290,0.186080,-0.040850,-0.203575,-0.030845,0.026090
589,ZNHIT6,-0.110182,0.194335,0.542920,0.031361,0.314445,-0.093444,0.110662,0.167762,-0.070929,...,0.660150,0.272020,0.063960,0.229974,0.082414,0.209111,-0.090215,0.268950,-0.101148,0.100381
