In [1]:
import os
import pandas as pd
from pycytominer import feature_select,aggregate

In [2]:
# Set data input folder
# Before running notebook, download files described in README.md to this folder
input_folder = "../SABER_profiles/"

# Set output folder and make sure it exists.
# exist_ok=True already turns the call into a no-op when the directory is
# present, so a separate os.path.exists() check is unnecessary (and would hide
# the case where "outputs" exists but is not a directory).
output_folder = "outputs"
os.makedirs(output_folder, exist_ok=True)

In [3]:
# Read the SABER library sheet and collect the sgRNA barcodes used in the experiment
saber_library = pd.read_csv(
    "../common_files/SABER_Library_ngt_Included_Oligo_Sequences_Assiged.csv"
)
guide_list = list(saber_library["sgRNA"])

# Number of barcodes in the library
len(guide_list)

2400

In [4]:
# For every plate: read the normalized plate-level profiles, keep only rows
# whose barcode is in guide_list (removes ghost guides), and strip the
# *_Center_X_x / *_Center_Y_x columns. The per-plate frames are then stacked.
plates = ['SABER_Plate_1', 'SABER_Plate_2']

pre_df_list = []
for plate in plates:
    filename = f'20240202_6W_CP498_SABER_Pilot_HeLa_SABER_only_guide_normalized_ALLBATCHES___{plate}___ALLWELLS.csv.gz'
    plate_path = os.path.join(input_folder, filename)
    pre_profile_df = pd.read_csv(plate_path)

    # Row filter: the matched barcode must be one of the library guides
    in_library = pre_profile_df["Metadata_Foci_Barcode_MatchedTo_Barcode"].isin(guide_list)
    pre_profile_df = pre_profile_df[in_library]

    # Column filter: collect the center-coordinate columns carrying the "_x" suffix
    remove_list = [
        feat
        for feat in pre_profile_df.columns
        if "AreaShape_Center_X_x" in feat or "AreaShape_Center_Y_x" in feat
    ]
    pre_profile_df = pre_profile_df.drop(columns=remove_list)

    pre_df_list.append(pre_profile_df)

# Stack the plates; each plate keeps the row labels it was read with
profile_df = pd.concat(pre_df_list)

profile_df

Unnamed: 0,Metadata_Foci_Barcode_MatchedTo_GeneCode,Metadata_Foci_Barcode_MatchedTo_Barcode,Cells_AreaShape_Area,Cells_AreaShape_BoundingBoxArea,Cells_AreaShape_BoundingBoxMaximum_X,Cells_AreaShape_BoundingBoxMaximum_Y,Cells_AreaShape_BoundingBoxMinimum_X,Cells_AreaShape_BoundingBoxMinimum_Y,Cells_AreaShape_Center_X_y,Cells_AreaShape_Center_Y_y,...,Nuclei_Texture_Variance_btubulin_10_02_256,Nuclei_Texture_Variance_btubulin_10_03_256,Nuclei_Texture_Variance_btubulin_20_00_256,Nuclei_Texture_Variance_btubulin_20_01_256,Nuclei_Texture_Variance_btubulin_20_02_256,Nuclei_Texture_Variance_btubulin_20_03_256,Nuclei_Texture_Variance_btubulin_5_00_256,Nuclei_Texture_Variance_btubulin_5_01_256,Nuclei_Texture_Variance_btubulin_5_02_256,Nuclei_Texture_Variance_btubulin_5_03_256
0,AARS2,AAAGGCGGCCCTCACGGCCG,-0.425200,-0.425680,-0.27427,-0.72350,-0.27040,-0.71436,-0.26086,-0.72221,...,-0.277620,-0.273470,0,0,0,0,-0.347620,-0.333450,-0.334960,-0.294040
1,AARS2,AGCAAACTGGGGTCGCCGCG,-0.046790,0.002498,0.26750,-0.17152,0.30749,-0.17571,0.28776,-0.17266,...,-0.608130,-0.566650,0,0,0,0,-0.584880,-0.575990,-0.559880,-0.555760
2,AARS2,CCAACTTCTACGCAGAACAG,-0.519800,-0.333930,0.41105,-0.33226,0.41151,-0.35425,0.40479,-0.32898,...,-0.324810,-0.411650,0,0,0,0,-0.373230,-0.326450,-0.289320,-0.335730
3,AARS2,GCTGAGCCAGTTCAGAAGCA,-0.037781,-0.072786,0.80927,0.58669,0.79523,0.51425,0.81233,0.56169,...,-0.648610,-0.619020,0,0,0,0,-0.681330,-0.649980,-0.664320,-0.667570
4,AARSD1,ACCTCCGCTCCCAATCTACC,0.646950,0.529480,0.11933,0.29554,0.10870,0.26610,0.10411,0.31024,...,0.309980,0.385660,0,0,0,0,0.330150,0.247260,0.300050,0.280160
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2395,nontargeting,TAAGATCCGCGGGTGGCAAC,0.763000,0.847270,0.83773,-0.60567,0.82975,-0.59107,0.78310,-0.59782,...,-0.097319,-0.087962,0,0,0,0,0.036283,-0.033155,0.026341,-0.063860
2396,nontargeting,TCCCGGTTGGTGAACGATAC,-0.148980,-0.154530,-0.60874,-0.93950,-0.65694,-0.92849,-0.65609,-0.93444,...,-0.004332,0.016687,0,0,0,0,0.031787,-0.003908,0.023189,-0.062283
2397,nontargeting,TGCCGTGAAAAGACGCTGCG,-0.290270,-0.206800,0.11662,-0.54274,0.12474,-0.53895,0.12969,-0.52764,...,-0.321470,-0.315640,0,0,0,0,-0.318210,-0.284810,-0.289590,-0.331970
2398,nontargeting,TGGCCACGAATTCCGCCGCC,-0.153260,-0.254710,0.23999,0.71047,0.26745,0.73392,0.26335,0.70754,...,-0.373990,-0.422790,0,0,0,0,-0.431640,-0.427470,-0.399830,-0.446480


In [12]:
# For this line of analysis, keep only the columns whose name contains
# neither "ConA" nor "WGA"
features = list(profile_df.columns)
features_no_WGA_no_ConA = [
    column_name
    for column_name in features
    if not ("ConA" in column_name or "WGA" in column_name)
]
profile_df = profile_df[features_no_WGA_no_ConA]
profile_df

Unnamed: 0,Metadata_Foci_Barcode_MatchedTo_GeneCode,Metadata_Foci_Barcode_MatchedTo_Barcode,Cells_AreaShape_Area,Cells_AreaShape_BoundingBoxArea,Cells_AreaShape_BoundingBoxMaximum_X,Cells_AreaShape_BoundingBoxMaximum_Y,Cells_AreaShape_BoundingBoxMinimum_X,Cells_AreaShape_BoundingBoxMinimum_Y,Cells_AreaShape_Center_X_y,Cells_AreaShape_Center_Y_y,...,Nuclei_Texture_Variance_btubulin_10_02_256,Nuclei_Texture_Variance_btubulin_10_03_256,Nuclei_Texture_Variance_btubulin_20_00_256,Nuclei_Texture_Variance_btubulin_20_01_256,Nuclei_Texture_Variance_btubulin_20_02_256,Nuclei_Texture_Variance_btubulin_20_03_256,Nuclei_Texture_Variance_btubulin_5_00_256,Nuclei_Texture_Variance_btubulin_5_01_256,Nuclei_Texture_Variance_btubulin_5_02_256,Nuclei_Texture_Variance_btubulin_5_03_256
0,AARS2,AAAGGCGGCCCTCACGGCCG,-0.425200,-0.425680,-0.27427,-0.72350,-0.27040,-0.71436,-0.26086,-0.72221,...,-0.277620,-0.273470,0,0,0,0,-0.347620,-0.333450,-0.334960,-0.294040
1,AARS2,AGCAAACTGGGGTCGCCGCG,-0.046790,0.002498,0.26750,-0.17152,0.30749,-0.17571,0.28776,-0.17266,...,-0.608130,-0.566650,0,0,0,0,-0.584880,-0.575990,-0.559880,-0.555760
2,AARS2,CCAACTTCTACGCAGAACAG,-0.519800,-0.333930,0.41105,-0.33226,0.41151,-0.35425,0.40479,-0.32898,...,-0.324810,-0.411650,0,0,0,0,-0.373230,-0.326450,-0.289320,-0.335730
3,AARS2,GCTGAGCCAGTTCAGAAGCA,-0.037781,-0.072786,0.80927,0.58669,0.79523,0.51425,0.81233,0.56169,...,-0.648610,-0.619020,0,0,0,0,-0.681330,-0.649980,-0.664320,-0.667570
4,AARSD1,ACCTCCGCTCCCAATCTACC,0.646950,0.529480,0.11933,0.29554,0.10870,0.26610,0.10411,0.31024,...,0.309980,0.385660,0,0,0,0,0.330150,0.247260,0.300050,0.280160
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2395,nontargeting,TAAGATCCGCGGGTGGCAAC,0.763000,0.847270,0.83773,-0.60567,0.82975,-0.59107,0.78310,-0.59782,...,-0.097319,-0.087962,0,0,0,0,0.036283,-0.033155,0.026341,-0.063860
2396,nontargeting,TCCCGGTTGGTGAACGATAC,-0.148980,-0.154530,-0.60874,-0.93950,-0.65694,-0.92849,-0.65609,-0.93444,...,-0.004332,0.016687,0,0,0,0,0.031787,-0.003908,0.023189,-0.062283
2397,nontargeting,TGCCGTGAAAAGACGCTGCG,-0.290270,-0.206800,0.11662,-0.54274,0.12474,-0.53895,0.12969,-0.52764,...,-0.321470,-0.315640,0,0,0,0,-0.318210,-0.284810,-0.289590,-0.331970
2398,nontargeting,TGGCCACGAATTCCGCCGCC,-0.153260,-0.254710,0.23999,0.71047,0.26745,0.73392,0.26335,0.70754,...,-0.373990,-0.422790,0,0,0,0,-0.431640,-0.427470,-0.399830,-0.446480


In [13]:
# Run pycytominer feature selection on the merged profiles.
# The operations are listed in the order they are handed to feature_select.
feature_select_operations = [
    'variance_threshold',
    'correlation_threshold',
    'drop_na_columns',
    'blocklist',
    'drop_outliers',
]
profile_feature_selected_df = feature_select(
    profiles=profile_df,
    features='infer',
    samples='all',
    operation=feature_select_operations,
    na_cutoff=0,
    corr_threshold=0.9,
    outlier_cutoff=100,
)

# Save the feature-selected profiles.
# NOTE(review): this export keeps pandas' default index=True, whereas the
# median and gene-level exports in this notebook pass index=False - confirm
# that writing the index column here is intended.
feature_selected_path = os.path.join(
    output_folder,
    '20240202_6W_CP498_SABER_Pilot_HeLa_SABER_only_guide_normalized_merged_feature_select_ALLWELLS_1_2.csv.gz',
)
profile_feature_selected_df.to_csv(feature_selected_path)


In [14]:
# Median-aggregate the feature-selected profiles, grouping rows by the
# (gene code, barcode) pair
guide_level_strata = [
    'Metadata_Foci_Barcode_MatchedTo_GeneCode',
    'Metadata_Foci_Barcode_MatchedTo_Barcode',
]
profile_feature_selected_median_df = aggregate(
    population_df=profile_feature_selected_df,
    strata=guide_level_strata,
    features='infer',
    operation='median',
)

# Write the aggregated profiles without the index column
median_profile_path = os.path.join(
    output_folder,
    '20240202_6W_CP498_SABER_Pilot_HeLa_SABER_only_guide_normalized_merged_feature_select_median_ALLWELLS_1_2.csv.gz',
)
profile_feature_selected_median_df.to_csv(median_profile_path, index=False)


**Gene-level aggregation**

In [16]:
# Profile files (inside output_folder) that will be aggregated to gene level
file_list = [
    "20240202_6W_CP498_SABER_Pilot_HeLa_SABER_only_guide_normalized_merged_feature_select_median_ALLWELLS_1_2.csv.gz",
]

# Parameters for the gene-level aggregation:
# grouping column(s), feature selection mode, and summary statistic
aggregate_columns = ["Metadata_Foci_Barcode_MatchedTo_GeneCode"]
aggregate_features = "infer"
aggregate_operation = "median"

In [17]:
# Aggregate every listed profile file to gene level and save the result
# in output_folder with "_gene_aggregated" inserted before the extension
for profile_file in file_list:
    print("Now loading " + profile_file)
    df = pd.read_csv(os.path.join(output_folder, profile_file))

    # Keep only rows whose barcode belongs to the guide library
    keep_rows = df["Metadata_Foci_Barcode_MatchedTo_Barcode"].isin(guide_list)
    df = df[keep_rows]

    print("Now aggregating.")
    gene_df = aggregate(
        population_df=df,
        strata=aggregate_columns,
        features=aggregate_features,
        operation=aggregate_operation,
    )

    print("Now saving aggregated file.")
    # Split once at the first dot: [0] is the stem, [1] is the full extension
    name_parts = profile_file.split('.', 1)
    agg_file_name = f"{name_parts[0]}_gene_aggregated.{name_parts[1]}"
    gene_df.to_csv(os.path.join(output_folder, agg_file_name), index=False)

Now loading 20240202_6W_CP498_SABER_Pilot_HeLa_SABER_only_guide_normalized_merged_feature_select_median_ALLWELLS_1_2.csv.gz
Now aggregating.
Now saving aggregated file.


In [18]:
# Display the gene-level frame left over from the last iteration of the
# aggregation loop (gene_df is only defined once that loop has run)
gene_df

Unnamed: 0,Metadata_Foci_Barcode_MatchedTo_GeneCode,Cells_AreaShape_CentralMoment_0_1,Cells_AreaShape_CentralMoment_0_3,Cells_AreaShape_CentralMoment_1_0,Cells_AreaShape_CentralMoment_1_1,Cells_AreaShape_CentralMoment_1_2,Cells_AreaShape_CentralMoment_1_3,Cells_AreaShape_CentralMoment_2_1,Cells_AreaShape_CentralMoment_2_3,Cells_AreaShape_Compactness,...,Nuclei_Texture_SumEntropy_Syto9_10_03_256,Nuclei_Texture_SumEntropy_btubulin_10_01_256,Nuclei_Texture_SumVariance_Calnexin_10_01_256,Nuclei_Texture_SumVariance_NfKb_10_01_256,Nuclei_Texture_SumVariance_Syto9_10_00_256,Nuclei_Texture_SumVariance_Syto9_10_02_256,Nuclei_Texture_SumVariance_TDP43_10_01_256,Nuclei_Texture_SumVariance_TDP43_10_02_256,Nuclei_Texture_SumVariance_TDP43_10_03_256,Nuclei_Texture_Variance_Syto9_10_01_256
0,AARS2,0.023693,-0.080699,-0.057343,0.027962,-0.033402,0.076849,0.083817,0.019874,-0.149227,...,-0.396335,-0.524118,-0.450118,-0.329375,-0.350120,-0.381850,0.041402,0.030863,0.048960,-0.370085
1,AARSD1,0.272625,0.081797,-0.126535,0.398628,0.091985,0.182374,-0.087431,0.093671,0.124445,...,0.236525,0.168630,-0.134238,-0.124790,0.316453,-0.026575,0.181426,-0.096967,0.064950,0.160856
2,ABCF1,-0.481500,0.561900,-0.173627,-0.255603,-0.225560,0.038623,-0.593172,-0.541509,0.311361,...,0.196578,0.911600,-0.249272,-0.900122,0.418750,0.236627,0.870753,1.086597,0.983365,0.154630
3,ABLIM1,0.146947,0.343826,0.439982,-0.362965,0.042743,0.052130,0.000361,0.019048,-0.046677,...,-0.057305,-0.513901,0.042288,-0.058773,0.061827,0.064997,-0.202675,-0.077290,0.115830,-0.101946
4,ACTR2,0.220674,-0.029678,-0.171895,-0.167338,0.040010,0.099809,0.027234,0.055947,-2.217325,...,0.775320,1.795975,2.384025,3.684625,1.246983,1.286035,-0.034865,1.250970,0.133793,1.596575
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
586,ZC3H8,-0.113226,-0.795568,0.568145,0.334003,0.077486,0.393525,-0.132623,-0.094868,-0.671630,...,0.412530,-0.134758,-0.226020,-0.626308,-0.189309,0.330240,0.609622,0.347368,0.625757,0.307787
587,ZC3HC1,0.141876,0.142357,0.162268,-0.515610,-0.132690,-0.507639,0.166184,-0.064098,-0.012567,...,0.031808,-0.082787,0.206732,0.600818,-0.468210,-0.218002,0.324425,0.425944,0.546220,-0.371775
588,ZNF131,-0.274877,0.262886,-0.040244,-0.003717,-0.224979,-0.078116,-0.134581,0.162277,0.062030,...,0.147048,-0.225332,-0.530802,-0.133487,0.067398,0.159290,-0.020956,-0.197158,-0.020906,0.112614
589,ZNHIT6,-0.282140,0.327792,0.742264,-0.056139,0.279110,-0.118152,-0.053264,0.059448,0.066105,...,0.202350,0.214329,-0.119478,0.318047,0.205272,0.238812,-0.006842,0.274765,0.162147,0.281501
