In [1]:
import os
import pandas as pd
from pycytominer import feature_select,aggregate

In [2]:
# Folder holding the plate-level profile CSVs that later cells read.
# Before running the notebook, download the files described in README.md to this folder.
input_folder = "../plate_level_profiles"

# Folder that receives every CSV written by this notebook.
# exist_ok=True already makes the call a no-op when the directory exists, so no
# separate os.path.exists() guard is needed. Dropping the guard also removes the
# check-then-create race and makes the cell fail immediately (FileExistsError)
# if "outputs" exists but is a regular file, instead of failing later on write.
output_folder = "outputs"
os.makedirs(output_folder, exist_ok=True)

In [3]:
# Read the SABER library table and collect its 'sgRNA' column as the list of
# barcodes used in the experiment; the last line displays how many there are.
saber_library = pd.read_csv("../../SABER_Library_ngt_Included_Oligo_Sequences_Assiged.csv")
guide_list = saber_library['sgRNA'].tolist()
len(guide_list)

2400

In [4]:
# Read every normalized plate-level profile, keep only the rows whose matched
# barcode appears in guide_list (this removes ghost guides), and stack the
# per-plate frames into a single DataFrame.
M059K_plates = ['SABER_Plate_2','SABER_Plate_3','SABER_Plate_5','SABER_Plate_6']

M059K_list = []
for plate in M059K_plates:
    filename = f'20230601_6W_CP469_SABER_Pilot_M059K_guide_normalized_ALLBATCHES___{plate}___ALLWELLS.csv.gz'
    plate_path = os.path.join(input_folder, filename)
    pre_M059K_df = pd.read_csv(plate_path)

    # Boolean mask: True for rows whose barcode is part of the library.
    is_library_guide = pre_M059K_df["Metadata_Foci_Barcode_MatchedTo_Barcode"].isin(guide_list)
    pre_M059K_df = pre_M059K_df[is_library_guide]
    M059K_list.append(pre_M059K_df)

# Row labels are carried over from each plate as-is (no ignore_index).
M059K_df = pd.concat(M059K_list)

M059K_df.head()

Unnamed: 0,Metadata_Foci_Barcode_MatchedTo_GeneCode,Metadata_Foci_Barcode_MatchedTo_Barcode,Cells_AreaShape_Center_X_x,Cells_AreaShape_Center_Y_x,Cytoplasm_AreaShape_Center_X_x,Cytoplasm_AreaShape_Center_Y_x,Nuclei_AreaShape_Center_X_x,Nuclei_AreaShape_Center_Y_x,Cells_AreaShape_Area,Cells_AreaShape_BoundingBoxArea,...,Nuclei_Texture_Variance_btubulin_10_02_256,Nuclei_Texture_Variance_btubulin_10_03_256,Nuclei_Texture_Variance_btubulin_20_00_256,Nuclei_Texture_Variance_btubulin_20_01_256,Nuclei_Texture_Variance_btubulin_20_02_256,Nuclei_Texture_Variance_btubulin_20_03_256,Nuclei_Texture_Variance_btubulin_5_00_256,Nuclei_Texture_Variance_btubulin_5_01_256,Nuclei_Texture_Variance_btubulin_5_02_256,Nuclei_Texture_Variance_btubulin_5_03_256
0,AARS2,AAAGGCGGCCCTCACGGCCG,3762.7,2837.6,3761.3,2837.9,3788.5,2834.8,0.61032,1.0704,...,0.21645,0.33719,1.0359,0.49717,0.25361,0.18051,0.62282,-0.064759,-0.062812,1.2555
1,AARS2,AGCAAACTGGGGTCGCCGCG,3622.0,3298.2,3628.3,3300.0,3607.3,3286.9,-0.012456,0.36358,...,-0.40061,-0.35288,-0.56428,-0.52125,-0.57874,-0.37759,-0.1864,-0.16037,-0.18899,-0.44489
2,AARS2,CCAACTTCTACGCAGAACAG,3073.0,3049.7,3073.4,3049.1,3061.9,3057.3,-0.68076,-0.9105,...,-0.47984,-0.24455,-0.74271,-0.63685,-0.73178,-0.60013,-0.29418,-0.3948,-0.30222,-0.30114
3,AARS2,GCTGAGCCAGTTCAGAAGCA,3452.6,3734.9,3451.7,3739.3,3456.5,3704.1,-0.4149,-0.88976,...,-1.2593,-1.1557,-0.74271,-0.63685,-0.73178,-0.60013,-0.35732,-0.6872,-1.027,-0.91099
4,AARSD1,ACCTCCGCTCCCAATCTACC,2882.7,3341.6,2885.1,3342.1,2882.2,3335.2,0.98454,1.0166,...,0.4834,0.44318,1.309,1.2618,1.4063,1.2125,0.52363,0.44407,0.44955,0.44109


In [5]:
# Subset the columns to the cp channels only by discarding every column whose
# name mentions a SABER-only channel.
# cp_channels is kept for reference; it is not used by the filter below.
cp_channels = ['DNA','Phalloidin','Calnexin','COXIV','GM130']
SABER_only_channels = ['Catalase','btubulin','P65NFkb','Golgin97','LAMP1','PRSPS6','Syto9','TDP43','G3BP1']

# A column is dropped as soon as any SABER-only channel name occurs in it as a
# substring; all remaining columns keep their original order.
features = [
    column
    for column in M059K_df.columns
    if not any(saber_channel in column for saber_channel in SABER_only_channels)
]

print(len(features))
M059K_df = M059K_df[features]
M059K_df

3813


Unnamed: 0,Metadata_Foci_Barcode_MatchedTo_GeneCode,Metadata_Foci_Barcode_MatchedTo_Barcode,Cells_AreaShape_Center_X_x,Cells_AreaShape_Center_Y_x,Cytoplasm_AreaShape_Center_X_x,Cytoplasm_AreaShape_Center_Y_x,Nuclei_AreaShape_Center_X_x,Nuclei_AreaShape_Center_Y_x,Cells_AreaShape_Area,Cells_AreaShape_BoundingBoxArea,...,Nuclei_Texture_Variance_Phalloidin_10_02_256,Nuclei_Texture_Variance_Phalloidin_10_03_256,Nuclei_Texture_Variance_Phalloidin_20_00_256,Nuclei_Texture_Variance_Phalloidin_20_01_256,Nuclei_Texture_Variance_Phalloidin_20_02_256,Nuclei_Texture_Variance_Phalloidin_20_03_256,Nuclei_Texture_Variance_Phalloidin_5_00_256,Nuclei_Texture_Variance_Phalloidin_5_01_256,Nuclei_Texture_Variance_Phalloidin_5_02_256,Nuclei_Texture_Variance_Phalloidin_5_03_256
0,AARS2,AAAGGCGGCCCTCACGGCCG,3762.7,2837.6,3761.3,2837.9,3788.5,2834.8,0.610320,1.070400,...,-0.268060,-0.270850,0.098870,0.126770,0.432540,0.940150,0.049743,0.086445,0.081901,-0.193190
1,AARS2,AGCAAACTGGGGTCGCCGCG,3622.0,3298.2,3628.3,3300.0,3607.3,3286.9,-0.012456,0.363580,...,-0.097597,-0.043174,-0.078298,-0.056108,-0.049872,-0.127410,-0.109070,-0.095821,-0.072852,-0.045826
2,AARS2,CCAACTTCTACGCAGAACAG,3073.0,3049.7,3073.4,3049.1,3061.9,3057.3,-0.680760,-0.910500,...,-0.258710,-0.259120,-0.413680,-0.106930,-0.497460,-0.495190,-0.226750,-0.214350,-0.234250,-0.281360
3,AARS2,GCTGAGCCAGTTCAGAAGCA,3452.6,3734.9,3451.7,3739.3,3456.5,3704.1,-0.414900,-0.889760,...,-0.060373,-0.092529,-0.413680,-0.106930,-0.497460,-0.495190,-0.206060,-0.120230,-0.156020,-0.125760
4,AARSD1,ACCTCCGCTCCCAATCTACC,2882.7,3341.6,2885.1,3342.1,2882.2,3335.2,0.984540,1.016600,...,-0.053495,-0.042818,0.578100,0.115420,0.714650,0.888450,-0.084462,-0.082096,-0.088276,-0.078953
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2392,nontargeting,TAAGATCCGCGGGTGGCAAC,3069.4,3204.2,3065.5,3204.4,3086.4,3204.9,0.552470,0.697360,...,-0.310110,-0.239510,-0.185010,-0.413680,-0.320240,-0.151410,-0.357850,-0.349920,-0.341500,-0.401550
2393,nontargeting,TCCCGGTTGGTGAACGATAC,3267.8,3474.6,3267.0,3473.7,3281.3,3485.2,-0.843580,-0.966190,...,-0.424410,-0.397240,-0.185010,-0.413680,-0.320240,-0.151410,-0.274110,-0.262160,-0.228140,-0.278850
2394,nontargeting,TGCCGTGAAAAGACGCTGCG,2710.2,3379.2,2710.6,3382.5,2709.3,3355.9,-0.169340,-0.113020,...,-0.051105,-0.076397,-0.185010,-0.413680,-0.320240,-0.151410,-0.027866,0.006505,-0.045215,0.113520
2395,nontargeting,TGGCCACGAATTCCGCCGCC,2081.9,3295.6,2082.6,3294.2,2079.7,3295.2,0.514700,0.091906,...,-0.347880,-0.174260,-0.124180,-0.244290,-0.209500,-0.096095,-0.352880,-0.273740,-0.249230,-0.380720


In [6]:
# Perform feature selection on the merged profiles with pycytominer.
# The listed operations and their cutoffs (na_cutoff, corr_threshold,
# outlier_cutoff) are passed straight through to feature_select; see the
# pycytominer documentation for the exact meaning of each cutoff.
M059K_feature_selected_df = feature_select(
    profiles=M059K_df,
    features='infer',
    samples='all',
    operation=['variance_threshold','correlation_threshold','drop_na_columns','blocklist','drop_outliers'],
    na_cutoff=0,
    corr_threshold=0.9,
    outlier_cutoff=100,
)

# index=False keeps the DataFrame row labels out of the file. Without it the
# CSV gains an unnamed first column of row labels, which is not data and is
# inconsistent with the other to_csv calls in this notebook.
M059K_feature_selected_df.to_csv(
    os.path.join(output_folder,'20230601_6W_CP469_SABER_Pilot_M059K_guide_normalized_feature_select_merged_ALLBATCHES___ALLWELLS_cp_features.csv.gz'),
    index=False,
)


In [7]:
# Median-aggregate the feature-selected profiles, grouping by gene code and
# barcode, then write the aggregated table (without the row index).
M059K_feature_selected_median_df = aggregate(
    population_df=M059K_feature_selected_df,
    strata=[
        'Metadata_Foci_Barcode_MatchedTo_GeneCode',
        'Metadata_Foci_Barcode_MatchedTo_Barcode',
    ],
    features='infer',
    operation='median',
)

M059K_feature_selected_median_df.to_csv(
    os.path.join(
        output_folder,
        '20230601_6W_CP469_SABER_Pilot_M059K_guide_normalized_feature_select_median_merged_ALLBATCHES___ALLWELLS_cp_features.csv.gz',
    ),
    index=False,
)


**Gene level aggregation**

In [8]:
# Profile files that will be aggregated to gene level.
file_list = [
    "20230601_6W_CP469_SABER_Pilot_M059K_guide_normalized_feature_select_median_merged_ALLBATCHES___ALLWELLS_cp_features.csv.gz",
]

# Gene-level aggregation settings: group by gene code, infer the feature
# columns, and combine the rows of each group with the median.
aggregate_columns = ["Metadata_Foci_Barcode_MatchedTo_GeneCode"]
aggregate_features = "infer"
aggregate_operation = "median"

In [9]:
# Aggregate every listed profile file to gene level and save the result in
# output_folder under the same name with "_gene_aggregated" inserted before
# the extension.
for profile_file in file_list:
    print(f"Now loading {profile_file}")
    profile_path = os.path.join(output_folder, profile_file)
    df = pd.read_csv(profile_path)
    # Keep only rows whose barcode is in the library guide list.
    in_guide_list = df["Metadata_Foci_Barcode_MatchedTo_Barcode"].isin(guide_list)
    df = df[in_guide_list]

    print("Now aggregating.")
    gene_df = aggregate(
        population_df=df,
        strata=aggregate_columns,
        features=aggregate_features,
        operation=aggregate_operation,
    )

    print("Now saving aggregated file.")
    # Split once at the first "." so a multi-part extension such as "csv.gz"
    # stays intact on the right-hand side.
    name_parts = profile_file.split('.', 1)
    agg_file_name = f"{name_parts[0]}_gene_aggregated.{name_parts[1]}"
    gene_df.to_csv(os.path.join(output_folder, agg_file_name), index=False)

Now loading 20230601_6W_CP469_SABER_Pilot_M059K_guide_normalized_feature_select_median_merged_ALLBATCHES___ALLWELLS_cp_features.csv.gz
Now aggregating.
Now saving aggregated file.


In [10]:
# Display the median-aggregated, feature-selected profiles computed earlier in the notebook.
M059K_feature_selected_median_df

Unnamed: 0,Metadata_Foci_Barcode_MatchedTo_GeneCode,Metadata_Foci_Barcode_MatchedTo_Barcode,Cells_AreaShape_CentralMoment_0_1,Cells_AreaShape_CentralMoment_0_3,Cells_AreaShape_CentralMoment_1_0,Cells_AreaShape_CentralMoment_1_1,Cells_AreaShape_CentralMoment_1_2,Cells_AreaShape_CentralMoment_1_3,Cells_AreaShape_CentralMoment_2_1,Cells_AreaShape_CentralMoment_2_2,...,Nuclei_Texture_SumVariance_Calnexin_20_03_256,Nuclei_Texture_SumVariance_Calnexin_5_02_256,Nuclei_Texture_SumVariance_DNA_10_01_256,Nuclei_Texture_SumVariance_DNA_20_01_256,Nuclei_Texture_SumVariance_DNA_20_03_256,Nuclei_Texture_SumVariance_GM130_20_01_256,Nuclei_Texture_SumVariance_GM130_5_00_256,Nuclei_Texture_SumVariance_Phalloidin_10_02_256,Nuclei_Texture_SumVariance_Phalloidin_20_01_256,Nuclei_Texture_Variance_Calnexin_20_00_256
0,AARS2,AAAGGCGGCCCTCACGGCCG,0.033501,-0.021708,-0.446427,-0.052055,-0.106097,-0.035181,-0.024508,0.101165,...,-0.283260,1.223165,-0.861290,0.045650,-0.058530,-0.210587,-0.240745,0.081160,-0.199070,-0.199375
1,AARS2,AGCAAACTGGGGTCGCCGCG,0.098789,-0.041051,0.013143,0.027802,0.000183,0.006994,0.014238,-0.307540,...,-0.479160,0.042645,0.355875,-0.529805,-0.475970,-0.619375,-0.188795,-0.034064,-0.397135,-0.332470
2,AARS2,CCAACTTCTACGCAGAACAG,0.080462,-0.007381,-0.019076,0.096305,0.049998,0.033392,0.008300,-0.290030,...,-0.479160,-0.268070,-0.216838,-0.529805,-0.486250,-0.619375,-0.009330,-0.114787,-0.397135,-0.447315
3,AARS2,GCTGAGCCAGTTCAGAAGCA,-0.115877,-0.095393,-0.234975,-0.022434,0.000537,-0.001022,-0.056283,-0.161884,...,-0.381090,-0.364134,-0.294380,-0.467000,-0.367030,-0.376080,-0.722730,-0.246800,-0.288600,-0.447315
4,AARSD1,ACCTCCGCTCCCAATCTACC,0.106199,-0.002299,0.005447,-0.032071,0.010073,0.005625,0.008212,0.558170,...,1.102550,0.325770,0.798840,1.244500,1.149600,1.139140,0.281992,-0.006721,0.633935,0.934645
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2395,nontargeting,TAAGATCCGCGGGTGGCAAC,-0.093411,-0.006135,-0.025522,0.018110,0.037509,-0.003599,-0.075485,0.138834,...,-0.094215,-0.086080,0.050148,0.186640,-0.079335,0.047555,-0.066887,0.027325,-0.005835,-0.128360
2396,nontargeting,TCCCGGTTGGTGAACGATAC,-0.074922,-0.016022,0.008407,0.011823,-0.005692,0.014658,0.006559,-0.107477,...,-0.166074,0.527320,0.254985,-0.427635,-0.367030,-0.376080,0.019385,-0.172278,-0.281090,-0.218643
2397,nontargeting,TGCCGTGAAAAGACGCTGCG,-0.197905,-0.024100,0.051483,0.007274,0.017820,0.015592,-0.001664,-0.244840,...,-0.286660,0.017660,-0.006520,-0.358925,-0.279415,0.139215,-0.016525,0.031936,-0.157765,-0.302650
2398,nontargeting,TGGCCACGAATTCCGCCGCC,0.083609,0.003866,-0.022012,0.139262,0.010856,0.014981,-0.107854,-0.147185,...,-0.366595,-0.088162,0.008927,-0.427635,-0.350825,-0.376080,0.215435,-0.196010,-0.281090,-0.283045
