In [1]:
import pandas as pd
import numpy as np
from numpy import dot
from numpy.linalg import norm
from sklearn.metrics.pairwise import cosine_similarity
import matplotlib.pyplot as plt
import seaborn as sns
from functools import reduce
from scipy.spatial import distance
import pathlib
import os
from sklearn.decomposition import PCA
import json
import random
from random import sample

# Seed Python's built-in `random` module so any later use of `random` / `sample`
# is repeatable.
# NOTE(review): this does not seed NumPy's global RNG; any NumPy- or
# scikit-learn-based randomness needs its own seed / `random_state` -- confirm
# whether that matters for the analysis below.
random.seed(30)

In [2]:
# Set data input folder
input_folder = "inputs"

# Set output folder and make sure it exists.
# `exist_ok=True` already turns the call into a no-op when the directory is
# present, so a separate `os.path.exists` check is redundant (and would leave a
# window between the check and the creation).
output_folder = "outputs"
os.makedirs(output_folder, exist_ok=True)

In [3]:
# Import the guide-level profiles.
# Paths are built from `output_folder` so that changing the folder in the
# configuration cell is enough to relocate every file read here.
df_guide = pd.read_csv(
    os.path.join(
        output_folder,
        "20240202_6W_CP498_SABER_Pilot_HeLa_SABER_only_guide_normalized_merged_feature_select_median_ALLWELLS_1_2.csv.gz",
    )
)
# Subset the nontargeting guide profiles
df_nontargeting = df_guide.query("Metadata_Foci_Barcode_MatchedTo_GeneCode == 'nontargeting'")

# Load hits from the hit calling process
whole_cell_hits = pd.read_csv(
    os.path.join(output_folder, 'HeLa_SABER_plate_level_median_per_feat_sig_genes_1_FDR_whole_cell_hits.csv')
)
comp_spec_hits = pd.read_csv(
    os.path.join(output_folder, 'HeLa_SABER_plate_level_median_per_feat_sig_genes_1_FDR_compartment_specific_hits.csv')
)
all_hits = pd.concat([whole_cell_hits, comp_spec_hits])
whole_cell_hit_list = list(whole_cell_hits.Gene)
comp_spec_hit_list = list(comp_spec_hits.Gene)
hit_list = comp_spec_hit_list + whole_cell_hit_list

# List every gene code except the nontargeting control. Filtering (rather than
# `list.remove`) does not raise when the control label is absent.
all_genes_list = [
    gene
    for gene in df_guide.Metadata_Foci_Barcode_MatchedTo_GeneCode.unique()
    if gene != "nontargeting"
]
# List non hit genes; a set gives constant-time membership tests.
hit_set = set(hit_list)
non_hit_list = [gene for gene in all_genes_list if gene not in hit_set]
print("All genes:",len(all_genes_list),"non-hit genes",len(non_hit_list),'Whole cell hits',len(whole_cell_hit_list),'Compartment hits',len(comp_spec_hit_list))


All genes: 590 non-hit genes 206 Whole cell hits 306 Compartment hits 78


In [4]:
# Remember the original column order and the per-row gene labels.
features = list(df_guide.columns)
gene_list = list(df_guide.Metadata_Foci_Barcode_MatchedTo_GeneCode)

# Round-trip a deep copy of the guide table: drop the gene label, move the
# barcode into the index and back out, re-attach the gene label by row
# position, and finally restore the original column order.
df_temp = (
    df_guide.copy(deep=True)
    .drop('Metadata_Foci_Barcode_MatchedTo_GeneCode', axis=1)
    .set_index('Metadata_Foci_Barcode_MatchedTo_Barcode')
    .reset_index()
    .assign(Metadata_Foci_Barcode_MatchedTo_GeneCode=gene_list)
)[features]
df_temp

Unnamed: 0,Metadata_Foci_Barcode_MatchedTo_GeneCode,Metadata_Foci_Barcode_MatchedTo_Barcode,Cells_AreaShape_BoundingBoxMaximum_X,Cells_AreaShape_BoundingBoxMaximum_Y,Cells_AreaShape_CentralMoment_0_1,Cells_AreaShape_CentralMoment_0_3,Cells_AreaShape_CentralMoment_1_0,Cells_AreaShape_CentralMoment_1_1,Cells_AreaShape_CentralMoment_1_2,Cells_AreaShape_CentralMoment_1_3,...,Nuclei_Texture_SumEntropy_Syto9_10_03_256,Nuclei_Texture_SumEntropy_btubulin_10_01_256,Nuclei_Texture_SumVariance_Calnexin_10_01_256,Nuclei_Texture_SumVariance_NfKb_10_01_256,Nuclei_Texture_SumVariance_Syto9_10_00_256,Nuclei_Texture_SumVariance_Syto9_10_02_256,Nuclei_Texture_SumVariance_TDP43_10_01_256,Nuclei_Texture_SumVariance_TDP43_10_02_256,Nuclei_Texture_SumVariance_TDP43_10_03_256,Nuclei_Texture_Variance_Syto9_10_01_256
0,AARS2,AAAGGCGGCCCTCACGGCCG,-0.052235,-0.158375,0.013107,-0.100975,-0.163630,0.189520,0.285264,0.125879,...,-0.301305,-0.026015,-0.422110,-0.436075,-0.328195,-0.294850,0.181285,0.130507,0.198450,-0.292855
1,AARS2,AGCAAACTGGGGTCGCCGCG,-0.258900,-0.459740,-0.272072,-0.060422,-0.397970,0.081770,-0.033692,0.056625,...,-0.491365,-0.659530,-0.478125,-0.226015,-0.372045,-0.458385,-0.075922,0.011385,0.051656,-0.447315
2,AARS2,CCAACTTCTACGCAGAACAG,-0.132880,-0.252800,0.099935,0.130246,0.048945,-0.025845,-0.033111,0.097072,...,-0.579905,-0.419715,-0.291180,-0.236695,-0.223783,-0.305315,0.158725,0.008738,0.000045,-0.240930
3,AARS2,GCTGAGCCAGTTCAGAAGCA,0.023685,0.187520,0.034279,-0.255330,0.678388,-0.058525,-0.052645,-0.016378,...,-0.199930,-0.628520,-0.509130,-0.422055,-0.499170,-0.543055,-0.082475,0.050340,0.046263,-0.523410
4,AARSD1,ACCTCCGCTCCCAATCTACC,0.246670,-0.368480,-0.168987,0.040285,-0.057675,0.577885,0.510590,0.619805,...,-0.472220,0.350025,-0.285700,-0.310835,-0.825055,-0.658360,-0.260640,-0.189415,0.186295,-0.276285
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2395,nontargeting,TAAGATCCGCGGGTGGCAAC,0.304885,0.178550,-0.198601,0.273680,0.202490,-1.244100,-0.728195,-0.646360,...,0.549655,0.474175,0.560410,0.154985,0.141940,0.532435,0.557760,0.352925,0.373555,0.429580
2396,nontargeting,TCCCGGTTGGTGAACGATAC,-0.266702,-0.608585,-0.530960,-0.001097,-0.284394,-0.058880,0.266767,0.231285,...,-0.362107,-0.478795,-0.196041,-0.314249,-0.301765,-0.306024,0.206455,0.034544,0.052445,-0.406645
2397,nontargeting,TGCCGTGAAAAGACGCTGCG,0.439795,-0.475410,-0.097035,0.043919,-0.068639,0.464080,0.092188,0.296787,...,-0.435390,-0.297345,-0.379060,-0.375630,-0.512320,-0.335140,0.150539,-0.028893,0.091351,-0.527665
2398,nontargeting,TGGCCACGAATTCCGCCGCC,0.334780,0.366528,-0.257825,-0.370588,-0.140465,0.078908,0.190906,0.201455,...,-0.329340,-0.501910,-0.291045,-0.291190,0.042340,-0.101190,0.230875,0.110184,0.282550,-0.207045


In [5]:
# Feature matrix for PCA: drop the gene label and index the rows by guide barcode.
df_temp = df_guide.drop('Metadata_Foci_Barcode_MatchedTo_GeneCode',axis=1).set_index('Metadata_Foci_Barcode_MatchedTo_Barcode')
# Perform principal component analysis on all guide-level profiles to obtain
# the explained-variance ratio of every component.
pca = PCA()
pca.fit(df_temp)
x = list(pca.explained_variance_ratio_)
# Find the number of leading components whose cumulative explained variance is
# closest to 90%. A running cumulative sum replaces re-summing the prefix on
# every iteration, and no loop variable named `distance` is created, so the
# `distance` module imported from scipy.spatial stays usable afterwards.
cumulative_variance = np.cumsum(x)
component = int(np.argmin(np.abs(0.9 - cumulative_variance))) + 1
print (f'Principal component representing closest to 90% variation is {component}')
# Perform principal component analysis again, keeping only that many components.
# NOTE(review): no `random_state` is passed; if scikit-learn picks a randomized
# solver for this shape the projection is not exactly repeatable -- confirm.
pca = PCA(n_components=component)
df_guide_pca = pd.DataFrame(pca.fit_transform(df_temp),index=df_temp.index)
df_guide_pca.head()

Principal component representing closest to 90% variation is 126


Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,116,117,118,119,120,121,122,123,124,125
Metadata_Foci_Barcode_MatchedTo_Barcode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AAAGGCGGCCCTCACGGCCG,-6.710463,0.486999,-11.986185,-1.667203,3.587151,1.954251,0.393956,-1.381604,0.795512,2.432747,...,-0.148742,-0.098854,0.093733,-0.038341,-0.209644,0.713044,0.181305,0.197558,-0.714389,-0.202157
AGCAAACTGGGGTCGCCGCG,-5.440888,4.87327,-6.463723,0.125317,5.979666,-1.431571,3.642431,-4.030995,-5.56001,-0.650523,...,-0.354534,0.389032,-0.204374,-0.896739,0.220582,0.477541,-0.1502,-0.168778,-0.397499,0.40686
CCAACTTCTACGCAGAACAG,-2.895954,2.08286,-7.39723,-1.212867,5.571158,0.768329,2.529835,-4.745525,-4.22863,-0.546666,...,0.253655,0.041214,0.020714,-0.587065,-0.098556,-0.068179,-0.109663,-0.64028,-0.363984,0.787972
GCTGAGCCAGTTCAGAAGCA,-5.137854,8.75276,-7.682749,3.428363,5.61881,3.298196,-1.713082,-1.751437,-6.63827,-1.076689,...,0.079569,0.30012,0.178696,-0.308449,0.056447,0.035915,0.608106,-0.121751,-0.297094,0.365801
ACCTCCGCTCCCAATCTACC,-9.430923,-13.188477,-1.666126,-0.493579,0.709127,-4.454061,-2.674309,2.788512,-4.126509,-5.779795,...,0.503656,0.117245,0.163742,-0.440101,0.066408,0.653199,0.136061,-0.238649,0.150585,-0.431789


In [6]:
# Turn the barcode index back into a regular column.
df_guide_pca_updated = df_guide_pca.reset_index()
pca_feat_list = list(df_guide_pca_updated.columns)
# Final column order: gene label first, then barcode and the PCA components.
feat_list = ['Metadata_Foci_Barcode_MatchedTo_GeneCode'] + pca_feat_list
# `gene_list` is attached by row position, so it must be in the same row order
# as `df_guide_pca`.
df_guide_pca_updated = df_guide_pca_updated.assign(
    Metadata_Foci_Barcode_MatchedTo_GeneCode=gene_list
)[feat_list]
df_guide_pca_updated

Unnamed: 0,Metadata_Foci_Barcode_MatchedTo_GeneCode,Metadata_Foci_Barcode_MatchedTo_Barcode,0,1,2,3,4,5,6,7,...,116,117,118,119,120,121,122,123,124,125
0,AARS2,AAAGGCGGCCCTCACGGCCG,-6.710463,0.486999,-11.986185,-1.667203,3.587151,1.954251,0.393956,-1.381604,...,-0.148742,-0.098854,0.093733,-0.038341,-0.209644,0.713044,0.181305,0.197558,-0.714389,-0.202157
1,AARS2,AGCAAACTGGGGTCGCCGCG,-5.440888,4.873270,-6.463723,0.125317,5.979666,-1.431571,3.642431,-4.030995,...,-0.354534,0.389032,-0.204374,-0.896739,0.220582,0.477541,-0.150200,-0.168778,-0.397499,0.406860
2,AARS2,CCAACTTCTACGCAGAACAG,-2.895954,2.082860,-7.397230,-1.212867,5.571158,0.768329,2.529835,-4.745525,...,0.253655,0.041214,0.020714,-0.587065,-0.098556,-0.068179,-0.109663,-0.640280,-0.363984,0.787972
3,AARS2,GCTGAGCCAGTTCAGAAGCA,-5.137854,8.752760,-7.682749,3.428363,5.618810,3.298196,-1.713082,-1.751437,...,0.079569,0.300120,0.178696,-0.308449,0.056447,0.035915,0.608106,-0.121751,-0.297094,0.365801
4,AARSD1,ACCTCCGCTCCCAATCTACC,-9.430923,-13.188477,-1.666126,-0.493579,0.709127,-4.454061,-2.674309,2.788512,...,0.503656,0.117245,0.163742,-0.440101,0.066408,0.653199,0.136061,-0.238649,0.150585,-0.431789
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2395,nontargeting,TAAGATCCGCGGGTGGCAAC,4.003927,-18.131483,-5.699261,-16.737475,-4.544329,5.003279,3.466993,5.139113,...,0.848647,-0.649111,-0.550101,0.728439,-1.604472,-0.740453,-0.339756,0.269298,0.129081,0.128742
2396,nontargeting,TCCCGGTTGGTGAACGATAC,-3.796607,1.166612,-8.339153,-3.137942,4.600958,0.103040,4.956599,-0.998438,...,-0.461645,0.120479,-1.293101,-0.019327,-0.018258,0.197794,-0.132588,-0.015223,-0.587564,-0.162986
2397,nontargeting,TGCCGTGAAAAGACGCTGCG,-4.889959,0.596805,-9.629494,-3.326799,1.445863,-1.743408,2.061291,1.434181,...,-0.017347,0.295602,-0.494953,0.397776,-0.157459,0.852551,-0.074374,0.059959,-0.039761,0.034334
2398,nontargeting,TGGCCACGAATTCCGCCGCC,-5.551520,2.499135,-7.366429,-5.081633,1.432936,1.231327,1.764526,-1.644413,...,0.021938,-0.544383,0.133975,-0.440685,0.067196,0.570425,0.310648,-0.341108,-0.188375,0.302037


In [7]:
def calculate_mean_similarity(cosine_array):
    """Return the mean similarity between distinct rows of a similarity matrix.

    For every row the off-diagonal entries are averaged (the entry on the
    diagonal, i.e. the similarity of a row with itself, is left out), and the
    per-row means are then averaged.

    The matrix may have any size n >= 2; it is no longer limited to exactly
    four rows. The diagonal is excluded by position rather than by subtracting
    a constant 1, so the result does not rely on self-similarity being
    exactly 1.

    Parameters
    ----------
    cosine_array : array-like of shape (n, n)
        Square pairwise-similarity matrix.

    Returns
    -------
    float
        Mean of the off-diagonal entries, averaged per row and then over rows.

    Raises
    ------
    ValueError
        If the input is not a square 2-D matrix or has fewer than two rows.
    """
    sims = np.asarray(cosine_array, dtype=float)
    if sims.ndim != 2 or sims.shape[0] != sims.shape[1]:
        raise ValueError("cosine_array must be a square 2-D matrix")
    n_rows = sims.shape[0]
    if n_rows < 2:
        raise ValueError("at least two rows are needed to compare distinct rows")

    # Boolean mask that is True everywhere except on the diagonal; selecting
    # with it yields n_rows * (n_rows - 1) values in row-major order.
    off_diagonal = sims[~np.eye(n_rows, dtype=bool)].reshape(n_rows, n_rows - 1)
    return float(off_diagonal.mean(axis=1).mean())

# Within-gene guide agreement for the whole-cell hits: for each gene, compute
# the pairwise cosine similarity between its guide profiles in PCA space and
# reduce it to one mean similarity per gene.
hit_guide_sim_list = []
for gene in whole_cell_hit_list:
    df_temp = (
        df_guide_pca_updated
        .query("Metadata_Foci_Barcode_MatchedTo_GeneCode == @gene")
        .drop(['Metadata_Foci_Barcode_MatchedTo_Barcode'], axis=1)
        .set_index("Metadata_Foci_Barcode_MatchedTo_GeneCode")
    )
    cosine_array = cosine_similarity(df_temp)
    hit_guide_sim_list.append(calculate_mean_similarity(cosine_array))

# The variable name is kept for compatibility, but the value is a cosine
# *similarity* (higher means more alike), so the printed label says so.
# An empty hit list yields NaN instead of a ZeroDivisionError.
if hit_guide_sim_list:
    average_cosine_distance = sum(hit_guide_sim_list)/len(hit_guide_sim_list)
else:
    average_cosine_distance = float("nan")
print('Average cosine similarity: ',average_cosine_distance)

Average cosine distance:  0.5699460482678571


In [8]:
def cosine_to_df(df_temp, cosine_array, i):
    """Rank the rows of ``df_temp`` by their similarity to row ``i``.

    Parameters
    ----------
    df_temp : pandas.DataFrame
        Frame whose index labels are used to label the ranking.
    cosine_array : indexable of rows
        Pairwise similarities; ``cosine_array[i]`` holds one value per row of
        ``df_temp``.
    i : int
        Position of the query row.

    Returns
    -------
    pandas.DataFrame
        Single column ``'cosine'``, indexed by the labels of ``df_temp`` and
        sorted from most to least similar.
    """
    ranking = pd.DataFrame({'cosine': cosine_array[i]}, index=list(df_temp.index))
    return ranking.sort_values(by='cosine', ascending=False)

def ap_from_cosine_df(cosine_df,gene,n=10):
    """Average precision of retrieving ``gene`` within the top ``n`` ranks.

    ``cosine_df`` is expected to be sorted from most to least similar. The
    first row is skipped (it is treated as the query guide itself); the next
    ``n`` rows are the ranked candidates. Precision is accumulated at every
    rank whose index label equals ``gene`` and the sum is divided by the number
    of candidates labelled ``gene``.

    Compared with a fixed divisor of 3, the divisor is now counted from the
    ranking, so genes with a number of guides other than four are scored
    correctly, and rankings with fewer than ``n + 1`` rows no longer raise
    ``IndexError``. For a gene with exactly four guides and at least ``n + 1``
    rows the result is unchanged.

    Parameters
    ----------
    cosine_df : pandas.DataFrame
        Ranking whose index holds the gene label of every row.
    gene : str
        Label counted as a correct retrieval.
    n : int, default 10
        Number of ranks (after the query row) to evaluate.

    Returns
    -------
    float
        Average precision; ``0.0`` when no candidate carries ``gene``.
    """
    is_match = [label == gene for label in cosine_df.index]
    # Drop rank 0: it is treated as the query guide matching itself.
    candidates = is_match[1:]
    n_relevant = sum(candidates)
    if n_relevant == 0:
        return 0.0

    hits = 0
    precision_sum = 0
    # Slicing clamps to the available rows; max() guards against a negative n
    # being interpreted as "all but the last |n|".
    for rank, matched in enumerate(candidates[:max(n, 0)], start=1):
        if matched:
            hits += 1
            precision_sum += hits / rank
    return precision_sum / n_relevant

def calculate_map(df_guide, gene):
    """Mean average precision of one gene's guides against nontargeting guides.

    The rows of ``df_guide`` labelled ``gene`` or ``'nontargeting'`` are
    compared by cosine similarity. Each guide of ``gene`` is used in turn as
    the query, the remaining rows are ranked by similarity to it, and the
    average precision of retrieving the other guides of ``gene`` in the top 10
    is computed. The per-guide values are averaged.

    Query rows are located by their gene label instead of being assumed to be
    the first four rows of the subset, so the result does not depend on row
    order or on a gene having exactly four guides.

    Parameters
    ----------
    df_guide : pandas.DataFrame
        Guide-level table with the columns
        ``Metadata_Foci_Barcode_MatchedTo_GeneCode`` and
        ``Metadata_Foci_Barcode_MatchedTo_Barcode``; every remaining column is
        passed to ``cosine_similarity``.
    gene : str
        Gene code to evaluate.

    Returns
    -------
    float
        Mean of the per-guide average precision values.

    Raises
    ------
    ValueError
        If ``df_guide`` contains no row labelled ``gene``.
    """
    df_temp = df_guide.query("Metadata_Foci_Barcode_MatchedTo_GeneCode == 'nontargeting' | Metadata_Foci_Barcode_MatchedTo_GeneCode == @gene")
    df_temp = df_temp.drop(['Metadata_Foci_Barcode_MatchedTo_Barcode'],axis=1)
    df_temp = df_temp.set_index("Metadata_Foci_Barcode_MatchedTo_GeneCode")

    # Positions (within the subset) of the rows that belong to the gene.
    guide_positions = np.flatnonzero(df_temp.index == gene)
    if guide_positions.size == 0:
        raise ValueError(f"no guide profiles found for gene {gene!r}")

    cosine_array = cosine_similarity(df_temp)
    ap_list = [
        ap_from_cosine_df(cosine_to_df(df_temp, cosine_array, position), gene, 10)
        for position in guide_positions
    ]
    return np.mean(ap_list)

In [9]:
# mAP over every gene in `all_genes_list`: one value per gene, then the mean.
genes_list = all_genes_list
map_list = [calculate_map(df_guide_pca_updated, gene) for gene in genes_list]
print(f'For all genes ({len(all_genes_list)} genes) the mAP values is',np.mean(map_list))

For all genes (590 genes) the mAP values is 0.4694236503452605


In [10]:
# mAP over the whole-cell hit genes: one value per gene, then the mean.
genes_list = whole_cell_hit_list
map_list = [calculate_map(df_guide_pca_updated, gene) for gene in genes_list]
print(f'For whole cell hits ({len(whole_cell_hit_list)} genes) the mAP values is',np.mean(map_list))

For whole cell hits (306 genes) the mAP values is 0.7057977357609709


In [11]:
# mAP over the compartment-specific hit genes: one value per gene, then the mean.
genes_list = comp_spec_hit_list
map_list = [calculate_map(df_guide_pca_updated, gene) for gene in genes_list]
print(f'For compartment hits ({len(comp_spec_hit_list)} genes) the mAP values is',np.mean(map_list))

For compartment hits (78 genes) the mAP values is 0.30898326210826216


In [12]:
# mAP over the genes that were not called as hits: one value per gene, then the mean.
genes_list = non_hit_list
map_list = [calculate_map(df_guide_pca_updated, gene) for gene in genes_list]
print(f'For non hits ({len(non_hit_list)} genes) the mAP values is',np.mean(map_list))

For non hits (206 genes) the mAP values is 0.17905413648738888
