In [1]:
import pandas as pd
import numpy as np
from numpy import dot
from numpy.linalg import norm
from sklearn.metrics.pairwise import cosine_similarity
import matplotlib.pyplot as plt
import seaborn as sns
from functools import reduce
from scipy.spatial import distance
import pathlib
import os
from sklearn.decomposition import PCA
import json
import random
from random import sample

# Seed every global RNG the notebook can touch. scikit-learn and NumPy draw from
# NumPy's global generator (not Python's `random`), so seeding only `random`
# would leave any randomized solver non-reproducible between runs.
random.seed(30)
np.random.seed(30)

In [2]:
# Set data input folder
input_folder = "inputs"

# Set output folder and make sure it exists.
# `exist_ok=True` already makes this a no-op when the directory is present, so
# no separate existence check is needed (and a stray *file* with this name now
# fails loudly here instead of later, when something tries to use the folder).
output_folder = "outputs"
os.makedirs(output_folder, exist_ok=True)

In [3]:
# Import the guide-level profiles.
# All paths are built from `output_folder` so that changing the configured
# folder in the setup cell is enough to relocate every input read here.
df_guide = pd.read_csv(
    os.path.join(
        output_folder,
        "20240202_6W_CP498_SABER_Pilot_HeLa_SABER_only_guide_normalized_merged_feature_select_median_ALLWELLS.csv.gz",
    )
)
# Subset the nontargeting guide profiles
df_nontargeting = df_guide.query("Metadata_Foci_Barcode_MatchedTo_GeneCode == 'nontargeting'")

# Load hits from the hit calling process
whole_cell_hits = pd.read_csv(
    os.path.join(output_folder, "HeLa_SABER_plate_level_median_per_feat_sig_genes_1_FDR_whole_cell_hits.csv")
)
comp_spec_hits = pd.read_csv(
    os.path.join(output_folder, "HeLa_SABER_plate_level_median_per_feat_sig_genes_1_FDR_compartment_specific_hits.csv")
)
all_hits = pd.concat([whole_cell_hits, comp_spec_hits])
whole_cell_hit_list = list(whole_cell_hits.Gene)
comp_spec_hit_list = list(comp_spec_hits.Gene)
hit_list = comp_spec_hit_list + whole_cell_hit_list

# List every targeted gene (everything except the nontargeting controls).
# Filtering instead of `.remove(...)` avoids a ValueError if the control label is absent.
all_genes_list = [
    gene
    for gene in df_guide.Metadata_Foci_Barcode_MatchedTo_GeneCode.unique()
    if gene != "nontargeting"
]
# list non hit genes (set lookup keeps the filter linear in the number of genes)
hit_set = set(hit_list)
non_hit_list = [gene for gene in all_genes_list if gene not in hit_set]
print("All genes:",len(all_genes_list),"non-hit genes",len(non_hit_list),'Whole cell hits',len(whole_cell_hit_list),'Compartment hits',len(comp_spec_hit_list))


All genes: 590 non-hit genes 201 Whole cell hits 279 Compartment hits 110


In [4]:
# Remember the original column order and the per-row gene labels; `gene_list`
# is reused further down to re-attach gene labels to the PCA-transformed frame.
features = df_guide.columns.tolist()
gene_list = df_guide["Metadata_Foci_Barcode_MatchedTo_GeneCode"].tolist()

# Round-trip a deep copy through a barcode index, then restore the gene column
# and the original column order.
df_temp = (
    df_guide.copy(deep=True)
    .drop(columns="Metadata_Foci_Barcode_MatchedTo_GeneCode")
    .set_index("Metadata_Foci_Barcode_MatchedTo_Barcode")
    .reset_index()
    .assign(Metadata_Foci_Barcode_MatchedTo_GeneCode=gene_list)
    [features]
)
df_temp

Unnamed: 0,Metadata_Foci_Barcode_MatchedTo_GeneCode,Metadata_Foci_Barcode_MatchedTo_Barcode,Cells_AreaShape_BoundingBoxMaximum_X,Cells_AreaShape_CentralMoment_0_1,Cells_AreaShape_CentralMoment_0_3,Cells_AreaShape_CentralMoment_1_0,Cells_AreaShape_CentralMoment_1_1,Cells_AreaShape_CentralMoment_1_2,Cells_AreaShape_CentralMoment_1_3,Cells_AreaShape_CentralMoment_2_1,...,Nuclei_Texture_SumEntropy_PRSP6_10_01_256,Nuclei_Texture_SumVariance_NfKb_10_01_256,Nuclei_Texture_SumVariance_Syto9_10_00_256,Nuclei_Texture_SumVariance_Syto9_10_01_256,Nuclei_Texture_SumVariance_Syto9_10_02_256,Nuclei_Texture_SumVariance_Syto9_10_03_256,Nuclei_Texture_SumVariance_TDP43_10_01_256,Nuclei_Texture_SumVariance_TDP43_10_02_256,Nuclei_Texture_SumVariance_TDP43_10_03_256,Nuclei_Texture_Variance_Syto9_10_01_256
0,AARS2,AAAGGCGGCCCTCACGGCCG,0.286167,0.028607,-0.288900,0.559662,0.248283,-0.155219,0.082665,-0.135980,...,0.047652,-0.952504,-0.812984,-0.336496,-0.605291,-0.311832,-0.428218,-0.602523,-0.136156,-0.474004
1,AARS2,AGCAAACTGGGGTCGCCGCG,-0.376567,-0.084483,0.038750,-0.537643,-0.188543,-0.174085,-0.008743,0.080111,...,-1.028130,-0.731683,-0.274541,-0.715289,-0.567582,-0.355411,-0.786580,-0.215661,-0.539864,-0.426638
2,AARS2,CCAACTTCTACGCAGAACAG,0.374451,-0.062411,0.216489,-0.121187,-0.231957,-0.229726,-0.028984,-0.034369,...,-1.012095,-0.599724,-0.122199,-0.256789,-0.375769,-0.716810,-0.305359,-0.450250,-0.435628,-0.374634
3,AARS2,GCTGAGCCAGTTCAGAAGCA,-0.994253,-0.167492,-0.342195,0.185172,-0.270254,-0.012194,-0.072155,0.644617,...,-0.749566,-0.638792,-0.576873,-0.734981,-0.508235,-0.734689,-0.268112,-0.245855,-0.524880,-0.418964
4,AARSD1,ACCTCCGCTCCCAATCTACC,0.216807,0.024692,-0.377658,-0.252822,0.505456,0.583310,0.810882,0.055234,...,0.098922,-0.744067,-0.836499,-0.520663,-0.861196,-0.630555,-1.184898,-0.944579,-0.316360,-0.291860
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2395,nontargeting,TAAGATCCGCGGGTGGCAAC,0.263409,-0.167492,0.585517,0.371861,-1.396831,-1.415993,-1.458483,-0.101232,...,1.349904,0.821134,0.335353,0.594888,0.942876,0.437077,0.425053,0.555710,0.573956,0.326756
2396,nontargeting,TCCCGGTTGGTGAACGATAC,-0.747693,-0.263576,-0.145198,-0.197777,0.351259,-0.106476,0.077146,0.600798,...,-0.291884,-1.081500,-0.644822,-0.419891,-0.709275,-1.131810,0.026129,-0.228532,-0.276853,-0.792633
2397,nontargeting,TGCCGTGAAAAGACGCTGCG,0.456276,-0.062411,0.062339,-0.137677,0.250134,0.257176,0.114678,0.102850,...,-0.038258,-0.929296,-0.701177,-0.327030,-0.407781,-0.614023,-0.707128,-0.856426,-0.496192,-0.737731
2398,nontargeting,TGGCCACGAATTCCGCCGCC,0.382628,-0.551862,-0.086078,-0.159482,0.168746,0.500101,0.383496,-0.056855,...,-1.000795,-0.700653,-0.131112,0.081586,-0.472923,-0.098026,-0.343959,-0.328896,-0.504917,-0.223976


In [5]:
# Feature matrix: one row per guide barcode, gene label removed
df_temp = df_guide.drop('Metadata_Foci_Barcode_MatchedTo_GeneCode',axis=1).set_index('Metadata_Foci_Barcode_MatchedTo_Barcode')

# Perform a full principal component analysis on all guide-level profiles
pca = PCA()
pca.fit(df_temp)

# Find the number of leading components whose cumulative explained variance is
# closest to 90%. (No local named `distance` is used here: that name is the
# `scipy.spatial.distance` module imported at the top of the notebook and must
# not be overwritten by a float.)
cumulative_variance = np.cumsum(pca.explained_variance_ratio_)
component = int(np.argmin(np.abs(cumulative_variance - 0.9))) + 1
print (f'Principal component representing closest to 90% variation is {component}')

# Perform principal component analysis and select components representing 90% of variation in data.
# With a reduced `n_components` scikit-learn may pick a randomized SVD solver,
# so fix `random_state` (same value as the notebook seed) to keep the projection
# identical between runs.
pca = PCA(n_components=component, random_state=30)
df_guide_pca = pd.DataFrame(pca.fit_transform(df_temp),index=df_temp.index)
df_guide_pca.head()

Principal component representing closest to 90% variation is 105


Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,95,96,97,98,99,100,101,102,103,104
Metadata_Foci_Barcode_MatchedTo_Barcode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AAAGGCGGCCCTCACGGCCG,-9.927714,-10.363048,-16.919941,-1.808576,-7.302551,-1.720419,2.938468,0.189887,-2.022779,-5.627067,...,1.117789,-0.128193,-0.521531,-0.06841,0.090368,0.092639,-0.67994,0.337226,1.507687,-1.24791
AGCAAACTGGGGTCGCCGCG,-10.93437,-13.483884,-1.659578,-2.412464,-1.750032,5.111465,5.397133,-16.177699,2.649917,-0.848376,...,-0.401202,0.68282,1.640284,-0.028864,-1.188852,-0.662635,-0.799945,-0.574,0.874268,-0.685046
CCAACTTCTACGCAGAACAG,-5.445037,-14.466683,-2.80221,-3.422422,-4.993501,0.926324,5.67571,-13.280781,2.291404,6.161931,...,-0.128956,0.964017,0.585295,-1.41226,0.166829,-0.603197,-0.446179,-0.668894,0.760232,0.821056
GCTGAGCCAGTTCAGAAGCA,-11.092848,-17.477347,6.022897,0.637782,-13.049446,0.519385,3.829102,-13.522351,2.083018,4.079536,...,0.597441,-1.247062,0.921299,-1.050499,0.407465,0.022244,0.197231,-1.236179,0.074084,-1.048374
ACCTCCGCTCCCAATCTACC,-11.051094,10.244638,-20.76596,-0.166493,1.615477,5.929733,1.355259,-2.164857,4.202877,-0.829689,...,0.278215,-0.020964,0.612208,0.409184,-0.065916,-0.849527,-0.522935,-0.432912,0.193651,-0.311747


In [6]:
# Re-attach the gene label to every PCA-transformed guide profile and put it
# first, followed by the barcode and the principal-component columns.
# `gene_list` is in the same row order as the frame the PCA was fitted on.
gene_code_col = "Metadata_Foci_Barcode_MatchedTo_GeneCode"
df_guide_pca_updated = df_guide_pca.reset_index()
pca_feat_list = df_guide_pca_updated.columns.tolist()
feat_list = [gene_code_col, *pca_feat_list]
df_guide_pca_updated = df_guide_pca_updated.assign(**{gene_code_col: gene_list})[feat_list]
df_guide_pca_updated

Unnamed: 0,Metadata_Foci_Barcode_MatchedTo_GeneCode,Metadata_Foci_Barcode_MatchedTo_Barcode,0,1,2,3,4,5,6,7,...,95,96,97,98,99,100,101,102,103,104
0,AARS2,AAAGGCGGCCCTCACGGCCG,-9.927714,-10.363048,-16.919941,-1.808576,-7.302551,-1.720419,2.938468,0.189887,...,1.117789,-0.128193,-0.521531,-0.068410,0.090368,0.092639,-0.679940,0.337226,1.507687,-1.247910
1,AARS2,AGCAAACTGGGGTCGCCGCG,-10.934370,-13.483884,-1.659578,-2.412464,-1.750032,5.111465,5.397133,-16.177699,...,-0.401202,0.682820,1.640284,-0.028864,-1.188852,-0.662635,-0.799945,-0.574000,0.874268,-0.685046
2,AARS2,CCAACTTCTACGCAGAACAG,-5.445037,-14.466683,-2.802210,-3.422422,-4.993501,0.926324,5.675710,-13.280781,...,-0.128956,0.964017,0.585295,-1.412260,0.166829,-0.603197,-0.446179,-0.668894,0.760232,0.821056
3,AARS2,GCTGAGCCAGTTCAGAAGCA,-11.092848,-17.477347,6.022897,0.637782,-13.049446,0.519385,3.829102,-13.522351,...,0.597441,-1.247062,0.921299,-1.050499,0.407465,0.022244,0.197231,-1.236179,0.074084,-1.048374
4,AARSD1,ACCTCCGCTCCCAATCTACC,-11.051094,10.244638,-20.765960,-0.166493,1.615477,5.929733,1.355259,-2.164857,...,0.278215,-0.020964,0.612208,0.409184,-0.065916,-0.849527,-0.522935,-0.432912,0.193651,-0.311747
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2395,nontargeting,TAAGATCCGCGGGTGGCAAC,14.356203,-3.012670,-22.281876,-21.403339,-1.051677,-10.777645,10.687603,11.709133,...,0.181302,0.378459,0.295610,-0.859600,-0.628133,0.713893,1.779583,-0.510914,-0.090763,0.578216
2396,nontargeting,TCCCGGTTGGTGAACGATAC,-2.663293,-10.507725,-10.286200,-3.125135,-4.518898,-0.409850,2.087296,-4.621259,...,-0.327283,0.124244,-0.800804,-0.376625,0.010825,-0.235899,0.076506,0.371485,-0.480332,0.096770
2397,nontargeting,TGCCGTGAAAAGACGCTGCG,-11.186793,-13.967958,-5.836028,-1.251022,-3.981048,-0.359620,1.645743,0.391380,...,-0.172372,0.257802,0.233922,0.250029,0.447242,-0.203582,0.088387,0.145406,-0.274508,-0.096507
2398,nontargeting,TGGCCACGAATTCCGCCGCC,-13.317918,-13.688782,-5.300725,-5.817619,-3.539680,-2.202030,1.905486,-1.934071,...,0.748681,1.293496,-0.326951,-0.004736,-0.209038,-1.248334,0.750385,0.304225,0.344621,-0.588380


In [7]:
def calculate_mean_similarity(cosine_array):
    """Return the mean pairwise similarity between distinct guides.

    For every row of the square similarity matrix the self-similarity on the
    diagonal is removed and the remaining entries are averaged; the result is
    the mean of those per-row averages.

    Parameters
    ----------
    cosine_array : array-like of shape (n_guides, n_guides)
        Pairwise similarity matrix (e.g. the output of ``cosine_similarity``).
        Any number of guides is accepted; earlier versions only read the
        top-left 4x4 block.

    Returns
    -------
    float
        Mean off-diagonal similarity, or ``nan`` when fewer than two guides
        are present (no pair exists to average).
    """
    similarities = np.asarray(cosine_array, dtype=float)
    n_guides = similarities.shape[0]
    if n_guides < 2:
        return float("nan")
    # Subtract the actual diagonal instead of assuming it is exactly 1: an
    # all-zero profile has self-similarity 0, and float rounding can leave
    # values marginally off 1.
    off_diagonal_sums = similarities.sum(axis=1) - np.diag(similarities)
    per_guide_means = off_diagonal_sums / (n_guides - 1)
    return per_guide_means.mean()

# Mean within-gene guide similarity for every whole-cell hit gene
gene_code_col = "Metadata_Foci_Barcode_MatchedTo_GeneCode"
hit_guide_sim_list = []
for hit_gene in whole_cell_hit_list:
    # PCA profiles of this gene's guides only, indexed by gene label
    guide_profiles = (
        df_guide_pca_updated.loc[df_guide_pca_updated[gene_code_col] == hit_gene]
        .drop(columns=["Metadata_Foci_Barcode_MatchedTo_Barcode"])
        .set_index(gene_code_col)
    )
    hit_guide_sim_list.append(calculate_mean_similarity(cosine_similarity(guide_profiles)))

# NOTE: the value is a cosine *similarity* (higher = more alike), not a distance.
# The variable keeps its historical name so later cells that reference it still work.
average_cosine_distance = (
    sum(hit_guide_sim_list) / len(hit_guide_sim_list) if hit_guide_sim_list else float("nan")
)
print('Average cosine similarity: ',average_cosine_distance)

Average cosine distance:  0.6655476642245279


In [8]:
def cosine_to_df(df_temp, cosine_array, i):
    """Rank every profile by its similarity to the profile at row ``i``.

    Takes row ``i`` of ``cosine_array``, labels each entry with the
    corresponding index label of ``df_temp`` and returns a one-column
    DataFrame (``'cosine'``) sorted from most to least similar.
    """
    labels = list(df_temp.index)
    ranked = pd.DataFrame({'cosine': cosine_array[i]}, index=labels)
    return ranked.sort_values('cosine', ascending=False)

def ap_from_cosine_df(cosine_df, gene, n=10):
    """Average precision of retrieving ``gene``'s other guides among the top ``n`` neighbours.

    Parameters
    ----------
    cosine_df : pandas.DataFrame
        Profiles sorted from most to least similar to a query guide, indexed by
        gene label. The first row is taken to be the query guide itself and is
        skipped.
    gene : str
        Gene label that counts as a relevant result.
    n : int, default 10
        Number of ranked neighbours (after the query) to inspect. If fewer
        neighbours exist, all of them are used.

    Returns
    -------
    float
        Sum of precision-at-rank over the ranks where a relevant guide appears,
        divided by the number of relevant guides in the ranking. With four
        guides per gene the divisor is 3, as in the previously hard-coded
        version. Returns 0.0 when no other guide of ``gene`` is present.
    """
    # Relevance flags for every neighbour, query row (rank 0) excluded
    is_match = [1 if label == gene else 0 for label in cosine_df.index][1:]
    n_relevant = sum(is_match)
    if n_relevant == 0:
        return 0.0

    hits_so_far = 0
    precision_sum = 0.0
    # Slicing caps the scan at the available neighbours instead of raising IndexError
    for rank, hit in enumerate(is_match[:n], start=1):
        if hit:
            hits_so_far += 1
            precision_sum += hits_so_far / rank
    return precision_sum / n_relevant

def calculate_map(df_guide, gene):
    """Mean average precision of ``gene``'s guides against the nontargeting controls.

    Each guide of ``gene`` is used as a query in turn: all of the gene's guides
    plus the nontargeting guides are ranked by cosine similarity to it, an
    average precision is computed over the top 10 neighbours, and the per-guide
    values are averaged.

    Parameters
    ----------
    df_guide : pandas.DataFrame
        Guide-level profiles with the columns
        ``Metadata_Foci_Barcode_MatchedTo_GeneCode`` and
        ``Metadata_Foci_Barcode_MatchedTo_Barcode``; every other column is
        treated as a feature.
    gene : str
        Gene label to evaluate.

    Returns
    -------
    float
        Mean of the per-guide average precisions, or ``nan`` if ``gene`` has no
        guides in ``df_guide``.
    """
    gene_code_col = "Metadata_Foci_Barcode_MatchedTo_GeneCode"
    keep_rows = df_guide[gene_code_col].isin(["nontargeting", gene])
    df_temp = (
        df_guide.loc[keep_rows]
        .drop(['Metadata_Foci_Barcode_MatchedTo_Barcode'], axis=1)
        .set_index(gene_code_col)
    )
    cosine_array = cosine_similarity(df_temp)

    # Locate the gene's guides by label rather than assuming they are exactly
    # the first four rows: that assumption breaks when a gene has a different
    # number of guides or sorts after the nontargeting controls.
    guide_positions = np.flatnonzero(df_temp.index == gene)
    if guide_positions.size == 0:
        return float("nan")

    ap_list = [
        ap_from_cosine_df(cosine_to_df(df_temp, cosine_array, position), gene, 10)
        for position in guide_positions
    ]
    return np.mean(ap_list)

In [9]:
# Mean average precision across every targeted gene
genes_list = all_genes_list
map_list = [calculate_map(df_guide_pca_updated, gene) for gene in genes_list]
print(f'For all genes ({len(all_genes_list)} genes) the mAP values is',np.mean(map_list))

For all genes (590 genes) the mAP values is 0.545849699578513


In [10]:
# Mean average precision across the whole-cell hit genes
genes_list = whole_cell_hit_list
map_list = [calculate_map(df_guide_pca_updated, gene) for gene in genes_list]
print(f'For whole cell hits ({len(whole_cell_hit_list)} genes) the mAP values is',np.mean(map_list))

For whole cell hits (279 genes) the mAP values is 0.8213506997781191


In [11]:
# Mean average precision across the compartment-specific hit genes
genes_list = comp_spec_hit_list
map_list = [calculate_map(df_guide_pca_updated, gene) for gene in genes_list]
print(f'For compartment hits ({len(comp_spec_hit_list)} genes) the mAP values is',np.mean(map_list))

For compartment hits (110 genes) the mAP values is 0.43426196488696484


In [12]:
# Mean average precision across the genes that were not called as hits
genes_list = non_hit_list
map_list = [calculate_map(df_guide_pca_updated, gene) for gene in genes_list]
print(f'For non hits ({len(non_hit_list)} genes) the mAP values is',np.mean(map_list))

For non hits (201 genes) the mAP values is 0.22450577798836507
