In [1]:
import pandas as pd
import numpy as np
from numpy import dot
from numpy.linalg import norm
from sklearn.metrics.pairwise import cosine_similarity
import matplotlib.pyplot as plt
import seaborn as sns
from functools import reduce
from scipy.spatial import distance
import pathlib
import os
from sklearn.decomposition import PCA
import json
import random
from random import sample

# Seed Python's built-in `random` module so that draws made through it repeat across runs.
# NOTE(review): this call does not seed NumPy's global generator — confirm whether any
# later cell relies on NumPy-based randomness and needs its own seed.
random.seed(30)

In [2]:
# Folder that holds the input data
input_folder = "inputs"

# Folder that receives results; it is created the first time the notebook runs
output_folder = "outputs"
output_dir = pathlib.Path(output_folder)
if not output_dir.exists():
    output_dir.mkdir(parents=True, exist_ok=True)

In [3]:
# Import the guide-level profiles.
# Paths are built from `output_folder` (configured in the previous cell) instead of
# repeating the literal "outputs/" prefix, so relocating the folder needs one change.
guide_profile_path = os.path.join(
    output_folder,
    "20230601_6W_CP469_SABER_Pilot_M059K_guide_normalized_feature_select_median_merged_ALLBATCHES___ALLWELLS_cp_features.csv.gz",
)
df_guide = pd.read_csv(guide_profile_path)
# Subset the nontargeting guide profiles
df_nontargeting = df_guide.query("Metadata_Foci_Barcode_MatchedTo_GeneCode == 'nontargeting'")

# Load hits from the hit calling process
whole_cell_hits = pd.read_csv(
    os.path.join(output_folder, 'M059K_plate_level_median_per_feat_sig_genes_5_FDR_whole_cell_hits_32_controls.csv')
)
comp_spec_hits = pd.read_csv(
    os.path.join(output_folder, 'M059K_plate_level_median_per_feat_sig_genes_5_FDR_compartment_specific_hits_32_controls.csv')
)
all_hits = pd.concat([whole_cell_hits, comp_spec_hits])
whole_cell_hit_list = list(whole_cell_hits.Gene)
comp_spec_hit_list = list(comp_spec_hits.Gene)
# Same order as before: compartment-specific hits first, then whole-cell hits
hit_list = comp_spec_hit_list + whole_cell_hit_list

# List non-hit genes: every gene code in the guide table except the nontargeting
# controls, minus anything present in either hit list.
all_genes_list = [
    gene_code
    for gene_code in df_guide.Metadata_Foci_Barcode_MatchedTo_GeneCode.unique()
    if gene_code != "nontargeting"
]
# A set makes each membership check constant-time instead of scanning the hit list
hit_set = set(hit_list)
non_hit_list = [gene_code for gene_code in all_genes_list if gene_code not in hit_set]
print("All genes:",len(all_genes_list),"non-hit genes",len(non_hit_list),'Whole cell hits',len(whole_cell_hit_list),'Compartment hits',len(comp_spec_hit_list))


All genes: 590 non-hit genes 329 Whole cell hits 203 Compartment hits 58


In [4]:
# Remember the original column order and the per-row gene labels, then rebuild the
# table through a barcode-index round trip with the columns back in that order.
features = list(df_guide.columns)
gene_list = list(df_guide.Metadata_Foci_Barcode_MatchedTo_GeneCode)
df_temp = (
    df_guide.copy(deep=True)
    .drop('Metadata_Foci_Barcode_MatchedTo_GeneCode', axis=1)
    .set_index('Metadata_Foci_Barcode_MatchedTo_Barcode')
    .reset_index()
    .assign(Metadata_Foci_Barcode_MatchedTo_GeneCode=gene_list)
    [features]
)
df_temp

Unnamed: 0,Metadata_Foci_Barcode_MatchedTo_GeneCode,Metadata_Foci_Barcode_MatchedTo_Barcode,Cells_AreaShape_CentralMoment_0_1,Cells_AreaShape_CentralMoment_0_3,Cells_AreaShape_CentralMoment_1_0,Cells_AreaShape_CentralMoment_1_1,Cells_AreaShape_CentralMoment_1_2,Cells_AreaShape_CentralMoment_1_3,Cells_AreaShape_CentralMoment_2_1,Cells_AreaShape_CentralMoment_2_2,...,Nuclei_Texture_SumVariance_Calnexin_20_03_256,Nuclei_Texture_SumVariance_Calnexin_5_02_256,Nuclei_Texture_SumVariance_DNA_10_01_256,Nuclei_Texture_SumVariance_DNA_20_01_256,Nuclei_Texture_SumVariance_DNA_20_03_256,Nuclei_Texture_SumVariance_GM130_20_01_256,Nuclei_Texture_SumVariance_GM130_5_00_256,Nuclei_Texture_SumVariance_Phalloidin_10_02_256,Nuclei_Texture_SumVariance_Phalloidin_20_01_256,Nuclei_Texture_Variance_Calnexin_20_00_256
0,AARS2,AAAGGCGGCCCTCACGGCCG,0.033501,-0.021708,-0.446427,-0.052054,-0.106097,-0.035181,-0.024508,0.101165,...,-0.283260,1.223165,-0.861290,0.045650,-0.058530,-0.210587,-0.240745,0.081160,-0.199070,-0.199375
1,AARS2,AGCAAACTGGGGTCGCCGCG,0.098789,-0.041051,0.013143,0.027802,0.000183,0.006994,0.014238,-0.307540,...,-0.479160,0.042645,0.355875,-0.529805,-0.475970,-0.619375,-0.188795,-0.034064,-0.397135,-0.332470
2,AARS2,CCAACTTCTACGCAGAACAG,0.080462,-0.007381,-0.019076,0.096305,0.049998,0.033392,0.008300,-0.290030,...,-0.479160,-0.268070,-0.216838,-0.529805,-0.486250,-0.619375,-0.009330,-0.114787,-0.397135,-0.447315
3,AARS2,GCTGAGCCAGTTCAGAAGCA,-0.115877,-0.095393,-0.234975,-0.022434,0.000537,-0.001022,-0.056283,-0.161884,...,-0.381090,-0.364134,-0.294380,-0.467000,-0.367030,-0.376080,-0.722730,-0.246800,-0.288600,-0.447315
4,AARSD1,ACCTCCGCTCCCAATCTACC,0.106199,-0.002299,0.005447,-0.032071,0.010073,0.005625,0.008212,0.558170,...,1.102550,0.325770,0.798840,1.244500,1.149600,1.139140,0.281992,-0.006721,0.633935,0.934645
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2395,nontargeting,TAAGATCCGCGGGTGGCAAC,-0.093410,-0.006135,-0.025522,0.018110,0.037509,-0.003599,-0.075485,0.138834,...,-0.094215,-0.086080,0.050147,0.186640,-0.079335,0.047555,-0.066887,0.027325,-0.005835,-0.128360
2396,nontargeting,TCCCGGTTGGTGAACGATAC,-0.074922,-0.016022,0.008407,0.011823,-0.005692,0.014658,0.006559,-0.107477,...,-0.166074,0.527320,0.254985,-0.427635,-0.367030,-0.376080,0.019385,-0.172278,-0.281090,-0.218643
2397,nontargeting,TGCCGTGAAAAGACGCTGCG,-0.197905,-0.024100,0.051483,0.007274,0.017820,0.015592,-0.001664,-0.244840,...,-0.286660,0.017660,-0.006520,-0.358925,-0.279415,0.139215,-0.016525,0.031936,-0.157765,-0.302650
2398,nontargeting,TGGCCACGAATTCCGCCGCC,0.083609,0.003866,-0.022012,0.139262,0.010856,0.014981,-0.107854,-0.147185,...,-0.366595,-0.088162,0.008927,-0.427635,-0.350825,-0.376080,0.215435,-0.196010,-0.281090,-0.283045


In [5]:
# Feature matrix for PCA: drop the gene label and index the rows by guide barcode.
df_temp = df_guide.drop('Metadata_Foci_Barcode_MatchedTo_GeneCode',axis=1).set_index('Metadata_Foci_Barcode_MatchedTo_Barcode')

# Fraction of total variance the retained principal components should explain.
TARGET_VARIANCE = 0.9
# Seed handed to the reduced PCA below. scikit-learn may pick its randomized SVD
# solver automatically when `n_components` is set; a fixed seed keeps reruns identical.
PCA_RANDOM_STATE = 30

# Fit a full PCA on all guide profiles to get the explained-variance ratio per component.
pca = PCA()
pca.fit(df_temp)
cumulative_variance = np.cumsum(pca.explained_variance_ratio_)
# Number of leading components whose cumulative explained variance lies closest to the
# target. Computed with argmin so the name `distance` (imported from scipy.spatial at
# the top of the notebook) is no longer overwritten by a loop variable.
component = int(np.argmin(np.abs(cumulative_variance - TARGET_VARIANCE))) + 1
print (f'Principal component representing closest to {TARGET_VARIANCE:.0%} variation is {component}')
# Refit keeping only that many components and project every guide onto them.
pca = PCA(n_components=component, random_state=PCA_RANDOM_STATE)
df_guide_pca = pd.DataFrame(pca.fit_transform(df_temp),index=df_temp.index)
df_guide_pca.head()

Principal component representing closest to 90% variation is 465


Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,455,456,457,458,459,460,461,462,463,464
Metadata_Foci_Barcode_MatchedTo_Barcode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AAAGGCGGCCCTCACGGCCG,-12.611862,11.568792,-0.629058,-6.342505,1.750918,-4.059962,-8.323564,-6.724534,5.0196,2.663829,...,-0.539952,0.237602,-0.103936,-0.983244,0.374232,-0.300048,-0.457735,-0.278632,-0.468465,0.116052
AGCAAACTGGGGTCGCCGCG,-7.311119,5.791851,-1.729677,-0.028421,-1.532662,1.726794,-0.418859,0.421092,-2.371502,-0.574886,...,-0.15082,-0.13429,0.171571,0.340987,0.115243,-0.050269,-0.267945,0.088411,-0.224443,0.14657
CCAACTTCTACGCAGAACAG,-1.403723,5.652499,-4.846151,-2.463511,3.872789,3.556736,0.105803,-0.91331,-2.026795,-0.982583,...,-0.15223,-0.03543,-0.152973,0.127581,0.292581,0.053375,0.43763,0.124756,-0.232952,0.178174
GCTGAGCCAGTTCAGAAGCA,10.999768,4.787027,-0.414024,-4.655487,-3.419273,-0.156374,6.183178,-1.849355,-0.695506,-0.246808,...,-0.308717,0.848416,-0.231705,-0.541361,-0.953342,0.460787,0.856216,0.187324,0.03228,0.227981
ACCTCCGCTCCCAATCTACC,-12.042125,-19.339157,5.109059,3.052993,-1.754634,-0.21668,-1.97668,-1.584205,0.385365,-0.740254,...,-0.084464,0.024137,-0.240069,0.060665,0.074922,-0.031983,-0.063801,-0.230667,-0.033338,0.08476


In [6]:
# Turn the barcode index back into a column, then put the gene label in front of the
# barcode and principal-component columns.
pca_with_barcode = df_guide_pca.reset_index()
pca_feat_list = list(pca_with_barcode.columns)
feat_list = ['Metadata_Foci_Barcode_MatchedTo_GeneCode', *pca_feat_list]
df_guide_pca_updated = pca_with_barcode.assign(
    Metadata_Foci_Barcode_MatchedTo_GeneCode=gene_list
)[feat_list]
df_guide_pca_updated

Unnamed: 0,Metadata_Foci_Barcode_MatchedTo_GeneCode,Metadata_Foci_Barcode_MatchedTo_Barcode,0,1,2,3,4,5,6,7,...,455,456,457,458,459,460,461,462,463,464
0,AARS2,AAAGGCGGCCCTCACGGCCG,-12.611862,11.568792,-0.629058,-6.342505,1.750918,-4.059962,-8.323564,-6.724534,...,-0.539952,0.237602,-0.103936,-0.983244,0.374232,-0.300048,-0.457735,-0.278632,-0.468465,0.116052
1,AARS2,AGCAAACTGGGGTCGCCGCG,-7.311119,5.791851,-1.729677,-0.028421,-1.532662,1.726794,-0.418859,0.421092,...,-0.150820,-0.134290,0.171571,0.340987,0.115243,-0.050269,-0.267945,0.088411,-0.224443,0.146570
2,AARS2,CCAACTTCTACGCAGAACAG,-1.403723,5.652499,-4.846151,-2.463511,3.872789,3.556736,0.105803,-0.913310,...,-0.152230,-0.035430,-0.152973,0.127581,0.292581,0.053375,0.437630,0.124756,-0.232952,0.178174
3,AARS2,GCTGAGCCAGTTCAGAAGCA,10.999768,4.787027,-0.414024,-4.655487,-3.419273,-0.156374,6.183178,-1.849355,...,-0.308717,0.848416,-0.231705,-0.541361,-0.953342,0.460787,0.856216,0.187324,0.032280,0.227981
4,AARSD1,ACCTCCGCTCCCAATCTACC,-12.042125,-19.339157,5.109059,3.052993,-1.754634,-0.216680,-1.976680,-1.584205,...,-0.084464,0.024137,-0.240069,0.060665,0.074922,-0.031983,-0.063801,-0.230667,-0.033338,0.084760
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2395,nontargeting,TAAGATCCGCGGGTGGCAAC,-2.046619,-8.757570,-1.480610,0.105172,-4.646038,-0.889067,-1.479750,-1.407919,...,-0.040044,0.106140,-0.078792,0.207532,-0.065128,-0.130121,0.083579,-0.095784,-0.007244,0.393247
2396,nontargeting,TCCCGGTTGGTGAACGATAC,-10.038533,-1.072210,-3.895410,1.591304,0.664179,0.219409,-1.842280,-0.461682,...,-0.173059,-0.353899,-0.281638,0.183630,-0.120827,0.012593,-0.256977,-0.105605,0.091732,0.058836
2397,nontargeting,TGCCGTGAAAAGACGCTGCG,-3.309986,4.671463,3.814311,1.240977,-0.504655,-0.843307,1.553987,-1.919658,...,-0.448559,0.004717,-0.429402,0.109379,0.013776,-0.060127,-0.040784,0.256541,-0.644680,0.492879
2398,nontargeting,TGGCCACGAATTCCGCCGCC,0.971170,-0.397247,-5.989617,2.164388,1.332756,-0.461387,1.649887,-0.369219,...,0.500754,-0.291181,-0.239102,0.015498,0.151755,0.465061,-0.240735,0.043024,-0.445287,-0.341445


In [7]:
def calculate_mean_similarity(cosine_array):
    """Return the mean off-diagonal similarity of a square similarity matrix.

    For each row, the self-similarity (taken to be exactly 1, as on the diagonal of a
    cosine-similarity matrix) is subtracted from the row sum and the remainder is
    averaged over the other entries; the per-row means are then averaged.

    The number of guides is read from the matrix itself rather than fixed at 4, so
    genes with any number (>= 2) of guides are handled. Results for 4x4 input are
    unchanged.

    Parameters
    ----------
    cosine_array : array-like of shape (n, n)
        Pairwise similarities between the n guides of one gene.

    Returns
    -------
    float
        Mean pairwise similarity, excluding each guide's similarity with itself.

    Raises
    ------
    ValueError
        If the input is not a square 2-D matrix or has fewer than two rows.
    """
    similarities = np.asarray(cosine_array, dtype=float)
    if similarities.ndim != 2 or similarities.shape[0] != similarities.shape[1]:
        raise ValueError(
            f"cosine_array must be a square 2-D matrix, got shape {similarities.shape}"
        )
    n_guides = similarities.shape[0]
    if n_guides < 2:
        raise ValueError("at least two guides are required to compute a pairwise mean")

    # Row sum minus the self-similarity of 1, averaged over the remaining n - 1 guides
    row_means = (similarities.sum(axis=1) - 1) / (n_guides - 1)
    return float(row_means.mean())

# Mean pairwise cosine similarity among the guides of each whole-cell hit gene,
# computed in PCA space, then averaged across genes.
hit_guide_sim_list = []
for gene in whole_cell_hit_list:
    df_temp = df_guide_pca_updated.query("Metadata_Foci_Barcode_MatchedTo_GeneCode == @gene")
    df_temp = df_temp.drop(['Metadata_Foci_Barcode_MatchedTo_Barcode'],axis=1)
    df_temp = df_temp.set_index("Metadata_Foci_Barcode_MatchedTo_GeneCode")
    cosine_array = cosine_similarity(df_temp)
    hit_guide_sim_list.append(calculate_mean_similarity(cosine_array))

average_cosine_similarity = sum(hit_guide_sim_list)/len(hit_guide_sim_list)
# The value is a similarity (higher = more alike), not a distance; the old variable
# name is kept as an alias in case other cells still read it.
average_cosine_distance = average_cosine_similarity
print('Average cosine similarity: ',average_cosine_similarity)

Average cosine distance:  0.15758121043312331


In [8]:
def cosine_to_grade(df_temp, cosine_array, i, gene=None, top_k=4):
    """Score how many of the nearest neighbours of one guide belong to its gene.

    Parameters
    ----------
    df_temp : pandas.DataFrame
        Profiles whose index holds the gene label of every row.
    cosine_array : array-like
        Pairwise similarity matrix for the rows of ``df_temp``; row ``i`` is used.
    i : int
        Position of the query guide.
    gene : optional
        Gene label to count among the neighbours. Defaults to the label of row
        ``i``. Previously this was read from a notebook-level global variable, so
        the function only worked when a ``gene`` variable happened to be set.
    top_k : int, default 4
        How many of the most similar rows to inspect, the query row included.

    Returns
    -------
    float
        ``(matches - 1) / (top_k - 1)``, where ``matches`` is the number of rows
        labelled ``gene`` among the ``top_k`` most similar rows. The ``- 1``
        discounts the query guide itself, which is expected to be its own best match.

    Raises
    ------
    ValueError
        If ``top_k`` is smaller than 2.
    """
    if top_k < 2:
        raise ValueError("top_k must be at least 2")
    if gene is None:
        gene = df_temp.index[i]

    ranked = pd.DataFrame({'cosine': cosine_array[i]}, index=list(df_temp.index))
    ranked = ranked.sort_values('cosine', ascending=False)
    nearest_labels = list(ranked.iloc[0:top_k].index)
    return (nearest_labels.count(gene) - 1) / (top_k - 1)


def grade_single_gene(df_guide, gene):
    """Average the neighbour score of every guide of ``gene`` against the controls.

    The guides of ``gene`` are pooled with the nontargeting guides, pairwise cosine
    similarities are computed, and each guide of the gene is scored with
    ``cosine_to_grade``; the scores are then averaged.

    Parameters
    ----------
    df_guide : pandas.DataFrame
        Guide-level table with the columns
        ``Metadata_Foci_Barcode_MatchedTo_GeneCode`` and
        ``Metadata_Foci_Barcode_MatchedTo_Barcode``; all other columns are used as
        features.
    gene
        Gene label to evaluate.

    Returns
    -------
    float
        Mean score across the gene's guides.

    Raises
    ------
    ValueError
        If fewer than two guides of ``gene`` are present.
    """
    df_temp = df_guide.query("Metadata_Foci_Barcode_MatchedTo_GeneCode == 'nontargeting' | Metadata_Foci_Barcode_MatchedTo_GeneCode == @gene")
    df_temp = df_temp.drop(['Metadata_Foci_Barcode_MatchedTo_Barcode'],axis=1)
    df_temp = df_temp.set_index("Metadata_Foci_Barcode_MatchedTo_GeneCode")
    cosine_array = cosine_similarity(df_temp)

    # Locate the gene's guides by label instead of assuming they are the first four
    # rows of the pooled table.
    guide_rows = [pos for pos, label in enumerate(df_temp.index) if label == gene]
    n_guides = len(guide_rows)
    if n_guides < 2:
        raise ValueError(f"gene {gene!r} needs at least two guides, found {n_guides}")

    grade_total = 0
    for row in guide_rows:
        grade_total += cosine_to_grade(df_temp, cosine_array, row, gene, top_k=n_guides)
    mAP = grade_total / n_guides
    return mAP

# Score every whole-cell hit gene, then report the mean score across those genes.
grade_list = []
for gene in whole_cell_hit_list:
    print(f"Calculating mean average precision for gene: {gene}")
    grade_list.append(grade_single_gene(df_guide_pca_updated, gene))

sum(grade_list)/len(grade_list)

Calculating mean average precision for gene: ABCF1
Calculating mean average precision for gene: ACTR2
Calculating mean average precision for gene: ACTR3
Calculating mean average precision for gene: ALG8
Calculating mean average precision for gene: ANAPC10
Calculating mean average precision for gene: APEX1
Calculating mean average precision for gene: ARF4
Calculating mean average precision for gene: ARNTL
Calculating mean average precision for gene: ARPC2
Calculating mean average precision for gene: ARPC3
Calculating mean average precision for gene: ARPC4
Calculating mean average precision for gene: ATF4
Calculating mean average precision for gene: ATIC
Calculating mean average precision for gene: ATP6V0A1
Calculating mean average precision for gene: ATP6V0B
Calculating mean average precision for gene: ATP6V0C
Calculating mean average precision for gene: ATP6V0D1
Calculating mean average precision for gene: ATP6V1H
Calculating mean average precision for gene: BAK1
Calculating mean avera

0.2549261083743842

In [9]:
# Score every compartment-specific hit gene, then report the mean score across those genes.
grade_list = []
for gene in comp_spec_hit_list:
    print(f"Calculating mean average precision for gene: {gene}")
    grade_list.append(grade_single_gene(df_guide_pca_updated, gene))

sum(grade_list)/len(grade_list)

Calculating mean average precision for gene: ALG5
Calculating mean average precision for gene: ATL2
Calculating mean average precision for gene: CBX3
Calculating mean average precision for gene: CLGN
Calculating mean average precision for gene: COA3
Calculating mean average precision for gene: CRKL
Calculating mean average precision for gene: DDA1
Calculating mean average precision for gene: DHX35
Calculating mean average precision for gene: DVL1
Calculating mean average precision for gene: DYNLL1
Calculating mean average precision for gene: EIF3C
Calculating mean average precision for gene: EIF4A2
Calculating mean average precision for gene: ETFB
Calculating mean average precision for gene: EXOC3
Calculating mean average precision for gene: EXOSC3
Calculating mean average precision for gene: GAPVD1
Calculating mean average precision for gene: GSR
Calculating mean average precision for gene: HAUS8
Calculating mean average precision for gene: HYOU1
Calculating mean average precision for

0.125

In [10]:
# Score every non-hit gene, then report the mean score across those genes.
grade_list = []
for gene in non_hit_list:
    print(f"Calculating mean average precision for gene: {gene}")
    grade_list.append(grade_single_gene(df_guide_pca_updated, gene))

sum(grade_list)/len(grade_list)

Calculating mean average precision for gene: AARS2
Calculating mean average precision for gene: AARSD1
Calculating mean average precision for gene: ABLIM1
Calculating mean average precision for gene: ADAR
Calculating mean average precision for gene: AIMP1
Calculating mean average precision for gene: AIMP2
Calculating mean average precision for gene: ALG13
Calculating mean average precision for gene: ALG6
Calculating mean average precision for gene: ALKBH7
Calculating mean average precision for gene: ANKRD13A
Calculating mean average precision for gene: ANXA2
Calculating mean average precision for gene: AP3S1
Calculating mean average precision for gene: ARFGAP3
Calculating mean average precision for gene: ARPC1A
Calculating mean average precision for gene: ARPC1B
Calculating mean average precision for gene: ARPC5
Calculating mean average precision for gene: ARPC5L
Calculating mean average precision for gene: ATF3
Calculating mean average precision for gene: ATG14
Calculating mean averag

0.06864235055724417