In [1]:
import pandas as pd
import numpy as np
from numpy import dot
from numpy.linalg import norm
from sklearn.metrics.pairwise import cosine_similarity
import matplotlib.pyplot as plt
import seaborn as sns
from functools import reduce
from scipy.spatial import distance
import pathlib
import os
from sklearn.decomposition import PCA
import json
import random
from random import sample

random.seed(30)

In [2]:
# Set data input folder
input_folder = "inputs"

# Set output folder, subfolder
output_folder = "outputs"
if not os.path.exists(output_folder):
    os.makedirs(output_folder, exist_ok=True)

In [3]:
# Import the guide-level profiles
df_guide = pd.read_csv("outputs/20230601_6W_CP469_SABER_Pilot_M059K_guide_normalized_feature_select_median_merged_ALLBATCHES___ALLWELLS_cp_features.csv.gz")
# Subset the nontargeting guide profiles 
df_nontargeting = df_guide.query("Metadata_Foci_Barcode_MatchedTo_GeneCode == 'nontargeting'")

# Load hits from the hit calling process
whole_cell_hits = pd.read_csv('outputs/M059K_plate_level_median_per_feat_sig_genes_5_FDR_whole_cell_hits_32_controls.csv')
comp_spec_hits = pd.read_csv('outputs/M059K_plate_level_median_per_feat_sig_genes_5_FDR_compartment_specific_hits_32_controls.csv')
all_hits = pd.concat([whole_cell_hits,comp_spec_hits])
hit_list = list(comp_spec_hits.Gene) + list(whole_cell_hits.Gene)
whole_cell_hit_list = list(whole_cell_hits.Gene)
comp_spec_hit_list = list(comp_spec_hits.Gene)

# list non hit genes
all_genes_list = list(df_guide.Metadata_Foci_Barcode_MatchedTo_GeneCode.unique())
all_genes_list.remove("nontargeting")
non_hit_list = [gene for gene in all_genes_list if gene not in hit_list]
print("All genes:",len(all_genes_list),"non-hit genes",len(non_hit_list),'Whole cell hits',len(whole_cell_hit_list),'Compartment hits',len(comp_spec_hit_list))


All genes: 590 non-hit genes 329 Whole cell hits 203 Compartment hits 58


In [4]:
def calculate_mean_similarity(cosine_array):
    total_sum = 0
    for i in range(4):
        similarities = cosine_array[i]
        target_sum = 0
        for j in range(4):
            target_sum += similarities[j]  
        target_mean = ((target_sum-1)/3)
        total_sum += target_mean

    total_mean = total_sum / 4
    return total_mean

hit_guide_sim_list =[]
for i in range(len(whole_cell_hit_list)):
    df_temp = df_guide.query("Metadata_Foci_Barcode_MatchedTo_GeneCode == @whole_cell_hit_list[@i]")
    df_temp = df_temp.drop(['Metadata_Foci_Barcode_MatchedTo_Barcode'],axis=1)
    df_temp = df_temp.set_index("Metadata_Foci_Barcode_MatchedTo_GeneCode")
    cosine_array = cosine_similarity(df_temp)
    hit_guide_sim = calculate_mean_similarity(cosine_array)
    hit_guide_sim_list.append(hit_guide_sim)
average_cosine_distance = sum(hit_guide_sim_list)/len(hit_guide_sim_list)
print('Average cosine distance: ',average_cosine_distance)

Average cosine distance:  0.1499613576630236


In [5]:
df_temp = df_guide.query("Metadata_Foci_Barcode_MatchedTo_GeneCode == @whole_cell_hit_list[130]")
df_temp = df_temp.drop(['Metadata_Foci_Barcode_MatchedTo_Barcode'],axis=1)
df_temp = df_temp.set_index("Metadata_Foci_Barcode_MatchedTo_GeneCode")
cosine_array = cosine_similarity(df_temp)
print(whole_cell_hit_list[5])
cosine_array[0]

APEX1


array([ 1.        , -0.03879113, -0.04860413,  0.06872586])

In [6]:
def cosine_to_grade(df_temp, cosine_array, i):
    cosine_list = cosine_array[i]
    gene_list = list(df_temp.index)
    cosine_df = pd.DataFrame(index=gene_list)
    cosine_df['cosine'] = cosine_list
    cosine_df = cosine_df.sort_values('cosine',ascending=False)
    grade_list = list(cosine_df.iloc[0:4].index)
    grade = (grade_list.count(gene)-1)/3
    return grade
    
def grade_single_gene(df_guide, gene):
    df_temp = df_guide.query("Metadata_Foci_Barcode_MatchedTo_GeneCode == 'nontargeting' | Metadata_Foci_Barcode_MatchedTo_GeneCode == @gene")
    df_temp = df_temp.drop(['Metadata_Foci_Barcode_MatchedTo_Barcode'],axis=1)
    df_temp = df_temp.set_index("Metadata_Foci_Barcode_MatchedTo_GeneCode")
    cosine_array = cosine_similarity(df_temp)
    #grade = cosine_to_grade(df_temp, cosine_array)
    grade_total = 0
    for i in range(4):
        grade = cosine_to_grade(df_temp, cosine_array, i)
        grade_total+=grade
    mAP = grade_total/4
    return mAP

grade_list = []
for i in range(len(whole_cell_hit_list)):
    gene = whole_cell_hit_list[i]
    print(f"Calculating mean average precision for gene: {gene}")
    gene_grade = grade_single_gene(df_guide, gene)
    grade_list.append(gene_grade)
    
sum(grade_list)/len(grade_list)

Calculating mean average precision for gene: ABCF1
Calculating mean average precision for gene: ACTR2
Calculating mean average precision for gene: ACTR3
Calculating mean average precision for gene: ALG8
Calculating mean average precision for gene: ANAPC10
Calculating mean average precision for gene: APEX1
Calculating mean average precision for gene: ARF4
Calculating mean average precision for gene: ARNTL
Calculating mean average precision for gene: ARPC2
Calculating mean average precision for gene: ARPC3
Calculating mean average precision for gene: ARPC4
Calculating mean average precision for gene: ATF4
Calculating mean average precision for gene: ATIC
Calculating mean average precision for gene: ATP6V0A1
Calculating mean average precision for gene: ATP6V0B
Calculating mean average precision for gene: ATP6V0C
Calculating mean average precision for gene: ATP6V0D1
Calculating mean average precision for gene: ATP6V1H
Calculating mean average precision for gene: BAK1
Calculating mean avera

0.251231527093596

In [7]:
grade_list = []
for i in range(len(comp_spec_hit_list)):
    gene = comp_spec_hit_list[i]
    print(f"Calculating mean average precision for gene: {gene}")
    gene_grade = grade_single_gene(df_guide, gene)
    grade_list.append(gene_grade)
    
sum(grade_list)/len(grade_list)

Calculating mean average precision for gene: ALG5
Calculating mean average precision for gene: ATL2
Calculating mean average precision for gene: CBX3
Calculating mean average precision for gene: CLGN
Calculating mean average precision for gene: COA3
Calculating mean average precision for gene: CRKL
Calculating mean average precision for gene: DDA1
Calculating mean average precision for gene: DHX35
Calculating mean average precision for gene: DVL1
Calculating mean average precision for gene: DYNLL1
Calculating mean average precision for gene: EIF3C
Calculating mean average precision for gene: EIF4A2
Calculating mean average precision for gene: ETFB
Calculating mean average precision for gene: EXOC3
Calculating mean average precision for gene: EXOSC3
Calculating mean average precision for gene: GAPVD1
Calculating mean average precision for gene: GSR
Calculating mean average precision for gene: HAUS8
Calculating mean average precision for gene: HYOU1
Calculating mean average precision for

0.1235632183908046

In [8]:
grade_list = []
for i in range(len(non_hit_list)):
    gene = non_hit_list[i]
    print(f"Calculating mean average precision for gene: {gene}")
    gene_grade = grade_single_gene(df_guide, gene)
    grade_list.append(gene_grade)
    
sum(grade_list)/len(grade_list)

Calculating mean average precision for gene: AARS2
Calculating mean average precision for gene: AARSD1
Calculating mean average precision for gene: ABLIM1
Calculating mean average precision for gene: ADAR
Calculating mean average precision for gene: AIMP1
Calculating mean average precision for gene: AIMP2
Calculating mean average precision for gene: ALG13
Calculating mean average precision for gene: ALG6
Calculating mean average precision for gene: ALKBH7
Calculating mean average precision for gene: ANKRD13A
Calculating mean average precision for gene: ANXA2
Calculating mean average precision for gene: AP3S1
Calculating mean average precision for gene: ARFGAP3
Calculating mean average precision for gene: ARPC1A
Calculating mean average precision for gene: ARPC1B
Calculating mean average precision for gene: ARPC5
Calculating mean average precision for gene: ARPC5L
Calculating mean average precision for gene: ATF3
Calculating mean average precision for gene: ATG14
Calculating mean averag

0.0658561296859169