In [1]:
### If you have a new dataset and want to use the knowledge kernels, use this notebook to rebuild the kernels for that dataset.
### This example processes the genome-wide K562 dataset.
### First, download the knowledge embeddings from: https://drive.google.com/file/d/16p9sQYkhpM-PBAcNWQdjL9pxDZ46krQX/view?usp=drive_link

import sys
sys.path.append('../GEARS/')

path_to_emb = '/home/huangk28/scratch/knowledge_kernels/'

from gears import PertData
# Create the GEARS PertData handle rooted at the local data folder, then load
# the replogle_k562_gw_1000hvg dataset from inside that folder.
pert_data = PertData(
    '/home/huangk28/scratch/perturb_seq_data/gears_data/'
)
pert_data.load(
    data_path='/home/huangk28/scratch/perturb_seq_data/gears_data/replogle_k562_gw_1000hvg'
)

Found local copy...
Found local copy...
These perturbations are not in the GO graph and their perturbation can thus not be predicted
['TMA7+ctrl' 'CCDC169+ctrl' 'NEDD8-MDP1+ctrl' 'FAM117A+ctrl'
 'CCDC138+ctrl' 'C14orf178+ctrl' 'GOLGA6L1+ctrl' 'AHSA2+ctrl'
 'C1orf61+ctrl' 'C12orf76+ctrl' 'C6orf52+ctrl' 'KRTAP4-7+ctrl'
 'WDR89+ctrl' 'AC015871.1+ctrl' 'C16orf95+ctrl' 'C21orf58+ctrl'
 'C7orf26+ctrl' 'CENPBD1+ctrl' 'UBALD2+ctrl' 'FAM104B+ctrl' 'ZBED6CL+ctrl'
 'RTL8C+ctrl' 'RBM14-RBM4+ctrl' 'SZRD1+ctrl' 'CCDC71+ctrl' 'C8orf82+ctrl'
 'CCDC18+ctrl' 'OXLD1+ctrl' 'C4orf36+ctrl' 'RPL17-C18orf32+ctrl'
 'CSAG1+ctrl' 'C19orf53+ctrl' 'FAM131A+ctrl' 'C20orf96+ctrl'
 'CCDC144NL+ctrl' 'WDR53+ctrl' 'LRRC42+ctrl' 'FAM160B2+ctrl'
 'C19orf81+ctrl' 'OAF+ctrl' 'TTC13+ctrl' 'NME1-NME2+ctrl' 'TMEM99+ctrl'
 'ARPC4-TTLL3+ctrl' 'C19orf54+ctrl' 'PBDC1+ctrl' 'FAM89A+ctrl'
 'CCDC97+ctrl' 'ST20-MTHFS+ctrl' 'PROSER1+ctrl' 'IER5L+ctrl' 'PNMA8A+ctrl'
 'C11orf96+ctrl' 'C6orf62+ctrl' 'ERV3-1+ctrl' 'C5orf34+ctrl'
 'CCDC169-

batch info is available!


Done!


In [2]:
# AnnData object held by the loaded PertData instance.
adata = pert_data.adata
# Keep only the part of each condition label that precedes the first '+'
# (e.g. 'TMA7+ctrl' -> 'TMA7').
unique_perts = [cond.partition('+')[0] for cond in adata.obs.condition.unique()]

In [4]:
import scanpy as sc
from biothings_client import get_client
# BioThings client for the 'gene' service; used later to look up gene symbols.
mg = get_client('gene')
# Second copy of the dataset that carries a 'gene_id' column in .obs.
adata_2 = sc.read_h5ad(
    '/home/huangk28/scratch/perturb_seq_data/ReplogleWeissman2022_K562_gwps_processed_hvg1000.h5ad'
)
# Map each condition label to its gene_id (later rows overwrite earlier ones).
cond2gene_id = {
    cond: gene_id
    for cond, gene_id in adata_2.obs[['condition', 'gene_id']].values
}

In [5]:
import pickle
import os
def save_kernel(folder_name, pert_list, kernel_npy, feat,
                kernel_path='/home/huangk28/scratch/knowledge_kernels_gw/'):
    """Pickle one kernel and its metadata into ``<kernel_path>/<folder_name>/``.

    Three files are written (existing files are overwritten):
      * ``pert_list.pkl`` - ``pert_list``
      * ``kernel.pkl``    - ``kernel_npy``
      * ``feat.pkl``      - ``feat``

    Args:
        folder_name: Name of the sub-folder to write into.
        pert_list: Object pickled to ``pert_list.pkl``.
        kernel_npy: Object pickled to ``kernel.pkl``.
        feat: Object pickled to ``feat.pkl``.
        kernel_path: Root directory for all kernels. Defaults to the location
            the notebook has always used, so existing calls are unaffected.
    """
    out_dir = os.path.join(kernel_path, folder_name)
    # makedirs(exist_ok=True) also creates a missing root directory and avoids
    # the check-then-create race of os.path.exists + os.mkdir.
    os.makedirs(out_dir, exist_ok=True)
    artifacts = {
        'pert_list.pkl': pert_list,
        'kernel.pkl': kernel_npy,
        'feat.pkl': feat,
    }
    for file_name, payload in artifacts.items():
        with open(os.path.join(out_dir, file_name), 'wb') as f:
            pickle.dump(payload, f)
    


In [None]:
## Ground-truth ("gold label") kernel: an upper bound built from the observed perturbation effects

In [6]:
# Column-wise mean expression over the cells whose condition is 'ctrl'.
ctrl_effect = adata[adata.obs['condition'] == 'ctrl'].X.mean(axis=0)
# Column-wise mean expression over all cells whose 'gene' label is not 'ctrl'.
# NOTE(review): this filter uses obs['gene'] while the line above uses
# obs['condition'] - confirm both columns mark control cells with 'ctrl'.
mean_effect = adata[adata.obs['gene'] != 'ctrl'].X.mean(axis=0)

In [7]:
from tqdm import tqdm
# Per-condition mean profile, plus the same profile expressed relative to the
# control mean and relative to the mean over all perturbed cells.
pert2effect, pert2effect_delta, pert2effect_delta_mean_pert = {}, {}, {}

for condition in tqdm(adata.obs.condition.unique()):
    cells_in_condition = adata.obs.condition == condition
    condition_mean = adata[cells_in_condition].X.mean(axis=0)
    pert2effect[condition] = condition_mean
    pert2effect_delta[condition] = condition_mean - ctrl_effect
    pert2effect_delta_mean_pert[condition] = condition_mean - mean_effect

100%|██████████| 9748/9748 [01:25<00:00, 113.66it/s]


In [8]:
import numpy as np
import pandas as pd
# One row per condition (index = condition label), holding its delta-vs-control profile.
df = pd.DataFrame(
    data=np.stack([delta for delta in pert2effect_delta.values()]),
    index=[condition for condition in pert2effect_delta],
)

In [9]:
# Map the gene part of each condition label to that condition's row of df.
id2emb = {
    condition.split('+')[0]: row
    for condition, row in zip(df.index.values, df.values)
}

In [10]:
# Gene names in df row order, with and without the control entry.
pert_list = [condition.split('+')[0] for condition in df.index.values]
pert_list_non_ctrl = [gene for gene in pert_list if gene != 'ctrl']
# Feature matrix: one row per non-control perturbation.
truth_feat = np.stack([id2emb[gene] for gene in pert_list_non_ctrl])

# Linear (Gram) kernel between perturbations.
G = truth_feat.dot(truth_feat.T)

In [12]:
# Display the feature-matrix dimensions as a sanity check.
truth_feat.shape

(9747, 1000)

In [13]:
# Persist the ground-truth delta kernel together with its perturbation order and features.
save_kernel(
    folder_name='ground_truth_delta',
    pert_list=pert_list_non_ctrl,
    kernel_npy=G,
    feat=truth_feat,
)

## Protein embedding

In [36]:
import pickle
# Load the gene -> ESM embedding mapping from the downloaded knowledge-embedding
# folder. The path is built from path_to_emb (as the PoPS cell does) so that
# relocating the embeddings only requires changing that one variable.
# NOTE: only unpickle files from a trusted source - pickle can execute code.
with open(path_to_emb + 'esm_emb/gene2esm.pkl', 'rb') as f:
    gene2esm = pickle.load(f)

In [37]:
# Number of perturbed genes that have an ESM embedding under the same name.
len(np.intersect1d(list(gene2esm), unique_perts))

9609

In [44]:
# For perturbations that have no ESM embedding under their dataset name, look
# up a gene symbol for their gene_id so that the embedding can be found under
# that symbol instead.
#
# The cell used to contain this lookup twice in a row; the first copy repeated
# every network query and raised KeyError on results without a 'symbol' field.
# Only the guarded version is kept.
pert_list = [i.split('+')[0] for i in df.index.values]
pert_list_non_ctrl = [i for i in pert_list if i != 'ctrl']
failed_genes = []      # names whose lookup failed or returned no symbol
gene2cond_name = {}    # name -> raw result list from mg.getgenes
for i in tqdm(np.setdiff1d(pert_list, list(gene2esm.keys()))):
    if i != 'ctrl':
        try:
            gene2cond_name[i] = mg.getgenes(cond2gene_id[i + '+ctrl'])
        except Exception:
            # Best effort: record the failure and continue. `Exception` (not a
            # bare except) keeps Ctrl-C able to interrupt the loop.
            print(i)
            failed_genes.append(i)
fix_gene = {}          # name -> symbol reported by the lookup
for i, j in gene2cond_name.items():
    if j and 'symbol' in j[0]:
        fix_gene[i] = j[0]['symbol']
    else:
        failed_genes.append(i)
        

In [54]:
# Fallback embedding for genes that cannot be matched: the mean over all ESM
# embeddings. Computed once here instead of once per unmatched gene.
mean_esm = np.mean(list(gene2esm.values()), axis=0)

# Assign an embedding to every non-control perturbation:
#   1. the embedding stored under the gene's own name,
#   2. else the embedding stored under its looked-up symbol,
#   3. else the mean embedding (the gene name is printed).
id2emb = {}
for i in tqdm(pert_list_non_ctrl):
    if i in gene2esm:
        out = gene2esm[i]
    elif (i in fix_gene) and (fix_gene[i] in gene2esm):
        out = gene2esm[fix_gene[i]]
    else:
        print(i)
        out = mean_esm
    id2emb[i] = out

 57%|█████▋    | 5547/9747 [00:00<00:00, 40070.48it/s]

HIST2H2AA3
HIST3H2A
FCGR2C


100%|██████████| 9747/9747 [00:00<00:00, 28407.81it/s]

HIST3H2BB
ZNF883





In [55]:
# Stack the per-gene ESM embeddings in perturbation order, then form the
# linear (Gram) kernel between perturbations.
esm_feat = np.stack([id2emb[gene] for gene in pert_list_non_ctrl])
G = esm_feat.dot(esm_feat.T)

In [56]:
# Persist the ESM kernel together with its perturbation order and features.
save_kernel(
    folder_name='esm_kernel',
    pert_list=pert_list_non_ctrl,
    kernel_npy=G,
    feat=esm_feat,
)

## PoPS embedding

In [58]:
import pickle
# Load the gene -> PoPS embedding mapping from the knowledge-embedding folder.
with open(f'{path_to_emb}pops_emb/gene2pops_all.pkl', mode='rb') as f:
    gene2pops = pickle.load(f)

In [59]:
# For perturbations that have no PoPS embedding under their dataset name, look
# up a gene symbol for their gene_id so that the embedding can be found under
# that symbol instead. One network query is issued per missing gene.
pert_list = [i.split('+')[0] for i in df.index.values]
pert_list_non_ctrl = [i for i in pert_list if i != 'ctrl']
failed_genes = []      # names whose lookup failed or returned no symbol
gene2cond_name = {}    # name -> raw result list from mg.getgenes
for i in tqdm(np.setdiff1d(pert_list, list(gene2pops.keys()))):
    if i != 'ctrl':
        try:
            gene2cond_name[i] = mg.getgenes(cond2gene_id[i + '+ctrl'])
        except Exception:
            # Best effort: record the failure and continue. `Exception` (not a
            # bare except) keeps Ctrl-C able to interrupt this long loop.
            print(i)
            failed_genes.append(i)
fix_gene = {}          # name -> symbol reported by the lookup
for i, j in gene2cond_name.items():
    if j and 'symbol' in j[0]:
        fix_gene[i] = j[0]['symbol']
    else:
        failed_genes.append(i)

  0%|          | 0/627 [00:00<?, ?it/s]INFO:biothings.client:querying 1-1...
INFO:biothings.client:done.
  0%|          | 1/627 [00:01<11:53,  1.14s/it]INFO:biothings.client:querying 1-1...
INFO:biothings.client:done.
  0%|          | 2/627 [00:02<11:33,  1.11s/it]INFO:biothings.client:querying 1-1...
INFO:biothings.client:done.
  0%|          | 3/627 [00:03<11:40,  1.12s/it]INFO:biothings.client:querying 1-1...
INFO:biothings.client:done.
  1%|          | 4/627 [00:04<11:32,  1.11s/it]INFO:biothings.client:querying 1-1...
INFO:biothings.client:done.
  1%|          | 5/627 [00:05<11:27,  1.10s/it]INFO:biothings.client:querying 1-1...
INFO:biothings.client:done.
  1%|          | 6/627 [00:06<11:24,  1.10s/it]INFO:biothings.client:querying 1-1...
INFO:biothings.client:done.
  1%|          | 7/627 [00:07<11:21,  1.10s/it]INFO:biothings.client:querying 1-1...
INFO:biothings.client:done.
  1%|▏         | 8/627 [00:08<11:18,  1.10s/it]INFO:biothings.client:querying 1-1...
INFO:biothings.clie

FCGR2C


 27%|██▋       | 172/627 [03:07<06:23,  1.19it/s]INFO:biothings.client:querying 1-1...
INFO:biothings.client:done.
 28%|██▊       | 173/627 [03:08<06:50,  1.11it/s]INFO:biothings.client:querying 1-1...
INFO:biothings.client:done.
 28%|██▊       | 174/627 [03:09<07:11,  1.05it/s]INFO:biothings.client:querying 1-1...
INFO:biothings.client:done.
 28%|██▊       | 175/627 [03:10<07:28,  1.01it/s]INFO:biothings.client:querying 1-1...
INFO:biothings.client:done.
 28%|██▊       | 176/627 [03:11<07:39,  1.02s/it]INFO:biothings.client:querying 1-1...
INFO:biothings.client:done.
 28%|██▊       | 177/627 [03:12<07:48,  1.04s/it]INFO:biothings.client:querying 1-1...
INFO:biothings.client:done.
 28%|██▊       | 178/627 [03:13<07:53,  1.05s/it]INFO:biothings.client:querying 1-1...
INFO:biothings.client:done.
 29%|██▊       | 179/627 [03:15<07:57,  1.07s/it]INFO:biothings.client:querying 1-1...
INFO:biothings.client:done.
 29%|██▊       | 180/627 [03:16<07:59,  1.07s/it]INFO:biothings.client:querying 

In [60]:
# Mean PoPS embedding, used as the fallback for unmatched genes.
# np.stack needs a real sequence: a dict_values view is rejected by recent
# NumPy releases (and triggered a deprecation warning before that), so it is
# materialised as a list first.
mean_pops = np.mean(np.stack(list(gene2pops.values())), axis=0)

In [65]:
# Assign a PoPS embedding to every non-control perturbation: own name first,
# then the looked-up symbol, otherwise the mean embedding (name is printed).
id2emb = {}
for gene in tqdm(pert_list_non_ctrl):
    if gene in gene2pops:
        id2emb[gene] = gene2pops[gene]
        continue
    if gene in fix_gene and fix_gene[gene] in gene2pops:
        id2emb[gene] = gene2pops[fix_gene[gene]]
        continue
    print(gene)
    id2emb[gene] = mean_pops

100%|██████████| 9747/9747 [00:00<00:00, 1420545.57it/s]

HIKESHI
SELENOP
NAXE
TMEM131L
PRPS2
FUNDC2
HDX
HAUS7
FOXI3
NUP62CL
HDAC8
ETFRF1
MCTS1
HSFX1
VIRMA
MSN
MORF4L2
ZNF182
RPS4X
WDR13
SSX4
INTS14
ALG13
YY2
TLCD3A
FMR1NB
DIPK1B
TMEM273
CYREN
EPOP
RXYLT1
HPRT1
CITED1
SSX1
ZUP1
STAG2
KYAT3
PIP4P2
NDUFAF8
SELENOK
MED14
AR
TBXT
ABO
FAM156B
ITM2A
FAM122B
SLC25A6
TMEM250
MESD
TXLNG
TAZ
TAF9B
MAGEC1
CENPS
CNOT9
PAGE1
RPS6KA3
SINHCAF
ATRX
PAGE5
PRICKLE3
KDM6A
MBTPS2
CFAP410
ARHGAP45
TMLHE
RAMAC
DUSP9
DNASE1L1
BCORL1
SLC66A1
PQBP1
UBQLN2
HSD17B10
SLC66A3
MSL3
CTPS2
DMAC2L
MAGEB2
SELENOM
HSFY1
COQ8A
TEDC1
USP11
STARD8
HTATSF1
ELOA
MAGED2
TCEAL1
NKAPD1
MAGEA1
SELENOT
TMSB4X
MID1IP1
ACOT9
SSR4
RBMX
TFDP3
DKC1
DENND10
TIMP1
ZNF280C
FBH1
BRCC3
CD99
RAB5IF
HCFC1
NR2E3
MAIP1
HPF1
C11orf95
ATP5F1D
ATP5MD
POGLUT2
MMUT
ELOB
PLP2
SAT1
UPF3B
NONO
RO60
GRIPAP1
MAGEB1
USP51
CRYBG1
CEMIP2
LAMP2
SEPTIN1
SPINDOC
IKBKG
PDHA1
ICE1
COA8
EIF1AX
ATP5MC1
ZNF630
CDK16
ZIC3
PRUNE1
SEPTIN7
IDH3G
WASHC5
RHOXF2
RIOX2
MAGED1
LNPK
PDZD11
NR0B1
RPGR
UBXN8
SEPTIN6
RPL36A
IQSEC2
FA




In [68]:
# Stack the per-gene PoPS embeddings in perturbation order, then form the
# linear (Gram) kernel between perturbations.
pops_feat = np.stack([id2emb[gene] for gene in pert_list_non_ctrl])
G = pops_feat.dot(pops_feat.T)

In [69]:
# Persist the PoPS kernel together with its perturbation order and features.
save_kernel(
    folder_name='pops_kernel',
    pert_list=pert_list_non_ctrl,
    kernel_npy=G,
    feat=pops_feat,
)

In [70]:
# Display the number of non-control perturbations as a sanity check.
len(pert_list_non_ctrl)

9747

## BioGPT kernel

In [71]:
import pickle
# Load the gene -> BioGPT embedding mapping from the downloaded
# knowledge-embedding folder. The path is built from path_to_emb (as the PoPS
# cell does) so that relocating the embeddings only requires changing that one
# variable.
# NOTE: only unpickle files from a trusted source - pickle can execute code.
with open(path_to_emb + 'biogpt_emb/gene2biogpt.pkl', 'rb') as f:
    gene2biogpt = pickle.load(f)

In [72]:
# Gene names in df row order, with and without the control entry.
pert_list = [condition.split('+')[0] for condition in df.index.values]
pert_list_non_ctrl = [gene for gene in pert_list if gene != 'ctrl']
# Every non-control gene is looked up directly; a gene missing from
# gene2biogpt raises KeyError here (there is no fallback in this section).
biogpt_feat = np.stack([gene2biogpt[gene] for gene in pert_list_non_ctrl])
# Linear (Gram) kernel between perturbations.
G = biogpt_feat.dot(biogpt_feat.T)

In [73]:
# Persist the BioGPT kernel together with its perturbation order and features.
save_kernel(
    folder_name='biogpt_kernel',
    pert_list=pert_list_non_ctrl,
    kernel_npy=G,
    feat=biogpt_feat,
)

# Node2Vec

In [74]:
import pickle
# Load the gene -> node2vec embedding mapping from the downloaded
# knowledge-embedding folder. The path is built from path_to_emb (as the PoPS
# cell does) so that relocating the embeddings only requires changing that one
# variable.
# NOTE: only unpickle files from a trusted source - pickle can execute code.
with open(path_to_emb + 'gears_emb/gene2gears_node2vec.pkl', 'rb') as f:
    gene2node_vec = pickle.load(f)

In [75]:
# For perturbations that have no node2vec embedding under their dataset name,
# look up a gene symbol for their gene_id so that the embedding can be found
# under that symbol instead.
pert_list = [i.split('+')[0] for i in df.index.values]
pert_list_non_ctrl = [i for i in pert_list if i != 'ctrl']
failed_genes = []      # names whose lookup failed or returned no symbol
gene2cond_name = {}    # name -> raw result list from mg.getgenes
for i in tqdm(np.setdiff1d(pert_list, list(gene2node_vec.keys()))):
    if i != 'ctrl':
        try:
            gene2cond_name[i] = mg.getgenes(cond2gene_id[i + '+ctrl'])
        except Exception:
            # Best effort: record the failure and continue. `Exception` (not a
            # bare except) keeps Ctrl-C able to interrupt the loop.
            print(i)
            failed_genes.append(i)
fix_gene = {}          # name -> symbol reported by the lookup
for i, j in gene2cond_name.items():
    if j and 'symbol' in j[0]:
        fix_gene[i] = j[0]['symbol']
    else:
        failed_genes.append(i)

100%|██████████| 1/1 [00:00<00:00, 21183.35it/s]


In [76]:
# Mean node2vec embedding, used as the fallback for unmatched genes.
# np.stack needs a real sequence: a dict_values view is rejected by recent
# NumPy releases (and triggered a deprecation warning before that), so it is
# materialised as a list first.
mean_node2vec = np.mean(np.stack(list(gene2node_vec.values())), axis=0)

In [77]:
# Assign a node2vec embedding to every non-control perturbation: own name
# first, then the looked-up symbol, otherwise the mean embedding (the name is
# printed). The `i in fix_gene` guard matters: genes whose symbol lookup
# failed are absent from fix_gene, and must fall through to the mean instead
# of raising KeyError.
id2emb = {}
for i in pert_list_non_ctrl:
    if i in gene2node_vec:
        id2emb[i] = gene2node_vec[i]
    elif (i in fix_gene) and (fix_gene[i] in gene2node_vec):
        id2emb[i] = gene2node_vec[fix_gene[i]]
    else:
        print(i)
        id2emb[i] = mean_node2vec

In [80]:
# Stack the per-gene node2vec embeddings in perturbation order, form the
# linear (Gram) kernel between perturbations, and persist both.
node2vec_feat = np.stack([id2emb[gene] for gene in pert_list_non_ctrl])
G = node2vec_feat.dot(node2vec_feat.T)
save_kernel(
    folder_name='node2vec_kernel',
    pert_list=pert_list_non_ctrl,
    kernel_npy=G,
    feat=node2vec_feat,
)

# Node2Vec on gears go-go graph

In [82]:
import pickle
# NOTE(review): this loads gears_emb/gene2gears_node2vec.pkl, the same file the
# "Node2Vec" section loads, so 'gears_kernel' and 'node2vec_kernel' are built
# from identical embeddings - confirm which section should use another file.
# NOTE: only unpickle files from a trusted source - pickle can execute code.
with open(path_to_emb + 'gears_emb/gene2gears_node2vec.pkl', 'rb') as f:
    gene2node_vec = pickle.load(f)
pert_list = [i.split('+')[0] for i in df.index.values]
pert_list_non_ctrl = [i for i in pert_list if i != 'ctrl']

# Look up a gene symbol for every perturbation that has no embedding under its
# dataset name. Failures are printed and skipped (as in the other sections)
# instead of aborting the whole cell.
gene2cond_name = {}
for i in tqdm(np.setdiff1d(pert_list, list(gene2node_vec.keys()))):
    if i != 'ctrl':
        try:
            gene2cond_name[i] = mg.getgenes(cond2gene_id[i + '+ctrl'])
        except Exception:
            print(i)
# Keep only results that actually carry a 'symbol' field.
fix_gene = {
    i: j[0]['symbol']
    for i, j in gene2cond_name.items()
    if j and 'symbol' in j[0]
}
# np.stack needs a real sequence, so the dict_values view is wrapped in list().
mean_node2vec = np.mean(np.stack(list(gene2node_vec.values())), axis=0)
# Own name first, then looked-up symbol, otherwise the mean embedding.
id2emb = {}
for i in pert_list_non_ctrl:
    if i in gene2node_vec:
        id2emb[i] = gene2node_vec[i]
    elif (i in fix_gene) and (fix_gene[i] in gene2node_vec):
        id2emb[i] = gene2node_vec[fix_gene[i]]
    else:
        print(i)
        id2emb[i] = mean_node2vec
node2vec_feat = np.stack([id2emb[i] for i in pert_list_non_ctrl])
G = np.dot(node2vec_feat, node2vec_feat.T)
save_kernel('gears_kernel', pert_list_non_ctrl, G, node2vec_feat)

100%|██████████| 1/1 [00:00<00:00, 17924.38it/s]


## OPS data

In [20]:
# Load the gene-aggregated OPS profiles (A549 screen) and index them by gene code.
# NOTE(review): this rebinds `df`, which earlier sections use as the
# perturbation-effect table when rebuilding pert_list; re-running those
# sections after this cell would read the OPS table instead.
df = pd.read_csv('/home/huangk28/scratch/Profile_Aggregation/outputs/20200805_A549_WG_Screen_guide_normalized_feature_select_median_merged_ALLBATCHES___CP186___ALLWELLS_gene_aggregated.csv')
df = df.set_index('Metadata_Foci_Barcode_MatchedTo_GeneCode')
gene2ops = dict(zip(df.index.values, df.values))
# Look up a gene symbol for every perturbation that has no OPS profile under
# its dataset name. If the lookup fails, or the result has no 'symbol' field,
# fall back to the dataset name itself so that downstream code can always read
# result[0]['symbol'].
gene2cond_name = {}
for i in tqdm(np.setdiff1d(pert_list, list(gene2ops.keys()))):
    if i != 'ctrl':
        try:
            hits = mg.getgenes(cond2gene_id[i + '+ctrl'])
        except Exception:
            # `Exception` (not a bare except) keeps Ctrl-C able to interrupt.
            hits = None
        if not hits or 'symbol' not in hits[0]:
            hits = [{'symbol': i}]
        gene2cond_name[i] = hits

  0%|          | 0/157 [00:00<?, ?it/s]INFO:biothings.client:querying 1-1...
INFO:biothings.client:done.
  1%|          | 1/157 [00:01<02:58,  1.14s/it]INFO:biothings.client:querying 1-1...
INFO:biothings.client:done.
  1%|▏         | 2/157 [00:02<02:53,  1.12s/it]INFO:biothings.client:querying 1-1...
INFO:biothings.client:done.
  2%|▏         | 3/157 [00:03<02:50,  1.11s/it]INFO:biothings.client:querying 1-1...
INFO:biothings.client:done.
  3%|▎         | 4/157 [00:04<02:48,  1.10s/it]INFO:biothings.client:querying 1-1...
INFO:biothings.client:done.
  3%|▎         | 5/157 [00:05<02:47,  1.10s/it]INFO:biothings.client:querying 1-1...
INFO:biothings.client:done.
  4%|▍         | 6/157 [00:06<02:45,  1.10s/it]INFO:biothings.client:querying 1-1...
INFO:biothings.client:done.
  4%|▍         | 7/157 [00:07<02:44,  1.10s/it]INFO:biothings.client:querying 1-1...
INFO:biothings.client:done.
  5%|▌         | 8/157 [00:08<02:43,  1.10s/it]INFO:biothings.client:querying 1-1...
INFO:biothings.clie

In [21]:
# name -> looked-up symbol; falls back to the name itself when a result is
# empty or has no 'symbol' field, instead of raising.
fix_gene = {
    i: (j[0]['symbol'] if j and 'symbol' in j[0] else i)
    for i, j in gene2cond_name.items()
}
# np.stack needs a real sequence, so the dict_values view is wrapped in list().
mean_ops = np.mean(np.stack(list(gene2ops.values())), axis=0)
# Own name first, then looked-up symbol, otherwise the mean profile (the name
# is printed). The `i in fix_gene` guard avoids a KeyError for any gene that
# has no lookup entry.
id2emb = {}
for i in pert_list_non_ctrl:
    if i in gene2ops:
        id2emb[i] = gene2ops[i]
    elif (i in fix_gene) and (fix_gene[i] in gene2ops):
        id2emb[i] = gene2ops[fix_gene[i]]
    else:
        print(i)
        id2emb[i] = mean_ops
ops_feat = np.stack([id2emb[i] for i in pert_list_non_ctrl])
G = np.dot(ops_feat, ops_feat.T)
save_kernel('ops_A549_kernel', pert_list_non_ctrl, G, ops_feat)

ETFRF1
VIRMA
TLCD3A
DIPK1B
TMEM273
CYREN
PIP4P2
NDUFAF8
ATP5ME
MESD
SINHCAF
CFAP410
RAMAC
SLC66A1
SLC66A3
DMAC2L
TEDC1
NKAPD1
SELENOT
DENND10
RAB5IF
ATP5F1D
ATP5MD
POGLUT2
MMUT
RO60
CEMIP2
SEPTIN1
COA8
SEPTIN7
MARCH9
SEPTIN6
FAM174C
SLC66A2
SEPTIN8
FAM241A
TLNRD1
ABITRAM
PCNX3
POGLUT3
TUT4
CCNQ
EIPR1
KHDC4
MARCH3
CSKMT
SELENOF
RTRAF
ATP23
MPIG6B
TENT4A
SELENOI
ILRUN
TLCD5
VPS35L
ABRAXAS1
NAA80
DMAC2
MARCH6
ERG28
VPS26C
GRK2
FDX2
SHFL
ATP5PF
MARCH7
ATP5MPL
MCRIP1
ANTKMT
TLE5
ATP5MF
SELENOW
GET3
CERT1
CIAO2B
ATP5PO
CYBC1
CZIB
ATP5MC3
PCLAF
ATP5MC2
STMP1
TAFA2
SEPTIN11
TUT7
RSKR
ZNF875
ATPSCKMT
ZNRD2
RESF1
ATP5F1B
MTREX
MARCH8
ITPRID2
COPS9
TASOR2
PACC1
UTP11
SHLD1
PLPBP
PRXL2A
CIAO2A
NUP42
ATP5PB
MTRES1
NSD2
RACK1
SEPTIN9
FCGR2C
ECPAS
ELP1
MARCH5
RBIS
CIAO3
FAM241B
GON7
HEXD
SHLD2
AOPEP
TASOR
OGA
DIPK2A
DGLUCY
OBI1
CFAP298
SEPTIN5
MICOS10
INKA2
YAE1
TENT4B
ATP5IF1
ODR4
ATP5F1A
REX1BD
ATP5MG
PWWP3A
COQ8B
GET1
GATD1
PRXL2C
TRIR
DELE1
CBLL2
ATP5PD
MRTFA
PRORP
TEPSIN
TENT2
MICOS13
LTO1
ATP5F

In [12]:

# Build one OPS kernel per HeLa screen. The two screens were previously handled
# by two copy-pasted code paths that differed only in the CSV path and the
# output folder; they now share one loop body. After the loop, df, gene2ops,
# fix_gene, id2emb, ops_feat and G hold the values of the last (DMEM) screen,
# exactly as before.
ops_screens = [
    ('/home/huangk28/scratch/Profile_Aggregation/outputs/20210422_6W_CP257_guide_normalized_feature_select_median_merged_ALLBATCHES___HPLM___ALLWELLS_gene_aggregated.csv',
     'ops_HeLa_HPLM_kernel'),
    ('/home/huangk28/scratch/Profile_Aggregation/outputs/20210422_6W_CP257_guide_normalized_feature_select_median_merged_ALLBATCHES___DMEM___ALLWELLS_gene_aggregated.csv',
     'ops_HeLa_DMEM_kernel'),
]
for ops_csv, kernel_name in ops_screens:
    # Gene-aggregated OPS profiles, indexed by gene code.
    df = pd.read_csv(ops_csv)
    df = df.set_index('Metadata_Foci_Barcode_MatchedTo_GeneCode')
    gene2ops = dict(zip(df.index.values, df.values))

    # Look up a gene symbol for every perturbation without a profile under its
    # dataset name; on failure fall back to the dataset name itself.
    gene2cond_name = {}
    for i in tqdm(np.setdiff1d(pert_list, list(gene2ops.keys()))):
        if i != 'ctrl':
            try:
                gene2cond_name[i] = mg.getgenes(cond2gene_id[i + '+ctrl'])
            except Exception:
                # `Exception` (not a bare except) keeps Ctrl-C able to interrupt.
                gene2cond_name[i] = [{'symbol': i}]
    # Results that are empty or lack 'symbol' also fall back to the name.
    fix_gene = {
        i: (j[0]['symbol'] if j and 'symbol' in j[0] else i)
        for i, j in gene2cond_name.items()
    }

    # np.stack needs a real sequence, so the dict_values view is wrapped in list().
    mean_ops = np.mean(np.stack(list(gene2ops.values())), axis=0)

    # Own name first, then looked-up symbol, otherwise the mean profile.
    id2emb = {}
    for i in pert_list_non_ctrl:
        if i in gene2ops:
            id2emb[i] = gene2ops[i]
        elif (i in fix_gene) and (fix_gene[i] in gene2ops):
            id2emb[i] = gene2ops[fix_gene[i]]
        else:
            print(i)
            id2emb[i] = mean_ops

    ops_feat = np.stack([id2emb[i] for i in pert_list_non_ctrl])
    G = np.dot(ops_feat, ops_feat.T)
    save_kernel(kernel_name, pert_list_non_ctrl, G, ops_feat)

{'CTSC+ctrl': 'ENSG00000109861',
 'CWC25+ctrl': 'ENSG00000273559',
 'PDE4DIP+ctrl': 'ENSG00000178104',
 'ZZEF1+ctrl': 'ENSG00000074755',
 'SNAPIN+ctrl': 'ENSG00000143553',
 'RBM6+ctrl': 'ENSG00000004534',
 'DHX33+ctrl': 'ENSG00000005100',
 'KIF1C+ctrl': 'ENSG00000129250',
 'XRCC2+ctrl': 'ENSG00000196584',
 'BNIP2+ctrl': 'ENSG00000140299',
 'HIKESHI+ctrl': 'ENSG00000149196',
 'SELENOP+ctrl': 'ENSG00000250722',
 'ZNF100+ctrl': 'ENSG00000197020',
 'PRPSAP1+ctrl': 'ENSG00000161542',
 'NUP98+ctrl': 'ENSG00000110713',
 'FANCG+ctrl': 'ENSG00000221829',
 'TMEM135+ctrl': 'ENSG00000166575',
 'MEGF8+ctrl': 'ENSG00000105429',
 'UPF2+ctrl': 'ENSG00000151461',
 'FASTKD1+ctrl': 'ENSG00000138399',
 'CYS1+ctrl': 'ENSG00000205795',
 'GPS1+ctrl': 'ENSG00000169727',
 'NAXE+ctrl': 'ENSG00000163382',
 'NT5C3B+ctrl': 'ENSG00000141698',
 'SNAPC4+ctrl': 'ENSG00000165684',
 'MARS2+ctrl': 'ENSG00000247626',
 'INO80B+ctrl': 'ENSG00000115274',
 'VDAC2+ctrl': 'ENSG00000165637',
 'ARG2+ctrl': 'ENSG00000081181',
 'MF