In [1]:
import os, gc, pickle, datetime, scipy.sparse,random
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from colorama import Fore, Back, Style

import sklearn 
from sklearn.model_selection import GroupKFold,train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import TruncatedSVD,PCA
from sklearn.metrics import mean_squared_error

import matplotlib.pyplot as plt
from matplotlib.ticker import MaxNLocator
import seaborn as sns
from cycler import cycler
from IPython.display import display

import scipy.sparse
from tqdm.notebook import tqdm

DATA_DIR = "./"
FP_CELL_METADATA = os.path.join(DATA_DIR,"metadata.csv")

FP_CITE_TRAIN_INPUTS = os.path.join(DATA_DIR,"train_cite_inputs.h5")
FP_CITE_TRAIN_TARGETS = os.path.join(DATA_DIR,"train_cite_targets.h5")
FP_CITE_TEST_INPUTS = os.path.join(DATA_DIR,"test_cite_inputs.h5")

FP_SUBMISSION = os.path.join(DATA_DIR,"sample_submission.csv")
FP_EVALUATION_IDS = os.path.join(DATA_DIR,"evaluation_ids.csv")

VERBOSE = 0

In [2]:
import math

import tensorflow as tf
import tensorflow.keras.backend as K
import tensorflow_addons as tfa
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.callbacks import ReduceLROnPlateau, LearningRateScheduler, EarlyStopping
from tensorflow.keras.layers import Dense, Input, Concatenate, Dropout, BatchNormalization, AlphaDropout
import keras_tuner
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))

2022-11-10 19:52:02.448986: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-11-10 19:52:02.935365: I tensorflow/core/util/util.cc:169] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.


Num GPUs Available:  1


In [3]:
# Standardize
def std(x):
    empty_list = []
    for item in x:
        empty_list.append((item - np.mean(item)) / np.std(item))
    return np.array(empty_list)

In [4]:
import anndata as ad

In [5]:
def seed_tensorflow(seed=1):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    tf.random.set_seed(seed)

In [6]:
# READ METADATA
meta = pd.read_csv(FP_CELL_METADATA, index_col = 'cell_id')
meta = meta[meta.technology == 'citeseq']
meta.drop('technology', axis = 1, inplace = True)

#READ FEATURES
X = pd.read_hdf(FP_CITE_TRAIN_INPUTS)

# READ TARGETS
Y = pd.read_hdf(FP_CITE_TRAIN_TARGETS)

# REMOVE 2% OF OULIERS
cols = list(Y.columns)
for i in tqdm(range(140)):
    col = cols[i]
    v = Y[col]
    threshold = 1.0
    m1 = np.percentile(v, threshold)
    m2 = np.percentile(v, 100 - threshold)
    v = np.clip(v, m1, m2)
    Y[col] = v
    
# SHRINK META TO TRAIN SIZE
meta = meta[meta.index.isin(Y.index)]

# MERGE TARGETS WITH METADATA
df = Y.join(meta)

  0%|          | 0/140 [00:00<?, ?it/s]

In [7]:
corrs = np.zeros([X.shape[1], Y.shape[1]])
corrs = pd.DataFrame(corrs)
corrs.columns = Y.columns
corrs.index = X.columns
corrs

gene_id,CD86,CD274,CD270,CD155,CD112,CD47,CD48,CD40,CD154,CD52,...,CD94,CD162,CD85j,CD23,CD328,HLA-E,CD82,CD101,CD88,CD224
gene_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ENSG00000121410_A1BG,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ENSG00000268895_A1BG-AS1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ENSG00000175899_A2M,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ENSG00000245105_A2M-AS1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ENSG00000166535_A2ML1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ENSG00000198455_ZXDB,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ENSG00000070476_ZXDC,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ENSG00000162378_ZYG11B,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ENSG00000159840_ZYX,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [8]:
for i in tqdm(range(140)):
    y = Y[Y.columns[i]].values
    for j in range(22050):
        x = X[X.columns[j]].values
        corrs[corrs.columns[i]][corrs.index[j]] = np.corrcoef(y, x)[1, 0]
corrs

  0%|          | 0/140 [00:00<?, ?it/s]

  c /= stddev[:, None]
  c /= stddev[None, :]


gene_id,CD86,CD274,CD270,CD155,CD112,CD47,CD48,CD40,CD154,CD52,...,CD94,CD162,CD85j,CD23,CD328,HLA-E,CD82,CD101,CD88,CD224
gene_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ENSG00000121410_A1BG,-0.000966,-0.003485,-0.002624,0.001722,0.016688,0.015719,0.000386,0.000389,-0.004525,0.010304,...,0.003951,-0.000214,-0.002191,0.000518,0.000457,-0.000450,-0.011117,0.006008,-0.016222,-0.000939
ENSG00000268895_A1BG-AS1,0.002032,-0.007829,-0.002993,0.016612,0.015510,0.012294,0.010719,0.001504,0.003827,0.009244,...,-0.001921,0.016840,-0.004302,0.002346,-0.000072,0.002508,-0.000079,0.000414,-0.003036,-0.003229
ENSG00000175899_A2M,0.041043,0.008660,0.019384,0.011152,0.031637,0.006257,0.035739,0.015517,0.009375,0.036531,...,-0.002178,0.003910,0.009244,0.007330,0.044391,0.013877,-0.005628,0.052544,-0.003655,0.019460
ENSG00000245105_A2M-AS1,0.004327,0.009921,0.021295,0.042130,0.068744,0.015493,-0.014891,0.007535,0.014520,0.042704,...,0.011131,-0.000085,0.005398,0.017217,-0.003114,0.013843,0.008296,0.006690,-0.006266,0.035730
ENSG00000166535_A2ML1,0.002878,0.003549,-0.005328,-0.010251,-0.014511,-0.010912,-0.003621,-0.000396,-0.003659,-0.007451,...,0.000571,-0.006946,-0.001120,-0.001728,0.001223,-0.006977,0.001707,-0.003900,0.005648,-0.004844
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ENSG00000198455_ZXDB,0.006940,0.006438,0.008169,0.025069,0.028279,0.013233,0.010657,0.005491,0.007891,0.004313,...,0.002497,0.017943,0.012516,-0.000587,0.003760,0.009816,0.000486,0.000856,0.009836,0.011718
ENSG00000070476_ZXDC,0.018013,-0.010287,0.001193,0.063548,0.058743,0.049279,0.065299,-0.002654,-0.002687,0.004967,...,-0.014743,0.077335,-0.000016,-0.017757,-0.001511,0.006233,-0.003139,0.004822,0.008230,0.030803
ENSG00000162378_ZYG11B,-0.010463,0.003138,0.029036,0.046307,0.041455,-0.007749,-0.012975,0.026899,0.031662,-0.008615,...,-0.015307,0.063093,0.052804,-0.028652,-0.003490,0.021261,0.048187,-0.004684,0.091617,0.052107
ENSG00000159840_ZYX,0.056785,-0.006883,0.002664,0.144377,0.267935,0.145251,0.182322,0.018400,-0.040303,0.062389,...,-0.003646,0.083344,-0.063520,0.017067,0.025732,0.016655,-0.116165,0.055961,-0.147353,0.051282


In [9]:
top100fits = np.zeros([100, Y.shape[1]])
top100fits = pd.DataFrame(top100fits)
top100fits.columns = Y.columns

In [10]:
for i in tqdm(range(140)):
    df0 = corrs[[corrs.columns[i]]].copy()
    df0 = df0.sort_values(corrs.columns[i], axis=0, ascending=False)
#     print(df0[:100].index)
    top100fits[top100fits.columns[i]] = df0[:100].index

  0%|          | 0/140 [00:00<?, ?it/s]

In [13]:
top100fits

gene_id,CD86,CD274,CD270,CD155,CD112,CD47,CD48,CD40,CD154,CD52,...,CD94,CD162,CD85j,CD23,CD328,HLA-E,CD82,CD101,CD88,CD224
0,ENSG00000112799_LY86,ENSG00000229807_XIST,ENSG00000229807_XIST,ENSG00000229807_XIST,ENSG00000142089_IFITM3,ENSG00000128040_SPINK2,ENSG00000117091_CD48,ENSG00000137801_THBS1,ENSG00000229807_XIST,ENSG00000169442_CD52,...,ENSG00000229807_XIST,ENSG00000122862_SRGN,ENSG00000105610_KLF1,ENSG00000198034_RPS4X,ENSG00000163221_S100A12,ENSG00000229807_XIST,ENSG00000197956_S100A6,ENSG00000112799_LY86,ENSG00000105610_KLF1,ENSG00000229807_XIST
1,ENSG00000114013_CD86,ENSG00000198034_RPS4X,ENSG00000198034_RPS4X,ENSG00000142089_IFITM3,ENSG00000229807_XIST,ENSG00000172247_C1QTNF4,ENSG00000105374_NKG7,ENSG00000166091_CMTM5,ENSG00000105610_KLF1,ENSG00000133112_TPT1,...,ENSG00000198034_RPS4X,ENSG00000169385_RNASE2,ENSG00000229807_XIST,ENSG00000229807_XIST,ENSG00000197629_MPEG1,ENSG00000198034_RPS4X,ENSG00000102145_GATA1,ENSG00000182578_CSF1R,ENSG00000102145_GATA1,ENSG00000198034_RPS4X
2,ENSG00000117091_CD48,ENSG00000185559_DLK1,ENSG00000185559_DLK1,ENSG00000185559_DLK1,ENSG00000095932_SMIM24,ENSG00000171476_HOPX,ENSG00000221869_CEBPD,ENSG00000259207_ITGB3,ENSG00000102145_GATA1,ENSG00000198034_RPS4X,...,ENSG00000110700_RPS13,ENSG00000149516_MS4A3,ENSG00000102145_GATA1,ENSG00000110700_RPS13,ENSG00000120708_TGFBI,ENSG00000185559_DLK1,ENSG00000179639_FCER1A,ENSG00000197629_MPEG1,ENSG00000119865_CNRIP1,ENSG00000185559_DLK1
3,ENSG00000087086_FTL,ENSG00000184185_KCNJ12,ENSG00000196154_S100A4,ENSG00000198034_RPS4X,ENSG00000204472_AIF1,ENSG00000139278_GLIPR1,ENSG00000198829_SUCNR1,ENSG00000169704_GP9,ENSG00000198034_RPS4X,ENSG00000171476_HOPX,...,ENSG00000233927_RPS28,ENSG00000087086_FTL,ENSG00000119865_CNRIP1,ENSG00000138326_RPS24,ENSG00000038427_VCAN,ENSG00000229391_HLA-DRB6,ENSG00000005961_ITGA2B,ENSG00000143546_S100A8,ENSG00000160789_LMNA,ENSG00000229391_HLA-DRB6
4,ENSG00000169508_GPR183,ENSG00000041353_RAB27B,ENSG00000184185_KCNJ12,ENSG00000022556_NLRP2,ENSG00000107281_NPDC1,ENSG00000105374_NKG7,ENSG00000069974_RAB27A,ENSG00000160145_KALRN,ENSG00000160789_LMNA,ENSG00000229807_XIST,...,ENSG00000115268_RPS15,ENSG00000179218_CALR,ENSG00000160789_LMNA,ENSG00000145425_RPS3A,ENSG00000260314_MRC1,ENSG00000247627_MTND4P12,ENSG00000100368_CSF2RB,ENSG00000038427_VCAN,ENSG00000029534_ANK1,ENSG00000247627_MTND4P12
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,ENSG00000118785_SPP1,ENSG00000047648_ARHGAP6,ENSG00000147443_DOK2,ENSG00000259479_SORD2P,ENSG00000175857_GAPT,ENSG00000172260_NEGR1,ENSG00000066336_SPI1,ENSG00000105371_ICAM4,ENSG00000244879_GABPB1-AS1,ENSG00000110852_CLEC2B,...,ENSG00000232388_SMIM26,ENSG00000155368_DBI,ENSG00000136040_PLXNC1,ENSG00000169908_TM4SF1,ENSG00000111344_RASAL1,ENSG00000147443_DOK2,ENSG00000162367_TAL1,ENSG00000110031_LPXN,ENSG00000197993_KEL,ENSG00000140451_PIF1
96,ENSG00000122986_HVCN1,ENSG00000179639_FCER1A,ENSG00000169704_GP9,ENSG00000184185_KCNJ12,ENSG00000105472_CLEC11A,ENSG00000157110_RBPMS,ENSG00000110031_LPXN,ENSG00000136929_HEMGN,ENSG00000168754_FAM178B,ENSG00000117122_MFAP2,...,ENSG00000131143_COX4I1,ENSG00000112029_FBXO5,ENSG00000101439_CST3,ENSG00000105374_NKG7,ENSG00000197405_C5AR1,ENSG00000168497_CAVIN2,ENSG00000165092_ALDH1A1,ENSG00000158869_FCER1G,ENSG00000143627_PKLR,ENSG00000154917_RAB6B
97,ENSG00000155659_VSIG4,ENSG00000086506_HBQ1,ENSG00000107130_NCS1,ENSG00000184661_CDCA2,ENSG00000182621_PLCB1,ENSG00000197694_SPTAN1,ENSG00000258227_CLEC5A,ENSG00000283632_EXOC3L2,ENSG00000175130_MARCKSL1,ENSG00000180530_NRIP1,...,ENSG00000169100_SLC25A6,ENSG00000124783_SSR1,ENSG00000147804_SLC39A4,ENSG00000135390_ATP5MC2,ENSG00000138061_CYP1B1,ENSG00000116157_GPX7,ENSG00000039068_CDH1,ENSG00000103811_CTSH,ENSG00000152229_PSTPIP2,ENSG00000136108_CKAP2
98,ENSG00000166794_PPIB,ENSG00000197721_CR1L,ENSG00000135404_CD63,ENSG00000145386_CCNA2,ENSG00000211899_IGHM,ENSG00000026508_CD44,ENSG00000168060_NAALADL1,ENSG00000165682_CLEC1B,ENSG00000184792_OSBP2,ENSG00000010404_IDS,...,ENSG00000021355_SERPINB1,ENSG00000175538_KCNE3,ENSG00000184574_LPAR5,ENSG00000188404_SELL,ENSG00000110079_MS4A4A,ENSG00000005889_ZFX,ENSG00000141433_ADCYAP1,ENSG00000129450_SIGLEC9,ENSG00000175449_RFESD,ENSG00000162063_CCNF


In [12]:
top100fits.to_csv("top100features_cite.csv")