# Script for Processing Data

In [1]:
%load_ext autoreload
%autoreload 2

import os
import pandas as pd
import numpy as np
import random
import glob
import umap
import seaborn as sns
import matplotlib.pyplot as plt
# from ggplot import *


from ruffus import *
import sys, os, h5py, random, tempfile, scipy, time,copy
import cmapPy.pandasGEXpress.parse_gctx as parse_gctx
import cmapPy.pandasGEXpress.parse_gct as parse_gct
import pipeline_support as PS
from sklearn.decomposition import PCA
from scipy import stats
from matplotlib.pyplot import imshow
from sklearn.preprocessing import MinMaxScaler

from collections import Counter
#from tensorflow_examples.models.pix2pix import pix2pix
import tensorflow as tf
from tensorflow import keras
from IPython.display import clear_output
from tensorflow_gan.python.losses import losses_impl


from numpy.random import seed
randomState = 123
seed(randomState)




Parameters

In [2]:
n_sampling = 50000

Input Filenames

In [3]:
ARCHS4_filename = "../data/ARCHS4/human_matrix_v9.h5"
l1000_filename = "../data/L1000/GSE92742_Broad_LINCS_Level3_INF_mlr12k_n1319138x12328.gctx"
l1000_geneinfo_filename = "../data/L1000/GSE92742_Broad_LINCS_gene_info.txt"
# gtex_rnaseq_filename = "../data/GTEx/GSE92743_Broad_GTEx_RNAseq_Log2RPKM_q2norm_n3176x12320.gctx"
# gtex_l1000_filename = "../data/GTEx/GSE92743_Broad_GTEx_L1000_Level3_Q2NORM_n3176x12320.gctx"
# gtex_rnaseq_filename = "../data/GTEx/GTEx_Analysis_v6p_RNA-seq_RNA-SeQCv1.1.8_gene_reads.gct"
gtex_rnaseq_filename = "../data/GTEx/GTEx_Analysis_2017-06-05_v8_RNASeQCv1.1.9_gene_reads.gct"
gtex_l1000_filename = "../data/GTEx/DS_GTEX_L1000_n3176x12320.gctx"
gene_id_to_symbol_filename = "../data/GTEx/Gene_ID_to_Symbol_GRCh37.txt"
gene_transcript_id_to_hgnc_gene_filename = "../data/GTEx/gencode.v19.metadata.HGNC"
gtex_geneinfo_filename = "../data/GTEx/GSE92743_Broad_GTEx_gene_info.txt"

Output Filenames

In [4]:
l1000_gene_list_output_filename = "../data/processed/L1000/L1000_gene_list.txt"

l1000_output_filename = "../data/processed/L1000/L1000_filtered_GSE92742_Broad_LINCS_Level3_INF_mlr12k_n978x1319138.f" # only landmark genes
l1000_overlap_landmark_output_filename = "../data/processed/L1000/L1000_filtered_GSE92742_Broad_LINCS_Level3_INF_mlr12k_n{}x{}.f" # n_samplingx967

ARCHS4_filtered_sample_output_filename = "../data/processed/ARCHS4/filtered_sample_list.txt"
ARCHS4_filtered_landmark_output_filename = "../data/processed/ARCHS4/human_matrix_v9_filtered_n{}x{}.f" # samplesx967
ARCHS4_filtered_overlap_landmark_output_filename = "../data/processed/ARCHS4/human_matrix_v9_filtered_n{}x{}.f" # n_samplingx967

gtex_filtered_l1000_output_filename = "../data/processed/GTEx/GSE92743_Broad_GTEx_L1000_Level3_Q2NORM_filtered_n{}x{}.f" # samplesx967
gtex_filtered_rnaseq_output_filename = "../data/processed/GTEx/GSE92743_Broad_GTEx_RNAseq_Log2RPKM_q2norm_filtered_n{}x{}.f" # samplesx967

overlap_landmark_file = "../data/processed/overlap_landmark_file.txt" 

## Load overlap landmark genes

In [5]:
with open(overlap_landmark_file, "r") as f:
    overlap_landmark_genes = f.readlines()
    overlap_landmark_genes = [x.strip() for x in overlap_landmark_genes]

## Load L1000 (GSE92742/Level 3) ~4 min

https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE92742 downloaded @ ../data

In [6]:
print('Loading L1000 data.....')
l1000_data = parse_gctx.parse(l1000_filename,convert_neg_666=True).data_df

Loading L1000 data.....


In [7]:
gene_info = pd.read_csv(l1000_geneinfo_filename,header = 0, sep = '\t')

In [8]:
# create a probe_id to gene name dictionary 
gene_dict = dict(zip([str(x) for x in gene_info['pr_gene_id']], gene_info['pr_gene_symbol']))

# label rows with gene names 
l1000_data.index = [gene_dict[x] for x in l1000_data.index.values]

# Save L1000 genes to a txt file
l1000_genes = []
genes_978_file = open(l1000_gene_list_output_filename,'w')
landmark_gene_info = gene_info[gene_info["pr_is_lm"] == 1]
for x in landmark_gene_info["pr_gene_symbol"].values:
    genes_978_file.write(x.strip() + '\n')
    l1000_genes.append(x.strip())
genes_978_file.close()

In [9]:
# filter landmark genes
filtered_l1000_data = l1000_data[l1000_data.index.isin(l1000_genes)]

In [11]:
# save filtered_l1000 data (only landmark genes)
filtered_l1000_data.to_csv(l1000_output_filename)

Remove variables

In [11]:
del l1000_data

## Load ARCHS4 RNA-seq

Data preprocessing code from https://github.com/MaayanLab/L1k2RNA-seq-2.0/blob/cb5eaa3a447b502e32db6c1aae84eaa94d0ce0f4/pipeline/pipeline.py#L43

In [6]:
# Import ARCHS4 RNA-seq samples 
print('Processing RNA-seq data.....')
h5 = h5py.File(ARCHS4_filename, 'r')
data_file = h5['data'] 
expression = data_file['expression']
genes = [x for x in h5['meta']['genes']['genes']]

Processing RNA-seq data.....


In [7]:
# Find samples with > 1 million reads
idx_read_sums_keep = [i for i, x in enumerate(h5['meta']['samples']['readstotal']) if x > 1000000]

In [8]:
# Calculate # samples in each study
studies_count_dict = Counter(h5['meta']['samples']['series_id'])

In [9]:
# landmark gene index
archs4_landmark_gene_index = [i for i, x in enumerate(genes) if x in overlap_landmark_genes]
archs4_landmark_gene_names = [x for i, x in enumerate(genes) if x in overlap_landmark_genes]

In [10]:
# filter landmake genes and covert array to pandas dataframe 
landmark_gene_expression = pd.DataFrame(expression[archs4_landmark_gene_index])
landmark_gene_expression.index = archs4_landmark_gene_names

In [11]:
landmark_gene_expression.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,307258,307259,307260,307261,307262,307263,307264,307265,307266,307267
AARS,10297,722,179,232,10981,533,328,168,500,235,...,1274,1450,2725,2102,0,475,1883,1642,16,189
ABCB6,5467,0,0,0,6392,0,0,0,0,0,...,755,1134,532,660,0,532,398,509,8,611
ABCC5,3273,0,0,0,3822,0,0,0,0,0,...,1649,1059,1469,1736,9,376,3065,1773,55,1366
ABCF1,9290,0,0,0,9100,0,0,0,0,0,...,867,2010,2428,1828,71,739,3180,2983,337,1735
ABCF3,3496,0,0,0,3259,0,0,0,0,0,...,492,529,894,1188,0,251,896,1156,17,220


In [12]:
# Remove single cell samples that are < 1 million reads and from studies with > 200 samples
samples = list()
samples_index = list()
filtered_expression = list()
i = 0
for sample_id, series_id in zip(h5['meta']['samples']['geo_accession'], h5['meta']['samples']['series_id']):
    if i in idx_read_sums_keep and studies_count_dict[series_id] <= 200:
        samples.append(sample_id)
        samples_index.append(i)
    if i % 10000 == 0:
        print(i)    
        
    i += 1

0
10000
20000
30000
40000
50000
60000
70000
80000
90000
100000
110000
120000
130000
140000
150000
160000
170000
180000
190000
200000
210000
220000
230000
240000
250000
260000
270000
280000
290000
300000


In [31]:
# filter out samples 
filtered_landmark_gene_expression = landmark_gene_expression.iloc[:, samples_index]
filtered_landmark_gene_expression.columns = samples

In [32]:
# save
filtered_landmark_gene_expression.reset_index().to_feather(ARCHS4_filtered_landmark_output_filename.format(filtered_landmark_gene_expression.shape[0], filtered_landmark_gene_expression.shape[1]))
print(ARCHS4_filtered_landmark_output_filename.format(filtered_landmark_gene_expression.shape[0], filtered_landmark_gene_expression.shape[1]))

../data/processed/ARCHS4/human_matrix_v9_filtered_n967x154575.f


In [20]:
# save sample ids
with open(ARCHS4_filtered_sample_output_filename, "w") as f:
    f.write("\n".join(samples))    

## Load GTEx 

GTEx L1000 from GSE92742 
GTEx RNA-seq from https://www.gtexportal.org/home/datasets version 6p Gene read count

gene_transcript_id_to_hgnc_gene_filename: transcript ID and Gene symbols <br>
gene_id_to_symbol_filename: Gene ID to transcript ID

In [6]:
gtex_gene_info = pd.read_csv(gtex_geneinfo_filename,header = 0, sep = '\t')
gtex_landmark_genes = gtex_gene_info.loc[gtex_gene_info["pr_is_lm"]==1, "pr_gene_symbol"].tolist()

In [7]:
# GTEx L1000 data
print('Loading GTEx L1000 data.....')
gtex_l1000_data = parse_gctx.parse(gtex_l1000_filename,convert_neg_666=True).data_df

# create a probe_id to gene name dictionary 
gtex_gene_dict = dict(zip([str(x) for x in gtex_gene_info['pr_gene_id']], gtex_gene_info['pr_gene_symbol']))

# label rows with gene names 
gtex_l1000_data.index = [gtex_gene_dict[x] for x in gtex_l1000_data.index.values]
gtex_l1000_data = gtex_l1000_data.T

Loading GTEx L1000 data.....


In [8]:
gtex_l1000_data

Unnamed: 0_level_0,NAT2,ADA,CDH2,AKT3,MED6,NAALAD2,NAALADL1,ACOT8,ABI1,GNPDA1,...,REC8,HNRNPDL,DMTF1,PPP4R1,CDH1,SLC12A6,PTBP3,KCNE2,DGCR2,SCO2
cid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
GTEX-PLZ6-0326-SM-3P61J,5.6673,6.4632,5.1116,10.9845,7.5641,6.0204,6.1304,6.2121,7.1477,7.9405,...,4.0016,11.9817,9.2365,7.0643,0.0000,4.0942,3.6406,5.1597,10.0601,6.7101
GTEX-WOFM-1726-SM-3MJFA,5.3294,5.2113,4.5993,11.1465,6.9731,5.1882,5.1901,6.9868,7.7144,7.0483,...,2.4337,12.4660,8.0221,8.1497,0.9927,4.7571,4.4464,5.5312,8.7838,7.6696
GTEX-WHSE-2926-SM-3NMBG,6.5072,6.2826,6.0322,11.1166,6.5092,6.4562,4.9888,7.7656,8.6530,7.3503,...,7.0074,12.6445,10.0409,7.1485,0.2510,4.4981,3.0617,5.6127,9.3979,7.9547
GTEX-RNOR-0011-R7A-SM-2TF4V,5.4584,5.3979,8.1622,11.9603,7.0277,6.4182,4.1056,9.4084,8.9236,9.7747,...,4.5427,11.8377,9.0948,6.5408,0.0000,4.4630,3.8430,5.2444,10.4074,8.5540
GTEX-NPJ8-0011-R8a-SM-2HMLG,5.9728,4.5314,8.2633,9.9105,7.0623,7.4063,5.1423,8.0846,8.4310,10.5007,...,5.7505,11.0931,9.4113,7.4106,2.3044,4.5466,3.1868,5.4089,10.7937,8.3259
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
GTEX-XOT4-0726-SM-4GIAW,5.9257,7.4528,2.3588,9.9084,7.7172,7.7440,6.1538,6.8794,8.4322,9.4732,...,6.7500,12.4715,10.2708,8.8405,6.4299,5.1775,5.3725,6.5510,9.4951,6.9381
GTEX-TML8-1126-SM-4DXSS,5.6805,8.7358,4.4911,7.2818,8.0132,8.3827,5.5206,7.1094,9.5274,9.6651,...,6.1590,13.3749,10.0508,10.9172,6.2080,6.8106,6.0643,5.4351,8.9420,9.0916
GTEX-U8T8-1126-SM-4DXUE,6.7119,8.3630,5.9445,9.2438,6.4057,8.4391,5.7959,8.4106,7.4180,9.9347,...,8.0064,12.7390,10.6016,10.6954,0.1161,8.5405,4.9414,6.7459,9.8064,8.3660
GTEX-TKQ1-0926-SM-4DXU2,5.6058,8.0751,7.0641,8.9300,6.4559,8.6378,6.0450,9.0775,7.6028,9.8244,...,9.1216,12.0545,11.6019,11.5841,0.7429,8.0803,4.6793,5.5484,9.5725,7.6611


In [None]:
# GTEx RNA-seq data
print('Loading GTEx RNA-seq data.....')
gtex_rnaseq_data = parse_gct.parse(gtex_rnaseq_filename,convert_neg_666=True).data_df

Loading GTEx RNA-seq data.....


In [90]:
# # load gene id to symbol mapping file
gene_id_to_symbol = pd.read_csv(gene_id_to_symbol_filename, sep="\t")
gene_id_to_transcript_id_dict = dict(zip(gene_id_to_symbol["Gene stable ID version"], gene_id_to_symbol["Transcript stable ID version"]))

transcript_id_to_gene_symbol = pd.read_csv(gene_transcript_id_to_hgnc_gene_filename, sep="\t", header=None)
transcript_id_to_gene_symbol.columns = ["Transcript stable ID version", "HGNC symbol"]
transcript_id_to_gene_symbol_dict = dict(zip(transcript_id_to_gene_symbol["Transcript stable ID version"], transcript_id_to_gene_symbol["HGNC symbol"]))
# # load gene id to symbol mapping file
# gene_id_to_symbol = pd.read_csv(gene_id_to_symbol_filename.replace("GRCh37", "v90"), sep="\t")
# gene_id_to_symbol_dict = dict(zip(gene_id_to_symbol["Gene stable ID version"], gene_id_to_symbol["HGNC symbol"]))

In [99]:
new_index = list()
for x in gtex_rnaseq_data.index.values:
    
    try:
        new_index.append(transcript_id_to_gene_symbol_dict[gene_id_to_transcript_id_dict[x]])
#         print(x, gene_id_to_transcript_id_dict[x], transcript_id_to_gene_symbol_dict[gene_id_to_transcript_id_dict[x]])
    except:
        new_index.append(x)

ENSG00000223972.4 ENST00000518655.2 DDX11L1
ENSG00000227232.4 ENST00000423562.1 WASH7P
ENSG00000243485.2 ENST00000607096.1 MIR1302-11
ENSG00000237613.2 ENST00000461467.1 FAM138A
ENSG00000268020.2 ENST00000594647.1 OR4G4P
ENSG00000240361.1 ENST00000492842.1 OR4G11P
ENSG00000186092.4 ENST00000335137.3 OR4F5
ENSG00000233750.3 ENST00000442987.3 CICP27
ENSG00000222623.1 ENST00000410691.1 RNU6-1100P
ENSG00000233653.3 ENST00000432723.3 CICP7
ENSG00000235249.1 ENST00000426406.1 OR4F29
ENSG00000269732.1 ENST00000437905.2 WBP1LP7
ENSG00000225972.1 ENST00000416931.1 MTND1P23
ENSG00000225630.1 ENST00000457540.1 MTND2P28
ENSG00000240409.1 ENST00000467115.1 MTATP8P1
ENSG00000248527.1 ENST00000514057.1 MTATP6P1
ENSG00000268663.1 ENST00000438434.2 WBP1LP6
ENSG00000185097.2 ENST00000332831.2 OR4F16
ENSG00000229376.3 ENST00000440782.3 CICP3
ENSG00000223181.1 ENST00000411249.1 RNU6-1199P
ENSG00000177757.1 ENST00000326734.1 FAM87B
ENSG00000225880.4 ENST00000536430.1 LINC00115
ENSG00000230368.2 ENST0000042

ENSG00000213366.8 ENST00000369827.3 GSTM2
ENSG00000134184.8 ENST00000369819.2 GSTM1
ENSG00000134201.6 ENST00000369812.5 GSTM5
ENSG00000134202.6 ENST00000256594.3 GSTM3
ENSG00000198758.6 ENST00000477568.1 EPS8L3
ENSG00000224927.2 ENST00000432147.2 NDUFA5P10
ENSG00000184371.9 ENST00000344188.5 CSF1
ENSG00000168710.13 ENST00000393614.4 AHCYL1
ENSG00000143093.10 ENST00000369794.2 STRIP1
ENSG00000156150.6 ENST00000369792.4 ALX3
ENSG00000186150.3 ENST00000334179.3 UBL4B
ENSG00000197106.6 ENST00000465159.1 SLC6A17
ENSG00000224965.1 ENST00000455967.1 KCNC4-AS1
ENSG00000116396.9 ENST00000459877.1 KCNC4
ENSG00000162775.10 ENST00000487146.2 RBM15
ENSG00000168679.13 ENST00000437429.2 SLC16A4
ENSG00000224699.4 ENST00000597455.1 LAMTOR5-AS1
ENSG00000134248.9 ENST00000531779.1 LAMTOR5
ENSG00000143125.5 ENST00000271331.3 PROK1
ENSG00000240194.2 ENST00000420853.1 CYMP
ENSG00000143105.5 ENST00000369771.2 KCNA10
ENSG00000177301.9 ENST00000440270.1 KCNA2
ENSG00000177272.7 ENST00000369769.2 KCNA3
ENSG00000

ENSG00000212916.3 ENST00000418460.1 MAP10
ENSG00000206835.1 ENST00000384108.1 RNU1-74P
ENSG00000135778.7 ENST00000490098.1 NTPCR
ENSG00000135749.14 ENST00000430153.1 PCNXL2
ENSG00000231940.1 ENST00000443360.1 RPS7P3
ENSG00000252501.1 ENST00000516692.1 RNU4-77P
ENSG00000135750.10 ENST00000472869.1 KCNK1
ENSG00000265744.1 ENST00000585213.1 MIR4427
ENSG00000183780.8 ENST00000366617.3 SLC35F3
ENSG00000236101.1 ENST00000445367.1 RAC1P7
ENSG00000264377.1 ENST00000583284.1 MIR4671
ENSG00000168275.10 ENST00000366612.1 COA6
ENSG00000059588.5 ENST00000463793.1 TARBP1
ENSG00000168264.6 ENST00000491430.1 IRF2BP2
ENSG00000224939.1 ENST00000429269.1 LINC00184
ENSG00000201638.1 ENST00000364768.1 RNY4P16
ENSG00000239690.2 ENST00000493933.2 RN7SL668P
ENSG00000173726.6 ENST00000473132.1 TOMM20
ENSG00000207181.1 ENST00000384452.1 SNORA14B
ENSG00000188739.10 ENST00000366606.3 RBM34
ENSG00000054267.16 ENST00000264183.3 ARID4B
ENSG00000232686.1 ENST00000357671.6 ARID4B-IT1
ENSG00000263439.1 ENST00000585119.

ENSG00000224337.1 ENST00000427218.1 FAM8A3P
ENSG00000187123.10 ENST00000392854.4 LYPD6
ENSG00000168288.8 ENST00000460311.1 MMADHC
ENSG00000207270.1 ENST00000384540.1 RNU6-601P
ENSG00000115963.9 ENST00000439275.1 RND3
ENSG00000213201.3 ENST00000435729.1 FABP5P10
ENSG00000184898.6 ENST00000409092.1 RBM43
ENSG00000123609.6 ENST00000477072.1 NMI
ENSG00000123610.3 ENST00000460812.1 TNFAIP6
ENSG00000264684.1 ENST00000585225.1 MIR4773-2
ENSG00000242113.2 ENST00000498656.2 RN7SL124P
ENSG00000080345.13 ENST00000433166.2 RIF1
ENSG00000183091.15 ENST00000397336.2 NEB
ENSG00000162980.12 ENST00000487723.1 ARL5A
ENSG00000182389.14 ENST00000360283.6 CACNB4
ENSG00000115145.5 ENST00000494589.1 STAM2
ENSG00000234272.1 ENST00000414021.1 RPL30P2
ENSG00000157827.15 ENST00000497192.1 FMNL2
ENSG00000196504.11 ENST00000486100.1 PRPF40A
ENSG00000177917.6 ENST00000463690.1 ARL6IP6
ENSG00000226213.1 ENST00000446233.1 UBQLN4P2
ENSG00000214025.2 ENST00000397288.2 ATP5F1P4
ENSG00000177519.3 ENST00000325926.3 RPRM
E

ENSG00000168273.3 ENST00000476842.1 SMIM4
ENSG00000163939.14 ENST00000424867.1 PBRM1
ENSG00000252768.1 ENST00000516959.1 RNU6-856P
ENSG00000221518.1 ENST00000408591.1 RNU6ATAC16P
ENSG00000163938.12 ENST00000497356.1 GNL3
ENSG00000212493.1 ENST00000391191.1 SNORD19
ENSG00000238862.1 ENST00000459623.1 SNORD19B
ENSG00000212452.1 ENST00000391150.1 SNORD69
ENSG00000016864.12 ENST00000266014.5 GLT8D1
ENSG00000114902.9 ENST00000474945.1 SPCS1
ENSG00000114904.8 ENST00000535191.1 NEK4
ENSG00000055957.6 ENST00000537050.1 ITIH1
ENSG00000162267.8 ENST00000493136.1 ITIH3
ENSG00000055955.11 ENST00000434759.3 ITIH4
ENSG00000239799.1 ENST00000478366.1 ITIH4-AS1
ENSG00000272573.1 ENST00000446157.2 MUSTN1
ENSG00000213533.7 ENST00000467979.1 TMEM110
ENSG00000163935.9 ENST00000394750.1 SFMBT1
ENSG00000242142.1 ENST00000459980.1 SERBP1P3
ENSG00000163933.5 ENST00000467048.1 RFT1
ENSG00000163932.9 ENST00000464818.1 PRKCD
ENSG00000163931.11 ENST00000296289.6 TKT
ENSG00000162290.12 ENST00000606822.1 DCP1A
ENSG

ENSG00000145147.15 ENST00000508541.1 SLIT2
ENSG00000248228.1 ENST00000515882.1 SLIT2-IT1
ENSG00000207732.1 ENST00000384999.1 MIR218-1
ENSG00000163138.14 ENST00000538990.1 PACRGL
ENSG00000185774.10 ENST00000359001.5 KCNIP4
ENSG00000239001.1 ENST00000458780.1 RNU6-420P
ENSG00000152990.9 ENST00000506346.1 GPR125
ENSG00000249948.2 ENST00000508264.1 GBA3
ENSG00000237350.1 ENST00000421246.1 CDC42P6
ENSG00000251220.2 ENST00000515267.2 RFPL4AP3
ENSG00000109819.4 ENST00000514494.1 PPARGC1A
ENSG00000109606.8 ENST00000513092.1 DHX15
ENSG00000207697.1 ENST00000384964.1 MIR573
ENSG00000243005.2 ENST00000476316.2 RN7SL16P
ENSG00000249256.2 ENST00000514223.2 ATP5LP3
ENSG00000248444.1 ENST00000508873.1 HNRNPA1P65
ENSG00000109610.5 ENST00000382120.3 SOD3
ENSG00000181982.13 ENST00000428116.2 CCDC149
ENSG00000153012.7 ENST00000512108.1 LGI2
ENSG00000109618.7 ENST00000302922.3 SEPSECS
ENSG00000038210.9 ENST00000264864.6 PI4K2B
ENSG00000168228.10 ENST00000508058.1 ZCCHC4
ENSG00000053900.6 ENST00000506973.1

ENSG00000229666.1 ENST00000451496.1 MAST4-AS1
ENSG00000134061.4 ENST00000515027.1 CD180
ENSG00000252108.1 ENST00000516299.1 RNU6-1232P
ENSG00000213864.3 ENST00000519625.1 EEF1B2P2
ENSG00000145675.10 ENST00000274335.5 PIK3R1
ENSG00000250289.1 ENST00000510889.1 VWA8P1
ENSG00000249183.1 ENST00000504193.1 SUMO2P4
ENSG00000145740.14 ENST00000511158.1 SLC30A5
ENSG00000265968.1 ENST00000579642.1 RN7SL103P
ENSG00000134057.10 ENST00000513102.1 CCNB1
ENSG00000153044.5 ENST00000510742.1 CENPH
ENSG00000134056.7 ENST00000503793.1 MRPS36
ENSG00000134058.6 ENST00000513629.1 CDK7
ENSG00000183323.8 ENST00000396499.1 CCDC125
ENSG00000213830.3 ENST00000508046.1 CFL1P5
ENSG00000215006.4 ENST00000503744.1 CHCHD2P2
ENSG00000085231.9 ENST00000328663.4 TAF9
ENSG00000152942.14 ENST00000358030.2 RAD17
ENSG00000152939.10 ENST00000413223.2 MARVELD2
ENSG00000263737.1 ENST00000579812.1 RN7SL476P
ENSG00000244717.1 ENST00000496370.1 RPS27P14
ENSG00000266477.1 ENST00000584555.1 RN7SL616P
ENSG00000197822.6 ENST00000542

ENSG00000231402.1 ENST00000428639.1 WASF5P
ENSG00000234745.5 ENST00000474381.1 HLA-B
ENSG00000228432.1 ENST00000414224.1 DHFRP2
ENSG00000201658.1 ENST00000364788.1 RNU6-283P
ENSG00000230994.1 ENST00000449999.1 FGFR3P1
ENSG00000223702.1 ENST00000424108.1 ZDHHC20P2
ENSG00000225851.1 ENST00000425174.1 HLA-S
ENSG00000206337.6 ENST00000541196.1 HCP5
ENSG00000204520.8 ENST00000421350.1 MICA
ENSG00000204516.5 ENST00000538442.1 MICB
ENSG00000219797.2 ENST00000403866.2 PPIAP9
ENSG00000225499.1 ENST00000416625.1 RPL15P4
ENSG00000204511.2 ENST00000376191.2 MCCD1
ENSG00000198563.9 ENST00000449074.2 DDX39B
ENSG00000201785.1 ENST00000364915.1 SNORD117
ENSG00000265236.1 ENST00000584275.1 SNORD84
ENSG00000234006.1 ENST00000416684.1 DDX39B-AS1
ENSG00000213760.6 ENST00000481998.1 ATP6V1G2
ENSG00000204498.6 ENST00000473655.1 NFKBIL1
ENSG00000226979.4 ENST00000471842.1 LTA
ENSG00000232810.3 ENST00000449264.2 TNF
ENSG00000227507.2 ENST00000482429.1 LTB
ENSG00000204482.6 ENST00000376111.4 LST1
ENSG000002044

ENSG00000078399.11 ENST00000489695.1 HOXA9
ENSG00000253187.2 ENST00000523790.1 HOXA-AS4
ENSG00000253293.3 ENST00000519593.1 HOXA10
ENSG00000005073.5 ENST00000006015.3 HOXA11
ENSG00000240990.5 ENST00000479766.1 HOXA11-AS
ENSG00000106031.6 ENST00000222753.4 HOXA13
ENSG00000243766.3 ENST00000472494.1 HOTTIP
ENSG00000253405.1 ENST00000519218.1 EVX1-AS
ENSG00000106038.8 ENST00000535619.1 EVX1
ENSG00000213783.4 ENST00000446039.2 RPL35P4
ENSG00000236569.3 ENST00000446472.3 HNRNPA1P73
ENSG00000233830.2 ENST00000453115.1 EIF4HP1
ENSG00000213781.3 ENST00000423961.1 PSMC1P2
ENSG00000106049.4 ENST00000496814.1 HIBADH
ENSG00000106052.9 ENST00000433216.2 TAX1BP1
ENSG00000153814.7 ENST00000454041.1 JAZF1
ENSG00000265382.1 ENST00000577690.1 RN7SL365P
ENSG00000206623.1 ENST00000383896.1 RNU6-979P
ENSG00000234336.2 ENST00000444500.2 JAZF1-AS1
ENSG00000146592.12 ENST00000498316.3 CREB5
ENSG00000176734.3 ENST00000322982.3 TRIL
ENSG00000106066.9 ENST00000437527.1 CPVL
ENSG00000106069.16 ENST00000421775.2 C

ENSG00000236827.1 ENST00000443854.1 LINC00529
ENSG00000239315.3 ENST00000455290.2 RPL19P13
ENSG00000104643.5 ENST00000528389.1 MTMR9
ENSG00000177710.5 ENST00000382435.4 SLC35G5
ENSG00000154316.10 ENST00000326605.4 TDH
ENSG00000242483.2 ENST00000481765.2 RN7SL293P
ENSG00000184608.4 ENST00000284481.3 C8orf12
ENSG00000199368.1 ENST00000362498.1 RNU6-1084P
ENSG00000154319.10 ENST00000528111.1 FAM167A
ENSG00000136573.8 ENST00000526097.1 BLK
ENSG00000170983.3 ENST00000304233.3 LINC00208
ENSG00000136574.13 ENST00000526021.1 GATA4
ENSG00000255394.2 ENST00000525043.2 C8orf49
ENSG00000154328.11 ENST00000524741.1 NEIL2
ENSG00000227203.2 ENST00000424509.1 SUB1P1
ENSG00000079459.8 ENST00000538689.1 FDFT1
ENSG00000164733.16 ENST00000415599.2 CTSB
ENSG00000254948.1 ENST00000524483.1 OR7E158P
ENSG00000206014.5 ENST00000341827.4 OR7E161P
ENSG00000205884.2 ENST00000382209.2 DEFB136
ENSG00000205883.2 ENST00000382208.2 DEFB135
ENSG00000205882.4 ENST00000382205.4 DEFB134
ENSG00000254817.1 ENST00000528715.1

ENSG00000234734.5 ENST00000603237.1 SPATA31A7
ENSG00000237086.2 ENST00000451936.2 RBPJP2
ENSG00000243382.2 ENST00000411918.2 ATP5A1P10
ENSG00000233414.1 ENST00000424050.1 CNN2P5
ENSG00000235327.1 ENST00000435658.1 CYP4F61P
ENSG00000234430.1 ENST00000457659.1 SNX18P6
ENSG00000202474.1 ENST00000365604.1 RNA5SP283
ENSG00000184523.3 ENST00000328411.3 PTGER4P2
ENSG00000237451.3 ENST00000478512.1 CDK2AP2P2
ENSG00000238245.2 ENST00000449815.2 MYO5BP2
ENSG00000206946.1 ENST00000384219.1 RNU6-156P
ENSG00000198312.4 ENST00000457640.1 BMS1P9
ENSG00000176115.8 ENST00000322309.5 AQP7P4
ENSG00000186466.4 ENST00000334576.3 AQP7P1
ENSG00000237238.2 ENST00000443754.1 BMS1P10
ENSG00000232833.4 ENST00000455764.2 FAM27E3
ENSG00000240165.2 ENST00000491684.2 RN7SL787P
ENSG00000196774.3 ENST00000377477.2 ANKRD20A1
ENSG00000238397.1 ENST00000391117.1 RNU6-368P
ENSG00000228656.1 ENST00000431872.1 MYO5BP3
ENSG00000226020.4 ENST00000422476.2 CDK2AP2P3
ENSG00000239684.2 ENST00000458150.2 PTGER4P3
ENSG00000207277.

ENSG00000166295.4 ENST00000470481.2 ANAPC16
ENSG00000168209.4 ENST00000471240.1 DDIT4
ENSG00000148719.10 ENST00000394903.2 DNAJB12
ENSG00000107745.12 ENST00000401998.3 MICU1
ENSG00000202513.1 ENST00000365643.1 RNU6-805P
ENSG00000231471.1 ENST00000437647.1 HMGN2P34
ENSG00000156026.10 ENST00000536019.1 MCU
ENSG00000266719.1 ENST00000583078.1 MIR4676
ENSG00000138315.8 ENST00000334011.5 OIT3
ENSG00000215086.2 ENST00000428039.1 NPM1P24
ENSG00000138308.5 ENST00000373032.3 PLA2G12B
ENSG00000213700.3 ENST00000396131.2 RPL17P50
ENSG00000122884.8 ENST00000440381.1 P4HA1
ENSG00000166321.9 ENST00000537969.1 NUDT13
ENSG00000122882.6 ENST00000610256.1 ECD
ENSG00000138286.10 ENST00000468462.1 FAM149B1
ENSG00000213551.4 ENST00000512551.1 DNAJC9
ENSG00000227382.1 ENST00000455776.1 EIF4A2P2
ENSG00000182180.9 ENST00000416782.2 MRPS16
ENSG00000156042.13 ENST00000394865.1 TTC18
ENSG00000236756.4 ENST00000513954.1 DNAJC9-AS1
ENSG00000200356.1 ENST00000363486.1 RNU6-833P
ENSG00000138279.11 ENST00000535178.1 

ENSG00000176567.1 ENST00000320048.1 OR4X1
ENSG00000176555.1 ENST00000319988.1 OR4S1
ENSG00000176547.7 ENST00000319856.4 OR4C3
ENSG00000197161.6 ENST00000415304.1 OR4C4P
ENSG00000176540.3 ENST00000319813.3 OR4C5
ENSG00000182565.8 ENST00000530855.1 OR4C2P
ENSG00000184789.6 ENST00000434991.1 OR4C10P
ENSG00000254925.1 ENST00000532368.1 OR4C9P
ENSG00000255215.1 ENST00000529879.1 OR4R1P
ENSG00000237388.2 ENST00000446524.1 OR4A47
ENSG00000255113.1 ENST00000531359.1 OR4A48P
ENSG00000255304.1 ENST00000527255.1 OR4A46P
ENSG00000254832.1 ENST00000533878.1 OR4A40P
ENSG00000255534.1 ENST00000532938.1 OR4A43P
ENSG00000213607.4 ENST00000529954.1 OR4A45P
ENSG00000255297.1 ENST00000533053.1 OR4A41P
ENSG00000254674.1 ENST00000531397.1 OR4A42P
ENSG00000255053.1 ENST00000524504.1 OR4A44P
ENSG00000249910.2 ENST00000544390.1 TRIM51CP
ENSG00000220948.4 ENST00000534741.1 TRIM51GP
ENSG00000254764.1 ENST00000531490.1 TRIM53CP
ENSG00000182053.8 ENST00000332682.7 TRIM49B
ENSG00000214891.4 ENST00000530230.1 TRIM64

ENSG00000165682.10 ENST00000348658.4 CLEC1B
ENSG00000256660.1 ENST00000539155.1 CLEC12B
ENSG00000197992.2 ENST00000538482.1 CLEC9A
ENSG00000150048.6 ENST00000414501.2 CLEC1A
ENSG00000223042.1 ENST00000411110.1 RN7SKP161
ENSG00000255734.1 ENST00000406003.2 HNRNPABP1
ENSG00000172243.13 ENST00000525605.1 CLEC7A
ENSG00000173391.4 ENST00000543414.1 OLR1
ENSG00000165685.4 ENST00000536952.1 TMEM52B
ENSG00000139112.6 ENST00000545290.1 GABARAPL1
ENSG00000134539.12 ENST00000350274.5 KLRD1
ENSG00000213809.4 ENST00000396451.4 KLRK1
ENSG00000183542.4 ENST00000309384.1 KLRC4
ENSG00000205810.4 ENST00000381904.2 KLRC3
ENSG00000205809.5 ENST00000381901.1 KLRC2
ENSG00000134545.9 ENST00000544822.1 KLRC1
ENSG00000257016.1 ENST00000537614.1 SLC25A39P2
ENSG00000256667.2 ENST00000521068.2 KLRAP1
ENSG00000111196.5 ENST00000381881.2 MAGOHB
ENSG00000060140.4 ENST00000541561.1 STYK1
ENSG00000060138.8 ENST00000540747.1 YBX3
ENSG00000121377.2 ENST00000240687.2 TAS2R7
ENSG00000121314.2 ENST00000240615.2 TAS2R8
ENSG

ENSG00000132970.8 ENST00000361042.4 WASF3
ENSG00000234031.1 ENST00000425466.1 RPS3AP44
ENSG00000237001.2 ENST00000585599.1 WASF3-AS1
ENSG00000132975.6 ENST00000405846.3 GPR12
ENSG00000230256.1 ENST00000395943.3 FGFR1OP2P1
ENSG00000223782.1 ENST00000413365.1 RPS21P8
ENSG00000218198.2 ENST00000402278.2 RPS20P32
ENSG00000152484.9 ENST00000282344.6 USP12
ENSG00000232162.1 ENST00000440657.1 USP12-AS1
ENSG00000230641.1 ENST00000452222.1 USP12-AS2
ENSG00000234772.1 ENST00000444744.1 LINC00412
ENSG00000122026.6 ENST00000485756.1 RPL21
ENSG00000207500.1 ENST00000384769.1 SNORD102
ENSG00000207051.1 ENST00000384323.1 SNORA27
ENSG00000122035.6 ENST00000480803.1 RASL11A
ENSG00000252247.1 ENST00000516438.1 RNU6-70P
ENSG00000229609.1 ENST00000431171.1 LINC01079
ENSG00000201242.1 ENST00000364372.1 RNY1P1
ENSG00000122034.8 ENST00000482655.1 GTF3A
ENSG00000122033.10 ENST00000405591.2 MTIF3
ENSG00000252499.1 ENST00000516690.1 RNU6-63P
ENSG00000139517.6 ENST00000316334.3 LNX2
ENSG00000186184.11 ENST000003

ENSG00000207172.1 ENST00000384443.1 RNU6-1162P
ENSG00000126821.7 ENST00000247225.6 SGPP1
ENSG00000252749.1 ENST00000516940.1 RNU7-116P
ENSG00000270912.1 ENST00000604589.1 RPS28P1
ENSG00000054654.11 ENST00000357395.3 SYNE2
ENSG00000140009.14 ENST00000267525.6 ESR2
ENSG00000221537.1 ENST00000408610.1 MIR548H1
ENSG00000234911.1 ENST00000447107.1 TEX21P
ENSG00000100714.11 ENST00000554353.1 MTHFD1
ENSG00000089775.7 ENST00000394715.1 ZBTB25
ENSG00000179841.8 ENST00000320636.5 AKAP5
ENSG00000126804.9 ENST00000394712.2 ZBTB1
ENSG00000126803.8 ENST00000247207.6 HSPA2
ENSG00000165807.3 ENST00000556023.1 PPP1R36
ENSG00000126822.11 ENST00000492928.1 PLEKHG3
ENSG00000070182.13 ENST00000542895.1 SPTB
ENSG00000252497.1 ENST00000516688.1 RPPH1-2P
ENSG00000258289.3 ENST00000359118.2 CHURC1
ENSG00000176153.10 ENST00000557323.1 GPX2
ENSG00000139998.10 ENST00000436278.2 RAB15
ENSG00000257365.3 ENST00000542227.1 FNTB
ENSG00000125952.14 ENST00000554709.1 MAX
ENSG00000266531.1 ENST00000582134.1 MIR4706
ENSG0

ENSG00000242542.2 ENST00000486185.2 RN7SL489P
ENSG00000140478.10 ENST00000434739.3 GOLGA6D
ENSG00000244438.2 ENST00000488659.2 RN7SL327P
ENSG00000260357.1 ENST00000567850.1 DNM1P34
ENSG00000259790.1 ENST00000568553.1 ANP32BP1
ENSG00000140365.11 ENST00000567935.1 COMMD4
ENSG00000140398.9 ENST00000569758.1 NEIL1
ENSG00000207636.1 ENST00000384904.1 MIR631
ENSG00000140400.10 ENST00000565652.1 MAN2C1
ENSG00000169375.11 ENST00000565264.1 SIN3A
ENSG00000241890.1 ENST00000484355.1 RPL13P4
ENSG00000169410.5 ENST00000561731.1 PTPN9
ENSG00000169371.9 ENST00000371091.5 SNUPN
ENSG00000177971.7 ENST00000403490.1 IMP3
ENSG00000173548.8 ENST00000569152.1 SNX33
ENSG00000173546.7 ENST00000308508.5 CSPG4
ENSG00000182950.2 ENST00000332145.2 ODF3L1
ENSG00000246877.1 ENST00000501931.1 DNM1P35
ENSG00000261043.2 ENST00000580760.1 MIR4313
ENSG00000241807.2 ENST00000480656.2 RN7SL319P
ENSG00000261820.1 ENST00000564895.1 DNM1P49
ENSG00000140367.7 ENST00000338677.4 UBE2Q2
ENSG00000266308.1 ENST00000581311.1 RN7SL

ENSG00000167699.9 ENST00000536578.1 GLOD4
ENSG00000171861.6 ENST00000571157.1 RNMTL1
ENSG00000167693.12 ENST00000538650.1 NXN
ENSG00000177370.4 ENST00000327158.4 TIMM22
ENSG00000159842.10 ENST00000571543.1 ABR
ENSG00000264429.1 ENST00000578566.1 MIR3183
ENSG00000205899.3 ENST00000391429.1 BHLHA9
ENSG00000184811.3 ENST00000333813.3 TUSC5
ENSG00000108953.12 ENST00000498643.1 YWHAE
ENSG00000167193.7 ENST00000572145.1 CRK
ENSG00000197879.10 ENST00000361007.2 MYO1C
ENSG00000132376.15 ENST00000397335.3 INPP5K
ENSG00000236618.2 ENST00000425081.2 PITPNA-AS1
ENSG00000174238.10 ENST00000539476.1 PITPNA
ENSG00000167703.10 ENST00000382147.4 SLC43A2
ENSG00000243704.2 ENST00000475661.2 RN7SL105P
ENSG00000074660.11 ENST00000348987.3 SCARF1
ENSG00000167705.7 ENST00000573398.1 RILP
ENSG00000174231.12 ENST00000571346.1 PRPF8
ENSG00000185561.8 ENST00000330676.6 TLCD2
ENSG00000186594.8 ENST00000362190.1 MIR22HG
ENSG00000167716.14 ENST00000545662.1 WDR81
ENSG00000167711.9 ENST00000324015.3 SERPINF2
ENSG000

ENSG00000214140.6 ENST00000587063.1 PRCD
ENSG00000163597.10 ENST00000364968.1 SNHG16
ENSG00000199961.1 ENST00000363091.1 SNORD1B
ENSG00000070731.5 ENST00000592508.1 ST6GALNAC2
ENSG00000070526.10 ENST00000589992.1 ST6GALNAC1
ENSG00000199735.1 ENST00000362865.1 RNU6-227P
ENSG00000182534.9 ENST00000589082.1 MXRA7
ENSG00000070495.10 ENST00000589982.1 JMJD6
ENSG00000181038.9 ENST00000588964.1 METTL23
ENSG00000161547.10 ENST00000358156.6 SRSF2
ENSG00000092931.7 ENST00000355954.3 MFSD11
ENSG00000200257.1 ENST00000363387.1 RNU6-97P
ENSG00000267535.1 ENST00000589914.1 LINC00868
ENSG00000167889.8 ENST00000568598.1 MGAT5B
ENSG00000129657.10 ENST00000413679.2 SEC14L1
ENSG00000234912.5 ENST00000515981.1 LINC00338
ENSG00000222808.1 ENST00000410876.1 RNU4-47P
ENSG00000262870.1 ENST00000575927.1 CYCSP40
ENSG00000184640.13 ENST00000449803.2 SEPT9
ENSG00000264060.1 ENST00000579394.1 MIR4316
ENSG00000238898.1 ENST00000459500.1 RNU1-80P
ENSG00000078687.12 ENST00000544502.1 TNRC6C
ENSG00000267463.1 ENST000

ENSG00000089327.10 ENST00000392217.3 FXYD5
ENSG00000177558.3 ENST00000324675.3 FAM187B
ENSG00000105699.12 ENST00000427250.1 LSR
ENSG00000105698.11 ENST00000600898.1 USF2
ENSG00000105697.3 ENST00000593580.1 HAMP
ENSG00000105695.10 ENST00000597162.1 MAG
ENSG00000012124.10 ENST00000270311.6 CD22
ENSG00000263397.1 ENST00000578146.1 MIR5196
ENSG00000126266.2 ENST00000246553.2 FFAR1
ENSG00000185897.6 ENST00000594310.1 FFAR3
ENSG00000126251.5 ENST00000597214.1 GPR42
ENSG00000268222.1 ENST00000596811.1 EEF1A1P7
ENSG00000264400.1 ENST00000584748.1 RN7SL491P
ENSG00000126262.4 ENST00000246549.2 FFAR2
ENSG00000188508.6 ENST00000338897.3 KRTDAP
ENSG00000161249.16 ENST00000440396.1 DMKN
ENSG00000189001.6 ENST00000518157.1 SBSN
ENSG00000105679.4 ENST00000585510.1 GAPDHS
ENSG00000105677.7 ENST00000392205.1 TMEM147
ENSG00000105675.4 ENST00000590916.1 ATP4A
ENSG00000249115.4 ENST00000379045.2 HAUS5
ENSG00000126254.7 ENST00000360475.4 RBM42
ENSG00000105672.10 ENST00000402764.2 ETV2
ENSG00000126267.4 ENST

ENSG00000101074.3 ENST00000217043.2 R3HDML
ENSG00000101076.12 ENST00000415691.2 HNF4A
ENSG00000266151.1 ENST00000578301.1 MIR3646
ENSG00000168746.6 ENST00000372910.3 C20orf62
ENSG00000226243.1 ENST00000412502.1 RPL37AP1
ENSG00000124120.6 ENST00000461134.1 TTPAL
ENSG00000132824.9 ENST00000541235.1 SERINC3
ENSG00000168734.9 ENST00000372882.3 PKIG
ENSG00000196839.8 ENST00000535573.1 ADA
ENSG00000064205.6 ENST00000471629.1 WISP2
ENSG00000124249.5 ENST00000372861.3 KCNK15
ENSG00000101098.8 ENST00000541604.2 RIMS4
ENSG00000263911.1 ENST00000579266.1 RN7SL31P
ENSG00000166913.8 ENST00000479758.1 YWHAB
ENSG00000101104.8 ENST00000537323.1 PABPC1L
ENSG00000025772.7 ENST00000372813.3 TOMM34
ENSG00000227477.1 ENST00000445571.1 STK4-AS1
ENSG00000101109.7 ENST00000396731.4 STK4
ENSG00000124134.4 ENST00000537075.1 KCNS1
ENSG00000175121.7 ENST00000372789.4 WFDC5
ENSG00000168703.5 ENST00000372785.3 WFDC12
ENSG00000124102.4 ENST00000243924.3 PI3
ENSG00000124233.10 ENST00000244069.6 SEMG1
ENSG00000124157.

ENSG00000100429.13 ENST00000482213.1 HDAC10
ENSG00000188130.9 ENST00000395778.3 MAPK12
ENSG00000185386.10 ENST00000449719.2 MAPK11
ENSG00000196576.10 ENST00000359337.4 PLXNB2
ENSG00000205593.7 ENST00000460087.1 DENND6B
ENSG00000100239.11 ENST00000470046.1 PPP6R2
ENSG00000242463.2 ENST00000470627.2 RN7SL500P
ENSG00000100241.16 ENST00000390679.3 SBF1
ENSG00000128165.7 ENST00000362068.2 ADM2
ENSG00000100253.8 ENST00000451761.1 MIOX
ENSG00000100258.13 ENST00000380796.3 LMF2
ENSG00000025770.14 ENST00000522304.1 NCAPH2
ENSG00000130489.8 ENST00000535425.1 SCO2
ENSG00000025708.8 ENST00000487162.1 TYMP
ENSG00000177989.9 ENST00000329363.4 ODF3B
ENSG00000130487.4 ENST00000395676.2 KLHDC7B
ENSG00000217442.3 ENST00000402753.1 SYCE3
ENSG00000205560.8 ENST00000434492.2 CPT1B
ENSG00000100288.15 ENST00000465842.1 CHKB
ENSG00000205559.3 ENST00000380711.3 CHKB-AS1
ENSG00000008735.10 ENST00000008876.5 MAPK8IP2
ENSG00000100299.13 ENST00000547805.1 ARSA
ENSG00000251322.3 ENST00000445220.2 SHANK3
ENSG0000020

ENSG00000224732.1 ENST00000436482.1 MAGEA7P
ENSG00000229829.1 ENST00000424782.1 DUTP4
ENSG00000230899.1 ENST00000427671.1 MAGEA8-AS1
ENSG00000156009.5 ENST00000542674.1 MAGEA8
ENSG00000197021.4 ENST00000497550.1 CXorf40B
ENSG00000235703.1 ENST00000413076.1 LINC00894
ENSG00000252454.1 ENST00000516645.1 MIR2114
ENSG00000234825.2 ENST00000437286.2 XRCC6P2
ENSG00000013619.9 ENST00000455522.2 MAMLD1
ENSG00000171100.10 ENST00000543350.1 MTM1
ENSG00000063601.12 ENST00000538506.1 MTMR1
ENSG00000102181.15 ENST00000320893.6 CD99L2
ENSG00000222796.1 ENST00000410864.1 RNU6-383P
ENSG00000029993.10 ENST00000448905.2 HMGB3
ENSG00000230508.1 ENST00000424919.1 RPL19P21
ENSG00000265789.1 ENST00000579077.1 MIR4330
ENSG00000234696.1 ENST00000454196.1 GPR50-AS1
ENSG00000102195.7 ENST00000218316.3 GPR50
ENSG00000160131.9 ENST00000330374.6 VMA21
ENSG00000166049.10 ENST00000464219.1 PASD1
ENSG00000130032.11 ENST00000538575.1 PRRG3
ENSG00000147378.7 ENST00000417321.1 FATE1
ENSG00000183862.5 ENST00000329903.4 C

In [101]:
set(overlap_landmark_genes).difference(new_index)

{'ADGRE5', 'ADGRG1', 'B4GAT1', 'ERO1A', 'HACD3', 'JADE2', 'KIF1BP', 'MTERF3'}

In [100]:
new_index

['DDX11L1',
 'WASH7P',
 'MIR1302-11',
 'FAM138A',
 'OR4G4P',
 'OR4G11P',
 'OR4F5',
 'ENSG00000238009.2',
 'CICP27',
 'ENSG00000237683.5',
 'ENSG00000268903.1',
 'ENSG00000239906.1',
 'ENSG00000241860.2',
 'RNU6-1100P',
 'ENSG00000241599.1',
 'ENSG00000228463.4',
 'ENSG00000237094.7',
 'ENSG00000250575.1',
 'CICP7',
 'ENSG00000224813.2',
 'OR4F29',
 'WBP1LP7',
 'ENSG00000256186.1',
 'ENSG00000236601.1',
 'ENSG00000236743.1',
 'ENSG00000236679.2',
 'ENSG00000231709.1',
 'ENSG00000235146.2',
 'ENSG00000239664.2',
 'ENSG00000230021.3',
 'ENSG00000223659.1',
 'MTND1P23',
 'MTND2P28',
 'ENSG00000237973.1',
 'ENSG00000229344.1',
 'MTATP8P1',
 'MTATP6P1',
 'ENSG00000198744.5',
 'WBP1LP6',
 'OR4F16',
 'CICP3',
 'ENSG00000224956.5',
 'ENSG00000235373.1',
 'RNU6-1199P',
 'ENSG00000240618.1',
 'ENSG00000229905.1',
 'ENSG00000228327.2',
 'ENSG00000237491.4',
 'ENSG00000230092.3',
 'ENSG00000269831.1',
 'ENSG00000240453.1',
 'FAM87B',
 'LINC00115',
 'ENSG00000228794.4',
 'FAM41C',
 'TUBB8P11',
 'ENS

In [98]:
len(overlap_landmark_genes)

967

In [96]:
len(set(new_index).intersection(overlap_landmark_genes))

959

In [70]:
gene_id_to_symbol[gene_id_to_symbol["Gene stable ID version"]=="ENSG00000223972.4"]

Unnamed: 0,Gene stable ID,Transcript stable ID,HGNC symbol,Gene stable ID version,Transcript stable ID version


In [71]:
gene_id_to_symbol[gene_id_to_symbol["Gene stable ID"]=="ENSG00000223972"]

Unnamed: 0,Gene stable ID,Transcript stable ID,HGNC symbol,Gene stable ID version,Transcript stable ID version
6230,ENSG00000223972,ENST00000456328,DDX11L1,ENSG00000223972.5,ENST00000456328.2
6231,ENSG00000223972,ENST00000450305,DDX11L1,ENSG00000223972.5,ENST00000450305.2


In [42]:
gtex_rnaseq_data.index = [x.split(".")[0] for x in gtex_rnaseq_data.index.values]

In [48]:
# label rows with gene names 
gtex_rnaseq_data.index = gtex_rnaseq_data.index.map(gene_id_to_symbol_dict)

In [53]:
gtex_rnaseq_data = gtex_rnaseq_data.loc[:, gtex_l1000_data.index].T

In [51]:
# filter landmark genes and save
filtered_gtex_l1000_data = gtex_l1000_data.loc[:, overlap_landmark_genes]
filtered_gtex_l1000_data.reset_index().to_feather(gtex_filtered_l1000_output_filename.format(filtered_gtex_l1000_data.shape[0], filtered_gtex_l1000_data.shape[1]))

In [56]:
overlap_landmark_genes

['CHAC1',
 'KCTD5',
 'APPBP2',
 'TMED10',
 'PRPF4',
 'EBNA1BP2',
 'PLA2G15',
 'ASCC3',
 'ID2',
 'CDK6',
 'NFKBIB',
 'EDN1',
 'CCNA2',
 'FAM63A',
 'NRAS',
 'MMP2',
 'RB1',
 'ETV1',
 'HMGCR',
 'POLB',
 'ABCF3',
 'HES1',
 'FCHO1',
 'MRPL19',
 'ICAM3',
 'CASP3',
 'CCND1',
 'TCFL5',
 'SPAG4',
 'ARID5B',
 'CHP1',
 'HLA-DRA',
 'MLEC',
 'DDIT4',
 'PCMT1',
 'NOS3',
 'COL1A1',
 'TIMM22',
 'BLMH',
 'AMDHD2',
 'HMOX1',
 'SQRDL',
 'LGMN',
 'UBE2A',
 'TNIP1',
 'COPS7A',
 'PLA2G4A',
 'ECD',
 'PHKG2',
 'FBXL12',
 'KDELR2',
 'SENP6',
 'RFX5',
 'KCNK1',
 'SPTLC2',
 'SCAND1',
 'PSMB10',
 'MYC',
 'TRAK2',
 'RAB21',
 'CDCA4',
 'SLC35A1',
 'INPP4B',
 'HMG20B',
 'JMJD6',
 'POP4',
 'TSKU',
 'TMEM97',
 'RNH1',
 'DNAJC15',
 'TICAM1',
 'ALDOA',
 'GAPDH',
 'EML3',
 'ST6GALNAC2',
 'GATA2',
 'PTPN6',
 'PKIG',
 'RAB31',
 'PAFAH1B3',
 'DUSP3',
 'IGF2R',
 'KIAA0907',
 'CASP2',
 'TMEM2',
 'SYK',
 'LPGAT1',
 'AKAP8L',
 'SMNDC1',
 'TXLNA',
 'CD320',
 'BCL2',
 'POLE2',
 'PACSIN3',
 'CDK2',
 'SLC5A6',
 'FASTKD5',
 'COG4',


In [61]:
len(set(list(gtex_rnaseq_data.columns.unique())).intersection(overlap_landmark_genes))

938

In [55]:
gtex_rnaseq_data

Unnamed: 0_level_0,DDX11L1,WASH7P,MIR1302-2HG,FAM138A,OR4G4P,OR4G11P,OR4F5,NaN,CICP27,NaN,...,MT-ND4,MT-TH,MT-TS2,MT-TL2,MT-ND5,MT-ND6,MT-TE,MT-CYB,MT-TT,MT-TP
cid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
GTEX-PLZ6-0326-SM-3P61J,0.0,883.0,0.0,0.0,0.0,0.0,0.0,44.0,2.0,306.0,...,966132.0,0.0,0.0,0.0,545683.0,255460.0,12.0,547035.0,0.0,0.0
GTEX-WOFM-1726-SM-3MJFA,0.0,668.0,0.0,0.0,0.0,0.0,0.0,35.0,0.0,141.0,...,569462.0,0.0,0.0,0.0,314005.0,136724.0,4.0,277571.0,0.0,1.0
GTEX-WHSE-2926-SM-3NMBG,2.0,1675.0,0.0,0.0,0.0,0.0,0.0,39.0,0.0,346.0,...,977575.0,0.0,3.0,0.0,169863.0,54910.0,2.0,579342.0,0.0,0.0
GTEX-RNOR-0011-R7A-SM-2TF4V,0.0,697.0,2.0,0.0,0.0,0.0,0.0,31.0,0.0,146.0,...,1148685.0,0.0,0.0,1.0,504484.0,119540.0,2.0,1291918.0,2.0,0.0
GTEX-NPJ8-0011-R8a-SM-2HMLG,1.0,287.0,0.0,0.0,0.0,0.0,0.0,18.0,0.0,342.0,...,1103988.0,0.0,0.0,1.0,352228.0,94934.0,0.0,737598.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
GTEX-XOT4-0726-SM-4GIAW,2.0,1331.0,0.0,0.0,0.0,0.0,0.0,58.0,4.0,1821.0,...,564548.0,0.0,0.0,0.0,100843.0,21671.0,0.0,354419.0,0.0,0.0
GTEX-TML8-1126-SM-4DXSS,2.0,961.0,0.0,0.0,0.0,0.0,0.0,36.0,0.0,65.0,...,435513.0,0.0,0.0,0.0,114549.0,40219.0,1.0,224566.0,0.0,0.0
GTEX-U8T8-1126-SM-4DXUE,45.0,1280.0,1.0,0.0,0.0,0.0,0.0,44.0,5.0,5135.0,...,696739.0,0.0,0.0,0.0,73359.0,10124.0,0.0,237628.0,0.0,0.0
GTEX-TKQ1-0926-SM-4DXU2,37.0,1096.0,8.0,0.0,0.0,0.0,0.0,53.0,4.0,1435.0,...,783850.0,1.0,0.0,1.0,74097.0,20343.0,1.0,291340.0,0.0,0.0


In [54]:
# filter landmark genes and save
filtered_gtex_rnaseq_data = gtex_rnaseq_data.loc[:, overlap_landmark_genes]
filtered_gtex_rnaseq_data.reset_index().to_feather(gtex_filtered_rnaseq_output_filename.format(filtered_gtex_rnaseq_data.shape[0], filtered_gtex_rnaseq_data.shape[1]))

KeyError: "Passing list-likes to .loc or [] with any missing labels is no longer supported. The following labels were missing: Index(['FAM63A', 'SQRDL', 'KIAA0907', 'TMEM2', 'TMEM110',\n       ...\n       'LRRC16A', 'KIAA0196', 'HIST2H2BE', 'ADCK3', 'NARFL'],\n      dtype='object', length=29). See https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#deprecate-loc-reindex-listlike"

## Sampling

### Load again?

In [6]:
l1000_output_filename

'../data/processed/L1000/L1000_filtered_GSE92742_Broad_LINCS_Level3_INF_mlr12k_n978x1319138.f'

In [43]:
# load filtered l1000
# filtered_l1000_data = pd.read_feather(l1000_output_filename)
# first_col = filtered_l1000_data.columns.tolist()[0]
# filtered_l1000_data.set_index(first_col, inplace=True)

# load filtered ARCHS4
filtered_landmark_gene_expression = pd.read_feather("../data/processed/ARCHS4/human_matrix_v9_filtered_n967x154575.f")
first_col = filtered_landmark_gene_expression.columns.tolist()[0]
filtered_landmark_gene_expression.set_index(first_col, inplace=True)

### L1000

In [10]:
n_sampling = 50000

In [11]:
filtered_l1000_data.shape

(978, 1319138)

In [13]:
filtered_l1000_data_overlap_lanemark = filtered_l1000_data.loc[overlap_landmark_genes, :]

In [14]:
filtered_l1000_data_overlap_lanemark

cid,CPC005_A375_6H_X1_B3_DUO52HI53LO:K06,CPC005_A375_6H_X2_B3_DUO52HI53LO:K06,CPC005_A375_6H_X3_B3_DUO52HI53LO:K06,CPC005_A375_6H_X1_B3_DUO52HI53LO:C19,CPC005_A375_6H_X2_B3_DUO52HI53LO:C19,CPC005_A375_6H_X3_B3_DUO52HI53LO:C19,CPC004_A375_6H_X1_B3_DUO52HI53LO:K13,CPC004_A375_6H_X2_B3_DUO52HI53LO:K13,CPC004_A375_6H_X3_B3_DUO52HI53LO:K13,CPC005_A375_6H_X1_B3_DUO52HI53LO:K20,...,PCLB003_PC3_24H_X3_B13:P15,PCLB003_PC3_24H_X3_B13:P16,PCLB003_PC3_24H_X3_B13:P17,PCLB003_PC3_24H_X3_B13:P18,PCLB003_PC3_24H_X3_B13:P19,PCLB003_PC3_24H_X3_B13:P20,PCLB003_PC3_24H_X3_B13:P21,PCLB003_PC3_24H_X3_B13:P22,PCLB003_PC3_24H_X3_B13:P23,PCLB003_PC3_24H_X3_B13:P24
CHAC1,11.16810,10.893900,10.430901,11.720950,11.92555,9.502400,10.864349,10.702049,12.259501,10.97840,...,8.481750,10.284050,10.256550,10.235350,11.426451,10.112450,7.920350,10.257500,10.396749,8.426400
KCTD5,6.95100,7.856200,7.315200,6.949699,7.11265,7.546800,5.268750,4.381825,7.181776,7.50955,...,6.284025,6.338650,5.841875,5.918650,6.233700,6.358975,6.077525,8.601749,6.356650,6.489375
APPBP2,4.95800,5.591550,5.061800,4.819000,5.88270,5.288400,6.424250,6.163150,5.610900,5.60905,...,7.162400,6.880650,6.156250,6.427650,7.573600,7.465575,7.148000,7.107300,7.219950,7.187350
TMED10,10.04060,8.976049,10.009500,10.344600,9.62410,9.695200,10.085051,10.107300,9.502450,9.89370,...,10.576850,10.366550,10.292200,10.455600,10.478149,10.425950,10.101400,10.269300,10.380700,10.328900
PRPF4,11.03820,11.084000,11.514000,11.021200,10.93430,11.181300,11.828550,10.977050,10.870951,11.09740,...,5.110700,5.231700,4.840050,4.969225,5.033400,4.853300,4.974275,5.001750,5.026475,4.977150
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
NARFL,6.69210,7.896700,6.987401,7.004200,8.02530,7.029800,7.947249,8.209750,8.109700,7.25410,...,13.027200,7.105250,13.820749,6.518150,7.770800,7.401450,13.889750,7.455450,7.708000,13.955200
DDX42,8.37745,7.291000,8.365700,8.096701,7.20000,8.633500,8.097825,8.626100,8.153650,8.28740,...,7.141500,7.846300,7.558950,7.705551,7.703825,7.610700,7.454525,7.733900,7.557300,7.648350
PAN2,6.52680,7.129300,7.116100,6.583700,6.03170,6.825601,7.531850,4.097250,6.477825,7.07425,...,3.500400,3.269250,3.254600,3.485150,5.636850,3.438000,3.767650,3.125750,3.306100,5.733400
TLR4,9.02220,6.923600,12.571500,12.468100,7.12230,8.844200,8.281349,8.848850,6.692550,9.13670,...,10.631750,11.097349,3.740050,3.820000,3.879000,4.020750,11.682900,4.201125,4.084350,4.218350


In [15]:
filtered_l1000_data_sampled = filtered_l1000_data_overlap_lanemark.sample(axis='columns', n=n_sampling).T
filtered_l1000_data_sampled.reset_index().to_feather(l1000_overlap_landmark_output_filename.format(filtered_l1000_data_sampled.shape[0], filtered_l1000_data_sampled.shape[1]))

In [16]:
filtered_l1000_data_sampled.shape

(50000, 967)

### ARCHS4

In [63]:
random_sampled_landmark_gene_expression = filtered_landmark_gene_expression.sample(axis='columns', n=n_sampling).T
random_sampled_landmark_gene_expression.reset_index().to_feather(ARCHS4_filtered_overlap_landmark_output_filename.format(random_sampled_landmark_gene_expression.shape[0], random_sampled_landmark_gene_expression.shape[1]))

ValueError: Cannot take a larger sample than population when 'replace=False'