In [1]:
import pandas as pd
import h5py
import random
import numpy as np
import numpy_indexed as npi
import time

In [2]:
# Number of ARCHS4 samples to draw at random from the filtered sample list.
n_sampling = 50_000

input filenames

In [3]:
# --- ARCHS4 ---------------------------------------------------------------
# HDF5 file holding the expression matrix and its metadata (opened with h5py below).
ARCHS4_filename = "../data/ARCHS4/human_matrix_v9.h5"
# Plain-text lists, read one entry per line further down.
archs4_all_gene_list = "../data/ARCHS4/all_gene_list.txt"
ARCHS4_filtered_sample_output_filename = "../data/processed/ARCHS4/filtered_sample_list.txt"

# --- L1000 ----------------------------------------------------------------
l1000_all_gene_list = "../data/L1000/all_gene_list.txt"
l1000_landmark_gene_list = "../data/L1000/landmark_gene_list.txt"

# --- GTEx -----------------------------------------------------------------
gtex_l1000_all_gene_list = "../data/GTEx/l1000_all_gene_list.txt"
gtex_l1000_landmark_gene_list = "../data/GTEx/l1000_landmark_gene_list.txt"
gtex_rnaseq_all_gene_list = "../data/GTEx/rnaseq_all_gene_list.txt"

output filenames

In [4]:
# Feather output path template; the two "{}" slots are filled with the number
# of rows and the number of columns of the frame being written.
ARCHS4_full_dimension_sampled_output_filename = (
    "../data/processed/ARCHS4/human_matrix_v9_filtered_n{}x{}.f"
)

# Get overlap landmark genes

In [5]:
def _read_stripped_lines(path):
    """Return every line of the text file at ``path`` with surrounding whitespace removed."""
    with open(path, "r") as handle:
        return [line.strip() for line in handle]


# One gene identifier per line in each of these files.
l1000_landmark_gene = _read_stripped_lines(l1000_landmark_gene_list)
archs4_all_gene = _read_stripped_lines(archs4_all_gene_list)
gtex_l1000_landmark_gene = _read_stripped_lines(gtex_l1000_landmark_gene_list)
gtex_rnaseq_all_gene = _read_stripped_lines(gtex_rnaseq_all_gene_list)
    



In [6]:
# Landmark genes that appear in all four lists: L1000 landmark, ARCHS4,
# GTEx L1000 landmark and GTEx RNA-seq.
overlap_landmark_genes = list(
    set(l1000_landmark_gene).intersection(
        archs4_all_gene,
        gtex_l1000_landmark_gene,
        gtex_rnaseq_all_gene,
    )
)

# common genes in ARCHS4 and GTEx RNA-seq
overlap_rnaseq_genes = list(
    set(archs4_all_gene).intersection(gtex_rnaseq_all_gene)
)

In [7]:
# Open the ARCHS4 HDF5 file read-only and pull out the expression dataset
# together with the gene and sample metadata.
print('Processing RNA-seq data.....')
h5 = h5py.File(ARCHS4_filename, mode='r')
data_file = h5['data']
expression = data_file['expression']

meta = h5['meta']
genes = list(meta['genes']['genes'])
sample_geo_list = list(meta['samples']['geo_accession'])

Processing RNA-seq data.....


In [8]:
# Read the filtered sample list, one id per line (~150K entries).
with open(ARCHS4_filtered_sample_output_filename, "r") as sample_file:
    filtered_sample_ids = [line.strip() for line in sample_file]

In [70]:
# random sampling
# Draw from a dedicated, explicitly seeded generator so that the same subset of
# samples is selected on every run and on every re-execution of this cell.
# A bare random.sample() uses the unseeded global generator, which makes the
# sampled matrix impossible to regenerate.
sampling_seed = 0
sampled_ids = random.Random(sampling_seed).sample(filtered_sample_ids, n_sampling)

In [71]:
# index of selected samples
# Look up every sampled id in the full list of GEO accessions from the HDF5 file.
# The result is paired element-by-element with sampled_ids below and used to
# select columns of the expression matrix.
# NOTE(review): presumably raises if an id is missing from sample_geo_list
# (numpy_indexed default) — confirm against the installed version.
sampled_index = npi.indices(sample_geo_list, sampled_ids)

In [72]:
# Map each column position in the expression matrix back to its sample id.
sample_index_to_id_dict = {
    column_position: sample_id
    for column_position, sample_id in zip(sampled_index, sampled_ids)
}

In [74]:
# Read the sampled columns from the HDF5 expression matrix in chunks of
# `chunk_size` samples, building one DataFrame (genes x samples) per chunk.
chunk_size = 500
sampled_expression_gene = list()

# Step over the index array itself rather than over n_sampling / chunk_size,
# so a trailing partial chunk is still read when the number of samples is not
# an exact multiple of chunk_size (it used to be dropped silently).
for chunk_number, chunk_start in enumerate(range(0, len(sampled_index), chunk_size)):
    strt_time = time.time()

    # h5py only accepts index lists in increasing order, so sort once and reuse
    # the same ordering for the column labels.
    chunk_index = sorted(sampled_index[chunk_start:chunk_start + chunk_size])

    expression_i_df = pd.DataFrame(expression[:, chunk_index])
    expression_i_df.columns = [sample_index_to_id_dict[k] for k in chunk_index]
    sampled_expression_gene.append(expression_i_df)

    # chunk number and seconds spent reading it
    print(chunk_number, time.time() - strt_time)

0 6.838150501251221
1 7.429656028747559
2 7.289673566818237
3 7.372851610183716
4 7.3234357833862305
5 7.4700281620025635
6 7.361299753189087
7 7.389813184738159
8 7.507824659347534
9 7.453062534332275
10 7.482926368713379
11 7.4464943408966064
12 7.53975248336792
13 7.583191633224487
14 7.50426173210144
15 7.644843816757202
16 7.5303053855896
17 7.552522897720337
18 7.573420286178589
19 7.760143041610718
20 7.572859764099121
21 7.539182186126709
22 7.680570125579834
23 7.515921592712402
24 7.444918394088745
25 7.405945301055908
26 7.534308910369873
27 7.543490409851074
28 7.517668008804321
29 7.499485969543457
30 7.442661285400391
31 7.458033323287964
32 7.49985408782959
33 7.471463441848755
34 7.471463680267334
35 7.578266143798828
36 7.394474267959595
37 7.435899972915649
38 7.417991876602173
39 7.603836297988892
40 7.3987884521484375
41 7.56351375579834
42 7.6281208992004395
43 7.431013822555542
44 7.542052507400513
45 7.3841774463653564
46 7.427540063858032
47 7.6438868045806885
4

In [84]:
# Join the per-chunk frames side by side: one column per sampled id.
sampled_expression_df = pd.concat(sampled_expression_gene, axis="columns")

In [85]:
# Label the rows with the gene identifiers read from the HDF5 metadata.
# This modifies sampled_expression_df in place.
sampled_expression_df.index = genes

In [86]:
# Flip the matrix so that rows and columns trade places.
sampled_expression_df = sampled_expression_df.transpose()

In [88]:
# Write the sampled matrix to Feather. reset_index() turns the row labels into
# a regular column first; the file name records the frame's row x column counts.
n_sampled_rows, n_sampled_columns = sampled_expression_df.shape
sampled_output_path = ARCHS4_full_dimension_sampled_output_filename.format(
    n_sampled_rows, n_sampled_columns
)
sampled_expression_df.reset_index().to_feather(sampled_output_path)

In [89]:
# Display the path the sampled matrix was written to.
ARCHS4_full_dimension_sampled_output_filename.format(*sampled_expression_df.shape)

'../data/processed/ARCHS4/human_matrix_v9_filtered_n50000x35238.f'

# Re-load

In [10]:
# Load a previously written sampled matrix back from its Feather file.
sampled_matrix_path = '../data/processed/ARCHS4/human_matrix_v9_filtered_n50000x35238.f'
archs4_preprocessed = pd.read_feather(sampled_matrix_path)

In [11]:
# Promote the first column of the freshly loaded frame to the row index.
# Only do so while the frame still carries the default RangeIndex it gets from
# read_feather: without this guard, running the cell a second time would
# silently turn the first gene column into the index.
if isinstance(archs4_preprocessed.index, pd.RangeIndex):
    first_column = archs4_preprocessed.columns[0]
    archs4_preprocessed = archs4_preprocessed.set_index(first_column)

In [12]:
# Preview the first five rows of the re-loaded matrix.
archs4_preprocessed.head(n=5)

Unnamed: 0_level_0,A1BG,A1CF,A2M,A2ML1,A2MP1,A3GALT2,A4GALT,A4GNT,AAAS,AACS,...,BP-21201H5.1,BP-21264C1.1,BP-2168N6.1,BP-2168N6.3,BP-2171C21.2,BP-2171C21.4,BP-2171C21.5,BP-2171C21.6,BP-2189O9.2,YR211F11.2
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
GSM2324158,0,0,6,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
GSM3369560,691,14,4,9,3,0,1373,1,1507,1910,...,3,6,0,0,0,8,8,61,11,0
GSM4792403,11,0,1,3,0,0,0,0,116,18,...,1,61,0,0,0,0,0,1,102,0
GSM3972247,71,165,25,10,0,0,567,15,4763,3177,...,90,21,0,0,3,3,2,9,38,1
GSM3900214,163,1,0,14,0,0,18,0,1433,1353,...,0,0,0,0,0,0,0,0,0,0


We retain genes with a read count greater than 10 in more than 2% of the samples (i.e. in more than 1,000 of the 50,000 sampled here).

In [16]:
# For every column, count the rows whose read count exceeds 10.
# Summing the boolean mask yields the same per-column counts as masking the
# frame and calling .count(), but avoids materialising a second, NaN-filled
# full-size copy of the matrix.
# NOTE(review): the comparison is strict, so a read count of exactly 10 is not
# counted — confirm this is the intended threshold.
sample_count_for_gene = (archs4_preprocessed > 10).sum()

In [19]:
# Keep the genes whose count is strictly above 2% of the number of rows.
min_sample_count = int(archs4_preprocessed.shape[0]*0.02)
is_high_count_gene = sample_count_for_gene > min_sample_count
genes_with_high_count = sample_count_for_gene[is_high_count_gene].index.tolist()

In [21]:
# Restrict the matrix to the retained genes and write it to Feather.
# reset_index() moves the row labels back into a regular column before writing;
# the file name records the reduced frame's row x column counts.
archs4_preprocessed_high_count_genes = archs4_preprocessed.loc[:, genes_with_high_count]
n_rows_kept, n_genes_kept = archs4_preprocessed_high_count_genes.shape
high_count_output_path = ARCHS4_full_dimension_sampled_output_filename.format(
    n_rows_kept, n_genes_kept
)
archs4_preprocessed_high_count_genes.reset_index().to_feather(high_count_output_path)

In [22]:
# Display the path the gene-filtered matrix was written to.
ARCHS4_full_dimension_sampled_output_filename.format(*archs4_preprocessed_high_count_genes.shape)

'../data/processed/ARCHS4/human_matrix_v9_filtered_n50000x25312.f'