# Script for Processing Data

In [1]:
%load_ext autoreload
%autoreload 2

import os
import pandas as pd
import numpy as np
import random
import glob
import umap
import seaborn as sns
import matplotlib.pyplot as plt
# from ggplot import *


from ruffus import *
import sys, os, h5py, random, tempfile, scipy, time,copy
import cmapPy.pandasGEXpress.parse_gctx as parse_gctx
import cmapPy.pandasGEXpress.parse_gct as parse_gct
import pipeline_support as PS
from sklearn.decomposition import PCA
from scipy import stats
from matplotlib.pyplot import imshow
from sklearn.preprocessing import MinMaxScaler

from collections import Counter
#from tensorflow_examples.models.pix2pix import pix2pix
import tensorflow as tf
from tensorflow import keras
from IPython.display import clear_output
from tensorflow_gan.python.losses import losses_impl


from numpy.random import seed

# Seed every general-purpose RNG this notebook imports so that a fresh
# "Restart & Run All" draws the same numbers each time.
randomState = 123
seed(randomState)         # NumPy's legacy global RNG
random.seed(randomState)  # Python's stdlib RNG (imported at the top of this cell)
# NOTE(review): TensorFlow's RNG is not seeded here.




Parameters

In [2]:
# Sampling size parameter (50,000).
# NOTE(review): not referenced by any cell shown in this notebook.
n_sampling = 50_000

Input Filenames

In [3]:
# --- ARCHS4 inputs ---
ARCHS4_filename = "../data/ARCHS4/human_matrix_v9.h5"
archs4_all_gene_list = "../data/ARCHS4/all_gene_list.txt"

# --- L1000 inputs ---
l1000_all_gene_list = "../data/L1000/all_gene_list.txt"
l1000_landmark_gene_list = "../data/L1000/landmark_gene_list.txt"

# --- GTEx inputs ---
gtex_l1000_all_gene_list = "../data/GTEx/l1000_all_gene_list.txt"
gtex_l1000_landmark_gene_list = "../data/GTEx/l1000_landmark_gene_list.txt"
gtex_rnaseq_all_gene_list = "../data/GTEx/rnaseq_all_gene_list.txt"

Output Filenames

In [4]:
# Text file that receives the IDs of the samples kept by the filtering step.
ARCHS4_filtered_sample_output_filename = "../data/processed/ARCHS4/filtered_sample_list.txt"
# Feather path template: the two "{}" slots take the (rows, columns) shape of the
# saved frame; the save cell's captured output shows "..._n967x154575.f".
ARCHS4_filtered_overlap_landmark_output_filename = "../data/processed/ARCHS4/human_matrix_v9_filtered_n{}x{}.f"

## Get overlapping landmark genes

In [5]:
def _read_stripped_lines(path):
    """Return the lines of the text file at ``path``, each stripped of surrounding whitespace."""
    with open(path, "r") as handle:
        return [line.strip() for line in handle]


# One list per input file, one entry per line of that file.
l1000_landmark_gene = _read_stripped_lines(l1000_landmark_gene_list)
archs4_all_gene = _read_stripped_lines(archs4_all_gene_list)
gtex_l1000_landmark_gene = _read_stripped_lines(gtex_l1000_landmark_gene_list)
gtex_rnaseq_all_gene = _read_stripped_lines(gtex_rnaseq_all_gene_list)



In [6]:
# Entries present in all four lists: L1000 landmark, ARCHS4 (all), GTEx L1000
# landmark and GTEx RNA-seq (all).
# sorted() pins the order: iterating a set of strings can give a different
# order in every kernel session, which makes anything derived from the list
# order irreproducible.
overlap_landmark_genes = sorted(
    set(l1000_landmark_gene).intersection(
        archs4_all_gene,
        gtex_l1000_landmark_gene,
        gtex_rnaseq_all_gene,
    )
)
# common genes in ARCHS4 and GTEx RNA-seq (same stable ordering)
overlap_rnaseq_genes = sorted(set(archs4_all_gene).intersection(gtex_rnaseq_all_gene))

## Initial: Load ARCHS4 RNA-seq

Data preprocessing code from https://github.com/MaayanLab/L1k2RNA-seq-2.0/blob/cb5eaa3a447b502e32db6c1aae84eaa94d0ce0f4/pipeline/pipeline.py#L43

In [7]:
# Import ARCHS4 RNA-seq samples
print('Processing RNA-seq data.....')
# The handle is not closed here because later cells keep reading from `h5`
# and from the `expression` dataset.
h5 = h5py.File(ARCHS4_filename, 'r')
data_file = h5['data']
expression = data_file['expression']
# Depending on the h5py version and on how the strings are stored, string
# datasets can come back as bytes. Decode those so the names compare equal to
# the str entries read from the gene-list text files; str values pass through
# unchanged.
genes = [
    x.decode('utf-8') if isinstance(x, bytes) else x
    for x in h5['meta']['genes']['genes']
]

Processing RNA-seq data.....


In [11]:
# landmark gene index
# Walk `genes` once and test membership against a set: the positions and the
# names come from the same pass (so they always line up) and every lookup is
# O(1) instead of a scan through the whole overlap list.
_landmark_gene_lookup = set(overlap_landmark_genes)
_landmark_gene_hits = [(i, x) for i, x in enumerate(genes) if x in _landmark_gene_lookup]
archs4_landmark_gene_index = [i for i, _ in _landmark_gene_hits]
archs4_landmark_gene_names = [x for _, x in _landmark_gene_hits]

In [10]:
# Select the landmark-gene rows of `expression` and wrap the resulting array
# in a pandas DataFrame whose row labels are the matching gene names.
landmark_gene_expression = pd.DataFrame(
    expression[archs4_landmark_gene_index],
    index=archs4_landmark_gene_names,
)

In [11]:
# Preview the first five rows of the landmark-gene frame.
landmark_gene_expression.head(5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,307258,307259,307260,307261,307262,307263,307264,307265,307266,307267
AARS,10297,722,179,232,10981,533,328,168,500,235,...,1274,1450,2725,2102,0,475,1883,1642,16,189
ABCB6,5467,0,0,0,6392,0,0,0,0,0,...,755,1134,532,660,0,532,398,509,8,611
ABCC5,3273,0,0,0,3822,0,0,0,0,0,...,1649,1059,1469,1736,9,376,3065,1773,55,1366
ABCF1,9290,0,0,0,9100,0,0,0,0,0,...,867,2010,2428,1828,71,739,3180,2983,337,1735
ABCF3,3496,0,0,0,3259,0,0,0,0,0,...,492,529,894,1188,0,251,896,1156,17,220


In [12]:
# Remove single cell samples that are < 1 million reads and from studies with > 200 samples
# NOTE(review): `idx_read_sums_keep` and `studies_count_dict` are not defined in
# any cell shown in this notebook; they must exist before this cell is run.
samples = list()
samples_index = list()
filtered_expression = list()  # never filled in this cell; kept so the name stays defined
# Hash the kept positions once. Testing `i in <list/array>` on every iteration
# rescans the whole container for each sample.
# Assumes idx_read_sums_keep is a flat collection of integer positions - TODO confirm.
_read_sums_keep_lookup = set(idx_read_sums_keep)
_sample_meta = h5['meta']['samples']
for i, (sample_id, series_id) in enumerate(
    zip(_sample_meta['geo_accession'], _sample_meta['series_id'])
):
    # Keep a sample only if its position is in the keep set AND its series has
    # at most 200 entries according to studies_count_dict.
    if i in _read_sums_keep_lookup and studies_count_dict[series_id] <= 200:
        samples.append(sample_id)
        samples_index.append(i)
    # Progress marker every 10,000 samples.
    if i % 10000 == 0:
        print(i)

0
10000
20000
30000
40000
50000
60000
70000
80000
90000
100000
110000
120000
130000
140000
150000
160000
170000
180000
190000
200000
210000
220000
230000
240000
250000
260000
270000
280000
290000
300000


In [31]:
# Keep only the columns at the retained sample positions and label them with
# the corresponding sample IDs.
filtered_landmark_gene_expression = (
    landmark_gene_expression
    .iloc[:, samples_index]
    .set_axis(samples, axis=1)
)

In [32]:
# save
# The template defined in the "Output Filenames" cell is
# ARCHS4_filtered_overlap_landmark_output_filename; the previously used name
# ARCHS4_filtered_landmark_output_filename is not defined anywhere in this
# notebook and raises NameError on a fresh kernel.
_n_rows, _n_cols = filtered_landmark_gene_expression.shape
_filtered_landmark_output_path = ARCHS4_filtered_overlap_landmark_output_filename.format(_n_rows, _n_cols)
# reset_index() turns the gene-name index into a regular column before writing.
filtered_landmark_gene_expression.reset_index().to_feather(_filtered_landmark_output_path)
print(_filtered_landmark_output_path)

../data/processed/ARCHS4/human_matrix_v9_filtered_n967x154575.f


In [20]:
# Write the retained sample IDs to disk, one per line (no trailing newline).
with open(ARCHS4_filtered_sample_output_filename, mode="w") as f:
    sample_id_text = "\n".join(samples)
    f.write(sample_id_text)