# Script for Processing Data

In [1]:
%load_ext autoreload
%autoreload 2

import os
import pandas as pd
import numpy as np
import random
import glob
import umap
import seaborn as sns
import matplotlib.pyplot as plt
# from ggplot import *


from ruffus import *
import sys, os, h5py, random, tempfile, scipy, time,copy
import cmapPy.pandasGEXpress.parse_gctx as parse_gctx
import cmapPy.pandasGEXpress.parse_gct as parse_gct
import pipeline_support as PS
from sklearn.decomposition import PCA
from scipy import stats
from matplotlib.pyplot import imshow
from sklearn.preprocessing import MinMaxScaler

from collections import Counter
#from tensorflow_examples.models.pix2pix import pix2pix
import tensorflow as tf
from tensorflow import keras
from IPython.display import clear_output
from tensorflow_gan.python.losses import losses_impl


# Fix the seed of NumPy's global random number generator.
# Only NumPy is seeded by these lines; the `random` module and TensorFlow
# imported above are not seeded here.
from numpy.random import seed
randomState = 123  # seed value passed to numpy.random.seed below
seed(randomState)




Parameters

In [2]:
# --- GTEx inputs (paths are relative to the notebook's working directory) ---
# RNA-seq gene read counts in GCT text format (parsed line by line further below).
gtex_rnaseq_filename = "../data/GTEx/GTEx_Analysis_2017-06-05_v8_RNASeQCv1.1.9_gene_reads.gct"
# L1000 expression matrix in GCTX format (read with cmapPy's parse_gctx).
gtex_l1000_filename = "../data/GTEx/DS_GTEX_L1000_n3176x12320.gctx"
# Tab-separated gene annotation table; the columns used in this notebook are
# pr_gene_id, pr_gene_symbol and pr_is_lm.
gtex_geneinfo_filename = "../data/GTEx/GSE92743_Broad_GTEx_gene_info.txt"

# --- Gene list files: plain text, read as one gene per line ---
# NOTE(review): l1000_all_gene_list and gtex_l1000_all_gene_list are defined
# but not read by the cells visible in this notebook section.
l1000_all_gene_list = "../data/L1000/all_gene_list.txt"
l1000_landmark_gene_list = "../data/L1000/landmark_gene_list.txt"

archs4_all_gene_list = "../data/ARCHS4/all_gene_list.txt"

gtex_l1000_all_gene_list = "../data/GTEx/l1000_all_gene_list.txt"
gtex_l1000_landmark_gene_list = "../data/GTEx/l1000_landmark_gene_list.txt"

gtex_rnaseq_all_gene_list = "../data/GTEx/rnaseq_all_gene_list.txt"

Output Filenames

In [23]:
# Output path templates for the feather files written below. The two `{}`
# placeholders are filled with the shape of the DataFrame being saved
# (rows = samples, columns = genes).
# NOTE(review): the RNA-seq name says "Log2RPKM_q2norm", but the cells visible
# here save the values as parsed from the gene_reads file, without a log2/RPKM/
# quantile-normalisation step — confirm the file name is intended.
gtex_filtered_l1000_output_filename = "../data/processed/GTEx/GSE92743_Broad_GTEx_L1000_Level3_Q2NORM_filtered_n{}x{}.f" # samplesx962
gtex_filtered_rnaseq_output_filename = "../data/processed/GTEx/GSE92743_Broad_GTEx_RNAseq_Log2RPKM_q2norm_filtered_n{}x{}.f" # samplesx962



## Load overlapping landmark genes

In [4]:
def _read_gene_list(path):
    """Return the gene symbols stored one per line in the text file ``path``.

    Surrounding whitespace is stripped from every line and blank lines are
    skipped, so an empty string can never end up in the gene intersection.
    """
    with open(path, "r") as f:
        return [symbol for symbol in (line.strip() for line in f) if symbol]


l1000_landmark_gene = _read_gene_list(l1000_landmark_gene_list)
archs4_all_gene = _read_gene_list(archs4_all_gene_list)
gtex_l1000_landmark_gene = _read_gene_list(gtex_l1000_landmark_gene_list)
gtex_rnaseq_all_gene = _read_gene_list(gtex_rnaseq_all_gene_list)

# Genes present in all four lists. The result is sorted because the iteration
# order of a set of strings changes between Python processes (string hash
# randomisation); an unsorted list would give the saved L1000 matrix a
# different column order on every kernel restart.
overlap_landmark_genes = sorted(
    set(l1000_landmark_gene).intersection(
        archs4_all_gene,
        gtex_l1000_landmark_gene,
        gtex_rnaseq_all_gene,
    )
)


## Load GTEx 

- GTEx L1000 data: GEO accession GSE92743.
- GTEx RNA-seq data: version 8 gene read counts from https://www.gtexportal.org/home/datasets.

In [5]:
# Gene annotation table: tab-separated, with the first row holding the column names.
gtex_gene_info = pd.read_csv(gtex_geneinfo_filename, sep='\t', header=0)
# Symbols of the genes whose landmark flag (pr_is_lm) equals 1.
gtex_landmark_genes = gtex_gene_info.loc[
    gtex_gene_info["pr_is_lm"].eq(1), "pr_gene_symbol"
].tolist()

In [6]:
# GTEx L1000 data
print('Loading GTEx L1000 data.....')

# Lookup table from probe id (as a string) to gene symbol, built from the
# gene annotation table.
gtex_gene_dict = {
    str(probe_id): symbol
    for probe_id, symbol in zip(gtex_gene_info['pr_gene_id'], gtex_gene_info['pr_gene_symbol'])
}

# Read the expression matrix from the GCTX file.
gtex_l1000_data = parse_gctx.parse(gtex_l1000_filename, convert_neg_666=True).data_df

# Replace the row labels (probe ids) with gene symbols, then transpose so the
# gene symbols become the columns.
gtex_l1000_data.index = [gtex_gene_dict[probe] for probe in gtex_l1000_data.index.values]
gtex_l1000_data = gtex_l1000_data.T

Loading GTEx L1000 data.....


In [27]:
# GTEx RNA-seq data
print('Loading GTEx RNA-seq data.....')

# Sets give constant-time membership tests; the gene test below runs once per
# row of the RNA-seq file.
l1000_sample_ids = set(gtex_l1000_data.index)
landmark_gene_set = set(overlap_landmark_genes)

gene_names = list()
gex_paired_sample = list()

# Stream the file instead of calling readlines(), so the whole GCT file is
# never held in memory at once.
with open(gtex_rnaseq_filename, "r") as f:
    # Skip the two preamble lines that precede the column header
    # (version and matrix dimensions in the GCT format).
    next(f)
    next(f)

    # Column header. The trailing newline is stripped first; otherwise the
    # last sample id would end in "\n" and could never match an L1000 sample.
    samples = next(f).rstrip("\r\n").split("\t")
    sample_index = [i for i, x in enumerate(samples) if x in l1000_sample_ids]
    paired_sample_id = [samples[i] for i in sample_index]

    for line in f:
        splited = line.rstrip("\r\n").split("\t")
        if len(splited) < 2:
            # Blank or malformed line: there is no gene-name column to read.
            continue

        # The second column holds the gene name that is matched against the
        # overlapping landmark genes.
        gene_name = splited[1]
        if gene_name in landmark_gene_set:
            gene_names.append(gene_name)
            gex_paired_sample.append([splited[i] for i in sample_index])

# Build the genes x samples table, convert the parsed text values to numbers,
# then transpose to samples x genes.
gtex_rnaseq_data = (
    pd.DataFrame(gex_paired_sample, columns=paired_sample_id, index=gene_names)
    .astype(float)
    .T
)

# Save outside the `with` block so the input file is already closed, and make
# sure the output directory exists before writing.
rnaseq_output_path = gtex_filtered_rnaseq_output_filename.format(*gtex_rnaseq_data.shape)
rnaseq_output_dir = os.path.dirname(rnaseq_output_path)
if rnaseq_output_dir:
    os.makedirs(rnaseq_output_dir, exist_ok=True)
gtex_rnaseq_data.reset_index().to_feather(rnaseq_output_path)

Loading GTEx RNA-seq data.....


In [51]:
# Restrict the L1000 matrix to the overlapping landmark gene columns and write
# it to a feather file whose name records the resulting (rows, columns) shape.
filtered_gtex_l1000_data = gtex_l1000_data[overlap_landmark_genes]
filtered_gtex_l1000_data.reset_index().to_feather(
    gtex_filtered_l1000_output_filename.format(*filtered_gtex_l1000_data.shape)
)