In [None]:
%load_ext autoreload
%autoreload 2

import os
import pandas as pd
import numpy as np
import numpy_indexed as npi
import random

import sys, h5py, time
import cmapPy.pandasGEXpress.parse_gctx as parse_gctx
import cmapPy.pandasGEXpress.parse_gct as parse_gct

from scipy import stats
from numpy.random import seed

import scipy.stats as ss
import warnings
import numpy as np
from maayanlab_bioinformatics.normalization import quantile_normalize


# One shared seed value, applied to both Python's `random` module and to
# `numpy.random.seed` (imported above as `seed`).
randomState = 123
random.seed(randomState)
seed(randomState)

Input filenames

In [None]:
# Input files, grouped by the data directory they live in.

# --- ARCHS4 ---
ARCHS4_filename = "../data/ARCHS4/human_matrix_v9.h5"

# --- L1000 ---
l1000_geneinfo_filename = "../data/L1000/GSE92742_Broad_LINCS_gene_info.txt"
l1000_filename = "../data/L1000/GSE92742_Broad_LINCS_Level3_INF_mlr12k_n1319138x12328.gctx"

# --- GTEx ---
gtex_geneinfo_filename = "../data/GTEx/GSE92743_Broad_GTEx_gene_info.txt"
gtex_l1000_filename = "../data/GTEx/DS_GTEX_L1000_n3176x12320.gctx"
gtex_rnaseq_filename = "../data/GTEx/GTEx_Analysis_2017-06-05_v8_RNASeQCv1.1.9_gene_reads.gct"

Output filenames

In [None]:
# Gene-list text files written by the cells below (one gene symbol per line).

# --- ARCHS4 ---
archs4_all_gene_list = "../data/ARCHS4/all_gene_list.txt"

# --- L1000 ---
l1000_landmark_gene_list = "../data/L1000/landmark_gene_list.txt"
l1000_all_gene_list = "../data/L1000/all_gene_list.txt"

# --- GTEx (L1000 platform) ---
gtex_l1000_landmark_gene_list = "../data/GTEx/l1000_landmark_gene_list.txt"
gtex_l1000_all_gene_list = "../data/GTEx/l1000_all_gene_list.txt"

# --- GTEx (RNA-seq) ---
gtex_rnaseq_all_gene_list = "../data/GTEx/rnaseq_all_gene_list.txt"

# Load L1000

In [None]:
# Parse the L1000 GCTX file and keep only the `data_df` attribute of the
# parsed object.
print("Loading L1000 data.....")
l1000_data = parse_gctx.parse(
    l1000_filename,
    convert_neg_666=True,
).data_df

In [None]:
# L1000 gene annotation table: tab-separated, column names on the first row.
gene_info = pd.read_csv(
    l1000_geneinfo_filename,
    sep="\t",
    header=0,
)

In [None]:
# Lookup table: stringified `pr_gene_id` -> `pr_gene_symbol`.
gene_dict = {
    str(probe_id): symbol
    for probe_id, symbol in zip(gene_info['pr_gene_id'], gene_info['pr_gene_symbol'])
}

# Replace the row labels of the expression matrix with gene symbols.
# NOTE: this overwrites the index in place, so running the cell a second time
# looks up symbols (not ids) in `gene_dict` and is expected to raise KeyError.
l1000_data.index = [gene_dict[probe_id] for probe_id in l1000_data.index.values]


In [None]:
# Write every L1000 row label, sorted, one per line (no trailing newline).
with open(l1000_all_gene_list, "w") as out:
    sorted_symbols = sorted(l1000_data.index.tolist())
    out.write("\n".join(sorted_symbols))

In [None]:
# Restrict the annotation table to rows flagged `pr_is_lm == 1`, then select
# the matching rows of the expression matrix by gene symbol.
is_landmark = gene_info["pr_is_lm"] == 1
landmark_gene_info = gene_info[is_landmark]
landmark_symbols = landmark_gene_info["pr_gene_symbol"]
l1000_data_landmark_genes = l1000_data.loc[landmark_symbols, :]

In [None]:
# Write the landmark-gene row labels, sorted, one per line (no trailing newline).
with open(l1000_landmark_gene_list, "w") as out:
    sorted_landmark_symbols = sorted(l1000_data_landmark_genes.index.tolist())
    out.write("\n".join(sorted_landmark_symbols))

# ARCHS4

In [None]:
# Open the ARCHS4 HDF5 matrix and read its gene list.
print('Processing RNA-seq data.....')
# The file handle is deliberately left open: `data_file` and `expression` are
# objects obtained from `h5` and are presumably only readable while it is open.
h5 = h5py.File(ARCHS4_filename, 'r')
data_file = h5['data']
expression = data_file['expression']
# h5py >= 3 yields string dataset elements as `bytes`; decode them so `genes`
# is a list of `str` that can be sorted, joined with "\n" and compared with
# the gene symbols read from the other (text) sources. Values that are already
# `str` (older h5py) pass through unchanged.
genes = [
    g.decode("utf-8") if isinstance(g, bytes) else g
    for g in h5['meta']['genes']['genes']
]

In [None]:
# Write the ARCHS4 gene names, sorted, one per line (no trailing newline).
with open(archs4_all_gene_list, "w") as out:
    sorted_genes = sorted(genes)
    out.write("\n".join(sorted_genes))

# GTEx

## GTEx L1000

In [None]:
# GTEx gene annotation table (tab-separated, header on the first row) and the
# symbols of the genes flagged `pr_is_lm == 1`.
gtex_gene_info = pd.read_csv(
    gtex_geneinfo_filename,
    sep="\t",
    header=0,
)
gtex_is_landmark = gtex_gene_info["pr_is_lm"] == 1
gtex_landmark_genes = gtex_gene_info.loc[gtex_is_landmark, "pr_gene_symbol"].tolist()

In [None]:
# Parse the GTEx L1000 GCTX file and keep only its `data_df`.
print("Loading GTEx L1000 data.....")
gtex_l1000_data = parse_gctx.parse(
    gtex_l1000_filename,
    convert_neg_666=True,
).data_df

# Lookup table: stringified `pr_gene_id` -> `pr_gene_symbol`.
gtex_gene_dict = {
    str(probe_id): symbol
    for probe_id, symbol in zip(gtex_gene_info['pr_gene_id'], gtex_gene_info['pr_gene_symbol'])
}

# Replace the row labels with gene symbols (in place; not safe to re-run
# without reloading `gtex_l1000_data`).
gtex_l1000_data.index = [gtex_gene_dict[probe_id] for probe_id in gtex_l1000_data.index.values]

# Keep only the rows whose symbol is in the landmark list.
gtex_in_landmark = gtex_l1000_data.index.isin(gtex_landmark_genes)
gtex_l1000_data_landmark = gtex_l1000_data.loc[gtex_in_landmark, :]

In [None]:
# Write every GTEx L1000 row label, sorted, one per line (no trailing newline).
with open(gtex_l1000_all_gene_list, "w") as out:
    sorted_gtex_symbols = sorted(gtex_l1000_data.index.tolist())
    out.write("\n".join(sorted_gtex_symbols))

In [None]:
# Write the GTEx L1000 landmark row labels, sorted, one per line (no trailing newline).
with open(gtex_l1000_landmark_gene_list, "w") as out:
    sorted_gtex_landmark_symbols = sorted(gtex_l1000_data_landmark.index.tolist())
    out.write("\n".join(sorted_gtex_landmark_symbols))

## GTEx RNA-seq

In [None]:
# Copy the second tab-separated field of every line after the first three into
# the gene-list file, one value per line. The three skipped lines are
# presumably the GCT version line, dimension line and column header.
with open(gtex_rnaseq_filename, "r") as fr:
    with open(gtex_rnaseq_all_gene_list, "w") as f:
        # Stream the input instead of calling readlines(): only one field per
        # line is needed, so there is no reason to hold the whole count matrix
        # in memory.
        for line_no, line in enumerate(fr):
            if line_no < 3:
                continue
            # Bound the split so the remaining columns are not tokenised.
            f.write(line.split("\t", 2)[1])
            f.write("\n")
        # No per-line flush: leaving the `with` block flushes and closes the file.



# Overlapping gene list

In [None]:
# Gene-list files read in this section. Most of these repeat the output paths
# defined earlier in the notebook (presumably so this section can be run on
# its own — confirm before removing the duplication).

# --- ARCHS4 ---
archs4_all_gene_list = "../data/ARCHS4/all_gene_list.txt"
archs4_high_count_gene_list = "../data/ARCHS4/high_count_gene_list.txt" # from 4_normalize_ARCHS4_full ~

# --- L1000 ---
l1000_landmark_gene_list = "../data/L1000/landmark_gene_list.txt"
l1000_all_gene_list = "../data/L1000/all_gene_list.txt"

# --- GTEx (L1000 platform) ---
gtex_l1000_landmark_gene_list = "../data/GTEx/l1000_landmark_gene_list.txt"
gtex_l1000_all_gene_list = "../data/GTEx/l1000_all_gene_list.txt"

# --- GTEx (RNA-seq) ---
gtex_rnaseq_all_gene_list = "../data/GTEx/rnaseq_all_gene_list.txt"

In [None]:
# Output: path of the text file that receives the sorted overlapping landmark
# genes (written in the last cell of this section).
overlap_landmark_gene_list = "../data/processed/overlap_landmark_file.txt"

In [None]:
# Load each gene list from disk as a list of whitespace-stripped lines.
with open(l1000_landmark_gene_list, "r") as fh:
    l1000_landmark_gene = [line.strip() for line in fh]

with open(archs4_all_gene_list, "r") as fh:
    archs4_all_gene = [line.strip() for line in fh]

with open(gtex_l1000_landmark_gene_list, "r") as fh:
    gtex_l1000_landmark_gene = [line.strip() for line in fh]

with open(gtex_rnaseq_all_gene_list, "r") as fh:
    gtex_rnaseq_all_gene = [line.strip() for line in fh]
    



In [None]:
# Landmark genes present in all four sources: L1000 landmark, ARCHS4,
# GTEx L1000 landmark and GTEx RNA-seq.
# The results are sorted because the iteration order of a set of strings can
# change between kernel sessions (string hash randomization); a plain
# `list(set(...))` would make any downstream row/column order irreproducible.
overlap_landmark_genes = sorted(
    set(l1000_landmark_gene)
    .intersection(archs4_all_gene)
    .intersection(gtex_l1000_landmark_gene)
    .intersection(gtex_rnaseq_all_gene)
)
# Genes common to ARCHS4 and GTEx RNA-seq.
overlap_rnaseq_genes = sorted(set(archs4_all_gene).intersection(gtex_rnaseq_all_gene))

In [None]:
# Display how many landmark genes are shared by all four gene lists.
len(overlap_landmark_genes)

In [None]:
# Write the overlapping landmark genes, sorted, one per line (no trailing newline).
with open(overlap_landmark_gene_list, "w") as out:
    sorted_overlap = sorted(overlap_landmark_genes)
    out.write("\n".join(sorted_overlap))