# Script for Processing ARCHS4 Data

In [None]:
%load_ext autoreload
%autoreload 2

import os
import pandas as pd
import numpy as np
import numpy_indexed as npi
import random

import sys, h5py, time
import cmapPy.pandasGEXpress.parse_gctx as parse_gctx
import cmapPy.pandasGEXpress.parse_gct as parse_gct

from scipy import stats
from numpy.random import seed

import scipy.stats as ss
import warnings
import numpy as np
from maayanlab_bioinformatics.normalization import quantile_normalize


# Fixed seed applied to both NumPy's global RNG (`numpy.random.seed`) and the
# standard-library `random` module, so the random sampling of samples further
# down gives the same result on every run.
randomState = 123
seed(randomState)
random.seed(randomState)

## Initialize

Parameters

In [None]:
# Number of samples drawn at random from the samples that pass the
# read-depth / series-size filter.
n_sampling = 150000

Input Filenames

In [None]:
# ARCHS4 HDF5 file; the notebook reads its 'data/expression' dataset and the
# 'meta/genes' and 'meta/samples' groups.
ARCHS4_filename = "../data/ARCHS4/human_matrix_v9.h5"
# Plain-text gene lists, read one gene identifier per line.
overlap_landmark_gene_list = "../data/processed/overlap_landmark_gene_file.txt"
overlap_rnaseq_gene_list = "../data/processed/overlap_rnaseq_gene_file.txt"
archs4_high_count_gene_list = "../data/ARCHS4/high_count_gene_list.txt" 

Output Filenames

In [None]:
# Text file receiving the sampled sample ids, one per line.
ARCHS4_filtered_sample_output_filename = "../data/processed/ARCHS4/filtered_sample_list.txt"
# The two `{}` placeholders in each template below are filled with the saved
# frame's (n_rows, n_columns) shape at save time.
ARCHS4_filtered_output_filename = "../data/processed/ARCHS4/human_matrix_v9_filtered_n{}x{}.f" # n_samplingx25312
ARCHS4_filtered_output_filename_normalized = "../data/processed/ARCHS4/human_matrix_v9_filtered_n{}x{}_v2.f" # n_samplingx25312
# NOTE(review): this is the same template string as the normalized matrix above;
# the two outputs only land in different files because their column counts differ.
ARCHS4_filtered_output_filename_normalized_overlap_landmark = "../data/processed/ARCHS4/human_matrix_v9_filtered_n{}x{}_v2.f" # n_samplingx962


In [None]:
def save_feather(df, filename):
    """Write `df` to `filename` in Feather format and print the path.

    The index is first moved into an ordinary column with ``reset_index()``,
    so it is stored in the file as a regular column.
    """
    flattened = df.reset_index()
    flattened.to_feather(filename)
    print("Saved!", filename)

## Load landmark/RNA-seq genes

In [None]:
# Each gene-list file is read line by line; surrounding whitespace (including
# the newline) is stripped from every entry.
with open(overlap_landmark_gene_list, "r") as f:
    landmark_gene = [line.strip() for line in f]
with open(overlap_rnaseq_gene_list, "r") as f:
    overlap_rnaseq_genes = [line.strip() for line in f]
with open(archs4_high_count_gene_list, "r") as f:
    high_count_gene_list = [line.strip() for line in f]



## Load ARCHS4 RNA-seq

Data preprocessing code from https://github.com/MaayanLab/L1k2RNA-seq-2.0/blob/cb5eaa3a447b502e32db6c1aae84eaa94d0ce0f4/pipeline/pipeline.py#L43

In [None]:
# Import ARCHS4 RNA-seq samples
print('Processing RNA-seq data.....')
# The file handle is intentionally left open: `expression` is sliced chunk by
# chunk later in the notebook.
h5 = h5py.File(ARCHS4_filename, 'r')
data_file = h5['data']
expression = data_file['expression']
# NOTE(review): depending on the h5py version, string metadata may be returned
# as bytes rather than str -- confirm these values compare equal to the str
# gene names read from the text files.
genes = list(h5['meta']['genes']['genes'])
sample_meta = h5['meta']['samples']
sample_geo_list = list(sample_meta['geo_accession'])
sample_series_id = list(sample_meta['series_id'])
reads_total = list(sample_meta['readstotal'])

In [None]:
# Shape of the raw expression dataset; further down its first axis is labelled
# with `genes` and its second axis is indexed per sample.
expression.shape

In [None]:
# high count gene index
# Build the lookup set once so every membership test is O(1) instead of
# scanning the whole high-count list for each ARCHS4 gene.
_high_count_gene_set = set(high_count_gene_list)
# Positions (in ARCHS4 gene order) of the genes that are in the high-count list.
archs4_high_count_gene_index = [i for i, x in enumerate(genes) if x in _high_count_gene_set]
# Matching gene names, in the same order as the positions above.
archs4_high_count_gene_names = [genes[i] for i in archs4_high_count_gene_index]

In [None]:
# generate per-sample metadata
# Built column-wise so each column keeps its own dtype; stacking the three lists
# row-wise and transposing would coerce every column (including readstotal) to
# object dtype. The frame keeps a default 0..n-1 integer index.
metadf = pd.DataFrame({
    "geo_accession": sample_geo_list,
    "series_id": sample_series_id,
    "readstotal": reads_total,
})

In [None]:
# series with fewer than 200 samples
# Count rows per series, then keep the ids of series whose sample count is below 200.
metadf_count = metadf.groupby("series_id").count()
small_series_mask = metadf_count["geo_accession"] < 200
series_ids_with_less_200samples = metadf_count.loc[small_series_mask].index.tolist()

In [None]:
# keep samples with more than 1M total reads that belong to a series with fewer than 200 samples
has_enough_reads = metadf["readstotal"] > 1000000
from_small_series = metadf["series_id"].isin(series_ids_with_less_200samples)
filtered_metadf = metadf[has_enough_reads & from_small_series]

## Random Sampling

In [None]:
# random sampling
# Draw n_sampling accessions without replacement from the filtered samples.
# random.sample raises ValueError if fewer than n_sampling candidates exist.
candidate_ids = filtered_metadf["geo_accession"].tolist()
sampled_ids = random.sample(candidate_ids, n_sampling)

In [None]:
# Select the rows of the sampled accessions; a boolean mask keeps them in
# their original row order.
is_sampled = filtered_metadf["geo_accession"].isin(sampled_ids)
sampled_metadf = filtered_metadf[is_sampled]

In [None]:
chunk_size = 500
sampled_expression_gene = list()
strt_time = time.time()
# Walk over the sampled rows in blocks of `chunk_size`. Stepping over the actual
# row count (rather than n_sampling / chunk_size full blocks) also reads the
# last, possibly shorter, block when the row count is not a multiple of chunk_size.
for i, chunk_start in enumerate(range(0, len(sampled_metadf), chunk_size)):

    # Sort the block by row label so the column labels assigned below are in the
    # same order as the columns read from the HDF5 dataset (which is indexed
    # with increasing positions).
    tmp_metadf = sampled_metadf.iloc[chunk_start:chunk_start + chunk_size, :].sort_index()

    # The metadata frame's integer row labels double as column positions in `expression`.
    sampled_index_i = tmp_metadf.index.tolist()
    expression_i = expression[:, sampled_index_i]
    expression_i_df = pd.DataFrame(
        expression_i,
        index=genes,
        columns=tmp_metadf["geo_accession"].tolist(),
    )
    # Keep only the high-count genes, in the order of high_count_gene_list.
    expression_i_df = expression_i_df.loc[high_count_gene_list, :]
    sampled_expression_gene.append(expression_i_df)

    # Progress: chunk number and seconds spent on this chunk.
    print(i, time.time()-strt_time)
    strt_time = time.time()


In [None]:
# Join the per-chunk frames side by side (genes as rows, samples as columns),
# then transpose so that rows are samples and columns are genes.
expression_df = pd.concat(sampled_expression_gene, axis=1).T

In [None]:
# Preview the first rows of the sampled expression matrix.
expression_df.head()

In [None]:
# Sort the columns (genes) by label.
expression_df = expression_df.sort_index(axis=1)

In [None]:
# (n_samples, n_genes) of the sampled matrix; these two numbers are formatted
# into the output filename when the matrix is saved.
expression_df.shape

In [None]:
# save
# The output filename encodes the matrix shape (rows x columns).
n_rows, n_cols = expression_df.shape
save_feather(expression_df, ARCHS4_filtered_output_filename.format(n_rows, n_cols))


In [None]:
# save sample ids
with open(ARCHS4_filtered_sample_output_filename, "w") as f:
    sample_ids = expression_df.index.tolist()
    # One id per line, without a trailing newline.
    f.write("\n".join(sample_ids))

## Normalize

In [None]:
def CPM(data):
    """Scale `data` to counts per million.

    `data` is divided by `data.sum()` (for a DataFrame: each column by its own
    total) and multiplied by one million. Entries that come out as NaN
    (e.g. 0/0 in an all-zero column) are replaced with 0. Warnings raised
    during the computation are suppressed.
    """
    per_million = 10**6
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        totals = data.sum()
        scaled = (data / totals) * per_million
        scaled = scaled.fillna(0)

    return scaled
def logCPM(data):
    """Return log10(CPM + 1) of `data`.

    `data` is divided by `data.sum()` (for a DataFrame: each column by its own
    total) and multiplied by one million; NaN entries are replaced with 0;
    finally log10(x + 1) is applied. Warnings raised during the computation
    are suppressed.
    """
    per_million = 10**6
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        totals = data.sum()
        cpm = ((data / totals) * per_million).fillna(0)
        log_cpm = np.log10(cpm + 1)

    return log_cpm
def log(data):
    """Return log10(x + 1) of `data`, with NaN entries treated as 0.

    Warnings raised during the computation are suppressed.
    """
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        filled = data.fillna(0)
        logged = np.log10(filled + 1)

    return logged

def qnormalization(data):
    """Return `data` after passing it through `quantile_normalize`
    (imported from maayanlab_bioinformatics.normalization)."""
    return quantile_normalize(data)

def normalization(data, logCPM_normalization=False, CPM_normalization=False, log_normalization=False, z_normalization=False, q_normalization=False):
    """Apply the selected normalization steps to `data` and return the result.

    Each step runs only when its flag is truthy, always in this fixed order
    (regardless of the order in which the flags are passed):

        logCPM -> CPM -> log -> quantile normalization -> z-score

    With no flag set, `data` is returned unchanged.

    Parameters:
        data: pandas DataFrame to normalize.
        logCPM_normalization: apply `logCPM`.
        CPM_normalization: apply `CPM`.
        log_normalization: apply `log`.
        z_normalization: z-score every row across its columns and drop rows
            that contain NaN afterwards.
        q_normalization: apply `qnormalization`.

    Returns:
        The normalized data.
    """
    if logCPM_normalization:
        data = logCPM(data)
    if CPM_normalization:
        data = CPM(data)
    if log_normalization:
        data = log(data)

    if q_normalization:
        data = qnormalization(data)

    if z_normalization:
        # Transposing makes each original row a column, so zscore (applied per
        # column) standardizes every original row. Rows left with NaN, such as
        # constant rows whose standard deviation is 0, are dropped.
        data = data.T.apply(ss.zscore, axis=0).T.dropna()

    return data

In [None]:
# temporary : load expression_df
# NOTE(review): the shape (150000, 23614) is hard-coded here and has to match
# the shape that was formatted into the filename when the matrix was saved.
saved_matrix_path = ARCHS4_filtered_output_filename.format(150000, 23614)
expression_df = pd.read_feather(saved_matrix_path)
# save_feather() stored the index as the first column; turn it back into the index.
first_col = expression_df.columns[0]
expression_df = expression_df.set_index(first_col)

In [None]:
# Preview the reloaded matrix to check that the index was restored.
expression_df.head()

In [None]:
# Sort the columns (genes) by label.
expression_df = expression_df.sort_index(axis=1)

In [None]:
# Transpose to genes x samples so the CPM scaling inside logCPM uses each
# sample's (column's) own total, apply logCPM followed by quantile
# normalization, then transpose back to samples x genes.
normalized_ARCHS4 = normalization(expression_df.T, logCPM_normalization=True, q_normalization=True).T

In [None]:
# Preview the first rows of the normalized matrix.
normalized_ARCHS4.head()

In [None]:
# Save the normalized matrix; the filename encodes its shape (rows x columns).
norm_rows, norm_cols = normalized_ARCHS4.shape
save_feather(normalized_ARCHS4, ARCHS4_filtered_output_filename_normalized.format(norm_rows, norm_cols))

## Only Landmark genes 

In [None]:
# Keep only the columns for the genes listed in the landmark gene file.
normalized_ARCHS4_overlap_landmark = normalized_ARCHS4.loc[:, landmark_gene]

In [None]:
# Sort the landmark gene columns by label.
normalized_ARCHS4_overlap_landmark = normalized_ARCHS4_overlap_landmark.sort_index(axis=1)

In [None]:
# Display the landmark-only normalized matrix.
normalized_ARCHS4_overlap_landmark

In [None]:
# Save the landmark-only matrix; the filename encodes its shape (rows x columns).
landmark_rows, landmark_cols = normalized_ARCHS4_overlap_landmark.shape
save_feather(
    normalized_ARCHS4_overlap_landmark,
    ARCHS4_filtered_output_filename_normalized_overlap_landmark.format(landmark_rows, landmark_cols),
)