In [14]:
import pandas as pd
import scipy.stats as ss
import warnings
import numpy as np
from maayanlab_bioinformatics.normalization import quantile_normalize

In [15]:
# Sample count that appears as the `n...` part of the sampled ARCHS4 input filename.
n_sampling = 50_000

# Input filenames

In [24]:
# --- ARCHS4 -----------------------------------------------------------------
# Sampled ARCHS4 matrix in feather format; the file name encodes <rows>x<columns>.
# Disabled alternative with 35238 columns:
#   ../data/processed/ARCHS4/human_matrix_v9_filtered_n{n_sampling}x35238.f
ARCHS4_filename = f"../data/processed/ARCHS4/human_matrix_v9_filtered_n{n_sampling}x25312.f"
archs4_all_gene_list = "../data/ARCHS4/all_gene_list.txt"

# --- L1000 ------------------------------------------------------------------
l1000_all_gene_list = "../data/L1000/all_gene_list.txt"
l1000_landmark_gene_list = "../data/L1000/landmark_gene_list.txt"

# --- GTEx -------------------------------------------------------------------
gtex_l1000_all_gene_list = "../data/GTEx/l1000_all_gene_list.txt"
gtex_l1000_landmark_gene_list = "../data/GTEx/l1000_landmark_gene_list.txt"
gtex_rnaseq_all_gene_list = "../data/GTEx/rnaseq_all_gene_list.txt"

# Output filenames

In [25]:
# Path templates for the two normalized matrices written by this notebook.
# The two `{}` placeholders are filled with the saved frame's shape[0] and shape[1].
ARCHS4_step2_input = "../data/processed/ARCHS4/human_matrix_v9_filtered_n{}x{}_step2_input.f"
ARCHS4_step2_output = "../data/processed/ARCHS4/human_matrix_v9_filtered_n{}x{}_step2_output.f"

In [38]:
# Text file that receives the column labels of the normalized ARCHS4 matrix, one per line.
archs4_high_count_gene_list = "../data/ARCHS4/high_count_gene_list.txt"

# Get overlap landmark genes

In [26]:
def _read_gene_list(path):
    """Return every line of the text file at ``path`` with surrounding whitespace stripped."""
    with open(path, "r") as handle:
        return [line.strip() for line in handle]


# One list of gene names per source file.
l1000_landmark_gene = _read_gene_list(l1000_landmark_gene_list)
archs4_all_gene = _read_gene_list(archs4_all_gene_list)
gtex_l1000_landmark_gene = _read_gene_list(gtex_l1000_landmark_gene_list)
gtex_rnaseq_all_gene = _read_gene_list(gtex_rnaseq_all_gene_list)
    



In [27]:
# Landmark genes present in all four lists: L1000 landmark, ARCHS4, GTEx L1000 landmark
# and GTEx RNA-seq.
# The result is sorted because the iteration order of a set of strings changes between
# interpreter runs (hash randomization); an unsorted list would make the column order of
# any matrix selected with it differ from run to run.
overlap_landmark_genes = sorted(
    set(l1000_landmark_gene)
    .intersection(archs4_all_gene)
    .intersection(gtex_l1000_landmark_gene)
    .intersection(gtex_rnaseq_all_gene)
)
# Genes common to ARCHS4 and GTEx RNA-seq, sorted for the same reason.
overlap_rnaseq_genes = sorted(set(archs4_all_gene).intersection(gtex_rnaseq_all_gene))

In [28]:
def read_data(filename):
    """Load a feather file and return it with its first column as the index.

    Parameters
    ----------
    filename : path of the feather file, passed straight to ``pd.read_feather``.

    Returns
    -------
    DataFrame indexed by what was the first column of the file.
    """
    frame = pd.read_feather(filename)
    return frame.set_index(frame.columns[0])
    
def save_feather(obj, filename):
    """Write ``obj`` to ``filename`` in feather format, then print the path.

    The index is turned into a regular column with ``reset_index()`` before writing.
    """
    flattened = obj.reset_index()
    flattened.to_feather(filename)
    print(filename)

In [29]:
def CPM(data):
    """Rescale ``data`` to counts per million.

    Every value is divided by ``data.sum()`` (for a DataFrame: the sum of its own
    column) and multiplied by one million. NaN results, e.g. from a column whose
    sum is zero, are replaced with 0. Warnings raised during the computation are
    suppressed.
    """
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        totals = data.sum()
        per_million = (data / totals) * 1_000_000
        result = per_million.fillna(0)

    return result
def logCPM(data):
    """Return ``log10(CPM + 1)`` of ``data``.

    Values are first divided by ``data.sum()`` (per column for a DataFrame) and
    multiplied by one million; NaN results are replaced with 0; finally
    ``log10(x + 1)`` is applied. Warnings raised during the computation are
    suppressed.
    """
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        totals = data.sum()
        per_million = ((data / totals) * 1_000_000).fillna(0)
        logged = np.log10(per_million + 1)

    return logged
def log(data):
    """Replace missing values in ``data`` with 0 and return ``log10(x + 1)``.

    Warnings raised during the computation are suppressed.
    """
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        filled = data.fillna(0)
        return np.log10(filled + 1)

def rpkm(counts, lengths):
    """Calculate reads per kilobase transcript per million reads.

    RPKM = (10^9 * C) / (N * L)

    Where:
    C = Number of reads mapped to a gene
    N = Total mapped reads in the experiment
    L = Exon length in base pairs for a gene

    Parameters
    ----------
    counts: array-like, shape (N_genes, N_samples)
        RNAseq (or similar) count data where columns are individual samples
        and rows are genes. Anything ``np.asarray`` accepts (ndarray, nested
        lists, DataFrame) is allowed.
    lengths: array-like, shape (N_genes,)
        Gene lengths in base pairs in the same order
        as the rows in counts.

    Returns
    -------
    normed : ndarray, shape (N_genes, N_samples)
        The RPKM normalized counts matrix.
    """
    # Coerce to ndarrays first: plain lists cannot be multiplied by 1e9, and pandas
    # objects do not support the np.newaxis indexing used for broadcasting below.
    C = np.asarray(counts)
    L = np.asarray(lengths)
    N = C.sum(axis=0)  # total reads per sample (column sums)

    # Broadcast N across rows and L across columns so each cell is divided by N_sample * L_gene.
    normed = 1e9 * C / (N[np.newaxis, :] * L[:, np.newaxis])

    return normed
    
def qnormalization(data):
    """Return ``data`` quantile-normalized by ``quantile_normalize``."""
    return quantile_normalize(data)

def normalization(data, logCPM_normalization=False, CPM_normalization=False, log_normalization=False, z_normalization=False, q_normalization=False):
    """Apply the selected normalization steps to ``data`` and return the result.

    Enabled steps always run in this fixed order, whatever the keyword order at
    the call site: logCPM -> CPM -> log -> quantile normalization -> z-score.
    With every flag left at ``False`` the input is returned unchanged.

    Parameters
    ----------
    data : DataFrame handed to the selected step functions.
    logCPM_normalization : apply ``logCPM``.
    CPM_normalization : apply ``CPM``.
    log_normalization : apply ``log``.
    z_normalization : z-score each row with ``scipy.stats.zscore`` and drop rows
        that contain NaN afterwards.
    q_normalization : apply ``qnormalization``.

    Returns
    -------
    The transformed data.
    """
    if logCPM_normalization:
        data = logCPM(data)
    if CPM_normalization:
        data = CPM(data)
    if log_normalization:
        data = log(data)

    if q_normalization:
        data = qnormalization(data)

    if z_normalization:
        # zscore is applied to each column of the transpose, i.e. to each row of data;
        # rows containing NaN after the transform are dropped.
        data = data.T.apply(ss.zscore, axis=0).T.dropna()

    return data

# Load sampled ARCHS4

In [30]:
# Echo the resolved input path before loading it.
ARCHS4_filename

'../data/processed/ARCHS4/human_matrix_v9_filtered_n50000x25312.f'

In [31]:
# Load the sampled ARCHS4 matrix; read_data turns the file's first column into the index.
archs4 = read_data(ARCHS4_filename)

In [32]:
# Step 2 output: the full matrix, logCPM-transformed and then quantile-normalized.
# normalization() is given the transpose of archs4 and its result is transposed back.
archs4_normalized = normalization(
    archs4.T,
    logCPM_normalization=True,
    q_normalization=True,
).T

In [33]:
# Step 2 input: the same logCPM + quantile normalization, applied only to the
# columns of archs4 listed in overlap_landmark_genes.
landmark_archs4_normalized = normalization(
    archs4.loc[:, overlap_landmark_genes].T,
    logCPM_normalization=True,
    q_normalization=True,
).T

# Save

In [34]:
# Write the step 2 input through the notebook's save_feather helper (reset_index +
# to_feather, then print the path) instead of repeating that logic inline.
# The two template placeholders are filled with the frame's shape (rows, columns).
save_feather(
    landmark_archs4_normalized,
    ARCHS4_step2_input.format(*landmark_archs4_normalized.shape),
)

In [35]:
# Write the step 2 output through the notebook's save_feather helper (reset_index +
# to_feather, then print the path) instead of repeating that logic inline.
# The two template placeholders are filled with the frame's shape (rows, columns).
save_feather(
    archs4_normalized,
    ARCHS4_step2_output.format(*archs4_normalized.shape),
)

In [36]:
# Echo the path the step 2 output was written to (template filled with rows, columns).
ARCHS4_step2_output.format(*archs4_normalized.shape)

'../data/processed/ARCHS4/human_matrix_v9_filtered_n50000x25312_step2_output.f'

In [39]:
# Write the column labels of the normalized matrix, one per line, with no trailing newline.
gene_names = archs4_normalized.columns.tolist()
with open(archs4_high_count_gene_list, "w") as out_file:
    out_file.write("\n".join(gene_names))