In [2]:
import pandas as pd
import scipy.stats as ss
import warnings
import numpy as np
import os
from maayanlab_bioinformatics.normalization import quantile_normalize

In [3]:
# Sample count substituted into the "n{}x962" placeholder of the ARCHS4 / L1000 file name patterns.
n_sampling = 50_000

In [4]:
# Input feather file patterns. Each "{}" is a str.format() placeholder: the first is
# the sample count; patterns ending in "_cellline_{}" take the cell line name second.
l1000_overlap_landmark_input_filename = (
    "../data/processed/L1000/L1000_filtered_GSE92742_Broad_LINCS_Level3_INF_mlr12k_n{}x962_cellline_{}.f"
)
ARCHS4_filtered_overlap_landmark_input_filename = (
    "../data/processed/ARCHS4/human_matrix_v9_filtered_n{}x962.f"
)

# GTEx patterns (L1000 measurements and RNA-seq measurements).
gtex_filtered_l1000_input_filename = (
    "../data/processed/GTEx/GSE92743_Broad_GTEx_L1000_Level3_Q2NORM_filtered_n{}x962_cellline_{}.f"
)
gtex_filtered_rnaseq_input_filename = (
    "../data/processed/GTEx/GSE92743_Broad_GTEx_RNAseq_Log2RPKM_q2norm_filtered_n{}x962.f"
)


In [5]:
# Output feather file patterns: same placeholders as the input patterns, with a
# "_v2" suffix before the ".f" extension.
l1000_overlap_landmark_output_filename = (
    "../data/processed/L1000/L1000_filtered_GSE92742_Broad_LINCS_Level3_INF_mlr12k_n{}x962_cellline_{}_v2.f"
)
ARCHS4_filtered_overlap_landmark_output_filename = (
    "../data/processed/ARCHS4/human_matrix_v9_filtered_n{}x962_v2.f"
)

# Disabled: the GTEx cell derives each output name from its input name by
# replacing ".f" with "_v2.f", so these fixed-size patterns are not used.
# gtex_filtered_l1000_output_filename = "../data/processed/GTEx/GSE92743_Broad_GTEx_L1000_Level3_Q2NORM_filtered_n2929x962_cellline_{}_v2.f"
# gtex_filtered_rnaseq_output_filename = "../data/processed/GTEx/GSE92743_Broad_GTEx_RNAseq_Log2RPKM_q2norm_filtered_n2929x962_v2.f"


In [6]:
# Cell line names used to fill file name patterns and to select per-cell-line files.
cell_lines = [
    "MCF7",
    "PC3",
    "HA1E",
    "HCC515",
    "VCAP",
    "A375",
    "HEPG2",
    "HT29",
    "A549",
]

In [7]:
def read_data(filename):
    """Load a feather file and return it indexed by its first column.

    Parameters
    ----------
    filename : path passed straight to ``pd.read_feather``.

    Returns
    -------
    DataFrame whose index is the file's first column; the remaining columns
    are returned as read (no dtype conversion is applied).
    """
    frame = pd.read_feather(filename)
    index_column = frame.columns[0]
    return frame.set_index(index_column)
    
def save_feather(obj, filename):
    """Write ``obj`` to ``filename`` as a feather file and print the path.

    ``reset_index()`` is called first, so the index is written as ordinary
    column(s). Returns ``None``.
    """
    flattened = obj.reset_index()
    flattened.to_feather(filename)
    print(filename)

def CPM(data):
    """Scale ``data`` to counts per million.

    ``data`` is divided by ``data.sum()`` (for a DataFrame: each column by its
    own total) and multiplied by 10**6. NaN results, e.g. from an all-zero
    column, are replaced with 0. Warnings raised during the computation are
    suppressed.
    """
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        totals = data.sum()
        scaled = data / totals * 10**6
        scaled = scaled.fillna(0)

    return scaled
def logCPM(data):
    """Return ``log10(CPM + 1)`` of ``data``.

    ``data`` is divided by ``data.sum()`` (for a DataFrame: each column by its
    own total) and multiplied by 10**6; NaN results are replaced with 0 and
    ``log10(x + 1)`` is applied. Warnings raised during the computation are
    suppressed.
    """
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        totals = data.sum()
        per_million = (data / totals * 10**6).fillna(0)
        logged = np.log10(per_million + 1)

    return logged
def log(data):
    """Return ``log10(data + 1)`` after replacing NaN with 0.

    Warnings raised during the computation are suppressed.
    """
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        filled = data.fillna(0)
        logged = np.log10(filled + 1)

    return logged

def rpkm(counts, lengths):
    """Normalize a count matrix to RPKM (reads per kilobase per million reads).

    For gene ``g`` in sample ``s``::

        RPKM[g, s] = 1e9 * counts[g, s] / (total_reads[s] * lengths[g])

    where ``total_reads[s]`` is the sum of column ``s`` of ``counts``.

    Parameters
    ----------
    counts : array, shape (N_genes, N_samples)
        Count data with genes as rows and samples as columns.
    lengths : array, shape (N_genes,)
        Gene lengths in base pairs, ordered like the rows of ``counts``.

    Returns
    -------
    normed : array, shape (N_genes, N_samples)
        The RPKM-normalized matrix.
    """
    # Column sums: total reads of each sample.
    reads_per_sample = np.sum(counts, axis=0)

    # Broadcast (1, N_samples) against (N_genes, 1) to get one denominator per cell.
    denominator = reads_per_sample[None, :] * lengths[:, None]

    return 1e9 * counts / denominator
    
def qnormalization(data):
    """Return ``data`` quantile-normalized.

    Thin wrapper that passes ``data`` unchanged to ``quantile_normalize`` from
    ``maayanlab_bioinformatics.normalization`` and returns its result.
    """
    return quantile_normalize(data)

def normalization(data, logCPM_normalization=False, CPM_normalization=False, log_normalization=False, z_normalization=False, q_normalization=False):
    """Run the selected normalization steps on ``data`` and return the result.

    Every step whose flag equals ``True`` is applied, one after another, in
    this fixed order: logCPM, CPM, log, quantile normalization, z-score.
    With no flag set, ``data`` is returned unchanged.

    The z-score step applies ``scipy.stats.zscore`` to each column of
    ``data.T`` (that is, along each row of ``data``), transposes back and then
    drops every row that contains NaN.
    """
    # Lambdas defer looking up each helper until its step is actually selected.
    pipeline = (
        (logCPM_normalization, lambda frame: logCPM(frame)),
        (CPM_normalization, lambda frame: CPM(frame)),
        (log_normalization, lambda frame: log(frame)),
        (q_normalization, lambda frame: qnormalization(frame)),
        (z_normalization, lambda frame: frame.T.apply(ss.zscore, axis=0).T.dropna()),
    )

    for flag, step in pipeline:
        # Flags are compared with == True, so only values equal to True enable a step.
        if flag == True:
            data = step(data)

    return data

In [8]:
# Load the ARCHS4 matrix, apply logCPM followed by quantile normalization, and save the result.
ARCHS4 = read_data(ARCHS4_filtered_overlap_landmark_input_filename.format(n_sampling))
# The frame is handed to normalization() transposed and transposed back afterwards.
normalized_ARCHS4 = normalization(ARCHS4.T, logCPM_normalization=True, q_normalization=True).T
# The output pattern contains a "{}" placeholder for the sample count. Fill it in,
# otherwise the file is written under a name containing a literal "{}".
normalized_ARCHS4.reset_index().to_feather(ARCHS4_filtered_overlap_landmark_output_filename.format(n_sampling))

In [9]:
# Quantile-normalize the L1000 matrix of every cell line and save it under the "_v2" name.
for cell_line in cell_lines:
    l1000 = read_data(
        l1000_overlap_landmark_input_filename.format(n_sampling, cell_line)
    )

    # Order the columns by label before normalizing.
    sorted_l1000 = l1000.sort_index(axis="columns")
    # normalization() is given the transposed frame; transpose back afterwards.
    normalized_l1000 = normalization(
        sorted_l1000.transpose(), q_normalization=True
    ).transpose()
    normalized_l1000.reset_index().to_feather(
        l1000_overlap_landmark_output_filename.format(n_sampling, cell_line)
    )


In [11]:
# Normalize every per-cell-line GTEx file and write the result next to its input,
# with ".f" replaced by "_v2.f" in the file name.
folder = "../data/processed/GTEx/"
# os.listdir() returns names in arbitrary order. List the folder once and sort it so
# every run processes (and prints) the files in the same order. Files written by this
# loop contain "v2" and are skipped below, so one up-front listing selects the same inputs.
filenames = sorted(os.listdir(folder))
for cell_line in cell_lines:
    for filename in filenames:
        # Keep only this cell line's inputs; names containing "v2" are earlier outputs.
        if cell_line in filename and "v2" not in filename:
            print(cell_line, filename)
            if "L1000" in filename:
                # L1000 measurements: quantile normalization only.
                gtex_l1000 = read_data(folder+filename)

                sorted_gtex_l1000 = gtex_l1000.sort_index(axis=1)
                normalized_gtex_l1000 = normalization(sorted_gtex_l1000.T, q_normalization=True).T
                print(normalized_gtex_l1000.shape)
                normalized_gtex_l1000.reset_index().to_feather(folder+filename.replace(".f", "_v2.f"))

            else:
                # Any other matching file is treated as RNA-seq: logCPM, then quantile normalization.
                gtex_rnaseq = read_data(folder+filename)
                sorted_gtex_rnaseq = gtex_rnaseq.sort_index(axis=1)
                # NOTE(review): values are truncated to int32 before logCPM, although the input
                # file names say "Log2RPKM_q2norm" — confirm this is intended.
                sorted_gtex_rnaseq = sorted_gtex_rnaseq.astype('int32')
                normalized_gtex_rnaseq = normalization(sorted_gtex_rnaseq.T, logCPM_normalization=True, q_normalization=True).T
                print(normalized_gtex_rnaseq.shape)
                normalized_gtex_rnaseq.reset_index().to_feather(folder+filename.replace(".f", "_v2.f"))


MCF7 GSE92743_Broad_GTEx_L1000_Level3_Q2NORM_filtered_n43x962_cellline_MCF7.f
(43, 962)
MCF7 GSE92743_Broad_GTEx_RNAseq_Log2RPKM_q2norm_filtered_n43x962_cellline_MCF7.f
(43, 962)
PC3 GSE92743_Broad_GTEx_L1000_Level3_Q2NORM_filtered_n41x962_cellline_PC3.f
(41, 962)
PC3 GSE92743_Broad_GTEx_RNAseq_Log2RPKM_q2norm_filtered_n41x962_cellline_PC3.f
(41, 962)
HA1E GSE92743_Broad_GTEx_L1000_Level3_Q2NORM_filtered_n5x962_cellline_HA1E.f
(5, 962)
HA1E GSE92743_Broad_GTEx_RNAseq_Log2RPKM_q2norm_filtered_n5x962_cellline_HA1E.f
(5, 962)
HCC515 GSE92743_Broad_GTEx_L1000_Level3_Q2NORM_filtered_n133x962_cellline_HCC515.f
(133, 962)
HCC515 GSE92743_Broad_GTEx_RNAseq_Log2RPKM_q2norm_filtered_n133x962_cellline_HCC515.f
(133, 962)
VCAP GSE92743_Broad_GTEx_RNAseq_Log2RPKM_q2norm_filtered_n41x962_cellline_VCAP.f
(41, 962)
VCAP GSE92743_Broad_GTEx_L1000_Level3_Q2NORM_filtered_n41x962_cellline_VCAP.f
(41, 962)
A375 GSE92743_Broad_GTEx_L1000_Level3_Q2NORM_filtered_n151x962_cellline_A375.f
(151, 962)
A375 GSE927