In [66]:
import pandas as pd
import scipy.stats as ss
import warnings
import numpy as np
from maayanlab_bioinformatics.normalization import quantile_normalize

In [67]:
# Sample count that is interpolated into the L1000 and ARCHS4 file names
# (the "n<count>x962" part of each path).
n_sampling = 50_000

# True  -> ARCHS4 and GTEx RNA-seq are concatenated and quantile-normalized together.
# False -> each RNA-seq dataset is normalized on its own.
rnaseq_combine_bool = True

In [68]:
# Input feather files. Each path is written as "<directory>" "<file name>";
# adjacent string literals are concatenated by Python.
l1000_overlap_landmark_input_filename = (
    "../data/processed/L1000/"
    f"L1000_filtered_GSE92742_Broad_LINCS_Level3_INF_mlr12k_n{n_sampling}x962.f"
)
ARCHS4_filtered_overlap_landmark_input_filename = (
    "../data/processed/ARCHS4/"
    f"human_matrix_v9_filtered_n{n_sampling}x962.f"
)

# GTEx inputs have a fixed sample count baked into the file name.
gtex_filtered_l1000_input_filename = (
    "../data/processed/GTEx/"
    "GSE92743_Broad_GTEx_L1000_Level3_Q2NORM_filtered_n2929x962.f"
)
gtex_filtered_rnaseq_input_filename = (
    "../data/processed/GTEx/"
    "GSE92743_Broad_GTEx_RNAseq_Log2RPKM_q2norm_filtered_n2929x962.f"
)


In [69]:
# Output feather files, written with the "_v3" suffix so the inputs are never overwritten.
l1000_overlap_landmark_output_filename = (
    "../data/processed/L1000/"
    f"L1000_filtered_GSE92742_Broad_LINCS_Level3_INF_mlr12k_n{n_sampling}x962_v3.f"
)
ARCHS4_filtered_overlap_landmark_output_filename = (
    "../data/processed/ARCHS4/"
    f"human_matrix_v9_filtered_n{n_sampling}x962_v3.f"
)

gtex_filtered_l1000_output_filename = (
    "../data/processed/GTEx/"
    "GSE92743_Broad_GTEx_L1000_Level3_Q2NORM_filtered_n2929x962_v3.f"
)
gtex_filtered_rnaseq_output_filename = (
    "../data/processed/GTEx/"
    "GSE92743_Broad_GTEx_RNAseq_Log2RPKM_q2norm_filtered_n2929x962_v3.f"
)


In [70]:
def read_data(filename):
    """Load a feather file and use its first column as the row index.

    Parameters
    ----------
    filename : str
        Path of the feather file to read.

    Returns
    -------
    pandas.DataFrame
        The table with its first column moved into the index.
    """
    table = pd.read_feather(filename)
    return table.set_index(table.columns[0])
    
def save_feather(obj, filename):
    """Write ``obj`` to ``filename`` in feather format and print the path.

    The index is first turned back into an ordinary column with
    ``reset_index()``, then the result is written with ``to_feather``.

    Parameters
    ----------
    obj : object exposing ``reset_index()`` (e.g. a pandas DataFrame)
        The table to store.
    filename : str
        Destination path.
    """
    flattened = obj.reset_index()
    flattened.to_feather(filename)
    print(filename)

In [71]:
# Echo the configured GTEx L1000 input path as the cell output.
gtex_filtered_l1000_input_filename

'../data/processed/GTEx/GSE92743_Broad_GTEx_L1000_Level3_Q2NORM_filtered_n2929x962.f'

In [72]:
# Load the four input tables; read_data() puts each file's first column into the index.
l1000, ARCHS4, gtex_l1000, gtex_rnaseq = (
    read_data(path)
    for path in (
        l1000_overlap_landmark_input_filename,
        ARCHS4_filtered_overlap_landmark_input_filename,
        gtex_filtered_l1000_input_filename,
        gtex_filtered_rnaseq_input_filename,
    )
)


In [73]:
def CPM(data):
    """Rescale ``data`` to counts per million.

    Every value is divided by ``data.sum()`` (for a DataFrame: the total of
    its column) and multiplied by one million. NaNs produced by the division
    (e.g. an all-zero column) are replaced with 0. Warnings raised while
    computing are suppressed.
    """
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        totals = data.sum()
        per_million = (data / totals) * 1_000_000
        return per_million.fillna(0)
def logCPM(data):
    """Return ``log10(CPM + 1)`` of ``data``.

    Values are divided by ``data.sum()`` (for a DataFrame: the total of their
    column), scaled to one million, NaNs are replaced with 0, and finally
    ``log10(x + 1)`` is taken. Warnings raised while computing are suppressed.
    """
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        per_million = ((data / data.sum()) * 1_000_000).fillna(0)
        return np.log10(per_million + 1)
def log(data):
    """Return ``log10(x + 1)`` of ``data`` after replacing NaNs with 0.

    Warnings raised while computing are suppressed.
    """
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        return np.log10(data.fillna(0) + 1)

def rpkm(counts, lengths):
    """Normalize a count matrix to reads per kilobase per million reads.

    For every gene ``g`` and sample ``s``::

        RPKM[g, s] = 1e9 * counts[g, s] / (total_reads[s] * lengths[g])

    where ``total_reads[s]`` is the sum of column ``s`` of ``counts``.

    Parameters
    ----------
    counts : array, shape (N_genes, N_samples)
        Read counts; one row per gene, one column per sample.
    lengths : array, shape (N_genes,)
        Gene lengths in base pairs, ordered like the rows of ``counts``.

    Returns
    -------
    array, shape (N_genes, N_samples)
        The RPKM-normalized matrix.
    """
    # Total reads per sample (column sums).
    library_sizes = np.sum(counts, axis=0)

    # Broadcast to (N_genes, N_samples): row vector of totals x column vector of lengths.
    denominator = library_sizes[np.newaxis, :] * lengths[:, np.newaxis]

    return 1e9 * counts / denominator
    
def qnormalization(data):
    """Quantile-normalize ``data``.

    Thin wrapper that forwards to ``quantile_normalize`` from
    ``maayanlab_bioinformatics.normalization`` and returns its result unchanged.
    """
    return quantile_normalize(data)

def normalization(data, logCPM_normalization=False, CPM_normalization=False, log_normalization=False, z_normalization=False, q_normalization=False):
    """Apply the selected normalization steps to ``data``.

    Enabled steps always run in this fixed order, regardless of the order in
    which the keyword arguments are given:

    1. ``logCPM_normalization`` -> ``logCPM(data)``
    2. ``CPM_normalization``    -> ``CPM(data)``
    3. ``log_normalization``    -> ``log(data)``
    4. ``q_normalization``      -> ``qnormalization(data)``
    5. ``z_normalization``      -> row-wise z-score, then rows containing NaN are dropped

    Parameters
    ----------
    data : pandas.DataFrame
        Table to normalize. The CPM-based steps divide by column totals, so
        callers in this notebook pass the transposed table.
    logCPM_normalization, CPM_normalization, log_normalization,
    z_normalization, q_normalization : bool, default False
        Switches for the individual steps; any truthy value enables a step.

    Returns
    -------
    pandas.DataFrame
        The normalized table. With every switch off, ``data`` is returned as is.
    """
    if logCPM_normalization:
        data = logCPM(data)
    if CPM_normalization:
        data = CPM(data)
    if log_normalization:
        data = log(data)

    if q_normalization:
        data = qnormalization(data)

    if z_normalization:
        # zscore runs on each column of data.T, i.e. on each row of data;
        # dropna() then discards rows in which any z-score is NaN.
        data = data.T.apply(ss.zscore, axis=0).T.dropna()

    return data

# RNA-seq

In [74]:
# Normalize the RNA-seq datasets (ARCHS4 and GTEx), either together or separately,
# depending on rnaseq_combine_bool.

# GTEx preparation is identical in both modes: order the columns by name and cast to int32.
# NOTE(review): the int32 cast truncates any fractional values — confirm this input
# really holds raw counts (the file name mentions Log2RPKM).
sorted_gtex_rnaseq = gtex_rnaseq.sort_index(axis=1)
sorted_gtex_rnaseq = sorted_gtex_rnaseq.astype('int32')

if rnaseq_combine_bool:
    # Step 1: logCPM each dataset on its own.
    logcpm_gtex_rnaseq = normalization(sorted_gtex_rnaseq.T, logCPM_normalization=True).T
    logcpm_ARCHS4 = normalization(ARCHS4.T, logCPM_normalization=True).T

    # Step 2: stack the rows of both datasets and quantile-normalize them together.
    combined_rnaseq = pd.concat([logcpm_ARCHS4, logcpm_gtex_rnaseq])
    normalized_combined_rnaseq = normalization(combined_rnaseq.T, q_normalization=True).T

    # Step 3: split back by row-label prefix.
    # NOTE(review): rows whose label starts with neither "GSM" nor "GTEX" end up in
    # neither output — verify that every sample id carries one of these prefixes.
    sample_ids = normalized_combined_rnaseq.index
    normalized_ARCHS4 = normalized_combined_rnaseq.loc[sample_ids.str.startswith("GSM"), :]
    normalized_gtex_rnaseq = normalized_combined_rnaseq.loc[sample_ids.str.startswith("GTEX"), :]
else:
    # Each dataset gets logCPM followed by its own quantile normalization.
    normalized_gtex_rnaseq = normalization(sorted_gtex_rnaseq.T, logCPM_normalization=True, q_normalization=True).T
    normalized_ARCHS4 = normalization(ARCHS4.T, logCPM_normalization=True, q_normalization=True).T

# L1000

In [75]:
# Order the L1000 columns by name, then quantile-normalize.
# normalization() receives the transposed table and its result is transposed back.
sorted_l1000 = l1000.sort_index(axis="columns")
normalized_l1000 = normalization(
    sorted_l1000.transpose(),
    q_normalization=True,
).transpose()

# Save

In [None]:
# Write the normalized L1000 table via the notebook's save_feather() helper
# (resets the index, writes the feather file, prints the path).
save_feather(normalized_l1000, l1000_overlap_landmark_output_filename)

In [40]:
# Write the normalized ARCHS4 table via the notebook's save_feather() helper
# (resets the index, writes the feather file, prints the path).
save_feather(normalized_ARCHS4, ARCHS4_filtered_overlap_landmark_output_filename)

In [41]:
# Write the normalized GTEx RNA-seq table via the notebook's save_feather() helper
# (resets the index, writes the feather file, prints the path).
save_feather(normalized_gtex_rnaseq, gtex_filtered_rnaseq_output_filename)

In [42]:
# GTEx L1000: order the columns by name, quantile-normalize (transposing in and
# back out around normalization()), then store the result.
sorted_gtex_l1000 = gtex_l1000.sort_index(axis=1)
normalized_gtex_l1000 = normalization(sorted_gtex_l1000.T, q_normalization=True).T

# save_feather() resets the index, writes the feather file and prints the path.
save_feather(normalized_gtex_l1000, gtex_filtered_l1000_output_filename)