# Benchmarking RNA-seq DEG Methods with the Dexamethasone Benchmark

In [13]:
# Import libraries
import pandas as pd
import numpy as np
from sklearn import preprocessing
import warnings
import scipy.stats as ss
from itertools import combinations
from rpy2 import robjects
from rpy2.robjects import r, pandas2ri

# Load in Data

Using data from GEO from the study, "The effect of lithium and dexamethasone on fetal rat metatarsal bones transcriptome" 
https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE186104 

In [2]:
# Load in data
meta_df = pd.read_csv('GSE186104_series_matrix.txt', sep="\t", index_col=0, dtype=str)
expr_df = pd.read_csv('GSE186104_cross_tabulation_of_gene_expression.txt', index_col=0, sep="\t").sort_index()
meta_class_column_name = 'Sample_characteristics_ch1'
control_name = 'treatment: standard cultivation medium'

meta_df.index = meta_df.index.map(str)
meta_df = meta_df[meta_df.index.isin(expr_df.columns)]

classes = list(meta_df[meta_class_column_name].unique())
classes.remove(control_name)
classes.insert(0, control_name)
meta_df['tmp_class'] = pd.Categorical(meta_df[meta_class_column_name], classes)
meta_df = meta_df.sort_values('tmp_class')
meta_df = meta_df.drop('tmp_class', axis=1)
expr_df = expr_df.loc[:,meta_df.index]
expr_df = expr_df.groupby(expr_df.index).sum()
assert(meta_df.shape[0]==expr_df.shape[1])

dataset = dict()
current_dataset = 'rawdata'
dataset[current_dataset] = expr_df
dataset["dataset_metadata"] = meta_df

In [3]:
low_expression_threshold = 0.3

## Filter out non-expressed genes
expr_df = expr_df.loc[expr_df.sum(axis=1) > 0, :]
## Filter out lowly expressed genes
mask_low_vals = (expr_df > low_expression_threshold).sum(axis=1) > 2
expr_df = expr_df.loc[mask_low_vals, :]
current_dataset += '+filter_genes'
dataset[current_dataset] = expr_df

In [4]:
meta_df

Unnamed: 0_level_0,Sample_title,Sample_characteristics_ch1
Sample_geo_accession,Unnamed: 1_level_1,Unnamed: 2_level_1
GSM5632354,C1: Untreated control replicate 1,treatment: standard cultivation medium
GSM5632355,C2: Untreated control replicate 2,treatment: standard cultivation medium
GSM5632356,C3: Untreated control replicate 3,treatment: standard cultivation medium
GSM5632357,Dex1: Dexamethason treated sample replicate 1,treatment: standard cultivation medium + dexam...
GSM5632358,Dex2: Dexamethason treated sample replicate 2,treatment: standard cultivation medium + dexam...
GSM5632359,Dex3: Dexamethason treated sample replicate 3,treatment: standard cultivation medium + dexam...
GSM5632360,Li1: Lithium treated sample replicate 1,treatment: standard cultivation medium + lithium
GSM5632361,Li2: Lithium treated sample replicate 2,treatment: standard cultivation medium + lithium
GSM5632362,Li3: Lithium treated sample replicate 3,treatment: standard cultivation medium + lithium
GSM5632363,DL1: Dexamethason and lithium treated sample r...,treatment: standard cultivation medium + dexam...


In [5]:
expr_df

Sample_geo_accession,GSM5632354,GSM5632355,GSM5632356,GSM5632357,GSM5632358,GSM5632359,GSM5632360,GSM5632361,GSM5632362,GSM5632363,GSM5632364,GSM5632365
gene,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
A1i3,160,60,196,328,274,287,179,188,343,113,93,119
A2m,32,15,19,62,65,36,28,34,68,23,29,29
A2ml1,1,0,0,0,2,0,3,0,0,2,4,1
A3galt2,80,36,75,68,70,47,63,44,104,55,69,76
A4galt,0,0,2,0,1,2,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...
l7Rn6,392,199,385,174,185,142,189,245,303,269,361,407
mrpl11,374,218,233,144,192,133,156,176,244,210,327,287
mrpl24,611,344,560,222,316,222,286,327,497,369,513,569
mrpl9,1048,598,922,505,596,399,493,646,954,684,1049,1192


In [6]:
def logCPM(data):

    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        data = (data/data.sum())*10**6
        data = data.fillna(0)
        data = np.log2(data+1)

    # Return
    return data

In [7]:
def normalize(dataset, current_dataset, logCPM_normalization, log_normalization, z_normalization, q_normalization):
    normalization = current_dataset
    if logCPM_normalization == True:  
        data = dataset[normalization]
        normalization += '+logCPM'
        dataset[normalization] = logCPM(data)
        
    if log_normalization == True:    
        data = dataset[normalization]
        normalization += '+log'
        dataset[normalization] = log(data)
        
    if z_normalization == True:
        data = dataset[normalization]
        normalization += '+z_norm'    
        dataset[normalization] = data.T.apply(ss.zscore, axis=0).T.dropna()

    if q_normalization == True:
        data = dataset[normalization]
        normalization += '+q_norm'
        dataset[normalization] = qnormalization(data)
    return dataset, normalization

In [8]:
dataset, normalization = normalize(dataset, current_dataset, True, False, True, False)

In [9]:
dataset[normalization]

Sample_geo_accession,GSM5632354,GSM5632355,GSM5632356,GSM5632357,GSM5632358,GSM5632359,GSM5632360,GSM5632361,GSM5632362,GSM5632363,GSM5632364,GSM5632365
gene,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
A1i3,-0.823915,-1.203701,-0.394838,1.472828,0.950546,1.391266,0.682852,0.464780,0.636785,-0.661800,-1.357623,-1.157182
A2m,-0.850702,-0.913618,-1.407946,1.707599,1.497647,0.792520,0.368323,0.377260,0.752250,-0.662710,-0.757919,-0.902703
A2ml1,-0.294967,-0.772040,-0.772040,-0.772040,0.920040,-0.772040,2.292632,-0.772040,-0.772040,0.670972,1.308982,-0.265415
A3galt2,-0.875900,-1.189250,-0.782604,1.542712,1.049973,0.388160,1.659979,-0.508135,0.933154,-0.559917,-0.820001,-0.838169
A4galt,-0.600760,-0.600760,1.011257,-0.600760,0.792626,2.716666,-0.600760,-0.600760,-0.600760,-0.600760,0.285531,-0.600760
...,...,...,...,...,...,...,...,...,...,...,...,...
l7Rn6,-0.314866,0.016371,0.365216,-0.613465,-1.450318,-1.753019,1.168210,1.839965,-0.805107,0.471590,0.450185,0.625238
mrpl11,0.682765,1.931779,-2.038182,-0.558921,0.423214,-0.539152,0.762820,0.270977,-0.896703,-0.159605,0.889206,-0.768198
mrpl24,0.115012,1.517950,0.140863,-2.218022,-0.175757,-1.392579,1.395951,0.835633,0.106610,-0.338141,-0.004243,0.016723
mrpl9,-0.874443,0.767789,-1.288662,-0.328218,-0.159364,-1.983152,0.546399,1.424708,0.333021,-0.520593,0.898979,1.183536


# Differential Gene Expression
Using code adapted from Bulk RNA-seq Analysis pipeline appyter: https://appyters.maayanlab.cloud/Bulk_RNA_seq/

In [10]:
# Copied from the appyter source code
def get_signatures(classes, dataset, normalization, method, meta_class_column_name, filter_genes):
    tmp_normalization = normalization.replace("+z_norm+q_norm","").replace("+z_norm","")
    raw_expr_df = dataset['rawdata']
    expr_df = dataset['rawdata']
    if filter_genes == True:
        expr_df = dataset['rawdata+filter_genes']
        
    signatures = dict()

    for cls1, cls2 in combinations(classes, 2):
        print(cls1, cls2)
        cls1_sample_ids = dataset["dataset_metadata"].loc[dataset["dataset_metadata"][meta_class_column_name]==cls1, :].index.tolist() #control
        cls2_sample_ids = dataset["dataset_metadata"].loc[dataset["dataset_metadata"][meta_class_column_name]==cls2,:].index.tolist() #case
        
        signature_label = " vs. ".join([cls1, cls2])
        
        if method == "limma":
            limma = robjects.r['limma']

            design_dataframe = pd.DataFrame([{'index': x, 'A': int(x in cls1_sample_ids), 'B': int(x in cls2_sample_ids)} for x in raw_expr_df.columns]).set_index('index')

            processed_data = {"expression": raw_expr_df, 'design': design_dataframe}
            
            limma_results = pandas2ri.conversion.rpy2py(limma(pandas2ri.conversion.py2rpy(processed_data['expression']), pandas2ri.conversion.py2rpy(processed_data['design']), filter_genes=filter_genes))
                        
            signature = pd.DataFrame(limma_results[0])
            signature.index = limma_results[1]
            signature = signature.sort_values("t", ascending=False)
            
        elif method == "characteristic_direction":
            signature = characteristic_direction(dataset[tmp_normalization].loc[:, cls1_sample_ids], dataset[normalization].loc[:, cls2_sample_ids], calculate_sig=True)
            signature = signature.sort_values("CD-coefficient", ascending=False)
            
        signatures[signature_label] = signature

    return signatures

In [15]:
# Set method in this variable
diff_gex_method = 'limma'

signatures = get_signatures(classes, dataset, normalization, diff_gex_method, meta_class_column_name, True)


treatment: standard cultivation medium treatment: standard cultivation medium + dexamethason


KeyError: "'limma' not found"

In [None]:
# Enrichment analysis

In [None]:
# Extract NR3C1 rankings

In [None]:
# Compare rankings/methods