# Benchmarking RNA-seq DEG Methods with the Dexamethasone Benchmark

In [70]:
# Import libraries
import pandas as pd
import numpy as np
from sklearn import preprocessing
import warnings

# Load in Data

Using data from GEO from the study, "The effect of lithium and dexamethasone on fetal rat metatarsal bones transcriptome" 
https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE186104 

In [66]:
# Load in data
meta_df = pd.read_csv('GSE186104_series_matrix.txt', sep="\t", index_col=0, dtype=str)
expr_df = pd.read_csv('GSE186104_cross_tabulation_of_gene_expression.txt', sep="\t", index_col=0, dtype=str)

dataset = dict()
current_dataset = 'rawdata'
dataset[current_dataset] = expr_df
dataset['meta_data'] = meta_df

meta_class_column_name = 'Sample_characteristics_ch1'
control_name = 'treatment: standard cultivation medium'
classes = list(meta_df[meta_class_column_name].unique())
classes.remove(control_name)
classes.insert(0, control_name)

In [63]:
meta_df

Unnamed: 0_level_0,Sample_title,Sample_characteristics_ch1
Sample_geo_accession,Unnamed: 1_level_1,Unnamed: 2_level_1
GSM5632354,C1: Untreated control replicate 1,treatment: standard cultivation medium
GSM5632355,C2: Untreated control replicate 2,treatment: standard cultivation medium
GSM5632356,C3: Untreated control replicate 3,treatment: standard cultivation medium
GSM5632357,Dex1: Dexamethason treated sample replicate 1,treatment: standard cultivation medium + dexam...
GSM5632358,Dex2: Dexamethason treated sample replicate 2,treatment: standard cultivation medium + dexam...
GSM5632359,Dex3: Dexamethason treated sample replicate 3,treatment: standard cultivation medium + dexam...
GSM5632360,Li1: Lithium treated sample replicate 1,treatment: standard cultivation medium + lithium
GSM5632361,Li2: Lithium treated sample replicate 2,treatment: standard cultivation medium + lithium
GSM5632362,Li3: Lithium treated sample replicate 3,treatment: standard cultivation medium + lithium
GSM5632363,DL1: Dexamethason and lithium treated sample r...,treatment: standard cultivation medium + dexam...


In [73]:
dataset[current_dataset]

Unnamed: 0_level_0,GSM5632354,GSM5632355,GSM5632356,GSM5632357,GSM5632358,GSM5632359,GSM5632360,GSM5632361,GSM5632362,GSM5632363,GSM5632364,GSM5632365
gene,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
A1bg,0,0,0,0,0,0,0,1,0,0,0,0
A1cf,1,0,0,0,0,0,0,0,0,0,0,0
A1i3,160,60,196,328,274,287,179,188,343,113,93,119
A26c2,0,0,0,0,0,0,0,0,0,0,0,0
A2m,32,15,19,62,65,36,28,34,68,23,29,29
...,...,...,...,...,...,...,...,...,...,...,...,...
mrpl9,1048,598,922,505,596,399,493,646,954,684,1049,1192
pramef20l,0,0,0,0,0,0,0,0,0,0,0,0
rnf141,987,453,863,453,705,387,450,609,798,691,1016,1014
ste2,0,0,0,0,0,0,0,0,0,0,0,0


In [77]:
def logCPM(data):

    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        data = (data/data.sum())*10**6
        data = data.fillna(0)
        data = np.log2(data+1)

    # Return
    return data

In [78]:
def normalize(dataset, current_dataset, logCPM_normalization, log_normalization, z_normalization, q_normalization):
    normalization = current_dataset
    if logCPM_normalization == True:  
        data = dataset[normalization]
        normalization += '+logCPM'
        dataset[normalization] = logCPM(data)
        
    if log_normalization == True:    
        data = dataset[normalization]
        normalization += '+log'
        dataset[normalization] = log(data)
        
    if z_normalization == True:
        data = dataset[normalization]
        normalization += '+z_norm'    
        dataset[normalization] = data.T.apply(ss.zscore, axis=0).T.dropna()

    if q_normalization == True:
        data = dataset[normalization]
        normalization += '+q_norm'
        dataset[normalization] = qnormalization(data)
    return dataset, normalization

In [79]:
dataset, normalization = normalize(dataset, current_dataset, True, False, True, False)

TypeError: unsupported operand type(s) for /: 'str' and 'str'

# Differential Gene Expression
Using code adapted from Bulk RNA-seq Analysis pipeline appyter: https://appyters.maayanlab.cloud/Bulk_RNA_seq/

In [15]:
# Copied from the appyter source code
def get_signatures(classes, dataset, normalization, method, meta_class_column_name, filter_genes):
    tmp_normalization = normalization.replace("+z_norm+q_norm","").replace("+z_norm","")
    raw_expr_df = dataset['rawdata']
    expr_df = dataset['rawdata']
    if filter_genes == True:
        expr_df = dataset['rawdata+filter_genes']
        
    signatures = dict()

    for cls1, cls2 in combinations(classes, 2):
        print(cls1, cls2)
        cls1_sample_ids = dataset["dataset_metadata"].loc[dataset["dataset_metadata"][meta_class_column_name]==cls1, :].index.tolist() #control
        cls2_sample_ids = dataset["dataset_metadata"].loc[dataset["dataset_metadata"][meta_class_column_name]==cls2,:].index.tolist() #case
        
        signature_label = " vs. ".join([cls1, cls2])
        
        if method == "limma":
            limma = robjects.r['limma']

            design_dataframe = pd.DataFrame([{'index': x, 'A': int(x in cls1_sample_ids), 'B': int(x in cls2_sample_ids)} for x in raw_expr_df.columns]).set_index('index')

            processed_data = {"expression": raw_expr_df, 'design': design_dataframe}
            
            limma_results = pandas2ri.conversion.rpy2py(limma(pandas2ri.conversion.py2rpy(processed_data['expression']), pandas2ri.conversion.py2rpy(processed_data['design']), filter_genes=filter_genes))
                        
            signature = pd.DataFrame(limma_results[0])
            signature.index = limma_results[1]
            signature = signature.sort_values("t", ascending=False)
            
        elif method == "characteristic_direction":
            signature = characteristic_direction(dataset[tmp_normalization].loc[:, cls1_sample_ids], dataset[normalization].loc[:, cls2_sample_ids], calculate_sig=True)
            signature = signature.sort_values("CD-coefficient", ascending=False)
            
        signatures[signature_label] = signature

    return signatures

In [3]:
# Set method in this variable
diff_gex_method = 'limma'

#signatures = get_signatures()


In [None]:
# Enrichment analysis

In [None]:
# Extract NR3C1 rankings

In [None]:
# Compare rankings/methods