# Benchmarking Data Normalization Methods with the Dexamethasone Benchmark

In [81]:
import pandas as pd
import warnings
import numpy as np
import scipy.stats as ss
from bioinfokit.analys import norm
from maayanlab_bioinformatics.normalization.quantile import quantile_normalize

# Load in Data

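For now, a small hand-made count table (four genes across three `ctr` and three `trt` samples) is used as placeholder test data.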

In [214]:
# Toy count table: one list of gene labels plus one list of counts per
# sample (three `ctr` and three `trt` samples).
gene = ['gene1', 'gene2', 'gene3', 'gene4']
ctr1 = [160, 32, 1, 80]
ctr2 = [60, 15, 0, 36]
ctr3 = [196, 19, 0, 75]
trt1 = [328, 62, 0, 68]
trt2 = [274, 65, 2, 70]
trt3 = [287, 36, 0, 47]

# Assemble the columns by name, then promote the gene labels to the index so
# the frame is genes (rows) x samples (columns).
data = pd.DataFrame(
    {
        'gene': gene,
        'ctr1': ctr1,
        'ctr2': ctr2,
        'ctr3': ctr3,
        'trt1': trt1,
        'trt2': trt2,
        'trt3': trt3,
    }
).set_index('gene')
data

Unnamed: 0_level_0,ctr1,ctr2,ctr3,trt1,trt2,trt3
gene,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
gene1,160,60,196,328,274,287
gene2,32,15,19,62,65,36
gene3,1,0,0,0,2,0
gene4,80,36,75,68,70,47


# Normalization Methods

### CPM Normalization

In [197]:
# Create a bioinfokit normalizer and run CPM on the raw count table.
# The call's return value is not used; the result is read from
# `nm.cpm_norm` in the following cell.
nm = norm()
nm.cpm(df=data)

In [198]:
# Read the CPM-normalized table from the normalizer's `cpm_norm` attribute
# and display it.
cpm_df = nm.cpm_norm
cpm_df

Unnamed: 0_level_0,ctr1,ctr2,ctr3,trt1,trt2,trt3
gene,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
gene1,586080.586081,540540.540541,675862.068966,716157.20524,666666.666667,775675.675676
gene2,117216.117216,135135.135135,65517.241379,135371.179039,158150.851582,97297.297297
gene3,3663.003663,0.0,0.0,0.0,4866.180049,0.0
gene4,293040.29304,324324.324324,258620.689655,148471.615721,170316.301703,127027.027027


### Log Transformation

In [88]:
def log(data):
    """Return ``log2(x + 1)`` of ``data``, treating missing values as 0.

    Missing entries are filled with 0 before the transform, so they map to
    0 in the output.  The input object is not modified.  Every warning
    emitted while transforming is suppressed.
    """
    with warnings.catch_warnings():
        # Silence all warnings raised inside this block (e.g. from log2).
        warnings.simplefilter("ignore")
        return np.log2(data.fillna(0) + 1)

In [200]:
# Apply the log2(x + 1) transform to the raw counts and display the result.
log_df = log(data)
log_df

Unnamed: 0_level_0,ctr1,ctr2,ctr3,trt1,trt2,trt3
gene,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
gene1,7.330917,5.930737,7.622052,8.361944,8.103288,8.169925
gene2,5.044394,4.0,4.321928,5.97728,6.044394,5.209453
gene3,1.0,0.0,0.0,0.0,1.584963,0.0
gene4,6.33985,5.209453,6.247928,6.108524,6.149747,5.584963


### Z-Score Normalization 


In [202]:
# Z-score each gene across samples: transpose so genes become columns,
# standardize column-wise, transpose back, then drop any gene (row) that
# contains NaN.
z_df = (
    data.transpose()
    .apply(ss.zscore)
    .transpose()
    .dropna()
)
z_df

Unnamed: 0_level_0,ctr1,ctr2,ctr3,trt1,trt2,trt3
gene,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
gene1,-0.637529,-1.746276,-0.238381,1.225165,0.626442,0.770579
gene2,-0.319524,-1.200374,-0.993115,1.234917,1.390361,-0.112265
gene3,0.654654,-0.654654,-0.654654,-0.654654,1.963961,-0.654654
gene4,1.099437,-1.691442,0.782292,0.338288,0.465146,-0.993722


### Quantile Normalization

In [91]:
def qnormalization(data):
    """Return ``data`` after passing it through ``quantile_normalize``."""
    return quantile_normalize(data)

In [92]:
# Quantile-normalize the raw counts and display the result.
quant_df = qnormalization(data)
quant_df

Unnamed: 0_level_0,ctr1,ctr2,ctr3,trt1,trt2,trt3
gene,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
gene1,217.5,217.5,217.5,217.5,217.5,217.5
gene2,38.166667,38.166667,38.166667,38.166667,38.166667,38.166667
gene3,0.5,0.5,0.5,0.5,0.5,0.5
gene4,62.666667,62.666667,62.666667,62.666667,62.666667,62.666667


### Median Polish

In [275]:
def median_polish(df, iterations=20):
    """Fit ``df`` by median polish and return the fitted values.

    Each iteration subtracts every row's median from that row and then every
    column's median from that column, leaving a table of residuals.  The
    value returned is ``df - residuals``, i.e. the part of the data that the
    row and column sweeps removed.

    Parameters
    ----------
    df : pandas.DataFrame
        Numeric table.  It is not modified.
    iterations : int, default 20
        Number of row-then-column sweeps to run.  With ``0`` no sweep is
        performed, the residuals equal the input, and the result is all
        zeros.

    Returns
    -------
    pandas.DataFrame
        Float frame with the same index and columns as ``df``.
    """
    # Work on a private float ndarray rather than on arrays obtained through
    # ``.values``: those are not guaranteed to be writable views of the frame
    # (they are read-only under pandas Copy-on-Write), so in-place edits
    # through them can fail or be lost.
    residuals = df.to_numpy(dtype=float, copy=True)

    for _ in range(iterations):
        # Row sweep: remove each row's median from that row.
        residuals -= np.median(residuals, axis=1, keepdims=True)
        # Column sweep: remove each column's median from that column.
        residuals -= np.median(residuals, axis=0, keepdims=True)

    # Same-shaped ndarray operand: subtraction is element-wise and keeps
    # ``df``'s index and columns.
    return df - residuals

In [293]:
# Run median polish on the raw counts with the default iteration count and
# display the result.
mp_df = median_polish(data)
mp_df

Unnamed: 0_level_0,ctr1,ctr2,ctr3,trt1,trt2,trt3
gene,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
gene1,225.166667,209.833333,218.166667,249.333333,251.833333,226.666667
gene2,33.25,17.916667,26.25,57.416667,59.916667,34.75
gene3,-0.25,-15.583333,-7.25,23.916667,26.416667,1.25
gene4,48.416667,33.083333,41.416667,72.583333,75.083333,49.916667


# Comparing Methods