## Notebook of reproducibility

In [1]:
source("utility-functions.R")
source("split-real-data.R")

**Joint normalization**

* data sets were generated in separate notebooks, one for each jointly-normalized data set
* split by sample

**GSE22058**

In [2]:
# Load the jointly-normalized GSE22058 table. read.csv2 expects
# semicolon-separated fields and a decimal comma.
GSE22058_data <- read.csv2("reproducibility/joint-norm/GSE22058.csv")

# Annotation vector, taken from the Annots column of the table.
annotation <- GSE22058_data$Annots

# Everything from the third column onwards is transposed, so each row of the
# input table becomes one column of count.matrix, labelled with its ID.
count.matrix <- t(GSE22058_data[, 3:ncol(GSE22058_data)])
colnames(count.matrix) <- GSE22058_data$ID

# Preview the first rows of the transposed matrix.
head(count.matrix)

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,183,184,185,186,187,188,189,190,191,192
hsa.let.7a,2.186,2.1915,2.2737,2.1323,2.2068,2.185,2.1886,2.0457,1.9819,2.2555,...,1.9703,2.1173,1.8536,1.8237,1.9188,2.2525,2.0683,2.1395,2.1884,2.4895
hsa.let.7b,2.111,2.108,2.0104,2.0735,2.1514,2.1115,1.9795,2.0859,2.0794,2.1882,...,1.7915,2.0543,1.4185,1.8008,1.4736,1.8837,2.026,1.9321,2.0447,1.9765
hsa.let.7c,1.8772,1.9446,1.8981,1.8247,1.8766,1.8108,1.9203,1.7491,1.6768,1.8415,...,1.3834,1.7529,1.1233,1.3432,1.3962,1.7528,1.6955,1.7532,1.8769,1.6587
hsa.let.7d,2.0204,1.9322,2.0261,1.8523,1.9446,1.9386,2.0248,1.7262,1.7004,2.0644,...,1.8071,1.9155,1.5925,1.7434,1.8187,2.033,1.9274,1.8838,1.9696,2.3102
hsa.let.7e,1.9329,1.8191,1.9377,1.9132,1.967,1.9253,1.8545,1.7633,1.6593,1.8999,...,1.5618,1.6809,1.6241,1.7549,1.8088,1.7455,1.871,1.7707,1.8882,1.9471
hsa.let.7f,2.2413,2.0943,2.3039,2.2299,2.0062,1.9667,2.2429,1.9232,2.0332,2.0437,...,2.0233,1.9973,1.9684,1.9896,2.3396,2.2753,2.3299,2.2899,2.3676,2.4871


In [3]:
# Write the five cross-validation folds of the jointly-normalized GSE22058
# data: each iteration reads the fold's sample IDs, splits count.matrix by
# sample and saves the train and test parts to separate CSV files.
for (i in seq_len(5)) {

  # Sample IDs that define fold i (training part and validation part).
  train <- read.csv2(paste0("GSE22058/GSE22058_cv_train_", i, ".csv"))
  train.samples <- train$ID
  test <- read.csv2(paste0("GSE22058/GSE22058_cv_val_", i, ".csv"))
  test.samples <- test$ID

  # Data set label handed to the splitting helper.
  file.name <- "GSE22058_joint-norm"

  # Divide into train and test data, using the explicit sample lists.
  # 96 + 96 equals the 192 sample columns of count.matrix.
  # NOTE(review): train.fraction is presumably not used when train.samples and
  # test.samples are supplied - confirm in splitRealWorldData.
  count.matrix.split <- splitRealWorldData(
    name = file.name,
    count.matrix = count.matrix,
    annotation = annotation,
    negative.samples = 96,
    positive.samples = 96,
    train.fraction = 0.8,
    by.sample = TRUE,
    train.samples = train.samples,
    test.samples = test.samples
  )

  # Unpack the split; annotations are coerced to numeric before saving.
  train.data.set <- count.matrix.split$train.data.set
  test.data.set <- count.matrix.split$test.data.set
  train.annots <- as.numeric(count.matrix.split$train.annots)
  test.annots <- as.numeric(count.matrix.split$test.annots)

  # Save the training part of fold i.
  file.name <- paste0("reproducibility/joint-norm/GSE22058/GSE22058_joint-norm_train_", i, ".csv")
  save_matrix(file_name = file.name, train.annots, train.data.set)

  # Save the test part of fold i.
  file.name <- paste0("reproducibility/joint-norm/GSE22058/GSE22058_joint-norm_test_", i, ".csv")
  save_matrix(file_name = file.name, test.annots, test.data.set)

}

**GSE10694**

In [4]:
# Load the jointly-normalized GSE10694 table. read.csv2 expects
# semicolon-separated fields and a decimal comma.
GSE10694_data <- read.csv2("reproducibility/joint-norm/GSE10694.csv")

# Annotation vector, taken from the Annots column of the table.
annotation <- GSE10694_data$Annots

# Everything from the third column onwards is transposed, so each row of the
# input table becomes one column of count.matrix, labelled with its ID.
count.matrix <- t(GSE10694_data[, 3:ncol(GSE10694_data)])
colnames(count.matrix) <- GSE10694_data$ID

# Preview the first rows of the transposed matrix.
head(count.matrix)

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,133,134,135,136,137,138,139,140,141,142
hsa.miR.30a.5p,13.02595774,13.22959621,13.11578159,12.99817851,12.98529454,12.80859173,13.24747819,12.94318835,13.11888041,12.90117608,...,12.27490036,12.10252094,13.36426267,13.08290987,13.13636247,14.29878331,12.24055633,13.09897161,12.17431068,12.34581834
hsa.miR.106a,13.51515896,11.93059656,12.78182297,12.83070763,13.00935664,13.03982679,13.60481098,13.36834818,12.75476567,13.02259165,...,14.15516125,13.70305248,13.50575486,13.41761116,13.91473744,13.30768412,14.50756362,14.427957,14.33823565,14.26747479
hsa.miR.10a,12.74411173,11.54516667,12.91532312,10.94535058,11.94499262,12.90203028,12.23167133,11.8812936,12.04903208,12.100023,...,12.67592143,11.85021403,12.35870731,12.71151188,12.87379394,10.64385619,10.64385619,10.73437137,10.64385619,10.64385619
hsa.miR.191,14.06238537,13.62291642,13.27316205,12.20704231,13.35483754,13.45406399,13.57833697,13.58345533,12.75062139,13.34720692,...,14.30843475,14.71675827,13.06058297,13.1042297,13.83192872,14.81216297,14.81261481,13.75537559,14.46138206,13.45751686
hsa.miR.98,10.72953102,10.67373235,10.64385619,10.96186961,10.64385619,10.74282535,10.64385619,10.64385619,10.72744158,10.75793679,...,10.70350165,10.68229647,10.64385619,10.97631841,11.36430528,10.64385619,10.64385619,11.43065093,10.64385619,11.20727361
hsa.miR.107,14.63122379,14.1611553,14.1840707,13.86149258,14.5521857,14.18122933,14.1751041,14.76707798,13.92565661,14.41729788,...,15.51683973,15.2740322,14.28605481,13.95752541,14.45215576,14.05685334,14.34719746,14.33168805,15.2543443,14.71627621


In [5]:
# Write the five cross-validation folds of the jointly-normalized GSE10694
# data: each iteration reads the fold's sample IDs, splits count.matrix by
# sample and saves the train and test parts to separate CSV files.
for (i in seq_len(5)) {

  # Sample IDs that define fold i (training part and validation part).
  train <- read.csv2(paste0("GSE10694/GSE10694_cv_train_", i, ".csv"))
  train.samples <- train$ID
  test <- read.csv2(paste0("GSE10694/GSE10694_cv_val_", i, ".csv"))
  test.samples <- test$ID

  # Data set label handed to the splitting helper.
  file.name <- "GSE10694_joint-norm"

  # Divide into train and test data, using the explicit sample lists.
  # NOTE(review): negative.samples/positive.samples are 96/96 here, yet the
  # notebook output shows 142 sample columns for this data set. Presumably
  # these counts (and train.fraction) are ignored when by.sample=TRUE and
  # explicit sample lists are given - confirm in splitRealWorldData.
  count.matrix.split <- splitRealWorldData(
    name = file.name,
    count.matrix = count.matrix,
    annotation = annotation,
    negative.samples = 96,
    positive.samples = 96,
    train.fraction = 0.8,
    by.sample = TRUE,
    train.samples = train.samples,
    test.samples = test.samples
  )

  # Unpack the split; annotations are coerced to numeric before saving.
  train.data.set <- count.matrix.split$train.data.set
  test.data.set <- count.matrix.split$test.data.set
  train.annots <- as.numeric(count.matrix.split$train.annots)
  test.annots <- as.numeric(count.matrix.split$test.annots)

  # Save the training part of fold i.
  file.name <- paste0("reproducibility/joint-norm/GSE10694/GSE10694_joint-norm_train_", i, ".csv")
  save_matrix(file_name = file.name, train.annots, train.data.set)

  # Save the test part of fold i.
  file.name <- paste0("reproducibility/joint-norm/GSE10694/GSE10694_joint-norm_test_", i, ".csv")
  save_matrix(file_name = file.name, test.annots, test.data.set)

}

**GSE36681 FF**

In [14]:
# Load the jointly-normalized GSE36681 FF table. read.csv2 expects
# semicolon-separated fields and a decimal comma.
GSE36681_FF_data <- read.csv2("reproducibility/joint-norm/GSE36681_FF.csv")

# Annotation vector, taken from the Annots column of the table.
annotation <- GSE36681_FF_data$Annots

# Everything from the third column onwards is transposed, so each row of the
# input table becomes one column of count.matrix, labelled with its ID.
count.matrix <- t(GSE36681_FF_data[, 3:ncol(GSE36681_FF_data)])
colnames(count.matrix) <- GSE36681_FF_data$ID

# Preview the first rows of the transposed matrix.
head(count.matrix)

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,103,104,105,106,107,108,109,110,111,112
hsa.miR.329,8971.54,1855.45,857.356,765.527,967.544,640.197,662.596,715.415,913.87,583.823,...,1195.37,1619.05,1875.05,1440.27,999.89,1875.05,1131.08,1455.14,1529.56,1068.23
hsa.miR.424,13972.4,17253.1,13839.6,19483.1,18180.0,15283.1,16963.1,18614.0,19331.5,7678.44,...,15546.0,11437.6,11667.7,15416.6,9668.75,14613.9,14881.0,7768.91,13839.6,16298.3
hsa.miR.137,866.787,1126.42,582.289,586.967,760.127,595.862,583.088,613.016,638.145,501.746,...,852.602,2821.83,588.463,823.337,942.685,592.515,832.632,474.665,1289.52,761.783
hsa.miR.376c,9779.35,1052.31,767.538,1864.67,3074.67,3981.45,1635.66,3628.29,1611.19,728.606,...,4442.9,6683.55,1234.67,4308.1,727.003,5213.87,1440.27,963.983,1373.0,3873.88
hsa.miR.585,1398.58,1313.71,2700.37,3101.67,2410.38,2512.26,2661.34,2741.18,2233.92,4135.86,...,1267.55,913.87,2358.63,1553.97,1645.7,1267.55,1737.6,2219.99,1875.05,1440.27
hsa.miR.302c,427.473,492.067,407.561,417.814,440.079,407.907,400.473,414.606,423.841,412.567,...,400.095,427.173,410.495,460.199,500.891,466.55,426.569,413.196,536.27,448.437


In [15]:
# Write the five cross-validation folds of the jointly-normalized GSE36681 FF
# data: each iteration reads the fold's sample IDs, splits count.matrix by
# sample and saves the train and test parts to separate CSV files.
for (i in seq_len(5)) {

  # Sample IDs that define fold i (training part and validation part).
  train <- read.csv2(paste0("GSE36681/GSE36681_FF_cv_train_", i, ".csv"))
  train.samples <- train$ID
  test <- read.csv2(paste0("GSE36681/GSE36681_FF_cv_val_", i, ".csv"))
  test.samples <- test$ID

  # Data set label handed to the splitting helper.
  file.name <- "GSE36681_FF_joint-norm"

  # Divide into train and test data, using the explicit sample lists.
  # NOTE(review): negative.samples/positive.samples are 96/96 here, yet the
  # notebook output shows 112 sample columns for this data set. Presumably
  # these counts (and train.fraction) are ignored when by.sample=TRUE and
  # explicit sample lists are given - confirm in splitRealWorldData.
  count.matrix.split <- splitRealWorldData(
    name = file.name,
    count.matrix = count.matrix,
    annotation = annotation,
    negative.samples = 96,
    positive.samples = 96,
    train.fraction = 0.8,
    by.sample = TRUE,
    train.samples = train.samples,
    test.samples = test.samples
  )

  # Unpack the split; annotations are coerced to numeric before saving.
  train.data.set <- count.matrix.split$train.data.set
  test.data.set <- count.matrix.split$test.data.set
  train.annots <- as.numeric(count.matrix.split$train.annots)
  test.annots <- as.numeric(count.matrix.split$test.annots)

  # Save the training part of fold i.
  file.name <- paste0("reproducibility/joint-norm/GSE36681_FF/GSE36681_FF_joint-norm_train_", i, ".csv")
  save_matrix(file_name = file.name, train.annots, train.data.set)

  # Save the test part of fold i.
  file.name <- paste0("reproducibility/joint-norm/GSE36681_FF/GSE36681_FF_joint-norm_test_", i, ".csv")
  save_matrix(file_name = file.name, test.annots, test.data.set)

}

**GSE36681 FFPE**

In [16]:
# Load the jointly-normalized GSE36681 FFPE table. read.csv2 expects
# semicolon-separated fields and a decimal comma.
GSE36681_FFPE_data <- read.csv2("reproducibility/joint-norm/GSE36681_FFPE.csv")

# Annotation vector, taken from the Annots column of the table.
annotation <- GSE36681_FFPE_data$Annots

# Everything from the third column onwards is transposed, so each row of the
# input table becomes one column of count.matrix, labelled with its ID.
count.matrix <- t(GSE36681_FFPE_data[, 3:ncol(GSE36681_FFPE_data)])
colnames(count.matrix) <- GSE36681_FFPE_data$ID

# Preview the first rows of the transposed matrix.
head(count.matrix)

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,85,86,87,88,89,90,91,92,93,94
hsa.miR.329,494.027,411.947,516.019,426.435,489.298,451.77,547.722,429.652,524.27,450.753,...,421.907,587.768,416.174,520.144,521.562,511.154,383.3,409.074,1091.07,495.997
hsa.miR.424,5520.13,3777.64,3670.45,692.57,4736.25,1742.05,4016.68,5520.13,3895.25,3183.0,...,6369.53,6138.93,6138.93,6614.73,5520.13,7678.15,1555.21,1091.07,5854.47,5317.66
hsa.miR.137,540.16,593.761,483.139,955.512,911.887,518.071,580.805,569.833,527.034,428.125,...,610.376,556.742,530.696,532.234,668.448,1468.3,654.015,632.525,551.393,1030.42
hsa.miR.376c,773.453,836.237,1529.35,1931.88,1774.43,1042.77,627.808,1430.9,1254.06,1181.46,...,1430.9,2653.28,1142.03,1311.82,2958.41,1665.1,883.545,471.796,3022.07,2146.66
hsa.miR.585,611.749,654.015,558.692,897.931,532.975,634.013,676.147,604.946,555.001,555.916,...,715.343,692.57,606.274,661.19,565.725,758.678,451.301,515.424,650.581,558.692
hsa.miR.302c,387.672,404.84,365.945,408.458,383.483,387.047,400.572,363.201,384.004,405.315,...,399.814,399.311,377.155,436.531,400.572,403.037,392.855,385.535,419.681,400.822


In [17]:
# Write the five cross-validation folds of the jointly-normalized GSE36681
# FFPE data: each iteration reads the fold's sample IDs, splits count.matrix
# by sample and saves the train and test parts to separate CSV files.
for (i in seq_len(5)) {

  # Sample IDs that define fold i (training part and validation part).
  train <- read.csv2(paste0("GSE36681/GSE36681_FFPE_cv_train_", i, ".csv"))
  train.samples <- train$ID
  test <- read.csv2(paste0("GSE36681/GSE36681_FFPE_cv_val_", i, ".csv"))
  test.samples <- test$ID

  # Data set label handed to the splitting helper.
  file.name <- "GSE36681_FFPE_joint-norm"

  # Divide into train and test data, using the explicit sample lists.
  # NOTE(review): negative.samples/positive.samples are 96/96 here, yet the
  # notebook output shows 94 sample columns for this data set. Presumably
  # these counts (and train.fraction) are ignored when by.sample=TRUE and
  # explicit sample lists are given - confirm in splitRealWorldData.
  count.matrix.split <- splitRealWorldData(
    name = file.name,
    count.matrix = count.matrix,
    annotation = annotation,
    negative.samples = 96,
    positive.samples = 96,
    train.fraction = 0.8,
    by.sample = TRUE,
    train.samples = train.samples,
    test.samples = test.samples
  )

  # Unpack the split; annotations are coerced to numeric before saving.
  train.data.set <- count.matrix.split$train.data.set
  test.data.set <- count.matrix.split$test.data.set
  train.annots <- as.numeric(count.matrix.split$train.annots)
  test.annots <- as.numeric(count.matrix.split$test.annots)

  # Save the training part of fold i.
  file.name <- paste0("reproducibility/joint-norm/GSE36681_FFPE/GSE36681_FFPE_joint-norm_train_", i, ".csv")
  save_matrix(file_name = file.name, train.annots, train.data.set)

  # Save the test part of fold i.
  file.name <- paste0("reproducibility/joint-norm/GSE36681_FFPE/GSE36681_FFPE_joint-norm_test_", i, ".csv")
  save_matrix(file_name = file.name, test.annots, test.data.set)

}