In [1]:
library(devtools)
library(ggbiplot)
library(stringr)

Loading required package: usethis

Loading required package: ggplot2

Loading required package: plyr

Loading required package: scales

Loading required package: grid



### Load the init data and define the header/column names that contain the sample labels in the MTBLS csv

In [17]:
# Names of the columns that are looked up in labels_table and copied onto the
# bin tables as sample labels. In the final workflow the user submits this list.
sample_labels_header <- c(
    "Factor.Value.Breast.cancer.diagnosis.",
    "Factor.Value.Histological.grade.",
    "Factor.Value.Breast.cancer.relapse.",
    "Factor.Value.Time.to.relapse."
)

In [15]:
# Sample metadata table. check.names=TRUE converts the column names to
# syntactically valid R names.
labels_table <- read.csv(file="/Users/morris/Box/Hall_Lab/Projects/HUNA/MTBLS424/s_Breast_Cancer.csv",
                   header=TRUE, sep=",", check.names=TRUE)
# Init table with one row per setting: `type` holds the variable name and
# `path` holds the value assigned to it below.
huna_init <- read.csv('/Users/morris/Box/Hall_Lab/Projects/HUNA/MTBLS424/MTBLS424_init.csv',
                      header=TRUE, check.names=FALSE)

# Settings that are defined directly in this notebook. The init file stores
# sample_labels_header as one space-collapsed string, which must not replace
# a vector the user has already defined in the session.
user_defined_settings <- c("sample_labels_header")

print('assigned variables:')
# seq_len() gives an empty loop for an empty init table (1:0 would iterate
# over 1 and 0), and the row count is not stored in a variable called `nrow`,
# which would shadow base::nrow().
for (i in seq_len(nrow(huna_init))){
    setting_name <- toString(huna_init$type[i])
    if (setting_name %in% user_defined_settings && exists(setting_name)){
        print(paste('kept value defined in this session:', setting_name))
        next
    }
    # every setting is created as a character string in the current environment
    assign(setting_name, toString(huna_init$path[i]))
    print(setting_name)
}

[1] "assigned variables:"
[1] "setwd_path"
[1] "data_set_root_dir"
[1] "path_to_fid"
[1] "norm_bin_500_path"
[1] "norm_bin_5000_path"
[1] "bin500_for_sig_tests"
[1] "figure_path"
[1] "sample_labels_header"
[1] "stat_results_data_path"
[1] "BonSig_Cohen_results_data_path"
[1] "filtered_ROI_path"
[1] "sample_type"
[1] "rDolphin_nmr_input_path"
[1] "rDolphin_nmr_input_path_Early_vs_Late"


In [3]:
# Load the 500-bin and 5000-bin tables from the paths held in
# norm_bin_500_path and norm_bin_5000_path. header=TRUE and sep="," are the
# read.csv() defaults, so they are not repeated here.
MTBLS_bin500 <- read.csv(norm_bin_500_path)
MTBLS_bin5000 <- read.csv(norm_bin_5000_path)

In [4]:
# Keep only the real part of every column from the second one onwards.
# apply() returns a matrix, so the sample name column 'X' is bound back on
# the left, which gives a data frame again.
MTBLS_bin500_Re_apply <- cbind(
    MTBLS_bin500['X'],
    apply(MTBLS_bin500[, 2:ncol(MTBLS_bin500)], 2, Re)
)

# Same treatment for the 5000-bin table.
MTBLS_bin5000_Re_apply <- cbind(
    MTBLS_bin5000['X'],
    apply(MTBLS_bin5000[, 2:ncol(MTBLS_bin5000)], 2, Re)
)

In [5]:
# Drop every column that consists only of zeros (the author notes that zero
# columns result in PCA errors). A column is kept when it has at least one
# entry that differs from 0.
nonzero_cols_500 <- colSums(MTBLS_bin500_Re_apply != 0) > 0
MTBLS_bin500_Re_apply_DelZeroCol <- MTBLS_bin500_Re_apply[, nonzero_cols_500]

nonzero_cols_5000 <- colSums(MTBLS_bin5000_Re_apply != 0) > 0
MTBLS_bin5000_Re_apply_DelZeroCol <- MTBLS_bin5000_Re_apply[, nonzero_cols_5000]

In [6]:
# Output paths for the labelled bin tables. The 500-bin file is the one used
# when finding significant bins.
# NOTE(review): setwd_path is joined to data_set_root_dir without a
# separator, so setwd_path is assumed to end in '/' - confirm.
bin500_for_sig_tests <- paste0(setwd_path, data_set_root_dir, '/output/',
                               data_set_root_dir, '_bin500_for_sig_tests.csv')
bin5000_for_NN <- paste0(setwd_path, data_set_root_dir, '/output/',
                         data_set_root_dir, '_bin5000_for_sig_tests.csv')
# Record the 500-bin path in the init table. Rows already stored under the
# same type are removed first, so re-running the notebook does not
# accumulate duplicate rows in the saved init csv.
huna_init <- rbind(huna_init[!(huna_init$type %in% "bin500_for_sig_tests"), ],
                   data.frame(type = "bin500_for_sig_tests",
                              path = bin500_for_sig_tests))

In [18]:
# for 500 bins data
# adding labels to the MTBLS data to generate a PCA plot
#
# Sample IDs come from the first column of the bin table and are looked up in
# labels_table$Sample.Name with match(), which compares whole strings. Regex
# metacharacters in an ID or in a header name (every header contains '.')
# therefore cannot select a wrong row or column. When a sample name occurs
# more than once in labels_table, the first occurrence is used.

# fail early with a clear message when a requested label column is absent
missing_label_cols <- setdiff(sample_labels_header, colnames(labels_table))
if (length(missing_label_cols) > 0){
    stop('label columns not found in labels_table: ',
         paste(missing_label_cols, collapse=', '))
}

# row of labels_table that belongs to each sample of the bin table
bin500_sample_IDs <- as.character(MTBLS_bin500_Re_apply_DelZeroCol[[1]])
bin500_label_rows <- match(bin500_sample_IDs, as.character(labels_table$Sample.Name))
if (anyNA(bin500_label_rows)){
    stop('samples not found in labels_table: ',
         paste(bin500_sample_IDs[is.na(bin500_label_rows)], collapse=', '))
}

for (col in sample_labels_header){
    print(col)
    # labels are stored as character, one per sample, in bin-table row order
    MTBLS_bin500_Re_apply_DelZeroCol[col] <- as.character(labels_table[[col]][bin500_label_rows])
}
write.csv(MTBLS_bin500_Re_apply_DelZeroCol, file= bin500_for_sig_tests, row.names=FALSE)

[1] "Factor.Value.Breast.cancer.diagnosis."
[1] "Factor.Value.Histological.grade."
[1] "Factor.Value.Breast.cancer.relapse."
[1] "Factor.Value.Time.to.relapse."


In [19]:
# for 5000 bins data
# adding labels to the MTBLS data to generate a PCA plot
#
# Sample IDs come from the first column of the bin table and are looked up in
# labels_table$Sample.Name with match(), which compares whole strings. Regex
# metacharacters in an ID or in a header name (every header contains '.')
# therefore cannot select a wrong row or column. When a sample name occurs
# more than once in labels_table, the first occurrence is used.

# fail early with a clear message when a requested label column is absent
missing_label_cols <- setdiff(sample_labels_header, colnames(labels_table))
if (length(missing_label_cols) > 0){
    stop('label columns not found in labels_table: ',
         paste(missing_label_cols, collapse=', '))
}

# row of labels_table that belongs to each sample of the bin table
bin5000_sample_IDs <- as.character(MTBLS_bin5000_Re_apply_DelZeroCol[[1]])
bin5000_label_rows <- match(bin5000_sample_IDs, as.character(labels_table$Sample.Name))
if (anyNA(bin5000_label_rows)){
    stop('samples not found in labels_table: ',
         paste(bin5000_sample_IDs[is.na(bin5000_label_rows)], collapse=', '))
}

for (col in sample_labels_header){
    # labels are stored as character, one per sample, in bin-table row order
    MTBLS_bin5000_Re_apply_DelZeroCol[col] <- as.character(labels_table[[col]][bin5000_label_rows])
}
write.csv(MTBLS_bin5000_Re_apply_DelZeroCol, file= bin5000_for_NN, row.names=FALSE)

#### MTBLS PCA

In [None]:
# PCA on the bin columns only. Columns are chosen by position: column 1 (the
# sample name) is skipped, and so are the last length(sample_labels_header)
# columns, which is where the label columns were appended.
# The data are centred but not scaled.
pca_last_bin_col <- ncol(MTBLS_bin500_Re_apply_DelZeroCol) - length(sample_labels_header)
MTBLS.pca <- prcomp(MTBLS_bin500_Re_apply_DelZeroCol[, 2:pca_last_bin_col],
                    center = TRUE, scale. = FALSE)

In [None]:
# Directory prefix for the output figures.
# NOTE(review): setwd_path is joined to data_set_root_dir without a
# separator, so setwd_path is assumed to end in '/' - confirm.
figure_path <- paste0(setwd_path, data_set_root_dir, '/output/figures/')

# Record the figure path in the init table. Rows already stored under the
# same type are removed first, so re-running the notebook does not
# accumulate duplicate rows in the saved init csv.
huna_init <- rbind(huna_init[!(huna_init$type %in% "figure_path"), ],
                   data.frame(type = "figure_path", path = figure_path))

In [None]:
# Store the user defined list of sample label columns in the init table as a
# single string, with the entries separated by a space.
sample_labels_header_string <- paste(unlist(sample_labels_header), collapse=' ')
# Rows already stored under the same type are removed first, so re-running
# the notebook does not accumulate duplicate rows in the saved init csv.
huna_init <- rbind(huna_init[!(huna_init$type %in% "sample_labels_header"), ],
                   data.frame(type = "sample_labels_header",
                              path = sample_labels_header_string))

In [None]:
# Write the updated init table to
# <setwd_path>/<data_set_root_dir>/<data_set_root_dir>_init.csv
# NOTE(review): this path puts a '/' after setwd_path, while the output and
# figure paths do not - confirm whether setwd_path ends in '/'.
# NOTE(review): the variable `save` has the same name as base::save().
save <- paste0(setwd_path, '/', data_set_root_dir, '/', data_set_root_dir, '_init.csv')
write.csv(huna_init, file = save, row.names = FALSE)

In [None]:
# Draw one PCA plot per sample label column, grouped by that label with
# ellipses and without variable axes, then show it and save it as
# <figure_path><label with '.' replaced by '_'>pca_plot.png.

# ggsave() needs the target directory to exist
dir.create(figure_path, recursive = TRUE, showWarnings = FALSE)

for (col in sample_labels_header){
    # The label column is selected by exact name. A regex lookup would treat
    # the '.' in the header as a wildcard and could return several columns.
    if (!(col %in% colnames(MTBLS_bin500_Re_apply_DelZeroCol))){
        stop('label column not found in the bin table: ', col)
    }
    pca_plot <- ggbiplot(MTBLS.pca, ellipse=TRUE, var.axes=FALSE,
                         groups=MTBLS_bin500_Re_apply_DelZeroCol[[col]])
    print(pca_plot)
    # gsub() leaves the name unchanged when it has no '.', so no check is needed
    file_label <- gsub(".", "_", col, fixed = TRUE)
    filename <- paste0(figure_path, file_label, 'pca_plot.png')
    # the plot object is passed explicitly rather than through last_plot()
    ggsave(filename, plot = pca_plot, dpi = 'retina')
}

In [None]:
# Scratch loop: it has no effect other than leaving `col` set to the last
# entry of sample_labels_header with every '.' replaced by '_'.
for (col in sample_labels_header) {
    if (grepl("\\.", col)) {
        col <- gsub(".", "_", col, fixed = TRUE)
    }
}

In [None]:
# Display the current value of `col`.
col