# (Purpose) Apply normalization procedure - FSQN

In [None]:
# ====================== TCGA dataset ======================
# ** make sure the input and output names below belong together! **

# input file (swap in the commented-out line to run on mock data)
# filename_tcga <- "tcga_unscaled_unnormalized_nobatchcorrection__mockData.tsv"
filename_tcga <- "tcga_unscaled_unnormalized_nobatchcorrection.tsv"      # real data

# output file
save_filename_tcga <- "tcga_unscaled_fsqn_nobatchcorrection.tsv"         # real data


# ====================== GTEx dataset ======================

# input file (swap in the commented-out line to run on mock data)
# filename_gtex <- "gtex_unscaled_unnormalized_nobatchcorrection__mockData.tsv"
filename_gtex <- "gtex_unscaled_unnormalized_nobatchcorrection.tsv"      # real data

# output file
save_filename_gtex <- "gtex_unscaled_fsqn_nobatchcorrection.tsv"         # real data

## Install and load packages required in R

In [None]:
# install packages
#   Installs every package that this notebook later attaches with library().
print("  begin -- installing R packages")

options(install.packages.compile.from.source = "always")

# look the CRAN mirror up once instead of once per package
cran_repo <- getCRANmirrors()[1, "URL"]

# data.table is included because the loading cell attaches it with library(data.table)
# and rbindlist() is used when the TCGA train/test sets are recombined
install.packages(
  c("dplyr", "readr", "stringr", "data.table", "remotes"),
  repos = cran_repo
)

# requireNamespace() only tests for availability; BiocManager is always called with `::` below,
# so it does not need to be attached
if (!requireNamespace("BiocManager", quietly = TRUE)) {
  install.packages("BiocManager", repos = cran_repo)
}

# preprocessCore is rebuilt from source with threading disabled
BiocManager::install(
  "preprocessCore",
  configure.args = c(preprocessCore = "--disable-threading"),
  force = TRUE,
  update = TRUE,
  type = "source"
)
remotes::install_github("jenniferfranks/FSQN")

print("  done  -- installing R packages")

In [None]:
# load packages
print("  begin -- loading packages")

library(dplyr)                  # a powerful and user-friendly package for data manipulation and transformation, 
                                #   providing a consistent and intuitive grammar to efficiently perform common data 
                                #   tasks on data frames and tibbles.  
library(readr)                  # for reading and importing data from various file formats, offering fast and 
                                #   memory-efficient data import capabilities into R.
library(stringr)                # for working with strings in R, offering a set of easy-to-use functions for string 
                                #   manipulation, pattern matching, and text extraction.
library(data.table)             # extends data frames, providing enhanced functionality for handling large datasets, 
                                #   aggregation, joins, and more with concise syntax.
library(preprocessCore)         # provides a collection of functions for preprocessing and normalizing microarray 
                                #   and RNA-Seq data, offering essential tools for quality control, 
                                #   background correction, and normalization to ensure accurate  
                                #   and reliable downstream analyses.
library(FSQN)                   # designed to quantile normalize each feature in a data set according to its 
                                #   corresponding feature in a target distribution. This eliminates distribution 
                                #   based differences resulting from the use of different gene expression 
                                #   profiling platforms.

print("  done  -- loading packages")

In [None]:
# ---------------------- read both expression tables with readr ----------
# Both inputs live in the same folder; only the file name differs.
filename_w_path_tcga <- paste0("data/preprocessing_combinations/", filename_tcga)
filename_w_path_gtex <- paste0("data/preprocessing_combinations/", filename_gtex)

# (remember) the row count reported by readr excludes the header row

# TCGA
sprintf("  loading TCGA table -- %s", filename_tcga)
samples_tcga <- read_tsv(file = filename_w_path_tcga)
print("  finished loading")

# GTEx
sprintf("  loading GTEx table -- %s", filename_gtex)
samples_gtex <- read_tsv(file = filename_w_path_gtex)
print("  finished loading")

In [None]:
# TCGA table: first two and last two rows, restricted to the first five columns
head(samples_tcga[, seq_len(5)], n = 2)
tail(samples_tcga[, seq_len(5)], n = 2)

In [None]:
# GTEx table: first two and last two rows, restricted to the first five columns
head(samples_gtex[, seq_len(5)], n = 2)
tail(samples_gtex[, seq_len(5)], n = 2)

## Split tcga into 80% train and 20% test

- **Note:** the TCGA dataset was already randomly shuffled and split, preserving the class distribution, in notebook08.
- Here, "split" only means taking the top 80% of the file's rows as the train set and the bottom 20% as the test set.

In [None]:
# Positional 80/20 split of the TCGA table: the first `percent` of the rows become the train set
# and every remaining row becomes the test set.
print("  [TCGA dataset already split] assigning the TCGA data into 80% train and 20% test sets")
percent <- 0.80

# number of leading rows that go to the train set (fraction truncated towards zero)
n_train <- as.integer(nrow(samples_tcga) * percent)

samples_tcga_train <- head(samples_tcga, n_train)
# Take the remaining rows by position rather than by anti-joining on sample_id:
# a sample_id that occurs in both parts would otherwise be dropped from the test set,
# and the positional tail is always the exact complement of the train rows.
samples_tcga_test <- tail(samples_tcga, nrow(samples_tcga) - n_train)

print("  checking dimensions and tally of 80% TCGA for train set")
dim(samples_tcga_train)
samples_tcga_train %>%
  count(label)

print("  checking dimensions and tally of 20% TCGA for test set")
dim(samples_tcga_test)
samples_tcga_test %>%
  count(label)

In [None]:
# 80% TCGA train table: first two and last two rows, first five columns only
print("previewing the top and bottom 80% tcga table")
print(head(samples_tcga_train[, seq_len(5)], n = 2))
print(tail(samples_tcga_train[, seq_len(5)], n = 2))

In [None]:
# 20% TCGA test table: first two and last two rows, first five columns only
print("previewing the top and bottom 20% tcga table")
print(head(samples_tcga_test[, seq_len(5)], n = 2))
print(tail(samples_tcga_test[, seq_len(5)], n = 2))

## Apply normalization (here, Feature Specific Quantile Normalization) to the following, using the 80% TCGA train set as the target:
- 20% TCGA test set
- 100% GTEx for test set #2

In [None]:
# Feature Specific Quantile Normalization (FSQN) wrapper; it is run once for each dataset that is normalized.
#   The non-numeric columns (sample_id, label) are pulled out before the normalization call and
#   re-attached afterwards.
#   ** notice ** unlike qn and qn-target, transpose is not required here: rows stay samples, columns stay features
#   (source) https://github.com/jenniferfranks/FSQN/blob/master/vignettes/FSQNvignette.pdf
#
#   (arg_samples) data frame to normalize; needs the columns sample_id and label, every other column is
#                 converted with as.numeric and treated as a feature
#   (arg_target)  data frame with the same layout that supplies the target distribution
#   (returns)     ungrouped data frame with columns sample_id, label, then the normalized features,
#                 sorted by label and then by sample_id

# (helper) convert every column with as.numeric and assemble a samples x features matrix.
#   Built with matrix() so the shape stays rows = samples even when there is a single row;
#   as.matrix(sapply(...)) would turn a one-row input into a features x 1 matrix.
.fsqn_as_numeric_matrix <- function(arg_df, arg_row_names){
    numeric_cols <- lapply(arg_df, as.numeric)
    matrix(
        as.numeric(unlist(numeric_cols, use.names = FALSE)),
        nrow = nrow(arg_df),
        ncol = length(numeric_cols),
        dimnames = list(as.character(arg_row_names), names(numeric_cols))
    )
}

# (helper) print the first and last two rows of (at most) the first three columns.
#   The column count is capped so inputs with fewer than three columns do not raise a subscript error.
.fsqn_preview <- function(arg_x){
    shown_cols <- seq_len(min(3, ncol(arg_x)))
    print(head(arg_x[, shown_cols, drop = FALSE], 2))
    print(tail(arg_x[, shown_cols, drop = FALSE], 2))
}

apply_fsqn <- function(arg_samples, arg_target){
    print("")
    print("  begin apply_fsqn")

    print("    Preparing for the normalization call")
    samples_id_label <- arg_samples %>% 
        select(sample_id, label)

    # check.names = FALSE keeps the feature names exactly as they are in the input; the default would
    # rewrite names that are not syntactic R names, so the output header could differ from the input header
    samples_df <- data.frame(arg_samples, check.names = FALSE) %>% 
        select(-sample_id, -label)

    print("    (checking dimensions of arg_samples without sample_id and labels columns)")
    print(dim(samples_df))

    print("    (converting df to matrix)")
    samples_matrix <- .fsqn_as_numeric_matrix(samples_df, samples_id_label$sample_id)

    print("    (previewing matrix before normalization call)")
    .fsqn_preview(samples_matrix)

    # code to deal with the target
    target_id_label <- arg_target %>% 
        select(sample_id, label)
    target_df <- data.frame(arg_target, check.names = FALSE) %>% 
        select(-sample_id, -label)
    print("    (arg_target - checking dimensions of without sample_id and labels columns)")
    print(dim(target_df)) 
    print("    (arg_target - converting df to matrix)")
    target_matrix <- .fsqn_as_numeric_matrix(target_df, target_id_label$sample_id)
    print("    (arg_target - previewing matrix before normalization call)")
    .fsqn_preview(target_matrix)

    # Both matrices are handed over without any feature key, so column i of the samples has to be the
    # same feature as column i of the target. Verify that here instead of relying on the two files
    # happening to share a column order.
    sample_features <- colnames(samples_matrix)
    target_features <- colnames(target_matrix)
    if (!identical(sample_features, target_features)) {
        if (ncol(samples_matrix) != ncol(target_matrix)) {
            stop(
                sprintf("apply_fsqn: arg_samples has %s feature columns but arg_target has %s",
                        ncol(samples_matrix), ncol(target_matrix)),
                call. = FALSE
            )
        }
        if (anyDuplicated(sample_features) == 0 && setequal(sample_features, target_features)) {
            print("    (arg_target - reordering feature columns to match arg_samples)")
            target_matrix <- target_matrix[, sample_features, drop = FALSE]
        } else {
            # same number of columns but different names: keep the positional pairing and say so
            warning("apply_fsqn: feature names of arg_samples and arg_target differ; ",
                    "columns are paired by position", call. = FALSE)
        }
    }

    # the actual normalization call
    print("   Performing the actual normalization call")
    matrix_norm <- quantileNormalizeByFeature(samples_matrix, target_matrix)

    # *notice* at this point we do not care about the target anymore, just deal with the non-target

    print("    (previewing matrix after normalization call; now creating header for new matrix ")
    colnames(matrix_norm) <- colnames(samples_matrix)
    rownames(matrix_norm) <- rownames(samples_matrix)
    .fsqn_preview(matrix_norm)

    print("    (converting matrix to df)")
    x_norm <- as.data.frame(matrix_norm)

    # put back non-numeric columns to df
    print("   Recreating the original df with updated expression values post normalization call")

    # sample_id travels through the normalization as the matrix row names
    x_norm$sample_id <- rownames(x_norm)

    print("    (checking dimensions of x_normalized)")
    print(dim(x_norm))

    print("    (previewing x_norm)")
    .fsqn_preview(x_norm)

    print("    (need to) perform an outer join with earlier df that contains label column")
    joined_df <- merge(x=x_norm, y=samples_id_label, by="sample_id", all=TRUE)

    print("    (reorganize df column order)")
    updated_samples <- joined_df %>% 
        select(sample_id, label, everything())

    print("  sort by label then by sample_id")
    # ungroup() so the caller does not receive a data frame that is still grouped by label
    updated_samples <- updated_samples %>%
       group_by(label) %>%
       arrange(sample_id, .by_group = TRUE) %>%
       ungroup()

    print("    (checking dimensions of tranposed df)")
    print(dim(updated_samples))

    print("    (previewing updated df)")
    .fsqn_preview(updated_samples)

    print("    (comparing back to arg_samples)")
    print(dim(arg_samples))
    .fsqn_preview(arg_samples)


    print("  end apply_fsqn, returning modified input")
    print("")

    return(updated_samples)
}

In [None]:
## Helper function

# Report every gene column whose values are all zero.
#   (arg_data)            data frame with one row per sample; the columns sample_id and label are metadata,
#                         every other column is treated as a gene
#   (arg_debug_filename)  base name for an optional debug snapshot; only used by the commented-out
#                         write_tsv code below, so it currently has no effect
#   (prints)              one line per all-zero gene plus the total count
check_if_any_genes_zero <- function(arg_data, arg_debug_filename){
    print("  begin check_if_any_genes_zero()")

    # work with a plain data frame; check.names = FALSE keeps the gene names exactly as given
    tmp_df <- data.frame(arg_data, check.names = FALSE)

    # only gene columns are inspected: an all-zero label column must not be counted as a gene
    is_gene_col <- !(colnames(tmp_df) %in% c("sample_id", "label"))
    gene_df <- tmp_df[, is_gene_col, drop = FALSE]

    print("")
    print("[DEBUG]---------------------------------[DEBUG]")

#     # (for testing only) set all value of this gene equal to zero
#     tmp_df$ENSG00000000419 <- 0

    ## checks for columns(=genes) where all values zero and prints out the name of gene
    # a column containing NA is never reported as all-zero; without the anyNA() guard
    # all(values == 0) would evaluate to NA and the condition could not be tested
    is_all_zero <- vapply(
        gene_df,
        function(values) !anyNA(values) && all(values == 0),
        logical(1)
    )

    for (cur_col_name in names(is_all_zero)[is_all_zero]){
        print(sprintf('  + found all zeroes in col @ %s', cur_col_name))
    }

    print(sprintf('  + [Total genes found with all zeros] %s', sum(is_all_zero)))

#     # be sure that you want to save the snapshots of files, because each one is many GBs and takes up lots of space!
#     print(sprintf('  + writing %s to file under results/debug/', arg_debug_filename))
#     save_filename_w_path_debug = paste(PATH_TO_DEBUG, arg_debug_filename, ".tsv", sep="")
#     write_tsv(arg_data, save_filename_w_path_debug)


    print("[END_DEBUG]-------------------------[END_DEBUG]")
    print("")


    print("  end check_if_any_genes_zero()")
}

In [None]:
# 20% TCGA test set: report all-zero genes, normalize against the 80% TCGA train set, then report again
print("  about to apply normalization function on -- 20% TCGA data for testing, with helper function too")
check_if_any_genes_zero(
  arg_data = samples_tcga_test,
  arg_debug_filename = "notebook09b_tcga_test_before_fsqn"
)
normalized_samples_tcga_test <- apply_fsqn(
  arg_samples = samples_tcga_test,
  arg_target = samples_tcga_train
)
check_if_any_genes_zero(
  arg_data = normalized_samples_tcga_test,
  arg_debug_filename = "notebook09b_tcga_test_after_fsqn"
)

In [None]:
# 100% GTEx: report all-zero genes, normalize against the 80% TCGA train set, then report again
print("  about to apply normalization function on -- 100% GTEx data for testing, with helper function too")
check_if_any_genes_zero(
  arg_data = samples_gtex,
  arg_debug_filename = "notebook09b_gtex_test_before_fsqn"
)
normalized_samples_gtex <- apply_fsqn(
  arg_samples = samples_gtex,
  arg_target = samples_tcga_train
)
check_if_any_genes_zero(
  arg_data = normalized_samples_gtex,
  arg_debug_filename = "notebook09b_gtex_test_after_fsqn"
)

## Combine 80% of TCGA and 20% of TCGA back into one dataset
- *important to not sort before saving*

In [None]:
# Stack the untouched train rows on top of the normalized test rows (train first, test second).
print("  combining train set (80% TCGA) and test set (20% TCGA)")
all_samples_tcga <- rbindlist(
  list(samples_tcga_train, normalized_samples_tcga_test)
)

# The combined row count should equal the sum of the two parts.
print("  checking dimensions of each individual and combined datasets")

print("   (100% TCGA samples)")
print(dim(all_samples_tcga))

print("   (80%  TCGA samples)")
print(dim(samples_tcga_train))

print("   (20%  TCGA samples)")
print(dim(normalized_samples_tcga_test))

## Save the files

In [None]:
# Write both result tables as TSV files into the preprocessing_combinations folder.
print("  Saving the files ")

# ---- TCGA: train rows plus normalized test rows ----
save_filename_w_path_tcga <- paste0("data/preprocessing_combinations/", save_filename_tcga)

sprintf("    writing tcga table to path -- %s", save_filename_w_path_tcga)
write_tsv(x = all_samples_tcga, file = save_filename_w_path_tcga)
print("  tcga table saved-- ")

# ---- GTEx: normalized samples ----
save_filename_w_path_gtex <- paste0("data/preprocessing_combinations/", save_filename_gtex)

sprintf("    writing gtex table to path -- %s", save_filename_w_path_gtex)
write_tsv(x = normalized_samples_gtex, file = save_filename_w_path_gtex)
print("  gtex table saved-- ")


