# (Purpose) Filter and Clean aggregated datasets

In [None]:
# ---- Switch: keep only tissue types shared with GTEx ----------------------
# When TRUE, TCGA rows whose label is absent from the GTEx labels are dropped
# further down. Set to FALSE to skip that step.
FORCE_COMMON_TISSUE_TYPES <- TRUE

# ---- TCGA dataset: file names (mock + real) and their relative paths ------
filename_tcga_mock <- "tcga_unscaled_unnormalized_nobatchcorrection__mockData.tsv"
filename_tcga <- "tcga_unscaled_unnormalized_nobatchcorrection.tsv"  # real data

filename_tcga_w_path_mock <- paste0("data/preprocessing_combinations/", filename_tcga_mock)
filename_tcga_w_path <- paste0("data/preprocessing_combinations/", filename_tcga)

# ---- GTEx dataset: file names (mock + real) and their relative paths ------
filename_gtex_mock <- "gtex_unscaled_unnormalized_nobatchcorrection__mockData.tsv"
filename_gtex <- "gtex_unscaled_unnormalized_nobatchcorrection.tsv"  # real data

filename_gtex_w_path_mock <- paste0("data/preprocessing_combinations/", filename_gtex_mock)
filename_gtex_w_path <- paste0("data/preprocessing_combinations/", filename_gtex)


## Install and load packages required in R

In [None]:
# Install the CRAN packages this notebook needs.
# Only packages that are not already available are installed, so re-running
# the notebook does not rebuild everything each time.
print("  begin -- installing R packages")

# Option read by install.packages() when it has to choose between a binary
# and a source build of a package.
options(install.packages.compile.from.source = "always")

required_packages <- c("dplyr", "readr", "data.table")
is_installed <- vapply(required_packages, requireNamespace, logical(1), quietly = TRUE)
missing_packages <- required_packages[!is_installed]

if (length(missing_packages) > 0) {
    # Use the first entry of the CRAN mirror table as the repository.
    install.packages(missing_packages, repos = getCRANmirrors()[1, "URL"])
}

print("  done  -- installing R packages")

In [None]:
# Attach the packages used by the rest of the notebook.
print("  begin -- loading packages")

library(dplyr)       # pipe and data-frame verbs (group_by, arrange, filter, ...)
library(readr)       # read_tsv / write_tsv
library(data.table)  # rbindlist

print("  done  -- loading packages")

## Load GTEx data and subset

In [None]:
# Read both GTEx tables (mock first, then real) with readr.
# Reminder: the reported total number of rows excludes the header row.
sprintf("  loading gtex mock table -- %s", filename_gtex_mock)
samples_gtex_mock <- read_tsv(file = filename_gtex_w_path_mock)
print("  finished loading mock table")

sprintf("  loading gtex real table -- %s", filename_gtex)
samples_gtex <- read_tsv(file = filename_gtex_w_path)
print("  finished loading real table")

In [None]:
# Mock GTEx, before sorting: first and last two rows of the first five columns.
head(samples_gtex_mock[, 1:5], n = 2)
tail(samples_gtex_mock[, 1:5], n = 2)

In [None]:
# Real GTEx, before sorting: first and last two rows of the first five columns.
head(samples_gtex[, 1:5], n = 2)
tail(samples_gtex[, 1:5], n = 2)

In [None]:
# Group each GTEx table by label, then order rows by label and sample_id.
# The result stays grouped by label.
print("  sort by label then by sample_id")
samples_gtex_mock <- arrange(group_by(samples_gtex_mock, label), sample_id, .by_group = TRUE)

samples_gtex <- arrange(group_by(samples_gtex, label), sample_id, .by_group = TRUE)

In [None]:
# Mock GTEx, after sorting by label: first and last two rows, first five columns.
head(samples_gtex_mock[, 1:5], n = 2)
tail(samples_gtex_mock[, 1:5], n = 2)

In [None]:
# Real GTEx, after sorting by label: first and last two rows, first five columns.
head(samples_gtex[, 1:5], n = 2)
tail(samples_gtex[, 1:5], n = 2)

In [None]:
# Number of rows per label in the mock GTEx table.
print("  checking tally of labels on mock gtex dataset")
tally(group_by(samples_gtex_mock, label))

In [None]:
# Number of rows per label in the real GTEx table.
print("  checking tally of labels on real gtex dataset")
tally(group_by(samples_gtex, label))

### Determine what tissue types are the minimum set in the independent set

In [None]:
# When the switch is on, record the distinct labels of the real GTEx table.
# They are used further down to restrict the TCGA tables.
if (FORCE_COMMON_TISSUE_TYPES) {
    print (" -----------------------------------------------------------------------")
    print (" [NOTICE] FORCE_COMMON_TISSUE_TYPES is TRUE; thus we will drop tissue types in TCGA training set")
    print (" -----------------------------------------------------------------------")

    # one row per distinct label, then extract that column as a plain vector
    gtex_distinct_labels <- distinct(samples_gtex, label)
    minimum_tissue_types <- pull(gtex_distinct_labels, label)
    print(minimum_tissue_types)
}


## Load TCGA data and subset

In [None]:

# Read both TCGA tables (mock first, then real) with readr.
# Reminder: the reported total number of rows excludes the header row.
sprintf("  loading tcga mock table -- %s", filename_tcga_mock)
samples_mock <- read_tsv(file = filename_tcga_w_path_mock)
print("  finished loading mock table")

sprintf("  loading tcga real table -- %s", filename_tcga)
samples <- read_tsv(file = filename_tcga_w_path)
print("  finished loading real table")

In [None]:
# Mock TCGA, before sorting by label: first and last two rows, first five columns.
head(samples_mock[, 1:5], n = 2)
tail(samples_mock[, 1:5], n = 2)

In [None]:
# Real TCGA, before sorting by label: first and last two rows, first five columns.
head(samples[, 1:5], n = 2)
tail(samples[, 1:5], n = 2)

In [None]:
# Group each TCGA table by label, then order rows by label and sample_id.
# The result stays grouped by label.
print("  sort by label then by sample_id")
samples_mock <- arrange(group_by(samples_mock, label), sample_id, .by_group = TRUE)

samples <- arrange(group_by(samples, label), sample_id, .by_group = TRUE)
      

In [None]:
# Mock TCGA, after sorting by label: first and last two rows, first five columns.
head(samples_mock[, 1:5], n = 2)
tail(samples_mock[, 1:5], n = 2)

In [None]:
# Real TCGA, after sorting by label: first and last two rows, first five columns.
head(samples[, 1:5], n = 2)
tail(samples[, 1:5], n = 2)

In [None]:
# Rows per label in the mock TCGA table, before any rows are dropped.
print("  checking tally of labels before modifications on mock tcga dataset")
tally(group_by(samples_mock, label))

In [None]:
# Rows per label in the real TCGA table, before any rows are dropped.
print("  checking tally of labels before modifications on real tcga dataset")
tally(group_by(samples, label))

### Regarding ESCC, STAD.EBV, and LUSC

- (from Dr. Han) *Those are labels that came out of the GI paper, which identified subtypes within GI.
I think I separated out (or included) LUSC because it turns out the squamous cell type is one of the culprits for the misclassification, and I wanted to see how LUSC behaves. They are informative for interpreting the misclassifications, because misclassifications tend to map to those GI subtypes or to squamous cell types.*

In [None]:
# Remove the rows labelled ESCC, STAD.EBV or LUSC from both TCGA tables.
# The three conditions are combined with AND inside a single filter() call.
print("  drop the sample types (ESCC, LUSC, STAD.EBV) that are being used for miscalculation exploration")
samples_mock <- filter(
    samples_mock,
    label != "ESCC",
    label != "STAD.EBV",
    label != "LUSC"
)

samples <- filter(
    samples,
    label != "ESCC",
    label != "STAD.EBV",
    label != "LUSC"
)

### If applicable - drop tissue types from TCGA train data

In [None]:
# When the switch is on, keep only the TCGA rows whose label is one of the
# labels collected from GTEx (minimum_tissue_types).
if (FORCE_COMMON_TISSUE_TYPES) {
    print("    performing actual subset function that drops tissue types")
    samples_mock <- filter(samples_mock, label %in% minimum_tissue_types)
    samples <- filter(samples, label %in% minimum_tissue_types)
}


In [None]:
# Rows per label in the mock TCGA table, after the label filters above.
print("  checking tally of labels after modifications on mock tcga dataset")
tally(group_by(samples_mock, label))

In [None]:
# Rows per label in the real TCGA table, after the label filters above.
print("  checking tally of labels after modifications on real tcga dataset")
tally(group_by(samples, label))

In [None]:
print("  checking how many normal samples and how many cancer samples there are")

# Count rows whose sample_id matches "-01A" / "-11A"; the leading dash is
# part of the pattern.
count_01A <- nrow(filter(samples, grepl("-01A", sample_id)))
count_11A <- nrow(filter(samples, grepl("-11A", sample_id)))
count_either <- nrow(filter(samples, grepl("-11A|-01A", sample_id)))
# Rows (not a count) whose sample_id contains both "01A" and "11A";
# no dash is required by these two patterns.
count_both <- filter(samples, grepl("01A", sample_id), grepl("11A", sample_id))

# Report the counts
sprintf("    01A(cancer): %s", count_01A)
sprintf("    11A(normal): %s", count_11A)
sprintf("         either: %s", count_either)
sprintf("          TOTAL: %s", count_01A + count_11A)

sprintf("    samples that have both 01A and 11A in name below:")
head(count_both[, 1:5], n = 5)

In [None]:
print("  checking how many normal samples and how many cancer samples there are in GI")
# Restrict to the rows labelled "GI" before counting.
samples_GI <- filter(samples, label == "GI")

# Count rows whose sample_id matches "-01A" / "-11A"; the leading dash is
# part of the pattern.
count_01A <- nrow(filter(samples_GI, grepl("-01A", sample_id)))
count_11A <- nrow(filter(samples_GI, grepl("-11A", sample_id)))
count_either <- nrow(filter(samples_GI, grepl("-11A|-01A", sample_id)))
# Rows (not a count) whose sample_id contains both "01A" and "11A".
count_both <- filter(samples_GI, grepl("01A", sample_id), grepl("11A", sample_id))

# Report the counts
sprintf("    01A(cancer): %s", count_01A)
sprintf("    11A(normal): %s", count_11A)
sprintf("         either: %s", count_either)
sprintf("          TOTAL: %s", count_01A + count_11A)


## Split tcga into 80% train and 20% test - real data

- (source) https://www.statology.org/train-test-split-r/
- need to consider this to prevent data leakage
- will need to recombine later before potential batch effect correction
- the saved TCGA dataset will not be sorted (the train rows are written first, followed by the test rows)
    * thus, before ML, we will need to:
    * extract the top 80% of the dataset to get the train data
    * extract the remaining 20% of the dataset to get the test data

In [None]:
# Fix the RNG seed so the random split below is reproducible.
set.seed(12345)

print("  sort tcga by label then by sample_id")
samples <- arrange(group_by(samples, label), sample_id, .by_group = TRUE)

# samples is grouped by label, so the 80% draw is made within each label.
# The test set is every row whose sample_id was not drawn for training.
print("  perform the actual train-test split at 80%")
samples_tcga_train <- sample_frac(samples, 0.80)
samples_tcga_test <- anti_join(samples, samples_tcga_train, by = "sample_id")

print("  sort tcga_train by label then by sample_id")
samples_tcga_train <- arrange(group_by(samples_tcga_train, label), sample_id, .by_group = TRUE)

print("  sort tcga_test by label then by sample_id")
samples_tcga_test <- arrange(group_by(samples_tcga_test, label), sample_id, .by_group = TRUE)

print("  checking dimensions and tally of 80% TCGA for train set")
dim(samples_tcga_train)
tally(group_by(samples_tcga_train, label))

print("  checking dimensions and tally of 20% TCGA for test set")
dim(samples_tcga_test)
tally(group_by(samples_tcga_test, label))

In [None]:
# Real TCGA train split: first and last two rows of the first five columns.
print("previewing the top and bottom 80% tcga table")
print(head(samples_tcga_train[, 1:5], n = 2))
print(tail(samples_tcga_train[, 1:5], n = 2))

In [None]:
# Real TCGA test split: first and last two rows of the first five columns.
print("previewing the top and bottom 20% tcga table")
print(head(samples_tcga_test[, 1:5], n = 2))
print(tail(samples_tcga_test[, 1:5], n = 2))

## Split tcga into 80% train and 20% test - mock data

In [None]:
# Fix the RNG seed so the random split below is reproducible.
set.seed(12345)

print("  sort tcga by label then by sample_id")
samples_mock <- arrange(group_by(samples_mock, label), sample_id, .by_group = TRUE)

# samples_mock is grouped by label, so the 80% draw is made within each label.
# The test set is every row whose sample_id was not drawn for training.
print("  perform the actual train-test split at 80%")
samples_mock_tcga_train <- sample_frac(samples_mock, 0.80)
samples_mock_tcga_test <- anti_join(samples_mock, samples_mock_tcga_train, by = "sample_id")

print("  sort tcga_train by label then by sample_id")
samples_mock_tcga_train <- arrange(group_by(samples_mock_tcga_train, label), sample_id, .by_group = TRUE)

print("  sort tcga_test by label then by sample_id")
samples_mock_tcga_test <- arrange(group_by(samples_mock_tcga_test, label), sample_id, .by_group = TRUE)

print("  checking dimensions and tally of 80% TCGA for train set")
dim(samples_mock_tcga_train)
tally(group_by(samples_mock_tcga_train, label))

print("  checking dimensions and tally of 20% TCGA for test set")
dim(samples_mock_tcga_test)
tally(group_by(samples_mock_tcga_test, label))

In [None]:
# Mock TCGA train split: first and last two rows of the first five columns.
print("previewing the top and bottom 80% tcga table")
print(head(samples_mock_tcga_train[, 1:5], n = 2))
print(tail(samples_mock_tcga_train[, 1:5], n = 2))

In [None]:
# Mock TCGA test split: first and last two rows of the first five columns.
print("previewing the top and bottom 20% tcga table")
print(head(samples_mock_tcga_test[, 1:5], n = 2))
print(tail(samples_mock_tcga_test[, 1:5], n = 2))

## Drop genes with zero expression based on 80% TCGA train

In [None]:
## function to find zero-expression genes

# Find the columns (= genes) of a table whose every value equals zero.
#
# (arg)     arg_data - data frame / tibble; every column is checked
# (returns) list of column names (one string per element) whose values are
#           all zero; an empty list when no such column exists.
#           A column containing NA is never reported, because it cannot be
#           confirmed to be all zero.
find_zero_expression_genes <- function(arg_data){
    print("  begin find_zero_expression_genes()")

    # One TRUE/FALSE per column. seq_len() copes with a table that has no
    # columns, and isTRUE() turns the NA that all() yields for columns
    # containing NA into FALSE instead of an error inside if().
    is_all_zero <- vapply(
        seq_len(ncol(arg_data)),
        function(column) isTRUE(all(arg_data[[column]] == 0)),
        logical(1)
    )
    zero_gene_names <- colnames(arg_data)[is_all_zero]

    # report every all-zero column by name
    for (cur_col_name in zero_gene_names){
        print(sprintf('  + found all zeroes in col @ %s', cur_col_name))
    }

    print(sprintf('  + [Total genes found with all zeros] %s', length(zero_gene_names)))

    print("  end find_zero_expression_genes()")
    # callers receive a list, as before
    return (as.list(zero_gene_names))
}

In [None]:
## function to drop zero-expression genes

# (source to help with excluding) https://stackoverflow.com/questions/49515311/dplyr-select-all-variables-except-for-those-contained-in-vector

# Remove the named columns (= genes) from a table.
#
# (arg)     arg_data          - data frame / tibble to subset
# (arg)     arg_genes_to_drop - list (or vector) of column names to remove;
#                               names that are not columns of arg_data are
#                               ignored, and an empty list removes nothing
# (returns) updated DF with the listed columns dropped
drop_zero_expression_genes_on_dataset <- function(arg_data, arg_genes_to_drop){
    print("  begin drop_zero_expression_genes_on_dataset()")

    print("    check dimensions before subset")
    print(dim(arg_data))

    # Flatten to a plain character vector. unlist() of an empty list is NULL,
    # which as.character() turns into character(0), so the "nothing to drop"
    # case reaches select() as a valid, empty selection.
    genes_to_drop <- as.character(unlist(arg_genes_to_drop))

    print("    performing actual subset function that drops zero expression genes")
    # any_of() tolerates names that are absent from arg_data
    return_df <- arg_data %>%
        select(-any_of(genes_to_drop))

    print("    check dimensions after subset")
    print(dim(return_df))

    print("  end drop_zero_expression_genes_on_dataset()")
    return (return_df)
}

In [None]:
# The all-zero genes are determined from the train split only.
print("  finding zero expression genes from -- 80% TCGA train")
genes_to_drop <- find_zero_expression_genes(arg_data = samples_tcga_train)

In [None]:
# Remove the all-zero genes from the TCGA train split.
print("  dropping zero expression genes for dataset -- 80% TCGA train")
samples_tcga_train <- drop_zero_expression_genes_on_dataset(
    arg_data = samples_tcga_train,
    arg_genes_to_drop = genes_to_drop
)

In [None]:
# Remove the same genes (found on the train split) from the TCGA test split.
print("  dropping zero expression genes for dataset -- 20% TCGA test")
samples_tcga_test <- drop_zero_expression_genes_on_dataset(
    arg_data = samples_tcga_test,
    arg_genes_to_drop = genes_to_drop
)

In [None]:
# Remove the same genes (found on the TCGA train split) from the real GTEx table.
print("  dropping zero expression genes for dataset -- 100% GTEX test")
samples_gtex <- drop_zero_expression_genes_on_dataset(
    arg_data = samples_gtex,
    arg_genes_to_drop = genes_to_drop
)

## Combine 80% of TCGA and 20% of TCGA back into one dataset
- *important to not sort before saving*

In [None]:
# Stack the real train rows on top of the real test rows (train first).
print("  combining train set (80% TCGA) and test set (20% TCGA) -- real data")
samples <- rbindlist(l = list(samples_tcga_train, samples_tcga_test))

# Report the dimensions of the combined table and of each part.
print("  checking dimensions of each individual and combined datasets")
print("   (100% TCGA samples)")
print(dim(samples))
print("   (80%  TCGA samples)")
print(dim(samples_tcga_train))
print("   (20%  TCGA samples)")
print(dim(samples_tcga_test))

In [None]:
# Stack the mock train rows on top of the mock test rows (train first).
print("  combining train set (80% TCGA) and test set (20% TCGA) -- mock data")
samples_mock <- rbindlist(l = list(samples_mock_tcga_train, samples_mock_tcga_test))

# Report the dimensions of the combined table and of each part.
print("  checking dimensions of each individual and combined datasets")
print("   (100% TCGA samples)")
print(dim(samples_mock))
print("   (80%  TCGA samples)")
print(dim(samples_mock_tcga_train))
print("   (20%  TCGA samples)")
print(dim(samples_mock_tcga_test))

## Saving TCGA

In [None]:
# Write the combined TCGA tables back to disk.
# NOTE: the output paths are the same paths the tables were read from, so the
# input files are overwritten.
print("  saving table back to file ")
print(sprintf("    writing mock tcga table -- %s", filename_tcga_mock))
write_tsv(samples_mock, filename_tcga_w_path_mock)
print(  "    mock table saved-- ")

print(sprintf("    writing real tcga table -- %s", filename_tcga))
write_tsv(samples, filename_tcga_w_path)
print(  "    real table saved-- ")

## Saving GTEx

In [None]:
# Write the GTEx tables back to disk.
# NOTE: the output paths are the same paths the tables were read from, so the
# input files are overwritten.
print("  saving table back to file ")
print(sprintf("    writing mock gtex table -- %s", filename_gtex_mock))
write_tsv(samples_gtex_mock, filename_gtex_w_path_mock)
print(  "    mock table saved-- ")

print(sprintf("    writing real gtex table -- %s", filename_gtex))
write_tsv(samples_gtex, filename_gtex_w_path)
print(  "    real table saved-- ")