In [1]:
library(dplyr) # data wrangling
library(ggplot2) # plotting
library(DESeq2) # rna-seq
library(edgeR) # rna-seq
library(tximport) # importing kallisto transcript counts to gene level
library(readr) # fast reading of files
library(rhdf5) # read/convert kallisto output files


Attaching package: ‘dplyr’


The following objects are masked from ‘package:stats’:

    filter, lag


The following objects are masked from ‘package:base’:

    intersect, setdiff, setequal, union


Loading required package: S4Vectors

Loading required package: stats4

Loading required package: BiocGenerics


Attaching package: ‘BiocGenerics’


The following objects are masked from ‘package:dplyr’:

    combine, intersect, setdiff, union


The following objects are masked from ‘package:stats’:

    IQR, mad, sd, var, xtabs


The following objects are masked from ‘package:base’:

    anyDuplicated, append, as.data.frame, basename, cbind, colnames,
    dirname, do.call, duplicated, eval, evalq, Filter, Find, get, grep,
    grepl, intersect, is.unsorted, lapply, Map, mapply, match, mget,
    order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
    rbind, Reduce, rownames, sapply, setdiff, sort, table, tapply,
    union, unique, unsplit, which.max, which.min



Attaching package

In [None]:
## Locate the kallisto output of every training sample.
## Each id listed in mixed_train_names_list.csv is mapped to the path
##   <samples dir>/mixed_train_sample_id<id>_counts/abundance.tsv
fpath_fqd <- file.path('../','data', 'fastqs_output','train','samples')
train_names <- read.delim(file.path('../','data', 'fastqs_output','train','mixed_train_names_list.csv'), sep=',')
folders <- paste0(fpath_fqd, '/mixed_train_sample_id', train_names[['id']], '_counts')
kallisto_abundance_files <- paste0(folders,'/','abundance.tsv')

## Keep only the samples whose abundance file is actually present on disk.
## file.exists() is vectorised, so the positions can be computed in one step
## instead of growing two vectors inside a loop.
## existing_indices holds positions into kallisto_abundance_files (and hence
## row positions of train_names).
existing_indices <- which(file.exists(kallisto_abundance_files))
existing_kallisto_abundance_files <- kallisto_abundance_files[existing_indices]

## Fail early with a clear message rather than passing an empty file list on.
if (length(existing_indices) == 0) {
    stop('No kallisto abundance.tsv file was found under ', fpath_fqd, call. = FALSE)
}

In [None]:
# Load the transcript-to-gene reference (Homo_sapiens.GRCh38).
# The file is read without a header and its second column is dropped; the
# remaining columns are passed to tximport as the tx2gene mapping.
tx2gene <- read.delim(file.path('../', 'data', 'transcripts_to_genes.txt'), header=FALSE)[,-2]

# Aggregate the per-sample kallisto abundances using the tx2gene mapping.
output <- tximport(existing_kallisto_abundance_files, type = "kallisto", tx2gene = tx2gene, ignoreAfterBar = TRUE)

# Retrieve the count matrix and label its columns with the sample id.
count_matrix <- output$counts

# Each abundance file lives in a folder named mixed_train_sample_id<id>_counts;
# the 4th '_'-separated token of that folder name ("id<id>") is the label.
# The folder is taken with basename(dirname()) so the result does not depend
# on how many components the path prefix happens to have.
sample_dirs <- basename(dirname(existing_kallisto_abundance_files))
colnames(count_matrix) <- vapply(
    strsplit(sample_dirs, '_', fixed = TRUE),
    function(parts) parts[4],
    character(1)
)

In [None]:

# The bulk RNA table written below is the count matrix built above.
train_mixed_bulkrna <- count_matrix

# Read mixed_train_proportions.csv and keep only the rows selected by
# existing_indices (the samples whose abundance file was found).
train_mixed_proportions <- read.csv(
    file.path('../', 'data', 'train', 'mixed_train_proportions.csv')
)[existing_indices,]

# Name each proportions row "<count-matrix column name> - <samples_used>".
rownames(train_mixed_proportions) <- paste0(
    colnames(train_mixed_bulkrna),
    ' - ',
    train_names[['samples_used']][existing_indices]
)

# Write both tables as tab-separated files under ../data.
write.table(
    train_mixed_bulkrna,
    file = file.path('../', 'data', 'bulkrna_train_mixed.tsv'),
    sep = '\t'
)
write.table(
    train_mixed_proportions,
    file = file.path('../', 'data', 'proportions_train_mixed.tsv'),
    sep = '\t'
)