# Preprocess data

Aims:
- Write the probe information from the two datasets
- Save the formatted data matrices (GEP and methylation)

In [52]:
suppressMessages({
    library(hashmap)
    library(tidyverse)
    library(mygene)
})
# Strip trailing whitespace from each element of a character vector.
#   x: character vector (anything `sub()` accepts)
# Returns `x` with the whitespace run at the end of each element removed;
# leading and interior whitespace are left untouched.
trim.trailing <- function(x) {
  stripped <- sub(pattern = "\\s+$", replacement = "", x = x)
  stripped
}

In [48]:
# Read the filtered gene-expression (GEP) and methylation tables.
# Both files are tab-separated; paths are relative to the notebook location.
gep <- read_tsv(file = '../../data/Figueroa/GEP-filtered.tsv')
methylation <- read_tsv(file = '../../data/Figueroa/methylation-filtered.tsv')

Parsed with column specification:
cols(
  .default = col_double(),
  Name = col_character(),
  Description = col_character()
)
See spec(...) for full column specifications.
Parsed with column specification:
cols(
  .default = col_double(),
  Gene = col_character(),
  Name = col_character()
)
See spec(...) for full column specifications.


## Preprocess Expression Array

In [49]:
# Build a numeric matrix from `gep`, leaving out its first two columns
# (the non-numeric annotation columns). Rows are labelled with `gep$Name`,
# columns with the remaining column names of `gep`.
gep.mat <- data.matrix(gep[, -(1:2)])
dimnames(gep.mat) <- list(gep$Name, names(gep)[-(1:2)])
head(gep.mat, n = 2) # not normalized

Unnamed: 0,GSM464771,GSM464772,GSM464773,GSM464774,GSM464775,GSM464776,GSM464777,GSM464778,GSM464779,GSM464780,⋯,GSM465020,GSM465024,GSM465026,GSM465030,GSM465060,GSM465061,GSM465062,GSM465063,GSM465064,GSM465065
10000_at,74.3174,27.66245,102.0721,101.6374,31.43173,74.90709,68.8868,114.4036,98.7125,103.8562,⋯,106.8033,207.4528,120.7066,84.64849,139.1024,43.22202,236.6838,210.127,220.1624,147.4783
10001_at,660.5697,536.1886,314.0691,856.6095,708.81936,433.50528,528.4353,448.2595,543.218,707.8009,⋯,406.9814,586.143,410.5169,486.99356,547.8272,587.26037,549.8494,398.5402,457.6389,384.2906


In [50]:
# Split every probe's `Description` on ' - ' into a gene part (text before the
# first separator) and a description part (everything after it), then save
# the probe -> gene/description lookup table.
name <- gep[, seq(1)]  # first column of `gep`, kept as a one-column table

# Split once per probe instead of once per derived column.
pieces <- strsplit(as.vector(gep$Description), ' - ', fixed = TRUE)

# Gene: first piece, trailing whitespace removed (NA when there is no text).
gene <- vapply(pieces, function(p) trim.trailing(p[1]), character(1))

# Description: all remaining pieces re-joined with the separator, so a
# description that itself contains ' - ' is no longer cut off at the second
# separator. NA when the entry has no separator at all.
desc <- vapply(pieces, function(p) {
  if (length(p) < 2) {
    return(NA_character_)
  }
  trim.trailing(paste(p[-1], collapse = ' - '))
}, character(1))

# Keep the element names (the original Description strings) that the previous
# sapply()-based version attached to both vectors.
names(gene) <- names(desc) <- as.vector(gep$Description)

write.table(data.frame(name=name, gene=gene, desc=desc), file='../../data/Figueroa/gep-name2desc.tsv', sep='\t', col.names=TRUE, row.names=FALSE, quote=FALSE)

In [None]:
# Query GO annotations for every gene symbol with mygene's queryMany().
annotations <- queryMany(gene, scopes='symbol', fields=c('go'), species='human', returnall=TRUE)

# Build `terms`: a two-column character matrix with one row per returned hit
# that was found -- column 1 is the query symbol, column 2 its GO
# biological-process terms joined with ', '.
response <- annotations$response

# A hit counts as found when its `notfound` entry is NA.
# NOTE(review): the `notfound` column is presumably absent when every query
# resolved; that case is treated as "all found" instead of failing on
# `if (logical(0))` -- confirm against the mygene return format.
notfound.flag <- response$notfound
found.idx <- if (is.null(notfound.flag)) {
  seq_along(response$query)
} else {
  which(is.na(notfound.flag))
}

# One string of BP terms per found hit; an entry without BP terms yields "".
bp.terms <- vapply(found.idx, function(i) {
  paste(as.character(response$go.BP[[i]]$term), collapse=', ')
}, character(1))

# Assemble the matrix in one step (no rbind() growth inside a loop). With no
# found hits this is a 0 x 2 matrix, so later `terms[,1]` indexing still works.
terms <- matrix(c(as.character(response$query[found.idx]), bp.terms), ncol=2)

In [59]:
# Spot-check the GO terms collected for DNMT3B (a methyltransferase).
# drop=FALSE keeps the selection a two-column matrix even when exactly one
# row matches, instead of collapsing it to a plain character vector.
head(terms[terms[,1]=='DNMT3B', , drop=FALSE])

In [60]:
# Collapse `terms` to one row per query symbol: all annotation strings that
# belong to the same symbol are joined with ", ". Rows keep the order in which
# each symbol first appears. The grouping is removed again so that neither
# `terms.cleaned` nor `terms.filtered` carries a leftover group_by() state.
terms.cleaned <- data.frame(name=terms[,1], annotation=terms[,2]) %>%
  group_by(name) %>%
  mutate(annotations = paste0(annotation, collapse = ", ")) %>%
  ungroup() %>%
  dplyr::select(-annotation) %>%
  distinct()

# Keep only symbols whose joined annotations mention 'methyl' or 'transcrip'.
terms.filtered <- terms.cleaned %>%
  dplyr::filter(grepl('methyl|transcrip', annotations)) # 3415 left
dim(terms.filtered)

In [69]:
# Save the filtered gene/annotation table as a tab-separated file with a
# header row, no row names and no quoting.
write.table(
  terms.filtered,
  file = '../../data/Figueroa/genes-transcription-related.tsv',
  sep = '\t',
  quote = FALSE,
  col.names = TRUE,
  row.names = FALSE
)

In [66]:
# Keep only the expression rows whose gene symbol is in the filtered
# annotation table, and label those rows by gene symbol.
filtered.idx <- which(gene %in% terms.filtered$name)
# drop = FALSE: with exactly one matching gene the subset would otherwise
# collapse to a plain vector and the rownames<- assignment below would fail.
gep.filtered <- gep.mat[filtered.idx, , drop = FALSE]
rownames(gep.filtered) <- gene[filtered.idx]

In [67]:
# Save the filtered expression matrix as a tab-separated file, keeping both
# the row names (gene symbols) and the column names, without quoting.
write.table(
  gep.filtered,
  file = '../../data/Figueroa/gep-transcription-related.tsv',
  sep = '\t',
  quote = FALSE,
  col.names = TRUE,
  row.names = TRUE
)