-
Notifications
You must be signed in to change notification settings - Fork 1
/
analyze_init.R
executable file
·124 lines (105 loc) · 4.59 KB
/
analyze_init.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
#!/opt/apps/R/3.0.1/bin/Rscript
#===============================================================================
#
# ooooo ooo . .o.
# `888b. `8' .o8 888
# 8 `88b. 8 .ooooo. .o888oo .ooooo. 888
# 8 `88b. 8 d88' `88b 888 d88' `88b Y8P
# 8 `88b.8 888 888 888 888ooo888 `8'
# 8 `888 888 888 888 . 888 .o .o.
# o8o `8 `Y8bod8P' "888" `Y8bod8P' Y8P
#
#
# This file is sourced by the other analysis scripts and should not be run
# directly by the user.
#
# The files should be run in the order:
# - setup.R
# - analyze_tune.R (preferably on a computation cluster or multicore machine)
# - analyze_final.R
#
#===============================================================================
# Load data and packages
#-------------------------------------------------------------------------------
tryCatch({
library(pamr)
library(predict)
library(nordlund2013)
load("data/methylation.Rdata")
load("data/annotations.Rdata")
load("data/phenotypes.Rdata")
# Remove sample not used in the study
# and sites that did not pass quality control
sample.idx <- !is.na(met.pheno$subtype) & !grepl("rep[2-9]$", met.pheno$id)
site.idx <- met.annot$analysed
# ^^^ needs to be saved as a separate variable since we need it later
met.pheno <- met.pheno[sample.idx,]
met.data <- t(met.data[site.idx, sample.idx])
met.annot <- met.annot[site.idx,]
}, error=function(...)
cat("Could not setup analysis environment. Did you run `setup.R` first?\n"))
#===============================================================================
# Prepare responses and output variables
#-------------------------------------------------------------------------------
dir.create("results", showWarnings=FALSE)
if(file.exists("results/pred.Rdata")){
load("results/pred.Rdata")
} else {
attach(met.pheno)
# Make the response vectors
class.types <- c("T-ALL", "HeH", "t(12;21)", "11q23/MLL", "t(1;19)", "dic(9;20)", "t(9;22)", "iAMP21")
known.types <- c(class.types, "<45chr", ">67chr", "biclone")
unknown.types <- c("other", "normal", "no result")
y <- data.frame(
reference = factor(ifelse(subtype == "reference", 1,
ifelse(subtype %in% known.types, 2, NA)),
labels=c("reference", "ALL")),
lapply(class.types, function(lev){
factor(ifelse(subtype %in% known.types, ifelse(subtype == lev, 1, 2), NA),
labels=c(lev, paste("not", lev)))
}),
sex = factor(ifelse(sample.type %in% c("diagnosis", "remission"),
sex, NA),
levels=1:2, labels=levels(sex))
)
names(y)[1+seq(class.types)] <- class.types
# Setup cross validation
set.seed(123)
tmp <- factor(paste(subtype, sex))
cv <- resample.crossval(tmp, nfold=5, nrep=5)
inner.cv <- lapply(cv, function(idx) resample.crossval(tmp, 5, 5, subset=!idx))
rm(tmp)
feat.sel <- cons <- structure(vector("list", ncol(y)), names=names(y))
save.workspace <- function(){
obj <- c("sample.idx", "y", "cv", "inner.cv",
"class.types", "known.types", "unknown.types",
"feat.sel", "cons.sites", "cons", "pred", "cons.pred",
"val.pred", "save.workspace")
save(list=obj[sapply(obj, exists)], file="results/pred.Rdata")
}
save.workspace()
detach(met.pheno)
}
#===============================================================================
# Feature selection plugin for predict::batch.predict()
#-------------------------------------------------------------------------------
design.feature_selection <- function(x, y, chr, cv){
if(missing(cv)){
cv <- resample.crossval(y, 5, 5)
} else {
cv[is.na(y),] <- NA
}
my.pre.trans <- function(...){
sets <- pre.trans.450k(..., feat=met.annot$CHR %in% chr, dbeta=.2, na.frac=.1)
predict::trace.msg(3, "%i features passed filter.", ncol(sets$design), time=FALSE)
sets
}
my.pred <- batch.predict(x, y, "nsc", test.subset=cv,
pre.trans=my.pre.trans, save.fits=TRUE, save.vimp=TRUE, .verbose=TRUE)
# Return an integer vector of how many times each feature had a coefficinent != 0
apply(sapply(subtree(my.pred$cv, T, "nsc", "vimp", flatten=2),
function(x) apply(abs(x), 1, sum) > 1e-12),
1, sum, na.rm=TRUE)
# 1e-12 for avoiding comparison with exactly 0.
# Might not be an issue, but doesn't harm.
}