-
Notifications
You must be signed in to change notification settings - Fork 0
/
ML_cobre.R
224 lines (191 loc) · 8 KB
/
ML_cobre.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Code for ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ #
# #
# #
# Multivariate and Neuroimaging Methods in Psychiatry #
# #
# R software hands on session #
# #
# #
# #
# #
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ #
# ---------------------------------- 0: load libraries -----------------------------------
# Install only the required packages that are actually missing; calling
# install.packages() unconditionally can trigger a mirror prompt / spurious
# install attempt even when everything is already present.
# "e1071" is not attached below: mlr loads it on demand for "classif.svm".
packages <- c("mlr", "tidyverse", "e1071")
missing_packages <- setdiff(packages, rownames(installed.packages()))
if (length(missing_packages) > 0) {
  install.packages(missing_packages)
}
library(mlr)
library(tidyverse)
# ---------------------------------- 1: load & preprocess data -----------------------------------
# load data (expects the CSV in the current working directory)
data_cobre_neuropsych <- read_csv("data_cobre_neuropsych.csv")
# first, some basic preprocessing
data_cobre_neuropsych <- data_cobre_neuropsych %>%
mutate(across(
# recode the missing-data codes 8888 and 9999 to NA in all neuropsych
# score columns; id, group and demographic columns are left untouched
.cols = -c("id", "study_group", "age", "gender"),
~ if_else(. %in% c(8888, 9999), NA_real_, .)
)) %>%
# collapse every non-control study group into a single "Patient" level
mutate(outcome = if_else(study_group == "Control", "Control", "Patient")) %>%
# then, we convert our outcome (and gender) to factors, as required by mlr
mutate(outcome = as.factor(outcome),
gender = as.factor(gender)) %>%
select(-c(id, study_group)) # we don't need id & study_group anymore, so we drop it
# ---------------------------------- 2: check missingness in data -----------------------------------
# count missing values per column => tells us whether imputation is needed
data_cobre_neuropsych %>%
  summarize(across(everything(), function(col) sum(is.na(col))))
# overall missingness is low, but some values are missing, so we still impute
# ---------------------------------- 3: set up machine learning pipeline (nested resampling) -----------------------------------
# nested resampling is computationally expensive; the optimization/resampling
# iteration counts used below are deliberately low and should both be
# increased for a real analysis
# --------- 3.1: make a learning task ---------
# a learning task bundles the data set with the meta-information of the
# problem, e.g. which column is the target of a supervised classification
task_cobre_neuropsych <- makeClassifTask(
  id = "cobre_neuropsych_task",
  data = data_cobre_neuropsych,
  target = "outcome",
  positive = "Patient" # "Patient" is the positive class for TPR/TNR
)
# inspect the task we just created
task_cobre_neuropsych
# 189 observations: 90 control, 99 patients => good news: our task is balanced
# --------- 3.2: construct a learner ---------
# the learner is the concrete algorithm we want to train:
# a linear support vector machine as implemented in the e1071 package
lrn <- makeLearner(cl = "classif.svm", kernel = "linear")
# In order to tune a machine learning algorithm, you have to specify:
# 1) the search space
# 2) the optimization algorithm (aka tuning method)
# 3) an evaluation method, i.e., a resampling strategy and a performance measure
# 1) search space: tune the SVM cost on a log2 scale — the sampled value
# x in [-5, 5] is transformed to cost = 2^x
ps <- makeParamSet(
  makeNumericParam(
    "cost",
    lower = -5,
    upper = 5,
    trafo = function(x) 2^x,
    # The default lives on the UNTRANSFORMED scale: 0 maps to cost = 2^0 = 1,
    # which is e1071::svm()'s own default. (The original value 0.01 was on
    # the wrong scale — it would have meant cost = 2^0.01 ~ 1.007, not the
    # presumably intended cost = 0.01.)
    default = 0
  )
)
# 2) the optimization algorithm (aka tuning method)
# random search for hyperparameter optimization — simple but not ideal;
# more advanced methods (e.g. model-based optimization) could be used.
# budget should be at least 50, better around 250
ctrl <- makeTuneControlRandom(budget = 50)
# 3) an evaluation method, i.e., a resampling strategy
# inner resampling scheme: 2x repeated 5-fold CV, stratified by class
inner <- makeResampleDesc(
  method = "RepCV",
  folds = 5,
  reps = 2,
  stratify = TRUE
)
# --------- 3.3: impute missing values ---------
# wrap the learner so that missing numeric values are imputed inside each
# inner resampling fold (here: median; mean or kNN imputation would also work)
lrn <- makeImputeWrapper(lrn, classes = list(numeric = imputeMedian()))
# --------- 3.4: feature preprocessing ---------
# Two ALTERNATIVE preprocessing pipelines; apply exactly ONE of them.
# NOTE: as originally written, sourcing the script executed BOTH wrappers,
# stacking the z-transformation and the PCA wrapper on top of each other —
# option b) is therefore commented out below.
# a) z-transformation of every variable
lrn <- makePreprocWrapperCaret(
  lrn,
  ppc.scale = TRUE, # ensures variables are scaled, is necessary for SVM
  ppc.center = TRUE
)
# b) PCA retaining 90% of the variance — uncomment to use INSTEAD of a)
# (scaling/centering is necessary for meaningful results in PCA)
# lrn <-
#   makePreprocWrapperCaret(
#     lrn,
#     ppc.pca = TRUE,
#     ppc.thresh = 0.9,
#     ppc.scale = TRUE,
#     ppc.center = TRUE
#   )
# --------- 3.5: initialize inner resampling scheme ---------
# wrap the (imputing, preprocessing) learner with the tuner, so that
# missing-value imputation, feature preprocessing and hyperparameter tuning
# all happen inside every outer training fold (nested resampling)
lrn <- makeTuneWrapper(
lrn,
resampling = inner,
par.set = ps,
measures = list(bac, tpr, tnr),
# tuning optimizes BAC (the first measure); TPR/TNR are reported in addition
control = ctrl,
show.info = FALSE
)
# --------- 3.6: initialize outer resampling scheme ---------
# outer resampling scheme: 2x repeated 5-fold CV, stratified by class;
# this estimates the generalization performance of the full pipeline
# (imputation, preprocessing and tuning are re-run in each training fold)
outer <-
makeResampleDesc(
"RepCV",
folds = 5,
reps = 2,
predict = "both",
# save predictions for both the train and the test folds
stratify = TRUE
)
# fixed seed so the fold assignment is reproducible
set.seed(1)
resampling_cobre_neuropsych <- resample(
lrn,
task_cobre_neuropsych,
resampling = outer,
measures = list(bac, tpr, tnr),
# generalization performance is evaluated with BAC; TPR/TNR are returned in addition
extract = getTuneResult,
show.info = TRUE,
models = TRUE
)
# ---------------------------------- 4: plot results -----------------------------------
# bar chart comparing train vs. test BAC for each outer resampling iteration
resampling_cobre_neuropsych$measures.train %>%
  mutate(set = "train") %>%
  bind_rows(resampling_cobre_neuropsych$measures.test %>% mutate(set = "test")) %>%
  # fix factor order so "train" bars come before "test" bars
  mutate(set = factor(set, levels = c("train", "test"))) %>%
  ggplot(aes(x = iter, y = bac, group = set, fill = set)) +
  geom_bar(
    stat = "identity",
    width = 0.75,
    position = position_dodge(width = 0.6)
  ) +
  scale_fill_manual(values = c("lightgrey", "steelblue")) +
  coord_cartesian(ylim = c(0.5, 0.9)) +
  scale_x_continuous(breaks = seq(1, 10)) +
  labs(
    y = "BAC \n",
    x = "\n Resampling Iteration",
    fill = "Set"
  ) +
  theme_classic() +
  theme(
    axis.title = element_text(size = 14),
    legend.title = element_text(size = 14),
    axis.text = element_text(size = 12, color = "black"),
    legend.text = element_text(size = 12, color = "black")
  )
# ---------------------------------- 5: feature importance -----------------------------------
# --------- 5.1: retrain on whole dataset ---------
# tune the hyperparameters once on the full data set to obtain the model
# configuration used for the feature-importance analysis
# NOTE(review): at this point `lrn` is already a TuneWrapper (section 3.5),
# so tuneParams() here tunes a self-tuning learner — redundant and expensive;
# consider tuning the unwrapped learner instead. TODO confirm intent.
set.seed(1)
res <- tuneParams(
lrn,
task = task_cobre_neuropsych,
resampling = inner,
par.set = ps,
control = ctrl,
measures = list(bac, tpr, tnr)
)
# --------- 5.2: extract best hyperparameter value ---------
# fix the SVM cost at the best value found by the tuning run above
lrn <- setHyperPars(lrn, cost = res$x$cost)
# --------- 5.3: compute feature importance ---------
# permutation importance: each feature is permuted nmc times and the
# resulting change in BAC is recorded.
# this takes a while, ~ 20-40 min, depending on your computer
feat_imp <- generateFeatureImportanceData(
  task_cobre_neuropsych,
  method = "permutation.importance",
  lrn,
  nmc = 1000,
  features = getTaskFeatureNames(task_cobre_neuropsych),
  # use TRUE/FALSE, never T/F — T and F are ordinary variables and can be
  # reassigned, silently breaking the call
  interaction = FALSE, # per-feature importance only, no pairwise interactions
  measure = list(bac),
  local = FALSE # aggregate importance, not per-observation values
)