-
Notifications
You must be signed in to change notification settings - Fork 9
/
core.jl
317 lines (260 loc) · 9.53 KB
/
core.jl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
"""
    initGigaSOM(data, args...)

Initializes a SOM by random selection from the training data. A generic
overload that works for matrices and DataFrames that can be coerced to
`Matrix{Float64}`. Other arguments are passed to the data-independent
`initGigaSOM`.

Arguments:
- `data`: matrix of data for running the initialization
"""
function initGigaSOM(data::Union{Matrix,DataFrame}, args...; kwargs...)
    mtx = Matrix{Float64}(data)
    nRows = size(mtx, 1)
    nCols = size(mtx, 2)
    # per-column means and population standard deviations (divided by n, not
    # n-1) that describe the distribution of the training data
    colMeans = [sum(mtx[:, c]) / nRows for c = 1:nCols]
    colSdevs = [sqrt(sum((mtx[:, c] .- colMeans[c]) .^ 2.0) / nRows) for c = 1:nCols]
    return initGigaSOM(nCols, colMeans, colSdevs, args...; kwargs...)
end
"""
    initGigaSOM(data::LoadedDataInfo,
                xdim::Int64, ydim::Int64 = xdim;
                seed=rand(Int), rng=StableRNG(seed))

`initGigaSOM` overload for working with distributed-style `LoadedDataInfo`
data. The rest of the arguments is passed to the data-independent
`initGigaSOM`.

Arguments:
- `data`: a `LoadedDataInfo` object with the distributed dataset matrix
"""
function initGigaSOM(data::LoadedDataInfo, args...; kwargs...)
    # ask the first worker for the column count of the distributed matrix
    nColumns = get_val_from(data.workers[1], :(size($(data.val))[2]))
    # distributed computation of the per-column statistics
    (colMeans, colSdevs) = dstat(data, Vector(1:nColumns))
    return initGigaSOM(nColumns, colMeans, colSdevs, args...; kwargs...)
end
"""
    initGigaSOM(ncol::Int64,
                means::Vector{Float64}, sdevs::Vector{Float64},
                xdim::Int64, ydim::Int64 = xdim;
                seed = rand(UInt), rng = StableRNG(seed))

Generate a stable random initial SOM with the random distribution that matches the parameters.

Arguments:
- `ncol`: number of desired data columns
- `means`, `sdevs`: vectors that describe the data distribution, both of size `ncol`
- `xdim`, `ydim`: Size of the SOM
- `seed`: a seed (defaults to a random `UInt` from the current default random generator)
- `rng`: a random number generator to be used (defaults to a `StableRNG` initialized with the `seed`)

Returns: a new `Som` structure
"""
function initGigaSOM(
    ncol::Int64,
    means::Vector{Float64},
    sdevs::Vector{Float64},
    xdim::Int64,
    ydim::Int64 = xdim;
    seed = rand(UInt),
    rng = StableRNG(seed),
)
    numCodes = xdim * ydim
    grid = gridRectangular(xdim, ydim)

    # Initialize with an unbiased random gaussian with same mean/sd as the data
    # in each dimension
    codes = randn(rng, (numCodes, ncol))
    for col = 1:ncol
        codes[:, col] .*= sdevs[col]
        codes[:, col] .+= means[col]
    end

    return Som(codes = codes, xdim = xdim, ydim = ydim, grid = grid)
end
"""
    trainGigaSOM(som::Som, dInfo::LoadedDataInfo;
                 kernelFun::Function = gaussianKernel,
                 metric = Euclidean(),
                 somDistFun = distMatrix(Chebyshev()),
                 knnTreeFun = BruteTree,
                 rStart = 0.0, rFinal=0.1, radiusFun=expRadius(-5.0),
                 epochs = 20)

Train the SOM on distributed data, one epoch at a time, shrinking the training
radius after each epoch.

# Arguments:
- `som`: object of type Som with an initialised som
- `dInfo`: `LoadedDataInfo` object that describes a loaded dataset
- `kernelFun::function`: optional distance kernel; one of (`bubbleKernel, gaussianKernel`)
        default is `gaussianKernel`
- `metric`: Passed as metric argument to the KNN-tree constructor
- `somDistFun`: Function for computing the distances in the SOM map
- `knnTreeFun`: Constructor of the KNN-tree (e.g. from NearestNeighbors package)
- `rStart`: optional training radius. If zero (default), it is computed from the SOM grid size.
- `rFinal`: target radius at the last epoch, defaults to 0.1
- `radiusFun`: Function that generates radius decay, e.g. `linearRadius` or `expRadius(10.0)`
- `epochs`: number of SOM training iterations (default 20)

Returns: the `som` with its `codes` updated to the trained codebook.
"""
function trainGigaSOM(
    som::Som,
    dInfo::LoadedDataInfo;
    kernelFun::Function = gaussianKernel,
    metric = Euclidean(),
    somDistFun = distMatrix(Chebyshev()),
    knnTreeFun = BruteTree,
    rStart = 0.0,
    rFinal = 0.1,
    radiusFun = expRadius(-5.0),
    epochs = 20,
)
    # set the default radius from the grid size when the caller did not choose one
    if rStart == 0.0
        rStart = (som.xdim + som.ydim) / 2
        @debug "The radius has been determined automatically." rStart rFinal
    end

    # get the SOM neighborhood distances
    dm = somDistFun(som.grid)

    codes = som.codes

    for j = 1:epochs
        @debug "Epoch $j..."

        # accumulate, on the workers, per-codebook-vector sums (numerator) and
        # counts (denominator) of the data rows assigned to each BMU; the KNN
        # tree is rebuilt each epoch because `codes` changes
        numerator, denominator = distributedEpoch(
            dInfo,
            codes,
            knnTreeFun(Array{Float64,2}(transpose(codes)), metric),
        )

        r = radiusFun(rStart, rFinal, j, epochs)
        @debug "radius: $r"
        if r <= 0
            @error "Sanity check failed: radius must be positive"
            error("Radius check")
        end

        # spread the accumulated statistics across the SOM neighborhood and
        # take the weighted mean as the new codebook
        wEpoch = kernelFun(dm, r)
        codes = (wEpoch * numerator) ./ (wEpoch * denominator)
    end

    som.codes = copy(codes)

    return som
end
"""
    trainGigaSOM(som::Som, train;
                 kernelFun::Function = gaussianKernel,
                 metric = Euclidean(),
                 somDistFun = distMatrix(Chebyshev()),
                 knnTreeFun = BruteTree,
                 rStart = 0.0, rFinal=0.1, radiusFun=expRadius(-5.0),
                 epochs = 20)

Overload of `trainGigaSOM` for simple DataFrames and matrices. This slices the
data using `DistributedArrays`, sends them the workers, and runs normal
`trainGigaSOM`. Data is `undistribute`d after the computation.
"""
function trainGigaSOM(
    som::Som,
    train;
    kernelFun::Function = gaussianKernel,
    metric = Euclidean(),
    somDistFun = distMatrix(Chebyshev()),
    knnTreeFun = BruteTree,
    rStart = 0.0,
    rFinal = 0.1,
    radiusFun = expRadius(-5.0),
    epochs = 20,
)
    trainData = Matrix{Float64}(train)

    # slice the data into parts and scatter them to the available workers
    dInfo = distribute_array(:GigaSOMtrainDataVar, trainData, workers())

    # delegate the actual work to the distributed implementation
    trainedSom = trainGigaSOM(
        som,
        dInfo,
        kernelFun = kernelFun,
        metric = metric,
        somDistFun = somDistFun,
        knnTreeFun = knnTreeFun,
        rStart = rStart,
        rFinal = rFinal,
        radiusFun = radiusFun,
        epochs = epochs,
    )

    # free the scattered data before handing the result back
    undistribute(dInfo)
    return trainedSom
end
"""
    doEpoch(x::Array{Float64, 2}, codes::Array{Float64, 2}, tree)

Compute one SOM training epoch on the data `x`: for each data row, find its
best matching unit (BMU) in `codes` using `tree`, and accumulate the row into
that BMU's sum together with an assignment count.

# Arguments:
- `x`: training Data
- `codes`: Codebook
- `tree`: knn-compatible tree built upon the codes

Returns the tuple `(sumNumerator, sumDenominator)`; dividing each numerator
row by the matching denominator entry yields the mean of the assigned rows.
"""
function doEpoch(x::Array{Float64,2}, codes::Array{Float64,2}, tree)
    # initialise numerator and denominator with 0's
    sumNumerator = zeros(Float64, size(codes))
    sumDenominator = zeros(Float64, size(codes, 1))

    # for each sample in dataset / trainingsset
    for s = 1:size(x, 1)
        # views avoid allocating a fresh copy of the row for each lookup
        (bmuIdx, _) = knn(tree, view(x, s, :), 1)
        target = bmuIdx[1]
        sumNumerator[target, :] .+= view(x, s, :)
        sumDenominator[target] += 1
    end

    return sumNumerator, sumDenominator
end
"""
    distributedEpoch(dInfo::LoadedDataInfo, codes::Matrix{Float64}, tree)

Execute the `doEpoch` in parallel on workers described by `dInfo` and collect
the results. Returns pair of numerator and denominator matrices.
"""
function distributedEpoch(dInfo::LoadedDataInfo, codes::Matrix{Float64}, tree)
    # element-wise sum of the (numerator, denominator) pairs from the workers
    combine = ((numA, denA), (numB, denB)) -> (numA + numB, denA + denB)
    return distributed_mapreduce(dInfo, d -> doEpoch(d, codes, tree), combine)
end
"""
    mapToGigaSOM(som::Som, dInfo::LoadedDataInfo;
                 knnTreeFun = BruteTree, metric = Euclidean(),
                 output::Symbol = tmpSym(dInfo))::LoadedDataInfo

Compute the index of the BMU for each row of the input data.

# Arguments
- `som`: a trained SOM
- `dInfo`: `LoadedDataInfo` that describes the loaded and distributed data
- `knnTreeFun`: Constructor of the KNN-tree (e.g. from NearestNeighbors package)
- `metric`: Passed as metric argument to the KNN-tree constructor
- `output`: Symbol to save the result, defaults to `tmpSym(dInfo)`

Data must have the same number of dimensions as the training dataset
and will be normalised with the same parameters.
"""
function mapToGigaSOM(
    som::Som,
    dInfo::LoadedDataInfo;
    knnTreeFun = BruteTree,
    metric = Euclidean(),
    output::Symbol = tmpSym(dInfo),
)::LoadedDataInfo
    # the tree is built over the transposed codebook (one point per column)
    tree = knnTreeFun(Array{Float64,2}(transpose(som.codes)), metric)

    return distributed_transform(
        dInfo,
        # knn returns a vector of 1-element index vectors for the queried
        # points; the vcat splat flattens that into a plain vector of BMU
        # indices, one per data row
        (d) -> (vcat(knn(tree, transpose(d), 1)[1]...)),
        output,
    )
end
"""
    mapToGigaSOM(som::Som, data;
                 knnTreeFun = BruteTree,
                 metric = Euclidean())

Overload of `mapToGigaSOM` for simple DataFrames and matrices. This slices the
data using `DistributedArrays`, sends them the workers, and runs normal
`mapToGigaSOM`. Data is `undistribute`d after the computation.
"""
function mapToGigaSOM(som::Som, data; knnTreeFun = BruteTree, metric = Euclidean())
    data = Matrix{Float64}(data)

    # the data must live in the same dimensionality as the codebook
    if size(data, 2) != size(som.codes, 2)
        @error "Data dimension ($(size(data,2))) does not match codebook dimension ($(size(som.codes,2)))."
        error("Data dimensions do not match")
    end

    # scatter the data, run the distributed mapping, and gather the result
    dInfo = distribute_array(:GigaSOMmappingDataVar, data, workers())
    rInfo = mapToGigaSOM(som, dInfo, knnTreeFun = knnTreeFun, metric = metric)
    mapped = distributed_collect(rInfo)

    # release both the input slices and the distributed result
    undistribute(dInfo)
    undistribute(rInfo)

    return DataFrame(index = mapped)
end
"""
    scaleEpochTime(iteration::Int64, epochs::Int64)

Convert iteration ID and epoch number to relative time in training.
"""
function scaleEpochTime(iteration::Int64, epochs::Int64)
    # with a single epoch the denominator stays 1, preventing division by zero
    denom = epochs > 1 ? epochs - 1 : epochs
    return Float64(iteration - 1) / Float64(denom)
end