/
Kmeans.jl
117 lines (104 loc) · 3.06 KB
/
Kmeans.jl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
"""
Kmeans(K; max=300, th=1e-4)
Kmeans method.
# Parameters:
- `K`: number of class
- `max`: maximum number of repitition
- `th`: converge threshold
# Example
```jldoctest
julia> model = Kmenas(3)
Kmeans{3}(Matrix{Float64}(undef, 0, 0), 100000000, 0.0001)
julia> using HorseML.Clustering: fit!
julia> fit!(model, x)
julia> model.labels |> size
(100, 3)
```
"""
mutable struct Kmeans{K}
μ::Matrix{Float64}
max::Int64
th::Float64
labels::Vector{Int}
Kmeans(K; max = 300, th = 1e-4) = new{K}(Array{Float64}(undef, 0, 0), max, th, [])
end
n_clusters(model::Kmeans{K}) where K = K
function clustering(μ, x, K)
r = zeros(size(x, 1), K)
for n in 1 : size(x, 1)
diff = Matrix(undef, size(μ)...)
for k in 1 : K
diff[k, :] = x[n, :] - μ[k, :]
end
distance = @. abs(diff)
r[n, argmin(vec(sum(distance, dims=2)))] = 1
end
return r
end
function update(μ, x, K)
r = clustering(μ, x, K)
μ = similar(μ)
for k in 1 : K
μ[k, :] = sum(x .* r[:, k], dims=1) / sum(r[:, k])
end
return μ
end
"""
fit!(model, x)
fit Kmeans or GMM model with data.
size of `x` is (numer of data, number of features).
"""
function fit!(model::Kmeans{K}, x) where {K}
coverge(x, th) = @. abs(x) < th
μ = kmeanspp(x, K)
for _ in 1 : model.max
μ_new = update(μ, x, K)
if coverge.(μ_new - μ, model.th) == trues(size(μ)...)
model.μ = μ_new
labels = Array{Int}(undef, size(x, 1))
r = clustering(model.μ, x, K)
for i in 1 : size(r, 2)
labels[findall(r[:, i].==1)] .= i
end
model.labels = labels
return
end
μ = μ_new
end
@warn "Not Converged!"
model.μ = μ
labels = Array{Int}(undef, size(x, 1))
r = clustering(model.μ, x, K)
for i in 1 : size(r, 2)
labels[findall(r[:, i].==1)] .= i
end
model.labels = labels
end
#initialize μ(centeroid vectors) with keams++
function kmeanspp(x, K)
μ = Array{Float64, 2}(undef, K, size(x, 2))
#pick first centeroid randomly
μ[1, :] = x[rand(1:size(x, 1)), :]
for i in 2 : K
#compute the distance between each point and each centeroid
distances = compute_distance(x, μ[1:i-1, :], size(x, 1), i-1)
#compute the distance to the nearest centeroid
closest_dist_sq = minimum(distances.^2, dims=2)
#add the squares of each distances
weights = sum(closest_dist_sq)
rand_vals = rand() * weights
#compute the culative sum of the squares of distances
#Get the index of the data point where the cumulative sum and the value of rand_val are closest
candidate_ids = searchsortedfirst(vec(cumsum(closest_dist_sq, dims=1)), rand_vals)
#add centeroidx
μ[i, :] = x[candidate_ids, :]
end
return μ
end
function compute_distance(x, μ, n, nc)
dists = Array{Float64}(undef, n, nc)
for i in 1 : nc
dists[:, i] = sum((x.-μ[i, :]').^2, dims=2)
end
return dists
end