/
molteno.jl
128 lines (114 loc) · 4.98 KB
/
molteno.jl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
export molteno_boxing, molteno_dim
"""
molteno_dim(X::AbstractStateSpaceSet; k0::Int = 10, q = 1.0, base = 2)
Return an estimate of the [`generalized_dim`](@ref) of `X` using the
algorithm by [Molteno1993](@cite). This function is a simple utilization of the
probabilities estimated by [`molteno_boxing`](@ref) so see that function for more details.
Here the entropy of the probabilities is computed at each size, and a line is fitted
in the entropy vs log(size) graph, just like in [`generalized_dim`](@ref).
"""
function molteno_dim(X, k0::Int = 10; q = 1.0, base = 2)
probs, εs = molteno_boxing(X; k0)
dd = entropy.(Ref(Renyi(;q, base)), probs)
return slopefit(-log.(base, εs), dd)[1]
end
"""
molteno_boxing(X::AbstractStateSpaceSet; k0::Int = 10) → (probs, εs)
Distribute `X` into boxes whose size is halved in each step, according to the
algorithm by [Molteno1993](@cite). Division stops if the
average number of points per filled box falls below the threshold `k0`.
Return `probs`, a vector of [`Probabilities`](@ref) of finding points in boxes for
different box sizes, and the corresponding box sizes `εs`.
These outputs are used in [`molteno_dim`](@ref).
## Description
Project the `data` onto the whole interval of numbers that is covered by
`UInt64`. The projected data is distributed into boxes whose size
decreases by factor 2 in each step. For each box that contains more than one
point `2^D` new boxes are created where `D` is the dimension of the data.
The process of dividing the data into new boxes stops when the number of points
over the number of filled boxes falls below `k0`. The box sizes `εs` are
calculated and returned together with the `probs`.
This algorithm is faster than the traditional approach of using `ValueHistogram(ε::Real)`,
but it is only suited for low dimensional data since it divides each
box into `2^D` new boxes if `D` is the dimension. For large `D` this leads to low numbers of
box divisions before the threshold is passed and the divison stops. This results
to a low number of data points to fit the dimension to and thereby a poor estimate.
"""
function molteno_boxing(X::AbstractStateSpaceSet; k0 = 10)
integers, ε0 = real_to_uint64(X)
boxes = _molteno_boxing(integers; k0)
εs = ε0 ./ (2 .^ (1:length(boxes)))
return boxes, εs
end
"""
real_to_uint64(data::StateSpaceSet{D,T}) where {D, T}
Calculate maximum and minimum value of `data` to then project the values onto
``[0, M - ε(M)]`` where ``\\epsilon`` is the
precision of the used Type and ``M`` is the maximum value of the UInt64 type.
"""
function real_to_uint64(data::AbstractStateSpaceSet{D,T}) where {D,T<:Real}
N = length(data)
# The maximum value needs to be smaller than the absolute typemax due to
# the 12 bits used by the float type for sign and exponent.
max_val = typemax(UInt64) - 2(UInt64 |> typemax |> T |> eps)
mins, maxs = minmaxima(data)
spans = maxs .- mins
ε0 = maximum(spans)
# Let f:[min,max] -> [0,typemax(UInt64)] linearly, then f(x) = m*x + b.
m = max_val ./ ε0
b = - mins .* m
res = Vector{SVector{D,UInt64}}()
sizehint!(res, N)
for x in data
int_val = floor.(UInt64, m .* x .+ b)
push!(res, int_val)
end
StateSpaceSet(res), ε0
end
function _molteno_boxing(data::AbstractStateSpaceSet{D,T}; k0 = 10) where {D,T<:UInt}
N = length(data)
box_probs = Vector{Float64}[]
iteration = 1
boxes = [[1:N;]]
while N / length(boxes) > k0
l = length(boxes)
for t in 1:l
# Take the first box.
box = popfirst!(boxes)
# Check if only one element is contained.
if length(box) == 1
push!(boxes, box)
continue
end
# Append a new partitioned box.
append!(boxes, molteno_subboxes(box, data, iteration))
end
# Calculate probabilities by dividing the number of elements in a box by N.
push!(box_probs, length.(boxes) ./ N)
iteration += 1
end
Probabilities.(box_probs)
end
"""
molteno_subboxes(box, data, iteration)
Divide a `box` containing indices into `data` to `2^D` smaller boxes and sort
the points contained in the former box into the new boxes. Implemented according to
[Molteno1993](@cite). Sorts the elements of the former box into the smaller boxes
using cheap bit shifting and `&` operations on the value of `data` at each box
element. `iteration` determines which bit of the array should be shifted to the
last position.
"""
function molteno_subboxes(box, data::AbstractStateSpaceSet{D,UInt64}, iteration) where {D}
new_boxes = [UInt64[] for i in 1:2^D]
index_multipliers = [2^i for i in 0:D-1]
sorting_number = 64-iteration
for elem in box
index = one(UInt64)
for (i, multi) in enumerate(index_multipliers)
# index shifting magic
index += ((data[elem][i] >> sorting_number) & 1) * multi
end
push!(new_boxes[index], elem)
end
filter!(!isempty, new_boxes)
end