Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions Project.toml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
name = "LearnDataFrontEnds"
uuid = "5cca22a3-9356-470e-ba1b-8268d0135a4b"
authors = ["Anthony D. Blaom <anthony.blaom@gmail.com>"]
version = "0.1.2"
version = "0.2.0"

[deps]
CategoricalArrays = "324d7699-5711-5eae-9e2f-1d82baa6b597"
Expand All @@ -11,7 +11,7 @@ StatsModels = "3eaba693-59b7-5ba5-a881-562e759f1c8d"
Tables = "bd369af6-aec1-5ad0-b16a-f7cc5008161c"

[compat]
CategoricalArrays = "0.10"
CategoricalArrays = "1"
LearnAPI = "0.2, 1, 2"
MLCore = "1.0.0"
StatsModels = "0.7.4"
Expand Down
72 changes: 36 additions & 36 deletions docs/src/quick_start.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
- [Supervised classifiers](@ref)
- [Transformers](@ref)

Refer to the front end [docstrings](@ref front_ends) for options ignored below.
Refer to the front end [docstrings](@ref front_ends) for options ignored below.

## Supervised regressors

Expand All @@ -31,35 +31,35 @@ Your [`LearnAPI.fit`](@ref) implementation will then look like this:

```julia
function LearnAPI.fit(
learner::MyLearner,
observations::Obs;
verbosity=1,
)
X = observations.features # p x n matrix
y = observations.target # n-vector (use `Saffron(multitarget=true)` for matrix)
feature_names = observations.names
learner::MyLearner,
observations::Obs;
verbosity=1,
)
X = observations.features # p x n matrix
y = observations.target # n-vector (use `Saffron(multitarget=true)` for matrix)
feature_names = observations.names

# do stuff with `X`, `y` and `feature_names`:
...
# do stuff with `X`, `y` and `feature_names`:
...

end
LearnAPI.fit(learner::MyLearner, data; kwargs...) =
LearnAPI.fit(learner, LearnAPI.obs(learner, data); kwargs...)
LearnAPI.fit(learner, LearnAPI.obs(learner, data); kwargs...)
```

For each [`KindOfProxy`](@ref) subtype `K` to be supported (e.g., `Point`), your
[`LearnAPI.predict`](@ref) implementation(s) will look like this:

```julia
function LearnAPI.predict(model::MyModel, ::K, observations::Obs)
X = observations.features # p x n matrix
names = observations.names # if really needed
X = observations.features # p x n matrix
names = observations.names # if really needed

# do stuff with `X`:
...
# do stuff with `X`:
...
end
LearnAPI.predict(model::MyModel, kind_of_proxy, X) =
LearnAPI.predict(model, kind_of_proxy, obs(model, X))
LearnAPI.predict(model, kind_of_proxy, obs(model, X))
```

## Supervised classifiers
Expand Down Expand Up @@ -94,13 +94,13 @@ function LearnAPI.fit(
X = observations.features # p x n matrix
y = observations.target # n-vector
decoder = observations.decoder
classes_seen = observatioins.classes_seen
levels_seen = observations.levels_seen
feature_names = observations.names

# do stuff with `X`, `y` and `feature_names`:
# return a `model` object which also stores the `decoder` and/or `classes_seen`
# to make them available to `predict`.
...
# return a `model` object which also stores the `decoder` and/or `levels_seen`
# to make them available to `predict`.
...
end
LearnAPI.fit(learner::MyLearner, data; kwargs...) =
LearnAPI.fit(learner, LearnAPI.obs(learner, data); kwargs...)
Expand All @@ -116,10 +116,10 @@ function LearnAPI.predict(model::MyModel, :K, observations::Obs)

# Do stuff with `X` and `model` to obtain raw `predictions` (a vector of integer
# codes for `K = Point`, or an `n x c` matrix of probabilities for `K = Distribution`).
# Extract `decoder` or `classes_seen` from `model`.
# Extract `decoder` or `levels_seen` from `model`.
# For `K = Point`, return `decoder.(predictions)`.
# For `K = Distribution`, return, say,
# `CategoricalDistributions.Univariate(classes_seen, predictions)`.
# `CategoricalDistributions.UnivariateFinite(levels_seen, predictions)`.
...
end
LearnAPI.predict(model::MyModel, kind_of_proxy, X) = LearnAPI.predict(model,
Expand Down Expand Up @@ -152,29 +152,29 @@ Your [`LearnAPI.fit`](@ref) implementation will then look like this:

```julia
function LearnAPI.fit(
learner::MyLearner,
observations::Obs;
verbosity=1,
)
x = observations.features # p x n matrix
feature_names = observations.names

# do stuff with `x` and `feature_names`:
...
learner::MyLearner,
observations::Obs;
verbosity=1,
)
x = observations.features # p x n matrix
feature_names = observations.names

# do stuff with `x` and `feature_names`:
...
end
LearnAPI.fit(learner::MyLearner, data; kwargs...) =
LearnAPI.fit(learner, LearnAPI.obs(learner, data); kwargs...)
LearnAPI.fit(learner, LearnAPI.obs(learner, data); kwargs...)
```

Your [`LearnAPI.transform`](@ref) implementation will look like this:

```julia
function LearnAPI.transform(model::MyModel, observations::Obs)
x = observations.features # p x n matrix
feature_names = observations.names # if really needed
x = observations.features # p x n matrix
feature_names = observations.names # if really needed

# do stuff with `x`:
...
# do stuff with `x`:
...
end
LearnAPI.transform(model::MyModel, X) = LearnAPI.transform(model, obs(model, X))
```
Expand Down
1 change: 0 additions & 1 deletion docs/src/reference.md
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,5 @@ LearnDataFrontEnds.feature_names
LearnDataFrontEnds.swapdims
LearnDataFrontEnds.decoder
LearnDataFrontEnds.decompose
LearnDataFrontEnds.classes
LearnDataFrontEnds.canonify
```
85 changes: 77 additions & 8 deletions src/backends.jl
Original file line number Diff line number Diff line change
Expand Up @@ -31,17 +31,86 @@ If [`Sage`](@ref)`(multitarget=..., code_type=...)` has been implemented, then
`observations.target` has an integer element type controlled by `code_type`, and we
additionally have:

- `observations.classes`: A categorical vector of the ordered target classes, as actually
seen in the user-supplied target, with the full pool of classes available by applying
`Categorical.levels` to the result. The corresponding integer codes will be
`sort(unique(observations.target))`.
- `observations.levels_seen`: A categorical vector of the ordered target levels, as actually
seen in the user-supplied target. The corresponding integer codes will be
`sort(unique(observations.target))`. To get the full pool of levels, apply
`CategoricalArrays.levels` to `observations.levels_seen`; see the example below.

- `observations.decoder`: A callable function that converts an integer code back to the
original `CategoricalValue` it represents.

Pass `observations.levels_seen` on to `predict` for making probabilistic predictions, and
`observations.decoder` for point predictions; see [`Sage`](@ref) for details.

# Extended help

In the example below, `observations` implements the full `Obs` interface described above,
for a learner implementing the `Sage` front end:

```julia-repl
using LearnAPI, LearnDataFrontEnds, LearnTestAPI
using CategoricalDistributions, CategoricalArrays, DataFrames
X = DataFrame(rand(10, 3), :auto)
y = categorical(collect("ababababac"))
learner = LearnTestAPI.ConstantClassifier()
observations = obs(learner, (X[1:9,:], y[1:9]))

julia> observations.features
3×9 Matrix{Float64}:
0.234043 0.526468 0.227417 0.956471 … 0.00587146 0.169291 0.353518 0.402631
0.631083 0.151317 0.781049 0.00320728 0.756519 0.15317 0.452169 0.127005
0.285315 0.347433 0.69174 0.516915 0.900343 0.404006 0.448986 0.962649

julia> yint = observations.target
9-element Vector{UInt32}:
0x00000001
0x00000002
0x00000001
0x00000002
0x00000001
0x00000002
0x00000001
0x00000002
0x00000001

julia> observations.levels_seen
2-element CategoricalArray{Char,1,UInt32}:
'a'
'b'

julia> sort(unique(observations.target))
2-element Vector{UInt32}:
0x00000001
0x00000002

julia> observations.levels_seen |> levels
3-element CategoricalArray{Char,1,UInt32}:
'a'
'b'
'c'

julia> observations.decoder.(yint)
9-element CategoricalArray{Char,1,UInt32}:
'a'
'b'
'a'
'b'
'a'
'b'
'a'
'b'
'a'

julia> d = UnivariateFinite(observations.levels_seen, [0.4, 0.6])
UnivariateFinite{Multiclass{3}}(a=>0.4, b=>0.6)

julia> levels(d)
3-element CategoricalArray{Char,1,UInt32}:
'a'
'b'
'c'
```

"""
abstract type Obs end

Expand Down Expand Up @@ -111,7 +180,7 @@ struct SageObs{F,T,E,D} <: Obs
features::F # p x n
names::Vector{Symbol}
target::T
classes_seen::CategoricalArrays.CategoricalVector{E}
levels_seen::CategoricalArrays.CategoricalVector{E}
decoder::D
end

Expand All @@ -122,8 +191,8 @@ function Base.show(io::IO, ::MIME"text/plain", observations::SageObs)
println(io, " features :: $(typeof(A))($(size(A)))")
println(io, " names: $(observations.names)")
println(io, " target :: $(typeof(y))($(size(y)))")
println(io, " classes_seen: "*
"$(CategoricalArrays.unwrap.(observations.classes_seen)) "*
println(io, " levels_seen: "*
"$(CategoricalArrays.unwrap.(observations.levels_seen)) "*
"(categorical vector with complete pool)")
print(io, " decoder: <callable>")
end
Expand All @@ -133,7 +202,7 @@ Base.getindex(observations::SageObs, idx) =
MLCore.getobs(observations.features, idx),
observations.names,
MLCore.getobs(observations.target, idx),
observations.classes_seen,
observations.levels_seen,
observations.decoder,
)

Expand Down
6 changes: 3 additions & 3 deletions src/saffron.jl
Original file line number Diff line number Diff line change
Expand Up @@ -150,13 +150,13 @@ function finalize(x, names, y, int) # here `int` is `levelcode` or `refcode` fu
CategoricalArrays.CategoricalArray,
SubArray{<:Any, <:Any, <:CategoricalArrays.CategoricalArray},
} || throw(ERR_EXPECTED_CATEGORICAL)
l = LearnDataFrontEnds.classes(y)
l = CategoricalArrays.levels(y)
u = unique(y)
mask = map(in(u), l)
_classes_seen = l[mask]
_levels_seen = l[mask]
_decoder = LearnDataFrontEnds.decoder(l)

return SageObs(x, names, int.(y), _classes_seen, _decoder)
return SageObs(x, names, int.(y), _levels_seen, _decoder)
end

# for input `(x::AbstractMatrix, y::MatrixOrVector)`:
Expand Down
8 changes: 4 additions & 4 deletions src/sage.jl
Original file line number Diff line number Diff line change
Expand Up @@ -104,12 +104,12 @@ function LearnAPI.fit(
X = observations.features # p x n matrix
y = observations.target # n-vector or q x n matrix
decoder = observations.decoder
classes_seen = observations.classes_seen
levels_seen = observations.levels_seen
feature_names = observations.names
# do stuff with `X`, `y` and `feature_names`:
# return a `model` object which also stores the `decoder` and/or
# `classes_seen` to make them available to `predict`.
# `levels_seen` to make them available to `predict`.
...
end
Expand All @@ -127,10 +127,10 @@ function LearnAPI.predict(model::MyModel, :K, observations::Obs)
# Do stuff with `X` and `model` to obtain raw `predictions` (a vector of integer
# codes for `K = Point`, or an `n x c` matrix of probabilities for `K = Distribution`).
# Extract `decoder` or `classes_seen` from `model`.
# Extract `decoder` or `levels_seen` from `model`.
# For `K = Point`, return `decoder.(predictions)`.
# For `K = Distribution`, return, say,
# `CategoricalDistributions.Univariate(classes_seen, predictions)`.
# `CategoricalDistributions.UnivariateFinite(levels_seen, predictions)`.
...
end
LearnAPI.predict(model::MyModel, kind_of_proxy, X) = LearnAPI.predict(model,
Expand Down
56 changes: 3 additions & 53 deletions src/tools.jl
Original file line number Diff line number Diff line change
Expand Up @@ -112,58 +112,8 @@ function decompose(X, v, _targets::NTuple)
return swapdims(A, v), collect(names), swapdims(B, v)
end

"""
classes(x)

*Private method.*

Return, as a `CategoricalVector`, all the categorical elements with
the same pool as `CategoricalValue` `x` (including `x`), with an
ordering consistent with the pool. Note that `x in classes(x)` is
always true.

Not to be confused with `levels(x.pool)`. See the example below.

Also, overloaded for `x` a `CategoricalArray`, `CategoricalPool`, and for views of
`CategoricalArray`.

julia> v = categorical(['c', 'b', 'c', 'a'])
4-element CategoricalArrays.CategoricalArray{Char,1,UInt32}:
'c'
'b'
'c'
'a'

julia> levels(v)
3-element Array{Char,1}:
'a'
'b'
'c'

julia> x = v[4]
CategoricalArrays.CategoricalValue{Char,UInt32} 'a'

julia> classes(x)
3-element CategoricalArrays.CategoricalArray{Char,1,UInt32}:
'a'
'b'
'c'

julia> levels(x.pool)
3-element Array{Char,1}:
'a'
'b'
'c'

"""
classes(p::CategoricalArrays.CategoricalPool) = [p[i] for i in 1:length(p)]
classes(x::CategoricalArrays.CategoricalValue) = classes(CategoricalArrays.pool(x))
classes(v::CategoricalArrays.CategoricalArray) = classes(CategoricalArrays.pool(v))
classes(v::SubArray{<:Any, <:Any, <:CategoricalArrays.CategoricalArray}) = classes(parent(v))


struct CategoricalDecoder{V,R}
classes::CategoricalArrays.CategoricalVector{
levels::CategoricalArrays.CategoricalVector{
V,
R,
V,
Expand Down Expand Up @@ -193,7 +143,7 @@ pool as `x`.
*Warning:* There is no guarantee that `levelcode.(d.(u)) == u` will always hold.

"""
decoder(x) = CategoricalDecoder(classes(x))
decoder(x) = CategoricalDecoder(CategoricalArrays.levels(x))

(d::CategoricalDecoder{V,R})(i::Integer) where {V,R} =
CategoricalArrays.CategoricalValue{V,R}(d.classes[i])
CategoricalArrays.CategoricalValue{V,R}(d.levels[i])
Loading
Loading