Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add CategoricalVector and collapse #88

Merged
merged 14 commits into from
Aug 3, 2017
1 change: 1 addition & 0 deletions REQUIRE
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
julia 0.6
IntervalSets 0.1
IterTools
RangeArrays
5 changes: 4 additions & 1 deletion src/AxisArrays.jl
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,11 @@ __precompile__()
module AxisArrays

using Base: tail
import Base.Iterators: repeated
using RangeArrays, IntervalSets
using IterTools

export AxisArray, Axis, axisnames, axisvalues, axisdim, axes, atindex, atvalue
export AxisArray, Axis, axisnames, axisvalues, axisdim, axes, atindex, atvalue, collapse

# From IntervalSets:
export ClosedInterval, ..
Expand All @@ -15,6 +17,7 @@ include("intervals.jl")
include("search.jl")
include("indexing.jl")
include("sortedvector.jl")
include("categoricalvector.jl")
include("combine.jl")

end
82 changes: 82 additions & 0 deletions src/categoricalvector.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,82 @@
"""
A CategoricalVector is an AbstractVector which is treated as a categorical axis regardless
of the element type. Duplicate values are not allowed, but they are not filtered out.

A CategoricalVector axis can be indexed with a ClosedInterval, with a value, or with a
vector of values. Use of a CategoricalVector{Tuple} axis allows indexing similar to the
hierarchical index of the Python Pandas package or the R data.table package.

In general, indexing into a CategoricalVector will be much slower than the corresponding
SortedVector or another sorted axis type, as linear search is required.

### Constructors

```julia
CategoricalVector(x::AbstractVector)
```

### Arguments

* `x::AbstractVector` : the wrapped vector

### Examples

```julia
v = CategoricalVector(collect([1; 8; 10:15]))
A = AxisArray(reshape(1:16, 8, 2), v, [:a, :b])
A[Axis{:row}(1), :]
A[Axis{:row}(10), :]
A[Axis{:row}([1, 10]), :]

## Hierarchical index example with three key levels

data = reshape(1.:40., 20, 2)
v = collect(zip([:a, :b, :c][rand(1:3,20)], [:x,:y][rand(1:2,20)], [:x,:y][rand(1:2,20)]))
A = AxisArray(data, CategoricalVector(v), [:a, :b])
A[:b, :]
A[[:a,:c], :]
A[(:a,:x), :]
A[(:a,:x,:x), :]
```
"""
immutable CategoricalVector{T, A<:AbstractVector{T}} <: AbstractVector{T}
# The wrapped vector; element access and shape queries are forwarded to it.
data::A
end

# Outer constructor: derive both type parameters (element type and container
# type) from the vector being wrapped.
CategoricalVector(data::AbstractVector{T}) where {T} =
    CategoricalVector{T, typeof(data)}(data)

# Scalar indexing returns the element stored in the wrapped vector.
function Base.getindex(v::CategoricalVector, idx::Int)
    return v.data[idx]
end

# Vector indexing re-wraps the selection, so the result is again a
# CategoricalVector.
function Base.getindex(v::CategoricalVector, idx::AbstractVector)
    return CategoricalVector(v.data[idx])
end

# Shape queries are forwarded unchanged to the wrapped vector.
function Base.length(v::CategoricalVector)
    return length(v.data)
end

function Base.size(v::CategoricalVector)
    return size(v.data)
end

function Base.size(v::CategoricalVector, i)
    return size(v.data, i)
end

function Base.indices(v::CategoricalVector)
    return indices(v.data)
end

# An axis backed by a CategoricalVector always has the `Categorical` trait,
# whatever its element type.
function axistrait(::Type{CategoricalVector{T,A}}) where {T,A}
    return Categorical
end

# No validation is performed on a CategoricalVector axis.
function checkaxis(::CategoricalVector)
    return nothing
end


## Add some special indexing for CategoricalVector{Tuple}'s to achieve something like
## pandas' hierarchical indexing

# Fallback for an index that has no more specific method: wrap it in a 1-tuple
# and delegate to the tuple-index method.
function axisindexes(ax::Axis{S,CategoricalVector{T,A}}, idx) where {T<:Tuple,S,A}
    return axisindexes(ax, (idx,))
end

# Tuple index: return the positions along the axis whose tuple value matches
# `idx` according to `_tuple_matches`.
function axisindexes(ax::Axis{S,CategoricalVector{T,A}}, idx::Tuple) where {T<:Tuple,S,A}
    candidates = indices(ax.val)
    matching = filter(candidates...) do position
        _tuple_matches(ax.val[position], idx)
    end
    return collect(matching)
end

# Return `true` when `idx` is a leading prefix of `element`: `idx` must be no
# longer than `element`, and each of its entries must equal (`==`) the entry of
# `element` at the same position.
function _tuple_matches(element::Tuple, idx::Tuple)
    nkeys = length(idx)
    nkeys > length(element) && return false

    for level in 1:nkeys
        element[level] == idx[level] || return false
    end

    return true
end

# Array index: look up every entry of `idx` in turn and concatenate the
# resulting positions, in the order the entries appear in `idx`.
#
# The positions are accumulated into an `Int[]` rather than built with
# `vcat([...]...)`: that form returns an untyped `Any[]` when `idx` is empty
# and splats a collection of arbitrary length into a varargs call.
function axisindexes(ax::Axis{S,CategoricalVector{T,A}}, idx::AbstractArray) where {T<:Tuple,S,A}
    matches = Int[]
    for key in idx
        append!(matches, axisindexes(ax, key))
    end
    return matches
end
194 changes: 194 additions & 0 deletions src/combine.jl
Original file line number Diff line number Diff line change
Expand Up @@ -139,3 +139,197 @@ function Base.join{T,N,D,Ax}(As::AxisArray{T,N,D,Ax}...; fillvalue::T=zero(T),
return result

end #join

# Lazily build the collapsed-axis keys for one array: for every combination in
# the product of the given axes' values, yield a tuple that starts with
# `array_name` and continues with the entries of that combination.
function _collapse_array_axes(array_name, array_axes...)
    axis_values = (ax.val for ax in array_axes)
    return (
        (array_name, (combo isa Tuple ? combo : (combo,))...)
        for combo in product(axis_values...)
    )
end

# Build the keys for each (name, axes) pair with `_collapse_array_axes` and
# collect them all into a single flat vector.
function _collapse_axes(array_names, array_axes)
    per_array_keys = map(array_names, array_axes) do name, axs
        _collapse_array_axes(name, axs...)
    end
    return collect(Iterators.flatten(per_array_keys))
end

# Apply `Base.IteratorsMD.split(_, Val{N})` to every argument and return the
# results as a tuple, one entry per argument.
function _splitall(::Type{Val{N}}, As...) where {N}
    return map(A -> Base.IteratorsMD.split(A, Val{N}), As)
end

# Apply `reshape(_, Val{N})` to every argument and return the results as a
# tuple, one entry per argument.
function _reshapeall(::Type{Val{N}}, As...) where {N}
    return map(A -> reshape(A, Val{N}), As)
end

# Verify that every axis in `common_axis_tuple` has the same name (compared
# with `===`) as the first one; throw an `ArgumentError` otherwise.
function _check_common_axes(common_axis_tuple)
    reference_name = axisname(first(common_axis_tuple))
    other_names = axisname.(common_axis_tuple[2:end])

    for name in other_names
        if !(reference_name === name)
            throw(ArgumentError("Leading common axes must have the same name in each array"))
        end
    end

    return nothing
end

# Compute the element type of the collapsed axis: for each array, form the
# tuple type made of the label type followed by the element types of that
# array's trailing axes, then take the `typejoin` of those tuple types.
function _collapsed_axis_eltype(LType, trailing_axes)
    key_types = map(axs -> Tuple{LType, eltype.(axs)...}, trailing_axes)
    return typejoin(key_types...)
end

# Convenience method without labels: label the arrays 1, 2, ..., AN and forward.
function collapse(::Type{Val{N}}, As::Vararg{AxisArray, AN}) where {N, AN}
    default_labels = ntuple(identity, Val{AN})
    return collapse(Val{N}, default_labels, As...)
end

# Convenience method with an explicit array type but no labels: label the
# arrays 1, 2, ..., AN and forward.
function collapse(::Type{Val{N}}, ::Type{NewArrayType}, As::Vararg{AxisArray, AN}) where {N, AN, NewArrayType<:AbstractArray}
    default_labels = ntuple(identity, Val{AN})
    return collapse(Val{N}, NewArrayType, default_labels, As...)
end

# Convenience method with labels but no explicit array type: forward with an
# `Array` whose element type is the promoted element type of the inputs and
# whose rank is `N + 1`.
@generated function collapse(::Type{Val{N}}, labels::NTuple{AN, LType}, As::Vararg{AxisArray, AN}) where {N, AN, LType}
    # Inside the generator `As` holds the argument types, so both values below
    # are computed from types alone and spliced into the returned expression.
    result_ndims = Int(N) + 1
    result_eltype = Base.promote_eltype(As...)

    return :(collapse(Val{N}, Array{$result_eltype, $result_ndims}, labels, As...))
end

"""
collapse(::Type{Val{N}}, As::AxisArray...) -> AxisArray
collapse(::Type{Val{N}}, labels::Tuple, As::AxisArray...) -> AxisArray
collapse(::Type{Val{N}}, ::Type{NewArrayType}, As::AxisArray...) -> AxisArray
collapse(::Type{Val{N}}, ::Type{NewArrayType}, labels::Tuple, As::AxisArray...) -> AxisArray

Collapses `AxisArray`s with `N` equal leading axes into a single `AxisArray`.
All additional axes in any of the arrays are collapsed into a single additional
axis of type `Axis{:collapsed, CategoricalVector{Tuple}}`.

### Arguments

* `::Type{Val{N}}`: the greatest common dimension to share between all input
arrays. The remaining axes are collapsed. All `N` axes must be common
to each input array, at the same dimension. Values from `0` up to the
minimum number of dimensions across all input arrays are allowed.
* `labels::Tuple`: (optional) an index for each array in `As` used as the leading element in
the index tuples in the `:collapsed` axis. Defaults to `1:length(As)`.
* `::Type{NewArrayType<:AbstractArray{_, N+1}}`: (optional) the desired underlying array
type for the returned `AxisArray`.
* `As::AxisArray...`: `AxisArray`s to be collapsed together.

### Examples

```
julia> price_data = AxisArray(rand(10), Axis{:time}(Date(2016,01,01):Day(1):Date(2016,01,10)))
1-dimensional AxisArray{Float64,1,...} with axes:
:time, 2016-01-01:1 day:2016-01-10
And data, a 10-element Array{Float64,1}:
0.885014
0.418562
0.609344
0.72221
0.43656
0.840304
0.455337
0.65954
0.393801
0.260207

julia> size_data = AxisArray(rand(10,2), Axis{:time}(Date(2016,01,01):Day(1):Date(2016,01,10)), Axis{:measure}([:area, :volume]))
2-dimensional AxisArray{Float64,2,...} with axes:
:time, 2016-01-01:1 day:2016-01-10
:measure, Symbol[:area, :volume]
And data, a 10×2 Array{Float64,2}:
0.159434 0.456992
0.344521 0.374623
0.522077 0.313256
0.994697 0.320953
0.95104 0.900526
0.921854 0.729311
0.000922581 0.148822
0.449128 0.761714
0.650277 0.135061
0.688773 0.513845

julia> collapsed = collapse(Val{1}, (:price, :size), price_data, size_data)
2-dimensional AxisArray{Float64,2,...} with axes:
:time, 2016-01-01:1 day:2016-01-10
:collapsed, Tuple{Symbol,Vararg{Symbol,N} where N}[(:price,), (:size, :area), (:size, :volume)]
And data, a 10×3 Array{Float64,2}:
0.885014 0.159434 0.456992
0.418562 0.344521 0.374623
0.609344 0.522077 0.313256
0.72221 0.994697 0.320953
0.43656 0.95104 0.900526
0.840304 0.921854 0.729311
0.455337 0.000922581 0.148822
0.65954 0.449128 0.761714
0.393801 0.650277 0.135061
0.260207 0.688773 0.513845

julia> collapsed[Axis{:collapsed}(:size)] == size_data
true
```

"""
@generated function collapse(::Type{Val{N}},
                             ::Type{NewArrayType},
                             labels::NTuple{AN, LType},
                             As::Vararg{AxisArray, AN}) where {N, AN, LType, NewArrayType<:AbstractArray}
    # --- Generator body: runs on the argument *types* -------------------------
    if N < 0
        throw(ArgumentError("collapse dimension N must be at least 0"))
    end

    # N leading axes must exist in every input, so N is bounded by the smallest
    # input rank (the message previously said "maximum", contradicting the check).
    if N > minimum(ndims.(As))
        throw(ArgumentError(
            """
            collapse dimension N must not be greater than the minimum number of dimensions
            across all input arrays
            """
        ))
    end

    collapsed_dim = Val{N + 1}
    collapsed_dim_int = Int(N) + 1

    # Split each array's axis types into the first N (common) and the rest
    # (trailing), then regroup into one tuple of commons and one of trailings.
    common_axes, trailing_axes = zip(_splitall(Val{N}, axisparams.(As)...)...)

    # Axis names are part of the axis types, so they can be checked here.
    foreach(_check_common_axes, zip(common_axes...))

    new_common_axes = first(common_axes)
    collapsed_axis_eltype = _collapsed_axis_eltype(LType, trailing_axes)
    collapsed_axis_type = CategoricalVector{collapsed_axis_eltype, Vector{collapsed_axis_eltype}}

    new_axes_type = Tuple{new_common_axes..., Axis{:collapsed, collapsed_axis_type}}
    new_eltype = Base.promote_eltype(As...)

    # --- Generated code: runs on the argument *values* ------------------------
    quote
        common_axes, trailing_axes = zip(_splitall(Val{N}, axes.(As)...)...)

        # Axis values are only known at run time: every array's common axes
        # must hold the same values as those of the first array.
        for common_axis_tuple in zip(common_axes...)
            if !isempty(common_axis_tuple)
                for common_axis in common_axis_tuple[2:end]
                    if !all(axisvalues(common_axis) .== axisvalues(common_axis_tuple[1]))
                        throw(ArgumentError(
                            """
                            Leading common axes must be identical across
                            all input arrays"""
                        ))
                    end
                end
            end
        end

        # Reshape every input to N + 1 dimensions and concatenate along the
        # last (collapsed) dimension.
        array_data = cat($collapsed_dim, _reshapeall($collapsed_dim, As...)...)

        axis_array_type = AxisArray{
            $new_eltype,
            $collapsed_dim_int,
            $NewArrayType,
            $new_axes_type
        }

        # Keep the first array's common axes and append the :collapsed axis
        # built from the labels and each array's trailing axes.
        new_axes = (
            first(common_axes)...,
            Axis{:collapsed, $collapsed_axis_type}($collapsed_axis_type(_collapse_axes(labels, trailing_axes))),
        )

        return axis_array_type(array_data, new_axes)
    end
end
9 changes: 9 additions & 0 deletions src/core.jl
Original file line number Diff line number Diff line change
Expand Up @@ -503,6 +503,15 @@ end
axes(A::AbstractArray) = default_axes(A)
axes(A::AbstractArray, dim::Int) = default_axes(A)[dim]

"""
    axisparams(::AxisArray) -> Vararg{::Type{Axis}}
    axisparams(::Type{AxisArray}) -> Vararg{::Type{Axis}}

Return the axis type parameters of an `AxisArray`, given either an instance or
the array type, as a tuple.
"""
function axisparams(::AxisArray{T,N,D,Ax}) where {T,N,D,Ax}
    return (Ax.parameters...,)
end

function axisparams(::Type{AxisArray{T,N,D,Ax}}) where {T,N,D,Ax}
    return (Ax.parameters...,)
end

### Axis traits ###
abstract type AxisTrait end
immutable Dimensional <: AxisTrait end
Expand Down
17 changes: 16 additions & 1 deletion src/indexing.jl
Original file line number Diff line number Diff line change
Expand Up @@ -231,6 +231,17 @@ end
ex = Expr(:tuple)
n = 0
for i=1:length(I)
if axistrait(I[i]) <: Categorical && i <= length(Ax.parameters)
if I[i] <: Axis
push!(ex.args, :(axisindexes(A.axes[$i], I[$i].val)))
else
push!(ex.args, :(axisindexes(A.axes[$i], I[$i])))
end
n += 1

continue
end

if I[i] <: Idx
push!(ex.args, :(I[$i]))
n += 1
Expand All @@ -243,7 +254,11 @@ end
end
n += length(I[i])
elseif i <= length(Ax.parameters)
push!(ex.args, :(axisindexes(A.axes[$i], I[$i])))
if I[i] <: Axis
push!(ex.args, :(axisindexes(A.axes[$i], I[$i].val)))
else
push!(ex.args, :(axisindexes(A.axes[$i], I[$i])))
end
n += 1
else
push!(ex.args, :(error("dimension ", $i, " does not have an axis to index")))
Expand Down
21 changes: 21 additions & 0 deletions test/categoricalvector.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
# Test CategoricalVector with a hierarchical index (indexed using Tuples)
# Seed the RNG so the randomly drawn keys below are reproducible.
srand(1234)
data = reshape(1.:40., 20, 2)
# 20 random three-level keys, each a tuple of Symbols.
v = collect(zip([:a, :b, :c][rand(1:3,20)], [:x,:y][rand(1:2,20)], [:x,:y][rand(1:2,20)]))
# Sort the rows by key so that rows sharing a leading key prefix are contiguous.
idx = sortperm(v)
A = AxisArray(data[idx,:], AxisArrays.CategoricalVector(v[idx]), [:a, :b])
# NOTE(review): the row ranges expected below presumably follow from the seeded
# random sequence above — re-derive them if the seed or the RNG changes.
@test A[:b, :] == A[5:12, :]
@test A[[:a,:c], :] == A[[1:4;13:end], :]
@test A[(:a,:y), :] == A[2:4, :]
@test A[(:c,:y,:y), :] == A[16:end, :]
@test AxisArrays.axistrait(axes(A)[1]) <: AxisArrays.Categorical

# Non-tuple element type: a CategoricalVector of integers used as the row axis.
v = AxisArrays.CategoricalVector(collect([1; 8; 10:15]))
# NOTE(review): `A` is not reassigned until the next line, so this repeats the
# trait check on the previous array — confirm whether `v` was meant here.
@test AxisArrays.axistrait(axes(A)[1]) <: AxisArrays.Categorical
A = AxisArray(reshape(1:16, 8, 2), v, [:a, :b])
# Index the row axis with a CategoricalVector wrapped in an Axis.
@test A[Axis{:row}(AxisArrays.CategoricalVector([15]))] == AxisArray(reshape(A.data[8, :], 1, 2), AxisArrays.CategoricalVector([15]), [:a, :b])
@test A[Axis{:row}(AxisArrays.CategoricalVector([15])), 1] == AxisArray([A.data[8, 1]], AxisArrays.CategoricalVector([15]))
@test AxisArrays.axistrait(axes(A)[1]) <: AxisArrays.Categorical

# TODO: maybe make this work? Would require removing or modifying Base.getindex(A::AxisArray, idxs::Idx...)
# @test A[AxisArrays.CategoricalVector([15]), 1] == AxisArray([A.data[8, 1]], AxisArrays.CategoricalVector([15]))
Loading