Skip to content

Commit

Permalink
Add CategoricalVector and collapse (#88)
Browse files Browse the repository at this point in the history
  • Loading branch information
iamed2 authored and timholy committed Aug 3, 2017
1 parent fe61993 commit f0743ce
Show file tree
Hide file tree
Showing 10 changed files with 362 additions and 4 deletions.
1 change: 1 addition & 0 deletions REQUIRE
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
julia 0.6
IntervalSets 0.1
IterTools
RangeArrays
5 changes: 4 additions & 1 deletion src/AxisArrays.jl
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,11 @@ __precompile__()
module AxisArrays

using Base: tail
import Base.Iterators: repeated
using RangeArrays, IntervalSets
using IterTools

export AxisArray, Axis, axisnames, axisvalues, axisdim, axes, atindex, atvalue
export AxisArray, Axis, axisnames, axisvalues, axisdim, axes, atindex, atvalue, collapse

# From IntervalSets:
export ClosedInterval, ..
Expand All @@ -15,6 +17,7 @@ include("intervals.jl")
include("search.jl")
include("indexing.jl")
include("sortedvector.jl")
include("categoricalvector.jl")
include("combine.jl")

end
82 changes: 82 additions & 0 deletions src/categoricalvector.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,82 @@
"""
A CategoricalVector is an AbstractVector which is treated as a categorical axis regardless
of the element type. Duplicate values are not allowed but are not filtered out.
A CategoricalVector axis can be indexed with a ClosedInterval, with a value, or with a
vector of values. Use of a CategoricalVector{Tuple} axis allows indexing similar to the
hierarchical index of the Python Pandas package or the R data.table package.
In general, indexing into a CategoricalVector will be much slower than the corresponding
SortedVector or another sorted axis type, as linear search is required.
### Constructors
```julia
CategoricalVector(x::AbstractVector)
```
### Arguments
* `x::AbstractVector` : the wrapped vector
### Examples
```julia
v = CategoricalVector(collect([1; 8; 10:15]))
A = AxisArray(reshape(1:16, 8, 2), v, [:a, :b])
A[Axis{:row}(1), :]
A[Axis{:row}(10), :]
A[Axis{:row}([1, 10]), :]
## Hierarchical index example with three key levels
data = reshape(1.:40., 20, 2)
v = collect(zip([:a, :b, :c][rand(1:3,20)], [:x,:y][rand(1:2,20)], [:x,:y][rand(1:2,20)]))
A = AxisArray(data, CategoricalVector(v), [:a, :b])
A[:b, :]
A[[:a,:c], :]
A[(:a,:x), :]
A[(:a,:x,:x), :]
```
"""
struct CategoricalVector{T, A<:AbstractVector{T}} <: AbstractVector{T}
    # The wrapped vector of axis values; its concrete type is kept as the parameter `A`
    # so the field is concretely typed.
    data::A
end

# Convenience constructor: derive both type parameters (element type and the concrete
# type of the wrapped vector) from the argument, so callers can write `CategoricalVector(x)`.
CategoricalVector(data::AbstractVector{T}) where {T} =
    CategoricalVector{T, typeof(data)}(data)

# AbstractVector interface: every query is forwarded to the wrapped `data` vector.

# Scalar indexing returns the stored element itself.
function Base.getindex(v::CategoricalVector, idx::Int)
    return v.data[idx]
end

# Indexing with a vector of positions re-wraps the selection, so the result is again
# a CategoricalVector.
function Base.getindex(v::CategoricalVector, idx::AbstractVector)
    return CategoricalVector(v.data[idx])
end

function Base.length(v::CategoricalVector)
    return length(v.data)
end

function Base.size(v::CategoricalVector)
    return size(v.data)
end

function Base.size(v::CategoricalVector, i)
    return size(v.data, i)
end

function Base.indices(v::CategoricalVector)
    return indices(v.data)
end

# A CategoricalVector is always a categorical axis, whatever its element type is.
function axistrait(::Type{CategoricalVector{T,A}}) where {T,A}
    return Categorical
end

# No validation is performed for this axis type; the check is a no-op.
function checkaxis(::CategoricalVector)
    return nothing
end


## Add some special indexing for CategoricalVector{Tuple}'s to achieve something like
## pandas' hierarchical indexing

# Fallback for tuple-valued categorical axes: an index that is not handled by a more
# specific method is wrapped in a one-element tuple and treated as a key prefix, so
# `A[:b, :]` is looked up like `A[(:b,), :]`.
function axisindexes(ax::Axis{S,CategoricalVector{T,A}}, idx) where {T<:Tuple,S,A}
    return axisindexes(ax, (idx,))
end

# Tuple index: return the positions of all axis entries whose leading elements are
# equal to `idx` (see `_tuple_matches`). Every entry of the axis is visited.
function axisindexes(ax::Axis{S,CategoricalVector{T,A}}, idx::Tuple) where {T<:Tuple,S,A}
    entries = ax.val
    matches(i) = _tuple_matches(entries[i], idx)
    return collect(filter(matches, indices(entries)...))
end

# Return `true` when `idx` is a prefix of `element`, i.e. `idx` is no longer than
# `element` and each of its entries is `==` to the entry of `element` at the same
# position. An empty `idx` therefore matches every `element`.
function _tuple_matches(element::Tuple, idx::Tuple)
    prefix_length = length(idx)
    if prefix_length > length(element)
        return false
    end

    for position in 1:prefix_length
        if !(element[position] == idx[position])
            return false
        end
    end

    return true
end

# Array of keys: resolve each key separately and concatenate the resulting positions
# in the order the keys appear in `idx`.
function axisindexes(ax::Axis{S,CategoricalVector{T,A}}, idx::AbstractArray) where {T<:Tuple,S,A}
    return vcat([axisindexes(ax, key) for key in idx]...)
end
194 changes: 194 additions & 0 deletions src/combine.jl
Original file line number Diff line number Diff line change
Expand Up @@ -139,3 +139,197 @@ function Base.join{T,N,D,Ax}(As::AxisArray{T,N,D,Ax}...; fillvalue::T=zero(T),
return result

end #join

# Build a lazy generator of index tuples for one input array: every combination of
# the values of `array_axes` (via `product`), each prefixed with `array_name`.
function _collapse_array_axes(array_name, array_axes...)
    axis_values = map(ax -> ax.val, array_axes)
    return (
        (array_name, (combo isa Tuple ? combo : (combo,))...)
        for combo in product(axis_values...)
    )
end

# Concatenate the index tuples of all input arrays into a single vector: the i-th
# name in `array_names` is paired with the i-th group of axes in `array_axes`.
function _collapse_axes(array_names, array_axes)
    per_array = map(
        (name, axs) -> _collapse_array_axes(name, axs...),
        array_names,
        array_axes,
    )
    return collect(Iterators.flatten(per_array))
end

# Apply `Base.IteratorsMD.split(A, Val{N})` to each of `As` and return the results as
# a tuple, one entry per input. `collapse` uses this to separate the `N` leading axes
# of each array from the remaining ones.
function _splitall(::Type{Val{N}}, As...) where {N}
    return map(A -> Base.IteratorsMD.split(A, Val{N}), As)
end

# Reshape each of `As` with `reshape(A, Val{N})` and return the results as a tuple,
# one entry per input.
function _reshapeall(::Type{Val{N}}, As...) where {N}
    return map(A -> reshape(A, Val{N}), As)
end

# Verify that all axes in `common_axis_tuple` carry the same name as the first one.
# Throws an `ArgumentError` otherwise; returns `nothing` on success.
function _check_common_axes(common_axis_tuple)
    reference_name = axisname(first(common_axis_tuple))
    other_names = axisname.(common_axis_tuple[2:end])
    names_agree = all(reference_name .=== other_names)

    names_agree || throw(ArgumentError("Leading common axes must have the same name in each array"))

    return nothing
end

# Compute the element type of the collapsed axis. For every input array the index
# tuple type is `Tuple{LType, <eltype of each trailing axis>...}`; the result is the
# `typejoin` of these per-array tuple types.
function _collapsed_axis_eltype(LType, trailing_axes)
    index_tuple_types = map(axs -> Tuple{LType, eltype.(axs)...}, trailing_axes)
    return typejoin(index_tuple_types...)
end

# No labels given: label the arrays with their positions `(1, 2, ..., AN)` and forward
# to the method that takes explicit labels.
function collapse(::Type{Val{N}}, As::Vararg{AxisArray, AN}) where {N, AN}
    default_labels = ntuple(identity, Val{AN})
    return collapse(Val{N}, default_labels, As...)
end

# Explicit array type but no labels: label the arrays with their positions
# `(1, 2, ..., AN)` and forward to the method that takes both.
function collapse(::Type{Val{N}},
                  ::Type{NewArrayType},
                  As::Vararg{AxisArray, AN}) where {N, AN, NewArrayType<:AbstractArray}
    default_labels = ntuple(identity, Val{AN})
    return collapse(Val{N}, NewArrayType, default_labels, As...)
end

# Labels given but no array type: default to a plain `Array` whose element type is the
# promoted element type of all inputs and whose dimensionality is `N + 1` (the `N`
# common axes plus the collapsed one). Both are computed from the argument types in the
# generator and spliced into the forwarded call.
@generated function collapse(::Type{Val{N}},
                             labels::NTuple{AN, LType},
                             As::Vararg{AxisArray, AN}) where {N, AN, LType}
    collapsed_dim_int = Int(N) + 1
    new_eltype = Base.promote_eltype(As...)

    return quote
        collapse(Val{N}, Array{$new_eltype, $collapsed_dim_int}, labels, As...)
    end
end

"""
collapse(::Type{Val{N}}, As::AxisArray...) -> AxisArray
collapse(::Type{Val{N}}, labels::Tuple, As::AxisArray...) -> AxisArray
collapse(::Type{Val{N}}, ::Type{NewArrayType}, As::AxisArray...) -> AxisArray
collapse(::Type{Val{N}}, ::Type{NewArrayType}, labels::Tuple, As::AxisArray...) -> AxisArray
Collapses `AxisArray`s with `N` equal leading axes into a single `AxisArray`.
All additional axes in any of the arrays are collapsed into a single additional
axis of type `Axis{:collapsed, CategoricalVector{Tuple}}`.
### Arguments
* `::Type{Val{N}}`: the greatest common dimension to share between all input
arrays. The remaining axes are collapsed. All `N` axes must be common
to each input array, at the same dimension. Values from `0` up to the
minimum number of dimensions across all input arrays are allowed.
* `labels::Tuple`: (optional) an index for each array in `As` used as the leading element in
the index tuples in the `:collapsed` axis. Defaults to `1:length(As)`.
* `::Type{NewArrayType<:AbstractArray{_, N+1}}`: (optional) the desired underlying array
type for the returned `AxisArray`.
* `As::AxisArray...`: `AxisArray`s to be collapsed together.
### Examples
```
julia> price_data = AxisArray(rand(10), Axis{:time}(Date(2016,01,01):Day(1):Date(2016,01,10)))
1-dimensional AxisArray{Float64,1,...} with axes:
:time, 2016-01-01:1 day:2016-01-10
And data, a 10-element Array{Float64,1}:
0.885014
0.418562
0.609344
0.72221
0.43656
0.840304
0.455337
0.65954
0.393801
0.260207
julia> size_data = AxisArray(rand(10,2), Axis{:time}(Date(2016,01,01):Day(1):Date(2016,01,10)), Axis{:measure}([:area, :volume]))
2-dimensional AxisArray{Float64,2,...} with axes:
:time, 2016-01-01:1 day:2016-01-10
:measure, Symbol[:area, :volume]
And data, a 10×2 Array{Float64,2}:
0.159434 0.456992
0.344521 0.374623
0.522077 0.313256
0.994697 0.320953
0.95104 0.900526
0.921854 0.729311
0.000922581 0.148822
0.449128 0.761714
0.650277 0.135061
0.688773 0.513845
julia> collapsed = collapse(Val{1}, (:price, :size), price_data, size_data)
2-dimensional AxisArray{Float64,2,...} with axes:
:time, 2016-01-01:1 day:2016-01-10
:collapsed, Tuple{Symbol,Vararg{Symbol,N} where N}[(:price,), (:size, :area), (:size, :volume)]
And data, a 10×3 Array{Float64,2}:
0.885014 0.159434 0.456992
0.418562 0.344521 0.374623
0.609344 0.522077 0.313256
0.72221 0.994697 0.320953
0.43656 0.95104 0.900526
0.840304 0.921854 0.729311
0.455337 0.000922581 0.148822
0.65954 0.449128 0.761714
0.393801 0.650277 0.135061
0.260207 0.688773 0.513845
julia> collapsed[Axis{:collapsed}(:size)] == size_data
true
```
"""
@generated function collapse(::Type{Val{N}},
                             ::Type{NewArrayType},
                             labels::NTuple{AN, LType},
                             As::Vararg{AxisArray, AN}) where {N, AN, LType, NewArrayType<:AbstractArray}
    # Generator part: `As` is bound to the argument *types* here, so everything up to
    # the `quote` is derived from type information only.
    if N < 0
        throw(ArgumentError("collapse dimension N must be at least 0"))
    end

    # The `N` leading axes must exist in every input, so the smallest `ndims` among the
    # inputs is the upper bound for `N`.
    if N > minimum(ndims.(As))
        throw(ArgumentError(
            """
            collapse dimension N must not be greater than the minimum number of dimensions
            across all input arrays
            """
        ))
    end

    collapsed_dim = Val{N + 1}
    collapsed_dim_int = Int(N) + 1

    # Split the axis types of every input into the leading (common) part and the
    # trailing part that gets collapsed, then regroup them per part.
    common_axes, trailing_axes = zip(_splitall(Val{N}, axisparams.(As)...)...)

    # Axis names are available from the axis types, so they are validated up front.
    foreach(_check_common_axes, zip(common_axes...))

    new_common_axes = first(common_axes)
    collapsed_axis_eltype = _collapsed_axis_eltype(LType, trailing_axes)
    collapsed_axis_type = CategoricalVector{collapsed_axis_eltype, Vector{collapsed_axis_eltype}}

    new_axes_type = Tuple{new_common_axes..., Axis{:collapsed, collapsed_axis_type}}
    new_eltype = Base.promote_eltype(As...)

    quote
        # Runtime part: the same split as above, now on the actual axis objects.
        common_axes, trailing_axes = zip(_splitall(Val{N}, axes.(As)...)...)

        # Axis values are only known at runtime: every input must agree with the first
        # input on each of the leading axes.
        for common_axis_tuple in zip(common_axes...)
            if !isempty(common_axis_tuple)
                for common_axis in common_axis_tuple[2:end]
                    if !all(axisvalues(common_axis) .== axisvalues(common_axis_tuple[1]))
                        throw(ArgumentError(
                            """
                            Leading common axes must be identical across
                            all input arrays"""
                        ))
                    end
                end
            end
        end

        # Reshape every input to `N + 1` dimensions and concatenate along the last one.
        array_data = cat($collapsed_dim, _reshapeall($collapsed_dim, As...)...)

        axis_array_type = AxisArray{
            $new_eltype,
            $collapsed_dim_int,
            $NewArrayType,
            $new_axes_type
        }

        # Keep the common axes of the first input and append the `:collapsed` axis,
        # whose values are the `(label, trailing axis values...)` tuples.
        new_axes = (
            first(common_axes)...,
            Axis{:collapsed, $collapsed_axis_type}($collapsed_axis_type(_collapse_axes(labels, trailing_axes))),
        )

        return axis_array_type(array_data, new_axes)
    end
end
9 changes: 9 additions & 0 deletions src/core.jl
Original file line number Diff line number Diff line change
Expand Up @@ -503,6 +503,15 @@ end
function axes(A::AbstractArray)
    # Generic arrays get their axes from `default_axes`.
    return default_axes(A)
end

function axes(A::AbstractArray, dim::Int)
    # Same as above, selecting the axis at position `dim`.
    return default_axes(A)[dim]
end

"""
axisparams(::AxisArray) -> Vararg{::Type{Axis}}
axisparams(::Type{AxisArray}) -> Vararg{::Type{Axis}}
Returns the axis parameters for an AxisArray.
"""
axisparams(::AxisArray{T,N,D,Ax}) where {T,N,D,Ax} = (Ax.parameters...,)
# Same lookup starting from the array *type*; this is the form needed where only types
# are available, e.g. in the generator of the `@generated` `collapse` method.
# The trailing comma makes the splat an explicit tuple (valid on Julia 0.6 and later).
axisparams(::Type{AxisArray{T,N,D,Ax}}) where {T,N,D,Ax} = (Ax.parameters...,)

### Axis traits ###
abstract type AxisTrait end
immutable Dimensional <: AxisTrait end
Expand Down
17 changes: 16 additions & 1 deletion src/indexing.jl
Original file line number Diff line number Diff line change
Expand Up @@ -231,6 +231,17 @@ end
ex = Expr(:tuple)
n = 0
for i=1:length(I)
if axistrait(I[i]) <: Categorical && i <= length(Ax.parameters)
if I[i] <: Axis
push!(ex.args, :(axisindexes(A.axes[$i], I[$i].val)))
else
push!(ex.args, :(axisindexes(A.axes[$i], I[$i])))
end
n += 1

continue
end

if I[i] <: Idx
push!(ex.args, :(I[$i]))
n += 1
Expand All @@ -243,7 +254,11 @@ end
end
n += length(I[i])
elseif i <= length(Ax.parameters)
push!(ex.args, :(axisindexes(A.axes[$i], I[$i])))
if I[i] <: Axis
push!(ex.args, :(axisindexes(A.axes[$i], I[$i].val)))
else
push!(ex.args, :(axisindexes(A.axes[$i], I[$i])))
end
n += 1
else
push!(ex.args, :(error("dimension ", $i, " does not have an axis to index")))
Expand Down
21 changes: 21 additions & 0 deletions test/categoricalvector.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
# Test CategoricalVector with a hierarchical index (indexed using Tuples)
# Seed the global RNG so the randomly drawn keys below are reproducible; the expected
# index ranges in the assertions are hard-coded against this seed.
srand(1234)
data = reshape(1.:40., 20, 2)
# 20 three-level keys: first level drawn from :a/:b/:c, second and third from :x/:y.
v = collect(zip([:a, :b, :c][rand(1:3,20)], [:x,:y][rand(1:2,20)], [:x,:y][rand(1:2,20)]))
# Sort rows by key so that each expected result can be written as a range of rows.
idx = sortperm(v)
A = AxisArray(data[idx,:], AxisArrays.CategoricalVector(v[idx]), [:a, :b])
# Index by a single first-level key.
@test A[:b, :] == A[5:12, :]
# Index by a vector of first-level keys.
@test A[[:a,:c], :] == A[[1:4;13:end], :]
# Index by a two-level key prefix.
@test A[(:a,:y), :] == A[2:4, :]
# Index by a complete three-level key.
@test A[(:c,:y,:y), :] == A[16:end, :]
@test AxisArrays.axistrait(axes(A)[1]) <: AxisArrays.Categorical

# CategoricalVector wrapping plain integers.
v = AxisArrays.CategoricalVector(collect([1; 8; 10:15]))
# NOTE(review): `A` has not been rebuilt yet at this point, so this repeats the trait
# assertion made on the hierarchical array above — possibly intended to run on the
# new `v`/`A`; confirm.
@test AxisArrays.axistrait(axes(A)[1]) <: AxisArrays.Categorical
A = AxisArray(reshape(1:16, 8, 2), v, [:a, :b])
# Index with an `Axis` whose value is itself a CategoricalVector; the expected result
# keeps the indexed dimension (length 1) with a CategoricalVector axis.
@test A[Axis{:row}(AxisArrays.CategoricalVector([15]))] == AxisArray(reshape(A.data[8, :], 1, 2), AxisArrays.CategoricalVector([15]), [:a, :b])
@test A[Axis{:row}(AxisArrays.CategoricalVector([15])), 1] == AxisArray([A.data[8, 1]], AxisArrays.CategoricalVector([15]))
@test AxisArrays.axistrait(axes(A)[1]) <: AxisArrays.Categorical

# TODO: maybe make this work? Would require removing or modifying Base.getindex(A::AxisArray, idxs::Idx...)
# @test A[AxisArrays.CategoricalVector([15]), 1] == AxisArray([A.data[8, 1]], AxisArrays.CategoricalVector([15]))
Loading

0 comments on commit f0743ce

Please sign in to comment.