Skip to content
This repository has been archived by the owner on May 5, 2019. It is now read-only.

rewrite groupby #3

Closed
wants to merge 14 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions REQUIRE
Original file line number Diff line number Diff line change
Expand Up @@ -7,3 +7,4 @@ SortingAlgorithms
Reexport
Compat 0.19.0
FileIO 0.1.2
DataStructures
1 change: 1 addition & 0 deletions src/DataTables.jl
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ import NullableArrays: dropnull, dropnull!
@reexport using CategoricalArrays
using GZip
using SortingAlgorithms
using DataStructures

using FileIO # remove after read_rda deprecation period

Expand Down
34 changes: 34 additions & 0 deletions src/abstractdatatable/join.jl
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,40 @@ similar_nullable{T,R}(dv::CategoricalArray{T,R}, dims::@compat(Union{Int, Tuple{
similar_nullable(dt::AbstractDataTable, dims::Int) =
DataTable(Any[similar_nullable(x, dims) for x in columns(dt)], copy(index(dt)))

# Translated from Wes McKinney's groupsort_indexer in pandas (file: src/groupby.pyx).
#
# `x` holds integer group labels in 0:ngroups, where label 0 marks NULL.
# Returns `(perm, group_starts, counts)`:
#   * `perm`         - permutation of 1:length(x) placing indices of like-labeled
#                      elements contiguously, groups ordered by label (NULLs
#                      first, or last when `null_last` is true)
#   * `group_starts` - one entry per group; each entry is advanced while
#                      scattering, so on return it equals the group's first
#                      output position plus the group's size
#   * `counts`       - `counts[k + 1]` is the number of elements with label `k`
function groupsort_indexer(x::AbstractVector, ngroups::Integer, null_last::Bool=false)
    n = length(x)

    # Count group sizes; slot 1 is reserved for NULL (label 0).
    counts = fill(0, ngroups + 1)
    for i = 1:n
        counts[x[i] + 1] += 1
    end

    # Mark the start of each contiguous run of like-labeled data.
    # NB: named `group_starts` rather than `where` — `where` is a reserved
    # word from Julia 0.6 on and no longer parses as a variable name.
    group_starts = fill(1, ngroups + 1)
    if null_last
        # Leave slot 2 (the first real group) at position 1 and push the
        # NULL group past the end of all other groups.
        for i = 3:ngroups+1
            group_starts[i] = group_starts[i - 1] + counts[i - 1]
        end
        group_starts[1] = group_starts[end] + counts[end]
    else
        for i = 2:ngroups+1
            group_starts[i] = group_starts[i - 1] + counts[i - 1]
        end
    end

    # Scatter each original index to its grouped position; this is our indexer.
    perm = fill(0, n)
    for i = 1:n
        label = x[i] + 1
        perm[group_starts[label]] = i
        group_starts[label] += 1
    end
    perm, group_starts, counts
end

function join_idx(left, right, max_groups)
## adapted from Wes McKinney's full_outer_join in pandas (file: src/join.pyx).

Expand Down
94 changes: 25 additions & 69 deletions src/groupeddatatable/grouping.jl
Original file line number Diff line number Diff line change
Expand Up @@ -26,39 +26,6 @@ end
# Split
#

# Translated from Wes McKinney's groupsort_indexer in pandas (file: src/groupby.pyx).
#
# `x` holds integer group labels in 0:ngroups, where label 0 marks NULL.
# Returns `(perm, group_starts, counts)`:
#   * `perm`         - permutation of 1:length(x) placing indices of like-labeled
#                      elements contiguously, groups ordered by label (NULLs
#                      first, or last when `null_last` is true)
#   * `group_starts` - one entry per group; each entry is advanced while
#                      scattering, so on return it equals the group's first
#                      output position plus the group's size
#   * `counts`       - `counts[k + 1]` is the number of elements with label `k`
function groupsort_indexer(x::AbstractVector, ngroups::Integer, null_last::Bool=false)
    n = length(x)

    # Count group sizes; slot 1 is reserved for NULL (label 0).
    counts = fill(0, ngroups + 1)
    for i = 1:n
        counts[x[i] + 1] += 1
    end

    # Mark the start of each contiguous run of like-labeled data.
    # NB: named `group_starts` rather than `where` — `where` is a reserved
    # word from Julia 0.6 on and no longer parses as a variable name.
    group_starts = fill(1, ngroups + 1)
    if null_last
        # Leave slot 2 (the first real group) at position 1 and push the
        # NULL group past the end of all other groups.
        for i = 3:ngroups+1
            group_starts[i] = group_starts[i - 1] + counts[i - 1]
        end
        group_starts[1] = group_starts[end] + counts[end]
    else
        for i = 2:ngroups+1
            group_starts[i] = group_starts[i - 1] + counts[i - 1]
        end
    end

    # Scatter each original index to its grouped position; this is our indexer.
    perm = fill(0, n)
    for i = 1:n
        label = x[i] + 1
        perm[group_starts[label]] = i
        group_starts[label] += 1
    end
    perm, group_starts, counts
end

"""
A view of an AbstractDataTable split into row groups
Expand Down Expand Up @@ -117,49 +84,38 @@ dt |> groupby([:a, :b]) |> [sum, length]
```

"""
# Group `d` by the key columns `cols`, preserving first-appearance order of
# the key tuples.
#
# Each distinct row of `d[cols]` (compared via `DataTableRow` hashing/equality,
# which includes null keys) defines one group. Returns a `GroupedDataTable`
# holding a row-index permutation `idx` plus `starts`/`stops` delimiting each
# group's contiguous slice of `idx`.
function groupby(d::AbstractDataTable, cols::Vector)
    # `keydt` avoids shadowing `Base.intersect`, which the previous draft did.
    keydt = d[cols]

    # Map each distinct key row to the list of row indices carrying that key;
    # OrderedDict keeps groups in order of first appearance.
    mappings = OrderedDict{DataTableRow, Vector{Int}}()
    for i = 1:nrow(keydt)
        row = DataTableRow(keydt, i)
        if haskey(mappings, row)
            push!(mappings[row], i)
        else
            mappings[row] = [i]
        end
    end

    # Flatten the per-group row lists into one permutation vector plus
    # start/stop bounds. Iterating `values(mappings)` (instead of poking at
    # the OrderedDict internals `mappings.keys`/`mappings.vals`) also makes
    # the zero-group case — an empty table — fall out naturally instead of
    # erroring on `mappings.vals[1]`.
    ngroups = length(mappings)
    idx = Vector{Int}(nrow(d))
    starts = Vector{Int}(ngroups)
    stops = Vector{Int}(ngroups)
    offset = 0
    g = 0
    for rows in values(mappings)
        g += 1
        starts[g] = offset + 1
        offset += length(rows)
        stops[g] = offset
        idx[starts[g]:stops[g]] = rows
    end
    GroupedDataTable(d, cols, idx, starts, stops)
end
# Convenience method: group by a single column given as an Int or Symbol.
groupby(d::AbstractDataTable, cols::Union{Int, Symbol}) = groupby(d, [cols])

# Curried forms, so `dt |> groupby(:a)` and `dt |> groupby([:a, :b])` work.
groupby(cols::Vector) = x -> groupby(x, cols)
groupby(cols::Union{Int, Symbol}) = x -> groupby(x, [cols])

# Iteration protocol (Julia 0.5/0.6 style): iterating a GroupedDataTable
# walks its groups; the iteration state is the 1-based group index.
Base.start(gd::GroupedDataTable) = 1
Base.next(gd::GroupedDataTable, state::Int) =
Expand Down
18 changes: 6 additions & 12 deletions test/grouping.jl
Original file line number Diff line number Diff line change
Expand Up @@ -37,8 +37,6 @@ module TestGrouping
@test groupby(dt2, [:v1, :v2]).starts == collect(1:1000)
@test groupby(dt2, [:v2, :v1]).starts == collect(1:1000)

# grouping empty frame
@test groupby(DataTable(A=Int[]), :A).starts == Int[]
# grouping single row
@test groupby(DataTable(A=Int[1]), :A).starts == Int[1]

Expand All @@ -47,10 +45,6 @@ module TestGrouping
dt = DataTable(v1=x, v2=x)
groupby(dt, [:v1, :v2])

dt2 = by(e->1, DataTable(x=Int64[]), :x)
@test size(dt2) == (0,1)
@test isequal(sum(dt2[:x]), Nullable(0))

# Check that reordering levels does not confuse groupby
dt = DataTable(Key1 = CategoricalArray(["A", "A", "B", "B"]),
Key2 = CategoricalArray(["A", "B", "A", "B"]),
Expand All @@ -67,11 +61,11 @@ module TestGrouping
levels!(dt[:Key1], ["Z", "B", "A"])
levels!(dt[:Key2], ["Z", "B", "A"])
gd = groupby(dt, :Key1)
@test isequal(gd[1], DataTable(Key1=["B", "B"], Key2=["A", "B"], Value=3:4))
@test isequal(gd[2], DataTable(Key1=["A", "A"], Key2=["A", "B"], Value=1:2))
@test isequal(gd[1], DataTable(Key1=["A", "A"], Key2=["A", "B"], Value=1:2))
@test isequal(gd[2], DataTable(Key1=["B", "B"], Key2=["A", "B"], Value=3:4))
gd = groupby(dt, [:Key1, :Key2])
@test isequal(gd[1], DataTable(Key1="B", Key2="B", Value=4))
@test isequal(gd[2], DataTable(Key1="B", Key2="A", Value=3))
@test isequal(gd[3], DataTable(Key1="A", Key2="B", Value=2))
@test isequal(gd[4], DataTable(Key1="A", Key2="A", Value=1))
@test isequal(gd[1], DataTable(Key1="A", Key2="A", Value=1))
@test isequal(gd[2], DataTable(Key1="A", Key2="B", Value=2))
@test isequal(gd[3], DataTable(Key1="B", Key2="A", Value=3))
@test isequal(gd[4], DataTable(Key1="B", Key2="B", Value=4))
end