Skip to content
This repository has been archived by the owner on May 5, 2019. It is now read-only.

Commit

Permalink
Enhance joining and grouping (#17)
Browse files Browse the repository at this point in the history
Using a hashing approach rather than converting all columns to categorical arrays. Based on work by @alyst in DataFrames.
  • Loading branch information
cjprybol authored and nalimilan committed Mar 6, 2017
1 parent 9902e6e commit 20c71d6
Show file tree
Hide file tree
Showing 20 changed files with 640 additions and 453 deletions.
2 changes: 1 addition & 1 deletion REQUIRE
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
julia 0.5
NullableArrays 0.1.0
CategoricalArrays 0.0.6
CategoricalArrays 0.1.2
StatsBase 0.11.0
GZip
SortingAlgorithms
Expand Down
10 changes: 5 additions & 5 deletions docs/src/man/joins.md
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@ Cross joins are the only kind of join that does not use a key:
join(a, b, kind = :cross)
```

In order to join data frames on keys which have different names, you must first rename them so that they match. This can be done using rename!:
In order to join data tables on keys which have different names, you must first rename them so that they match. This can be done using `rename!`:

```julia
a = DataTable(ID = [1, 2], Name = ["A", "B"])
Expand All @@ -63,11 +63,11 @@ join(a, b, on = :ID, kind = :inner)
Or renaming multiple columns at a time:

```julia
a = DataTable(City = ["Amsterdam", "London", "London", "New York", "New York"],
Job = ["Lawyer", "Lawyer", "Lawyer", "Doctor", "Doctor"],
a = DataTable(City = ["Amsterdam", "London", "London", "New York", "New York"],
Job = ["Lawyer", "Lawyer", "Lawyer", "Doctor", "Doctor"],
Category = [1, 2, 3, 4, 5])
b = DataTable(Location = ["Amsterdam", "London", "London", "New York", "New York"],
Work = ["Lawyer", "Lawyer", "Lawyer", "Doctor", "Doctor"],
b = DataTable(Location = ["Amsterdam", "London", "London", "New York", "New York"],
Work = ["Lawyer", "Lawyer", "Lawyer", "Doctor", "Doctor"],
Name = ["a", "b", "c", "d", "e"])
rename!(b, [:Location => :City, :Work => :Job])
join(a, b, on = [:City, :Job])
Expand Down
1 change: 1 addition & 0 deletions src/DataTables.jl
Original file line number Diff line number Diff line change
Expand Up @@ -104,6 +104,7 @@ for (dir, filename) in [
("subdatatable", "subdatatable.jl"),
("groupeddatatable", "grouping.jl"),
("datatablerow", "datatablerow.jl"),
("datatablerow", "utils.jl"),

("abstractdatatable", "iteration.jl"),
("abstractdatatable", "join.jl"),
Expand Down
17 changes: 7 additions & 10 deletions src/abstractdatatable/abstractdatatable.jl
Original file line number Diff line number Diff line change
Expand Up @@ -602,17 +602,14 @@ nonunique(dt, 1)
"""
function nonunique(dt::AbstractDataTable)
res = fill(false, nrow(dt))
rows = Set{DataTableRow}()
for i in 1:nrow(dt)
arow = DataTableRow(dt, i)
if in(arow, rows)
res[i] = true
else
push!(rows, arow)
end
gslots = row_group_slots(dt)[3]
# unique rows are the first encountered group representatives,
# nonunique are everything else
res = fill(true, nrow(dt))
@inbounds for g_row in gslots
(g_row > 0) && (res[g_row] = false)
end
res
return res
end

nonunique(dt::AbstractDataTable, cols::Union{Real, Symbol}) = nonunique(dt[[cols]])
Expand Down

0 comments on commit 20c71d6

Please sign in to comment.