Enhance joining and grouping (#17)

Joining and grouping now use a hashing approach rather than converting all columns to categorical arrays. This is based on work by @alyst in DataFrames.
JuliaData · Mar 6, 2017 · 20c71d6 · 20c71d6
1 parent 9902e6e
commit 20c71d6
Show file tree

Hide file tree

Showing 20 changed files with 640 additions and 453 deletions.
diff --git a/REQUIRE b/REQUIRE
@@ -1,6 +1,6 @@
 julia 0.5
 NullableArrays 0.1.0
-CategoricalArrays 0.0.6
+CategoricalArrays 0.1.2
 StatsBase 0.11.0
 GZip
 SortingAlgorithms

diff --git a/docs/src/man/joins.md b/docs/src/man/joins.md
@@ -51,7 +51,7 @@ Cross joins are the only kind of join that does not use a key:
 join(a, b, kind = :cross)
 ```
 
-In order to join data frames on keys which have different names, you must first rename them so that they match. This can be done using rename!:
+In order to join data tables on keys which have different names, you must first rename them so that they match. This can be done using rename!:
 
 ```julia
 a = DataTable(ID = [1, 2], Name = ["A", "B"])
@@ -63,11 +63,11 @@ join(a, b, on = :ID, kind = :inner)
 Or renaming multiple columns at a time:
 
 ```julia
-a = DataTable(City = ["Amsterdam", "London", "London", "New York", "New York"], 
-              Job = ["Lawyer", "Lawyer", "Lawyer", "Doctor", "Doctor"], 
+a = DataTable(City = ["Amsterdam", "London", "London", "New York", "New York"],
+              Job = ["Lawyer", "Lawyer", "Lawyer", "Doctor", "Doctor"],
               Category = [1, 2, 3, 4, 5])
-b = DataTable(Location = ["Amsterdam", "London", "London", "New York", "New York"], 
-              Work = ["Lawyer", "Lawyer", "Lawyer", "Doctor", "Doctor"], 
+b = DataTable(Location = ["Amsterdam", "London", "London", "New York", "New York"],
+              Work = ["Lawyer", "Lawyer", "Lawyer", "Doctor", "Doctor"],
               Name = ["a", "b", "c", "d", "e"])
 rename!(b, [:Location => :City, :Work => :Job])
 join(a, b, on = [:City, :Job])

diff --git a/src/DataTables.jl b/src/DataTables.jl
@@ -104,6 +104,7 @@ for (dir, filename) in [
         ("subdatatable", "subdatatable.jl"),
         ("groupeddatatable", "grouping.jl"),
         ("datatablerow", "datatablerow.jl"),
+        ("datatablerow", "utils.jl"),
 
         ("abstractdatatable", "iteration.jl"),
         ("abstractdatatable", "join.jl"),

diff --git a/src/abstractdatatable/abstractdatatable.jl b/src/abstractdatatable/abstractdatatable.jl
@@ -602,17 +602,14 @@ nonunique(dt, 1)
 
 """
 function nonunique(dt::AbstractDataTable)
-    res = fill(false, nrow(dt))
-    rows = Set{DataTableRow}()
-    for i in 1:nrow(dt)
-        arow = DataTableRow(dt, i)
-        if in(arow, rows)
-            res[i] = true
-        else
-            push!(rows, arow)
-        end
+    gslots = row_group_slots(dt)[3]
+    # unique rows are the first encountered group representatives,
+    # nonunique are everything else
+    res = fill(true, nrow(dt))
+    @inbounds for g_row in gslots
+        (g_row > 0) && (res[g_row] = false)
     end
-    res
+    return res
 end
 
 nonunique(dt::AbstractDataTable, cols::Union{Real, Symbol}) = nonunique(dt[[cols]])