Merge pull request #817 from alyst/enhance_nonunique

Enhance nonunique()
JuliaData · Jul 3, 2015 · 181ac2c · 181ac2c
2 parents aea2c8b + 0dc552d
commit 181ac2c
Show file tree

Hide file tree

Showing 5 changed files with 91 additions and 7 deletions.
diff --git a/src/abstractdataframe/abstractdataframe.jl b/src/abstractdataframe/abstractdataframe.jl
@@ -559,7 +559,7 @@ nonunique(df::AbstractDataFrame)
 
 ### Result
 
-* `::Vector{Bool}` : indicates whether the row is a duplicate of a
+* `::Vector{Bool}` : indicates whether the row is a duplicate of some
   prior row
 
 See also `unique` and `unique!`.
@@ -574,16 +574,14 @@ nonunique(df)
 
 """
 function nonunique(df::AbstractDataFrame)
-    # Return a Vector{Bool} indicated whether the row is a duplicate
-    # of a prior row.
     res = fill(false, nrow(df))
-    di = Dict()
+    rows = Set{DataFrameRow}()
     for i in 1:nrow(df)
-        arow = convert(Array, df[i, :]) # Used to convert to Any type
-        if haskey(di, arow)
+        arow = DataFrameRow(df, i)
+        if in(arow, rows)
             res[i] = true
         else
-            di[arow] = 1
+            push!(rows, arow)
         end
     end
     res

diff --git a/src/dataframerow/dataframerow.jl b/src/dataframerow/dataframerow.jl
@@ -36,3 +36,50 @@ Base.next(r::DataFrameRow, s) = ((_names(r)[s], r[s]), s + 1)
 Base.done(r::DataFrameRow, s) = s > length(r)
 
 Base.convert(::Type{Array}, r::DataFrameRow) = convert(Array, r.df[r.row,:])
+
+# hash a DataFrame row based on the values it contains,
+# so that duplicate rows have the same hash
+function Base.hash(r::DataFrameRow, h::UInt)
+    for col in columns(r.df)
+        if isna(col, r.row)
+            h = hash(false, h)
+        else
+            h = hash(true, hash(col[r.row], h))
+        end
+    end
+    return h
+end
+
+# compare two elements in the array
+_isequalelms(a::Array, i::Int, j::Int) = isequal(a[i], a[j])
+
+# compare the two elements in the data array
+function _isequalelms(a::DataArray, i::Int, j::Int)
+    if isna(a, i)
+        return isna(a, j)
+    else
+        return !isna(a, j) && isequal(a.data[i], a.data[j])
+    end
+end
+
+# compare two elements in the pooled array by comparing their pool references
+# NOTE: this assumes there are no duplicated elements in the pool
+_isequalelms(a::PooledDataArray, i::Int, j::Int) = isequal(a.refs[i], a.refs[j])
+
+# comparison of DataFrame rows
+# only rows of the same DataFrame can be compared (otherwise an ArgumentError is thrown)
+# rows are equal if they have the same values (the row indices may differ)
+function Base.isequal(r1::DataFrameRow, r2::DataFrameRow)
+    if r1.df !== r2.df
+        throw(ArgumentError("Comparing rows from different frames not supported"))
+    end
+    if r1.row == r2.row
+        return true
+    end
+    for col in columns(r1.df)
+        if !_isequalelms(col, r1.row, r2.row)
+            return false
+        end
+    end
+    return true
+end
diff --git a/test/dataframerow.jl b/test/dataframerow.jl
@@ -0,0 +1,28 @@
+module TestDataFrameRow
+    using Base.Test
+    using DataFrames, Compat
+
+    df = DataFrame(a=@data([1,   2,   3,   1,   2,   2 ]),
+                   b=@data([2.0, NA,  1.2, 2.0, NA,  NA]),
+                   c=@data(["A", "B", "C", "A", "B", NA]),
+                   d=PooledDataArray(
+                     @data([:A,  NA,  :C,  :A,  NA,  :C])))
+    df2 = DataFrame(a = @data([1, 2, 3]))
+
+    #
+    # Equality
+    #
+    @test_throws ArgumentError isequal(DataFrameRow(df, 1), DataFrameRow(df2, 1))
+    @test !isequal(DataFrameRow(df, 1), DataFrameRow(df, 2))
+    @test !isequal(DataFrameRow(df, 1), DataFrameRow(df, 3))
+    @test isequal(DataFrameRow(df, 1), DataFrameRow(df, 4))
+    @test isequal(DataFrameRow(df, 2), DataFrameRow(df, 5))
+    @test !isequal(DataFrameRow(df, 2), DataFrameRow(df, 6))
+
+    # hashing
+    @test !isequal(hash(DataFrameRow(df, 1)), hash(DataFrameRow(df, 2)))
+    @test !isequal(hash(DataFrameRow(df, 1)), hash(DataFrameRow(df, 3)))
+    @test isequal(hash(DataFrameRow(df, 1)), hash(DataFrameRow(df, 4)))
+    @test isequal(hash(DataFrameRow(df, 2)), hash(DataFrameRow(df, 5)))
+    @test !isequal(hash(DataFrameRow(df, 2)), hash(DataFrameRow(df, 6)))
+end
diff --git a/test/duplicates.jl b/test/duplicates.jl
@@ -8,4 +8,14 @@ module TestDuplicates
     @test isequal(udf, unique(df))
     unique!(df)
     @test isequal(df, udf)
+
+    pdf = DataFrame( a = PooledDataArray( @data ["a", "a", NA, NA, "b", NA, "a", NA] ),
+                     b = PooledDataArray( @data ["a", "b", NA, NA, "b", "a", "a", "a"] ) )
+    updf = DataFrame( a = PooledDataArray( @data ["a", "a", NA, "b", NA] ),
+                      b = PooledDataArray( @data ["a", "b", NA, "b", "a"] ) )
+    @test isequal(nonunique(pdf), [false, false, false, true, false, false, true, true])
+    @test isequal(nonunique(updf), falses(5) )
+    @test isequal(updf, unique(pdf))
+    unique!(pdf)
+    @test isequal(pdf, updf)
 end
diff --git a/test/runtests.jl b/test/runtests.jl
@@ -14,6 +14,7 @@ my_tests = ["utils.jl",
             "data.jl",
             "index.jl",
             "dataframe.jl",
+            "dataframerow.jl",
             "io.jl",
             "formula.jl",
             "constructors.jl",