Skip to content

Commit

Permalink
Merge pull request #817 from alyst/enhance_nonunique
Browse files Browse the repository at this point in the history
Enhance nonunique()
  • Loading branch information
alyst committed Jul 3, 2015
2 parents aea2c8b + 0dc552d commit 181ac2c
Show file tree
Hide file tree
Showing 5 changed files with 91 additions and 7 deletions.
12 changes: 5 additions & 7 deletions src/abstractdataframe/abstractdataframe.jl
Original file line number Diff line number Diff line change
Expand Up @@ -559,7 +559,7 @@ nonunique(df::AbstractDataFrame)
### Result
* `::Vector{Bool}` : indicates whether the row is a duplicate of a
* `::Vector{Bool}` : indicates whether the row is a duplicate of some
prior row
See also `unique` and `unique!`.
Expand All @@ -574,16 +574,14 @@ nonunique(df)
"""
function nonunique(df::AbstractDataFrame)
# Return a Vector{Bool} indicated whether the row is a duplicate
# of a prior row.
res = fill(false, nrow(df))
di = Dict()
rows = Set{DataFrameRow}()
for i in 1:nrow(df)
arow = convert(Array, df[i, :]) # Used to convert to Any type
if haskey(di, arow)
arow = DataFrameRow(df, i)
if in(arow, rows)
res[i] = true
else
di[arow] = 1
push!(rows, arow)
end
end
res
Expand Down
47 changes: 47 additions & 0 deletions src/dataframerow/dataframerow.jl
Original file line number Diff line number Diff line change
Expand Up @@ -36,3 +36,50 @@ Base.next(r::DataFrameRow, s) = ((_names(r)[s], r[s]), s + 1)
Base.done(r::DataFrameRow, s) = s > length(r)

Base.convert(::Type{Array}, r::DataFrameRow) = convert(Array, r.df[r.row,:])

# Content-based hash of a DataFrame row: the hash is built from the row's
# values (not from its index), so rows holding equal values hash equally.
# Each column mixes in a presence flag: an NA cell contributes only `false`,
# while a non-NA cell contributes its value followed by `true`.
function Base.hash(r::DataFrameRow, h::UInt)
    rowidx = r.row
    for column in columns(r.df)
        h = isna(column, rowidx) ?
            hash(false, h) :
            hash(true, hash(column[rowidx], h))
    end
    return h
end

# Return `true` when the elements at positions `i` and `j` of the plain array
# `a` are equal in the sense of `isequal`.
function _isequalelms(a::Array, i::Int, j::Int)
    return isequal(a[i], a[j])
end

# Return `true` when positions `i` and `j` of the data array `a` match:
# either both are NA, or neither is NA and the stored values are `isequal`.
function _isequalelms(a::DataArray, i::Int, j::Int)
    na_i = isna(a, i)
    na_j = isna(a, j)
    # if at least one side is NA, the elements match only when both are NA
    (na_i || na_j) && return na_i && na_j
    return isequal(a.data[i], a.data[j])
end

# Return `true` when positions `i` and `j` of the pooled array `a` carry the
# same pool reference; the pooled values themselves are never looked up.
# NOTE: this is only correct under the assumption that the pool contains no
# duplicated elements.
function _isequalelms(a::PooledDataArray, i::Int, j::Int)
    return isequal(a.refs[i], a.refs[j])
end

# Equality of two DataFrame rows.
# Both rows must belong to the very same DataFrame object; otherwise an
# ArgumentError is thrown. Rows are equal when every column holds equal
# values at the two row positions (the row indices themselves may differ).
function Base.isequal(r1::DataFrameRow, r2::DataFrameRow)
    r1.df === r2.df ||
        throw(ArgumentError("Comparing rows from different frames not supported"))
    i, j = r1.row, r2.row
    # same position in the same frame: equal without inspecting the columns
    i == j && return true
    for column in columns(r1.df)
        _isequalelms(column, i, j) || return false
    end
    return true
end
28 changes: 28 additions & 0 deletions test/dataframerow.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
module TestDataFrameRow
    using Base.Test
    using DataFrames, Compat

    # Fixture: rows 1 and 4 hold identical values, as do rows 2 and 5;
    # rows 3 and 6 differ from every other row.
    df = DataFrame(a=@data([1,   2,   3,   1,   2,   2 ]),
                   b=@data([2.0, NA,  1.2, 2.0, NA,  NA]),
                   c=@data(["A", "B", "C", "A", "B", NA]),
                   d=PooledDataArray(
                     @data([:A,  NA,  :C,  :A,  NA,  :C])))
    df2 = DataFrame(a = @data([1, 2, 3]))

    # shorthand for the i-th row of the fixture frame
    dfrow(i) = DataFrameRow(df, i)

    #
    # Equality
    #
    # rows taken from different frames cannot be compared
    @test_throws ArgumentError isequal(dfrow(1), DataFrameRow(df2, 1))
    @test !isequal(dfrow(1), dfrow(2))
    @test !isequal(dfrow(1), dfrow(3))
    @test isequal(dfrow(1), dfrow(4))
    @test isequal(dfrow(2), dfrow(5))
    @test !isequal(dfrow(2), dfrow(6))

    #
    # Hashing
    #
    @test !isequal(hash(dfrow(1)), hash(dfrow(2)))
    @test !isequal(hash(dfrow(1)), hash(dfrow(3)))
    @test isequal(hash(dfrow(1)), hash(dfrow(4)))
    @test isequal(hash(dfrow(2)), hash(dfrow(5)))
    @test !isequal(hash(dfrow(2)), hash(dfrow(6)))
end
10 changes: 10 additions & 0 deletions test/duplicates.jl
Original file line number Diff line number Diff line change
Expand Up @@ -8,4 +8,14 @@ module TestDuplicates
@test isequal(udf, unique(df))
unique!(df)
@test isequal(df, udf)

pdf = DataFrame( a = PooledDataArray( @data ["a", "a", NA, NA, "b", NA, "a", NA] ),
b = PooledDataArray( @data ["a", "b", NA, NA, "b", "a", "a", "a"] ) )
updf = DataFrame( a = PooledDataArray( @data ["a", "a", NA, "b", NA] ),
b = PooledDataArray( @data ["a", "b", NA, "b", "a"] ) )
@test isequal(nonunique(pdf), [false, false, false, true, false, false, true, true])
@test isequal(nonunique(updf), falses(5) )
@test isequal(updf, unique(pdf))
unique!(pdf)
@test isequal(pdf, updf)
end
1 change: 1 addition & 0 deletions test/runtests.jl
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ my_tests = ["utils.jl",
"data.jl",
"index.jl",
"dataframe.jl",
"dataframerow.jl",
"io.jl",
"formula.jl",
"constructors.jl",
Expand Down

0 comments on commit 181ac2c

Please sign in to comment.