Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Override getproperty, setproperty! and propertynames for AbstractDataFrame and DataFrameRow #1406

Merged
merged 1 commit into from
May 31, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions src/abstractdataframe/abstractdataframe.jl
Original file line number Diff line number Diff line change
Expand Up @@ -236,6 +236,13 @@ Compat.axes(df, i) = axes(df)[i]

Base.ndims(::AbstractDataFrame) = 2

if VERSION >= v"0.7.0-DEV.3067"
Base.getproperty(df::AbstractDataFrame, col_ind::Symbol) = getindex(df, col_ind)
Base.setproperty!(df::AbstractDataFrame, col_ind::Symbol, x) = setindex!(df, x, col_ind)
# Private fields are never exposed since they can conflict with column names
Base.propertynames(df::AbstractDataFrame, private::Bool=false) = names(df)
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm not sure what's the best solution here. This situation is going to arise for other types where field names are dynamic. We could also throw an error when private == true.

end

##############################################################################
##
## Similar
Expand Down
12 changes: 5 additions & 7 deletions src/abstractdataframe/io.jl
Original file line number Diff line number Diff line change
Expand Up @@ -218,13 +218,11 @@ struct DataFrameStream{T}
columns::T
header::Vector{String}
end
DataFrameStream(df::DataFrame) = DataFrameStream(Tuple(df.columns), string.(names(df)))
DataFrameStream(df::DataFrame) = DataFrameStream(Tuple(columns(df)), string.(names(df)))

# DataFrame Data.Source implementation
function Data.schema(df::DataFrame)
return Data.Schema(Type[eltype(A) for A in df.columns],
string.(names(df)), length(df) == 0 ? 0 : length(df.columns[1]))
end
Data.schema(df::DataFrame) =
Data.Schema(Type[eltype(A) for A in columns(df)], string.(names(df)), size(df, 1))

Data.isdone(source::DataFrame, row, col, rows, cols) = row > rows || col > cols
function Data.isdone(source::DataFrame, row, col)
Expand Down Expand Up @@ -283,7 +281,7 @@ function DataFrame(sch::Data.Schema{R}, ::Type{S}=Data.Field,
# to the # of rows in the source
newsize = ifelse(S == Data.Column || !R, 0,
ifelse(append, sinkrows + sch.rows, sch.rows))
foreach(col->resize!(col, newsize), sink.columns)
foreach(col->resize!(col, newsize), columns(sink))
sch.rows = newsize
end
# take care of a possible reference from source by adding to WeakRefStringArrays
Expand Down Expand Up @@ -322,7 +320,7 @@ DataFrame(sink, sch::Data.Schema, ::Type{S}, append::Bool;
row, col::Int, knownrows)
append!(sink.columns[col], column)
end

Data.close!(df::DataFrameStream) =
DataFrame(collect(Any, df.columns), Symbol.(df.header), makeunique=true)

56 changes: 28 additions & 28 deletions src/dataframe/dataframe.jl
Original file line number Diff line number Diff line change
Expand Up @@ -221,11 +221,11 @@ end
##
##############################################################################

index(df::DataFrame) = df.colindex
columns(df::DataFrame) = df.columns
index(df::DataFrame) = getfield(df, :colindex)
columns(df::DataFrame) = getfield(df, :columns)

# TODO: Remove these
nrow(df::DataFrame) = ncol(df) > 0 ? length(df.columns[1])::Int : 0
nrow(df::DataFrame) = ncol(df) > 0 ? length(columns(df)[1])::Int : 0
ncol(df::DataFrame) = length(index(df))

##############################################################################
Expand All @@ -247,7 +247,7 @@ ncol(df::DataFrame) = length(index(df))
#
# Let getindex(index(df), col_inds) from Index() handle the resolution
# of column indices
# Let getindex(df.columns[j], row_inds) from AbstractVector() handle
# Let getindex(columns(df)[j], row_inds) from AbstractVector() handle
# the resolution of row indices

# TODO: change Real to Integer in this union after deprecation period
Expand All @@ -256,13 +256,13 @@ const ColumnIndex = Union{Real, Symbol}
# df[SingleColumnIndex] => AbstractDataVector
function Base.getindex(df::DataFrame, col_ind::ColumnIndex)
selected_column = index(df)[col_ind]
return df.columns[selected_column]
return columns(df)[selected_column]
end

# df[MultiColumnIndex] => DataFrame
function Base.getindex(df::DataFrame, col_inds::AbstractVector)
selected_columns = index(df)[col_inds]
new_columns = df.columns[selected_columns]
new_columns = columns(df)[selected_columns]
return DataFrame(new_columns, Index(_names(df)[selected_columns]))
end

Expand All @@ -272,26 +272,26 @@ Base.getindex(df::DataFrame, col_inds::Colon) = copy(df)
# df[SingleRowIndex, SingleColumnIndex] => Scalar
function Base.getindex(df::DataFrame, row_ind::Real, col_ind::ColumnIndex)
selected_column = index(df)[col_ind]
return df.columns[selected_column][row_ind]
return columns(df)[selected_column][row_ind]
end

# df[SingleRowIndex, MultiColumnIndex] => DataFrame
function Base.getindex(df::DataFrame, row_ind::Real, col_inds::AbstractVector)
selected_columns = index(df)[col_inds]
new_columns = Any[dv[[row_ind]] for dv in df.columns[selected_columns]]
new_columns = Any[dv[[row_ind]] for dv in columns(df)[selected_columns]]
return DataFrame(new_columns, Index(_names(df)[selected_columns]))
end

# df[MultiRowIndex, SingleColumnIndex] => AbstractVector
function Base.getindex(df::DataFrame, row_inds::AbstractVector, col_ind::ColumnIndex)
selected_column = index(df)[col_ind]
return df.columns[selected_column][row_inds]
return columns(df)[selected_column][row_inds]
end

# df[MultiRowIndex, MultiColumnIndex] => DataFrame
function Base.getindex(df::DataFrame, row_inds::AbstractVector, col_inds::AbstractVector)
selected_columns = index(df)[col_inds]
new_columns = Any[dv[row_inds] for dv in df.columns[selected_columns]]
new_columns = Any[dv[row_inds] for dv in columns(df)[selected_columns]]
return DataFrame(new_columns, Index(_names(df)[selected_columns]))
end

Expand All @@ -304,7 +304,7 @@ Base.getindex(df::DataFrame, row_ind::Real, col_inds::Colon) = df[[row_ind], col

# df[MultiRowIndex, :] => DataFrame
function Base.getindex(df::DataFrame, row_inds::AbstractVector, col_inds::Colon)
new_columns = Any[dv[row_inds] for dv in df.columns]
new_columns = Any[dv[row_inds] for dv in columns(df)]
return DataFrame(new_columns, copy(index(df)))
end

Expand Down Expand Up @@ -339,15 +339,15 @@ function insert_single_column!(df::DataFrame,
dv = isa(v, AbstractRange) ? collect(v) : v
if haskey(index(df), col_ind)
j = index(df)[col_ind]
df.columns[j] = dv
columns(df)[j] = dv
else
if typeof(col_ind) <: Symbol
push!(index(df), col_ind)
push!(df.columns, dv)
push!(columns(df), dv)
else
if ncol(df) + 1 == Int(col_ind)
push!(index(df), nextcolname(df))
push!(df.columns, dv)
push!(columns(df), dv)
else
throw(ArgumentError("Cannot assign to non-existent column: $col_ind"))
end
Expand All @@ -358,7 +358,7 @@ end

function insert_single_entry!(df::DataFrame, v::Any, row_ind::Real, col_ind::ColumnIndex)
if haskey(index(df), col_ind)
df.columns[index(df)[col_ind]][row_ind] = v
columns(df)[index(df)[col_ind]][row_ind] = v
return v
else
error("Cannot assign to non-existent column: $col_ind")
Expand All @@ -370,7 +370,7 @@ function insert_multiple_entries!(df::DataFrame,
row_inds::AbstractVector{<:Real},
col_ind::ColumnIndex)
if haskey(index(df), col_ind)
df.columns[index(df)[col_ind]][row_inds] = v
columns(df)[index(df)[col_ind]][row_inds] = v
return v
else
error("Cannot assign to non-existent column: $col_ind")
Expand Down Expand Up @@ -604,8 +604,8 @@ function Base.setindex!(df::DataFrame,
new_df::DataFrame,
row_inds::Colon,
col_inds::Colon=Colon())
df.columns = copy(new_df.columns)
df.colindex = copy(new_df.colindex)
setfield!(df, :columns, copy(columns(new_df)))
setfield!(df, :colindex, copy(index(new_df)))
df
end

Expand All @@ -630,7 +630,7 @@ Base.setindex!(df::DataFrame, x::Nothing, col_ind::Int) = delete!(df, col_ind)
##
##############################################################################

Base.empty!(df::DataFrame) = (empty!(df.columns); empty!(index(df)); df)
Base.empty!(df::DataFrame) = (empty!(columns(df)); empty!(index(df)); df)

"""
Insert a column into a data frame in place.
Expand Down Expand Up @@ -708,7 +708,7 @@ function Base.insert!(df::DataFrame, col_ind::Int, item::AbstractVector, name::S
end
end
insert!(index(df), col_ind, name)
insert!(df.columns, col_ind, item)
insert!(columns(df), col_ind, item)
df
end

Expand Down Expand Up @@ -784,7 +784,7 @@ end
function Base.delete!(df::DataFrame, inds::Vector{Int})
for ind in sort(inds, rev = true)
if 1 <= ind <= ncol(df)
splice!(df.columns, ind)
splice!(columns(df), ind)
delete!(index(df), ind)
else
throw(ArgumentError("Can't delete a non-existent DataFrame column"))
Expand All @@ -798,7 +798,7 @@ Base.delete!(df::DataFrame, c::Any) = delete!(df, index(df)[c])
# deleterows!()
function deleterows!(df::DataFrame, ind::Union{Integer, UnitRange{Int}})
for i in 1:ncol(df)
df.columns[i] = deleteat!(df.columns[i], ind)
columns(df)[i] = deleteat!(columns(df)[i], ind)
end
df
end
Expand All @@ -824,7 +824,7 @@ function deleterows!(df::DataFrame, ind::AbstractVector{Int})
keep[ikeep:end] = idf:n

for i in 1:ncol(df)
df.columns[i] = df.columns[i][keep]
columns(df)[i] = columns(df)[i][keep]
end
df
end
Expand Down Expand Up @@ -1017,18 +1017,18 @@ end

# array and tuple like collections
function Base.push!(df::DataFrame, iterable::Any)
if length(iterable) != length(df.columns)
if length(iterable) != size(df, 2)
msg = "Length of iterable does not match DataFrame column count."
throw(ArgumentError(msg))
end
i = 1
for t in iterable
try
push!(df.columns[i], t)
push!(columns(df)[i], t)
catch
#clean up partial row
for j in 1:(i - 1)
pop!(df.columns[j])
pop!(columns(df)[j])
end
msg = "Error adding $t to column :$(_names(df)[i]). Possible type mis-match."
throw(ArgumentError(msg))
Expand Down Expand Up @@ -1094,9 +1094,9 @@ function permutecols!(df::DataFrame, p::AbstractVector)
throw(ArgumentError("$p is not a valid column permutation for this DataFrame"))
end
permute!(columns(df), p)
df.colindex = Index(names(df)[p])
setfield!(df, :colindex, Index(names(df)[p]))
end

function permutecols!(df::DataFrame, p::AbstractVector{Symbol})
permutecols!(df, getindex.(df.colindex.lookup, p))
permutecols!(df, getindex.(index(df).lookup, p))
end
48 changes: 32 additions & 16 deletions src/dataframerow/dataframerow.jl
Original file line number Diff line number Diff line change
Expand Up @@ -4,28 +4,44 @@ struct DataFrameRow{T <: AbstractDataFrame}
row::Int
end


"""
    parent(r::DataFrameRow)

Return the parent data frame of `r`.

The `df` field is read with `getfield` instead of `r.df`, so the access does not
go through `getproperty`.
"""
Base.parent(r::DataFrameRow) = getfield(r, :df)
# Return the row index stored in `r` (the position of this row in its parent).
# Read with `getfield` instead of `r.row`, so the access does not go through
# `getproperty`.
row(r::DataFrameRow) = getfield(r, :row)

function Base.getindex(r::DataFrameRow, idx::AbstractArray)
return DataFrameRow(r.df[idx], r.row)
return DataFrameRow(parent(r)[idx], row(r))
end

function Base.getindex(r::DataFrameRow, idx::Any)
return r.df[r.row, idx]
return parent(r)[row(r), idx]
end

function Base.setindex!(r::DataFrameRow, value::Any, idx::Any)
return setindex!(r.df, value, r.row, idx)
return setindex!(parent(r), value, row(r), idx)
end

Base.names(r::DataFrameRow) = names(r.df)
_names(r::DataFrameRow) = _names(r.df)
Base.names(r::DataFrameRow) = names(parent(r))
_names(r::DataFrameRow) = _names(parent(r))

# Property access on a row is only defined on Julia builds at or after the
# version below; on those builds it is forwarded to indexing.
if VERSION >= v"0.7.0-DEV.3067"
# `r.idx` is equivalent to `r[idx]`
Base.getproperty(r::DataFrameRow, idx::Symbol) = getindex(r, idx)
# `r.idx = x` is equivalent to `r[idx] = x`
Base.setproperty!(r::DataFrameRow, idx::Symbol, x::Any) = setindex!(r, x, idx)
# Private fields are never exposed since they can conflict with column names
# (the `private` argument is accepted but not used; `names(r)` is always returned)
Base.propertynames(r::DataFrameRow, private::Bool=false) = names(r)
end

Base.view(r::DataFrameRow, c) = DataFrameRow(r.df[[c]], r.row)
Base.view(r::DataFrameRow, c) = DataFrameRow(parent(r)[[c]], row(r))

index(r::DataFrameRow) = index(r.df)
index(r::DataFrameRow) = index(parent(r))

Base.length(r::DataFrameRow) = size(r.df, 2)
Base.length(r::DataFrameRow) = size(parent(r), 2)

Compat.lastindex(r::DataFrameRow) = size(r.df, 2)
Compat.lastindex(r::DataFrameRow) = size(parent(r), 2)

Base.collect(r::DataFrameRow) = Tuple{Symbol, Any}[x for x in r]

Expand All @@ -35,7 +51,7 @@ Base.next(r::DataFrameRow, s) = ((_names(r)[s], r[s]), s + 1)

Base.done(r::DataFrameRow, s) = s > length(r)

Base.convert(::Type{Array}, r::DataFrameRow) = convert(Array, r.df[r.row,:])
Base.convert(::Type{Array}, r::DataFrameRow) = convert(Array, parent(r)[row(r),:])

# hash column element
Base.@propagate_inbounds hash_colel(v::AbstractArray, i, h::UInt = zero(UInt)) = hash(v[i], h)
Expand All @@ -57,7 +73,7 @@ function rowhash(cols::Tuple{Vararg{AbstractVector}}, r::Int, h::UInt = zero(UIn
end

Base.hash(r::DataFrameRow, h::UInt = zero(UInt)) =
rowhash(ntuple(i -> r.df[i], ncol(r.df)), r.row, h)
rowhash(ntuple(i -> parent(r)[i], ncol(parent(r))), row(r), h)

# comparison of DataFrame rows
# only the rows of the same DataFrame could be compared
Expand All @@ -66,7 +82,7 @@ Base.hash(r::DataFrameRow, h::UInt = zero(UInt)) =
Base.:(==)(r1::DataFrameRow, r2::DataFrameRow) = isequal(r1, r2)

function Base.isequal(r1::DataFrameRow, r2::DataFrameRow)
isequal_row(r1.df, r1.row, r2.df, r2.row)
isequal_row(parent(r1), row(r1), parent(r2), row(r2))
end

# internal method for comparing the elements of the same data table column
Expand Down Expand Up @@ -102,11 +118,11 @@ end

# lexicographic ordering on DataFrame rows, missing > !missing
function Base.isless(r1::DataFrameRow, r2::DataFrameRow)
(ncol(r1.df) == ncol(r2.df)) ||
(ncol(parent(r1)) == ncol(parent(r2))) ||
throw(ArgumentError("Rows of the data tables that have different number of columns cannot be compared ($(ncol(df1)) and $(ncol(df2)))"))
@inbounds for i in 1:ncol(r1.df)
if !isequal(r1.df[i][r1.row], r2.df[i][r2.row])
return isless(r1.df[i][r1.row], r2.df[i][r2.row])
@inbounds for i in 1:ncol(parent(r1))
if !isequal(parent(r1)[i][row(r1)], parent(r2)[i][row(r2)])
return isless(parent(r1)[i][row(r1)], parent(r2)[i][row(r2)])
end
end
return false
Expand Down
2 changes: 1 addition & 1 deletion src/dataframerow/show.jl
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
#' end
function Base.show(io::IO, r::DataFrameRow)
labelwidth = mapreduce(n -> length(string(n)), max, _names(r)) + 2
@printf(io, "DataFrameRow (row %d)\n", r.row)
@printf(io, "DataFrameRow (row %d)\n", row(r))
for (label, value) in r
println(io, rpad(label, labelwidth, ' '), value)
end
Expand Down
4 changes: 2 additions & 2 deletions src/dataframerow/utils.jl
Original file line number Diff line number Diff line change
Expand Up @@ -212,8 +212,8 @@ function findrows(gd::RowGroupDict,
end

function Base.getindex(gd::RowGroupDict, dfr::DataFrameRow)
g_row = findrow(gd, dfr.df, ntuple(i -> gd.df[i], ncol(gd.df)),
ntuple(i -> dfr.df[i], ncol(dfr.df)), dfr.row)
g_row = findrow(gd, parent(dfr), ntuple(i -> gd.df[i], ncol(gd.df)),
ntuple(i -> parent(dfr)[i], ncol(parent(dfr))), row(dfr))
(g_row == 0) && throw(KeyError(dfr))
gix = gd.groups[g_row]
return view(gd.rperm, gd.starts[gix]:gd.stops[gix])
Expand Down
Loading