Skip to content

Commit

Permalink
Make describe more type-agnostic (#1418)
Browse files Browse the repository at this point in the history
  • Loading branch information
pdeffebach authored and nalimilan committed Jun 15, 2018
1 parent 30fe634 commit 86ee09b
Show file tree
Hide file tree
Showing 5 changed files with 92 additions and 318 deletions.
139 changes: 72 additions & 67 deletions src/abstractdataframe/abstractdataframe.jl
Original file line number Diff line number Diff line change
Expand Up @@ -380,57 +380,70 @@ end
Report descriptive statistics for a data frame
```julia
describe(df::AbstractDataFrame; stats = [:mean, :min, :median, :max, :nmissing, :datatype])
describe(io, df::AbstractDataFrame; stats = [:mean, :min, :median, :max, :nmissing, :datatype])
describe(df::AbstractDataFrame; stats = [:mean, :min, :median, :max, :nmissing, :nunique, :eltype])
```
**Arguments**
* `df` : the AbstractDataFrame
* `io` : optional output descriptor
* `stats::AbstractVector{Symbol}`: the summary statistics to report. Allowed
fields are `:mean`, `:std`, `:min`, `:q25`, `:median`, `:q75`, `:max`, `:eltype`,
`:nunique`, and `:nmissing`
* `stats::Union{Symbol,AbstractVector{Symbol}}` : the summary statistics to report. If
a vector, allowed fields are `:mean`, `:std`, `:min`, `:q25`, `:median`,
`:q75`, `:max`, `:eltype`, `:nunique`, `:first`, `:last`, and `:nmissing`. If set to
`:all`, all summary statistics are reported.
**Result**
* A `DataFrame` where each row represents a variable and each column a summary statistic.
**Details**
If the column's base type derives from `Number`, compute the mean, standard
deviation, minimum, first quantile, median, third quantile, and maximum. If
a column is not numeric, these statistics are populated with `nothing`s.
For `Real` columns, compute the mean, standard deviation, minimum, first quantile, median,
third quantile, and maximum. If a column does not derive from `Real`, `describe` will
attempt to calculate all statistics, using `nothing` as a fall-back in the case of an error.
When `stats` contains `:nunique`, `describe` will report the
number of unique values in a column. If a column's base type derives from `Number`,
number of unique values in a column. If a column's base type derives from `Real`,
`:nunique` will return `nothing`s.
Missing values are filtered out when calculating all statistics; however, the column
`:nmissing` reports the number of missing values of that variable.
If the column does not allow missing values, `nothing` is returned.
Consequently, `nmissing = 0` indicates that the column allows
missing values, but does not currently contain any.
missing values, but does not currently contain any.
**Examples**
```julia
df = DataFrame(i = 1:10, x = rand(10), y = rand(["a", "b", "c"], 10))
describe(df)
describe(df, stats = :all)
describe(df, stats = [:min, :max])
```
"""
StatsBase.describe(df::AbstractDataFrame; kwargs...) = describe(stdout, df; kwargs...)
function StatsBase.describe(io::IO, df::AbstractDataFrame; stats::AbstractVector{Symbol} =
[:mean, :min, :median, :max, :nmissing, :eltype])
function StatsBase.describe(df::AbstractDataFrame; stats::Union{Symbol,AbstractVector{Symbol}} =
[:mean, :min, :median, :max, :nunique, :nmissing, :eltype])
# Check that people don't specify the wrong fields.
allowed_fields = [:mean, :std, :min, :q25, :median, :q75,
:max, :nunique, :nmissing, :eltype]
if !issubset(stats, allowed_fields)
:max, :nunique, :nmissing, :first, :last, :eltype]
if stats == :all
stats = allowed_fields
end

if stats isa Symbol
if !(stats in allowed_fields)
allowed_msg = "\nAllowed fields are: :" * join(allowed_fields, ", :")
throw(ArgumentError(":$stats not allowed." * allowed_msg))
else
stats = [stats]
end
end

if !issubset(stats, allowed_fields)
disallowed_fields = setdiff(stats, allowed_fields)
not_allowed = "Field(s) not allowed: $disallowed_fields. "
allowed = "Allowed fields are: $allowed_fields."
throw(ArgumentError(not_allowed * allowed))
allowed_msg = "\nAllowed fields are: :" * join(allowed_fields, ", :")
not_allowed = "Field(s) not allowed: :" * join(disallowed_fields, ", :") * "."
throw(ArgumentError(not_allowed * allowed_msg))
end


Expand All @@ -448,68 +461,60 @@ function StatsBase.describe(io::IO, df::AbstractDataFrame; stats::AbstractVector
return data
end

# Define 4 functions for getting summary statistics
# Define functions for getting summary statistics
# Use a Dict because we don't know which measures the user wants
# Outside of the `describe` function due to something with 0.7
# Summary statistics for a column whose element type is a subtype of `Real`
# (so it cannot hold `missing`). Returns a Dict keyed by statistic name;
# `:nmissing` and `:nunique` are not computed here and are set to `nothing`.
function get_stats(col::AbstractArray{<:Real})
    s = summarystats(col)
    # Reuse the mean already computed by `summarystats` for the standard deviation.
    sd = Compat.std(col, mean = s.mean)
    return Dict(
        :mean => s.mean,
        :std => sd,
        :min => s.min,
        :q25 => s.q25,
        :median => s.median,
        :q75 => s.q75,
        :max => s.max,
        :nmissing => nothing,
        :nunique => nothing,
        :eltype => eltype(col)
    )
end

function get_stats(col::AbstractArray{<:Union{Real, Missing}})
function get_stats(col::AbstractArray{>:Missing})
nomissing = collect(skipmissing(col))
sumstats = summarystats(nomissing)
Dict(
:mean => sumstats.mean,
:std => Compat.std(nomissing, mean = sumstats.mean),
:min => sumstats.min,
:q25 => sumstats.q25,
:median => sumstats.median,
:q75 => sumstats.q75,
:max => sumstats.max,
:nmissing => count(ismissing, col),
:nunique => nothing,
:eltype => Missings.T(eltype(col))
)
end

q = try quantile(nomissing, [.25, .5, .75]) catch [nothing, nothing, nothing] end
ex = try extrema(nomissing) catch (nothing, nothing) end
m = try mean(nomissing) catch end
if eltype(nomissing) <: Real
u = nothing
else
u = try length(unique(nomissing)) catch end
end

function get_stats(col::AbstractArray{>:Missing})
Dict(
:mean => nothing,
:std => nothing,
:min => nothing,
:q25 => nothing,
:median => nothing,
:q75 => nothing,
:max => nothing,
:mean => m,
:std => try Compat.std(nomissing, mean = m) catch end,
:min => ex[1],
:q25 => q[1],
:median => q[2],
:q75 => q[3],
:max => ex[2],
:nmissing => count(ismissing, col),
:nunique => length(unique(col)),
:nunique => u,
:first => first(col),
:last => last(col),
:eltype => Missings.T(eltype(col))
)
end

# Fallback method: summary statistics for a column of any element type.
#
# Each statistic is attempted independently and falls back to `nothing` when
# the underlying function throws for this column. Returns a Dict keyed by
# statistic name. `:nmissing` is always `nothing` in this method.
function get_stats(col)
    q = try
        quantile(col, [.25, .5, .75])
    catch
        [nothing, nothing, nothing]
    end
    ex = try
        extrema(col)
    catch
        (nothing, nothing)
    end
    m = try
        mean(col)
    catch
        nothing
    end
    sd = try
        Compat.std(col, mean = m)
    catch
        nothing
    end
    # The number of unique values is not reported for `Real` columns.
    u = if eltype(col) <: Real
        nothing
    else
        try
            length(unique(col))
        catch
            nothing
        end
    end
    # `first`/`last` throw on an empty column, so report `nothing` there instead.
    nonempty = !isempty(col)

    # Every key appears exactly once: a repeated key in a Dict literal silently
    # overwrites the earlier value.
    return Dict(
        :mean => m,
        :std => sd,
        :min => ex[1],
        :q25 => q[1],
        :median => q[2],
        :q75 => q[3],
        :max => ex[2],
        :nmissing => nothing,
        :nunique => u,
        :first => nonempty ? first(col) : nothing,
        :last => nonempty ? last(col) : nothing,
        :eltype => eltype(col)
    )
end
Expand Down
60 changes: 0 additions & 60 deletions src/abstractdataframe/show.jl
Original file line number Diff line number Diff line change
Expand Up @@ -547,63 +547,3 @@ function Base.showall(df::AbstractDataFrame,
showall(stdout, df, allcols)
return
end

#' @exported
#' @description
#'
#' Render a summary of the column names, column types and column missingness
#' count. The data frame's `summary` is printed first, followed by a table with
#' one row per column.
#'
#' @param io::IO The `io` to be rendered to.
#' @param df::AbstractDataFrame An AbstractDataFrame.
#' @param all::Bool If `false` (default), only a subset of columns
#'        fitting on the screen is printed.
#' @param values::Bool If `true` (default), the first and the last value of
#'        each column are printed (only the single value when `df` has one row;
#'        nothing when `df` has no rows).
#'
#' @returns `nothing` value.
#'
#' @examples
#'
#' df = DataFrame(A = 1:3, B = ["x", "y", "z"])
#' showcols(df)
function showcols(io::IO, df::AbstractDataFrame, all::Bool = false,
                  values::Bool = true) # -> Nothing
    print(io, summary(df))
    metadata = DataFrame(Name = _names(df),
                         Eltype = eltypes(df),
                         Missing = colmissing(df))
    nrows, ncols = size(df)
    if values && nrows > 0
        if nrows == 1
            metadata[:Values] = [sprint(ourshowcompact, df[1, i]) for i in 1:ncols]
        else
            # Put a visible separator between the first and last value; with an
            # empty separator the two values run together and cannot be told apart.
            metadata[:Values] = [sprint(ourshowcompact, df[1, i]) * "  …  " *
                                 sprint(ourshowcompact, df[end, i]) for i in 1:ncols]
        end
    end
    (all ? showall : show)(io, metadata, true, Symbol("Col #"), false)
    return
end

#' @exported
#' @description
#'
#' Convenience method of `showcols` that renders to `stdout`: a summary of the
#' column names, column types and column missingness count.
#'
#' @param df::AbstractDataFrame An AbstractDataFrame.
#' @param all::Bool If `false` (default), only a subset of columns
#'        fitting on the screen is printed.
#' @param values::Bool If `true` (default), the first and last value of
#'        each column are printed.
#'
#' @returns `nothing` value.
#'
#' @examples
#'
#' df = DataFrame(A = 1:3, B = ["x", "y", "z"])
#' showcols(df)
showcols(df::AbstractDataFrame, all::Bool=false, values::Bool=true) =
    showcols(stdout, df, all, values) # -> Nothing
2 changes: 2 additions & 0 deletions src/deprecated.jl
Original file line number Diff line number Diff line change
Expand Up @@ -1299,3 +1299,5 @@ end
import Base: vcat
@deprecate vcat(x::Vector{<:AbstractDataFrame}) vcat(x...)

# `showcols` is deprecated in favor of `describe` restricted to the `:eltype`,
# `:nmissing`, `:first` and `:last` statistics. The `values` argument is accepted
# for backward compatibility but is not used by either replacement expression.
@deprecate showcols(df::AbstractDataFrame, all::Bool=false, values::Bool=true) describe(df, stats = [:eltype, :nmissing, :first, :last])
# The `io` method shows the `describe` result on `io`, forwarding `all` to `show`.
@deprecate showcols(io::IO, df::AbstractDataFrame, all::Bool=false, values::Bool=true) show(io, describe(df, stats = [:eltype, :nmissing, :first, :last]), all)
30 changes: 17 additions & 13 deletions test/dataframe.jl
Original file line number Diff line number Diff line change
Expand Up @@ -332,43 +332,47 @@ module TestDataFrame
# Construct the test dataframe
df = DataFrame(number = [1, 2, 3, 4],
number_missing = [1,2, 3, missing],
non_number = ["a", "b", "c", "d"],
non_number_missing = ["a", "b", "c", missing],
string = ["a", "b", "c", "d"],
string_missing = ["a", "b", "c", missing],
dates = Date.([2000, 2001, 2003, 2004]),
catarray = CategoricalArray([1,2,1,2]))

describe_output = DataFrame(variable = [:number, :number_missing, :non_number,
:non_number_missing, :dates, :catarray],
describe_output = DataFrame(variable = [:number, :number_missing, :string,
:string_missing, :dates, :catarray],
mean = [2.5, 2.0, nothing, nothing, nothing, nothing],
min = [1.0, 1.0, nothing, nothing, nothing, nothing],
min = [1.0, 1.0, "a", "a", Date(2000), nothing],
median = [2.5, 2.0, nothing, nothing, nothing, nothing],
max = [4.0, 3.0, nothing, nothing, nothing, nothing],
max = [4.0, 3.0, "d", "c", Date(2004), nothing],
nunique = [nothing, nothing, 4, 3, 4, 2],
nmissing = [nothing, 1, nothing, 1, nothing, nothing],
eltype = [Int, Int, String, String, Date, eltype(df[:catarray])])
describe_output_all_stats = DataFrame(variable = [:number, :number_missing,
:non_number, :non_number_missing,
:string, :string_missing,
:dates, :catarray],
mean = [2.5, 2.0, nothing, nothing, nothing, nothing],
std = [Compat.std(df[:number]), 1.0, nothing,
nothing, nothing, nothing],
min = [1.0, 1.0, nothing, nothing, nothing, nothing],
min = [1.0, 1.0, "a", "a", Date(2000), nothing],
q25 = [1.75, 1.5, nothing, nothing, nothing, nothing],
median = [2.5, 2.0, nothing, nothing, nothing, nothing],
q75 = [3.25, 2.5, nothing, nothing, nothing, nothing],
max = [4.0, 3.0, nothing, nothing, nothing, nothing],
nunique = [nothing, nothing, 4, 4, 4, 2],
max = [4.0, 3.0, "d", "c", Date(2004), nothing],
nunique = [nothing, nothing, 4, 3, 4, 2],
nmissing = [nothing, 1, nothing, 1, nothing, nothing],
first = [1, 1, "a", "a", Date(2000), 1],
last = [4, missing, "d", missing, Date(2004), 2],
eltype = [Int, Int, String, String, Date,
eltype(df[:catarray])])


# Test that it works as a whole, without keyword arguments
@test describe_output == describe(df)

# Test that it works with one stats argument
@test describe_output[[:variable, :mean]] == describe(df, stats = [:mean])

# Test that it works with all keyword arguments
@test describe_output_all_stats ==
describe(df, stats = [:mean, :std, :min, :q25, :median, :q75, :max,
:nunique, :nmissing, :eltype])
# Compare with `isequal` rather than `==`: the expected `last` column contains
# `missing`, and `isequal` treats `missing` values as equal to each other.
@test isequal(describe_output_all_stats, describe(df, stats = :all))
end

#Check the output of unstack
Expand Down
Loading

0 comments on commit 86ee09b

Please sign in to comment.