Skip to content

Commit

Permalink
Merge 97b50e9 into 72a6304
Browse files Browse the repository at this point in the history
  • Loading branch information
pdeffebach committed Jun 12, 2018
2 parents 72a6304 + 97b50e9 commit 0c9de49
Show file tree
Hide file tree
Showing 3 changed files with 71 additions and 47 deletions.
88 changes: 53 additions & 35 deletions src/abstractdataframe/abstractdataframe.jl
Original file line number Diff line number Diff line change
Expand Up @@ -380,8 +380,8 @@ end
Report descriptive statistics for a data frame
```julia
describe(df::AbstractDataFrame; stats = [:mean, :min, :median, :max, :nmissing, :datatype])
describe(io, df::AbstractDataFrame; stats = [:mean, :min, :median, :max, :nmissing, :datatype])
describe(df::AbstractDataFrame; stats = [:mean, :min, :median, :max, :nunique, :nmissing, :eltype], all = false)
describe(io, df::AbstractDataFrame; stats = [:mean, :min, :median, :max, :nunique, :nmissing, :eltype], all = false)
```
**Arguments**
Expand All @@ -390,20 +390,23 @@ describe(io, df::AbstractDataFrame; stats = [:mean, :min, :median, :max, :nmissi
* `io` : optional output descriptor
* `stats::AbstractVector{Symbol}`: the summary statistics to report. Allowed
fields are `:mean`, `:std`, `:min`, `:q25`, `:median`, `:q75`, `:max`, `:eltype`,
`:nunique`, and `:nmissing`
`:nunique`, `:first`, `:last`, and `:nmissing`
**Result**
* A `DataFrame` where each row represents a variable and each column a summary statistic.
**Details**
If the column's base type derives from `Number`, compute the mean, standard
deviation, minimum, first quantile, median, third quantile, and maximum. If
a column is not numeric, these statistics are populated with `nothing`s.
* If the column's base type derives from `Real`, compute the mean, standard
deviation, minimum, first quantile, median, third quantile, and maximum.
* If a column's type derives from `String`, these statistics are populated with `nothing`s.
* If a column derives from neither `Real` nor `String`, `describe` will attempt to
calculate all summary statistics and return `nothing` for summary statistics that
are not defined for that type.
When `stats` contains `:nunique`, `describe` will report the
number of unique values in a column. If a column's base type derives from `Number`,
number of unique values in a column. If a column's base type derives from `Real`,
`:nunique` will return `nothing`s.
Missing values are filtered in the calculation of all statistics, however the column
Expand All @@ -422,17 +425,20 @@ describe(df)
"""
StatsBase.describe(df::AbstractDataFrame; kwargs...) = describe(stdout, df; kwargs...)
function StatsBase.describe(io::IO, df::AbstractDataFrame; stats::AbstractVector{Symbol} =
[:mean, :min, :median, :max, :nmissing, :eltype])
[:mean, :min, :median, :max, :nunique, :nmissing, :eltype],
all = false)
# Check that people don't specify the wrong fields.
allowed_fields = [:mean, :std, :min, :q25, :median, :q75,
:max, :nunique, :nmissing, :eltype]
:max, :nunique, :nmissing, :first, :last, :eltype]
if !issubset(stats, allowed_fields)
disallowed_fields = setdiff(stats, allowed_fields)
not_allowed = "Field(s) not allowed: $disallowed_fields. "
allowed = "Allowed fields are: $allowed_fields."
throw(ArgumentError(not_allowed * allowed))
end

# See if the user wants to show all summary statistics
all == true ? stats = allowed_fields : nothing

# Put the summary stats into the return dataframe
data = DataFrame()
Expand All @@ -448,68 +454,80 @@ function StatsBase.describe(io::IO, df::AbstractDataFrame; stats::AbstractVector
return data
end

# Define 4 functions for getting summary statistics
# Define functions for getting summary statistics
# Use a Dict because we don't know which measures the user wants
# Outside of the `describe` function due to something with 0.7
function get_stats(col::AbstractArray{<:Real})
sumstats = summarystats(col)

function get_stats(col::AbstractArray{<:Union{Real, Missing}})
nomissing = collect(skipmissing(col))
sumstats = summarystats(nomissing)
Dict(
:mean => sumstats.mean,
:std => Compat.std(col, mean = sumstats.mean),
:std => Compat.std(nomissing, mean = sumstats.mean),
:min => sumstats.min,
:q25 => sumstats.q25,
:median => sumstats.median,
:q75 => sumstats.q75,
:max => sumstats.max,
:nmissing => nothing,
:nmissing => count(ismissing, col),
:nunique => nothing,
:eltype => eltype(col)
:first => first(col),
:last => last(col),
:eltype => Missings.T(eltype(col))
)
end

function get_stats(col::AbstractArray{<:Union{Real, Missing}})
nomissing = collect(skipmissing(col))
sumstats = summarystats(nomissing)
function get_stats(col::AbstractArray{<:Real})
sumstats = summarystats(col)
Dict(
:mean => sumstats.mean,
:std => Compat.std(nomissing, mean = sumstats.mean),
:std => Compat.std(col, mean = sumstats.mean),
:min => sumstats.min,
:q25 => sumstats.q25,
:median => sumstats.median,
:q75 => sumstats.q75,
:max => sumstats.max,
:nmissing => count(ismissing, col),
:nmissing => nothing,
:nunique => nothing,
:eltype => Missings.T(eltype(col))
:first => first(col),
:last => last(col),
:eltype => eltype(col)
)
end

function get_stats(col::AbstractArray{>:Missing})
nomissing = collect(skipmissing(col))
q = try quantile(nomissing, [.25, .5, .75]) catch [nothing, nothing, nothing] end
Dict(
:mean => nothing,
:std => nothing,
:min => nothing,
:q25 => nothing,
:median => nothing,
:q75 => nothing,
:max => nothing,
:mean => try mean(nomissing) catch end,
:std => try Compat.std(nomissing) catch end,
:min => try minimum(nomissing) catch end,
:q25 => q[1],
:median => q[2],
:q75 => q[3],
:max => try maximum(nomissing) catch end,
:nmissing => count(ismissing, col),
:nunique => length(unique(col)),
:first => first(col),
:last => last(col),
:eltype => Missings.T(eltype(col))
)
end

function get_stats(col)
q = try quantile(col, [.25, .5, .75]) catch [nothing, nothing, nothing] end
Dict(
:mean => nothing,
:std => nothing,
:min => nothing,
:q25 => nothing,
:median => nothing,
:q75 => nothing,
:max => nothing,
:mean => try mean(col) catch end,
:std => try Compat.std(col) catch end,
:min => try minimum(col) catch end,
:q25 => q[1],
:median => q[2],
:q75 => q[3],
:max => try maximum(col) catch end,
:nmissing => nothing,
:nunique => length(unique(col)),
:first => first(col),
:last => last(col),
:eltype => eltype(col)
)
end
Expand Down
1 change: 1 addition & 0 deletions src/abstractdataframe/show.jl
Original file line number Diff line number Diff line change
Expand Up @@ -569,6 +569,7 @@ end
#' showcols(df)
function showcols(io::IO, df::AbstractDataFrame, all::Bool = false,
values::Bool = true) # -> Nothing
Base.depwarn("showcols has been deprecated. Use `describe(df, stats = [:eltype, :nmissing, :first, :last])` instead", :showcols)
print(io, summary(df))
metadata = DataFrame(Name = _names(df),
Eltype = eltypes(df),
Expand Down
29 changes: 17 additions & 12 deletions test/dataframe.jl
Original file line number Diff line number Diff line change
Expand Up @@ -332,43 +332,48 @@ module TestDataFrame
# Construct the test dataframe
df = DataFrame(number = [1, 2, 3, 4],
number_missing = [1,2, 3, missing],
non_number = ["a", "b", "c", "d"],
non_number_missing = ["a", "b", "c", missing],
string = ["a", "b", "c", "d"],
string_missing = ["a", "b", "c", missing],
dates = Date.([2000, 2001, 2003, 2004]),
catarray = CategoricalArray([1,2,1,2]))

describe_output = DataFrame(variable = [:number, :number_missing, :non_number,
:non_number_missing, :dates, :catarray],
describe_output = DataFrame(variable = [:number, :number_missing, :string,
:string_missing, :dates, :catarray],
mean = [2.5, 2.0, nothing, nothing, nothing, nothing],
min = [1.0, 1.0, nothing, nothing, nothing, nothing],
min = [1.0, 1.0, "a", "a", Date(2000), nothing],
median = [2.5, 2.0, nothing, nothing, nothing, nothing],
max = [4.0, 3.0, nothing, nothing, nothing, nothing],
max = [4.0, 3.0, "d", "c", Date(2004), nothing],
nunique = [nothing, nothing, 4, 4, 4, 2],
nmissing = [nothing, 1, nothing, 1, nothing, nothing],
eltype = [Int, Int, String, String, Date, eltype(df[:catarray])])
describe_output_all_stats = DataFrame(variable = [:number, :number_missing,
:non_number, :non_number_missing,
:string, :string_missing,
:dates, :catarray],
mean = [2.5, 2.0, nothing, nothing, nothing, nothing],
std = [Compat.std(df[:number]), 1.0, nothing,
nothing, nothing, nothing],
min = [1.0, 1.0, nothing, nothing, nothing, nothing],
min = [1.0, 1.0, "a", "a", Date(2000), nothing],
q25 = [1.75, 1.5, nothing, nothing, nothing, nothing],
median = [2.5, 2.0, nothing, nothing, nothing, nothing],
q75 = [3.25, 2.5, nothing, nothing, nothing, nothing],
max = [4.0, 3.0, nothing, nothing, nothing, nothing],
max = [4.0, 3.0, "d", "c", Date(2004), nothing],
nunique = [nothing, nothing, 4, 4, 4, 2],
nmissing = [nothing, 1, nothing, 1, nothing, nothing],
first = [1, 1, "a", "a", Date(2000), 1],
last = [4, missing, "d", missing, Date(2004), 2],
eltype = [Int, Int, String, String, Date,
eltype(df[:catarray])])


# Test that it works as a whole, without keyword arguments
@test describe_output == describe(df)

# Test that it works with one stats argument
@test describe_output[[:variable, :mean]] == describe(df, stats = [:mean])

# Test that it works with all keyword arguments
@test describe_output_all_stats ==
describe(df, stats = [:mean, :std, :min, :q25, :median, :q75, :max,
:nunique, :nmissing, :eltype])
# Use isequal because we have `missing`s in this dataframe
@test isequal(describe_output_all_stats, describe(df, all = true))
end

#Check the output of unstack
Expand Down

0 comments on commit 0c9de49

Please sign in to comment.