
[WIP] Add population analysis with error bars across population for continuous plots #30

Merged: 25 commits, Jan 18, 2017

Commits
89d83f2
Added Bar Plots and Scatter Plots
Dec 1, 2016
9bcef97
Merge branch 'master' of https://github.com/JuliaPlots/StatPlots.jl
Dec 2, 2016
85b0f40
Added keyword version, also with cyclic keywords and automated label
Dec 9, 2016
203120c
Added set of standard analysis
Dec 9, 2016
da11d97
Changed NaN with NAs
Dec 10, 2016
dce15d2
Fixed bug on sorting categorical x axis
Dec 10, 2016
1e6921b
Fixed bug on sorting categorical x axis
Dec 10, 2016
4c40585
Added shortcuts, replaced kernelestimator with loess
Dec 15, 2016
5a8319b
Merge branch 'master' of https://github.com/piever/StatPlots.jl
Dec 15, 2016
4c18e6a
trying groupapply
Dec 17, 2016
c972da8
Added new groupederror type
Dec 18, 2016
3f51641
Fixed groupedbar on plotlyjs, added description to readme
Dec 19, 2016
29bd4f3
corrected couple of mistakes in readme
Dec 19, 2016
4764641
Fixed issue with label in Plotlyjs
Dec 19, 2016
fd10f37
Cleaned up axis selection, added docstrings
Dec 26, 2016
e7c4280
Updated readme
Dec 26, 2016
b06ae4b
Fixed error in case of subject with one datapoint!
Dec 31, 2016
2d10288
removed unnecessary array conversion
Jan 1, 2017
1a874c6
Unified files into groupederror, updated readme
Jan 10, 2017
899c87a
Implemented bootstrap error
Jan 10, 2017
41a2251
Updated README and docs
Jan 10, 2017
dc7e1d9
Corrected typos
Jan 10, 2017
4d65fb7
Updated groupapply docstrings
Jan 12, 2017
ff638a5
Temporary: minor refactoring of the data
Jan 16, 2017
3ae3845
Renamed get_summary to get_groupederror and corrected docstrings
Jan 17, 2017
44 changes: 44 additions & 0 deletions README.md
@@ -121,3 +121,47 @@ groupedbar(rand(10,3), bar_position = :dodge, bar_width=0.7)
```

![tmp](https://cloud.githubusercontent.com/assets/933338/18962092/673f6c78-863d-11e6-9ee9-8ca104e5d2a3.png)


## groupapply for population analysis
There is a `groupapply` function that splits the data according to the keyword argument `group`, then applies the pair of functions given by the keyword `summarize` to get the average and variability of a given analysis (density, cumulative distribution and local regression are supported so far, but one can also add one's own function). There are three ways to get average and variability:

- `compute_error = (:across, col_name)`, where the data is split according to column `col_name` before being summarized. `compute_error = :across` splits across all observations. Default summary is `(mean, sem)` but it can be changed with keyword `summarize` to any pair of functions.

- `compute_error = (:bootstrap, n_samples)`, where `n_samples` fake datasets are generated using nonparametric bootstrapping and then summarized. `compute_error = :bootstrap` defaults to `compute_error = (:bootstrap, 1000)`. Default summary is `(mean, std)`. This method will work with any analysis but is computationally very expensive.

- `compute_error = :none`, where no error is computed or displayed and the analysis is carried out normally.

The local regression uses [Loess.jl](https://github.com/JuliaStats/Loess.jl) and the density plot uses [KernelDensity.jl](https://github.com/JuliaStats/KernelDensity.jl). In the case of a categorical x variable, these functions are computed by splitting the data across the x variable and then computing the density/average per bin. The choice of continuous or discrete axis can be forced via `axis_type = :continuous` or `axis_type = :discrete`.
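To illustrate the keywords described above, here is a minimal sketch combining a custom `summarize` pair with a forced axis type. The toy dataframe and its columns (`x`, `y`, `id`) are made up for illustration, not taken from the PR's examples; `median`/`std` stand in for any trend/variation pair:

```julia
using DataFrames, StatPlots

# hypothetical toy data: a continuous x, a response y, and a subject id
df = DataFrame(x = rand(200), y = rand(200), id = rand(1:10, 200))

# local regression, variability computed across subjects, custom summary pair,
# axis explicitly forced to be continuous
grp = groupapply(:locreg, df, :x, :y;
                 compute_error = (:across, :id),
                 summarize = (median, std),
                 axis_type = :continuous)
plot(grp, line = :path)
```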

Example use:

```julia
using DataFrames
import RDatasets
using StatPlots
gr()
school = RDatasets.dataset("mlmRev","Hsb82");
grp_error = groupapply(:cumulative, school, :MAch; compute_error = (:across,:School), group = :Sx)
plot(grp_error, line = :path)
```
<img width="494" alt="screenshot 2016-12-19 12 28 27" src="https://cloud.githubusercontent.com/assets/6333339/21313005/316e0f0c-c5e7-11e6-9464-f0921dee3d29.png">

Keyword arguments for Loess or KernelDensity can be passed to `groupapply`:

```julia
df = groupapply(:density, school, :CSES; bandwidth = 1., compute_error = (:bootstrap,500), group = :Minrty)
plot(df, line = :path)
```

<img width="487" alt="screenshot 2017-01-10 18 36 48" src="https://cloud.githubusercontent.com/assets/6333339/21819500/cb788fb8-d763-11e6-89b9-91018f2b9a2a.png">


The bar plot:

```julia
pool!(school, :Sx)
grp_error = groupapply(school, :Sx, :MAch; compute_error = :across, group = :Minrty)
plot(grp_error, line = :bar)
```
<img width="489" alt="screenshot 2017-01-10 18 20 51" src="https://cloud.githubusercontent.com/assets/6333339/21819413/7923681e-d763-11e6-907d-c81447b4cc99.png">
1 change: 1 addition & 0 deletions REQUIRE
@@ -6,3 +6,4 @@ StatsBase
Distributions
DataFrames
KernelDensity
Loess
Member:
Adding another package here is a bit of a debate. I would say that StatPlots should not be afraid of including statistical packages (I guess that is one reason to keep it separate from Plots), and I also think it should depend on GLM. What is your philosophy here, @tbreloff ?

Member:
Sorry I missed this comment. Yes, adding dependencies should not be done lightly, but this is probably acceptable.

7 changes: 7 additions & 0 deletions src/StatPlots.jl
@@ -9,11 +9,15 @@ using Distributions
using DataFrames

import KernelDensity
import Loess
@recipe f(k::KernelDensity.UnivariateKDE) = k.x, k.density
@recipe f(k::KernelDensity.BivariateKDE) = k.x, k.y, k.density

@shorthands cdensity

export groupapply
export get_summary
Member:
do we need to export get_summary?

Member Author:
I don't have strong opinions on that. The actual plotting is done as in the README:

```julia
grp_error = groupapply(:cumulative, school, :MAch; compute_error = (:across,:School), group = :Sx)
plot(grp_error, line = :path)
```

and in practice I never really use it. Is it bad to export it?

Member:
In this case it doesn't seem necessary, but I admit I don't actually know when you'd call it.

Member Author:
It was proposed by @mkborregaard. The idea is that in the normal scenario you would have a dataframe that you'd split according to a column and then summarize the split data (for example getting mean and sem). The use case would be somebody who has already obtained a grouped dataframe by some other means and wants to get the summary, which is not possible with only groupapply.
The pipeline is: groupapply calls get_summary, which calls the analysis function on the subdataframes. groupapply, get_summary and the built-in analysis functions have docstrings. The docstring of groupapply refers to get_summary, so maybe the user would expect it to be exported.
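The direct-call scenario described here might look roughly like the following sketch. This is hedged: it assumes the internal signature `get_summary(trend, variation, f, splitdata::GroupedDataFrame, xvalues, args...)` from this PR's `src/groupederror.jl`, reuses the `school` dataframe from the README example, and `StatPlots._cumulative` is one of the PR's (non-exported) built-in analysis functions:

```julia
# data already grouped by some other means
splitdata = groupby(school, :School)
# shared x axis for the continuous case, as get_axis does in the PR
xvalues = linspace(minimum(school[:MAch]), maximum(school[:MAch]), 100)
# summarize the per-group cumulative distributions with mean and sem
x, m, s = get_summary(mean, sem, StatPlots._cumulative, splitdata, xvalues, :MAch)
```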

Member Author:
Actually, on second thought I'm in favour of keeping it, also for performance reasons: for large datasets it may be inconvenient to have to group them every time. In the implementation of compute_error = :bootstrap I see a sizeable performance improvement from working with GroupedDataFrames directly rather than putting everything in a big DataFrame and then applying groupby.

Member:
Yes, exactly, that was the idea. I think it should be renamed to something more specific: "get_summary" is too generic a name to put in the user's workspace for a very specific function used for only one type of plot.


include("dataframes.jl")
include("corrplot.jl")
include("cornerplot.jl")
@@ -24,5 +28,8 @@ include("hist.jl")
include("marginalhist.jl")
include("bar.jl")
include("shadederror.jl")
include("groupederror.jl")


Member:
you should export the groupapply function


end # module
2 changes: 1 addition & 1 deletion src/bar.jl
@@ -53,7 +53,7 @@ grouped_xy(y::AbstractMatrix) = 1:size(y,1), y
end
fr
else
get(d, :fillrange, 0)
get(d, :fillrange, nothing)
Member:
remind me why you are throwing away the fillrange value here for non-stacked bars?

Member Author:
It's a fix for #32: in the case of error bars, the fillrange of 0 also gets applied to the error series and creates the messy artifacts shown in the issue (on PlotlyJS). Without any fillrange it seems to work just fine.

Member:
Hmm, I worry about what unintended consequences that may have. Does `get(d, :fillrange, nothing)` work?

Member Author:
Yes, that also works. I'll go for get(d, :fillrange, nothing) to stay as close as possible to the original design then.
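For reference, `get` on the attribute dictionary simply falls back to the supplied default when the key is absent, so with `nothing` as the default no fillrange is applied at all. A minimal illustration (the dictionary here is made up, standing in for the recipe's `d`):

```julia
# stand-in for the plot attribute dictionary `d` inside the recipe
d = Dict{Symbol,Any}(:seriestype => :bar)

# key absent: the default is returned, so no fillrange gets set
fr = get(d, :fillrange, nothing)
```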

end

seriestype := :bar
308 changes: 308 additions & 0 deletions src/groupederror.jl
@@ -0,0 +1,308 @@
##### List of functions to analyze data

function extend_axis(df::AbstractDataFrame, xlabel, ylabel, xaxis, val)
aux = DataFrame()
aux[xlabel] = xaxis
extended = join(aux, df, on = xlabel, kind = :left)
sort!(extended, cols = [xlabel])
return convert(Array, extended[ylabel], val)
end

function extend_axis(xsmall, ysmall, xaxis, val)
df = DataFrame(x = xsmall, y = ysmall)
return extend_axis(df, :x, :y, xaxis, val)
end

"""
`_locreg(df, xaxis::LinSpace, x, y; kwargs...)`

Apply loess regression, training the regressor with `x` and `y` and
predicting `xaxis`
"""
function _locreg(df, xaxis::LinSpace, x, y; kwargs...)
predicted = fill(NaN,length(xaxis))
within = minimum(df[x]).<= xaxis .<= maximum(df[x])
if any(within)
model = Loess.loess(convert(Vector{Float64},df[x]),convert(Vector{Float64},df[y]); kwargs...)
predicted[within] = Loess.predict(model,xaxis[within])
end
return predicted
end


"""
`_locreg(df, xaxis, x, y; kwargs...)`

In the discrete case, the function computes the conditional expectation of `y` for
a given value of `x`
"""
function _locreg(df, xaxis, x, y)
ymean = by(df, x) do dd
DataFrame(m = mean(dd[y]))
end
return extend_axis(ymean, x, :m, xaxis, NaN)
end

"""
`_density(df,xaxis::LinSpace, x; kwargs...)`

Kernel density of `x`, computed along `xaxis`
"""
_density(df,xaxis::LinSpace, x; kwargs...) = pdf(KernelDensity.kde(df[x]; kwargs...),xaxis)

"""
`_density(df, xaxis, x)`

Normalized histogram of `x` (which is discrete: every value is its own bin)
"""
function _density(df,xaxis, x)
xhist = by(df, x) do dd
DataFrame(length = size(dd,1)/size(df,1))
end
return extend_axis(xhist, x, :length, xaxis, 0.)
end

"""
`_cumulative(df, xaxis, x) = ecdf(df[x])(xaxis)`

Cumulative density function of `x`, computed along `xaxis`
"""
_cumulative(df, xaxis, x) = ecdf(df[x])(xaxis)


#### Method to compute and plot grouped error plots using the above functions

type GroupedError{S, T<:AbstractString}
x::Vector{Vector{S}}
y::Vector{Vector{Float64}}
err::Vector{Vector{Float64}}
group::Vector{T}
axis_type::Symbol
show_error::Bool
end

get_axis(column) = sort!(union(column))
get_axis(column, npoints) = linspace(minimum(column),maximum(column),npoints)

# f is the function used to analyze dataset: define it as nan when it is not defined,
# the input is: dataframe used, points chosen on the x axis, x (and maybe y) column labels
# the output is the y value for the given xvalues

errortype(s::Symbol) = s
errortype(s) = s[1]

function newsymbol(s, l::AbstractArray{Symbol})
ns = s
while ns in l
ns = Symbol(ns,:_)
end
return ns
end

newsymbol(s, df::AbstractDataFrame) = newsymbol(s, names(df))


"""
get_summary(trend,variation, f, splitdata::GroupedDataFrame, xvalues, args...; kwargs...)

Apply function `f` to `splitdata`, then compute summary statistics
`trend` and `variation` of those values. A shared x axis `xvalues` is needed: use
`LinSpace` for continuous x variable and a normal vector for the discrete case. Remaining arguments
are label of x axis variable and extra arguments for function `f`. `kwargs...` are passed
to `f`
"""
function get_summary(trend,variation, f, splitdata::GroupedDataFrame, xvalues, args...; kwargs...)
v = Array(Float64, length(xvalues), length(splitdata));
for i in 1:length(splitdata)
v[:,i] = f(splitdata[i],xvalues, args...; kwargs...)
end
mean_across_pop = Array(Float64, length(xvalues));
sem_across_pop = Array(Float64, length(xvalues));
for j in 1:length(xvalues)
notnan = !isnan(v[j,:])
mean_across_pop[j] = trend(v[j,notnan])
sem_across_pop[j] = variation(v[j,notnan])
end
valid = !isnan(mean_across_pop) & !isnan(sem_across_pop)
return xvalues[valid], mean_across_pop[valid], sem_across_pop[valid]
end

"""
get_summary(trend,variation, f, df::AbstractDataFrame, ce, axis_type, args...; kwargs...)

Get a GroupedDataFrame from `df` according to `ce`. `ce = (:across, col_name)` will split
across column `col_name`, whereas `ce = (:bootstrap, n_samples)` will generate `n_samples`
fake datasets distributed like the real dataset (nonparametric bootstrapping). Choose the shared axis
according to `axis_type` (`:continuous` or `:discrete`), then compute `get_summary`
"""
function get_summary(trend,variation, f, df::AbstractDataFrame, ce, axis_type, args...; kwargs...)
# define points on x axis
Member:
better xaxis = :auto

if axis_type == :discrete
xvalues = get_axis(df[args[1]])
elseif axis_type == :continuous
xvalues = get_axis(df[args[1]], 100)
else
error("Unexpected axis_type: only :discrete and :continuous allowed!")
end

if ce == :none
mean_across_pop = f(df,xvalues, args...; kwargs...)
sem_across_pop = zeros(length(xvalues));
valid = ~isnan(mean_across_pop)
Member:
why ~ not !?

Member Author:
Oops, apparently programming in Matlab left a mark on me. I guess the two are equivalent, but I'll be consistent with the rest of the world and use `!`.

return xvalues[valid], mean_across_pop[valid], sem_across_pop[valid]
elseif ce[1] == :across
# get mean value and sem of function of interest
splitdata = groupby(df, ce[2])
return get_summary(trend,variation, f, splitdata, xvalues, args...; kwargs...)
elseif ce[1] == :bootstrap
n_samples = ce[2]
indexes = Array(Int64,0)
split_var = Array(Int64,0)
for i = 1:n_samples
append!(indexes, rand(1:size(df,1),size(df,1)))
append!(split_var, fill(i,size(df,1)))
end
Member:
This way of dealing with NaN means that confidence bands will all taper off towards the ends, right? I think that is valid enough, but might be more conservative to skip those x values with NaNs completely

Member Author:
Actually, with real data these plots tend to have huge error bars towards the ends, which is somewhat annoying, but it could also be that one has a lousy subject with a weird range, and you don't want that to prevent you from looking at most of your data. As of now I require at least 2 subjects to be non-NaN in order to plot the mean and sem, but that could become another keyword (the number or fraction of subjects required to consider a point "plottable"). In principle one could even show, with vertical lines or something similar, where the majority of the data lies (say from the 0.05 to the 0.95 quantile).

Member:
No I think it is fine to leave it as is.

split_col = newsymbol(:split_col, df)
bootstrap_data = df[indexes,:]
bootstrap_data[split_col] = split_var
ends = collect(size(df,1)*(1:n_samples))
starts = ends - size(df,1) + 1
splitdata = GroupedDataFrame(bootstrap_data,[split_col],collect(1:ends[end]), starts, ends)
return get_summary(trend,variation, f, splitdata, xvalues, args...; kwargs...)
else
error("compute_error can only be equal to :none, :across,
(:across, col_name), :bootstrap or (:bootstrap, n_samples)")
end
end

"""
groupapply(f::Function, df, args...; axis_type = :auto, across = [], group = [], summarize = (mean, sem), kwargs...)

Split `df` by `group`. Then apply `get_summary` to get a population summary of the grouped data.
Output is a `GroupedError` with error computed according to the keyword `compute_error`.
It can be plotted using `plot(g::GroupedError)`
Seriestype can be specified to be `:path`, `:scatter` or `:bar`
Member:
There are a few typos here. Also, the summarize = (mean, sem) argument seems to be at odds with "Then apply get_mean_sem", as that would change with different summarize, right?

"""
function groupapply(f::Function, df, args...;
Member:
docstring please

axis_type = :auto, compute_error = :none, group = [],
summarize = (errortype(compute_error) == :bootstrap) ? (mean, std) : (mean, sem), kwargs...)
if !(axis_type in [:discrete, :continuous])
axis_type = (typeof(df[args[1]])<:PooledDataArray) ? :discrete : :continuous
end
if (axis_type == :continuous) & !(eltype(df[args[1]])<:Real)
warn("Changing to discrete axis, x values are not real numbers!")
axis_type = :discrete
end
mutated_xtype = (axis_type == :continuous) ? Float64 : eltype(df[args[1]])

# Add default for :across and :bootstrap
if compute_error == :across
row_name = newsymbol(:rows, df)
df[row_name] = 1:size(df,1)
ce = (:across, row_name)
elseif compute_error == :bootstrap
ce = (:bootstrap, 1000)
else
ce = compute_error
end

g = GroupedError(
Array(Vector{mutated_xtype},0),
Array(Vector{Float64},0),
Array(Vector{Float64},0),
Array(AbstractString,0),
axis_type,
ce != :none
)
if group == []
xvalues,yvalues,shade = get_summary(summarize..., f, df, ce, axis_type, args...; kwargs...)
push!(g.x, xvalues)
push!(g.y, yvalues)
push!(g.err, shade)
push!(g.group, "")
else
#group_array = isa(group, AbstractArray) ? group : [group]
by(df,group) do dd
label = isa(group, AbstractArray) ?
string(["$(dd[1,column]) " for column in group]...) : string(dd[1,group])
xvalues,yvalues,shade = get_summary(summarize...,f, dd, ce, axis_type, args...; kwargs...)
push!(g.x, xvalues)
push!(g.y, yvalues)
push!(g.err, shade)
push!(g.group, label)
return
end
end
if compute_error == :across; delete!(df, row_name); end

return g
end

builtin_funcs = Dict(zip([:locreg, :density, :cumulative], [_locreg, _density, _cumulative]))

"""
groupapply(s::Symbol, df, args...; kwargs...)

`s` can be `:locreg`, `:density` or `:cumulative`, in which case the corresponding built in
analysis function is used. `s` can also be a symbol of a column of `df`, in which case the call
is equivalent to `groupapply(:locreg, df, args[1], s; kwargs...)`
"""
function groupapply(s::Symbol, df, args...; kwargs...)
if s in keys(builtin_funcs)
analysisfunction = builtin_funcs[s]
return groupapply(analysisfunction, df, args...; kwargs...)
else
return groupapply(_locreg, df, args[1], s; kwargs...)
end
end

"""
groupapply(df::AbstractDataFrame, x, y; kwargs...)

Equivalent to `groupapply(:locreg, df::AbstractDataFrame, x, y; kwargs...)`
"""

groupapply(df::AbstractDataFrame, x, y; kwargs...) = groupapply(_locreg, df, x, y; kwargs...)

@recipe function f(g::GroupedError)
if !(:seriestype in keys(d)) || (d[:seriestype] == :path)
for i = 1:length(g.group)
@series begin
seriestype := :shadederror
x := cycle(g.x,i)
y := cycle(g.y, i)
shade := cycle(g.err,i)
label --> cycle(g.group,i)
()
end
end
elseif d[:seriestype] == :scatter
for i = 1:length(g.group)
@series begin
seriestype := :scatter
x := cycle(g.x,i)
y := cycle(g.y, i)
if g.show_error
err := cycle(g.err,i)
end
label --> cycle(g.group,i)
()
end
end
elseif d[:seriestype] == :bar
if g.axis_type == :continuous
warn("Bar plot with continuous x axis doesn't make sense!")
end
xaxis = sort!(union((g.x)...))
ys = extend_axis.(g.x, g.y, [xaxis], [NaN])
y = hcat(ys...)
if g.show_error
errs = extend_axis.(g.x, g.err, [xaxis], [NaN])
err := hcat(errs...)
end
label --> hcat(g.group...)
StatPlots.GroupedBar((xaxis,y))
end
end

@shorthands GroupedError