From 6090f47fcd5a7281b9e822377bc612b7e5e53ff9 Mon Sep 17 00:00:00 2001 From: Alex Arslan Date: Wed, 8 Mar 2017 17:41:02 -0800 Subject: [PATCH] Live Free or Doc Hard --- .gitignore | 5 +++++ docs/make.jl | 21 +++++++++++++++++++++ docs/src/da.md | 40 ++++++++++++++++++++++++++++++++++++++++ docs/src/index.md | 26 ++++++++++++++++++++++++++ docs/src/util.md | 14 ++++++++++++++ src/pooleddataarray.jl | 17 +++++++++++++++++ 6 files changed, 123 insertions(+) create mode 100644 .gitignore create mode 100644 docs/make.jl create mode 100644 docs/src/da.md create mode 100644 docs/src/index.md create mode 100644 docs/src/util.md diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..606a907 --- /dev/null +++ b/.gitignore @@ -0,0 +1,5 @@ +*.jl.cov +*.jl.*.cov +*.jl.mem +docs/build +docs/site diff --git a/docs/make.jl b/docs/make.jl new file mode 100644 index 0000000..5aa85ab --- /dev/null +++ b/docs/make.jl @@ -0,0 +1,21 @@ +using DataArrays, Documenter + +makedocs( + modules = [DataArrays], + clean = false, + format = :html, + sitename = "DataArrays.jl", + authors = "Simon Kornblith, John Myles White, and other contributors", + pages = [ + "Home" => "index.md", + "Missing Data and Arrays" => "da.md", + "Utilities" => "util.md", + ], +) + +deploydocs( + repo = "github.com/JuliaStats/DataArrays.jl.git", + target = "build", + deps = nothing, + make = nothing, +) diff --git a/docs/src/da.md b/docs/src/da.md new file mode 100644 index 0000000..da06f5c --- /dev/null +++ b/docs/src/da.md @@ -0,0 +1,40 @@ +# Representing missing data + +```@meta +CurrentModule = DataArrays +``` + +```@docs +NA +NAtype +``` + +## Arrays with possibly missing data + +```@docs +AbstractDataArray +AbstractDataVector +AbstractDataMatrix +DataArray +DataVector +DataMatrix +@data +isna +allna +anyna +dropna +levels +``` + +## Pooled arrays + +```@docs +PooledDataArray +@pdata +compact +setlevels +setlevels! +replace! 
+PooledDataVecs +getpoolidx +``` diff --git a/docs/src/index.md b/docs/src/index.md new file mode 100644 index 0000000..b2be19c --- /dev/null +++ b/docs/src/index.md @@ -0,0 +1,26 @@ +# DataArrays.jl + +This package provides functionality for working with [missing data](https://en.wikipedia.org/wiki/Missing_data) +in Julia. +In particular, it provides the following: + +* `NA`: A singleton representing a missing value +* `DataArray{T}`: An array type that can house both values of type `T` and missing values +* `PooledDataArray{T}`: An array type akin to `DataArray` but optimized for arrays with a small number of unique + values relative to their length, as commonly occurs with categorical data + +## Installation + +This package is available for Julia versions 0.6 and up. +To install it, run + +```julia +Pkg.add("DataArrays") +``` + +from the Julia REPL. + +## Contents + +```@contents +``` diff --git a/docs/src/util.md b/docs/src/util.md new file mode 100644 index 0000000..fb03365 --- /dev/null +++ b/docs/src/util.md @@ -0,0 +1,14 @@ +# Utility functions + +```@meta +CurrentModule = DataArrays +``` + +```@docs +cut +gl +xtab +xtabs +reldiff +percent_change +``` diff --git a/src/pooleddataarray.jl b/src/pooleddataarray.jl index 9de0f6a..dd08bb5 100644 --- a/src/pooleddataarray.jl +++ b/src/pooleddataarray.jl @@ -542,6 +542,13 @@ Base.find(pdv::PooledDataVector{Bool}) = find(convert(Vector{Bool}, pdv, false)) ## ############################################################################## +""" + getpoolidx(pda::PooledDataArray, val) + +Return the index of the first occurrence of `val` in the value pool for `pda`. +If `val` is not already in the value pool, `pda` is modified to include it in +the pool. 
+""" function getpoolidx{T,R}(pda::PooledDataArray{T,R}, val::Any) val::T = convert(T,val) pool_idx = findfirst(pda.pool, val) @@ -587,6 +594,11 @@ end ## ############################################################################## +""" + replace!(x::PooledDataArray, from, to) + +Replace all occurrences of `from` in `x` with `to`, modifying `x` in place. +""" function replace!(x::PooledDataArray{NAtype}, fromval::NAtype, toval::NAtype) NA # no-op to deal with warning end @@ -676,7 +688,12 @@ Perm{O<:Base.Sort.Ordering}(o::O, v::PooledDataVector) = FastPerm(o, v) ## ############################################################################## +""" + PooledDataVecs(v1, v2) -> (pda1, pda2) +Return a tuple of `PooledDataArray`s created from the data in `v1` and `v2`, +respectively, but sharing a common value pool. +""" function PooledDataVecs{S,Q<:Integer,R<:Integer,N}(v1::PooledDataArray{S,Q,N}, v2::PooledDataArray{S,R,N}) pool = sort(unique([v1.pool; v2.pool]))