JuliaData · kmsquire · Apr 1, 2013 · Apr 9, 2013 · Apr 30, 2013 · Apr 30, 2013
diff --git a/doc/sections/03_design_details.md b/doc/sections/03_design_details.md
@@ -196,9 +196,28 @@ Like the similar functions in Julia's Base, we can specify the length and type o
 
 	dv = datatrues(5)
 
-# The PooledDataVector Type
+# The PooledDataArray Type
+
+On the surface, `PooledDataArray`s look like `DataArray`s, but their implementation allows the efficient storage and manipulation of `DataVector`s and `DataArray`s that contain only a small number of unique values.  Internally, a `PooledDataArray` holds a pool of unique values, and the array itself simply stores indexes into this pool rather than storing each value individually.
+
+A `PooledDataArray` can be constructed from an `Array` or `DataArray`, and as with regular `DataArray`s, it can hold `NA` values:
+
+	pda  = PooledDataArray([1, 1, 1, 1, 2, 3, 2, 2, 3, 3, 3])
+	pda2 = PooledDataArray(DataArray(["red", "green", "yellow", "yellow", "red", "orange", "red", "green"]))
+
+`PooledDataArray`s can also be created empty or with a fixed size and a specific type:
+
+	pda3 = PooledDataArray(String, 2000)   # A pooled data array of 2000 strings, initially filled with NAs
+	pda4 = PooledDataArray(Float64)        # An empty pooled data array of floats
+
+By default, the index into the pool of values is a `Uint32`, allowing 2^32 possible pool values.  If you know that you will only have a much smaller number of unique values, you can specify a smaller reference index type to save space:
+
+	pda5 = PooledDataArray(String, Uint8, 5000, 2)  # Create a 5000x2 array of String values, 
+	                                                # initialized to NA, 
+                                                        # with at most 2^8=256 unique values
+
+`PooledDataVector`s can be used as columns in `DataFrame`s.
 
-_TO BE FILLED IN_
 
 # The DataFrame Type
 

diff --git a/src/DataFrames.jl b/src/DataFrames.jl
@@ -25,8 +25,7 @@ using Stats
 ##############################################################################
 
 const DEFAULT_COLUMN_TYPE = Float64
-const POOLED_DATA_VEC_REF_TYPE = Uint16
-const POOLED_DATA_VEC_REF_CONVERTER = uint16
+const DEFAULT_POOLED_REF_TYPE = Uint32
 
 ##############################################################################
 ##
@@ -76,6 +75,7 @@ export # reconcile_groups,
        colvars,
        colwise,
        combine,
+       compact,
        complete_cases,
        complete_cases!,
        cut,
@@ -232,9 +232,9 @@ export # reconcile_groups,
 include("utils.jl")
 include("natype.jl")
 include("dataarray.jl")
+include("pooleddataarray.jl")
 include("datavector.jl")
 include("datamatrix.jl")
-include("pooleddataarray.jl")
 include("index.jl")
 include("namedarray.jl")
 include("dataframe.jl")

diff --git a/src/RDA.jl b/src/RDA.jl
@@ -291,8 +291,15 @@ function data(ri::RInteger)
     dd = ri.data
     msng = dd .== R_NA_INT32
     if !inherits(ri, "factor") return DataArray(dd, msng) end
-    dd[msng] = zero(eltype(dd))
-    PooledDataArray(POOLED_DATA_VEC_REF_CONVERTER(dd), ri.attr["levels"].data)
+    pool = ri.attr["levels"].data
+    sz = length(pool)
+    REFTYPE = sz <= typemax(Uint8)  ? Uint8 :
+              sz <= typemax(Uint16) ? Uint16 :
+              sz <= typemax(Uint32) ? Uint32 :
+                                      Uint64
+    refs = convert(Vector{REFTYPE}, dd)
+    refs[msng] = zero(REFTYPE)
+    PooledDataArray(RefArray(refs), pool)
 end
 
 data(rs::RString) = DataArray(rs.data, falses(length(rs.data)))

diff --git a/src/dataframe.jl b/src/dataframe.jl
@@ -1115,8 +1115,8 @@ nas{T}(dv::DataArray{T}, dims) =   # TODO move to datavector.jl?
 
 zeros{T<:ByteString}(::Type{T},args...) = fill("",args...) # needed for string arrays in the `nas` method above
 
-nas{T}(dv::PooledDataVector{T}, dims) =
-    PooledDataArray(fill(POOLED_DATA_VEC_REF_TYPE(1), dims), dv.pool)
+nas{T,R}(dv::PooledDataVector{T,R}, dims) =  # build an all-NA pooled array of size `dims`, passing dv.pool as its pool
+    PooledDataArray(RefArray(fill(zero(R), dims)), dv.pool)  # zero refs mark NA; one(R) would make every element refer to pool[1]
 
 nas(df::DataFrame, dims) = 
     DataFrame([nas(x, dims) for x in df.columns], colnames(df)) 

diff --git a/src/datavector.jl b/src/datavector.jl
@@ -65,7 +65,6 @@ tail(dv::AbstractDataVector) = dv[max(length(dv) - 6, 1):length(dv)]
 ##
 ##############################################################################
 
-# TODO: Fill in definitions for PooledDataVector's
 # TODO: Macroize these definitions
 
 function push!{T}(dv::DataVector{T}, v::NAtype)
@@ -119,5 +118,31 @@ function map(f::Function, dv::DataVector)   # should this be an AbstractDataVect
     return res
 end
 
+function push!{T,R}(pdv::PooledDataVector{T,R}, v::NAtype)  # append an NA to the end of pdv
+    push!(pdv.refs, zero(R))  # NA is recorded as a zero reference; pdv.pool is not touched
+    return v  # returns the pushed NA, not the vector
+end
+
+function push!{S,R,T}(pdv::PooledDataVector{S,R}, v::T)  # append a non-NA value to the end of pdv
+    v = convert(S,v)  # coerce the value to the vector's first type parameter S
+    push!(pdv.refs, getpoolidx(pdv, v))  # getpoolidx is defined elsewhere; presumably returns v's pool index, adding v if absent (confirm)
+    return v  # returns the converted value, not the vector
+end
+
+pop!(pdv::PooledDataVector) = (r = pop!(pdv.refs); r == 0 ? NA : pdv.pool[r])  # remove and return the last element; a zero ref is NA and must not index the pool
+
+function unshift!{T,R}(pdv::PooledDataVector{T,R}, v::NAtype)  # insert an NA at the front of pdv
+    unshift!(pdv.refs, zero(R))  # NA is recorded as a zero reference; pdv.pool is not touched
+    return v  # returns the inserted NA, not the vector
+end
+
+function unshift!{S,R,T}(pdv::PooledDataVector{S,R}, v::T)  # insert a non-NA value at the front of pdv
+    v = convert(S,v)  # coerce the value to the vector's first type parameter S
+    unshift!(pdv.refs, getpoolidx(pdv, v))  # getpoolidx is defined elsewhere; presumably returns v's pool index, adding v if absent (confirm)
+    return v  # returns the converted value, not the vector
+end
+
+shift!(pdv::PooledDataVector) = (r = shift!(pdv.refs); r == 0 ? NA : pdv.pool[r])  # remove and return the first element; a zero ref is NA and must not index the pool
+
 reverse(x::AbstractDataVector) = x[end:-1:1]
 
diff --git a/src/extras.jl b/src/extras.jl
@@ -96,7 +96,7 @@ function cut{S, T}(x::Vector{S}, breaks::Vector{T})
     if breaks[end] < max_x
         push!(breaks, max_x)
     end
-    refs = fill(POOLED_DATA_VEC_REF_CONVERTER(0), length(x))
+    refs = fill(zero(DEFAULT_POOLED_REF_TYPE), length(x))
     for i in 1:length(x)
         if x[i] == min_x
             refs[i] = 1
@@ -116,7 +116,7 @@ function cut{S, T}(x::Vector{S}, breaks::Vector{T})
     for i in 2:(n - 1)
         pool[i] = string("(", from[i], ",", to[i], "]")
     end
-    PooledDataArray(refs, pool)
+    PooledDataArray(RefArray(refs), pool)
 end
 cut(x::Vector, ngroups::Int) = cut(x, quantile(x, [1 : ngroups - 1] / ngroups))
 

diff --git a/src/indexing.jl b/src/indexing.jl
@@ -306,17 +306,17 @@ end
 maxShowLength(v::IndexedVector) = length(v) > 0 ? max([length(_string(x)) for x = v.x]) : 0
 
 # Methods to speed up grouping and merging
-function PooledDataArray(d::IndexedVector)
-    refs = zeros(POOLED_DATA_VEC_REF_TYPE, size(d))
-    oneval = one(POOLED_DATA_VEC_REF_TYPE)
+function PooledDataArray{R}(d::IndexedVector, ::Type{R})
+    refs = zeros(R, size(d))
+    oneval = one(R)
     local idx::Int
     ## local lastval::T
-    local poolidx::POOLED_DATA_VEC_REF_TYPE
+    local poolidx::R
     pool = Array(eltype(d), 0)
     # skip over NAs
     nna = length(d) - length(d.x)
     if nna == length(d)
-        return PooledDataArray(refs, pool)
+        return PooledDataArray(RefArray(refs), pool)
     end
     lastval = d.x[d.idx[nna+1]]
     push!(pool, d.x[d.idx[nna+1]])
@@ -331,8 +331,9 @@ function PooledDataArray(d::IndexedVector)
         end
         refs[idx] = poolidx
     end
-    return PooledDataArray(refs, pool)
+    return PooledDataArray(RefArray(refs), pool)
 end
+PooledDataArray(d::IndexedVector) = PooledDataArray(d, DEFAULT_POOLED_REF_TYPE)  # convenience method: forwards with the package-wide default reference type
 
 DataArray(d::IndexedVector) = DataArray(x.x)
 

diff --git a/src/merge.jl b/src/merge.jl
@@ -95,10 +95,10 @@ function PooledDataVecs(df1::AbstractDataFrame,
         ngroups = ngroups * (length(dv1.pool) + 1)
     end
     pool = [1:ngroups]
-    (PooledDataArray(refs1, pool), PooledDataArray(refs2, pool))
+    (PooledDataArray(RefArray(refs1), pool), PooledDataArray(RefArray(refs2), pool))
 end
 
-function PooledDataArray(df::AbstractDataFrame)
+function PooledDataArray{R}(df::AbstractDataFrame, ::Type{R})
     # This method exists to allow another way for merge to work with
     # multiple columns. It takes the columns of the DataFrame and
     # returns a DataArray with a merged pool that "keys" the
@@ -107,7 +107,7 @@ function PooledDataArray(df::AbstractDataFrame)
     #   - I skipped the sort to make it faster.
     #   - Converting each individual one-row DataFrame to a Tuple
     #     might be faster.
-    refs = zeros(POOLED_DATA_VEC_REF_TYPE, nrow(df))
+    refs = zeros(R, nrow(df))
     poolref = Dict{AbstractDataFrame, Int}()
     pool = Array(Uint64, 0)
     j = 1
@@ -122,8 +122,9 @@ function PooledDataArray(df::AbstractDataFrame)
             j += 1
         end
     end
-    return PooledDataArray(refs, pool)
+    return PooledDataArray(RefArray(refs), pool)
 end
+PooledDataArray(df::AbstractDataFrame) = PooledDataArray(df, DEFAULT_POOLED_REF_TYPE)  # convenience method: forwards with the package-wide default reference type
 
 function merge(df1::AbstractDataFrame, df2::AbstractDataFrame, bycol, jointype)