JuliaData · kmsquire · Apr 1, 2013 · Apr 9, 2013
diff --git a/src/DataFrames.jl b/src/DataFrames.jl
@@ -25,8 +25,7 @@ using Stats
 ##############################################################################
 
 const DEFAULT_COLUMN_TYPE = Float64
-const POOLED_DATA_VEC_REF_TYPE = Uint16
-const POOLED_DATA_VEC_REF_CONVERTER = uint16
+const DEFAULT_POOLED_REF_TYPE = Uint16  # ref element type used when a caller does not pass one
 
 ##############################################################################
 ##
@@ -73,6 +72,7 @@ export # reconcile_groups,
        colvars,
        colwise,
        combine,
+       compact,
        complete_cases,
        complete_cases!,
        cut,

diff --git a/src/dataframe.jl b/src/dataframe.jl
@@ -1114,8 +1114,8 @@ nas{T}(dv::DataArray{T}, dims) =   # TODO move to datavector.jl?
 
 zeros{T<:ByteString}(::Type{T},args...) = fill("",args...) # needed for string arrays in the `nas` method above
 
-nas{T}(dv::PooledDataVector{T}, dims) =
-    PooledDataArray(fill(uint16(1), dims), dv.pool)
+nas{T,R}(dv::PooledDataVector{T,R}, dims) =
+    PooledDataArray(RefArray(fill(one(R), dims)), dv.pool)  # refs are all one(R), typed like dv's refs; pool taken from dv
 
 nas(df::DataFrame, dims) = 
     DataFrame([nas(x, dims) for x in df.columns], colnames(df)) 

diff --git a/src/extras.jl b/src/extras.jl
@@ -96,7 +96,7 @@ function cut{S, T}(x::Vector{S}, breaks::Vector{T})
     if breaks[end] < max_x
         push!(breaks, max_x)
     end
-    refs = fill(POOLED_DATA_VEC_REF_CONVERTER(0), length(x))
+    refs = fill(zero(DEFAULT_POOLED_REF_TYPE), length(x))
     for i in 1:length(x)
         if x[i] == min_x
             refs[i] = 1
@@ -116,7 +116,7 @@ function cut{S, T}(x::Vector{S}, breaks::Vector{T})
     for i in 2:(n - 1)
         pool[i] = string("(", from[i], ",", to[i], "]")
     end
-    PooledDataArray(refs, pool)
+    PooledDataArray(RefArray(refs), pool)
 end
 cut(x::Vector, ngroups::Int) = cut(x, quantile(x, [1 : ngroups - 1] / ngroups))
 

diff --git a/src/indexing.jl b/src/indexing.jl
@@ -329,17 +329,17 @@ end
 maxShowLength(v::IndexedVector) = length(v) > 0 ? max([length(_string(x)) for x = v.x]) : 0
 
 # Methods to speed up grouping and merging
-function PooledDataArray(d::IndexedVector)
-    refs = zeros(POOLED_DATA_VEC_REF_TYPE, size(d))
-    oneval = one(POOLED_DATA_VEC_REF_TYPE)
+function PooledDataArray{R}(d::IndexedVector, ::Type{R})
+    refs = zeros(R, size(d))
+    oneval = one(R)
     local idx::Int
     ## local lastval::T
-    local poolidx::POOLED_DATA_VEC_REF_TYPE
+    local poolidx::R
     pool = Array(eltype(d), 0)
     # skip over NAs
     nna = length(d) - length(d.x)
     if nna == length(d)
-        return PooledDataArray(refs, pool)
+        return PooledDataArray(RefArray(refs), pool)
     end
     lastval = d.x[d.idx[nna+1]]
     push!(pool, d.x[d.idx[nna+1]])
@@ -354,8 +354,9 @@ function PooledDataArray(d::IndexedVector)
         end
         refs[idx] = poolidx
     end
-    return PooledDataArray(refs, pool)
+    return PooledDataArray(RefArray(refs), pool)
 end
+PooledDataArray(d::IndexedVector) = PooledDataArray(d, DEFAULT_POOLED_REF_TYPE)  # convenience method: default ref type
 
 DataArray(d::IndexedVector) = DataArray(x.x)
 

diff --git a/src/merge.jl b/src/merge.jl
@@ -95,10 +95,10 @@ function PooledDataVecs(df1::AbstractDataFrame,
         ngroups = ngroups * (length(dv1.pool) + 1)
     end
     pool = [1:ngroups]
-    (PooledDataArray(refs1, pool), PooledDataArray(refs2, pool))
+    (PooledDataArray(RefArray(refs1), pool), PooledDataArray(RefArray(refs2), pool))
 end
 
-function PooledDataArray(df::AbstractDataFrame)
+function PooledDataArray{R}(df::AbstractDataFrame, ::Type{R})
     # This method exists to allow another way for merge to work with
     # multiple columns. It takes the columns of the DataFrame and
     # returns a DataArray with a merged pool that "keys" the
@@ -107,7 +107,7 @@ function PooledDataArray(df::AbstractDataFrame)
     #   - I skipped the sort to make it faster.
     #   - Converting each individual one-row DataFrame to a Tuple
     #     might be faster.
-    refs = zeros(POOLED_DATA_VEC_REF_TYPE, nrow(df))
+    refs = zeros(R, nrow(df))
     poolref = Dict{AbstractDataFrame, Int}()
     pool = Array(Uint64, 0)
     j = 1
@@ -122,8 +122,9 @@ function PooledDataArray(df::AbstractDataFrame)
             j += 1
         end
     end
-    return PooledDataArray(refs, pool)
+    return PooledDataArray(RefArray(refs), pool)
 end
+PooledDataArray(df::AbstractDataFrame) = PooledDataArray(df, DEFAULT_POOLED_REF_TYPE)  # convenience method: default ref type
 
 function merge(df1::AbstractDataFrame, df2::AbstractDataFrame, bycol, jointype)