From 21899d9c4238e0108c4d3e11eb82579dd2358500 Mon Sep 17 00:00:00 2001 From: abieler Date: Sun, 30 Apr 2017 21:26:47 +0200 Subject: [PATCH 01/18] Initial commit --- src/MLPreprocessing.jl | 17 +++-- src/fixedrange.jl | 154 +++++++++++++++++++++++++++++++++++++++++ src/standardize.jl | 124 +++++++++++++++++++++++++++++++++ 3 files changed, 291 insertions(+), 4 deletions(-) create mode 100644 src/fixedrange.jl create mode 100644 src/standardize.jl diff --git a/src/MLPreprocessing.jl b/src/MLPreprocessing.jl index 5ed2d8a..6ee7870 100644 --- a/src/MLPreprocessing.jl +++ b/src/MLPreprocessing.jl @@ -13,16 +13,25 @@ export expand_poly, center!, - rescale!, + standardize!, + fixedrange!, - FeatureNormalizer, + StandardScaler, + FixedRangeScaler, fit, - predict, - predict! + transform, + transform! + + #= rescale!, =# + #= FeatureNormalizer, =# + #= predict, =# + #= predict! =# include("basis_expansion.jl") include("center.jl") include("rescale.jl") include("featurenormalizer.jl") +include("standardize.jl") +include("fixedrange.jl") end # module diff --git a/src/fixedrange.jl b/src/fixedrange.jl new file mode 100644 index 0000000..5ffd7c3 --- /dev/null +++ b/src/fixedrange.jl @@ -0,0 +1,154 @@ +""" + lower, upper, xmin, xmax = fixedrange!(X[, lower, upper, xmin, xmax; obsdim]) + +Rescale `X` to the interval (lower:upper) along `obsdim`. If `upper` and `lower` are not +provided they default to 0 and 1 respectively, rescaling the data to the unit range (0:1). +`xmin` and `xmax` are vectors consisiting of the maximum and minimum values of `X` along obsdim. +`xmin`, `xmax` default to minimum(X, obsdim) and maximum(X, obsdim) respectively. +`obsdim` refers to the dimension of observations, e.g. `obsdim`=1 if the rows of `X` correspond to +measurements. `obsdim`=2 if columns of `X` represent measurements. 
+ +Examples: + + X = rand(10, 4) + + fixedrange!(X, obsdim=1) + fixedrange!(X, -1, 1, obsdim=2) + +""" + +function fixedrange!(X; obsdim=LearnBase.default_obsdim(X)) + fixedrange!(X, convert(ObsDimension, obsdim)) +end + +function fixedrange!{T,N}(X::AbstractArray{T,N}, ::ObsDim.Last) + fixedrange!(X, ObsDim.Constant{N}()) +end + +function fixedrange!{M}(X, obsdim::ObsDim.Constant{M}) + lower = 0 + upper = 1 + xmin = minimum(X, M) + xmax = maximum(X, M) + fixedrange!(X, lower, upper, xmin, xmax, obsdim) +end + +function fixedrange!(X, lower, upper; obsdim=LearnBase.default_obsdim(X)) + fixedrange!(X, lower, upper, convert(ObsDimension, obsdim)) +end + +function fixedrange!{M}(X, lower, upper, obsdim::ObsDim.Constant{M}) + xmin = minimum(X, M) + xmax = maximum(X, M) + fixedrange!(X, lower, upper, xmin, xmax, obsdim) +end + +function fixedrange!(X, lower, upper, xmin, xmax; obsdim=LearnBase.default_obsdim(X)) + fixedrange!(X, lower, upper, xmin, xmax, convert(ObsDimension, obsdim)) +end + +function fixedrange!(X::AbstractMatrix, lower, upper, xmin, xmax, ::ObsDim.Constant{1}) + xrange = xmax .- xmin + scale = upper - lower + nObs, nVars = size(X) + + for iVar in 1:nVars + @inbounds for iObs in 1:nObs + X[iObs, iVar] = lower + (X[iObs, iVar] - xmin[iVar]) / xrange[iVar] * scale + end + end + lower, upper, xmin, xmax +end + +function fixedrange!(X::AbstractMatrix, lower, upper, xmin, xmax, ::ObsDim.Constant{2}) + xrange = xmax .- xmin + scale = upper - lower + nVars, nObs = size(X) + + for iObs in 1:nObs + @inbounds for iVar in 1:nVars + X[iVar, iObs] = lower + (X[iVar, iObs] - xmin[iVar]) / xrange[iVar] * scale + end + end + lower, upper, xmin, xmax +end + +function fixedrange!{N}(X::AbstractVector, lower::Real, upper::Real, xmin::Vector, xmax::Vector, ::ObsDim.Constant{N}) + @assert length(xmin) == length(xmax) == length(X) + xrange = xmax .- xmin + scale = upper - lower + nVars = length(X) + + @inbounds for iVar in eachindex(X) + X[iVar] = lower + (X[iVar] - 
xmin[iVar]) / xrange[iVar] * scale + end + lower, upper, xmin, xmax +end + +function fixedrange!{N}(X::AbstractVector, lower::Real, upper::Real, xmin::Real, xmax::Real, ::ObsDim.Constant{N}) + xrange = xmax - xmin + scale = upper - lower + nVars = length(X) + + @inbounds for iVar in eachindex(X) + X[iVar] = lower + (X[iVar] - xmin) / xrange * scale + end + lower, upper, xmin, xmax +end + + +immutable FixedRangeScaler + lower::Float64 + upper::Float64 + xmin::Vector + xmax::Vector + obsdim::ObsDim.Constant{} + + function FixedRangeScaler(lower, upper, xmin, xmax, obsdim) + @assert length(xmin) == length(xmax) + new(lower, upper, xmin, xmax, convert(ObsDimension, obsdim)) + end +end + + +function FixedRangeScaler{T<:Real}(X::AbstractArray{T}; obsdim=LearnBase.default_obsdim(X)) + FixedRangeScaler(X, convert(ObsDimension, obsdim)) +end + +function FixedRangeScaler{T<:Real,M}(X::AbstractArray{T,M}, ::ObsDim.Last) + FixedRangeScaler(X, ObsDim.Constant{M}()) +end + +function FixedRangeScaler{T<:Real,M}(X::AbstractArray{T}, obsdim::ObsDim.Constant{M}) + FixedRangeScaler(0, 1, vec(minimum(X, M)), vec(maximum(X, M)), obsdim) +end + +function FixedRangeScaler{T<:Real}(X::AbstractArray{T}, lower, upper; obsdim=LearnBase.default_obsdim(X)) + FixedRangeScaler(X, lower, upper, convert(ObsDimension, obsdim)) +end + +function FixedRangeScaler{T<:Real,M}(X::AbstractMatrix{T}, lower, upper, obsdim::ObsDim.Constant{M}) + FixedRangeScaler(lower, upper, vec(minimum(X, M)), vec(maximum(X, M)), obsdim) +end + +function FixedRangeScaler{T<:Real,M}(X::AbstractArray{T,M}, lower, upper, ::ObsDim.Last) + FixedRangeScaler(X, lower, upper, ObsDim.Constant{M}()) +end + +function StatsBase.fit{T<:Real}(::Type{FixedRangeScaler}, X::AbstractArray{T}; obsdim=LearnBase.default_obsdim(X)) + FixedRangeScaler(X, obsdim=obsdim) +end + +function StatsBase.fit{T<:Real}(::Type{FixedRangeScaler}, X::AbstractArray{T}, lower, upper; obsdim=LearnBase.default_obsdim(X)) + FixedRangeScaler(X, lower, upper, 
obsdim=obsdim) +end + +function transform!{T<:AbstractFloat}(X::AbstractArray{T}, cs::FixedRangeScaler) + unitrange!(X, cs.lower, cs.upper, cs.xmin, cs.xmax, cs.obsdim) +end + +function transform{T<:AbstractFloat}(X::AbstractArray{T}, cs::FixedRangeScaler) + Xnew = copy(X) + unitrange!(Xnew, cs.lower, cs.upper, cs.xmin, cs.xmax, cs.obsdim) + Xnew +end diff --git a/src/standardize.jl b/src/standardize.jl new file mode 100644 index 0000000..56000a2 --- /dev/null +++ b/src/standardize.jl @@ -0,0 +1,124 @@ +""" + μ, σ = standardize!(X[, μ, σ, obsdim]) + +Center `X` along `obsdim` around the corresponding entry in the +vector `μ` and then standardize each feature using the corresponding +entry in the vector `σ`. +""" +function standardize!(X, μ, σ; obsdim=LearnBase.default_obsdim(X)) + standardize!(X, μ, σ, convert(ObsDimension, obsdim)) +end + +function standardize!{T,N}(X::AbstractArray{T,N}, μ, σ, ::ObsDim.Last) + standardize!(X, μ, σ, ObsDim.Constant{N}()) +end + +function standardize!(X; obsdim=LearnBase.default_obsdim(X)) + standardize!(X, convert(ObsDimension, obsdim)) +end + +function standardize!{T,N}(X::AbstractArray{T,N}, ::ObsDim.Last) + standardize!(X, ObsDim.Constant{N}()) +end + +function standardize!{T,N,M}(X::AbstractArray{T,N}, obsdim::ObsDim.Constant{M}) + μ = vec(mean(X, M)) + σ = vec(std(X, M)) + standardize!(X, μ, σ, obsdim) +end + +function standardize!(X::AbstractVector, ::ObsDim.Constant{1}) + μ = mean(X) + σ = std(X) + for i in 1:length(X) + X[i] = (X[i] - μ) / σ + end + μ, σ +end + +function standardize!(X::AbstractMatrix, μ::AbstractVector, σ::AbstractVector, ::ObsDim.Constant{2}) + σ[σ .== 0] = 1 + nVars, nObs = size(X) + for iObs in 1:nObs + @inbounds for iVar in 1:nVars + X[iVar, iObs] = (X[iVar, iObs] - μ[iVar]) / σ[iVar] + end + end + μ, σ +end + +function standardize!(X::AbstractMatrix, μ::AbstractVector, σ::AbstractVector, ::ObsDim.Constant{1}) + σ[σ .== 0] = 1 + nObs, nVars = size(X) + for iVar in 1:nVars + @inbounds for iObs in 
1:nObs + X[iObs, iVar] = (X[iObs, iVar] - μ[iVar]) / σ[iVar] + end + end + μ, σ +end + +function standardize!(X::AbstractVector, μ::AbstractVector, σ::AbstractVector, ::ObsDim.Constant{1}) + @inbounds for i in 1:length(X) + X[i] = (X[i] - μ[i]) / σ[i] + end + μ, σ +end + +function standardize!(X::AbstractVector, μ::AbstractFloat, σ::AbstractFloat, ::ObsDim.Constant{1}) + @inbounds for i in 1:length(X) + X[i] = (X[i] - μ) / σ + end + μ, σ +end + +immutable StandardScaler + offset::Vector{Float64} + scale::Vector{Float64} + obsdim::ObsDim.Constant{} + + function StandardScaler(offset, scale, obsdim) + @assert length(offset) == length(scale) + new(offset, scale, convert(ObsDimension, obsdim)) + end +end + +function StandardScaler{T<:Real}(X::AbstractMatrix{T}; obsdim=LearnBase.default_obsdim(X)) + StandardScaler(X, convert(ObsDimension, obsdim)) +end + +function StandardScaler{T<:Real,M}(X::AbstractArray{T,M}, ::ObsDim.Last) + StandardScaler(X, ObsDim.Constant{M}) +end + +function StandardScaler{T<:Real,M}(X::AbstractArray{T,M}, obsdim::ObsDim.Constant{M}) + StandardScaler(mean(X, M), std(X, M), obsdim) +end + +function StandardScaler{T<:Real,M}(X::AbstractArray{T,M}, offset, scale; obsdim=LearnBase.default_obsdim(X)) + StandardScaler(offset, scale, convert(ObsDimension, obsdim)) +end + +function StandardScaler{T<:Real,M}(X::AbstractArray{T,M}, offset, scale, ::ObsDim.Last) + StandardScaler(offset, scale, ObsDim.Constant{M}) +end + +function StatsBase.fit{T<:Real}(::Type{StandardScaler}, X::AbstractMatrix{T}; obsdim=LearnBase.default_obsdim(X)) + StandardScaler(X, obsdim=obsdim) +end + +function transform!{T<:Real}(cs::StandardScaler, X::AbstractMatrix{T}) + @assert length(cs.offset) == size(X, 1) + standardize!(X, cs.offset, cs.scale, obsdim=cs.obsdim) + X +end + +function transform{T<:AbstractFloat}(cs::StandardScaler, X::AbstractMatrix{T}) + Xnew = copy(X) + transform!(cs, Xnew) +end + +function transform{T<:Real}(cs::StandardScaler, X::AbstractMatrix{T}) + X = 
convert(AbstractMatrix{AbstractFloat}, X) + transform!(cs, X) +end From 04ea237ac0fe045fc07e10581fa52452ad2f0f00 Mon Sep 17 00:00:00 2001 From: abieler Date: Mon, 8 May 2017 21:42:19 +0200 Subject: [PATCH 02/18] Add some tests --- src/center.jl | 10 +--- src/fixedrange.jl | 77 +++++++++++-------------- src/standardize.jl | 106 +++++++++++++++++++++++++++-------- test/runtests.jl | 4 +- test/tst_fixedrangescaler.jl | 48 ++++++++++++++++ test/tst_rescale.jl | 28 ++++----- test/tst_standardize.jl | 93 ++++++++++++++++++++++++++++++ 7 files changed, 274 insertions(+), 92 deletions(-) create mode 100644 test/tst_fixedrangescaler.jl create mode 100644 test/tst_standardize.jl diff --git a/src/center.jl b/src/center.jl index 12c7de9..e5f93a0 100644 --- a/src/center.jl +++ b/src/center.jl @@ -92,15 +92,7 @@ end # -------------------------------------------------------------------- function center!(D::AbstractDataFrame) - μ_vec = Float64[] - - flt = Bool[T <: Real for T in eltypes(D)] - for colname in names(D)[flt] - μ = mean(D[colname]) - center!(D, colname, μ) - push!(μ_vec, μ) - end - μ_vec + center!(D, names(D)) end function center!(D::AbstractDataFrame, colnames::AbstractVector{Symbol}) diff --git a/src/fixedrange.jl b/src/fixedrange.jl index 5ffd7c3..a406c6e 100644 --- a/src/fixedrange.jl +++ b/src/fixedrange.jl @@ -47,7 +47,7 @@ function fixedrange!(X, lower, upper, xmin, xmax; obsdim=LearnBase.default_obsdi fixedrange!(X, lower, upper, xmin, xmax, convert(ObsDimension, obsdim)) end -function fixedrange!(X::AbstractMatrix, lower, upper, xmin, xmax, ::ObsDim.Constant{1}) +function fixedrange!(X::AbstractMatrix, lower::Real, upper::Real, xmin::AbstractVector, xmax::AbstractVector, ::ObsDim.Constant{1}) xrange = xmax .- xmin scale = upper - lower nObs, nVars = size(X) @@ -60,7 +60,7 @@ function fixedrange!(X::AbstractMatrix, lower, upper, xmin, xmax, ::ObsDim.Const lower, upper, xmin, xmax end -function fixedrange!(X::AbstractMatrix, lower, upper, xmin, xmax, 
::ObsDim.Constant{2}) +function fixedrange!(X::AbstractMatrix, lower::Real, upper::Real, xmin::AbstractVector, xmax::AbstractVector, ::ObsDim.Constant{2}) xrange = xmax .- xmin scale = upper - lower nVars, nObs = size(X) @@ -73,8 +73,7 @@ function fixedrange!(X::AbstractMatrix, lower, upper, xmin, xmax, ::ObsDim.Const lower, upper, xmin, xmax end -function fixedrange!{N}(X::AbstractVector, lower::Real, upper::Real, xmin::Vector, xmax::Vector, ::ObsDim.Constant{N}) - @assert length(xmin) == length(xmax) == length(X) +function fixedrange!{M}(X::AbstractVector, lower::Real, upper::Real, xmin::AbstractVector, xmax::AbstractVector, ::ObsDim.Constant{M}) xrange = xmax .- xmin scale = upper - lower nVars = length(X) @@ -85,70 +84,58 @@ function fixedrange!{N}(X::AbstractVector, lower::Real, upper::Real, xmin::Vecto lower, upper, xmin, xmax end -function fixedrange!{N}(X::AbstractVector, lower::Real, upper::Real, xmin::Real, xmax::Real, ::ObsDim.Constant{N}) - xrange = xmax - xmin - scale = upper - lower - nVars = length(X) - - @inbounds for iVar in eachindex(X) - X[iVar] = lower + (X[iVar] - xmin) / xrange * scale - end - lower, upper, xmin, xmax +immutable FixedRangeScaler{T<:Real,U<:Real,V<:Real,W<:Real,M} + lower::T + upper::U + xmin::Vector{V} + xmax::Vector{W} + obsdim::ObsDim.Constant{M} end - -immutable FixedRangeScaler - lower::Float64 - upper::Float64 - xmin::Vector - xmax::Vector - obsdim::ObsDim.Constant{} - - function FixedRangeScaler(lower, upper, xmin, xmax, obsdim) - @assert length(xmin) == length(xmax) - new(lower, upper, xmin, xmax, convert(ObsDimension, obsdim)) - end -end - - -function FixedRangeScaler{T<:Real}(X::AbstractArray{T}; obsdim=LearnBase.default_obsdim(X)) +function FixedRangeScaler{T<:Real,N}(X::AbstractArray{T,N}; obsdim=LearnBase.default_obsdim(X)) FixedRangeScaler(X, convert(ObsDimension, obsdim)) end -function FixedRangeScaler{T<:Real,M}(X::AbstractArray{T,M}, ::ObsDim.Last) - FixedRangeScaler(X, ObsDim.Constant{M}()) +function 
FixedRangeScaler{T<:Real,N,M}(X::AbstractArray{T,N}, obsdim::ObsDim.Constant{M}) + FixedRangeScaler(0, 1, vec(minimum(X, M)), vec(maximum(X, M)), obsdim) end -function FixedRangeScaler{T<:Real,M}(X::AbstractArray{T}, obsdim::ObsDim.Constant{M}) - FixedRangeScaler(0, 1, vec(minimum(X, M)), vec(maximum(X, M)), obsdim) +function FixedRangeScaler{T<:Real,N}(X::AbstractArray{T,N}, ::ObsDim.Last) + FixedRangeScaler(X, ObsDim.Constant{N}()) end -function FixedRangeScaler{T<:Real}(X::AbstractArray{T}, lower, upper; obsdim=LearnBase.default_obsdim(X)) +function FixedRangeScaler{T<:Real,N}(X::AbstractArray{T,N}, lower, upper; obsdim=LearnBase.default_obsdim(X)) FixedRangeScaler(X, lower, upper, convert(ObsDimension, obsdim)) end -function FixedRangeScaler{T<:Real,M}(X::AbstractMatrix{T}, lower, upper, obsdim::ObsDim.Constant{M}) +function FixedRangeScaler{T<:Real,N,M}(X::AbstractArray{T,N}, lower, upper, obsdim::ObsDim.Constant{M}) FixedRangeScaler(lower, upper, vec(minimum(X, M)), vec(maximum(X, M)), obsdim) end -function FixedRangeScaler{T<:Real,M}(X::AbstractArray{T,M}, lower, upper, ::ObsDim.Last) - FixedRangeScaler(X, lower, upper, ObsDim.Constant{M}()) +function FixedRangeScaler{T<:Real,N}(X::AbstractArray{T,N}, lower, upper, ::ObsDim.Last) + FixedRangeScaler(X, lower, upper, ObsDim.Constant{N}()) end -function StatsBase.fit{T<:Real}(::Type{FixedRangeScaler}, X::AbstractArray{T}; obsdim=LearnBase.default_obsdim(X)) - FixedRangeScaler(X, obsdim=obsdim) +function StatsBase.fit{T<:Real,N}(::Type{FixedRangeScaler}, X::AbstractArray{T,N}; obsdim=LearnBase.default_obsdim(X)) + FixedRangeScaler(X, convert(ObsDimension, obsdim)) end -function StatsBase.fit{T<:Real}(::Type{FixedRangeScaler}, X::AbstractArray{T}, lower, upper; obsdim=LearnBase.default_obsdim(X)) - FixedRangeScaler(X, lower, upper, obsdim=obsdim) +function StatsBase.fit{T<:Real,N}(::Type{FixedRangeScaler}, X::AbstractArray{T,N}, lower, upper; obsdim=LearnBase.default_obsdim(X)) + FixedRangeScaler(X, lower, upper, 
convert(ObsDimension, obsdim)) end -function transform!{T<:AbstractFloat}(X::AbstractArray{T}, cs::FixedRangeScaler) - unitrange!(X, cs.lower, cs.upper, cs.xmin, cs.xmax, cs.obsdim) +function transform!{T<:AbstractFloat,N}(X::AbstractArray{T,N}, cs::FixedRangeScaler) + fixedrange!(X, cs.lower, cs.upper, cs.xmin, cs.xmax, cs.obsdim) end -function transform{T<:AbstractFloat}(X::AbstractArray{T}, cs::FixedRangeScaler) +function transform{T<:AbstractFloat,N}(X::AbstractArray{T,N}, cs::FixedRangeScaler) Xnew = copy(X) - unitrange!(Xnew, cs.lower, cs.upper, cs.xmin, cs.xmax, cs.obsdim) + fixedrange!(Xnew, cs.lower, cs.upper, cs.xmin, cs.xmax, cs.obsdim) + Xnew +end + +function transform{T<:Real,N}(X::AbstractArray{T,N}, cs::FixedRangeScaler) + Xnew = convert(AbstractArray{Float64, N}, X) + fixedrange!(Xnew, cs.lower, cs.upper, cs.xmin, cs.xmax, cs.obsdim) Xnew end diff --git a/src/standardize.jl b/src/standardize.jl index 56000a2..229a0a6 100644 --- a/src/standardize.jl +++ b/src/standardize.jl @@ -72,53 +72,115 @@ function standardize!(X::AbstractVector, μ::AbstractFloat, σ::AbstractFloat, : μ, σ end -immutable StandardScaler - offset::Vector{Float64} - scale::Vector{Float64} - obsdim::ObsDim.Constant{} - - function StandardScaler(offset, scale, obsdim) - @assert length(offset) == length(scale) - new(offset, scale, convert(ObsDimension, obsdim)) +# -------------------------------------------------------------------- + +function standardize!(D::AbstractDataFrame) + standardize!(D, names(D)) +end + +function standardize!(D::AbstractDataFrame, colnames::AbstractVector{Symbol}) + μ_vec = Float64[] + σ_vec = Float64[] + + for colname in colnames + if eltype(D[colname]) <: Real + μ = mean(D[colname]) + σ = std(D[colname]) + if isna(μ) + warn("Column \"$colname\" contains NA values, skipping rescaling of this column!") + continue + end + standardize!(D, colname, μ, σ) + push!(μ_vec, μ) + push!(σ_vec, σ) + else + warn("Skipping \"$colname\", rescaling only valid for columns of 
type T <: Real.") + end + end + μ_vec, σ_vec +end + +function standardize!(D::AbstractDataFrame, colnames::AbstractVector{Symbol}, μ::AbstractVector, σ::AbstractVector) + for (icol, colname) in enumerate(colnames) + if eltype(D[colname]) <: Real + standardize!(D, colname, μ[icol], σ[icol]) + else + warn("Skipping \"$colname\", rescaling only valid for columns of type T <: Real.") + end end + μ, σ +end + +function standardize!(D::AbstractDataFrame, colname::Symbol, μ::Real, σ::Real) + if sum(isna(D[colname])) > 0 + warn("Column \"$colname\" contains NA values, skipping rescaling on this column!") + else + newcol::Vector{Float64} = convert(Vector{Float64}, D[colname]) + nobs = length(newcol) + @inbounds for i in eachindex(newcol) + newcol[i] = (newcol[i] - μ) / σ + end + D[colname] = newcol + end + μ, σ end -function StandardScaler{T<:Real}(X::AbstractMatrix{T}; obsdim=LearnBase.default_obsdim(X)) + +immutable StandardScaler{T,U,M} + offset::Vector{T} + scale::Vector{U} + obsdim::ObsDim.Constant{M} +end + +function StandardScaler{T<:Real,M}(X::AbstractArray{T,M}; obsdim=LearnBase.default_obsdim(X)) StandardScaler(X, convert(ObsDimension, obsdim)) end +function StandardScaler{T<:Real,N,M}(X::AbstractArray{T,N}, obsdim::ObsDim.Constant{M}) + StandardScaler(vec(mean(X, M)), vec(std(X, M)), obsdim) +end + function StandardScaler{T<:Real,M}(X::AbstractArray{T,M}, ::ObsDim.Last) StandardScaler(X, ObsDim.Constant{M}) end -function StandardScaler{T<:Real,M}(X::AbstractArray{T,M}, obsdim::ObsDim.Constant{M}) - StandardScaler(mean(X, M), std(X, M), obsdim) +function StandardScaler{T<:Real,M}(X::AbstractArray{T,M}, offset, scale, obsdim=LearnBase.default_obsdim(X)) + StandardScaler(offset, scale, convert(ObsDimension, obsdim)) end -function StandardScaler{T<:Real,M}(X::AbstractArray{T,M}, offset, scale; obsdim=LearnBase.default_obsdim(X)) - StandardScaler(offset, scale, convert(ObsDimension, obsdim)) +function StandardScaler{T<:Real,N}(X::AbstractArray{T,N}, offset, scale, 
::ObsDim.Last) + StandardScaler(offset, scale, ObsDim.Constant{N}) +end + +function StandardScaler(D::AbstractDataFrame) + flt_1 = Bool[T <: Real for T in eltypes(D)] + flt_2 = Bool[any(isna(D[colname])) for colname in names(D)] + flt = !(flt_1 | flt_2) + offset = Float64[mean(D[colname]) for colname in names(D)[flt]] + scale = Float64[std(D[colname]) for colname in names(D)[flt]] + StandardScaler(offset, scale, ObsDim.Constant{1}) end -function StandardScaler{T<:Real,M}(X::AbstractArray{T,M}, offset, scale, ::ObsDim.Last) - StandardScaler(offset, scale, ObsDim.Constant{M}) +function StandardScaler(D::AbstractDataFrame, offset, scale) + StandardScaler(offset, scale, ObsDim.Constant{1}) end function StatsBase.fit{T<:Real}(::Type{StandardScaler}, X::AbstractMatrix{T}; obsdim=LearnBase.default_obsdim(X)) StandardScaler(X, obsdim=obsdim) end -function transform!{T<:Real}(cs::StandardScaler, X::AbstractMatrix{T}) - @assert length(cs.offset) == size(X, 1) +function transform!{T<:AbstractFloat,N}(X::AbstractArray{T,N}, cs::StandardScaler) standardize!(X, cs.offset, cs.scale, obsdim=cs.obsdim) X end -function transform{T<:AbstractFloat}(cs::StandardScaler, X::AbstractMatrix{T}) +function transform{T<:AbstractFloat,N}(X::AbstractArray{T,N}, cs::StandardScaler) Xnew = copy(X) - transform!(cs, Xnew) + transform!(Xnew, cs) end -function transform{T<:Real}(cs::StandardScaler, X::AbstractMatrix{T}) - X = convert(AbstractMatrix{AbstractFloat}, X) - transform!(cs, X) +function transform{T<:Real,N}(X::AbstractArray{T,N}, cs::StandardScaler) + Xnew = convert(AbstractArray{Float64, N}, X) + transform!(Xnew, cs) + Xnew end diff --git a/test/runtests.jl b/test/runtests.jl index 00c53a1..3f52c72 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -5,8 +5,8 @@ using Base.Test tests = [ "tst_expand.jl" "tst_center.jl" - "tst_rescale.jl" - "tst_featurenormalizer.jl" + "tst_standardize.jl" + "tst_fixedrangescaler.jl" ] for t in tests diff --git a/test/tst_fixedrangescaler.jl 
b/test/tst_fixedrangescaler.jl new file mode 100644 index 0000000..ed0e214 --- /dev/null +++ b/test/tst_fixedrangescaler.jl @@ -0,0 +1,48 @@ +R = reshape(1:40, 10, 4) +F = convert(Matrix{Float64}, R) +r1 = collect(1:4) +r2 = collect(1:10) +@testset "Array" begin + scaler = fit(FixedRangeScaler, F) + X = transform(F, scaler) + @test mean(X[:,end]) ≈ 1 + @test mean(X[:,1]) ≈ 0 + @test maximum(X) == 1 + @test minimum(X) == 0 + + scaler = fit(FixedRangeScaler, F, obsdim=1) + X = transform(F, scaler) + @test mean(X[1,:]) ≈ 0 + @test mean(X[end,:]) ≈ 1 + @test maximum(X) == 1 + @test minimum(X) == 0 + + scaler = fit(FixedRangeScaler, F, -2, 2) + X = transform(F, scaler) + @test mean(X[:,end]) ≈ 2 + @test mean(X[:,1]) ≈ -2 + @test maximum(X) == 2 + @test minimum(X) == -2 + + scaler = fit(FixedRangeScaler, F, -2, 2, obsdim=1) + X = transform(F, scaler) + @test mean(X[1,:]) ≈ -2 + @test mean(X[end,:]) ≈ 2 + @test maximum(X) == 2 + @test minimum(X) == -2 + + scaler = fit(FixedRangeScaler, R, -2, 2, obsdim=1) + X = transform(R, scaler) + @test mean(X[1,:]) ≈ -2 + @test mean(X[end,:]) ≈ 2 + @test maximum(X) == 2 + @test minimum(X) == -2 + + scaler = fit(FixedRangeScaler, R, -2, 2, obsdim=1) + r = transform(r1, scaler) + @test r == -[2, 6, 10, 14] + + scaler = fit(FixedRangeScaler, R, -2, 2, obsdim=2) + r = transform(r2, scaler) + @test r == -2 * ones(size(R, 1)) +end diff --git a/test/tst_rescale.jl b/test/tst_rescale.jl index ef05bc8..a8c04a3 100644 --- a/test/tst_rescale.jl +++ b/test/tst_rescale.jl @@ -7,76 +7,76 @@ df_na[1, :A] = NA @testset "Array" begin # Rescale Vector xa = copy(e_x) - mu, sigma = rescale!(xa) + mu, sigma = standardize!(xa) @test mu ≈ mean(e_x) @test sigma ≈ std(e_x) @test abs(mean(xa)) <= 10e-10 @test std(xa) ≈ 1 xa = copy(e_x) - mu, sigma = rescale!(xa, mu, sigma) + mu, sigma = standardize!(xa, mu, sigma) @test abs(mean(xa)) <= 10e-10 @test std(xa) ≈ 1 xa = copy(e_x) - mu, sigma = rescale!(xa, mu, sigma, obsdim=1) + mu, sigma = standardize!(xa, mu, 
sigma, obsdim=1) @test abs(mean(xa)) <= 10e-10 @test std(xa) ≈ 1 xa = copy(e_x) mu = copy(e_x) .- 1 sigma = ones(e_x) - mu, sigma = rescale!(xa, mu, sigma, obsdim=1) + mu, sigma = standardize!(xa, mu, sigma, obsdim=1) @test mean(xa) ≈ 1 Xa = copy(e_X) - rescale!(Xa) + standardize!(Xa) @test abs(sum(mean(Xa, 2))) <= 10e-10 @test std(Xa, 2) ≈ [1, 1, 1, 1, 1] Xa = copy(e_X) - rescale!(Xa, obsdim=2) + standardize!(Xa, obsdim=2) @test abs(sum(mean(Xa, 2))) <= 10e-10 @test std(Xa, 2) ≈ [1, 1, 1, 1, 1] Xa = copy(e_X) - rescale!(Xa, obsdim=1) + standardize!(Xa, obsdim=1) @test abs(sum(mean(Xa, 1))) <= 10e-10 Xa = copy(e_X) mu = vec(mean(Xa, 1)) sigma = vec(std(Xa, 1)) - rescale!(Xa, mu, sigma, obsdim=1) + standardize!(Xa, mu, sigma, obsdim=1) @test abs(sum(mean(Xa, 1))) <= 10e-10 Xa = copy(e_X) mu = vec(mean(Xa, 2)) sigma = vec(std(Xa, 2)) - rescale!(Xa, mu, sigma, obsdim=2) + standardize!(Xa, mu, sigma, obsdim=2) @test abs(sum(mean(Xa, 2))) <= 10e-10 end @testset "DataFrame" begin D = copy(df) - mu, sigma = rescale!(D) + mu, sigma = standardize!(D) @test abs(sum([mean(D[colname]) for colname in names(D)[1:2]])) <= 10e-10 @test mean([std(D[colname]) for colname in names(D)[1:2]]) - 1 <= 10e-10 D = copy(df) - mu, sigma = rescale!(D, [:A, :B]) + mu, sigma = standardize!(D, [:A, :B]) @test abs(sum([mean(D[colname]) for colname in names(D)[1:2]])) <= 10e-10 @test mean([std(D[colname]) for colname in names(D)[1:2]]) - 1 <= 10e-10 D = copy(df) mu_check = [mean(D[colname]) for colname in names(D)[1:2]] sigma_check = [std(D[colname]) for colname in names(D)[1:2]] - mu, sigma = rescale!(D, [:A, :B], mu_check, sigma_check) + mu, sigma = standardize!(D, [:A, :B], mu_check, sigma_check) @test abs(sum([mean(D[colname]) for colname in names(D)[1:2]])) <= 10e-10 @test mean([std(D[colname]) for colname in names(D)[1:2]]) - 1 <= 10e-10 # skip columns that contain NA values D = copy(df_na) - mu, sigma = rescale!(D, [:A, :B]) + mu, sigma = standardize!(D, [:A, :B]) @test isna(D[1, :A]) @test 
all(D[2:end, :A] .== df_na[2:end, :A]) @test abs(mean(D[:B])) < 10e-10 @@ -85,7 +85,7 @@ end D = copy(df_na) mu_check = [mean(D[colname]) for colname in names(D)[1:2]] sigma_check = [std(D[colname]) for colname in names(D)[1:2]] - mu, sigma = rescale!(D, [:A, :B], mu_check, sigma_check) + mu, sigma = standardize!(D, [:A, :B], mu_check, sigma_check) #= @test isna(D[1, :A]) =# #= @test all(D[2:end, :A] .== df_na[2:end, :A]) =# #= @test abs(mean(D[:B])) < 10e-10 =# diff --git a/test/tst_standardize.jl b/test/tst_standardize.jl new file mode 100644 index 0000000..202eb34 --- /dev/null +++ b/test/tst_standardize.jl @@ -0,0 +1,93 @@ +e_x = collect(-2:0.5:10) +e_X = expand_poly(e_x, 5) +df = DataFrame(A=rand(10), B=collect(1:10), C=[string(x) for x in 1:10]) +df_na = deepcopy(df) +df_na[1, :A] = NA + +@testset "Array" begin + # Rescale Vector + xa = copy(e_x) + mu, sigma = standardize!(xa) + @test mu ≈ mean(e_x) + @test sigma ≈ std(e_x) + @test abs(mean(xa)) <= 10e-10 + @test std(xa) ≈ 1 + + xa = copy(e_x) + mu, sigma = standardize!(xa, mu, sigma) + @test abs(mean(xa)) <= 10e-10 + @test std(xa) ≈ 1 + + xa = copy(e_x) + mu, sigma = standardize!(xa, mu, sigma, obsdim=1) + @test abs(mean(xa)) <= 10e-10 + @test std(xa) ≈ 1 + + xa = copy(e_x) + mu = copy(e_x) .- 1 + sigma = ones(e_x) + mu, sigma = standardize!(xa, mu, sigma, obsdim=1) + @test mean(xa) ≈ 1 + + Xa = copy(e_X) + standardize!(Xa) + @test abs(sum(mean(Xa, 2))) <= 10e-10 + @test std(Xa, 2) ≈ [1, 1, 1, 1, 1] + + Xa = copy(e_X) + standardize!(Xa, obsdim=2) + @test abs(sum(mean(Xa, 2))) <= 10e-10 + @test std(Xa, 2) ≈ [1, 1, 1, 1, 1] + + Xa = copy(e_X) + standardize!(Xa, obsdim=1) + @test abs(sum(mean(Xa, 1))) <= 10e-10 + + Xa = copy(e_X) + mu = vec(mean(Xa, 1)) + sigma = vec(std(Xa, 1)) + standardize!(Xa, mu, sigma, obsdim=1) + @test abs(sum(mean(Xa, 1))) <= 10e-10 + + Xa = copy(e_X) + mu = vec(mean(Xa, 2)) + sigma = vec(std(Xa, 2)) + standardize!(Xa, mu, sigma, obsdim=2) + @test abs(sum(mean(Xa, 2))) <= 10e-10 +end + 
+#= @testset "DataFrame" begin =# +#= D = copy(df) =# +#= mu, sigma = standardize!(D) =# +#= @test abs(sum([mean(D[colname]) for colname in names(D)[1:2]])) <= 10e-10 =# +#= @test mean([std(D[colname]) for colname in names(D)[1:2]]) - 1 <= 10e-10 =# + +#= D = copy(df) =# +#= mu, sigma = standardize!(D, [:A, :B]) =# +#= @test abs(sum([mean(D[colname]) for colname in names(D)[1:2]])) <= 10e-10 =# +#= @test mean([std(D[colname]) for colname in names(D)[1:2]]) - 1 <= 10e-10 =# + +#= D = copy(df) =# +#= mu_check = [mean(D[colname]) for colname in names(D)[1:2]] =# +#= sigma_check = [std(D[colname]) for colname in names(D)[1:2]] =# +#= mu, sigma = standardize!(D, [:A, :B], mu_check, sigma_check) =# +#= @test abs(sum([mean(D[colname]) for colname in names(D)[1:2]])) <= 10e-10 =# +#= @test mean([std(D[colname]) for colname in names(D)[1:2]]) - 1 <= 10e-10 =# + +#= # skip columns that contain NA values =# +#= D = copy(df_na) =# +#= mu, sigma = standardize!(D, [:A, :B]) =# +#= @test isna(D[1, :A]) =# +#= @test all(D[2:end, :A] .== df_na[2:end, :A]) =# +#= @test abs(mean(D[:B])) < 10e-10 =# +#= @test abs(std(D[:B])) - 1 < 10e-10 =# + +#= D = copy(df_na) =# +#= mu_check = [mean(D[colname]) for colname in names(D)[1:2]] =# +#= sigma_check = [std(D[colname]) for colname in names(D)[1:2]] =# +#= mu, sigma = standardize!(D, [:A, :B], mu_check, sigma_check) =# +#= #1= @test isna(D[1, :A]) =1# =# +#= #1= @test all(D[2:end, :A] .== df_na[2:end, :A]) =1# =# +#= #1= @test abs(mean(D[:B])) < 10e-10 =1# =# +#= #1= @test (abs(std(D[:B])) - 1) < 10e-10 =1# =# +#= end =# From 8114ffed20807704ac57ef3e347e0489f0df0316 Mon Sep 17 00:00:00 2001 From: abieler Date: Fri, 12 May 2017 16:54:24 +0200 Subject: [PATCH 03/18] WIP: start writing tests --- src/MLPreprocessing.jl | 7 +- src/center.jl | 173 ++++++++++++++++---------- src/fixedrange.jl | 267 ++++++++++++++++++++++++++++++++++------- src/scalerange.jl | 31 +++++ src/standardize.jl | 103 ++++++++-------- test/tst_center.jl | 173 
+++++++++++++------------- 6 files changed, 511 insertions(+), 243 deletions(-) create mode 100644 src/scalerange.jl diff --git a/src/MLPreprocessing.jl b/src/MLPreprocessing.jl index 6ee7870..d41129a 100644 --- a/src/MLPreprocessing.jl +++ b/src/MLPreprocessing.jl @@ -26,11 +26,12 @@ export #= FeatureNormalizer, =# #= predict, =# #= predict! =# - + +include("scalerange.jl") include("basis_expansion.jl") include("center.jl") -include("rescale.jl") -include("featurenormalizer.jl") +#include("rescale.jl") +#include("featurenormalizer.jl") include("standardize.jl") include("fixedrange.jl") diff --git a/src/center.jl b/src/center.jl index e5f93a0..b8c9067 100644 --- a/src/center.jl +++ b/src/center.jl @@ -1,138 +1,189 @@ """ - μ = center!(X[, μ, obsdim]) + μ = center!(X[, μ; obsdim, operate_on]) or - μ = center!(D[, colnames, μ]) + μ = center!(D[, μ; operate_on]) where `X` is of type Matrix or Vector and `D` of type DataFrame. -Center `X` along `obsdim` around the corresponding entry in the -vector `μ`. If `μ` is not specified then it defaults to the -feature specific means. +Shift `X` along `obsdim` by `μ` according to X = X - μ -For DataFrames, `obsdim` is obsolete and centering is done column wise. -Instead the vector `colnames` allows to specify which columns to center. -If `colnames` is not provided all columns of type T<:Real are centered. -Example: +`μ` : Vector or value describing the translation. + Defaults to mean(X, 2) + +`obsdim` : Specify which axis corresponds to observations. + Defaults to obsdim=2 (observations are columns of matrix) + For DataFrames `obsdim` is obsolete and centering occurs + column wise. + +`operate_on`: Specify the indices of columns or rows to be centered. + Defaults to all columns/rows. + For DataFrames this must be a vector of symbols, not indices + E.g. 
`operate_on`=[1,3] will perform centering on columns + with index 1 and 3 only (if obsdim=1, else rows 1 and 3) + + +Note on DataFrames: +Columns containing `NA` values are skipped. +Columns containing non numeric elements are skipped. + +Examples: X = rand(4, 100) + x = rand(10) D = DataFrame(A=rand(10), B=collect(1:10), C=[string(x) for x in 1:10]) μ = center!(X, obsdim=2) μ = center!(X, ObsDim.First()) + μ = center!(X, obsdim=1, operate_on=[1,3] + μ = center!(X, [7.0, 8.0], obsdim=1, operate_on=[1,3] μ = center!(D) - μ = center!(D, [:A, :B]) - + μ = center!(D, operate_on=[:A, :B]) + μ = center!(D, [-1,-1], operate_on=[:A, :B]) """ -function center!(X, μ; obsdim=LearnBase.default_obsdim(X)) - center!(X, μ, convert(ObsDimension, obsdim)) + +function center!(X, μ; obsdim=LearnBase.default_obsdim(X), operate_on=default_scalerange(X, convert(ObsDimension, obsdim))) + center!(X, μ, convert(ObsDimension, obsdim), operate_on) end -function center!(X; obsdim=LearnBase.default_obsdim(X)) - center!(X, convert(ObsDimension, obsdim)) +function center!(X; obsdim=LearnBase.default_obsdim(X), operate_on=default_scalerange(X, convert(ObsDimension, obsdim))) + center!(X, convert(ObsDimension, obsdim), operate_on) end -function center!{T,N}(X::AbstractArray{T,N}, μ::AbstractVector, ::ObsDim.Last) - center!(X, μ, ObsDim.Constant{N}()) +function center!{T,N,M}(X::AbstractArray{T,N}, obsdim::ObsDim.Constant{M}; operate_on=default_scalerange(X, convert(ObsDimension, obsdim))) + center!(X, ObsDim.Constant{M}(), operate_on) end -function center!{T,N}(X::AbstractArray{T,N}, ::ObsDim.Last) - center!(X, ObsDim.Constant{N}()) +function center!{T,N}(X::AbstractArray{T,N}, obsdim::ObsDim.Last; operate_on=default_scalerange(X, convert(ObsDimension, obsdim))) + center!(X, ObsDim.Constant{N}(), operate_on) end -function center!{T,N,M}(X::AbstractArray{T,N}, obsdim::ObsDim.Constant{M}) - μ = vec(mean(X, M)) - center!(X, μ, obsdim) +function center!{T,N,M}(X::AbstractArray{T,N}, 
obsdim::ObsDim.Constant{M}, operate_on::AbstractVector) + center!(X, ObsDim.Constant{M}(), operate_on) end -function center!{T}(X::AbstractVector{T}, ::ObsDim.Constant{1}) - μ = mean(X) - for i in 1:length(X) - X[i] = X[i] - μ - end - μ +function center!{T,N}(X::AbstractArray{T,N}, obsdim::ObsDim.Last, operate_on::AbstractVector) + center!(X, ObsDim.Constant{N}(), operate_on) end -function center!(X::AbstractVector, μ::AbstractVector, ::ObsDim.Constant{1}) - @inbounds for i in 1:length(X) - X[i] = X[i] - μ[i] - end - μ +function center!{T,N,M}(X::AbstractArray{T,N}, obsdim::ObsDim.Constant{M}, operate_on::AbstractVector) + μ = vec(mean(X, M))[operate_on] + center!(X, μ, obsdim, operate_on) end -function center!(X::AbstractVector, μ::AbstractFloat, ::ObsDim.Constant{1}) - @inbounds for i in 1:length(X) - X[i] = X[i] - μ - end - μ +function center!{T,N,M}(X::AbstractArray{T,N}, μ::AbstractVector, obsdim::ObsDim.Constant{M}; operate_on=default_scalerange(X, convert(ObsDimension, obsdim))) + center!(X, μ, ObsDim.Constant{M}(), operate_on) +end + +function center!{T,N}(X::AbstractArray{T,N}, μ::AbstractVector, obsdim::ObsDim.Last; operate_on=default_scalerange(X, convert(ObsDimension, obsdim))) + center!(X, μ, ObsDim.Constant{N}(), operate_on) end -function center!(X::AbstractMatrix, μ::AbstractVector, ::ObsDim.Constant{1}) +function center!(X::AbstractMatrix, μ::AbstractVector, ::ObsDim.Constant{1}, operate_on) + @assert length(μ) == length(operate_on) nObs, nVars = size(X) - for iVar in 1:nVars + for (i, iVar) in enumerate(operate_on) @inbounds for iObs in 1:nObs - X[iObs, iVar] = X[iObs, iVar] - μ[iVar] + X[iObs, iVar] = X[iObs, iVar] - μ[i] end end μ end -function center!(X::AbstractMatrix, μ::AbstractVector, ::ObsDim.Constant{2}) +function center!(X::AbstractMatrix, μ::AbstractVector, ::ObsDim.Constant{2}, operate_on) + @assert length(μ) == length(operate_on) nVars, nObs = size(X) for iObs in 1:nObs - @inbounds for iVar in 1:nVars - X[iVar, iObs] = X[iVar, iObs] - 
μ[iVar] + @inbounds for (i, iVar) in enumerate(operate_on) + X[iVar, iObs] = X[iVar, iObs] - μ[i] end end μ end + +function center!(x::AbstractVector; obsdim=LearnBase.default_obsdim(x), operate_on=default_scalerange(x)) + center!(x, convert(ObsDimension, obsdim), operate_on) +end + +function center!{T,M}(x::AbstractVector{T}, ::ObsDim.Constant{M}, operate_on::AbstractVector) + μ = mean(x) + for iVar in operate_on + x[iVar] = x[iVar] - μ + end + μ +end + +function center!(x::AbstractVector, μ::AbstractVector, ::ObsDim.Constant{1}, operate_on::AbstractVector) + @assert length(μ) == length(operate_on) + @inbounds for (i, iVar) in enumerate(operate_on) + x[iVar] = x[iVar] - μ[i] + end + μ +end + +function center!(x::AbstractVector, μ::AbstractVector, ::ObsDim.Last, operate_on::AbstractVector) + center!(x, μ, ObsDim.Constant{1}(), operate_on) +end + +function center!(x::AbstractVector, μ::Real, ::ObsDim.Constant{1}, operate_on) + @inbounds for i in operate_on + x[i] = x[i] - μ + end + μ +end + +function center!(x::AbstractVector, μ::Real, ::ObsDim.Last, operate_on) + center!(x, μ, ObsDim.Constant{1}(), operate_on) +end + # -------------------------------------------------------------------- -function center!(D::AbstractDataFrame) - center!(D, names(D)) +function center!(D::AbstractDataFrame; operate_on=default_scalerange(D)) + center!(D, operate_on) end -function center!(D::AbstractDataFrame, colnames::AbstractVector{Symbol}) +function center!(D::AbstractDataFrame, operate_on::AbstractVector{Symbol}) μ_vec = Float64[] - for colname in colnames + for colname in operate_on if eltype(D[colname]) <: Real μ = mean(D[colname]) if isna(μ) - warn("Column \"$colname\" contains NA values, skipping rescaling of this column!") + warn("Skipping \"$colname\" because it contains NA values") continue end - center!(D, colname, μ) + center!(D, μ, colname) push!(μ_vec, μ) else - warn("Skipping \"$colname\", centering only valid for columns of type T <: Real.") + warn("Skipping 
\"$colname\" because data is not of type T <: Real.") end end μ_vec end -function center!(D::AbstractDataFrame, colnames::AbstractVector{Symbol}, μ::AbstractVector) - for (icol, colname) in enumerate(colnames) +function center!(D::AbstractDataFrame, μ::AbstractVector; operate_on=default_scalerange(D)) + center!(D, μ, operate_on) +end + +function center!(D::AbstractDataFrame, μ::AbstractVector, operate_on::AbstractVector{Symbol}) + for (icol, colname) in enumerate(operate_on) if eltype(D[colname]) <: Real - center!(D, colname, μ[icol]) + center!(D, μ[icol], colname) else - warn("Skipping \"$colname\", centering only valid for columns of type T <: Real.") + warn("Skipping \"$colname\" because data is not of type T <: Real.") end end μ end -function center!(D::AbstractDataFrame, colname::Symbol, μ) +function center!(D::AbstractDataFrame, μ::Real, colname::Symbol) if sum(isna(D[colname])) > 0 - warn("Column \"$colname\" contains NA values, skipping centering on this column!") + warn("Skipping \"$colname\" because it contains NA values") else newcol::Vector{Float64} = convert(Vector{Float64}, D[colname]) - nobs = length(newcol) - @inbounds for i in eachindex(newcol) - newcol[i] -= μ - end + center!(newcol, μ) D[colname] = newcol end μ diff --git a/src/fixedrange.jl b/src/fixedrange.jl index a406c6e..53bc7f5 100644 --- a/src/fixedrange.jl +++ b/src/fixedrange.jl @@ -15,127 +15,302 @@ Examples: fixedrange!(X, obsdim=1) fixedrange!(X, -1, 1, obsdim=2) + +where `X` is of type Matrix or Vector and `D` of type DataFrame. + +Center `X` along `obsdim` around the corresponding entry in the +vector `μ`. + + +`μ` : Vector or value describing the center. + Defaults to mean(X, 2) + +`obsdim` : Specify which axis corresponds to observations. + Defaults to obsdim=2 (observations are columns of matrix) + For DataFrames `obsdim` is obsolete and centering occurs + column wise. + +`operate_on`: Specify the indices of columns or rows to be centered. + Defaults to all columns/rows. 
+ For DataFrames this must be a vector of symbols, not indices + E.g. `operate_on`=[1,3] will perform centering on columns + with index 1 and 3 only (if obsdim=1, else rows 1 and 3) + + +Note on DataFrames: +Columns containing `NA` values are skipped. +Columns containing non numeric elements are skipped. + +Examples: + + X = rand(4, 100) + x = rand(10) + D = DataFrame(A=rand(10), B=collect(1:10), C=[string(x) for x in 1:10]) + + μ = center!(X, obsdim=2) + μ = center!(X, ObsDim.First()) + μ = center!(X, obsdim=1, operate_on=[1,3] + μ = center!(X, [7.0, 8.0], obsdim=1, operate_on=[1,3] + μ = center!(D) + μ = center!(D, operate_on=[:A, :B]) + μ = center!(D, [-1,-1], operate_on=[:A, :B]) """ -function fixedrange!(X; obsdim=LearnBase.default_obsdim(X)) - fixedrange!(X, convert(ObsDimension, obsdim)) +function fixedrange!(X; obsdim=LearnBase.default_obsdim(X), operate_on=default_scalerange(X, convert(ObsDimension, obsdim))) + fixedrange!(X, convert(ObsDimension, obsdim), operate_on) end -function fixedrange!{T,N}(X::AbstractArray{T,N}, ::ObsDim.Last) - fixedrange!(X, ObsDim.Constant{N}()) +function fixedrange!{T,N}(X::AbstractArray{T,N}, ::ObsDim.Last, operate_on) + fixedrange!(X, ObsDim.Constant{N}(), operate_on) end -function fixedrange!{M}(X, obsdim::ObsDim.Constant{M}) +function fixedrange!{M}(X, obsdim::ObsDim.Constant{M}, operate_on) lower = 0 upper = 1 - xmin = minimum(X, M) - xmax = maximum(X, M) - fixedrange!(X, lower, upper, xmin, xmax, obsdim) + xmin = minimum(X, M)[operate_on] + xmax = maximum(X, M)[operate_on] + fixedrange!(X, lower, upper, xmin, xmax, obsdim, operate_on) end -function fixedrange!(X, lower, upper; obsdim=LearnBase.default_obsdim(X)) - fixedrange!(X, lower, upper, convert(ObsDimension, obsdim)) +function fixedrange!(X, lower, upper; obsdim=LearnBase.default_obsdim(X), operate_on=default_scalerange(X, convert(ObsDimension, obsdim))) + fixedrange!(X, lower, upper, convert(ObsDimension, obsdim), operate_on) end -function fixedrange!{M}(X, lower, 
upper, obsdim::ObsDim.Constant{M}) - xmin = minimum(X, M) - xmax = maximum(X, M) - fixedrange!(X, lower, upper, xmin, xmax, obsdim) +function fixedrange!{M}(X, lower, upper, obsdim::ObsDim.Constant{M}, operate_on) + xmin = minimum(X, M)[operate_on] + xmax = maximum(X, M)[operate_on] + fixedrange!(X, lower, upper, xmin, xmax, obsdim, operate_on) end -function fixedrange!(X, lower, upper, xmin, xmax; obsdim=LearnBase.default_obsdim(X)) - fixedrange!(X, lower, upper, xmin, xmax, convert(ObsDimension, obsdim)) +function fixedrange!{T,M}(X::AbstractArray{T,M}, lower, upper, obsdim::ObsDim.Last, operate_on) + fixedrange!(X, lower, upper, ObsDim.Constant{M}(), operate_on) end -function fixedrange!(X::AbstractMatrix, lower::Real, upper::Real, xmin::AbstractVector, xmax::AbstractVector, ::ObsDim.Constant{1}) +function fixedrange!(X, lower, upper, xmin, xmax; obsdim=LearnBase.default_obsdim(X), operate_on=default_scalerange(X, convert(ObsDimension, obsdim))) + fixedrange!(X, lower, upper, xmin, xmax, convert(ObsDimension, obsdim), operate_on) +end + +function fixedrange!(X::AbstractMatrix, lower::Real, upper::Real, xmin::AbstractVector, xmax::AbstractVector, ::ObsDim.Constant{1}, operate_on::AbstractVector) + @assert length(xmin) == length(xmax) == length(operate_on) xrange = xmax .- xmin scale = upper - lower nObs, nVars = size(X) - for iVar in 1:nVars + for (i, iVar) in enumerate(operate_on) @inbounds for iObs in 1:nObs - X[iObs, iVar] = lower + (X[iObs, iVar] - xmin[iVar]) / xrange[iVar] * scale + X[iObs, iVar] = lower + (X[iObs, iVar] - xmin[i]) / xrange[i] * scale end end lower, upper, xmin, xmax end -function fixedrange!(X::AbstractMatrix, lower::Real, upper::Real, xmin::AbstractVector, xmax::AbstractVector, ::ObsDim.Constant{2}) +function fixedrange!(X::AbstractMatrix, lower::Real, upper::Real, xmin::AbstractVector, xmax::AbstractVector, ::ObsDim.Constant{2}, operate_on::AbstractVector) + @assert length(xmin) == length(xmax) == length(operate_on) xrange = xmax .- xmin 
scale = upper - lower nVars, nObs = size(X) for iObs in 1:nObs - @inbounds for iVar in 1:nVars - X[iVar, iObs] = lower + (X[iVar, iObs] - xmin[iVar]) / xrange[iVar] * scale + @inbounds for (i, iVar) in enumerate(operate_on) + X[iVar, iObs] = lower + (X[iVar, iObs] - xmin[i]) / xrange[i] * scale end end lower, upper, xmin, xmax end -function fixedrange!{M}(X::AbstractVector, lower::Real, upper::Real, xmin::AbstractVector, xmax::AbstractVector, ::ObsDim.Constant{M}) +function fixedrange!{T,M}(X::AbstractArray{T,M}, lower::Real, upper::Real, xmin::Real, xmax::Real, ::ObsDim.Last, operate_on::AbstractVector) + fixedrange!(X, lower, upper, xmin, xmax, ObsDim.Constant{M}(), operate_on) +end + +function fixedrange!{M}(x::AbstractVector, lower::Real, upper::Real, xmin::AbstractVector, xmax::AbstractVector, ::ObsDim.Constant{M}, operate_on::AbstractVector) + @assert length(xmin) == length(xmax) == length(operate_on) xrange = xmax .- xmin scale = upper - lower - nVars = length(X) + nVars = length(x) + @inbounds for (i, iVar) in enumerate(operate_on) + x[iVar] = lower + (x[iVar] - xmin[i]) / xrange[i] * scale + end + lower, upper, xmin, xmax +end + +function fixedrange!(x::AbstractVector, lower::Real, upper::Real, xmin::AbstractVector, xmax::AbstractVector, ::ObsDim.Last, operate_on::AbstractVector) + fixedrange!(x, lower, upper, xmin, xmax, ObsDim.Constant{1}(), operate_on) +end - @inbounds for iVar in eachindex(X) - X[iVar] = lower + (X[iVar] - xmin[iVar]) / xrange[iVar] * scale +function fixedrange!(x::AbstractVector, lower::Real, upper::Real, xmin::Real, xmax::Real) + xrange = xmax - xmin + scale = upper - lower + n = length(x) + @inbounds for i in 1:n + x[i] = lower + (x[i] - xmin) / xrange * scale end lower, upper, xmin, xmax end -immutable FixedRangeScaler{T<:Real,U<:Real,V<:Real,W<:Real,M} +# -------------------------------------------------------------------- + +function fixedrange!(D::AbstractDataFrame; operate_on=default_scalerange(D)) + fixedrange!(D, 0, 1, 
operate_on) +end + +function fixedrange!(D::AbstractDataFrame, lower, upper; operate_on=default_scalerange(D)) + fixedrange!(D, lower, upper, operate_on) +end + +function fixedrange!(D::AbstractDataFrame, lower::Real, upper::Real, operate_on::AbstractArray) + xmin = Float64[] + xmax = Float64[] + + for colname in operate_on + if eltype(D[colname]) <: Real + minval = minimum(D[colname]) + maxval = maximum(D[colname]) + if isna(minval) + warn("Skipping \"$colname\" because it contains NA values") + continue + end + fixedrange!(D, lower, upper, minval, maxval, colname) + push!(xmin, minval) + push!(xmax, maxval) + else + warn("Skipping \"$colname\" because data is not of type T <: Real.") + end + end + lower, upper, xmin, xmax +end + +function fixedrange!(D::AbstractDataFrame, lower, upper, xmin, xmax; operate_on=default_scalerange(D)) + fixedrange!(D, lower, upper, xmin, xmax, operate_on) +end + +function fixedrange!(D::AbstractDataFrame, lower::Real, upper::Real, xmin::AbstractArray, xmax::AbstractArray, operate_on::AbstractVector) + @assert length(xmin) == length(xmax) == length(operate_on) + for (iVar, colname) in enumerate(operate_on) + fixedrange!(D, lower, upper, xmin[iVar], xmax[iVar], colname) + end + lower, upper, xmin, xmax, operate_on +end + +function fixedrange!(D::AbstractDataFrame, lower::Real, upper::Real, xmin::Real, xmax::Real, colname::Symbol) + if any(isna(D[colname])) | !(eltype(D[colname]) <: Real) + warn("Skipping \"$colname\" because it contains NA values or is not of type <: Real") + else + newcol::Vector{Float64} = convert(Vector{Float64}, D[colname]) + fixedrange!(newcol, lower, upper, xmin, xmax) + D[colname] = newcol + end + lower, upper, xmin, xmax, colname +end + +immutable FixedRangeScaler{T<:Real,U<:Real,V<:Real,W<:Real,M,I} lower::T upper::U xmin::Vector{V} xmax::Vector{W} obsdim::ObsDim.Constant{M} + operate_on::Vector{I} +end + +function FixedRangeScaler{T<:Real,N}(X::AbstractArray{T,N}; obsdim=LearnBase.default_obsdim(X), 
operate_on=default_scalerange(X, convert(ObsDimension, obsdim))) + FixedRangeScaler(X, convert(ObsDimension, obsdim), operate_on) +end + +function FixedRangeScaler{T<:Real,N,M}(X::AbstractArray{T,N}, obsdim::ObsDim.Constant{M}, operate_on) + xmin = vec(minimum(X, M))[operate_on] + xmax = vec(maximum(X, M))[operate_on] + FixedRangeScaler(0, 1, xmin, xmax, obsdim, operate_on) +end + +function FixedRangeScaler{T<:Real,N}(X::AbstractArray{T,N}, ::ObsDim.Last, operate_on) + FixedRangeScaler(X, ObsDim.Constant{N}(), operate_on) end -function FixedRangeScaler{T<:Real,N}(X::AbstractArray{T,N}; obsdim=LearnBase.default_obsdim(X)) - FixedRangeScaler(X, convert(ObsDimension, obsdim)) +function FixedRangeScaler{T<:Real,N}(X::AbstractArray{T,N}, lower, upper; obsdim=LearnBase.default_obsdim(X), operate_on=default_scalerange(X, convert(ObsDimension, obsdim))) + FixedRangeScaler(X, lower, upper, convert(ObsDimension, obsdim), operate_on) end -function FixedRangeScaler{T<:Real,N,M}(X::AbstractArray{T,N}, obsdim::ObsDim.Constant{M}) - FixedRangeScaler(0, 1, vec(minimum(X, M)), vec(maximum(X, M)), obsdim) +function FixedRangeScaler{T<:Real,N,M}(X::AbstractArray{T,N}, lower, upper, obsdim::ObsDim.Constant{M}, operate_on) + xmin = vec(minimum(X, M))[operate_on] + xmax = vec(maximum(X, M))[operate_on] + FixedRangeScaler(lower, upper, xmin, xmax, obsdim, operate_on) end -function FixedRangeScaler{T<:Real,N}(X::AbstractArray{T,N}, ::ObsDim.Last) - FixedRangeScaler(X, ObsDim.Constant{N}()) +function FixedRangeScaler{T<:Real,N}(X::AbstractArray{T,N}, lower, upper, ::ObsDim.Last, operate_on) + FixedRangeScaler(X, lower, upper, ObsDim.Constant{N}(), operate_on) end -function FixedRangeScaler{T<:Real,N}(X::AbstractArray{T,N}, lower, upper; obsdim=LearnBase.default_obsdim(X)) - FixedRangeScaler(X, lower, upper, convert(ObsDimension, obsdim)) +function FixedRangeScaler(D::AbstractDataFrame; operate_on=default_scalerange(D)) + FixedRangeScaler(D, 0, 1, operate_on) end -function 
FixedRangeScaler{T<:Real,N,M}(X::AbstractArray{T,N}, lower, upper, obsdim::ObsDim.Constant{M}) - FixedRangeScaler(lower, upper, vec(minimum(X, M)), vec(maximum(X, M)), obsdim) +function FixedRangeScaler(D::AbstractDataFrame, lower::Real, upper::Real; operate_on=default_scalerange(D)) + FixedRangeScaler(D, lower, upper, operate_on) end -function FixedRangeScaler{T<:Real,N}(X::AbstractArray{T,N}, lower, upper, ::ObsDim.Last) - FixedRangeScaler(X, lower, upper, ObsDim.Constant{N}()) +function FixedRangeScaler(D::AbstractDataFrame, lower::Real, upper::Real, operate_on::AbstractVector{Symbol}) + xmin = Float64[] + xmax = Float64[] + for colname in operate_on + push!(xmin, minimum(D[colname])) + push!(xmax, maximum(D[colname])) + end + FixedRangeScaler(lower, upper, xmin, xmax, ObsDim.Constant{1}(), operate_on) +end + +function valid_columns(D::AbstractDataFrame) + valid_colnames = Symbol[] + for colname in names(D) + if (eltype(D[colname]) <: Real) & !(any(isnull(D[colname]))) + push!(valid_colnames, colname) + else + warn("Skipping \"$colname\" because it either contains NA or is not of type <: Real") + end + end + valid_colnames +end + +function StatsBase.fit{T<:Real,N}(::Type{FixedRangeScaler}, X::AbstractArray{T,N}; obsdim=LearnBase.default_obsdim(X), operate_on=default_scalerange(X, convert(ObsDimension, obsdim))) + FixedRangeScaler(X, convert(ObsDimension, obsdim), operate_on) end -function StatsBase.fit{T<:Real,N}(::Type{FixedRangeScaler}, X::AbstractArray{T,N}; obsdim=LearnBase.default_obsdim(X)) - FixedRangeScaler(X, convert(ObsDimension, obsdim)) +function StatsBase.fit{T<:Real,N}(::Type{FixedRangeScaler}, X::AbstractArray{T,N}, lower, upper; obsdim=LearnBase.default_obsdim(X), operate_on=default_scalerange(X, convert(ObsDimension, obsdim))) + FixedRangeScaler(X, lower, upper, convert(ObsDimension, obsdim), operate_on) end -function StatsBase.fit{T<:Real,N}(::Type{FixedRangeScaler}, X::AbstractArray{T,N}, lower, upper; obsdim=LearnBase.default_obsdim(X)) - 
FixedRangeScaler(X, lower, upper, convert(ObsDimension, obsdim)) +function StatsBase.fit(::Type{FixedRangeScaler}, D::AbstractDataFrame; operate_on=default_scalerange(D)) + FixedRangeScaler(D, 0, 1, operate_on) +end + +function StatsBase.fit(::Type{FixedRangeScaler}, D::AbstractDataFrame, lower, upper; operate_on=default_scalerange(D)) + FixedRangeScaler(D, lower, upper, operate_on) end function transform!{T<:AbstractFloat,N}(X::AbstractArray{T,N}, cs::FixedRangeScaler) - fixedrange!(X, cs.lower, cs.upper, cs.xmin, cs.xmax, cs.obsdim) + fixedrange!(X, cs.lower, cs.upper, cs.xmin, cs.xmax, cs.obsdim, cs.operate_on) +end + +function transform!{T<:AbstractFloat}(x::AbstractVector{T}, cs::FixedRangeScaler) + fixedrange!(x, cs.lower, cs.upper, cs.xmin, cs.xmax, cs.obsdim, cs.operate_on) +end + +function transform!(D::AbstractDataFrame, cs::FixedRangeScaler) + fixedrange!(D, cs.lower, cs.upper, cs.xmin, cs.xmax, cs.operate_on) end function transform{T<:AbstractFloat,N}(X::AbstractArray{T,N}, cs::FixedRangeScaler) Xnew = copy(X) - fixedrange!(Xnew, cs.lower, cs.upper, cs.xmin, cs.xmax, cs.obsdim) + fixedrange!(Xnew, cs.lower, cs.upper, cs.xmin, cs.xmax, cs.obsdim, cs.operate_on) Xnew end function transform{T<:Real,N}(X::AbstractArray{T,N}, cs::FixedRangeScaler) Xnew = convert(AbstractArray{Float64, N}, X) - fixedrange!(Xnew, cs.lower, cs.upper, cs.xmin, cs.xmax, cs.obsdim) + fixedrange!(Xnew, cs.lower, cs.upper, cs.xmin, cs.xmax, cs.obsdim, cs.operate_on) Xnew end + +function transform(D::AbstractDataFrame, cs::FixedRangeScaler) + Dnew = deepcopy(D) + fixedrange!(Dnew, cs.lower, cs.upper, cs.xmin, cs.xmax, cs.operate_on) + Dnew +end diff --git a/src/scalerange.jl b/src/scalerange.jl new file mode 100644 index 0000000..3fcefac --- /dev/null +++ b/src/scalerange.jl @@ -0,0 +1,31 @@ +function default_scalerange(X::AbstractMatrix, ::ObsDim.Constant{1}) + collect(1:size(X, 2)) +end + +function default_scalerange(X::AbstractMatrix, ::ObsDim.Constant{2}) + collect(1:size(X, 1)) 
+end + +function default_scalerange(X::AbstractMatrix, ::ObsDim.Last) + collect(1:size(X, 1)) +end + +function default_scalerange(x::AbstractVector) + collect(1:length(x)) +end + +function default_scalerange(x::AbstractVector, ::ObsDim.Last) + collect(1:length(x)) +end + +function default_scalerange{M}(x::AbstractVector, ::ObsDim.Constant{M}) + collect(1:length(x)) +end + +function default_scalerange(D::AbstractDataFrame) + flt1 = Bool[T <: Real for T in eltypes(D)] + flt2 = Bool[any(isna(D[colname])) for colname in names(D)] + flt = (flt1 | !flt2) + names(D)[flt] +end + diff --git a/src/standardize.jl b/src/standardize.jl index 229a0a6..3e4dd7f 100644 --- a/src/standardize.jl +++ b/src/standardize.jl @@ -5,67 +5,67 @@ Center `X` along `obsdim` around the corresponding entry in the vector `μ` and then standardize each feature using the corresponding entry in the vector `σ`. """ -function standardize!(X, μ, σ; obsdim=LearnBase.default_obsdim(X)) - standardize!(X, μ, σ, convert(ObsDimension, obsdim)) +function standardize!(X, μ, σ; obsdim=LearnBase.default_obsdim(X), operate_on=default_scalerange(X, convert(ObsDimension, obsdim))) + standardize!(X, μ, σ, convert(ObsDimension, obsdim), operate_on) end -function standardize!{T,N}(X::AbstractArray{T,N}, μ, σ, ::ObsDim.Last) - standardize!(X, μ, σ, ObsDim.Constant{N}()) +function standardize!{T,N}(X::AbstractArray{T,N}, μ, σ, ::ObsDim.Last, operate_on) + standardize!(X, μ, σ, ObsDim.Constant{N}(), operate_on) end -function standardize!(X; obsdim=LearnBase.default_obsdim(X)) - standardize!(X, convert(ObsDimension, obsdim)) +function standardize!(X; obsdim=LearnBase.default_obsdim(X), operate_on=default_scalerange(X, convert(ObsDimension, obsdim))) + standardize!(X, convert(ObsDimension, obsdim), operate_on) end -function standardize!{T,N}(X::AbstractArray{T,N}, ::ObsDim.Last) - standardize!(X, ObsDim.Constant{N}()) +function standardize!{T,N}(X::AbstractArray{T,N}, ::ObsDim.Last, operate_on) + standardize!(X, 
ObsDim.Constant{N}(), operate_on) end -function standardize!{T,N,M}(X::AbstractArray{T,N}, obsdim::ObsDim.Constant{M}) - μ = vec(mean(X, M)) - σ = vec(std(X, M)) - standardize!(X, μ, σ, obsdim) +function standardize!{T,N,M}(X::AbstractArray{T,N}, obsdim::ObsDim.Constant{M}, operate_on) + μ = vec(mean(X, M))[operate_on] + σ = vec(std(X, M))[operate_on] + standardize!(X, μ, σ, obsdim, operate_on) end -function standardize!(X::AbstractVector, ::ObsDim.Constant{1}) +function standardize!{M}(X::AbstractVector, ::ObsDim.Constant{M}, operate_on) μ = mean(X) σ = std(X) - for i in 1:length(X) + for i in operate_on X[i] = (X[i] - μ) / σ end μ, σ end -function standardize!(X::AbstractMatrix, μ::AbstractVector, σ::AbstractVector, ::ObsDim.Constant{2}) +function standardize!(X::AbstractMatrix, μ::AbstractVector, σ::AbstractVector, ::ObsDim.Constant{2}, operate_on) σ[σ .== 0] = 1 nVars, nObs = size(X) for iObs in 1:nObs - @inbounds for iVar in 1:nVars - X[iVar, iObs] = (X[iVar, iObs] - μ[iVar]) / σ[iVar] + @inbounds for (i, iVar) in enumerate(operate_on) + X[iVar, iObs] = (X[iVar, iObs] - μ[i]) / σ[i] end end μ, σ end -function standardize!(X::AbstractMatrix, μ::AbstractVector, σ::AbstractVector, ::ObsDim.Constant{1}) +function standardize!(X::AbstractMatrix, μ::AbstractVector, σ::AbstractVector, ::ObsDim.Constant{1}, operate_on) σ[σ .== 0] = 1 nObs, nVars = size(X) - for iVar in 1:nVars + for (i, iVar) in enumerate(operate_on) @inbounds for iObs in 1:nObs - X[iObs, iVar] = (X[iObs, iVar] - μ[iVar]) / σ[iVar] + X[iObs, iVar] = (X[iObs, iVar] - μ[i]) / σ[i] end end μ, σ end -function standardize!(X::AbstractVector, μ::AbstractVector, σ::AbstractVector, ::ObsDim.Constant{1}) - @inbounds for i in 1:length(X) - X[i] = (X[i] - μ[i]) / σ[i] +function standardize!{M}(X::AbstractVector, μ::AbstractVector, σ::AbstractVector, ::ObsDim.Constant{M}, operate_on) + @inbounds for (i, iVar) in enumerate(operate_on) + X[iVar] = (X[iVar] - μ[i]) / σ[i] end μ, σ end -function 
standardize!(X::AbstractVector, μ::AbstractFloat, σ::AbstractFloat, ::ObsDim.Constant{1}) +function standardize!{M}(X::AbstractVector, μ::AbstractFloat, σ::AbstractFloat, ::ObsDim.Constant{M}, operate_on) @inbounds for i in 1:length(X) X[i] = (X[i] - μ) / σ end @@ -126,54 +126,57 @@ function standardize!(D::AbstractDataFrame, colname::Symbol, μ::Real, σ::Real) end -immutable StandardScaler{T,U,M} +immutable StandardScaler{T,U,M,I} offset::Vector{T} scale::Vector{U} obsdim::ObsDim.Constant{M} + operate_on::Vector{I} end -function StandardScaler{T<:Real,M}(X::AbstractArray{T,M}; obsdim=LearnBase.default_obsdim(X)) - StandardScaler(X, convert(ObsDimension, obsdim)) +function StandardScaler{T<:Real,M}(X::AbstractArray{T,M}; obsdim=LearnBase.default_obsdim(X), operate_on=default_scalerange(X, convert(ObsDimension, obsdim))) + StandardScaler(X, convert(ObsDimension, obsdim), operate_on) end -function StandardScaler{T<:Real,N,M}(X::AbstractArray{T,N}, obsdim::ObsDim.Constant{M}) - StandardScaler(vec(mean(X, M)), vec(std(X, M)), obsdim) +function StandardScaler{T<:Real,M}(X::AbstractArray{T,M}, ::ObsDim.Last, operate_on) + StandardScaler(X, ObsDim.Constant{M}(), operate_on) end -function StandardScaler{T<:Real,M}(X::AbstractArray{T,M}, ::ObsDim.Last) - StandardScaler(X, ObsDim.Constant{M}) +function StandardScaler{T<:Real,N,M}(X::AbstractArray{T,N}, obsdim::ObsDim.Constant{M}, operate_on) + StandardScaler(vec(mean(X, M))[operate_on], vec(std(X, M))[operate_on], obsdim, operate_on) end -function StandardScaler{T<:Real,M}(X::AbstractArray{T,M}, offset, scale, obsdim=LearnBase.default_obsdim(X)) - StandardScaler(offset, scale, convert(ObsDimension, obsdim)) +function StandardScaler{T<:Real,M}(X::AbstractArray{T,M}, offset, scale; obsdim=LearnBase.default_obsdim(X), operate_on=default_scalerange(X, convert(ObsDimension, obsdim))) + StandardScaler(offset, scale, convert(ObsDimension, obsdim), operate_on) end -function StandardScaler{T<:Real,N}(X::AbstractArray{T,N}, offset, 
scale, ::ObsDim.Last) - StandardScaler(offset, scale, ObsDim.Constant{N}) +function StandardScaler{T<:Real,N}(X::AbstractArray{T,N}, offset, scale, ::ObsDim.Last, operate_on) + StandardScaler(offset, scale, ObsDim.Constant{N}(), operate_on) end -function StandardScaler(D::AbstractDataFrame) - flt_1 = Bool[T <: Real for T in eltypes(D)] - flt_2 = Bool[any(isna(D[colname])) for colname in names(D)] - flt = !(flt_1 | flt_2) - offset = Float64[mean(D[colname]) for colname in names(D)[flt]] - scale = Float64[std(D[colname]) for colname in names(D)[flt]] - StandardScaler(offset, scale, ObsDim.Constant{1}) +function StandardScaler(D::AbstractDataFrame; operate_on=default_scalerange(D)) + offset = Float64[mean(D[colname]) for colname in operate_on] + scale = Float64[std(D[colname]) for colname in operate_on] + StandardScaler(offset, scale, ObsDim.Constant{1}(), operate_on) end -function StandardScaler(D::AbstractDataFrame, offset, scale) - StandardScaler(offset, scale, ObsDim.Constant{1}) +function StandardScaler(D::AbstractDataFrame, offset, scale; operate_on=default_scalerange(D)) + StandardScaler(offset, scale, ObsDim.Constant{1}(), operate_on) end -function StatsBase.fit{T<:Real}(::Type{StandardScaler}, X::AbstractMatrix{T}; obsdim=LearnBase.default_obsdim(X)) - StandardScaler(X, obsdim=obsdim) +function StatsBase.fit{T<:Real}(::Type{StandardScaler}, X::AbstractMatrix{T}; obsdim=LearnBase.default_obsdim(X), operate_on=default_scalerange(X, convert(ObsDimension, obsdim))) + StandardScaler(X, obsdim, operate_on) end function transform!{T<:AbstractFloat,N}(X::AbstractArray{T,N}, cs::StandardScaler) - standardize!(X, cs.offset, cs.scale, obsdim=cs.obsdim) + standardize!(X, cs.offset, cs.scale, cs.obsdim, cs.operate_on) X end +function transform!(D::AbstractDataFrame, cs::StandardScaler) + standardize!(D, cs.operate_on, cs.offset, cs.scale) + D +end + function transform{T<:AbstractFloat,N}(X::AbstractArray{T,N}, cs::StandardScaler) Xnew = copy(X) transform!(Xnew, cs) @@ 
-184,3 +187,9 @@ function transform{T<:Real,N}(X::AbstractArray{T,N}, cs::StandardScaler) transform!(Xnew, cs) Xnew end + +function transform(D::AbstractDataFrame, cs::StandardScaler) + Dnew = copy(D) + transform!(Dnew, cs) + Dnew +end diff --git a/test/tst_center.jl b/test/tst_center.jl index 2f513c0..42e813d 100644 --- a/test/tst_center.jl +++ b/test/tst_center.jl @@ -1,93 +1,94 @@ -e_x = collect(-2:0.5:10) -e_X = expand_poly(e_x, 5) -df = DataFrame(A=rand(10), B=collect(1:10), C=[string(x) for x in 1:10]) -df_na = deepcopy(df) -df_na[1, :A] = NA +X = collect(Float64, reshape(1:40, 10, 4)) +x = rand(10) * 10 + +D = DataFrame(A=rand(10), B=collect(1:10), C=[hex(x) for x in 11:20]) +D_NA = deepcopy(D) +D_NA[1, :A] = NA @testset "Array" begin - # Center Vectors - xa = copy(e_x) - @test center!(xa) ≈ mean(e_x) - @test abs(mean(xa)) <= 10e-10 - - xa = copy(e_x) - mu = mean(xa) - center!(xa, mu, obsdim=1) - @test abs(mean(xa)) <= 10e-10 - - xa = copy(e_x) - mu = vec(ones(xa)) - center!(xa, mu, obsdim=1) - @test sum(e_x .- mean(xa)) ≈ length(mu) - - # Center Matrix w/o mu - Xa = copy(e_X) - center!(Xa) - @test abs(sum(mean(Xa, 2))) <= 10e-10 - - Xa = copy(e_X) - center!(Xa, obsdim=1) - @test abs(sum(mean(Xa, 1))) <= 10e-10 - - Xa = copy(e_X) - center!(Xa, ObsDim.First()) - @test abs(sum(mean(Xa, 1))) <= 10e-10 - - Xa = copy(e_X) - center!(Xa, obsdim=2) - @test abs(sum(mean(Xa, 2))) <= 10e-10 - - Xa = copy(e_X) - center!(Xa, ObsDim.Last()) - @test abs(sum(mean(Xa, 2))) <= 10e-10 - - - # Center Matrix with mu as input - Xa = copy(e_X) - mu = vec(mean(Xa, 1)) - center!(Xa, mu, obsdim=1) - @test abs(sum(mean(Xa, 1))) <= 10e-10 - - Xa = copy(e_X) - mu = vec(mean(Xa, 2)) - center!(Xa, mu, obsdim=2) - @test abs(sum(mean(Xa, 2))) <= 10e-10 - - Xa = copy(e_X) - mu = vec(mean(Xa, 2)) - center!(Xa, mu, ObsDim.Last()) - @test abs(sum(mean(Xa, 2))) <= 10e-10 + XX = deepcopy(X) + mu = center!(XX, obsdim=1) + @test sum(abs(mean(XX, 1))) == 0 + @test all(std(XX, 1) .== std(X, 1)) + 
@test all(mu .== vec(mean(X, 1))) + + XX = deepcopy(X) + mu = center!(XX, ObsDim.First()) + @test sum(abs(mean(XX, 1))) == 0 + @test all(std(XX, 1) .== std(X, 1)) + @test all(mu .== vec(mean(X, 1))) + + XX = deepcopy(X) + mu = center!(XX, ObsDim.Last()) + @test sum(abs(mean(XX, 2))) == 0 + @test all(std(XX, 2) .== std(X, 2)) + @test all(mu .== vec(mean(X, 2))) + + XX = deepcopy(X) + mu = center!(XX) + @test sum(abs(mean(XX, 2))) == 0 + @test all(std(XX, 2) .== std(X, 2)) + @test all(mu .== vec(mean(X, 2))) + + XX = deepcopy(X) + mu = vec(mean(X, 1)) + center!(XX, mu, obsdim=1) + @test sum(abs(mean(XX, 1))) == 0 + @test all(std(XX, 1) .== std(X, 1)) + + XX = deepcopy(X) + mu = vec(mean(X, 1)) + center!(XX, mu, ObsDim.First()) + @test sum(abs(mean(XX, 1))) == 0 + @test all(std(XX, 1) .== std(X, 1)) + + XX = deepcopy(X) + mu = vec(mean(XX, 2)) + center!(XX, mu, obsdim=2) + @test sum(abs(mean(XX, 2))) == 0 + @test all(std(XX, 2) .== std(X, 2)) + + XX = deepcopy(X) + mu = vec(mean(XX, 2)) + center!(XX, mu, ObsDim.Last()) + @test sum(abs(mean(XX, 2))) == 0 + @test all(std(XX, 2) .== std(X, 2)) + + XX = deepcopy(X) + mu = vec(mean(X[:,[1,3]], 1)) + center!(XX, mu, obsdim=1, operate_on=[1, 3]) + @test sum(abs(mean(XX[:,[1,3]], 1))) == 0 + @test all(XX[:,2] .== X[:,2]) + @test all(std(XX, 1) .== std(X, 1)) + + XX = deepcopy(X) + mu = vec(mean(X[[1,3],:], 2)) + center!(XX, mu, obsdim=2, operate_on=[1, 3]) + @test sum(abs(mean(XX[[1,3],:], 2))) == 0 + @test all(XX[2,:] .== X[2,:]) + @test all(std(XX, 2) .== std(X, 2)) + println() + + xx = deepcopy(x) + center!(xx) + @test mean(xx) <= 10e-10 + + xx = deepcopy(x) + mu = mean(xx) + center!(xx, mu) + @test mean(xx) <= 10e-10 + + xx = deepcopy(x) + mu = ones(xx) + center!(xx, mu) + @test mean(xx) - mean(x) ≈ -1 + + xx = deepcopy(x) + mu = ones(xx) + center!(xx, mu) + @test mean(xx) - mean(x) ≈ -1e_x = collect(-2:0.5:10) end @testset "DataFrame" begin # Center DataFrame - D = copy(df) - mu_check = [mean(D[colname]) for colname in 
names(D)[1:2]] - mu = center!(D) - @test length(mu) == 2 - @test abs(sum(mu .- mu_check)) <= 10e-10 - - D = copy(df) - mu_check = [mean(D[colname]) for colname in names(D)[1:2]] - mu = center!(D, [:A, :B]) - @test abs(sum(mu .- mu_check)) <= 10e-10 - - D = copy(df) - mu_check = [mean(D[colname]) for colname in names(D)[1:2]] - mu = center!(D, [:A, :B], mu_check) - @test abs(sum([mean(D[colname]) for colname in names(D)[1:2]])) <= 10e-10 - - # skip columns that contain NA values - D = copy(df_na) - mu = center!(D, [:A, :B]) - @test isna(D[1, :A]) - @test all(D[2:end, :A] .== df_na[2:end, :A]) - @test abs(mean(D[:B])) < 10e-10 - - D = copy(df_na) - mu_check = [mean(D[colname]) for colname in names(D)[1:2]] - mu = center!(D, [:A, :B], mu_check) - @test isna(D[1, :A]) - @test all(D[2:end, :A] .== df_na[2:end, :A]) - @test abs(mean(D[:B])) < 10e-10 end From 838f896265bb80c3b9caeec3c8f6accf7e9ad7fd Mon Sep 17 00:00:00 2001 From: abieler Date: Wed, 31 May 2017 06:04:00 +0200 Subject: [PATCH 04/18] Add tests and docs --- src/MLPreprocessing.jl | 8 +- src/center.jl | 23 ++-- src/fixedrange.jl | 164 +++++++++++++++++------- src/scalerange.jl | 31 ----- src/scaleselection.jl | 31 +++++ src/standardize.jl | 181 ++++++++++++++++++++------ test/runtests.jl | 2 +- test/tst_center.jl | 34 ++++- test/tst_fixedrangescaler.jl | 145 ++++++++++++++------- test/tst_standardize.jl | 240 ++++++++++++++++++++++------------- 10 files changed, 587 insertions(+), 272 deletions(-) delete mode 100644 src/scalerange.jl create mode 100644 src/scaleselection.jl diff --git a/src/MLPreprocessing.jl b/src/MLPreprocessing.jl index d41129a..ce4ce25 100644 --- a/src/MLPreprocessing.jl +++ b/src/MLPreprocessing.jl @@ -22,16 +22,10 @@ export transform, transform! - #= rescale!, =# - #= FeatureNormalizer, =# - #= predict, =# - #= predict! 
=# -include("scalerange.jl") +include("scaleselection.jl") include("basis_expansion.jl") include("center.jl") -#include("rescale.jl") -#include("featurenormalizer.jl") include("standardize.jl") include("fixedrange.jl") diff --git a/src/center.jl b/src/center.jl index b8c9067..2291a4c 100644 --- a/src/center.jl +++ b/src/center.jl @@ -44,26 +44,22 @@ Examples: μ = center!(D, [-1,-1], operate_on=[:A, :B]) """ -function center!(X, μ; obsdim=LearnBase.default_obsdim(X), operate_on=default_scalerange(X, convert(ObsDimension, obsdim))) +function center!(X, μ; obsdim=LearnBase.default_obsdim(X), operate_on=default_scaleselection(X, convert(ObsDimension, obsdim))) center!(X, μ, convert(ObsDimension, obsdim), operate_on) end -function center!(X; obsdim=LearnBase.default_obsdim(X), operate_on=default_scalerange(X, convert(ObsDimension, obsdim))) +function center!(X; obsdim=LearnBase.default_obsdim(X), operate_on=default_scaleselection(X, convert(ObsDimension, obsdim))) center!(X, convert(ObsDimension, obsdim), operate_on) end -function center!{T,N,M}(X::AbstractArray{T,N}, obsdim::ObsDim.Constant{M}; operate_on=default_scalerange(X, convert(ObsDimension, obsdim))) +function center!{T,N,M}(X::AbstractArray{T,N}, obsdim::ObsDim.Constant{M}; operate_on=default_scaleselection(X, convert(ObsDimension, obsdim))) center!(X, ObsDim.Constant{M}(), operate_on) end -function center!{T,N}(X::AbstractArray{T,N}, obsdim::ObsDim.Last; operate_on=default_scalerange(X, convert(ObsDimension, obsdim))) +function center!{T,N}(X::AbstractArray{T,N}, obsdim::ObsDim.Last; operate_on=default_scaleselection(X, convert(ObsDimension, obsdim))) center!(X, ObsDim.Constant{N}(), operate_on) end -function center!{T,N,M}(X::AbstractArray{T,N}, obsdim::ObsDim.Constant{M}, operate_on::AbstractVector) - center!(X, ObsDim.Constant{M}(), operate_on) -end - function center!{T,N}(X::AbstractArray{T,N}, obsdim::ObsDim.Last, operate_on::AbstractVector) center!(X, ObsDim.Constant{N}(), operate_on) end @@ -73,11 
+69,11 @@ function center!{T,N,M}(X::AbstractArray{T,N}, obsdim::ObsDim.Constant{M}, opera center!(X, μ, obsdim, operate_on) end -function center!{T,N,M}(X::AbstractArray{T,N}, μ::AbstractVector, obsdim::ObsDim.Constant{M}; operate_on=default_scalerange(X, convert(ObsDimension, obsdim))) +function center!{T,N,M}(X::AbstractArray{T,N}, μ::AbstractVector, obsdim::ObsDim.Constant{M}; operate_on=default_scaleselection(X, convert(ObsDimension, obsdim))) center!(X, μ, ObsDim.Constant{M}(), operate_on) end -function center!{T,N}(X::AbstractArray{T,N}, μ::AbstractVector, obsdim::ObsDim.Last; operate_on=default_scalerange(X, convert(ObsDimension, obsdim))) +function center!{T,N}(X::AbstractArray{T,N}, μ::AbstractVector, obsdim::ObsDim.Last; operate_on=default_scaleselection(X, convert(ObsDimension, obsdim))) center!(X, μ, ObsDim.Constant{N}(), operate_on) end @@ -103,8 +99,7 @@ function center!(X::AbstractMatrix, μ::AbstractVector, ::ObsDim.Constant{2}, op μ end - -function center!(x::AbstractVector; obsdim=LearnBase.default_obsdim(x), operate_on=default_scalerange(x)) +function center!(x::AbstractVector; obsdim=LearnBase.default_obsdim(x), operate_on=default_scaleselection(x)) center!(x, convert(ObsDimension, obsdim), operate_on) end @@ -141,7 +136,7 @@ end # -------------------------------------------------------------------- -function center!(D::AbstractDataFrame; operate_on=default_scalerange(D)) +function center!(D::AbstractDataFrame; operate_on=default_scaleselection(D)) center!(D, operate_on) end @@ -163,7 +158,7 @@ function center!(D::AbstractDataFrame, operate_on::AbstractVector{Symbol}) μ_vec end -function center!(D::AbstractDataFrame, μ::AbstractVector; operate_on=default_scalerange(D)) +function center!(D::AbstractDataFrame, μ::AbstractVector; operate_on=default_scaleselection(D)) center!(D, μ, operate_on) end diff --git a/src/fixedrange.jl b/src/fixedrange.jl index 53bc7f5..bb84359 100644 --- a/src/fixedrange.jl +++ b/src/fixedrange.jl @@ -1,38 +1,38 @@ """ - 
lower, upper, xmin, xmax = fixedrange!(X[, lower, upper, xmin, xmax; obsdim]) + lower, upper, xmin, xmax = fixedrange!(X[, lower, upper, xmin, xmax; obsdim, operate_on]) -Rescale `X` to the interval (lower:upper) along `obsdim`. If `upper` and `lower` are not -provided they default to 0 and 1 respectively, rescaling the data to the unit range (0:1). -`xmin` and `xmax` are vectors consisiting of the maximum and minimum values of `X` along obsdim. -`xmin`, `xmax` default to minimum(X, obsdim) and maximum(X, obsdim) respectively. -`obsdim` refers to the dimension of observations, e.g. `obsdim`=1 if the rows of `X` correspond to -measurements. `obsdim`=2 if columns of `X` represent measurements. +or -Examples: + lower, upper, xmin, xmax = fixedrange!(D[, lower, upper, xmin, xmax; operate_on]) - X = rand(10, 4) - fixedrange!(X, obsdim=1) - fixedrange!(X, -1, 1, obsdim=2) +where `X` is of type Matrix or Vector and `D` of type DataFrame. +Normalize `X` or `D` along `obsdim` to the interval [lower:upper]. +If `lower` and `upper` are omitted the default range is [0:1]. -where `X` is of type Matrix or Vector and `D` of type DataFrame. +`lower` : (Scalar) Lower limit of new range. + Defaults to 0. -Center `X` along `obsdim` around the corresponding entry in the -vector `μ`. +`upper` : (Scalar) Upper limit of new range. + Defaults to 1. +`xmin` : (Vector) Minimum values of data before normalization. `xmin` will + correspond to `lower` after transformation. + Defaults to `minimum(X, obsdim)`. -`μ` : Vector or value describing the center. - Defaults to mean(X, 2) +`xmax` : (Vector) Maximum values of data before normalization. `xmax` will + correspond to `upper` after transformation. + Defaults to `maximum(X, obsdim)`. `obsdim` : Specify which axis corresponds to observations. Defaults to obsdim=2 (observations are columns of matrix) - For DataFrames `obsdim` is obsolete and centering occurs + For DataFrames `obsdim` is obsolete and rescaling occurs column wise. 
`operate_on`: Specify the indices of columns or rows to be centered. Defaults to all columns/rows. - For DataFrames this must be a vector of symbols, not indices + For DataFrames this must be a vector of symbols, not indices. E.g. `operate_on`=[1,3] will perform centering on columns with index 1 and 3 only (if obsdim=1, else rows 1 and 3) @@ -47,16 +47,17 @@ Examples: x = rand(10) D = DataFrame(A=rand(10), B=collect(1:10), C=[string(x) for x in 1:10]) - μ = center!(X, obsdim=2) - μ = center!(X, ObsDim.First()) - μ = center!(X, obsdim=1, operate_on=[1,3] - μ = center!(X, [7.0, 8.0], obsdim=1, operate_on=[1,3] - μ = center!(D) - μ = center!(D, operate_on=[:A, :B]) - μ = center!(D, [-1,-1], operate_on=[:A, :B]) -""" + lower, upper, xmin, xmax = fixedrange!(X) + lower, upper, xmin, xmax = fixedrange!(X, -1, 1) + lower, upper, xmin, xmax = fixedrange!(X, -1, 1, obsdim=1) + lower, upper, xmin, xmax = fixedrange!(X, -1, 1, obsdim=1, operate_on=[1,4]) + -function fixedrange!(X; obsdim=LearnBase.default_obsdim(X), operate_on=default_scalerange(X, convert(ObsDimension, obsdim))) + lower, upper, xmin, xmax = fixedrange!(D) + lower, upper, xmin, xmax = fixedrange!(D, -1, 1) + lower, upper, xmin, xmax = fixedrange!(D, -1, 1, operate_on=[:A,:B]) +""" +function fixedrange!(X; obsdim=LearnBase.default_obsdim(X), operate_on=default_scaleselection(X, convert(ObsDimension, obsdim))) fixedrange!(X, convert(ObsDimension, obsdim), operate_on) end @@ -72,7 +73,7 @@ function fixedrange!{M}(X, obsdim::ObsDim.Constant{M}, operate_on) fixedrange!(X, lower, upper, xmin, xmax, obsdim, operate_on) end -function fixedrange!(X, lower, upper; obsdim=LearnBase.default_obsdim(X), operate_on=default_scalerange(X, convert(ObsDimension, obsdim))) +function fixedrange!(X, lower, upper; obsdim=LearnBase.default_obsdim(X), operate_on=default_scaleselection(X, convert(ObsDimension, obsdim))) fixedrange!(X, lower, upper, convert(ObsDimension, obsdim), operate_on) end @@ -86,7 +87,7 @@ function 
fixedrange!{T,M}(X::AbstractArray{T,M}, lower, upper, obsdim::ObsDim.La fixedrange!(X, lower, upper, ObsDim.Constant{M}(), operate_on) end -function fixedrange!(X, lower, upper, xmin, xmax; obsdim=LearnBase.default_obsdim(X), operate_on=default_scalerange(X, convert(ObsDimension, obsdim))) +function fixedrange!(X, lower, upper, xmin, xmax; obsdim=LearnBase.default_obsdim(X), operate_on=default_scaleselection(X, convert(ObsDimension, obsdim))) fixedrange!(X, lower, upper, xmin, xmax, convert(ObsDimension, obsdim), operate_on) end @@ -149,11 +150,11 @@ end # -------------------------------------------------------------------- -function fixedrange!(D::AbstractDataFrame; operate_on=default_scalerange(D)) +function fixedrange!(D::AbstractDataFrame; operate_on=default_scaleselection(D)) fixedrange!(D, 0, 1, operate_on) end -function fixedrange!(D::AbstractDataFrame, lower, upper; operate_on=default_scalerange(D)) +function fixedrange!(D::AbstractDataFrame, lower, upper; operate_on=default_scaleselection(D)) fixedrange!(D, lower, upper, operate_on) end @@ -179,7 +180,7 @@ function fixedrange!(D::AbstractDataFrame, lower::Real, upper::Real, operate_on: lower, upper, xmin, xmax end -function fixedrange!(D::AbstractDataFrame, lower, upper, xmin, xmax; operate_on=default_scalerange(D)) +function fixedrange!(D::AbstractDataFrame, lower, upper, xmin, xmax; operate_on=default_scaleselection(D)) fixedrange!(D, lower, upper, xmin, xmax, operate_on) end @@ -202,6 +203,68 @@ function fixedrange!(D::AbstractDataFrame, lower::Real, upper::Real, xmin::Real, lower, upper, xmin, xmax, colname end + +""" +`FixedRangeScaler` is used with the functions `fit()` and `transform()`. +After fitting a `FixedRangeScaler` to one data set, it can be used to perform the same +transformation to a new set of data. E.g. fit the `FixedRangeScaler` to your training +data and then apply the scaling to the test data at a later stage. (See examples below). 
+ + fit(FixedRangeScaler, X[, lower, upper; obsdim, operate_on]) + +`X` : Data of type Matrix or `DataFrame`. + +`lower` : (Scalar) Lower limit of new range. + Defaults to 0. + +`upper` : (Scalar) Upper limit of new range. + Defaults to 1. + +`xmin` : (Vector) Minimum values of data before normalization. `xmin` will + correspond to `lower` after transformation. + Defaults to `minimum(X, obsdim)`. + +`xmax` : (Vector) Maximum values of data before normalization. `xmax` will + correspond to `upper` after transformation. + Defaults to `maximum(X, obsdim)`. + +`obsdim` : Specify which axis corresponds to observations. + Defaults to obsdim=2 (observations are columns of matrix) + For DataFrames `obsdim` is obsolete and rescaling occurs + column wise. + +`operate_on`: Specify the indices of columns or rows to be rescaled. + Defaults to all columns/rows. + For DataFrames this must be a vector of symbols, not indices. + E.g. `operate_on`=[1,3] will perform rescaling on columns + with index 1 and 3 only (if obsdim=1, else rows 1 and 3) + +Note on DataFrames: +Columns containing `NA` values are skipped. +Columns containing non-numeric elements are skipped. + +Examples: + + + Xtrain = rand(100, 4) + Xtest = rand(10, 4) + x = rand(10) + D = DataFrame(A=rand(10), B=collect(1:10), C=[string(x) for x in 1:10]) + + scaler = fit(FixedRangeScaler, Xtrain) + scaler = fit(FixedRangeScaler, Xtrain, -1, 1) + scaler = fit(FixedRangeScaler, Xtrain, -1, 1, obsdim=1) + scaler = fit(FixedRangeScaler, Xtrain, -1, 1, obsdim=1, operate_on=[1,3]) + scaler = fit(FixedRangeScaler, D, -1, 1, operate_on=[:A,:B]) + + transform(Xtest, scaler) + transform!(Xtest, scaler) + +Note that for `transform!` the data matrix `X` has to be of type <: AbstractFloat +as the scaling occurs in place. (E.g. cannot be of type Matrix{Int64}). This is not +the case for `transform` however. +For `DataFrames` `transform!` can be used on columns of type <: Integer. 
+""" immutable FixedRangeScaler{T<:Real,U<:Real,V<:Real,W<:Real,M,I} lower::T upper::U @@ -211,7 +274,7 @@ immutable FixedRangeScaler{T<:Real,U<:Real,V<:Real,W<:Real,M,I} operate_on::Vector{I} end -function FixedRangeScaler{T<:Real,N}(X::AbstractArray{T,N}; obsdim=LearnBase.default_obsdim(X), operate_on=default_scalerange(X, convert(ObsDimension, obsdim))) +function FixedRangeScaler{T<:Real,N}(X::AbstractArray{T,N}; obsdim=LearnBase.default_obsdim(X), operate_on=default_scaleselection(X, convert(ObsDimension, obsdim))) FixedRangeScaler(X, convert(ObsDimension, obsdim), operate_on) end @@ -225,7 +288,7 @@ function FixedRangeScaler{T<:Real,N}(X::AbstractArray{T,N}, ::ObsDim.Last, opera FixedRangeScaler(X, ObsDim.Constant{N}(), operate_on) end -function FixedRangeScaler{T<:Real,N}(X::AbstractArray{T,N}, lower, upper; obsdim=LearnBase.default_obsdim(X), operate_on=default_scalerange(X, convert(ObsDimension, obsdim))) +function FixedRangeScaler{T<:Real,N}(X::AbstractArray{T,N}, lower, upper; obsdim=LearnBase.default_obsdim(X), operate_on=default_scaleselection(X, convert(ObsDimension, obsdim))) FixedRangeScaler(X, lower, upper, convert(ObsDimension, obsdim), operate_on) end @@ -239,28 +302,41 @@ function FixedRangeScaler{T<:Real,N}(X::AbstractArray{T,N}, lower, upper, ::ObsD FixedRangeScaler(X, lower, upper, ObsDim.Constant{N}(), operate_on) end -function FixedRangeScaler(D::AbstractDataFrame; operate_on=default_scalerange(D)) +function FixedRangeScaler(D::AbstractDataFrame; operate_on=default_scaleselection(D)) FixedRangeScaler(D, 0, 1, operate_on) end -function FixedRangeScaler(D::AbstractDataFrame, lower::Real, upper::Real; operate_on=default_scalerange(D)) +function FixedRangeScaler(D::AbstractDataFrame, lower::Real, upper::Real; operate_on=default_scaleselection(D)) FixedRangeScaler(D, lower, upper, operate_on) end -function FixedRangeScaler(D::AbstractDataFrame, lower::Real, upper::Real, operate_on::AbstractVector{Symbol}) +function 
FixedRangeScaler(D::AbstractDataFrame, lower::Real, upper::Real, operate_on::AbstractVector{Symbol}) xmin = Float64[] xmax = Float64[] - for colname in operate_on + colnames = valid_columns(D, operate_on) + for colname in colnames push!(xmin, minimum(D[colname])) push!(xmax, maximum(D[colname])) end - FixedRangeScaler(lower, upper, xmin, xmax, ObsDim.Constant{1}(), operate_on) + FixedRangeScaler(lower, upper, xmin, xmax, ObsDim.Constant{1}(), colnames) end function valid_columns(D::AbstractDataFrame) valid_colnames = Symbol[] for colname in names(D) - if (eltype(D[colname]) <: Real) & !(any(isnull(D[colname]))) + if (eltype(D[colname]) <: Real) & !(any(isna(D[colname]))) + push!(valid_colnames, colname) + else + warn("Skipping \"$colname\" because it either contains NA or is not of type <: Real") + end + end + valid_colnames +end + +function valid_columns(D::AbstractDataFrame, colnames) + valid_colnames = Symbol[] + for colname in colnames + if (eltype(D[colname]) <: Real) & !(any(isna(D[colname]))) push!(valid_colnames, colname) else warn("Skipping \"$colname\" because it either contains NA or is not of type <: Real") @@ -269,19 +345,19 @@ function valid_columns(D::AbstractDataFrame) valid_colnames end -function StatsBase.fit{T<:Real,N}(::Type{FixedRangeScaler}, X::AbstractArray{T,N}; obsdim=LearnBase.default_obsdim(X), operate_on=default_scalerange(X, convert(ObsDimension, obsdim))) +function StatsBase.fit{T<:Real,N}(::Type{FixedRangeScaler}, X::AbstractArray{T,N}; obsdim=LearnBase.default_obsdim(X), operate_on=default_scaleselection(X, convert(ObsDimension, obsdim))) FixedRangeScaler(X, convert(ObsDimension, obsdim), operate_on) end -function StatsBase.fit{T<:Real,N}(::Type{FixedRangeScaler}, X::AbstractArray{T,N}, lower, upper; obsdim=LearnBase.default_obsdim(X), operate_on=default_scalerange(X, convert(ObsDimension, obsdim))) +function StatsBase.fit{T<:Real,N}(::Type{FixedRangeScaler}, X::AbstractArray{T,N}, lower, upper; obsdim=LearnBase.default_obsdim(X), 
operate_on=default_scaleselection(X, convert(ObsDimension, obsdim))) FixedRangeScaler(X, lower, upper, convert(ObsDimension, obsdim), operate_on) end -function StatsBase.fit(::Type{FixedRangeScaler}, D::AbstractDataFrame; operate_on=default_scalerange(D)) +function StatsBase.fit(::Type{FixedRangeScaler}, D::AbstractDataFrame; operate_on=default_scaleselection(D)) FixedRangeScaler(D, 0, 1, operate_on) end -function StatsBase.fit(::Type{FixedRangeScaler}, D::AbstractDataFrame, lower, upper; operate_on=default_scalerange(D)) +function StatsBase.fit(::Type{FixedRangeScaler}, D::AbstractDataFrame, lower, upper; operate_on=default_scaleselection(D)) FixedRangeScaler(D, lower, upper, operate_on) end diff --git a/src/scalerange.jl b/src/scalerange.jl deleted file mode 100644 index 3fcefac..0000000 --- a/src/scalerange.jl +++ /dev/null @@ -1,31 +0,0 @@ -function default_scalerange(X::AbstractMatrix, ::ObsDim.Constant{1}) - collect(1:size(X, 2)) -end - -function default_scalerange(X::AbstractMatrix, ::ObsDim.Constant{2}) - collect(1:size(X, 1)) -end - -function default_scalerange(X::AbstractMatrix, ::ObsDim.Last) - collect(1:size(X, 1)) -end - -function default_scalerange(x::AbstractVector) - collect(1:length(x)) -end - -function default_scalerange(x::AbstractVector, ::ObsDim.Last) - collect(1:length(x)) -end - -function default_scalerange{M}(x::AbstractVector, ::ObsDim.Constant{M}) - collect(1:length(x)) -end - -function default_scalerange(D::AbstractDataFrame) - flt1 = Bool[T <: Real for T in eltypes(D)] - flt2 = Bool[any(isna(D[colname])) for colname in names(D)] - flt = (flt1 | !flt2) - names(D)[flt] -end - diff --git a/src/scaleselection.jl b/src/scaleselection.jl new file mode 100644 index 0000000..4abfd77 --- /dev/null +++ b/src/scaleselection.jl @@ -0,0 +1,31 @@ +function default_scaleselection(X::AbstractMatrix, ::ObsDim.Constant{1}) + collect(1:size(X, 2)) +end + +function default_scaleselection(X::AbstractMatrix, ::ObsDim.Constant{2}) + collect(1:size(X, 1)) +end 
+ +function default_scaleselection(X::AbstractMatrix, ::ObsDim.Last) + collect(1:size(X, 1)) +end + +function default_scaleselection(x::AbstractVector) + collect(1:length(x)) +end + +function default_scaleselection(x::AbstractVector, ::ObsDim.Last) + collect(1:length(x)) +end + +function default_scaleselection{M}(x::AbstractVector, ::ObsDim.Constant{M}) + collect(1:length(x)) +end + +function default_scaleselection(D::AbstractDataFrame) + flt1 = Bool[T <: Real for T in eltypes(D)] + flt2 = Bool[any(isna(D[colname])) for colname in names(D)] + flt = (flt1 & !flt2) + names(D)[flt] +end + diff --git a/src/standardize.jl b/src/standardize.jl index 3e4dd7f..d28a2ab 100644 --- a/src/standardize.jl +++ b/src/standardize.jl @@ -1,11 +1,52 @@ """ - μ, σ = standardize!(X[, μ, σ, obsdim]) + μ, σ = standardize!(X[, μ, σ; obsdim, operate_on]) -Center `X` along `obsdim` around the corresponding entry in the -vector `μ` and then standardize each feature using the corresponding -entry in the vector `σ`. +or + + μ, σ = standardize!(D[, μ, σ; operate_on]) + +Normalize `X` along `obsdim` according to X = (X - μ) / σ. +If μ and σ are omitted they are computed such that variables have a mean of zero + + + +`μ` : Vector or value describing the translation. + Defaults to mean(X, 2) + +`σ` : Vector or value describing the scale. + Defaults to std(X, 2) + +`obsdim` : Specify which axis corresponds to observations. + Defaults to obsdim=2 (observations are columns of matrix) + For DataFrames `obsdim` is obsolete and centering occurs + column wise. + +`operate_on`: Specify the indices of columns or rows to be centered. + Defaults to all columns/rows. + For DataFrames this must be a vector of symbols, not indices + E.g. `operate_on`=[1,3] will perform centering on columns + with index 1 and 3 only (if obsdim=1, else rows 1 and 3) + + +Note on DataFrames: +Columns containing `NA` values are skipped. +Columns containing non numeric elements are skipped. 
+ +Examples: + + X = rand(4, 100) + x = rand(10) + D = DataFrame(A=rand(10), B=collect(1:10), C=[string(x) for x in 1:10]) + + μ, σ = standardize!(X, obsdim=2) + μ, σ = standardize!(X, ObsDim.First()) + μ, σ = standardize!(X, obsdim=1, operate_on=[1,3] + μ, σ = standardize!(X, [7.0,8.0], [1,1], obsdim=1, operate_on=[1,3] + μ, σ = standardize!(D) + μ, σ = standardize!(D, operate_on=[:A,:B]) + μ, σ = standardize!(D, [-1,-1], [2,2], operate_on=[:A,:B]) """ -function standardize!(X, μ, σ; obsdim=LearnBase.default_obsdim(X), operate_on=default_scalerange(X, convert(ObsDimension, obsdim))) +function standardize!(X, μ, σ; obsdim=LearnBase.default_obsdim(X), operate_on=default_scaleselection(X, convert(ObsDimension, obsdim))) standardize!(X, μ, σ, convert(ObsDimension, obsdim), operate_on) end @@ -13,7 +54,7 @@ function standardize!{T,N}(X::AbstractArray{T,N}, μ, σ, ::ObsDim.Last, operate standardize!(X, μ, σ, ObsDim.Constant{N}(), operate_on) end -function standardize!(X; obsdim=LearnBase.default_obsdim(X), operate_on=default_scalerange(X, convert(ObsDimension, obsdim))) +function standardize!(X; obsdim=LearnBase.default_obsdim(X), operate_on=default_scaleselection(X, convert(ObsDimension, obsdim))) standardize!(X, convert(ObsDimension, obsdim), operate_on) end @@ -73,9 +114,8 @@ function standardize!{M}(X::AbstractVector, μ::AbstractFloat, σ::AbstractFloat end # -------------------------------------------------------------------- - -function standardize!(D::AbstractDataFrame) - standardize!(D, names(D)) +function standardize!(D::AbstractDataFrame; operate_on=default_scaleselection(D)) + standardize!(D, operate_on) end function standardize!(D::AbstractDataFrame, colnames::AbstractVector{Symbol}) @@ -87,33 +127,33 @@ function standardize!(D::AbstractDataFrame, colnames::AbstractVector{Symbol}) μ = mean(D[colname]) σ = std(D[colname]) if isna(μ) - warn("Column \"$colname\" contains NA values, skipping rescaling of this column!") + warn("Skipping \"$colname\" because it 
contains NA values") continue end - standardize!(D, colname, μ, σ) + standardize!(D, μ, σ, colname) push!(μ_vec, μ) push!(σ_vec, σ) else - warn("Skipping \"$colname\", rescaling only valid for columns of type T <: Real.") + warn("Skipping \"$colname\" because data is not of type T <: Real.") end end μ_vec, σ_vec end -function standardize!(D::AbstractDataFrame, colnames::AbstractVector{Symbol}, μ::AbstractVector, σ::AbstractVector) +function standardize!(D::AbstractDataFrame, μ::AbstractVector, σ::AbstractVector; operate_on=default_scaleselection(D)) + standardize!(D, μ, σ, operate_on) +end + +function standardize!(D::AbstractDataFrame, μ::AbstractVector, σ::AbstractVector, colnames::AbstractVector{Symbol}) for (icol, colname) in enumerate(colnames) - if eltype(D[colname]) <: Real - standardize!(D, colname, μ[icol], σ[icol]) - else - warn("Skipping \"$colname\", rescaling only valid for columns of type T <: Real.") - end + standardize!(D, μ[icol], σ[icol], colname) end μ, σ end -function standardize!(D::AbstractDataFrame, colname::Symbol, μ::Real, σ::Real) - if sum(isna(D[colname])) > 0 - warn("Column \"$colname\" contains NA values, skipping rescaling on this column!") +function standardize!(D::AbstractDataFrame, μ::Real, σ::Real, colname::Symbol) + if any(isna(D[colname])) | !(eltype(D[colname]) <: Real) + warn("Skipping \"$colname\" because it contains NA values or is not of type <: Real") else newcol::Vector{Float64} = convert(Vector{Float64}, D[colname]) nobs = length(newcol) @@ -125,15 +165,72 @@ function standardize!(D::AbstractDataFrame, colname::Symbol, μ::Real, σ::Real) μ, σ end +""" +`StandardScaler` is used with the functions `fit()` and `transform()`. +After fitting a `StandardScaler` to one data set, it can be used to perform the same +transformation to a new set of data. E.g. fit the `StandardScaler` to your training +data and then apply the scaling to the test data at a later stage. (See examples below). 
+ + fit(StandardScaler, X[, μ, σ; obsdim, operate_on]) + +`X` : Data of type Matrix or `DataFrame`. + +`μ` : Vector or scalar describing the translation. + Defaults to mean(X, obsdim) + +`σ` : Vector or scalar describing the scale. + Defaults to std(X, obsdim) + +`obsdim` : Specify which axis corresponds to observations. + Defaults to obsdim=2 (observations are columns of matrix) + For DataFrames `obsdim` is obsolete and rescaling occurs + column wise. + +`operate_on`: Specify the indices of columns or rows to be standardized. + Defaults to all columns/rows. + For DataFrames this must be a vector of symbols, not indices. + E.g. `operate_on`=[1,3] will perform standardization on columns + with index 1 and 3 only (if obsdim=1, else rows 1 and 3) -immutable StandardScaler{T,U,M,I} +Note on DataFrames: +Columns containing `NA` values are skipped. +Columns containing non-numeric elements are skipped. + +Examples: + + + Xtrain = rand(100, 4) + Xtest = rand(10, 4) + x = rand(4) + Dtrain = DataFrame(A=rand(10), B=collect(1:10), C=[string(x) for x in 1:10]) + Dtest = DataFrame(A=rand(10), B=collect(1:10), C=[string(x) for x in 1:10]) + + scaler = fit(StandardScaler, Xtrain) + scaler = fit(StandardScaler, Xtrain, obsdim=1) + scaler = fit(StandardScaler, Xtrain, obsdim=1, operate_on=[1,3]) + transform(Xtest, scaler) + transform!(Xtest, scaler) + transform(x, scaler) + transform!(x, scaler) + + scaler = fit(StandardScaler, Dtrain) + scaler = fit(StandardScaler, Dtrain, operate_on=[:A,:B]) + transform(Dtest, scaler) + transform!(Dtest, scaler) + +Note that for `transform!` the data matrix `X` has to be of type <: AbstractFloat +as the scaling occurs in place. (E.g. cannot be of type Matrix{Int64}). This is not +the case for `transform` however. +For `DataFrames` `transform!` can be used on columns of type <: Integer. 
+""" +immutable StandardScaler{T<:Real,U<:Real,I,M} offset::Vector{T} scale::Vector{U} obsdim::ObsDim.Constant{M} operate_on::Vector{I} end -function StandardScaler{T<:Real,M}(X::AbstractArray{T,M}; obsdim=LearnBase.default_obsdim(X), operate_on=default_scalerange(X, convert(ObsDimension, obsdim))) +function StandardScaler{T<:Real,M}(X::AbstractArray{T,M}; obsdim=LearnBase.default_obsdim(X), operate_on=default_scaleselection(X, convert(ObsDimension, obsdim))) StandardScaler(X, convert(ObsDimension, obsdim), operate_on) end @@ -141,30 +238,34 @@ function StandardScaler{T<:Real,M}(X::AbstractArray{T,M}, ::ObsDim.Last, operate StandardScaler(X, ObsDim.Constant{M}(), operate_on) end -function StandardScaler{T<:Real,N,M}(X::AbstractArray{T,N}, obsdim::ObsDim.Constant{M}, operate_on) - StandardScaler(vec(mean(X, M))[operate_on], vec(std(X, M))[operate_on], obsdim, operate_on) +function StandardScaler{T<:Real,N,M}(X::AbstractArray{T,N}, obsdim::ObsDim.Constant{M}, operate_on::AbstractVector) + offset = vec(mean(X,M))[operate_on] + scale = vec(std(X, M))[operate_on] + StandardScaler(offset, scale, obsdim, operate_on) end -function StandardScaler{T<:Real,M}(X::AbstractArray{T,M}, offset, scale; obsdim=LearnBase.default_obsdim(X), operate_on=default_scalerange(X, convert(ObsDimension, obsdim))) - StandardScaler(offset, scale, convert(ObsDimension, obsdim), operate_on) +function StandardScaler(D::AbstractDataFrame; operate_on=default_scaleselection(D)) + StandardScaler(D, operate_on) end -function StandardScaler{T<:Real,N}(X::AbstractArray{T,N}, offset, scale, ::ObsDim.Last, operate_on) - StandardScaler(offset, scale, ObsDim.Constant{N}(), operate_on) +function StandardScaler(D::AbstractDataFrame, operate_on::Vector{Symbol}) + colnames = valid_columns(D, operate_on) + offset = Float64[mean(D[colname]) for colname in colnames] + scale = Float64[std(D[colname]) for colname in colnames] + StandardScaler(offset, scale, ObsDim.Constant{1}(), colnames) end -function 
StandardScaler(D::AbstractDataFrame; operate_on=default_scalerange(D)) - offset = Float64[mean(D[colname]) for colname in operate_on] - scale = Float64[std(D[colname]) for colname in operate_on] - StandardScaler(offset, scale, ObsDim.Constant{1}(), operate_on) +function StandardScaler(D::AbstractDataFrame, offset, scale; operate_on=default_scaleselection(D)) + colnames = valid_columns(D) + StandardScaler(offset, scale, ObsDim.Constant{1}(), colnames) end -function StandardScaler(D::AbstractDataFrame, offset, scale; operate_on=default_scalerange(D)) - StandardScaler(offset, scale, ObsDim.Constant{1}(), operate_on) +function StatsBase.fit{T<:Real}(::Type{StandardScaler}, X::AbstractMatrix{T}; obsdim=LearnBase.default_obsdim(X), operate_on=default_scaleselection(X, convert(ObsDimension, obsdim))) + StandardScaler(X, convert(ObsDimension, obsdim), operate_on) end -function StatsBase.fit{T<:Real}(::Type{StandardScaler}, X::AbstractMatrix{T}; obsdim=LearnBase.default_obsdim(X), operate_on=default_scalerange(X, convert(ObsDimension, obsdim))) - StandardScaler(X, obsdim, operate_on) +function StatsBase.fit(::Type{StandardScaler}, D::AbstractDataFrame; operate_on=default_scaleselection(D)) + StandardScaler(D, operate_on) end function transform!{T<:AbstractFloat,N}(X::AbstractArray{T,N}, cs::StandardScaler) @@ -173,12 +274,12 @@ function transform!{T<:AbstractFloat,N}(X::AbstractArray{T,N}, cs::StandardScale end function transform!(D::AbstractDataFrame, cs::StandardScaler) - standardize!(D, cs.operate_on, cs.offset, cs.scale) + standardize!(D, cs.offset, cs.scale, cs.operate_on) D end function transform{T<:AbstractFloat,N}(X::AbstractArray{T,N}, cs::StandardScaler) - Xnew = copy(X) + Xnew = deepcopy(X) transform!(Xnew, cs) end @@ -189,7 +290,7 @@ function transform{T<:Real,N}(X::AbstractArray{T,N}, cs::StandardScaler) end function transform(D::AbstractDataFrame, cs::StandardScaler) - Dnew = copy(D) + Dnew = deepcopy(D) transform!(Dnew, cs) Dnew end diff --git 
a/test/runtests.jl b/test/runtests.jl index 3f52c72..39eb1b0 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -5,8 +5,8 @@ using Base.Test tests = [ "tst_expand.jl" "tst_center.jl" - "tst_standardize.jl" "tst_fixedrangescaler.jl" + "tst_standardize.jl" ] for t in tests diff --git a/test/tst_center.jl b/test/tst_center.jl index 42e813d..e184202 100644 --- a/test/tst_center.jl +++ b/test/tst_center.jl @@ -86,9 +86,41 @@ D_NA[1, :A] = NA xx = deepcopy(x) mu = ones(xx) center!(xx, mu) - @test mean(xx) - mean(x) ≈ -1e_x = collect(-2:0.5:10) + @test mean(xx) - mean(x) ≈ -1 end @testset "DataFrame" begin # Center DataFrame + DD = deepcopy(D) + center!(DD) + @test abs(mean(DD[:A])) <= 10e-10 + @test abs(mean(DD[:B])) <= 10e-10 + @test all(DD[:C] .== D[:C]) + + DD = deepcopy(D) + center!(DD, operate_on=[:B]) + @test all(DD[:A] .== D[:A]) + @test abs(mean(DD[:B])) <= 10e-10 + @test all(DD[:C] .== D[:C]) + + DD = deepcopy(D) + mu = center!(DD, operate_on=[:A, :B]) + @test abs(mean(DD[:A])) <= 10e-10 + @test abs(mean(DD[:B])) <= 10e-10 + @test all(DD[:C] .== D[:C]) + @test all(mu .== [mean(D[:A]), mean(D[:B])]) + + DD = deepcopy(D) + mu = [mean(D[:A]), mean(D[:B])] + @test all(center!(DD, mu, operate_on=[:A, :B]) .== mu) + @test abs(mean(DD[:A])) <= 10e-10 + @test abs(mean(DD[:B])) <= 10e-10 + @test all(DD[:C] .== D[:C]) + + DD = deepcopy(D_NA) + center!(DD) + @test all(DD[2:end, :A] .== D[2:end, :A]) + @test abs(mean(DD[:B])) <= 10e-10 + @test all(DD[:C] .== D[:C]) + @test isna(DD[1, :A]) end diff --git a/test/tst_fixedrangescaler.jl b/test/tst_fixedrangescaler.jl index ed0e214..bf9b13e 100644 --- a/test/tst_fixedrangescaler.jl +++ b/test/tst_fixedrangescaler.jl @@ -1,48 +1,101 @@ -R = reshape(1:40, 10, 4) -F = convert(Matrix{Float64}, R) -r1 = collect(1:4) -r2 = collect(1:10) +X = collect(Float64, reshape(1:40, 10, 4)) +x = rand(10) * 10 + +D = DataFrame(A=rand(10), B=collect(1:10), C=[hex(x) for x in 11:20]) +D_NA = deepcopy(D) +D_NA[1, :A] = NA @testset "Array" 
begin - scaler = fit(FixedRangeScaler, F) - X = transform(F, scaler) - @test mean(X[:,end]) ≈ 1 - @test mean(X[:,1]) ≈ 0 - @test maximum(X) == 1 - @test minimum(X) == 0 - - scaler = fit(FixedRangeScaler, F, obsdim=1) - X = transform(F, scaler) - @test mean(X[1,:]) ≈ 0 - @test mean(X[end,:]) ≈ 1 - @test maximum(X) == 1 - @test minimum(X) == 0 - - scaler = fit(FixedRangeScaler, F, -2, 2) - X = transform(F, scaler) - @test mean(X[:,end]) ≈ 2 - @test mean(X[:,1]) ≈ -2 - @test maximum(X) == 2 - @test minimum(X) == -2 - - scaler = fit(FixedRangeScaler, F, -2, 2, obsdim=1) - X = transform(F, scaler) - @test mean(X[1,:]) ≈ -2 - @test mean(X[end,:]) ≈ 2 - @test maximum(X) == 2 - @test minimum(X) == -2 - - scaler = fit(FixedRangeScaler, R, -2, 2, obsdim=1) - X = transform(R, scaler) - @test mean(X[1,:]) ≈ -2 - @test mean(X[end,:]) ≈ 2 - @test maximum(X) == 2 - @test minimum(X) == -2 - - scaler = fit(FixedRangeScaler, R, -2, 2, obsdim=1) - r = transform(r1, scaler) - @test r == -[2, 6, 10, 14] - - scaler = fit(FixedRangeScaler, R, -2, 2, obsdim=2) - r = transform(r2, scaler) - @test r == -2 * ones(size(R, 1)) + scaler = fit(FixedRangeScaler, X) + XX = transform(X, scaler) + @test mean(XX[:,end]) ≈ 1 + @test mean(XX[:,1]) ≈ 0 + @test maximum(XX) == 1 + @test minimum(XX) == 0 + + scaler = fit(FixedRangeScaler, X, obsdim=1) + XX = transform(X, scaler) + @test mean(XX[1,:]) ≈ 0 + @test mean(XX[end,:]) ≈ 1 + @test maximum(XX) == 1 + @test minimum(XX) == 0 + + scaler = fit(FixedRangeScaler, X, -2, 2) + XX = transform(X, scaler) + @test mean(XX[:,end]) ≈ 2 + @test mean(XX[:,1]) ≈ -2 + @test maximum(XX) == 2 + @test minimum(XX) == -2 + + scaler = fit(FixedRangeScaler, X, -2, 2, obsdim=1) + XX = transform(X, scaler) + @test mean(XX[1,:]) ≈ -2 + @test mean(XX[end,:]) ≈ 2 + @test maximum(XX) == 2 + @test minimum(XX) == -2 + + scaler = fit(FixedRangeScaler, X, -2, 2, obsdim=2) + XX = transform(X, scaler) + @test mean(minimum(XX, 2)) ≈ -2 + @test mean(maximum(XX, 2)) ≈ 2 + @test 
maximum(XX) == 2 + @test minimum(XX) == -2 + + scaler = fit(FixedRangeScaler, X, -2, 2, obsdim=1, operate_on=[1,2]) + XX = transform(X, scaler) + @test mean(minimum(XX[:,[1,2]], 1)) ≈ -2 + @test mean(maximum(XX[:,[1,2]], 1)) ≈ 2 + + scaler = fit(FixedRangeScaler, X, -2, 2, obsdim=2, operate_on=[1,2]) + XX = transform(X, scaler) + @test mean(minimum(XX[[1,2],:], 2)) ≈ -2 + @test mean(maximum(XX[[1,2],:], 2)) ≈ 2 + + scaler = fit(FixedRangeScaler, X, -2, 2, obsdim=2, operate_on=[1,2]) + XX = deepcopy(X) + transform!(XX, scaler) + @test mean(minimum(XX[[1,2],:], 2)) ≈ -2 + @test mean(maximum(XX[[1,2],:], 2)) ≈ 2 +end + +@testset "DataFrame" begin + scaler = fit(FixedRangeScaler, D) + DD = transform(D, scaler) + @test minimum(DD[:A]) == 0 + @test maximum(DD[:A]) == 1 + + scaler = fit(FixedRangeScaler, D, -1, 1) + DD = transform(D, scaler) + @test minimum(DD[:A]) == -1 + @test maximum(DD[:A]) == 1 + @test minimum(DD[:B]) == -1 + @test maximum(DD[:B]) == 1 + + scaler = fit(FixedRangeScaler, D, -1, 1, operate_on=[:A]) + DD = transform(D, scaler) + @test minimum(DD[:A]) == -1 + @test maximum(DD[:A]) == 1 + @test minimum(DD[:B]) == minimum(D[:B]) + @test maximum(DD[:B]) == maximum(D[:B]) + + scaler = fit(FixedRangeScaler, D_NA, -1, 1) + DD = transform(D_NA, scaler) + @test isna(DD[1,:A]) + @test DD[end,:A] == D_NA[end,:A] + @test minimum(DD[:B]) == -1 + @test maximum(DD[:B]) == 1 + + scaler = fit(FixedRangeScaler, D_NA, -1, 1, operate_on=[:A, :B]) + DD = transform(D_NA, scaler) + @test isna(DD[1,:A]) + @test DD[end,:A] == D_NA[end,:A] + @test minimum(DD[:B]) == -1 + @test maximum(DD[:B]) == 1 + + DD = deepcopy(D) + scaler = fit(FixedRangeScaler, DD, -1, 1, operate_on=[:A, :B]) + transform!(DD, scaler) + @test minimum(DD[:A]) == -1 + @test maximum(DD[:A]) == 1 + @test minimum(DD[:B]) == -1 + @test maximum(DD[:B]) == 1 end diff --git a/test/tst_standardize.jl b/test/tst_standardize.jl index 202eb34..44a99a3 100644 --- a/test/tst_standardize.jl +++ b/test/tst_standardize.jl @@ 
-1,93 +1,157 @@ -e_x = collect(-2:0.5:10) -e_X = expand_poly(e_x, 5) -df = DataFrame(A=rand(10), B=collect(1:10), C=[string(x) for x in 1:10]) -df_na = deepcopy(df) -df_na[1, :A] = NA +X = collect(Float64, reshape(1:40, 10, 4)) +x = rand(10) * 10 + +D = DataFrame(A=rand(10), B=collect(1:10), C=[hex(x) for x in 11:20]) +D_NA = deepcopy(D) +D_NA[1, :A] = NA @testset "Array" begin # Rescale Vector - xa = copy(e_x) - mu, sigma = standardize!(xa) - @test mu ≈ mean(e_x) - @test sigma ≈ std(e_x) - @test abs(mean(xa)) <= 10e-10 - @test std(xa) ≈ 1 - - xa = copy(e_x) - mu, sigma = standardize!(xa, mu, sigma) - @test abs(mean(xa)) <= 10e-10 - @test std(xa) ≈ 1 - - xa = copy(e_x) - mu, sigma = standardize!(xa, mu, sigma, obsdim=1) - @test abs(mean(xa)) <= 10e-10 - @test std(xa) ≈ 1 - - xa = copy(e_x) - mu = copy(e_x) .- 1 - sigma = ones(e_x) - mu, sigma = standardize!(xa, mu, sigma, obsdim=1) - @test mean(xa) ≈ 1 - - Xa = copy(e_X) - standardize!(Xa) - @test abs(sum(mean(Xa, 2))) <= 10e-10 - @test std(Xa, 2) ≈ [1, 1, 1, 1, 1] - - Xa = copy(e_X) - standardize!(Xa, obsdim=2) - @test abs(sum(mean(Xa, 2))) <= 10e-10 - @test std(Xa, 2) ≈ [1, 1, 1, 1, 1] - - Xa = copy(e_X) - standardize!(Xa, obsdim=1) - @test abs(sum(mean(Xa, 1))) <= 10e-10 - - Xa = copy(e_X) - mu = vec(mean(Xa, 1)) - sigma = vec(std(Xa, 1)) - standardize!(Xa, mu, sigma, obsdim=1) - @test abs(sum(mean(Xa, 1))) <= 10e-10 - - Xa = copy(e_X) - mu = vec(mean(Xa, 2)) - sigma = vec(std(Xa, 2)) - standardize!(Xa, mu, sigma, obsdim=2) - @test abs(sum(mean(Xa, 2))) <= 10e-10 + xx = deepcopy(x) + mu, sigma = standardize!(xx) + @test mu ≈ mean(x) + @test sigma ≈ std(x) + @test abs(mean(xx)) <= 10e-10 + @test std(xx) ≈ 1 + + xx = deepcopy(x) + mu, sigma = standardize!(xx, mu, sigma) + @test abs(mean(xx)) <= 10e-10 + @test std(xx) ≈ 1 + + xx = deepcopy(x) + mu, sigma = standardize!(xx, mu, sigma, obsdim=1) + @test abs(mean(xx)) <= 10e-10 + @test std(xx) ≈ 1 + + xx = deepcopy(x) + mu = deepcopy(x) .- 1 + sigma = ones(x) + mu, 
sigma = standardize!(xx, mu, sigma, obsdim=1) + @test mean(xx) ≈ 1 + + # Rescale Matrix + XX = deepcopy(X) + standardize!(XX) + @test abs(sum(mean(XX, 2))) <= 10e-10 + @test std(XX, 2) ≈ ones(size(X, 1)) + + XX = deepcopy(X) + standardize!(XX, obsdim=2) + @test abs(sum(mean(XX, 2))) <= 10e-10 + @test std(XX, 2) ≈ ones(size(X, 1)) + + XX = deepcopy(X) + standardize!(XX, obsdim=1) + @test abs(sum(mean(XX, 1))) <= 10e-10 + @test vec(std(XX, 1)) ≈ ones(size(X, 2)) + + XX = deepcopy(X) + mu = vec(mean(XX, 1)) + sigma = vec(std(XX, 1)) + standardize!(XX, mu, sigma, obsdim=1) + @test abs(sum(mean(XX, 1))) <= 10e-10 + + XX = deepcopy(X) + mu = vec(mean(XX, 2)) + sigma = vec(std(XX, 2)) + standardize!(XX, mu, sigma, obsdim=2) + @test abs(sum(mean(XX, 2))) <= 10e-10 + + XX = deepcopy(X) + flt = [1,2] + standardize!(XX, obsdim=1, operate_on=flt) + @test abs(sum(mean(XX[:,flt], 1))) <= 10e-10 + @test vec(std(XX[:,flt], 1)) ≈ ones(2) + @test all(X[:,[3,4]] .== XX[:,[3,4]]) + + XX = deepcopy(X) + flt = [2,8] + mu = vec(mean(XX, 2)) + sigma = vec(std(XX, 2)) + standardize!(XX, mu[flt], sigma[flt], obsdim=2, operate_on=flt) + @test abs(sum(mean(XX[flt,:], 2))) <= 10e-10 + + scaler = fit(StandardScaler, X) + XX = transform(X, scaler) + @test abs(sum(mean(XX, 2))) <= 10e-10 + @test std(XX, 2) ≈ ones(size(X, 1)) + + scaler = fit(StandardScaler, X, obsdim=2) + XX = transform(X, scaler) + @test abs(sum(mean(XX, 2))) <= 10e-10 + @test std(XX, 2) ≈ ones(size(X, 1)) + + scaler = fit(StandardScaler, X, obsdim=1) + XX = transform(X, scaler) + @test abs(sum(mean(XX, 1))) <= 10e-10 + @test vec(std(XX, 1)) ≈ ones(size(X, 2)) + + flt = [1,4] + scaler = fit(StandardScaler, X, obsdim=1, operate_on=flt) + XX = transform(X, scaler) + xx = transform(vec(X[1,:]), scaler) + @test abs(sum(mean(XX[:,flt], 1))) <= 10e-10 + @test vec(std(XX[:,flt], 1)) ≈ ones(size(X[:,flt], 2)) + @test all(xx .== XX[1,:]) + + XX = deepcopy(X) + xx = vec(X[1,:]) + flt = [1,4] + scaler = fit(StandardScaler, X, obsdim=1, 
operate_on=flt) + transform!(XX, scaler) + transform!(xx, scaler) + @test abs(sum(mean(XX[:,flt], 1))) <= 10e-10 + @test vec(std(XX[:,flt], 1)) ≈ ones(size(X[:,flt], 2)) + @test all(xx .== XX[1,:]) end -#= @testset "DataFrame" begin =# -#= D = copy(df) =# -#= mu, sigma = standardize!(D) =# -#= @test abs(sum([mean(D[colname]) for colname in names(D)[1:2]])) <= 10e-10 =# -#= @test mean([std(D[colname]) for colname in names(D)[1:2]]) - 1 <= 10e-10 =# - -#= D = copy(df) =# -#= mu, sigma = standardize!(D, [:A, :B]) =# -#= @test abs(sum([mean(D[colname]) for colname in names(D)[1:2]])) <= 10e-10 =# -#= @test mean([std(D[colname]) for colname in names(D)[1:2]]) - 1 <= 10e-10 =# - -#= D = copy(df) =# -#= mu_check = [mean(D[colname]) for colname in names(D)[1:2]] =# -#= sigma_check = [std(D[colname]) for colname in names(D)[1:2]] =# -#= mu, sigma = standardize!(D, [:A, :B], mu_check, sigma_check) =# -#= @test abs(sum([mean(D[colname]) for colname in names(D)[1:2]])) <= 10e-10 =# -#= @test mean([std(D[colname]) for colname in names(D)[1:2]]) - 1 <= 10e-10 =# - -#= # skip columns that contain NA values =# -#= D = copy(df_na) =# -#= mu, sigma = standardize!(D, [:A, :B]) =# -#= @test isna(D[1, :A]) =# -#= @test all(D[2:end, :A] .== df_na[2:end, :A]) =# -#= @test abs(mean(D[:B])) < 10e-10 =# -#= @test abs(std(D[:B])) - 1 < 10e-10 =# - -#= D = copy(df_na) =# -#= mu_check = [mean(D[colname]) for colname in names(D)[1:2]] =# -#= sigma_check = [std(D[colname]) for colname in names(D)[1:2]] =# -#= mu, sigma = standardize!(D, [:A, :B], mu_check, sigma_check) =# -#= #1= @test isna(D[1, :A]) =1# =# -#= #1= @test all(D[2:end, :A] .== df_na[2:end, :A]) =1# =# -#= #1= @test abs(mean(D[:B])) < 10e-10 =1# =# -#= #1= @test (abs(std(D[:B])) - 1) < 10e-10 =1# =# -#= end =# +@testset "DataFrame" begin + DD = deepcopy(D) + mu, sigma = standardize!(DD) + @test abs(sum([mean(DD[colname]) for colname in names(DD)[1:2]])) <= 10e-10 + @test mean([std(DD[colname]) for colname in names(DD)[1:2]]) - 1 <= 
10e-10 + + DD = deepcopy(D) + mu, sigma = standardize!(DD, operate_on=[:A,:B,:C]) + @test abs(sum([mean(DD[colname]) for colname in names(DD)[1:2]])) <= 10e-10 + @test mean([std(DD[colname]) for colname in names(DD)[1:2]]) - 1 <= 10e-10 + + DD = deepcopy(D) + mu, sigma = standardize!(DD, mu, sigma, operate_on=[:A,:B]) + @test abs(sum([mean(DD[colname]) for colname in names(DD)[1:2]])) <= 10e-10 + @test mean([std(DD[colname]) for colname in names(DD)[1:2]]) - 1 <= 10e-10 + + # skip columns that contain NA values + DD = deepcopy(D_NA) + mu, sigma = standardize!(DD) + @test isna(DD[1, :A]) + @test all(DD[2:end, :A] .== D_NA[2:end, :A]) + @test abs(mean(DD[:B])) < 10e-10 + @test abs(std(DD[:B])) - 1 < 10e-10 + + scaler = fit(StandardScaler, D) + DD = transform(D, scaler) + @test mean(DD[:A]) <= 10e-10 + @test std(DD[:A]) - 1 <= 10e-10 + @test mean(DD[:B]) <= 10e-10 + @test std(DD[:B]) - 1 <= 10e-10 + @test all(DD[:C] .== D[:C]) + + scaler = fit(StandardScaler, D, operate_on=[:A, :C]) + DD = transform(D, scaler) + @test mean(DD[:A]) <= 10e-10 + @test std(DD[:A]) - 1 <= 10e-10 + @test all(DD[:B] .== D[:B]) + @test all(DD[:C] .== D[:C]) + @test mean(D[:A]) != mean(DD[:A]) + + DD = deepcopy(D) + scaler = fit(StandardScaler, DD, operate_on=[:A, :C]) + transform!(DD, scaler) + @test mean(DD[:A]) <= 10e-10 + @test std(DD[:A]) - 1 <= 10e-10 + @test all(DD[:B] .== D[:B]) + @test all(DD[:C] .== D[:C]) + @test mean(D[:A]) != mean(DD[:A]) +end From 0c21f0974b8978f367776ee2ffafafcea8781683 Mon Sep 17 00:00:00 2001 From: abieler Date: Thu, 1 Jun 2017 07:07:04 +0200 Subject: [PATCH 05/18] Add fit_transform and fit_transform! 
--- src/fixedrange.jl | 76 ++++++++++++++++++++++-------------- src/scaleselection.jl | 24 ++++++++++++ src/standardize.jl | 24 ++++++++++++ test/runtests.jl | 2 +- test/tst_fixedrangescaler.jl | 7 ++++ 5 files changed, 102 insertions(+), 31 deletions(-) diff --git a/src/fixedrange.jl b/src/fixedrange.jl index bb84359..4fb70e2 100644 --- a/src/fixedrange.jl +++ b/src/fixedrange.jl @@ -220,14 +220,6 @@ data and then apply the scaling to the test data at a later stage. (See examples `upper` : (Scalar) Upper limit of new range. Defaults to 1. -`xmin` : (Vector) Minimum values of data before normalization. `xmin` will - correspond to `lower` after transformation. - Defaults to `minimum(X, obsdim)`. - -`xmin` : (Vector) Maximum value of data before normalization. `xmax` will - correspond to `upper` after transformation. - Defaults to `maximum(X, obsdim)`. - `obsdim` : Specify which axis corresponds to observations. Defaults to obsdim=2 (observations are columns of matrix) For DataFrames `obsdim` is obsolete and rescaling occurs @@ -321,46 +313,70 @@ function FixedRangeScaler(D::AbstractDataFrame, lower::Real, upper::Real, operat FixedRangeScaler(lower, upper, xmin, xmax, ObsDim.Constant{1}(), colnames) end -function valid_columns(D::AbstractDataFrame) - valid_colnames = Symbol[] - for colname in names(D) - if (eltype(D[colname]) <: Real) & !(any(isna(D[colname]))) - push!(valid_colnames, colname) - else - warn("Skipping \"$colname\" because it either contains NA or is not of type <: Real") - end - end - valid_colnames +function StatsBase.fit{T<:Real,N}(::Type{FixedRangeScaler}, X::AbstractArray{T,N}; obsdim=LearnBase.default_obsdim(X), operate_on=default_scaleselection(X, convert(ObsDimension, obsdim))) + FixedRangeScaler(X, convert(ObsDimension, obsdim), operate_on) end -function valid_columns(D::AbstractDataFrame, colnames) - valid_colnames = Symbol[] - for colname in colnames - if (eltype(D[colname]) <: Real) & !(any(isna(D[colname]))) - push!(valid_colnames, 
colname) - else - warn("Skipping \"$colname\" because it either contains NA or is not of type <: Real") - end - end - valid_colnames +function fit_transform{T<:Real,N}(::Type{FixedRangeScaler}, X::AbstractArray{T,N}; obsdim=LearnBase.default_obsdim(X), operate_on=default_scaleselection(X, convert(ObsDimension, obsdim))) + scaler = FixedRangeScaler(X, convert(ObsDimension, obsdim), operate_on) + Xnew = transform(X, scaler) + return Xnew, scaler end -function StatsBase.fit{T<:Real,N}(::Type{FixedRangeScaler}, X::AbstractArray{T,N}; obsdim=LearnBase.default_obsdim(X), operate_on=default_scaleselection(X, convert(ObsDimension, obsdim))) - FixedRangeScaler(X, convert(ObsDimension, obsdim), operate_on) +function fit_transform!{T<:Real,N}(::Type{FixedRangeScaler}, X::AbstractArray{T,N}; obsdim=LearnBase.default_obsdim(X), operate_on=default_scaleselection(X, convert(ObsDimension, obsdim))) + scaler = FixedRangeScaler(X, convert(ObsDimension, obsdim), operate_on) + transform!(X, scaler) + return scaler end function StatsBase.fit{T<:Real,N}(::Type{FixedRangeScaler}, X::AbstractArray{T,N}, lower, upper; obsdim=LearnBase.default_obsdim(X), operate_on=default_scaleselection(X, convert(ObsDimension, obsdim))) FixedRangeScaler(X, lower, upper, convert(ObsDimension, obsdim), operate_on) end +function fit_transform{T<:Real,N}(::Type{FixedRangeScaler}, X::AbstractArray{T,N}, lower, upper; obsdim=LearnBase.default_obsdim(X), operate_on=default_scaleselection(X, convert(ObsDimension, obsdim))) + scaler = FixedRangeScaler(X, lower, upper, convert(ObsDimension, obsdim), operate_on) + Xnew = transform(X, scaler) + return Xnew, scaler +end + +function fit_transform!{T<:Real,N}(::Type{FixedRangeScaler}, X::AbstractArray{T,N}, lower, upper; obsdim=LearnBase.default_obsdim(X), operate_on=default_scaleselection(X, convert(ObsDimension, obsdim))) + scaler = FixedRangeScaler(X, lower, upper, convert(ObsDimension, obsdim), operate_on) + transform!(X, scaler) + return scaler +end + function 
StatsBase.fit(::Type{FixedRangeScaler}, D::AbstractDataFrame; operate_on=default_scaleselection(D)) FixedRangeScaler(D, 0, 1, operate_on) end +function fit_transform(::Type{FixedRangeScaler}, D::AbstractDataFrame; operate_on=default_scaleselection(D)) + scaler = FixedRangeScaler(D, 0, 1, operate_on) + Dnew = transform(D, scaler) + return Dnew, scaler +end + +function fit_transform!(::Type{FixedRangeScaler}, D::AbstractDataFrame; operate_on=default_scaleselection(D)) + scaler = FixedRangeScaler(D, 0, 1, operate_on) + transform!(D, scaler) + return scaler +end + function StatsBase.fit(::Type{FixedRangeScaler}, D::AbstractDataFrame, lower, upper; operate_on=default_scaleselection(D)) FixedRangeScaler(D, lower, upper, operate_on) end +function fit_transform(::Type{FixedRangeScaler}, D::AbstractDataFrame, lower, upper; operate_on=default_scaleselection(D)) + scaler = FixedRangeScaler(D, lower, upper, operate_on) + Dnew = transform(D, scaler) + return Dnew, scaler +end + +function fit_transform!(::Type{FixedRangeScaler}, D::AbstractDataFrame, lower, upper; operate_on=default_scaleselection(D)) + scaler = FixedRangeScaler(D, lower, upper, operate_on) + transform!(D, scaler) + return scaler +end + function transform!{T<:AbstractFloat,N}(X::AbstractArray{T,N}, cs::FixedRangeScaler) fixedrange!(X, cs.lower, cs.upper, cs.xmin, cs.xmax, cs.obsdim, cs.operate_on) end diff --git a/src/scaleselection.jl b/src/scaleselection.jl index 4abfd77..2b7a5c0 100644 --- a/src/scaleselection.jl +++ b/src/scaleselection.jl @@ -29,3 +29,27 @@ function default_scaleselection(D::AbstractDataFrame) names(D)[flt] end +function valid_columns(D::AbstractDataFrame) + valid_colnames = Symbol[] + for colname in names(D) + if (eltype(D[colname]) <: Real) & !(any(isna(D[colname]))) + push!(valid_colnames, colname) + else + warn("Skipping \"$colname\" because it either contains NA or is not of type <: Real") + end + end + valid_colnames +end + +function valid_columns(D::AbstractDataFrame, colnames) + 
valid_colnames = Symbol[] + for colname in colnames + if (eltype(D[colname]) <: Real) & !(any(isna(D[colname]))) + push!(valid_colnames, colname) + else + warn("Skipping \"$colname\" because it either contains NA or is not of type <: Real") + end + end + valid_colnames +end + diff --git a/src/standardize.jl b/src/standardize.jl index d28a2ab..92428e4 100644 --- a/src/standardize.jl +++ b/src/standardize.jl @@ -264,10 +264,34 @@ function StatsBase.fit{T<:Real}(::Type{StandardScaler}, X::AbstractMatrix{T}; ob StandardScaler(X, convert(ObsDimension, obsdim), operate_on) end +function fit_transform{T<:Real}(::Type{StandardScaler}, X::AbstractMatrix{T}; obsdim=LearnBase.default_obsdim(X), operate_on=default_scaleselection(X, convert(ObsDimension, obsdim))) + scaler = StandardScaler(X, convert(ObsDimension, obsdim), operate_on) + Xnew = transform(X, scaler) + return Xnew, scaler +end + +function fit_transform!{T<:Real}(::Type{StandardScaler}, X::AbstractMatrix{T}; obsdim=LearnBase.default_obsdim(X), operate_on=default_scaleselection(X, convert(ObsDimension, obsdim))) + scaler = StandardScaler(X, convert(ObsDimension, obsdim), operate_on) + transform!(X, scaler) + return scaler +end + function StatsBase.fit(::Type{StandardScaler}, D::AbstractDataFrame; operate_on=default_scaleselection(D)) StandardScaler(D, operate_on) end +function fit_transform(::Type{StandardScaler}, D::AbstractDataFrame; operate_on=default_scaleselection(D)) + scaler = StandardScaler(D, operate_on) + Dnew = transform(D, scaler) + return Dnew, scaler +end + +function fit_transform!(::Type{StandardScaler}, D::AbstractDataFrame; operate_on=default_scaleselection(D)) + scaler = StandardScaler(D, operate_on) + transform!(D, scaler) + return scaler +end + function transform!{T<:AbstractFloat,N}(X::AbstractArray{T,N}, cs::StandardScaler) standardize!(X, cs.offset, cs.scale, cs.obsdim, cs.operate_on) X diff --git a/test/runtests.jl b/test/runtests.jl index 39eb1b0..3f52c72 100644 --- a/test/runtests.jl +++ 
b/test/runtests.jl @@ -5,8 +5,8 @@ using Base.Test tests = [ "tst_expand.jl" "tst_center.jl" - "tst_fixedrangescaler.jl" "tst_standardize.jl" + "tst_fixedrangescaler.jl" ] for t in tests diff --git a/test/tst_fixedrangescaler.jl b/test/tst_fixedrangescaler.jl index bf9b13e..e77bbd7 100644 --- a/test/tst_fixedrangescaler.jl +++ b/test/tst_fixedrangescaler.jl @@ -91,6 +91,13 @@ end @test minimum(DD[:B]) == -1 @test maximum(DD[:B]) == 1 + scaler = fit(FixedRangeScaler, D_NA, -1, 1, operate_on=[:A, :B, :C]) + DD = transform(D_NA, scaler) + @test isna(DD[1,:A]) + @test DD[end,:A] == D_NA[end,:A] + @test minimum(DD[:B]) == -1 + @test maximum(DD[:B]) == 1 + DD = deepcopy(D) scaler = fit(FixedRangeScaler, DD, -1, 1, operate_on=[:A, :B]) transform!(DD, scaler) From a09226fe425f1ded3038aeb7c7f96b8f98aca2f3 Mon Sep 17 00:00:00 2001 From: abieler Date: Thu, 1 Jun 2017 10:17:06 +0200 Subject: [PATCH 06/18] Clean up --- src/MLPreprocessing.jl | 4 +++- src/fixedrange.jl | 38 ++++++++++++++++++++++-------------- src/standardize.jl | 22 +++++++++++++-------- test/tst_fixedrangescaler.jl | 30 ++++++++++++++-------------- test/tst_standardize.jl | 16 +++++++-------- 5 files changed, 63 insertions(+), 47 deletions(-) diff --git a/src/MLPreprocessing.jl b/src/MLPreprocessing.jl index ce4ce25..798e5ee 100644 --- a/src/MLPreprocessing.jl +++ b/src/MLPreprocessing.jl @@ -19,10 +19,12 @@ export StandardScaler, FixedRangeScaler, fit, + fit_transform, + fit_transform!, transform, transform! - + include("scaleselection.jl") include("basis_expansion.jl") include("center.jl") diff --git a/src/fixedrange.jl b/src/fixedrange.jl index 4fb70e2..05e7a4f 100644 --- a/src/fixedrange.jl +++ b/src/fixedrange.jl @@ -205,12 +205,15 @@ end """ -`FixedRangeScaler` is used with the functions `fit()` and `transform()`. +`FixedRangeScaler` is used with the functions `fit()`, `transform()` and `fit_transform()` +to scale data in a Matrix `X` or DataFrame to a fixed range [lower:upper]. 
After fitting a `FixedRangeScaler` to one data set, it can be used to perform the same transformation to a new set of data. E.g. fit the `FixedRangeScaler` to your training data and then apply the scaling to the test data at a later stage. (See examples below). - fit(FixedRangeScaler, X[, lower, upper; obsdim, operate_on]) + fit(X, FixedRangeScaler[, lower, upper; obsdim, operate_on]) + + fit_transform(X, FixedRangeScaler[, lower, upper; obsdim, operate_on]) `X` : Data of type Matrix or `DataFrame`. @@ -249,9 +252,14 @@ Examples: scaler = fit(FixedRangeScaler, Xtrain, -1, 1, obsdim=1, operate_on=[1,3]) scaler = fit(FixedRangeScaler, D, -1, 1, operate_on=[:A,:B]) - transform(Xtest, scaler) + Xscaled = transform(Xtest, scaler) transform!(Xtest, scaler) + Xscaled, scaler = fit_transform(X, FixedRangeScaler, -1, 1, obsdim=1, operate_on=[1,2,4]) + scaler = fit_transform!(X, FixedRangeScaler, -1, 1, obsdim=1, operate_on=[1,2,4]) + + + Note that for `transform!` the data matrix `X` has to be of type <: AbstractFloat as the scaling occurs inplace. (E.g. cannot be of type Matrix{Int64}). This is not the case for `transform` however. 
@@ -313,65 +321,65 @@ function FixedRangeScaler(D::AbstractDataFrame, lower::Real, upper::Real, operat FixedRangeScaler(lower, upper, xmin, xmax, ObsDim.Constant{1}(), colnames) end -function StatsBase.fit{T<:Real,N}(::Type{FixedRangeScaler}, X::AbstractArray{T,N}; obsdim=LearnBase.default_obsdim(X), operate_on=default_scaleselection(X, convert(ObsDimension, obsdim))) +function StatsBase.fit{T<:Real,N}(X::AbstractArray{T,N}, ::Type{FixedRangeScaler}; obsdim=LearnBase.default_obsdim(X), operate_on=default_scaleselection(X, convert(ObsDimension, obsdim))) FixedRangeScaler(X, convert(ObsDimension, obsdim), operate_on) end -function fit_transform{T<:Real,N}(::Type{FixedRangeScaler}, X::AbstractArray{T,N}; obsdim=LearnBase.default_obsdim(X), operate_on=default_scaleselection(X, convert(ObsDimension, obsdim))) +function fit_transform{T<:Real,N}(X::AbstractArray{T,N}, ::Type{FixedRangeScaler}; obsdim=LearnBase.default_obsdim(X), operate_on=default_scaleselection(X, convert(ObsDimension, obsdim))) scaler = FixedRangeScaler(X, convert(ObsDimension, obsdim), operate_on) Xnew = transform(X, scaler) return Xnew, scaler end -function fit_transform!{T<:Real,N}(::Type{FixedRangeScaler}, X::AbstractArray{T,N}; obsdim=LearnBase.default_obsdim(X), operate_on=default_scaleselection(X, convert(ObsDimension, obsdim))) +function fit_transform!{T<:Real,N}(X::AbstractArray{T,N}, ::Type{FixedRangeScaler}; obsdim=LearnBase.default_obsdim(X), operate_on=default_scaleselection(X, convert(ObsDimension, obsdim))) scaler = FixedRangeScaler(X, convert(ObsDimension, obsdim), operate_on) transform!(X, scaler) return scaler end -function StatsBase.fit{T<:Real,N}(::Type{FixedRangeScaler}, X::AbstractArray{T,N}, lower, upper; obsdim=LearnBase.default_obsdim(X), operate_on=default_scaleselection(X, convert(ObsDimension, obsdim))) +function StatsBase.fit{T<:Real,N}(X::AbstractArray{T,N}, ::Type{FixedRangeScaler}, lower, upper; obsdim=LearnBase.default_obsdim(X), operate_on=default_scaleselection(X, 
convert(ObsDimension, obsdim))) FixedRangeScaler(X, lower, upper, convert(ObsDimension, obsdim), operate_on) end -function fit_transform{T<:Real,N}(::Type{FixedRangeScaler}, X::AbstractArray{T,N}, lower, upper; obsdim=LearnBase.default_obsdim(X), operate_on=default_scaleselection(X, convert(ObsDimension, obsdim))) +function fit_transform{T<:Real,N}(X::AbstractArray{T,N}, ::Type{FixedRangeScaler}, lower, upper; obsdim=LearnBase.default_obsdim(X), operate_on=default_scaleselection(X, convert(ObsDimension, obsdim))) scaler = FixedRangeScaler(X, lower, upper, convert(ObsDimension, obsdim), operate_on) Xnew = transform(X, scaler) return Xnew, scaler end -function fit_transform!{T<:Real,N}(::Type{FixedRangeScaler}, X::AbstractArray{T,N}, lower, upper; obsdim=LearnBase.default_obsdim(X), operate_on=default_scaleselection(X, convert(ObsDimension, obsdim))) +function fit_transform!{T<:Real,N}(X::AbstractArray{T,N}, ::Type{FixedRangeScaler}, lower, upper; obsdim=LearnBase.default_obsdim(X), operate_on=default_scaleselection(X, convert(ObsDimension, obsdim))) scaler = FixedRangeScaler(X, lower, upper, convert(ObsDimension, obsdim), operate_on) transform!(X, scaler) return scaler end -function StatsBase.fit(::Type{FixedRangeScaler}, D::AbstractDataFrame; operate_on=default_scaleselection(D)) +function StatsBase.fit(D::AbstractDataFrame, ::Type{FixedRangeScaler}; operate_on=default_scaleselection(D)) FixedRangeScaler(D, 0, 1, operate_on) end -function fit_transform(::Type{FixedRangeScaler}, D::AbstractDataFrame; operate_on=default_scaleselection(D)) +function fit_transform(D::AbstractDataFrame, ::Type{FixedRangeScaler}; operate_on=default_scaleselection(D)) scaler = FixedRangeScaler(D, 0, 1, operate_on) Dnew = transform(D, scaler) return Dnew, scaler end -function fit_transform!(::Type{FixedRangeScaler}, D::AbstractDataFrame; operate_on=default_scaleselection(D)) +function fit_transform!(D::AbstractDataFrame, ::Type{FixedRangeScaler}; operate_on=default_scaleselection(D)) 
scaler = FixedRangeScaler(D, 0, 1, operate_on) transform!(D, scaler) return scaler end -function StatsBase.fit(::Type{FixedRangeScaler}, D::AbstractDataFrame, lower, upper; operate_on=default_scaleselection(D)) +function StatsBase.fit(D::AbstractDataFrame, ::Type{FixedRangeScaler}, lower, upper; operate_on=default_scaleselection(D)) FixedRangeScaler(D, lower, upper, operate_on) end -function fit_transform(::Type{FixedRangeScaler}, D::AbstractDataFrame, lower, upper; operate_on=default_scaleselection(D)) +function fit_transform(D::AbstractDataFrame, ::Type{FixedRangeScaler}, lower, upper; operate_on=default_scaleselection(D)) scaler = FixedRangeScaler(D, lower, upper, operate_on) Dnew = transform(D, scaler) return Dnew, scaler end -function fit_transform!(::Type{FixedRangeScaler}, D::AbstractDataFrame, lower, upper; operate_on=default_scaleselection(D)) +function fit_transform!(D::AbstractDataFrame, ::Type{FixedRangeScaler}, lower, upper; operate_on=default_scaleselection(D)) scaler = FixedRangeScaler(D, lower, upper, operate_on) transform!(D, scaler) return scaler diff --git a/src/standardize.jl b/src/standardize.jl index 92428e4..37eace5 100644 --- a/src/standardize.jl +++ b/src/standardize.jl @@ -166,12 +166,15 @@ function standardize!(D::AbstractDataFrame, μ::Real, σ::Real, colname::Symbol) end """ -`StandardScaler` is used with the functions `fit()` and `transform()`. +`StandardScaler` is used with the functions `fit()`, `transform()` and `fit_transform()` +to standardize data in a Matrix `X` or DataFrame according to Xnew = (X - μ) / σ. After fitting a `StandardScaler` to one data set, it can be used to perform the same transformation to a new set of data. E.g. fit the `StandardScaler` to your training data and then apply the scaling to the test data at a later stage. (See examples below). 
- fit(StandardScaler, X[, μ, σ; obsdim, operate_on]) + fit(X, StandardScaler[, μ, σ; obsdim, operate_on]) + + fit_transform(X, StandardScaler[, μ, σ; obsdim, operate_on]) `X` : Data of type Matrix or `DataFrame`. @@ -218,6 +221,9 @@ Examples: transform(Dtest, scaler) transform!(Dtest, scaler) + Xscaled, scaler = fit_transform(X, StandardScaler, obsdim=1, operate_on=[1,2,4]) + scaler = fit_transform!(X, StandardScaler, obsdim=1, operate_on=[1,2,4]) + Note that for `transform!` the data matrix `X` has to be of type <: AbstractFloat as the scaling occurs inplace. (E.g. cannot be of type Matrix{Int64}). This is not the case for `transform` however. @@ -260,33 +266,33 @@ function StandardScaler(D::AbstractDataFrame, offset, scale; operate_on=default_ StandardScaler(offset, scale, ObsDim.Constant{1}(), colnames) end -function StatsBase.fit{T<:Real}(::Type{StandardScaler}, X::AbstractMatrix{T}; obsdim=LearnBase.default_obsdim(X), operate_on=default_scaleselection(X, convert(ObsDimension, obsdim))) +function StatsBase.fit{T<:Real}(X::AbstractMatrix{T}, ::Type{StandardScaler}; obsdim=LearnBase.default_obsdim(X), operate_on=default_scaleselection(X, convert(ObsDimension, obsdim))) StandardScaler(X, convert(ObsDimension, obsdim), operate_on) end -function fit_transform{T<:Real}(::Type{StandardScaler}, X::AbstractMatrix{T}; obsdim=LearnBase.default_obsdim(X), operate_on=default_scaleselection(X, convert(ObsDimension, obsdim))) +function fit_transform{T<:Real}(X::AbstractMatrix{T}, ::Type{StandardScaler}; obsdim=LearnBase.default_obsdim(X), operate_on=default_scaleselection(X, convert(ObsDimension, obsdim))) scaler = StandardScaler(X, convert(ObsDimension, obsdim), operate_on) Xnew = transform(X, scaler) return Xnew, scaler end -function fit_transform!{T<:Real}(::Type{StandardScaler}, X::AbstractMatrix{T}; obsdim=LearnBase.default_obsdim(X), operate_on=default_scaleselection(X, convert(ObsDimension, obsdim))) +function fit_transform!{T<:Real}(X::AbstractMatrix{T}, 
::Type{StandardScaler}; obsdim=LearnBase.default_obsdim(X), operate_on=default_scaleselection(X, convert(ObsDimension, obsdim))) scaler = StandardScaler(X, convert(ObsDimension, obsdim), operate_on) transform!(X, scaler) return scaler end -function StatsBase.fit(::Type{StandardScaler}, D::AbstractDataFrame; operate_on=default_scaleselection(D)) +function StatsBase.fit(D::AbstractDataFrame, ::Type{StandardScaler}; operate_on=default_scaleselection(D)) StandardScaler(D, operate_on) end -function fit_transform(::Type{StandardScaler}, D::AbstractDataFrame; operate_on=default_scaleselection(D)) +function fit_transform(D::AbstractDataFrame, ::Type{StandardScaler}; operate_on=default_scaleselection(D)) scaler = StandardScaler(D, operate_on) Dnew = transform(D, scaler) return Dnew, scaler end -function fit_transform!(::Type{StandardScaler}, D::AbstractDataFrame; operate_on=default_scaleselection(D)) +function fit_transform!(D::AbstractDataFrame, ::Type{StandardScaler}; operate_on=default_scaleselection(D)) scaler = StandardScaler(D, operate_on) transform!(D, scaler) return scaler diff --git a/test/tst_fixedrangescaler.jl b/test/tst_fixedrangescaler.jl index e77bbd7..890f738 100644 --- a/test/tst_fixedrangescaler.jl +++ b/test/tst_fixedrangescaler.jl @@ -5,52 +5,52 @@ D = DataFrame(A=rand(10), B=collect(1:10), C=[hex(x) for x in 11:20]) D_NA = deepcopy(D) D_NA[1, :A] = NA @testset "Array" begin - scaler = fit(FixedRangeScaler, X) + scaler = fit(X, FixedRangeScaler) XX = transform(X, scaler) @test mean(XX[:,end]) ≈ 1 @test mean(XX[:,1]) ≈ 0 @test maximum(XX) == 1 @test minimum(XX) == 0 - scaler = fit(FixedRangeScaler, X, obsdim=1) + scaler = fit(X, FixedRangeScaler, obsdim=1) XX = transform(X, scaler) @test mean(XX[1,:]) ≈ 0 @test mean(XX[end,:]) ≈ 1 @test maximum(XX) == 1 @test minimum(XX) == 0 - scaler = fit(FixedRangeScaler, X, -2, 2) + scaler = fit(X, FixedRangeScaler, -2, 2) XX = transform(X, scaler) @test mean(XX[:,end]) ≈ 2 @test mean(XX[:,1]) ≈ -2 @test maximum(XX) 
== 2 @test minimum(XX) == -2 - scaler = fit(FixedRangeScaler, X, -2, 2, obsdim=1) + scaler = fit(X, FixedRangeScaler, -2, 2, obsdim=1) XX = transform(X, scaler) @test mean(XX[1,:]) ≈ -2 @test mean(XX[end,:]) ≈ 2 @test maximum(XX) == 2 @test minimum(XX) == -2 - scaler = fit(FixedRangeScaler, X, -2, 2, obsdim=2) + scaler = fit(X, FixedRangeScaler, -2, 2, obsdim=2) XX = transform(X, scaler) @test mean(minimum(XX, 2)) ≈ -2 @test mean(maximum(XX, 2)) ≈ 2 @test maximum(XX) == 2 @test minimum(XX) == -2 - scaler = fit(FixedRangeScaler, X, -2, 2, obsdim=1, operate_on=[1,2]) + scaler = fit(X, FixedRangeScaler, -2, 2, obsdim=1, operate_on=[1,2]) XX = transform(X, scaler) @test mean(minimum(XX[:,[1,2]], 1)) ≈ -2 @test mean(maximum(XX[:,[1,2]], 1)) ≈ 2 - scaler = fit(FixedRangeScaler, X, -2, 2, obsdim=2, operate_on=[1,2]) + scaler = fit(X, FixedRangeScaler, -2, 2, obsdim=2, operate_on=[1,2]) XX = transform(X, scaler) @test mean(minimum(XX[[1,2],:], 2)) ≈ -2 @test mean(maximum(XX[[1,2],:], 2)) ≈ 2 - scaler = fit(FixedRangeScaler, X, -2, 2, obsdim=2, operate_on=[1,2]) + scaler = fit(X, FixedRangeScaler, -2, 2, obsdim=2, operate_on=[1,2]) XX = deepcopy(X) transform!(XX, scaler) @test mean(minimum(XX[[1,2],:], 2)) ≈ -2 @@ -58,40 +58,40 @@ D_NA[1, :A] = NA end @testset "DataFrame" begin - scaler = fit(FixedRangeScaler, D) + scaler = fit(D, FixedRangeScaler) DD = transform(D, scaler) @test minimum(DD[:A]) == 0 @test maximum(DD[:A]) == 1 - scaler = fit(FixedRangeScaler, D, -1, 1) + scaler = fit(D, FixedRangeScaler, -1, 1) DD = transform(D, scaler) @test minimum(DD[:A]) == -1 @test maximum(DD[:A]) == 1 @test minimum(DD[:B]) == -1 @test maximum(DD[:B]) == 1 - scaler = fit(FixedRangeScaler, D, -1, 1, operate_on=[:A]) + scaler = fit(D, FixedRangeScaler, -1, 1, operate_on=[:A]) DD = transform(D, scaler) @test minimum(DD[:A]) == -1 @test maximum(DD[:A]) == 1 @test minimum(DD[:B]) == minimum(D[:B]) @test maximum(DD[:B]) == maximum(D[:B]) - scaler = fit(FixedRangeScaler, D_NA, -1, 1) + scaler 
= fit(D_NA, FixedRangeScaler, -1, 1) DD = transform(D_NA, scaler) @test isna(DD[1,:A]) @test DD[end,:A] == D_NA[end,:A] @test minimum(DD[:B]) == -1 @test maximum(DD[:B]) == 1 - scaler = fit(FixedRangeScaler, D_NA, -1, 1, operate_on=[:A, :B]) + scaler = fit(D_NA, FixedRangeScaler, -1, 1, operate_on=[:A, :B]) DD = transform(D_NA, scaler) @test isna(DD[1,:A]) @test DD[end,:A] == D_NA[end,:A] @test minimum(DD[:B]) == -1 @test maximum(DD[:B]) == 1 - scaler = fit(FixedRangeScaler, D_NA, -1, 1, operate_on=[:A, :B, :C]) + scaler = fit(D_NA, FixedRangeScaler, -1, 1, operate_on=[:A, :B, :C]) DD = transform(D_NA, scaler) @test isna(DD[1,:A]) @test DD[end,:A] == D_NA[end,:A] @@ -99,7 +99,7 @@ end @test maximum(DD[:B]) == 1 DD = deepcopy(D) - scaler = fit(FixedRangeScaler, DD, -1, 1, operate_on=[:A, :B]) + scaler = fit(DD, FixedRangeScaler, -1, 1, operate_on=[:A, :B]) transform!(DD, scaler) @test minimum(DD[:A]) == -1 @test maximum(DD[:A]) == 1 diff --git a/test/tst_standardize.jl b/test/tst_standardize.jl index 44a99a3..9a701a3 100644 --- a/test/tst_standardize.jl +++ b/test/tst_standardize.jl @@ -72,23 +72,23 @@ D_NA[1, :A] = NA standardize!(XX, mu[flt], sigma[flt], obsdim=2, operate_on=flt) @test abs(sum(mean(XX[flt,:], 2))) <= 10e-10 - scaler = fit(StandardScaler, X) + scaler = fit(X, StandardScaler) XX = transform(X, scaler) @test abs(sum(mean(XX, 2))) <= 10e-10 @test std(XX, 2) ≈ ones(size(X, 1)) - scaler = fit(StandardScaler, X, obsdim=2) + scaler = fit(X, StandardScaler, obsdim=2) XX = transform(X, scaler) @test abs(sum(mean(XX, 2))) <= 10e-10 @test std(XX, 2) ≈ ones(size(X, 1)) - scaler = fit(StandardScaler, X, obsdim=1) + scaler = fit(X, StandardScaler, obsdim=1) XX = transform(X, scaler) @test abs(sum(mean(XX, 1))) <= 10e-10 @test vec(std(XX, 1)) ≈ ones(size(X, 2)) flt = [1,4] - scaler = fit(StandardScaler, X, obsdim=1, operate_on=flt) + scaler = fit(X, StandardScaler, obsdim=1, operate_on=flt) XX = transform(X, scaler) xx = transform(vec(X[1,:]), scaler) @test 
abs(sum(mean(XX[:,flt], 1))) <= 10e-10 @@ -98,7 +98,7 @@ D_NA[1, :A] = NA XX = deepcopy(X) xx = vec(X[1,:]) flt = [1,4] - scaler = fit(StandardScaler, X, obsdim=1, operate_on=flt) + scaler = fit(X, StandardScaler, obsdim=1, operate_on=flt) transform!(XX, scaler) transform!(xx, scaler) @test abs(sum(mean(XX[:,flt], 1))) <= 10e-10 @@ -130,7 +130,7 @@ end @test abs(mean(DD[:B])) < 10e-10 @test abs(std(DD[:B])) - 1 < 10e-10 - scaler = fit(StandardScaler, D) + scaler = fit(D, StandardScaler) DD = transform(D, scaler) @test mean(DD[:A]) <= 10e-10 @test std(DD[:A]) - 1 <= 10e-10 @@ -138,7 +138,7 @@ end @test std(DD[:B]) - 1 <= 10e-10 @test all(DD[:C] .== D[:C]) - scaler = fit(StandardScaler, D, operate_on=[:A, :C]) + scaler = fit(D, StandardScaler, operate_on=[:A, :C]) DD = transform(D, scaler) @test mean(DD[:A]) <= 10e-10 @test std(DD[:A]) - 1 <= 10e-10 @@ -147,7 +147,7 @@ end @test mean(D[:A]) != mean(DD[:A]) DD = deepcopy(D) - scaler = fit(StandardScaler, DD, operate_on=[:A, :C]) + scaler = fit(DD, StandardScaler, operate_on=[:A, :C]) transform!(DD, scaler) @test mean(DD[:A]) <= 10e-10 @test std(DD[:A]) - 1 <= 10e-10 From 2c34c73f844b8bc4eb6abb5e1a8f6f1aba1ec0b8 Mon Sep 17 00:00:00 2001 From: abieler Date: Thu, 1 Jun 2017 17:53:26 +0200 Subject: [PATCH 07/18] WIP one hot encoding of DataFrame columns --- src/MLPreprocessing.jl | 5 ++++- src/encoding.jl | 26 ++++++++++++++++++++++++++ src/scaleselection.jl | 22 ++++++++++++++++++---- 3 files changed, 48 insertions(+), 5 deletions(-) create mode 100644 src/encoding.jl diff --git a/src/MLPreprocessing.jl b/src/MLPreprocessing.jl index 798e5ee..4ab16e5 100644 --- a/src/MLPreprocessing.jl +++ b/src/MLPreprocessing.jl @@ -22,7 +22,9 @@ export fit_transform, fit_transform!, transform, - transform! + transform!, + + onehot! 
include("scaleselection.jl") @@ -30,5 +32,6 @@ include("basis_expansion.jl") include("center.jl") include("standardize.jl") include("fixedrange.jl") +include("encoding.jl") end # module diff --git a/src/encoding.jl b/src/encoding.jl new file mode 100644 index 0000000..24e3208 --- /dev/null +++ b/src/encoding.jl @@ -0,0 +1,26 @@ +immutable OneHotEncoder{S} + operate_on::Vector{S} +end + +function onehot!(D::DataFrame, varname::Symbol) + for keyword in unique(D[:, varname]) + sym_keyword = Symbol(string(varname) * "_" * keyword) + D[sym_keyword] = zeros(Int, size(D, 1)) + for i in 1:size(D, 1) + if D[i, varname] == keyword + D[i, sym_keyword] = 1 + end + end + end +end + +function onehot!(D::DataFrame, operate_on::Vector{Symbol}) + for varname in operate_on + onehot!(D, varname) + end +end + +function onehot!(D::DataFrame; operate_on=default_categoricalselection(D)) + onehot!(D, operate_on) +end + diff --git a/src/scaleselection.jl b/src/scaleselection.jl index 2b7a5c0..13700ff 100644 --- a/src/scaleselection.jl +++ b/src/scaleselection.jl @@ -22,11 +22,12 @@ function default_scaleselection{M}(x::AbstractVector, ::ObsDim.Constant{M}) collect(1:length(x)) end +function default_categoricalselection(D::AbstractDataFrame) + valid_columns_categorical(D::AbstractDataFrame) +end + function default_scaleselection(D::AbstractDataFrame) - flt1 = Bool[T <: Real for T in eltypes(D)] - flt2 = Bool[any(isna(D[colname])) for colname in names(D)] - flt = (flt1 & !flt2) - names(D)[flt] + valid_columns(D) end function valid_columns(D::AbstractDataFrame) @@ -53,3 +54,16 @@ function valid_columns(D::AbstractDataFrame, colnames) valid_colnames end +function valid_columns_categorical(D::AbstractDataFrame) + valid_colnames = Symbol[] + for colname in names(D) + if !(eltype(D[colname]) <: Real) + if !(any(isna(D[colname]))) + push!(valid_colnames, colname) + else + warn("Skipping \"$colname\" because it contains NA") + end + end + end + valid_colnames +end From 
d9da3af8c0a33797ed2b9f51c8b5a19a83d3c726 Mon Sep 17 00:00:00 2001 From: abieler Date: Fri, 2 Jun 2017 11:02:10 +0200 Subject: [PATCH 08/18] Remove obsolete files --- src/center.jl | 2 +- src/rescale.jl | 158 -------------------------------------------- test/tst_rescale.jl | 93 -------------------------- 3 files changed, 1 insertion(+), 252 deletions(-) delete mode 100644 src/rescale.jl delete mode 100644 test/tst_rescale.jl diff --git a/src/center.jl b/src/center.jl index 2291a4c..aae9a0a 100644 --- a/src/center.jl +++ b/src/center.jl @@ -103,7 +103,7 @@ function center!(x::AbstractVector; obsdim=LearnBase.default_obsdim(x), operate_ center!(x, convert(ObsDimension, obsdim), operate_on) end -function center!{T,M}(x::AbstractVector{T}, ::ObsDim.Constant{M}, operate_on::AbstractVector) +function center!(x::AbstractVector, ::ObsDim.Constant, operate_on::AbstractVector) μ = mean(x) for iVar in operate_on x[iVar] = x[iVar] - μ diff --git a/src/rescale.jl b/src/rescale.jl deleted file mode 100644 index 476489a..0000000 --- a/src/rescale.jl +++ /dev/null @@ -1,158 +0,0 @@ -""" - μ, σ = rescale!(X[, μ, σ, obsdim]) - -or - - μ, σ = rescale!(D[, colnames, μ, σ]) - -where `X` is of type Matrix or Vector and `D` of type DataFrame. - -Center `X` along `obsdim` around the corresponding entry in the -vector `μ` and then rescale each feature using the corresponding -entry in the vector `σ`. - -For DataFrames, `obsdim` is obsolete and centering is done column wise. -The vector `colnames` allows to specify which columns to center. -If `colnames` is not provided all columns of type T<:Real are centered. 
- -Example: - - X = rand(4, 100) - D = DataFrame(A=rand(10), B=collect(1:10), C=[string(x) for x in 1:10]) - - μ, σ = rescale!(X, obsdim=2) - μ, σ = rescale!(X, ObsDim.First()) - μ, σ = rescale!(D) - μ, σ = rescale!(D, [:A, :B]) - -""" -function rescale!(X, μ, σ; obsdim=LearnBase.default_obsdim(X)) - rescale!(X, μ, σ, convert(ObsDimension, obsdim)) -end - -function rescale!{T,N}(X::AbstractArray{T,N}, μ, σ, ::ObsDim.Last) - rescale!(X, μ, σ, ObsDim.Constant{N}()) -end - -function rescale!(X; obsdim=LearnBase.default_obsdim(X)) - rescale!(X, convert(ObsDimension, obsdim)) -end - -function rescale!{T,N}(X::AbstractArray{T,N}, ::ObsDim.Last) - rescale!(X, ObsDim.Constant{N}()) -end - -function rescale!{T,N,M}(X::AbstractArray{T,N}, obsdim::ObsDim.Constant{M}) - μ = vec(mean(X, M)) - σ = vec(std(X, M)) - rescale!(X, μ, σ, obsdim) -end - -function rescale!(X::AbstractVector, ::ObsDim.Constant{1}) - μ = mean(X) - σ = std(X) - @inbounds for i in 1:length(X) - X[i] = (X[i] - μ) / σ - end - μ, σ -end - -function rescale!(X::AbstractMatrix, μ::AbstractVector, σ::AbstractVector, ::ObsDim.Constant{2}) - σ[σ .== 0] = 1 - nVars, nObs = size(X) - for iObs in 1:nObs - @inbounds for iVar in 1:nVars - X[iVar, iObs] = (X[iVar, iObs] - μ[iVar]) / σ[iVar] - end - end - μ, σ -end - -function rescale!(X::AbstractMatrix, μ::AbstractVector, σ::AbstractVector, ::ObsDim.Constant{1}) - σ[σ .== 0] = 1 - nObs, nVars = size(X) - for iVar in 1:nVars - @inbounds for iObs in 1:nObs - X[iObs, iVar] = (X[iObs, iVar] - μ[iVar]) / σ[iVar] - end - end - μ, σ -end - -function rescale!(X::AbstractVector, μ::AbstractVector, σ::AbstractVector, ::ObsDim.Constant{1}) - @inbounds for i in 1:length(X) - X[i] = (X[i] - μ[i]) / σ[i] - end - μ, σ -end - -function rescale!(X::AbstractVector, μ::AbstractFloat, σ::AbstractFloat, ::ObsDim.Constant{1}) - @inbounds for i in 1:length(X) - X[i] = (X[i] - μ) / σ - end - μ, σ -end - -# -------------------------------------------------------------------- - -function 
rescale!(D::AbstractDataFrame) - μ_vec = Float64[] - σ_vec = Float64[] - - flt = Bool[T <: Real for T in eltypes(D)] - for colname in names(D)[flt] - μ = mean(D[colname]) - σ = std(D[colname]) - rescale!(D, colname, μ, σ) - push!(μ_vec, μ) - push!(σ_vec, σ) - end - μ_vec, σ_vec -end - -function rescale!(D::AbstractDataFrame, colnames::Vector{Symbol}) - μ_vec = Float64[] - σ_vec = Float64[] - for colname in colnames - if eltype(D[colname]) <: Real - μ = mean(D[colname]) - σ = std(D[colname]) - if isna(μ) - warn("Column \"$colname\" contains NA values, skipping rescaling of this column!") - continue - end - rescale!(D, colname, μ, σ) - push!(μ_vec, μ) - push!(σ_vec, σ) - else - warn("Skipping \"$colname\", rescaling only valid for columns of type T <: Real.") - end - end - μ_vec, σ_vec -end - -function rescale!(D::AbstractDataFrame, colnames::Vector{Symbol}, μ::AbstractVector, σ::AbstractVector) - for (icol, colname) in enumerate(colnames) - if eltype(D[colname]) <: Real - rescale!(D, colname, μ[icol], σ[icol]) - else - warn("Skipping \"$colname\", rescaling only valid for columns of type T <: Real.") - end - end - μ, σ -end - -function rescale!(D::AbstractDataFrame, colname::Symbol, μ, σ) - if sum(isna(D[colname])) > 0 - warn("Column \"$colname\" contains NA values, skipping rescaling of this column!") - else - σ_div = σ == 0 ? 
one(σ) : σ - newcol::Vector{Float64} = convert(Vector{Float64}, D[colname]) - nobs = length(newcol) - @inbounds for i in eachindex(newcol) - newcol[i] = (newcol[i] - μ) / σ_div - end - D[colname] = newcol - end - μ, σ -end diff --git a/test/tst_rescale.jl b/test/tst_rescale.jl deleted file mode 100644 index a8c04a3..0000000 --- a/test/tst_rescale.jl +++ /dev/null @@ -1,93 +0,0 @@ -e_x = collect(-2:0.5:10) -e_X = expand_poly(e_x, 5) -df = DataFrame(A=rand(10), B=collect(1:10), C=[string(x) for x in 1:10]) -df_na = deepcopy(df) -df_na[1, :A] = NA - -@testset "Array" begin - # Rescale Vector - xa = copy(e_x) - mu, sigma = standardize!(xa) - @test mu ≈ mean(e_x) - @test sigma ≈ std(e_x) - @test abs(mean(xa)) <= 10e-10 - @test std(xa) ≈ 1 - - xa = copy(e_x) - mu, sigma = standardize!(xa, mu, sigma) - @test abs(mean(xa)) <= 10e-10 - @test std(xa) ≈ 1 - - xa = copy(e_x) - mu, sigma = standardize!(xa, mu, sigma, obsdim=1) - @test abs(mean(xa)) <= 10e-10 - @test std(xa) ≈ 1 - - xa = copy(e_x) - mu = copy(e_x) .- 1 - sigma = ones(e_x) - mu, sigma = standardize!(xa, mu, sigma, obsdim=1) - @test mean(xa) ≈ 1 - - Xa = copy(e_X) - standardize!(Xa) - @test abs(sum(mean(Xa, 2))) <= 10e-10 - @test std(Xa, 2) ≈ [1, 1, 1, 1, 1] - - Xa = copy(e_X) - standardize!(Xa, obsdim=2) - @test abs(sum(mean(Xa, 2))) <= 10e-10 - @test std(Xa, 2) ≈ [1, 1, 1, 1, 1] - - Xa = copy(e_X) - standardize!(Xa, obsdim=1) - @test abs(sum(mean(Xa, 1))) <= 10e-10 - - Xa = copy(e_X) - mu = vec(mean(Xa, 1)) - sigma = vec(std(Xa, 1)) - standardize!(Xa, mu, sigma, obsdim=1) - @test abs(sum(mean(Xa, 1))) <= 10e-10 - - Xa = copy(e_X) - mu = vec(mean(Xa, 2)) - sigma = vec(std(Xa, 2)) - standardize!(Xa, mu, sigma, obsdim=2) - @test abs(sum(mean(Xa, 2))) <= 10e-10 -end - -@testset "DataFrame" begin - D = copy(df) - mu, sigma = standardize!(D) - @test abs(sum([mean(D[colname]) for colname in names(D)[1:2]])) <= 10e-10 - @test mean([std(D[colname]) for colname in names(D)[1:2]]) - 1 <= 10e-10 - - D = copy(df) - mu, sigma 
= standardize!(D, [:A, :B]) - @test abs(sum([mean(D[colname]) for colname in names(D)[1:2]])) <= 10e-10 - @test mean([std(D[colname]) for colname in names(D)[1:2]]) - 1 <= 10e-10 - - D = copy(df) - mu_check = [mean(D[colname]) for colname in names(D)[1:2]] - sigma_check = [std(D[colname]) for colname in names(D)[1:2]] - mu, sigma = standardize!(D, [:A, :B], mu_check, sigma_check) - @test abs(sum([mean(D[colname]) for colname in names(D)[1:2]])) <= 10e-10 - @test mean([std(D[colname]) for colname in names(D)[1:2]]) - 1 <= 10e-10 - - # skip columns that contain NA values - D = copy(df_na) - mu, sigma = standardize!(D, [:A, :B]) - @test isna(D[1, :A]) - @test all(D[2:end, :A] .== df_na[2:end, :A]) - @test abs(mean(D[:B])) < 10e-10 - @test abs(std(D[:B])) - 1 < 10e-10 - - D = copy(df_na) - mu_check = [mean(D[colname]) for colname in names(D)[1:2]] - sigma_check = [std(D[colname]) for colname in names(D)[1:2]] - mu, sigma = standardize!(D, [:A, :B], mu_check, sigma_check) - #= @test isna(D[1, :A]) =# - #= @test all(D[2:end, :A] .== df_na[2:end, :A]) =# - #= @test abs(mean(D[:B])) < 10e-10 =# - #= @test (abs(std(D[:B])) - 1) < 10e-10 =# -end From 08857395312bb2974b02763a0341d2637eebc447 Mon Sep 17 00:00:00 2001 From: abieler Date: Sun, 4 Jun 2017 09:32:59 +0200 Subject: [PATCH 09/18] Swap arguments for fit and fit_transform --- src/fixedrange.jl | 24 ++++++++++++------------ src/standardize.jl | 12 ++++++------ test/tst_fixedrangescaler.jl | 30 +++++++++++++++--------------- test/tst_standardize.jl | 16 ++++++++-------- 4 files changed, 41 insertions(+), 41 deletions(-) diff --git a/src/fixedrange.jl b/src/fixedrange.jl index 05e7a4f..01a64c3 100644 --- a/src/fixedrange.jl +++ b/src/fixedrange.jl @@ -321,65 +321,65 @@ function FixedRangeScaler(D::AbstractDataFrame, lower::Real, upper::Real, operat FixedRangeScaler(lower, upper, xmin, xmax, ObsDim.Constant{1}(), colnames) end -function StatsBase.fit{T<:Real,N}(X::AbstractArray{T,N}, ::Type{FixedRangeScaler}; 
obsdim=LearnBase.default_obsdim(X), operate_on=default_scaleselection(X, convert(ObsDimension, obsdim))) +function StatsBase.fit{T<:Real,N}(::Type{FixedRangeScaler}, X::AbstractArray{T,N}; obsdim=LearnBase.default_obsdim(X), operate_on=default_scaleselection(X, convert(ObsDimension, obsdim))) FixedRangeScaler(X, convert(ObsDimension, obsdim), operate_on) end -function fit_transform{T<:Real,N}(X::AbstractArray{T,N}, ::Type{FixedRangeScaler}; obsdim=LearnBase.default_obsdim(X), operate_on=default_scaleselection(X, convert(ObsDimension, obsdim))) +function fit_transform{T<:Real,N}(::Type{FixedRangeScaler}, X::AbstractArray{T,N}; obsdim=LearnBase.default_obsdim(X), operate_on=default_scaleselection(X, convert(ObsDimension, obsdim))) scaler = FixedRangeScaler(X, convert(ObsDimension, obsdim), operate_on) Xnew = transform(X, scaler) return Xnew, scaler end -function fit_transform!{T<:Real,N}(X::AbstractArray{T,N}, ::Type{FixedRangeScaler}; obsdim=LearnBase.default_obsdim(X), operate_on=default_scaleselection(X, convert(ObsDimension, obsdim))) +function fit_transform!{T<:Real,N}(::Type{FixedRangeScaler}, X::AbstractArray{T,N}; obsdim=LearnBase.default_obsdim(X), operate_on=default_scaleselection(X, convert(ObsDimension, obsdim))) scaler = FixedRangeScaler(X, convert(ObsDimension, obsdim), operate_on) transform!(X, scaler) return scaler end -function StatsBase.fit{T<:Real,N}(X::AbstractArray{T,N}, ::Type{FixedRangeScaler}, lower, upper; obsdim=LearnBase.default_obsdim(X), operate_on=default_scaleselection(X, convert(ObsDimension, obsdim))) +function StatsBase.fit{T<:Real,N}(::Type{FixedRangeScaler}, X::AbstractArray{T,N}, lower, upper; obsdim=LearnBase.default_obsdim(X), operate_on=default_scaleselection(X, convert(ObsDimension, obsdim))) FixedRangeScaler(X, lower, upper, convert(ObsDimension, obsdim), operate_on) end -function fit_transform{T<:Real,N}(X::AbstractArray{T,N}, ::Type{FixedRangeScaler}, lower, upper; obsdim=LearnBase.default_obsdim(X), 
operate_on=default_scaleselection(X, convert(ObsDimension, obsdim))) +function fit_transform{T<:Real,N}(::Type{FixedRangeScaler}, X::AbstractArray{T,N}, lower, upper; obsdim=LearnBase.default_obsdim(X), operate_on=default_scaleselection(X, convert(ObsDimension, obsdim))) scaler = FixedRangeScaler(X, lower, upper, convert(ObsDimension, obsdim), operate_on) Xnew = transform(X, scaler) return Xnew, scaler end -function fit_transform!{T<:Real,N}(X::AbstractArray{T,N}, ::Type{FixedRangeScaler}, lower, upper; obsdim=LearnBase.default_obsdim(X), operate_on=default_scaleselection(X, convert(ObsDimension, obsdim))) +function fit_transform!{T<:Real,N}(::Type{FixedRangeScaler}, X::AbstractArray{T,N}, lower, upper; obsdim=LearnBase.default_obsdim(X), operate_on=default_scaleselection(X, convert(ObsDimension, obsdim))) scaler = FixedRangeScaler(X, lower, upper, convert(ObsDimension, obsdim), operate_on) transform!(X, scaler) return scaler end -function StatsBase.fit(D::AbstractDataFrame, ::Type{FixedRangeScaler}; operate_on=default_scaleselection(D)) +function StatsBase.fit(::Type{FixedRangeScaler}, D::AbstractDataFrame; operate_on=default_scaleselection(D)) FixedRangeScaler(D, 0, 1, operate_on) end -function fit_transform(D::AbstractDataFrame, ::Type{FixedRangeScaler}; operate_on=default_scaleselection(D)) +function fit_transform(::Type{FixedRangeScaler}, D::AbstractDataFrame; operate_on=default_scaleselection(D)) scaler = FixedRangeScaler(D, 0, 1, operate_on) Dnew = transform(D, scaler) return Dnew, scaler end -function fit_transform!(D::AbstractDataFrame, ::Type{FixedRangeScaler}; operate_on=default_scaleselection(D)) +function fit_transform!(::Type{FixedRangeScaler}, D::AbstractDataFrame; operate_on=default_scaleselection(D)) scaler = FixedRangeScaler(D, 0, 1, operate_on) transform!(D, scaler) return scaler end -function StatsBase.fit(D::AbstractDataFrame, ::Type{FixedRangeScaler}, lower, upper; operate_on=default_scaleselection(D)) +function 
StatsBase.fit(::Type{FixedRangeScaler}, D::AbstractDataFrame, lower, upper; operate_on=default_scaleselection(D)) FixedRangeScaler(D, lower, upper, operate_on) end -function fit_transform(D::AbstractDataFrame, ::Type{FixedRangeScaler}, lower, upper; operate_on=default_scaleselection(D)) +function fit_transform(::Type{FixedRangeScaler}, D::AbstractDataFrame, lower, upper; operate_on=default_scaleselection(D)) scaler = FixedRangeScaler(D, lower, upper, operate_on) Dnew = transform(D, scaler) return Dnew, scaler end -function fit_transform!(D::AbstractDataFrame, ::Type{FixedRangeScaler}, lower, upper; operate_on=default_scaleselection(D)) +function fit_transform!(::Type{FixedRangeScaler}, D::AbstractDataFrame, lower, upper; operate_on=default_scaleselection(D)) scaler = FixedRangeScaler(D, lower, upper, operate_on) transform!(D, scaler) return scaler diff --git a/src/standardize.jl b/src/standardize.jl index 37eace5..a7a75c6 100644 --- a/src/standardize.jl +++ b/src/standardize.jl @@ -266,33 +266,33 @@ function StandardScaler(D::AbstractDataFrame, offset, scale; operate_on=default_ StandardScaler(offset, scale, ObsDim.Constant{1}(), colnames) end -function StatsBase.fit{T<:Real}(X::AbstractMatrix{T}, ::Type{StandardScaler}; obsdim=LearnBase.default_obsdim(X), operate_on=default_scaleselection(X, convert(ObsDimension, obsdim))) +function StatsBase.fit{T<:Real}(::Type{StandardScaler}, X::AbstractMatrix{T}; obsdim=LearnBase.default_obsdim(X), operate_on=default_scaleselection(X, convert(ObsDimension, obsdim))) StandardScaler(X, convert(ObsDimension, obsdim), operate_on) end -function fit_transform{T<:Real}(X::AbstractMatrix{T}, ::Type{StandardScaler}; obsdim=LearnBase.default_obsdim(X), operate_on=default_scaleselection(X, convert(ObsDimension, obsdim))) +function fit_transform{T<:Real}(::Type{StandardScaler}, X::AbstractMatrix{T}; obsdim=LearnBase.default_obsdim(X), operate_on=default_scaleselection(X, convert(ObsDimension, obsdim))) scaler = StandardScaler(X, 
convert(ObsDimension, obsdim), operate_on) Xnew = transform(X, scaler) return Xnew, scaler end -function fit_transform!{T<:Real}(X::AbstractMatrix{T}, ::Type{StandardScaler}; obsdim=LearnBase.default_obsdim(X), operate_on=default_scaleselection(X, convert(ObsDimension, obsdim))) +function fit_transform!{T<:Real}(::Type{StandardScaler}, X::AbstractMatrix{T}; obsdim=LearnBase.default_obsdim(X), operate_on=default_scaleselection(X, convert(ObsDimension, obsdim))) scaler = StandardScaler(X, convert(ObsDimension, obsdim), operate_on) transform!(X, scaler) return scaler end -function StatsBase.fit(D::AbstractDataFrame, ::Type{StandardScaler}; operate_on=default_scaleselection(D)) +function StatsBase.fit(::Type{StandardScaler}, D::AbstractDataFrame; operate_on=default_scaleselection(D)) StandardScaler(D, operate_on) end -function fit_transform(D::AbstractDataFrame, ::Type{StandardScaler}; operate_on=default_scaleselection(D)) +function fit_transform(::Type{StandardScaler}, D::AbstractDataFrame; operate_on=default_scaleselection(D)) scaler = StandardScaler(D, operate_on) Dnew = transform(D, scaler) return Dnew, scaler end -function fit_transform!(D::AbstractDataFrame, ::Type{StandardScaler}; operate_on=default_scaleselection(D)) +function fit_transform!(::Type{StandardScaler}, D::AbstractDataFrame; operate_on=default_scaleselection(D)) scaler = StandardScaler(D, operate_on) transform!(D, scaler) return scaler diff --git a/test/tst_fixedrangescaler.jl b/test/tst_fixedrangescaler.jl index 890f738..7c3b50a 100644 --- a/test/tst_fixedrangescaler.jl +++ b/test/tst_fixedrangescaler.jl @@ -5,52 +5,52 @@ D = DataFrame(A=rand(10), B=collect(1:10), C=[hex(x) for x in 11:20]) D_NA = deepcopy(D) D_NA[1, :A] = NA @testset "Array" begin - scaler = fit(X, FixedRangeScaler) + scaler = fit(FixedRangeScaler, X) XX = transform(X, scaler) @test mean(XX[:,end]) ≈ 1 @test mean(XX[:,1]) ≈ 0 @test maximum(XX) == 1 @test minimum(XX) == 0 - scaler = fit(X, FixedRangeScaler, obsdim=1) + scaler = 
fit(FixedRangeScaler, X, obsdim=1) XX = transform(X, scaler) @test mean(XX[1,:]) ≈ 0 @test mean(XX[end,:]) ≈ 1 @test maximum(XX) == 1 @test minimum(XX) == 0 - scaler = fit(X, FixedRangeScaler, -2, 2) + scaler = fit(FixedRangeScaler, X, -2, 2) XX = transform(X, scaler) @test mean(XX[:,end]) ≈ 2 @test mean(XX[:,1]) ≈ -2 @test maximum(XX) == 2 @test minimum(XX) == -2 - scaler = fit(X, FixedRangeScaler, -2, 2, obsdim=1) + scaler = fit(FixedRangeScaler, X, -2, 2, obsdim=1) XX = transform(X, scaler) @test mean(XX[1,:]) ≈ -2 @test mean(XX[end,:]) ≈ 2 @test maximum(XX) == 2 @test minimum(XX) == -2 - scaler = fit(X, FixedRangeScaler, -2, 2, obsdim=2) + scaler = fit(FixedRangeScaler, X, -2, 2, obsdim=2) XX = transform(X, scaler) @test mean(minimum(XX, 2)) ≈ -2 @test mean(maximum(XX, 2)) ≈ 2 @test maximum(XX) == 2 @test minimum(XX) == -2 - scaler = fit(X, FixedRangeScaler, -2, 2, obsdim=1, operate_on=[1,2]) + scaler = fit(FixedRangeScaler, X, -2, 2, obsdim=1, operate_on=[1,2]) XX = transform(X, scaler) @test mean(minimum(XX[:,[1,2]], 1)) ≈ -2 @test mean(maximum(XX[:,[1,2]], 1)) ≈ 2 - scaler = fit(X, FixedRangeScaler, -2, 2, obsdim=2, operate_on=[1,2]) + scaler = fit(FixedRangeScaler, X, -2, 2, obsdim=2, operate_on=[1,2]) XX = transform(X, scaler) @test mean(minimum(XX[[1,2],:], 2)) ≈ -2 @test mean(maximum(XX[[1,2],:], 2)) ≈ 2 - scaler = fit(X, FixedRangeScaler, -2, 2, obsdim=2, operate_on=[1,2]) + scaler = fit(FixedRangeScaler, X, -2, 2, obsdim=2, operate_on=[1,2]) XX = deepcopy(X) transform!(XX, scaler) @test mean(minimum(XX[[1,2],:], 2)) ≈ -2 @@ -58,40 +58,40 @@ D_NA[1, :A] = NA end @testset "DataFrame" begin - scaler = fit(D, FixedRangeScaler) + scaler = fit(FixedRangeScaler, D) DD = transform(D, scaler) @test minimum(DD[:A]) == 0 @test maximum(DD[:A]) == 1 - scaler = fit(D, FixedRangeScaler, -1, 1) + scaler = fit(FixedRangeScaler, D , -1, 1) DD = transform(D, scaler) @test minimum(DD[:A]) == -1 @test maximum(DD[:A]) == 1 @test minimum(DD[:B]) == -1 @test maximum(DD[:B]) 
== 1 - scaler = fit(D, FixedRangeScaler, -1, 1, operate_on=[:A]) + scaler = fit(FixedRangeScaler, D, -1, 1, operate_on=[:A]) DD = transform(D, scaler) @test minimum(DD[:A]) == -1 @test maximum(DD[:A]) == 1 @test minimum(DD[:B]) == minimum(D[:B]) @test maximum(DD[:B]) == maximum(D[:B]) - scaler = fit(D_NA, FixedRangeScaler, -1, 1) + scaler = fit(FixedRangeScaler, D, -1, 1) DD = transform(D_NA, scaler) @test isna(DD[1,:A]) @test DD[end,:A] == D_NA[end,:A] @test minimum(DD[:B]) == -1 @test maximum(DD[:B]) == 1 - scaler = fit(D_NA, FixedRangeScaler, -1, 1, operate_on=[:A, :B]) + scaler = fit(FixedRangeScaler, D, -1, 1, operate_on=[:A, :B]) DD = transform(D_NA, scaler) @test isna(DD[1,:A]) @test DD[end,:A] == D_NA[end,:A] @test minimum(DD[:B]) == -1 @test maximum(DD[:B]) == 1 - scaler = fit(D_NA, FixedRangeScaler, -1, 1, operate_on=[:A, :B, :C]) + scaler = fit(FixedRangeScaler, D, -1, 1, operate_on=[:A, :B, :C]) DD = transform(D_NA, scaler) @test isna(DD[1,:A]) @test DD[end,:A] == D_NA[end,:A] @@ -99,7 +99,7 @@ end @test maximum(DD[:B]) == 1 DD = deepcopy(D) - scaler = fit(DD, FixedRangeScaler, -1, 1, operate_on=[:A, :B]) + scaler = fit(FixedRangeScaler, D, -1, 1, operate_on=[:A, :B]) transform!(DD, scaler) @test minimum(DD[:A]) == -1 @test maximum(DD[:A]) == 1 diff --git a/test/tst_standardize.jl b/test/tst_standardize.jl index 9a701a3..44a99a3 100644 --- a/test/tst_standardize.jl +++ b/test/tst_standardize.jl @@ -72,23 +72,23 @@ D_NA[1, :A] = NA standardize!(XX, mu[flt], sigma[flt], obsdim=2, operate_on=flt) @test abs(sum(mean(XX[flt,:], 2))) <= 10e-10 - scaler = fit(X, StandardScaler) + scaler = fit(StandardScaler, X) XX = transform(X, scaler) @test abs(sum(mean(XX, 2))) <= 10e-10 @test std(XX, 2) ≈ ones(size(X, 1)) - scaler = fit(X, StandardScaler, obsdim=2) + scaler = fit(StandardScaler, X, obsdim=2) XX = transform(X, scaler) @test abs(sum(mean(XX, 2))) <= 10e-10 @test std(XX, 2) ≈ ones(size(X, 1)) - scaler = fit(X, StandardScaler, obsdim=1) + scaler = 
fit(StandardScaler, X, obsdim=1) XX = transform(X, scaler) @test abs(sum(mean(XX, 1))) <= 10e-10 @test vec(std(XX, 1)) ≈ ones(size(X, 2)) flt = [1,4] - scaler = fit(X, StandardScaler, obsdim=1, operate_on=flt) + scaler = fit(StandardScaler, X, obsdim=1, operate_on=flt) XX = transform(X, scaler) xx = transform(vec(X[1,:]), scaler) @test abs(sum(mean(XX[:,flt], 1))) <= 10e-10 @@ -98,7 +98,7 @@ D_NA[1, :A] = NA XX = deepcopy(X) xx = vec(X[1,:]) flt = [1,4] - scaler = fit(X, StandardScaler, obsdim=1, operate_on=flt) + scaler = fit(StandardScaler, X, obsdim=1, operate_on=flt) transform!(XX, scaler) transform!(xx, scaler) @test abs(sum(mean(XX[:,flt], 1))) <= 10e-10 @@ -130,7 +130,7 @@ end @test abs(mean(DD[:B])) < 10e-10 @test abs(std(DD[:B])) - 1 < 10e-10 - scaler = fit(D, StandardScaler) + scaler = fit(StandardScaler, D) DD = transform(D, scaler) @test mean(DD[:A]) <= 10e-10 @test std(DD[:A]) - 1 <= 10e-10 @@ -138,7 +138,7 @@ end @test std(DD[:B]) - 1 <= 10e-10 @test all(DD[:C] .== D[:C]) - scaler = fit(D, StandardScaler, operate_on=[:A, :C]) + scaler = fit(StandardScaler, D, operate_on=[:A, :C]) DD = transform(D, scaler) @test mean(DD[:A]) <= 10e-10 @test std(DD[:A]) - 1 <= 10e-10 @@ -147,7 +147,7 @@ end @test mean(D[:A]) != mean(DD[:A]) DD = deepcopy(D) - scaler = fit(DD, StandardScaler, operate_on=[:A, :C]) + scaler = fit(StandardScaler, DD, operate_on=[:A, :C]) transform!(DD, scaler) @test mean(DD[:A]) <= 10e-10 @test std(DD[:A]) - 1 <= 10e-10 From 9feac96393f8a2e1050607f48084c3a1586bfa4d Mon Sep 17 00:00:00 2001 From: abieler Date: Sun, 4 Jun 2017 10:16:40 +0200 Subject: [PATCH 10/18] Fix documentation for swapped arguments --- src/fixedrange.jl | 8 ++++---- src/standardize.jl | 8 ++++---- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/src/fixedrange.jl b/src/fixedrange.jl index 01a64c3..30cd53c 100644 --- a/src/fixedrange.jl +++ b/src/fixedrange.jl @@ -211,9 +211,9 @@ After fitting a `FixedRangeScaler` to one data set, it can be used to perform th 
transformation to a new set of data. E.g. fit the `FixedRangeScaler` to your training data and then apply the scaling to the test data at a later stage. (See examples below). - fit(X, FixedRangeScaler[, lower, upper; obsdim, operate_on]) + fit(FixedRangeScaler, X[, lower, upper; obsdim, operate_on]) - fit_transform(X, FixedRangeScaler[, lower, upper; obsdim, operate_on]) + fit_transform(FixedRangeScaler, X[, lower, upper; obsdim, operate_on]) `X` : Data of type Matrix or `DataFrame`. @@ -255,8 +255,8 @@ Examples: Xscaled = transform(Xtest, scaler) transform!(Xtest, scaler) - Xscaled, scaler = fit_transform(X, FixedRangeScaler, -1, 1, obsdim=1, operate_on=[1,2,4]) - scaler = fit_transform!(X, FixedRangeScaler, -1, 1, obsdim=1, operate_on=[1,2,4]) + Xscaled, scaler = fit_transform(FixedRangeScaler, X, -1, 1, obsdim=1, operate_on=[1,2,4]) + scaler = fit_transform!(FixedRangeScaler, X, -1, 1, obsdim=1, operate_on=[1,2,4]) diff --git a/src/standardize.jl b/src/standardize.jl index a7a75c6..80e6563 100644 --- a/src/standardize.jl +++ b/src/standardize.jl @@ -172,9 +172,9 @@ After fitting a `StandardScaler` to one data set, it can be used to perform the transformation to a new set of data. E.g. fit the `StandardScaler` to your training data and then apply the scaling to the test data at a later stage. (See examples below). - fit(X, StandardScaler[, μ, σ; obsdim, operate_on]) + fit(StandardScaler, X[, μ, σ; obsdim, operate_on]) - fit_transform(X, StandardScaler[, μ, σ; obsdim, operate_on]) + fit_transform(StandardScaler, X[, μ, σ; obsdim, operate_on]) `X` : Data of type Matrix or `DataFrame`. 
@@ -221,8 +221,8 @@ Examples: transform(Dtest, scaler) transform!(Dtest, scaler) - Xscaled, scaler = fit_transform(X, StandardScaler, obsdim=1, operate_on=[1,2,4]) - scaler = fit_transform!(X, StandardScaler, obsdim=1, operate_on=[1,2,4]) + Xscaled, scaler = fit_transform(StandardScaler, X, obsdim=1, operate_on=[1,2,4]) + scaler = fit_transform!(StandardScaler, X, obsdim=1, operate_on=[1,2,4]) Note that for `transform!` the data matrix `X` has to be of type <: AbstractFloat as the scaling occurs inplace. (E.g. cannot be of type Matrix{Int64}). This is not From c9a396aed47c81a66c5f9a5e9621708e9ba064a7 Mon Sep 17 00:00:00 2001 From: abieler Date: Wed, 14 Jun 2017 20:39:51 +0200 Subject: [PATCH 11/18] Remove encoding.jl --- src/MLPreprocessing.jl | 5 +---- src/encoding.jl | 26 -------------------------- 2 files changed, 1 insertion(+), 30 deletions(-) delete mode 100644 src/encoding.jl diff --git a/src/MLPreprocessing.jl b/src/MLPreprocessing.jl index 4ab16e5..798e5ee 100644 --- a/src/MLPreprocessing.jl +++ b/src/MLPreprocessing.jl @@ -22,9 +22,7 @@ export fit_transform, fit_transform!, transform, - transform!, - - onehot! + transform! 
include("scaleselection.jl") @@ -32,6 +30,5 @@ include("basis_expansion.jl") include("center.jl") include("standardize.jl") include("fixedrange.jl") -include("encoding.jl") end # module diff --git a/src/encoding.jl b/src/encoding.jl deleted file mode 100644 index 24e3208..0000000 --- a/src/encoding.jl +++ /dev/null @@ -1,26 +0,0 @@ -immutable OneHotEncoder{S} - operate_on::Vector{S} -end - -function onehot!(D::DataFrame, varname::Symbol) - for keyword in unique(D[:, varname]) - sym_keyword = Symbol(string(varname) * "_" * keyword) - D[sym_keyword] = zeros(Int, size(D, 1)) - for i in 1:size(D, 1) - if D[i, varname] == keyword - D[i, sym_keyword] = 1 - end - end - end -end - -function onehot!(D::DataFrame, operate_on::Vector{Symbol}) - for varname in operate_on - onehot!(D, varname) - end -end - -function onehot!(D::DataFrame; operate_on=default_categoricalselection(D)) - onehot!(D, operate_on) -end - From afa83241aab311f82674dd517f1b169507a55b56 Mon Sep 17 00:00:00 2001 From: abieler Date: Sat, 17 Jun 2017 11:08:07 +0200 Subject: [PATCH 12/18] Remove obsolete file --- src/featurenormalizer.jl | 33 --------------------------------- 1 file changed, 33 deletions(-) delete mode 100644 src/featurenormalizer.jl diff --git a/src/featurenormalizer.jl b/src/featurenormalizer.jl deleted file mode 100644 index d99f296..0000000 --- a/src/featurenormalizer.jl +++ /dev/null @@ -1,33 +0,0 @@ -immutable FeatureNormalizer - offset::Vector{Float64} - scale::Vector{Float64} - - function FeatureNormalizer(offset::Vector{Float64}, scale::Vector{Float64}) - @assert length(offset) == length(scale) - new(offset, scale) - end -end - -function FeatureNormalizer{T<:Real}(X::AbstractMatrix{T}) - FeatureNormalizer(vec(mean(X, 2)), vec(std(X, 2))) -end - -function StatsBase.fit{T<:Real}(::Type{FeatureNormalizer}, X::AbstractMatrix{T}) - FeatureNormalizer(X) -end - -function StatsBase.predict!{T<:Real}(cs::FeatureNormalizer, X::AbstractMatrix{T}) - @assert length(cs.offset) == size(X, 1) - 
rescale!(X, cs.offset, cs.scale) - X -end - -function StatsBase.predict{T<:AbstractFloat}(cs::FeatureNormalizer, X::AbstractMatrix{T}) - Xnew = copy(X) - StatsBase.predict!(cs, Xnew) -end - -function StatsBase.predict{T<:Real}(cs::FeatureNormalizer, X::AbstractMatrix{T}) - X = convert(AbstractMatrix{AbstractFloat}, X) - StatsBase.predict!(cs, X) -end From 9dd765188c122ca4ee8ef8323e2f588284c991e3 Mon Sep 17 00:00:00 2001 From: abieler Date: Sat, 17 Jun 2017 11:35:48 +0200 Subject: [PATCH 13/18] Start README --- README.md | 69 +++++++++++++++++++++++++++++++++++++++++++++- src/standardize.jl | 2 +- 2 files changed, 69 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index a6d25ff..543207d 100644 --- a/README.md +++ b/README.md @@ -6,4 +6,71 @@ ## Overview -UNDER CONSTRUCTION +Utility package that provides end user friendly methods for feature scalings and polynomial +basis expansion. + +### Standardization +Standardization of data sets result in variables with a mean of 0 and variance of 1. +A common use case would be to fit a `StandardScaler` to the training data and later +apply the same transformation to the test data. `StandardScaler` is used with the +functions `fit()`, `transform()` and `fit_transform()` as shown below. + +```julia + + fit(StandardScaler, X[, μ, σ; obsdim, operate_on]) + + fit_transform(StandardScaler, X[, μ, σ; obsdim, operate_on]) +``` + +`X` : Data of type Matrix or `DataFrame`. + +`μ` : Vector or scalar describing the translation. + Defaults to mean(X, obsdim) + +`σ` : Vector or scalar describing the scale. + Defaults to std(X, obsdim) + +`obsdim` : Specify which axis corresponds to observations. + Defaults to obsdim=2 (observations are columns of matrix) + For DataFrames `obsdim` is obsolete and rescaling occurs + column wise. + +`operate_on`: Specify the indices of columns or rows to be centered. + Defaults to all columns/rows. + For DataFrames this must be a vector of symbols, not indices. + E.g. 
`operate_on`=[1,3] will perform centering on columns + with index 1 and 3 only (if obsdim=1, else rows 1 and 3) + +Note on DataFrames: +Columns containing `NA` values are skipped. +Columns containing non numeric elements are skipped. + +Examples: + + + Xtrain = rand(100, 4) + Xtest = rand(10, 4) + x = rand(4) + Dtrain = DataFrame(A=rand(10), B=collect(1:10), C=[string(x) for x in 1:10]) + Dtest = DataFrame(A=rand(10), B=collect(1:10), C=[string(x) for x in 1:10]) + + scaler = fit(StandardScaler, Xtrain) + scaler = fit(StandardScaler, Xtrain, obsdim=1) + scaler = fit(StandardScaler, Xtrain, obsdim=1, operate_on=[1,3]) + transform(Xtest, scaler) + transform!(Xtest, scaler) + transform(x, scaler) + transform!(x, scaler) + + scaler = fit(StandardScaler, Dtrain) + scaler = fit(StandardScaler, Dtrain, operate_on=[:A,:B]) + transform(Dtest, scaler) + transform!(Dtest, scaler) + + Xscaled, scaler = fit_transform(StandardScaler, X, obsdim=1, operate_on=[1,2,4]) + scaler = fit_transform!(StandardScaler, X, obsdim=1, operate_on=[1,2,4]) + +Note that for `transform!` the data matrix `X` has to be of type <: AbstractFloat +as the scaling occurs inplace. (E.g. cannot be of type Matrix{Int64}). This is not +the case for `transform` however. +For `DataFrames` `transform!` can be used on columns of type <: Integer. diff --git a/src/standardize.jl b/src/standardize.jl index 80e6563..ed52e60 100644 --- a/src/standardize.jl +++ b/src/standardize.jl @@ -5,7 +5,7 @@ or μ, σ = standardize!(D[, μ, σ; operate_on]) -Normalize `X` along `obsdim` according to X = (X - μ) / σ. +Standardize `X` along `obsdim` according to X = (X - μ) / σ. 
If μ and σ are omitted they are computed such that variables have a mean of zero From fd39887f1abb70e3ccb1d5c7ca1fde72c52d471e Mon Sep 17 00:00:00 2001 From: abieler Date: Tue, 20 Jun 2017 22:49:19 +0200 Subject: [PATCH 14/18] Add README --- README.md | 113 +++++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 111 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 543207d..8200206 100644 --- a/README.md +++ b/README.md @@ -9,7 +9,7 @@ Utility package that provides end user friendly methods for feature scalings and polynomial basis expansion. -### Standardization +### StandardScaler Standardization of data sets result in variables with a mean of 0 and variance of 1. A common use case would be to fit a `StandardScaler` to the training data and later apply the same transformation to the test data. `StandardScaler` is used with the @@ -47,7 +47,7 @@ Columns containing non numeric elements are skipped. Examples: - +```julia Xtrain = rand(100, 4) Xtest = rand(10, 4) x = rand(4) @@ -69,8 +69,117 @@ Examples: Xscaled, scaler = fit_transform(StandardScaler, X, obsdim=1, operate_on=[1,2,4]) scaler = fit_transform!(StandardScaler, X, obsdim=1, operate_on=[1,2,4]) +``` Note that for `transform!` the data matrix `X` has to be of type <: AbstractFloat as the scaling occurs inplace. (E.g. cannot be of type Matrix{Int64}). This is not the case for `transform` however. For `DataFrames` `transform!` can be used on columns of type <: Integer. + + +### FixedRangeScaler +`FixedRangeScaler` is used with the functions `fit()`, `transform()` and `fit_transform()` +to scale data in a Matrix `X` or DataFrame to a fixed range [lower:upper]. +After fitting a `FixedRangeScaler` to one data set, it can be used to perform the same +transformation to a new set of data. E.g. fit the `FixedRangeScaler` to your training +data and then apply the scaling to the test data at a later stage. (See examples below). 
+ +```julia + fit(FixedRangeScaler, X[, lower, upper; obsdim, operate_on]) + + fit_transform(FixedRangeScaler, X[, lower, upper; obsdim, operate_on]) +``` + +`X` : Data of type Matrix or `DataFrame`. + +`lower` : (Scalar) Lower limit of new range. + Defaults to 0. + +`upper` : (Scalar) Upper limit of new range. + Defaults to 1. + +`obsdim` : Specify which axis corresponds to observations. + Defaults to obsdim=2 (observations are columns of matrix) + For DataFrames `obsdim` is obsolete and rescaling occurs + column wise. + +`operate_on`: Specify the indices of columns or rows to be rescaled. + Defaults to all columns/rows. + For DataFrames this must be a vector of symbols, not indices. + E.g. `operate_on`=[1,3] will perform rescaling on columns + with index 1 and 3 only (if obsdim=1, else rows 1 and 3) + +Note on DataFrames: +Columns containing `NA` values are skipped. +Columns containing non numeric elements are skipped. + +Examples: + +```julia + Xtrain = rand(100, 4) + Xtest = rand(10, 4) + x = rand(10) + D = DataFrame(A=rand(10), B=collect(1:10), C=[string(x) for x in 1:10]) + + scaler = fit(FixedRangeScaler, Xtrain) + scaler = fit(FixedRangeScaler, Xtrain, -1, 1) + scaler = fit(FixedRangeScaler, Xtrain, -1, 1, obsdim=1) + scaler = fit(FixedRangeScaler, Xtrain, -1, 1, obsdim=1, operate_on=[1,3]) + scaler = fit(FixedRangeScaler, D, -1, 1, operate_on=[:A,:B]) + + Xscaled = transform(Xtest, scaler) + transform!(Xtest, scaler) + + Xscaled, scaler = fit_transform(FixedRangeScaler, X, -1, 1, obsdim=1, operate_on=[1,2,4]) + scaler = fit_transform!(FixedRangeScaler, X, -1, 1, obsdim=1, operate_on=[1,2,4]) +``` + +### Lower Level Functions +The lower level functions on which `StandardScaler` and `FixedRangeScaler` are built can also +be used separately. + +#### center!() +```julia + μ = center!(X[, μ; obsdim, operate_on]) +``` +Shift `X` along `obsdim` by `μ` according to X = X - μ +where `X` is of type Matrix or Vector and `D` of type DataFrame.
+ +#### fixedrange!() +```julia + lower, upper, xmin, xmax = fixedrange!(X[, lower, upper, xmin, xmax; obsdim, operate_on]) +``` +Normalize `X` or `D` along `obsdim` to the interval [lower:upper] +where `X` is of type Matrix or Vector and `D` of type DataFrame. +If `lower` and `upper` are omitted the default range is [0:1]. + +#### standardize!() +```julia + μ, σ = standardize!(X[, μ, σ; obsdim, operate_on]) +``` +Standardize `X` along `obsdim` according to X = (X - μ) / σ. +If μ and σ are omitted they are computed such that variables have a mean of zero. + +### Polynomial Basis Expansion +```julia + M = expand_poly(x, [degree = 5], obsdim]) +``` +Perform a polynomial basis expansion of the given `degree` for the vector `x`. + +```julia +julia> expand_poly(1:5, degree = 3) +3×5 Array{Float64,2}: + 1.0 2.0 3.0 4.0 5.0 + 1.0 4.0 9.0 16.0 25.0 + 1.0 8.0 27.0 64.0 125.0 + +julia> expand_poly(1:5, degree = 3, obsdim = 1) +5×3 Array{Float64,2}: + 1.0 1.0 1.0 + 2.0 4.0 8.0 + 3.0 9.0 27.0 + 4.0 16.0 64.0 + 5.0 25.0 125.0 + +julia> expand_poly(1:5, 3, ObsDim.First()); # same but type-stable +``` From 9b941f512f95c55a6fc4a00bf8b2a819516f9535 Mon Sep 17 00:00:00 2001 From: abieler Date: Tue, 20 Jun 2017 22:52:31 +0200 Subject: [PATCH 15/18] remove obsolete test file --- test/tst_featurenormalizer.jl | 13 ------------- 1 file changed, 13 deletions(-) delete mode 100644 test/tst_featurenormalizer.jl diff --git a/test/tst_featurenormalizer.jl b/test/tst_featurenormalizer.jl deleted file mode 100644 index 05906f2..0000000 --- a/test/tst_featurenormalizer.jl +++ /dev/null @@ -1,13 +0,0 @@ -@testset "Test FeatureNormalizer model" begin - e_x = collect(-5:.1:5) - e_X = [e_x e_x.^2 e_x.^3]' - - cs = fit(FeatureNormalizer, e_X) - @test vec(mean(e_X, 2)) ≈ cs.offset - @test vec(std(e_X, 2)) ≈ cs.scale - - Xa = predict(cs, e_X) - @test Xa != e_X - @test abs(sum(mean(Xa, 2))) <= 10e-10 - @test std(Xa, 2) ≈ [1, 1, 1] -end From a074b0d242c35a81370e65a65dc9eeed2b83fd56 Mon Sep 17 00:00:00 
2001 From: abieler Date: Tue, 20 Jun 2017 23:11:14 +0200 Subject: [PATCH 16/18] Remove depwarns for v0.6 --- src/center.jl | 4 ++-- src/fixedrange.jl | 4 ++-- src/scaleselection.jl | 6 +++--- src/standardize.jl | 4 ++-- test/tst_center.jl | 38 ++++++++++++++++++------------------ test/tst_fixedrangescaler.jl | 6 +++--- test/tst_standardize.jl | 2 +- 7 files changed, 32 insertions(+), 32 deletions(-) diff --git a/src/center.jl b/src/center.jl index aae9a0a..4f4cbbe 100644 --- a/src/center.jl +++ b/src/center.jl @@ -145,7 +145,7 @@ function center!(D::AbstractDataFrame, operate_on::AbstractVector{Symbol}) for colname in operate_on if eltype(D[colname]) <: Real μ = mean(D[colname]) - if isna(μ) + if isna.(μ) warn("Skipping \"$colname\" because it contains NA values") continue end @@ -174,7 +174,7 @@ function center!(D::AbstractDataFrame, μ::AbstractVector, operate_on::AbstractV end function center!(D::AbstractDataFrame, μ::Real, colname::Symbol) - if sum(isna(D[colname])) > 0 + if sum(isna.(D[colname])) > 0 warn("Skipping \"$colname\" because it contains NA values") else newcol::Vector{Float64} = convert(Vector{Float64}, D[colname]) diff --git a/src/fixedrange.jl b/src/fixedrange.jl index 30cd53c..dba5a5f 100644 --- a/src/fixedrange.jl +++ b/src/fixedrange.jl @@ -166,7 +166,7 @@ function fixedrange!(D::AbstractDataFrame, lower::Real, upper::Real, operate_on: if eltype(D[colname]) <: Real minval = minimum(D[colname]) maxval = maximum(D[colname]) - if isna(minval) + if isna.(minval) warn("Skipping \"$colname\" because it contains NA values") continue end @@ -193,7 +193,7 @@ function fixedrange!(D::AbstractDataFrame, lower::Real, upper::Real, xmin::Abstr end function fixedrange!(D::AbstractDataFrame, lower::Real, upper::Real, xmin::Real, xmax::Real, colname::Symbol) - if any(isna(D[colname])) | !(eltype(D[colname]) <: Real) + if any(isna.(D[colname])) | !(eltype(D[colname]) <: Real) warn("Skipping \"$colname\" because it contains NA values or is not of type <: Real") 
else newcol::Vector{Float64} = convert(Vector{Float64}, D[colname]) diff --git a/src/scaleselection.jl b/src/scaleselection.jl index 13700ff..19f4f85 100644 --- a/src/scaleselection.jl +++ b/src/scaleselection.jl @@ -33,7 +33,7 @@ end function valid_columns(D::AbstractDataFrame) valid_colnames = Symbol[] for colname in names(D) - if (eltype(D[colname]) <: Real) & !(any(isna(D[colname]))) + if (eltype(D[colname]) <: Real) & !(any(isna.(D[colname]))) push!(valid_colnames, colname) else warn("Skipping \"$colname\" because it either contains NA or is not of type <: Real") @@ -45,7 +45,7 @@ end function valid_columns(D::AbstractDataFrame, colnames) valid_colnames = Symbol[] for colname in colnames - if (eltype(D[colname]) <: Real) & !(any(isna(D[colname]))) + if (eltype(D[colname]) <: Real) & !(any(isna.(D[colname]))) push!(valid_colnames, colname) else warn("Skipping \"$colname\" because it either contains NA or is not of type <: Real") @@ -58,7 +58,7 @@ function valid_columns_categorical(D::AbstractDataFrame) valid_colnames = Symbol[] for colname in names(D) if !(eltype(D[colname]) <: Real) - if !(any(isna(D[colname]))) + if !(any(isna.(D[colname]))) push!(valid_colnames, colname) else warn("Skipping \"$colname\" because it contains NA") diff --git a/src/standardize.jl b/src/standardize.jl index ed52e60..ecac3ab 100644 --- a/src/standardize.jl +++ b/src/standardize.jl @@ -126,7 +126,7 @@ function standardize!(D::AbstractDataFrame, colnames::AbstractVector{Symbol}) if eltype(D[colname]) <: Real μ = mean(D[colname]) σ = std(D[colname]) - if isna(μ) + if isna.(μ) warn("Skipping \"$colname\" because it contains NA values") continue end @@ -152,7 +152,7 @@ function standardize!(D::AbstractDataFrame, μ::AbstractVector, σ::AbstractVect end function standardize!(D::AbstractDataFrame, μ::Real, σ::Real, colname::Symbol) - if any(isna(D[colname])) | !(eltype(D[colname]) <: Real) + if any(isna.(D[colname])) | !(eltype(D[colname]) <: Real) warn("Skipping \"$colname\" because it 
contains NA values or is not of type <: Real") else newcol::Vector{Float64} = convert(Vector{Float64}, D[colname]) diff --git a/test/tst_center.jl b/test/tst_center.jl index e184202..143e76b 100644 --- a/test/tst_center.jl +++ b/test/tst_center.jl @@ -8,63 +8,63 @@ D_NA[1, :A] = NA @testset "Array" begin XX = deepcopy(X) mu = center!(XX, obsdim=1) - @test sum(abs(mean(XX, 1))) == 0 + @test sum(abs.(mean(XX, 1))) == 0 @test all(std(XX, 1) .== std(X, 1)) @test all(mu .== vec(mean(X, 1))) XX = deepcopy(X) mu = center!(XX, ObsDim.First()) - @test sum(abs(mean(XX, 1))) == 0 + @test sum(abs.(mean(XX, 1))) == 0 @test all(std(XX, 1) .== std(X, 1)) @test all(mu .== vec(mean(X, 1))) XX = deepcopy(X) mu = center!(XX, ObsDim.Last()) - @test sum(abs(mean(XX, 2))) == 0 + @test sum(abs.(mean(XX, 2))) == 0 @test all(std(XX, 2) .== std(X, 2)) @test all(mu .== vec(mean(X, 2))) XX = deepcopy(X) mu = center!(XX) - @test sum(abs(mean(XX, 2))) == 0 + @test sum(abs.(mean(XX, 2))) == 0 @test all(std(XX, 2) .== std(X, 2)) @test all(mu .== vec(mean(X, 2))) XX = deepcopy(X) mu = vec(mean(X, 1)) center!(XX, mu, obsdim=1) - @test sum(abs(mean(XX, 1))) == 0 + @test sum(abs.(mean(XX, 1))) == 0 @test all(std(XX, 1) .== std(X, 1)) XX = deepcopy(X) mu = vec(mean(X, 1)) center!(XX, mu, ObsDim.First()) - @test sum(abs(mean(XX, 1))) == 0 + @test sum(abs.(mean(XX, 1))) == 0 @test all(std(XX, 1) .== std(X, 1)) XX = deepcopy(X) mu = vec(mean(XX, 2)) center!(XX, mu, obsdim=2) - @test sum(abs(mean(XX, 2))) == 0 + @test sum(abs.(mean(XX, 2))) == 0 @test all(std(XX, 2) .== std(X, 2)) XX = deepcopy(X) mu = vec(mean(XX, 2)) center!(XX, mu, ObsDim.Last()) - @test sum(abs(mean(XX, 2))) == 0 + @test sum(abs.(mean(XX, 2))) == 0 @test all(std(XX, 2) .== std(X, 2)) XX = deepcopy(X) mu = vec(mean(X[:,[1,3]], 1)) center!(XX, mu, obsdim=1, operate_on=[1, 3]) - @test sum(abs(mean(XX[:,[1,3]], 1))) == 0 + @test sum(abs.(mean(XX[:,[1,3]], 1))) == 0 @test all(XX[:,2] .== X[:,2]) @test all(std(XX, 1) .== std(X, 1)) XX = 
deepcopy(X) mu = vec(mean(X[[1,3],:], 2)) center!(XX, mu, obsdim=2, operate_on=[1, 3]) - @test sum(abs(mean(XX[[1,3],:], 2))) == 0 + @test sum(abs.(mean(XX[[1,3],:], 2))) == 0 @test all(XX[2,:] .== X[2,:]) @test all(std(XX, 2) .== std(X, 2)) println() @@ -93,34 +93,34 @@ end # Center DataFrame DD = deepcopy(D) center!(DD) - @test abs(mean(DD[:A])) <= 10e-10 - @test abs(mean(DD[:B])) <= 10e-10 + @test abs.(mean(DD[:A])) <= 10e-10 + @test abs.(mean(DD[:B])) <= 10e-10 @test all(DD[:C] .== D[:C]) DD = deepcopy(D) center!(DD, operate_on=[:B]) @test all(DD[:A] .== D[:A]) - @test abs(mean(DD[:B])) <= 10e-10 + @test abs.(mean(DD[:B])) <= 10e-10 @test all(DD[:C] .== D[:C]) DD = deepcopy(D) mu = center!(DD, operate_on=[:A, :B]) - @test abs(mean(DD[:A])) <= 10e-10 - @test abs(mean(DD[:B])) <= 10e-10 + @test abs.(mean(DD[:A])) <= 10e-10 + @test abs.(mean(DD[:B])) <= 10e-10 @test all(DD[:C] .== D[:C]) @test all(mu .== [mean(D[:A]), mean(D[:B])]) DD = deepcopy(D) mu = [mean(D[:A]), mean(D[:B])] @test all(center!(DD, mu, operate_on=[:A, :B]) .== mu) - @test abs(mean(DD[:A])) <= 10e-10 - @test abs(mean(DD[:B])) <= 10e-10 + @test abs.(mean(DD[:A])) <= 10e-10 + @test abs.(mean(DD[:B])) <= 10e-10 @test all(DD[:C] .== D[:C]) DD = deepcopy(D_NA) center!(DD) @test all(DD[2:end, :A] .== D[2:end, :A]) - @test abs(mean(DD[:B])) <= 10e-10 + @test abs.(mean(DD[:B])) <= 10e-10 @test all(DD[:C] .== D[:C]) - @test isna(DD[1, :A]) + @test isna.(DD[1, :A]) end diff --git a/test/tst_fixedrangescaler.jl b/test/tst_fixedrangescaler.jl index 7c3b50a..7fc7ece 100644 --- a/test/tst_fixedrangescaler.jl +++ b/test/tst_fixedrangescaler.jl @@ -79,21 +79,21 @@ end scaler = fit(FixedRangeScaler, D, -1, 1) DD = transform(D_NA, scaler) - @test isna(DD[1,:A]) + @test isna.(DD[1,:A]) @test DD[end,:A] == D_NA[end,:A] @test minimum(DD[:B]) == -1 @test maximum(DD[:B]) == 1 scaler = fit(FixedRangeScaler, D, -1, 1, operate_on=[:A, :B]) DD = transform(D_NA, scaler) - @test isna(DD[1,:A]) + @test isna.(DD[1,:A]) @test 
DD[end,:A] == D_NA[end,:A] @test minimum(DD[:B]) == -1 @test maximum(DD[:B]) == 1 scaler = fit(FixedRangeScaler, D, -1, 1, operate_on=[:A, :B, :C]) DD = transform(D_NA, scaler) - @test isna(DD[1,:A]) + @test isna.(DD[1,:A]) @test DD[end,:A] == D_NA[end,:A] @test minimum(DD[:B]) == -1 @test maximum(DD[:B]) == 1 diff --git a/test/tst_standardize.jl b/test/tst_standardize.jl index 44a99a3..2ac3fc0 100644 --- a/test/tst_standardize.jl +++ b/test/tst_standardize.jl @@ -125,7 +125,7 @@ end # skip columns that contain NA values DD = deepcopy(D_NA) mu, sigma = standardize!(DD) - @test isna(DD[1, :A]) + @test isna.(DD[1, :A]) @test all(DD[2:end, :A] .== D_NA[2:end, :A]) @test abs(mean(DD[:B])) < 10e-10 @test abs(std(DD[:B])) - 1 < 10e-10 From 47891878dcf64dcb591793d1772993c6ebbd637e Mon Sep 17 00:00:00 2001 From: abieler Date: Fri, 23 Jun 2017 21:44:04 +0200 Subject: [PATCH 17/18] Undo isna related depwarns --- src/center.jl | 4 ++-- src/fixedrange.jl | 4 ++-- src/scaleselection.jl | 6 +++--- src/standardize.jl | 4 ++-- test/tst_center.jl | 2 +- test/tst_fixedrangescaler.jl | 6 +++--- test/tst_standardize.jl | 2 +- 7 files changed, 14 insertions(+), 14 deletions(-) diff --git a/src/center.jl b/src/center.jl index 4f4cbbe..ec9a41b 100644 --- a/src/center.jl +++ b/src/center.jl @@ -145,7 +145,7 @@ function center!(D::AbstractDataFrame, operate_on::AbstractVector{Symbol}) for colname in operate_on if eltype(D[colname]) <: Real μ = mean(D[colname]) - if isna.(μ) + if isna(μ) warn("Skipping \"$colname\" because it contains NA values") continue end @@ -174,7 +174,7 @@ function center!(D::AbstractDataFrame, μ::AbstractVector, operate_on::AbstractV end function center!(D::AbstractDataFrame, μ::Real, colname::Symbol) - if sum(isna.(D[colname])) > 0 + if sum([isna(value) for value in D[colname]]) > 0 warn("Skipping \"$colname\" because it contains NA values") else newcol::Vector{Float64} = convert(Vector{Float64}, D[colname]) diff --git a/src/fixedrange.jl b/src/fixedrange.jl index 
dba5a5f..9a3f112 100644 --- a/src/fixedrange.jl +++ b/src/fixedrange.jl @@ -166,7 +166,7 @@ function fixedrange!(D::AbstractDataFrame, lower::Real, upper::Real, operate_on: if eltype(D[colname]) <: Real minval = minimum(D[colname]) maxval = maximum(D[colname]) - if isna.(minval) + if isna(minval) warn("Skipping \"$colname\" because it contains NA values") continue end @@ -193,7 +193,7 @@ function fixedrange!(D::AbstractDataFrame, lower::Real, upper::Real, xmin::Abstr end function fixedrange!(D::AbstractDataFrame, lower::Real, upper::Real, xmin::Real, xmax::Real, colname::Symbol) - if any(isna.(D[colname])) | !(eltype(D[colname]) <: Real) + if any(isna, D[colname]) | !(eltype(D[colname]) <: Real) warn("Skipping \"$colname\" because it contains NA values or is not of type <: Real") else newcol::Vector{Float64} = convert(Vector{Float64}, D[colname]) diff --git a/src/scaleselection.jl b/src/scaleselection.jl index 19f4f85..8544d7d 100644 --- a/src/scaleselection.jl +++ b/src/scaleselection.jl @@ -33,7 +33,7 @@ end function valid_columns(D::AbstractDataFrame) valid_colnames = Symbol[] for colname in names(D) - if (eltype(D[colname]) <: Real) & !(any(isna.(D[colname]))) + if (eltype(D[colname]) <: Real) & !any(isna, D[colname]) push!(valid_colnames, colname) else warn("Skipping \"$colname\" because it either contains NA or is not of type <: Real") @@ -45,7 +45,7 @@ end function valid_columns(D::AbstractDataFrame, colnames) valid_colnames = Symbol[] for colname in colnames - if (eltype(D[colname]) <: Real) & !(any(isna.(D[colname]))) + if (eltype(D[colname]) <: Real) & !(any(isna, D[colname])) push!(valid_colnames, colname) else warn("Skipping \"$colname\" because it either contains NA or is not of type <: Real") @@ -58,7 +58,7 @@ function valid_columns_categorical(D::AbstractDataFrame) valid_colnames = Symbol[] for colname in names(D) if !(eltype(D[colname]) <: Real) - if !(any(isna.(D[colname]))) + if !(any(isna, D[colname])) push!(valid_colnames, colname) else 
warn("Skipping \"$colname\" because it contains NA") diff --git a/src/standardize.jl b/src/standardize.jl index ecac3ab..381df40 100644 --- a/src/standardize.jl +++ b/src/standardize.jl @@ -126,7 +126,7 @@ function standardize!(D::AbstractDataFrame, colnames::AbstractVector{Symbol}) if eltype(D[colname]) <: Real μ = mean(D[colname]) σ = std(D[colname]) - if isna.(μ) + if isna(μ) warn("Skipping \"$colname\" because it contains NA values") continue end @@ -152,7 +152,7 @@ function standardize!(D::AbstractDataFrame, μ::AbstractVector, σ::AbstractVect end function standardize!(D::AbstractDataFrame, μ::Real, σ::Real, colname::Symbol) - if any(isna.(D[colname])) | !(eltype(D[colname]) <: Real) + if any(isna, D[colname]) | !(eltype(D[colname]) <: Real) warn("Skipping \"$colname\" because it contains NA values or is not of type <: Real") else newcol::Vector{Float64} = convert(Vector{Float64}, D[colname]) diff --git a/test/tst_center.jl b/test/tst_center.jl index 143e76b..922eb53 100644 --- a/test/tst_center.jl +++ b/test/tst_center.jl @@ -122,5 +122,5 @@ end @test all(DD[2:end, :A] .== D[2:end, :A]) @test abs.(mean(DD[:B])) <= 10e-10 @test all(DD[:C] .== D[:C]) - @test isna.(DD[1, :A]) + @test isna(DD[1, :A]) end diff --git a/test/tst_fixedrangescaler.jl b/test/tst_fixedrangescaler.jl index 7fc7ece..7c3b50a 100644 --- a/test/tst_fixedrangescaler.jl +++ b/test/tst_fixedrangescaler.jl @@ -79,21 +79,21 @@ end scaler = fit(FixedRangeScaler, D, -1, 1) DD = transform(D_NA, scaler) - @test isna.(DD[1,:A]) + @test isna(DD[1,:A]) @test DD[end,:A] == D_NA[end,:A] @test minimum(DD[:B]) == -1 @test maximum(DD[:B]) == 1 scaler = fit(FixedRangeScaler, D, -1, 1, operate_on=[:A, :B]) DD = transform(D_NA, scaler) - @test isna.(DD[1,:A]) + @test isna(DD[1,:A]) @test DD[end,:A] == D_NA[end,:A] @test minimum(DD[:B]) == -1 @test maximum(DD[:B]) == 1 scaler = fit(FixedRangeScaler, D, -1, 1, operate_on=[:A, :B, :C]) DD = transform(D_NA, scaler) - @test isna.(DD[1,:A]) + @test isna(DD[1,:A]) 
@test DD[end,:A] == D_NA[end,:A] @test minimum(DD[:B]) == -1 @test maximum(DD[:B]) == 1 diff --git a/test/tst_standardize.jl b/test/tst_standardize.jl index 2ac3fc0..44a99a3 100644 --- a/test/tst_standardize.jl +++ b/test/tst_standardize.jl @@ -125,7 +125,7 @@ end # skip columns that contain NA values DD = deepcopy(D_NA) mu, sigma = standardize!(DD) - @test isna.(DD[1, :A]) + @test isna(DD[1, :A]) @test all(DD[2:end, :A] .== D_NA[2:end, :A]) @test abs(mean(DD[:B])) < 10e-10 @test abs(std(DD[:B])) - 1 < 10e-10 From 686a308b1f5f40486f016426565733c8cc18789c Mon Sep 17 00:00:00 2001 From: abieler Date: Fri, 23 Jun 2017 21:53:44 +0200 Subject: [PATCH 18/18] Fix doc typo --- README.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 8200206..f9978eb 100644 --- a/README.md +++ b/README.md @@ -162,18 +162,18 @@ If μ and σ are omitted they are computed such that variables have a mean of ze ### Polynomial Basis Expansion ```julia - M = expand_poly(x, [degree = 5], obsdim]) + M = expand_poly(x[, degree=5, obsdim]) ``` Perform a polynomial basis expansion of the given `degree` for the vector `x`. ```julia -julia> expand_poly(1:5, degree = 3) +julia> expand_poly(1:5, degree=3) 3×5 Array{Float64,2}: 1.0 2.0 3.0 4.0 5.0 1.0 4.0 9.0 16.0 25.0 1.0 8.0 27.0 64.0 125.0 -julia> expand_poly(1:5, degree = 3, obsdim = 1) +julia> expand_poly(1:5, degree=3, obsdim=1) 5×3 Array{Float64,2}: 1.0 1.0 1.0 2.0 4.0 8.0