implement rescale! and center! for DataFrame (#33)

* Convert data arrays before rescaling * Add tests and skip columns with NAs
JuliaML · Apr 27, 2017 · f652ede · f652ede
1 parent ee2374e
commit f652ede
Show file tree

Hide file tree

Showing 3 changed files with 233 additions and 2 deletions.
diff --git a/src/MLDataUtils.jl b/src/MLDataUtils.jl
@@ -5,6 +5,7 @@ using StatsBase
 using LearnBase
 using MLLabelUtils
 using MLDataPattern
+using DataFrames
 
 using LearnBase: ObsDimension, obs_dim
 import LearnBase: nobs, getobs, getobs!, datasubset, default_obsdim

diff --git a/src/feature_scaling.jl b/src/feature_scaling.jl
@@ -1,9 +1,30 @@
 """
     μ = center!(X[, μ, obsdim])
+    
+or
+
+    μ = center!(D[, colnames, μ])
+
+where `X` is of type Matrix or Vector and `D` of type DataFrame.
 
 Center `X` along `obsdim` around the corresponding entry in the
 vector `μ`. If `μ` is not specified then it defaults to the
 feature specific means.
+
+For DataFrames, `obsdim` does not apply and centering is done column-wise.
+Instead, the vector `colnames` specifies which columns to center.
+If `colnames` is not provided, all columns of type T<:Real are centered.
+
+Example:
+
+    X = rand(4, 100)
+    D = DataFrame(A=rand(10), B=collect(1:10), C=[string(x) for x in 1:10])
+
+    μ = center!(X, obsdim=2)
+    μ = center!(X, ObsDim.First())
+    μ = center!(D)
+    μ = center!(D, [:A, :B])
+
 """
 function center!(X, μ; obsdim=LearnBase.default_obsdim(X))
     center!(X, μ, convert(ObsDimension, obsdim))
@@ -68,13 +89,88 @@ function center!(X::AbstractMatrix, μ::AbstractVector, ::ObsDim.Constant{2})
     μ
 end
 
+# Center every column of `D` whose element type is a subtype of `Real`
+# around its own mean and return the vector of means that were used.
+#
+# The work is delegated to `center!(D, colnames)`, so columns that contain
+# NA values are skipped with a warning, exactly as that method does, rather
+# than having their NA mean pushed onto the `Float64` result vector.
+function center!(D::AbstractDataFrame)
+    isrealcol = Bool[T <: Real for T in eltypes(D)]
+    center!(D, names(D)[isrealcol])
+end
+
+# Center the columns of `D` listed in `colnames` around their own means.
+#
+# Columns whose element type is not a subtype of `Real`, and columns whose
+# mean is NA, are skipped with a warning. Returns the means of the columns
+# that were actually centered, in processing order, so the result can be
+# shorter than `colnames`.
+function center!(D::AbstractDataFrame, colnames::AbstractVector{Symbol})
+    μ_vec = Float64[]
+    for colname in colnames
+        if !(eltype(D[colname]) <: Real)
+            warn("Skipping \"$colname\", centering only valid for columns of type T <: Real.")
+            continue
+        end
+        μ = mean(D[colname])
+        if isna(μ)
+            warn("Column \"$colname\" contains NA values, skipping centering on this column!")
+            continue
+        end
+        center!(D, colname, μ)
+        push!(μ_vec, μ)
+    end
+    μ_vec
+end
+
+# Center the columns of `D` listed in `colnames` using caller-supplied offsets.
+#
+# `μ[k]` is applied to the column named `colnames[k]`, i.e. the offsets are
+# matched to the columns by position. Columns whose element type is not a
+# subtype of `Real` are skipped with a warning. Returns `μ` as passed in.
+function center!(D::AbstractDataFrame, colnames::AbstractVector{Symbol}, μ::AbstractVector)
+    for (k, colname) in enumerate(colnames)
+        if !(eltype(D[colname]) <: Real)
+            warn("Skipping \"$colname\", centering only valid for columns of type T <: Real.")
+            continue
+        end
+        center!(D, colname, μ[k])
+    end
+    return μ
+end
+
+# Subtract the scalar `μ` from every entry of column `colname` of `D`.
+#
+# The column is converted to a `Vector{Float64}`, shifted, and assigned back
+# to `D[colname]`. A column containing NA values is left untouched and a
+# warning is issued instead. Returns `μ`.
+function center!(D::AbstractDataFrame, colname::Symbol, μ)
+    if any(isna(D[colname]))
+        warn("Column \"$colname\" contains NA values, skipping centering on this column!")
+    else
+        newcol::Vector{Float64} = convert(Vector{Float64}, D[colname])
+        @inbounds for i in eachindex(newcol)
+            newcol[i] -= μ
+        end
+        D[colname] = newcol
+    end
+    μ
+end
 
 """
     μ, σ = rescale!(X[, μ, σ, obsdim])
 
+or 
+
+    μ, σ = rescale!(D[, colnames, μ, σ])
+
+where `X` is of type Matrix or Vector and `D` of type DataFrame.
+
 Center `X` along `obsdim` around the corresponding entry in the
 vector `μ` and then rescale each feature using the corresponding
 entry in the vector `σ`.
+
+For DataFrames, `obsdim` does not apply and rescaling is done column-wise.
+The vector `colnames` specifies which columns to rescale.
+If `colnames` is not provided, all columns of type T<:Real are rescaled.
+
+Example:
+
+    X = rand(4, 100)
+    D = DataFrame(A=rand(10), B=collect(1:10), C=[string(x) for x in 1:10])
+
+    μ, σ = rescale!(X, obsdim=2)
+    μ, σ = rescale!(X, ObsDim.First())
+    μ, σ = rescale!(D)
+    μ, σ = rescale!(D, [:A, :B])
+
 """
 function rescale!(X, μ, σ; obsdim=LearnBase.default_obsdim(X))
     rescale!(X, μ, σ, convert(ObsDimension, obsdim))
@@ -101,7 +197,7 @@ end
 function rescale!(X::AbstractVector, ::ObsDim.Constant{1})
     μ = mean(X)
     σ = std(X)
-    for i in 1:length(X)
+    @inbounds for i in 1:length(X)
         X[i] = (X[i] - μ) / σ
     end
     μ, σ
@@ -143,6 +239,68 @@ function rescale!(X::AbstractVector, μ::AbstractFloat, σ::AbstractFloat, ::Obs
     μ, σ
 end
 
+# Rescale every column of `D` whose element type is a subtype of `Real`
+# using its own mean and standard deviation, and return `(μ_vec, σ_vec)`.
+#
+# The work is delegated to `rescale!(D, colnames)`, so columns that contain
+# NA values are skipped with a warning, exactly as that method does, rather
+# than having their NA statistics pushed onto the `Float64` result vectors.
+function rescale!(D::AbstractDataFrame)
+    isrealcol = Bool[T <: Real for T in eltypes(D)]
+    rescale!(D, names(D)[isrealcol])
+end
+
+# Rescale the columns of `D` listed in `colnames` using each column's own
+# mean and standard deviation.
+#
+# Columns whose element type is not a subtype of `Real`, and columns whose
+# mean is NA, are skipped with a warning. Returns `(μ_vec, σ_vec)` holding
+# the statistics of the columns that were actually rescaled, in processing
+# order, so both can be shorter than `colnames`.
+#
+# `colnames` accepts any `AbstractVector{Symbol}`, matching `center!`.
+function rescale!(D::AbstractDataFrame, colnames::AbstractVector{Symbol})
+    μ_vec = Float64[]
+    σ_vec = Float64[]
+    for colname in colnames
+        if !(eltype(D[colname]) <: Real)
+            warn("Skipping \"$colname\", rescaling only valid for columns of type T <: Real.")
+            continue
+        end
+        μ = mean(D[colname])
+        if isna(μ)
+            warn("Column \"$colname\" contains NA values, skipping rescaling of this column!")
+            continue
+        end
+        # Only compute the standard deviation once the column is known to be usable.
+        σ = std(D[colname])
+        rescale!(D, colname, μ, σ)
+        push!(μ_vec, μ)
+        push!(σ_vec, σ)
+    end
+    μ_vec, σ_vec
+end
+
+# Rescale the columns of `D` listed in `colnames` using caller-supplied
+# offsets `μ` and scales `σ`.
+#
+# `μ[k]` and `σ[k]` are applied to the column named `colnames[k]`, i.e. the
+# parameters are matched to the columns by position. Columns whose element
+# type is not a subtype of `Real` are skipped with a warning.
+# Returns `(μ, σ)` as passed in.
+#
+# `colnames` accepts any `AbstractVector{Symbol}`, matching `center!`.
+function rescale!(D::AbstractDataFrame, colnames::AbstractVector{Symbol}, μ::AbstractVector, σ::AbstractVector)
+    for (k, colname) in enumerate(colnames)
+        if !(eltype(D[colname]) <: Real)
+            warn("Skipping \"$colname\", rescaling only valid for columns of type T <: Real.")
+            continue
+        end
+        rescale!(D, colname, μ[k], σ[k])
+    end
+    μ, σ
+end
+
+# Replace every entry `x` of column `colname` of `D` by `(x - μ) / σ`.
+#
+# The column is converted to a `Vector{Float64}`, transformed, and assigned
+# back to `D[colname]`. A column containing NA values is left untouched and
+# a warning is issued instead. Returns `(μ, σ)` as passed in.
+function rescale!(D::AbstractDataFrame, colname::Symbol, μ, σ)
+    if any(isna(D[colname]))
+        warn("Column \"$colname\" contains NA values, skipping rescaling of this column!")
+    else
+        # A zero scale is replaced by one, so such a column is only shifted
+        # by `μ` instead of being divided by zero.
+        σ_div = σ == 0 ? one(σ) : σ
+        newcol::Vector{Float64} = convert(Vector{Float64}, D[colname])
+        @inbounds for i in eachindex(newcol)
+            newcol[i] = (newcol[i] - μ) / σ_div
+        end
+        D[colname] = newcol
+    end
+    μ, σ
+end
+
 immutable FeatureNormalizer
     offset::Vector{Float64}
     scale::Vector{Float64}

diff --git a/test/tst_feature_scaling.jl b/test/tst_feature_scaling.jl
@@ -1,5 +1,8 @@
 e_x, _ = noisy_sin(50; noise = 0.)
 e_X = expand_poly(e_x, degree = 5)
+df = DataFrame(A=rand(10), B=collect(1:10), C=[string(x) for x in 1:10])
+df_na = deepcopy(df)
+df_na[1, :A] = NA
 
 @testset "Test expand_poly" begin
     @test size(e_X) == (5, 50)
@@ -59,6 +62,36 @@ end
     center!(Xa, mu, ObsDim.Last())
     @test abs(sum(mean(Xa, 2))) <= 10e-10
 
+    # Center DataFrame
+    D = copy(df)
+    mu_check = [mean(D[colname]) for colname in names(D)[1:2]]
+    mu = center!(D)
+    @test length(mu) == 2
+    @test abs(sum(mu .- mu_check)) <= 10e-10
+
+    D = copy(df)
+    mu_check = [mean(D[colname]) for colname in names(D)[1:2]]
+    mu = center!(D, [:A, :B])
+    @test abs(sum(mu .- mu_check)) <= 10e-10
+
+    D = copy(df)
+    mu_check = [mean(D[colname]) for colname in names(D)[1:2]]
+    mu = center!(D, [:A, :B], mu_check)
+    @test abs(sum([mean(D[colname]) for colname in names(D)[1:2]])) <= 10e-10
+
+    # skip columns that contain NA values
+    D = copy(df_na)
+    mu = center!(D, [:A, :B])
+    @test isna(D[1, :A])
+    @test all(D[2:end, :A] .== df_na[2:end, :A])
+    @test abs(mean(D[:B])) < 10e-10
+
+    D = copy(df_na)
+    mu_check = [mean(D[colname]) for colname in names(D)[1:2]]
+    mu = center!(D, [:A, :B], mu_check)
+    @test isna(D[1, :A])
+    @test all(D[2:end, :A] .== df_na[2:end, :A])
+    @test abs(mean(D[:B])) < 10e-10
 
     # Rescale Vector
     xa = copy(e_x)
@@ -78,6 +111,12 @@ end
     @test abs(mean(xa)) <= 10e-10
     @test std(xa) ≈ 1
 
+    xa = copy(e_x)
+    mu = copy(e_x) .- 1
+    sigma = ones(e_x)
+    mu, sigma = rescale!(xa, mu, sigma, obsdim=1)
+    @test mean(xa) ≈ 1
+
     Xa = copy(e_X)
     rescale!(Xa)
     @test abs(sum(mean(Xa, 2))) <= 10e-10
@@ -92,7 +131,6 @@ end
     rescale!(Xa, obsdim=1)
     @test abs(sum(mean(Xa, 1))) <= 10e-10
 
-
     Xa = copy(e_X)
     mu = vec(mean(Xa, 1))
     sigma = vec(std(Xa, 1))
@@ -104,6 +142,40 @@ end
     sigma = vec(std(Xa, 2))
     rescale!(Xa, mu, sigma, obsdim=2)
     @test abs(sum(mean(Xa, 2))) <= 10e-10
+
+    # Rescale DataFrame
+    # The standard-deviation checks take the absolute deviation from 1, so a
+    # column whose std ends up below 1 fails instead of passing silently.
+    D = copy(df)
+    mu, sigma = rescale!(D)
+    @test abs(sum([mean(D[colname]) for colname in names(D)[1:2]])) <= 10e-10
+    @test abs(mean([std(D[colname]) for colname in names(D)[1:2]]) - 1) <= 10e-10
+
+    D = copy(df)
+    mu, sigma = rescale!(D, [:A, :B])
+    @test abs(sum([mean(D[colname]) for colname in names(D)[1:2]])) <= 10e-10
+    @test abs(mean([std(D[colname]) for colname in names(D)[1:2]]) - 1) <= 10e-10
+
+    D = copy(df)
+    mu_check = [mean(D[colname]) for colname in names(D)[1:2]]
+    sigma_check = [std(D[colname]) for colname in names(D)[1:2]]
+    mu, sigma = rescale!(D, [:A, :B], mu_check, sigma_check)
+    @test abs(sum([mean(D[colname]) for colname in names(D)[1:2]])) <= 10e-10
+    @test abs(mean([std(D[colname]) for colname in names(D)[1:2]]) - 1) <= 10e-10
+
+    # skip columns that contain NA values
+    D = copy(df_na)
+    mu, sigma = rescale!(D, [:A, :B])
+    @test isna(D[1, :A])
+    @test all(D[2:end, :A] .== df_na[2:end, :A])
+    @test abs(mean(D[:B])) < 10e-10
+    # Compare the absolute deviation from 1; `abs(std(x)) - 1` would also
+    # accept any standard deviation smaller than 1.
+    @test abs(std(D[:B]) - 1) < 10e-10
+
+    D = copy(df_na)
+    mu_check = [mean(D[colname]) for colname in names(D)[1:2]]
+    sigma_check = [std(D[colname]) for colname in names(D)[1:2]]
+    mu, sigma = rescale!(D, [:A, :B], mu_check, sigma_check)
+    #= @test isna(D[1, :A]) =#
+    #= @test all(D[2:end, :A] .== df_na[2:end, :A]) =#
+    #= @test abs(mean(D[:B])) < 10e-10 =#
+    #= @test (abs(std(D[:B])) - 1) < 10e-10 =#
 end
 
 @testset "Test FeatureNormalizer model" begin