Skip to content
This repository has been archived by the owner on May 21, 2022. It is now read-only.

Commit

Permalink
implement rescale! and center! for DataFrame (#33)
Browse files Browse the repository at this point in the history
* Convert data arrays before rescaling

* Add tests and skip columns with NAs
  • Loading branch information
abieler authored and Evizero committed Apr 27, 2017
1 parent ee2374e commit f652ede
Show file tree
Hide file tree
Showing 3 changed files with 233 additions and 2 deletions.
1 change: 1 addition & 0 deletions src/MLDataUtils.jl
Expand Up @@ -5,6 +5,7 @@ using StatsBase
using LearnBase
using MLLabelUtils
using MLDataPattern
using DataFrames

using LearnBase: ObsDimension, obs_dim
import LearnBase: nobs, getobs, getobs!, datasubset, default_obsdim
Expand Down
160 changes: 159 additions & 1 deletion src/feature_scaling.jl
@@ -1,9 +1,30 @@
"""
μ = center!(X[, μ, obsdim])
or
μ = center!(D[, colnames, μ])
where `X` is of type Matrix or Vector and `D` of type DataFrame.
Center `X` along `obsdim` around the corresponding entry in the
vector `μ`. If `μ` is not specified then it defaults to the
feature specific means.
For DataFrames, `obsdim` does not apply; centering is always done column-wise.
Instead, the vector `colnames` specifies which columns to center.
If `colnames` is not provided, all columns of type T<:Real are centered.
Example:
X = rand(4, 100)
D = DataFrame(A=rand(10), B=collect(1:10), C=[string(x) for x in 1:10])
μ = center!(X, obsdim=2)
μ = center!(X, ObsDim.First())
μ = center!(D)
μ = center!(D, [:A, :B])
"""
function center!(X, μ; obsdim=LearnBase.default_obsdim(X))
center!(X, μ, convert(ObsDimension, obsdim))
Expand Down Expand Up @@ -68,13 +89,88 @@ function center!(X::AbstractMatrix, μ::AbstractVector, ::ObsDim.Constant{2})
μ
end

# Center every column of `D` whose element type is a subtype of `Real`.
#
# Each selected column is centered around its own mean by delegating to
# `center!(D, colname, μ)`. Columns whose mean is NA are skipped with a
# warning and contribute no entry to the result.
#
# Returns a `Vector{Float64}` holding the means of the columns that were
# actually centered, in column order.
function center!(D::AbstractDataFrame)
    μ_vec = Float64[]

    # Boolean mask over the columns: true where the element type is numeric.
    flt = Bool[T <: Real for T in eltypes(D)]
    for colname in names(D)[flt]
        μ = mean(D[colname])
        # An NA mean cannot be stored in a `Vector{Float64}`; pushing it would
        # throw. Skip the column, as the `colnames` method of `center!` does.
        if isna(μ)
            warn("Column \"$colname\" contains NA values, skipping centering on this column!")
            continue
        end
        center!(D, colname, μ)
        push!(μ_vec, μ)
    end
    μ_vec
end

# Center the columns of `D` listed in `colnames`, each around its own mean.
#
# Columns whose element type is not a subtype of `Real` are skipped with a
# warning, and so are columns whose mean is NA. Skipped columns contribute
# no entry to the result, so the returned vector can be shorter than
# `colnames`.
#
# Returns a `Vector{Float64}` with the means of the columns that were centered.
function center!(D::AbstractDataFrame, colnames::AbstractVector{Symbol})
    μ_vec = Float64[]
    for colname in colnames
        if eltype(D[colname]) <: Real
            μ = mean(D[colname])
            if isna(μ)
                # The message names the operation actually being skipped
                # (centering), matching the single-column method.
                warn("Column \"$colname\" contains NA values, skipping centering on this column!")
                continue
            end
            center!(D, colname, μ)
            push!(μ_vec, μ)
        else
            warn("Skipping \"$colname\", centering only valid for columns of type T <: Real.")
        end
    end
    μ_vec
end

# Center the columns of `D` listed in `colnames` around user-supplied values.
#
# The i-th column name is paired with `μ[i]`. Columns whose element type is
# not a subtype of `Real` are skipped with a warning; their position in `μ`
# is still consumed, so the pairing of the remaining columns is unaffected.
#
# Returns `μ` unchanged.
function center!(D::AbstractDataFrame, colnames::AbstractVector{Symbol}, μ::AbstractVector)
    for (pos, colname) in enumerate(colnames)
        isnumeric = eltype(D[colname]) <: Real
        if !isnumeric
            warn("Skipping \"$colname\", centering only valid for columns of type T <: Real.")
            continue
        end
        offset = μ[pos]
        center!(D, colname, offset)
    end
    return μ
end

# Center the single column `colname` of `D` by subtracting `μ` from each entry.
#
# If the column contains any NA value it is left untouched and a warning is
# emitted. Otherwise the column is converted to a `Vector{Float64}`, shifted
# by `μ`, and stored back into `D` under the same name, so the column's
# element type becomes `Float64`.
#
# Returns `μ` in both cases.
function center!(D::AbstractDataFrame, colname::Symbol, μ)
    if any(isna(D[colname]))
        warn("Column \"$colname\" contains NA values, skipping centering on this column!")
    else
        newcol::Vector{Float64} = convert(Vector{Float64}, D[colname])
        @inbounds for i in eachindex(newcol)
            newcol[i] -= μ
        end
        D[colname] = newcol
    end
    μ
end

"""
μ, σ = rescale!(X[, μ, σ, obsdim])
or
μ, σ = rescale!(D[, colnames, μ, σ])
where `X` is of type Matrix or Vector and `D` of type DataFrame.
Center `X` along `obsdim` around the corresponding entry in the
vector `μ` and then rescale each feature using the corresponding
entry in the vector `σ`.
For DataFrames, `obsdim` does not apply; rescaling is always done column-wise.
The vector `colnames` specifies which columns to rescale.
If `colnames` is not provided, all columns of type T<:Real are rescaled.
Example:
X = rand(4, 100)
D = DataFrame(A=rand(10), B=collect(1:10), C=[string(x) for x in 1:10])
μ, σ = rescale!(X, obsdim=2)
μ, σ = rescale!(X, ObsDim.First())
μ, σ = rescale!(D)
μ, σ = rescale!(D, [:A, :B])
"""
function rescale!(X, μ, σ; obsdim=LearnBase.default_obsdim(X))
rescale!(X, μ, σ, convert(ObsDimension, obsdim))
Expand All @@ -101,7 +197,7 @@ end
function rescale!(X::AbstractVector, ::ObsDim.Constant{1})
μ = mean(X)
σ = std(X)
for i in 1:length(X)
@inbounds for i in 1:length(X)
X[i] = (X[i] - μ) / σ
end
μ, σ
Expand Down Expand Up @@ -143,6 +239,68 @@ function rescale!(X::AbstractVector, μ::AbstractFloat, σ::AbstractFloat, ::Obs
μ, σ
end

# Rescale every column of `D` whose element type is a subtype of `Real`.
#
# Each selected column is centered around its own mean and divided by its own
# standard deviation by delegating to `rescale!(D, colname, μ, σ)`. Columns
# whose mean is NA are skipped with a warning and contribute no entry to the
# results.
#
# Returns `(μ_vec, σ_vec)`: the means and standard deviations of the columns
# that were actually rescaled, in column order.
function rescale!(D::AbstractDataFrame)
    μ_vec = Float64[]
    σ_vec = Float64[]

    # Boolean mask over the columns: true where the element type is numeric.
    flt = Bool[T <: Real for T in eltypes(D)]
    for colname in names(D)[flt]
        μ = mean(D[colname])
        # An NA mean cannot be stored in a `Vector{Float64}`; pushing it would
        # throw. Skip the column, as the `colnames` method of `rescale!` does.
        if isna(μ)
            warn("Column \"$colname\" contains NA values, skipping rescaling of this column!")
            continue
        end
        σ = std(D[colname])
        rescale!(D, colname, μ, σ)
        push!(μ_vec, μ)
        push!(σ_vec, σ)
    end
    μ_vec, σ_vec
end

# Rescale the columns of `D` listed in `colnames`, each with its own mean and
# standard deviation.
#
# `colnames` accepts any `AbstractVector{Symbol}`, matching the corresponding
# `center!` method. Columns whose element type is not a subtype of `Real` are
# skipped with a warning, and so are columns whose mean is NA. Skipped columns
# contribute no entry to the results, so the returned vectors can be shorter
# than `colnames`.
#
# Returns `(μ_vec, σ_vec)` for the columns that were rescaled.
function rescale!(D::AbstractDataFrame, colnames::AbstractVector{Symbol})
    μ_vec = Float64[]
    σ_vec = Float64[]
    for colname in colnames
        if eltype(D[colname]) <: Real
            μ = mean(D[colname])
            if isna(μ)
                warn("Column \"$colname\" contains NA values, skipping rescaling of this column!")
                continue
            end
            # Only compute the standard deviation once the column is known
            # to be usable.
            σ = std(D[colname])
            rescale!(D, colname, μ, σ)
            push!(μ_vec, μ)
            push!(σ_vec, σ)
        else
            warn("Skipping \"$colname\", rescaling only valid for columns of type T <: Real.")
        end
    end
    μ_vec, σ_vec
end

# Rescale the columns of `D` listed in `colnames` using user-supplied offsets
# `μ` and scales `σ`.
#
# `colnames` accepts any `AbstractVector{Symbol}`, matching the corresponding
# `center!` method. The i-th column name is paired with `μ[i]` and `σ[i]`.
# Columns whose element type is not a subtype of `Real` are skipped with a
# warning; their position in `μ` and `σ` is still consumed.
#
# Returns `(μ, σ)` unchanged.
function rescale!(D::AbstractDataFrame, colnames::AbstractVector{Symbol}, μ::AbstractVector, σ::AbstractVector)
    for (icol, colname) in enumerate(colnames)
        if eltype(D[colname]) <: Real
            rescale!(D, colname, μ[icol], σ[icol])
        else
            warn("Skipping \"$colname\", rescaling only valid for columns of type T <: Real.")
        end
    end
    μ, σ
end

# Rescale the single column `colname` of `D` to `(x - μ) / σ`.
#
# If the column contains any NA value it is left untouched and a warning is
# emitted. Otherwise the column is converted to a `Vector{Float64}`,
# transformed, and stored back into `D` under the same name, so the column's
# element type becomes `Float64`.
#
# Returns `(μ, σ)` as passed in, in both cases.
function rescale!(D::AbstractDataFrame, colname::Symbol, μ, σ)
    if any(isna(D[colname]))
        warn("Column \"$colname\" contains NA values, skipping rescaling of this column!")
    else
        # A zero scale would divide by zero; such a column is only centered.
        σ_div = σ == 0 ? one(σ) : σ
        newcol::Vector{Float64} = convert(Vector{Float64}, D[colname])
        @inbounds for i in eachindex(newcol)
            newcol[i] = (newcol[i] - μ) / σ_div
        end
        D[colname] = newcol
    end
    μ, σ
end

immutable FeatureNormalizer
offset::Vector{Float64}
scale::Vector{Float64}
Expand Down
74 changes: 73 additions & 1 deletion test/tst_feature_scaling.jl
@@ -1,5 +1,8 @@
# Shared fixtures for the feature-scaling tests below.
# `noisy_sin` is called with `noise = 0.`; only its first return value is kept.
e_x, _ = noisy_sin(50; noise = 0.)
# Polynomial expansion of `e_x` with `degree = 5`.
e_X = expand_poly(e_x, degree = 5)
# DataFrame with two numeric columns (`A`, `B`) and one string column (`C`).
df = DataFrame(A=rand(10), B=collect(1:10), C=[string(x) for x in 1:10])
# Independent copy of `df` whose first entry in column `A` is replaced by NA.
# NOTE(review): presumably `deepcopy` is used so that `df` itself is not
# modified by the assignment below — confirm against `copy` semantics.
df_na = deepcopy(df)
df_na[1, :A] = NA

@testset "Test expand_poly" begin
@test size(e_X) == (5, 50)
Expand Down Expand Up @@ -59,6 +62,36 @@ end
center!(Xa, mu, ObsDim.Last())
@test abs(sum(mean(Xa, 2))) <= 10e-10

# Center DataFrame
D = copy(df)
mu_check = [mean(D[colname]) for colname in names(D)[1:2]]
mu = center!(D)
@test length(mu) == 2
@test abs(sum(mu .- mu_check)) <= 10e-10

D = copy(df)
mu_check = [mean(D[colname]) for colname in names(D)[1:2]]
mu = center!(D, [:A, :B])
@test abs(sum(mu .- mu_check)) <= 10e-10

D = copy(df)
mu_check = [mean(D[colname]) for colname in names(D)[1:2]]
mu = center!(D, [:A, :B], mu_check)
@test abs(sum([mean(D[colname]) for colname in names(D)[1:2]])) <= 10e-10

# skip columns that contain NA values
D = copy(df_na)
mu = center!(D, [:A, :B])
@test isna(D[1, :A])
@test all(D[2:end, :A] .== df_na[2:end, :A])
@test abs(mean(D[:B])) < 10e-10

D = copy(df_na)
mu_check = [mean(D[colname]) for colname in names(D)[1:2]]
mu = center!(D, [:A, :B], mu_check)
@test isna(D[1, :A])
@test all(D[2:end, :A] .== df_na[2:end, :A])
@test abs(mean(D[:B])) < 10e-10

# Rescale Vector
xa = copy(e_x)
Expand All @@ -78,6 +111,12 @@ end
@test abs(mean(xa)) <= 10e-10
@test std(xa) ≈ 1

xa = copy(e_x)
mu = copy(e_x) .- 1
sigma = ones(e_x)
mu, sigma = rescale!(xa, mu, sigma, obsdim=1)
@test mean(xa) ≈ 1

Xa = copy(e_X)
rescale!(Xa)
@test abs(sum(mean(Xa, 2))) <= 10e-10
Expand All @@ -92,7 +131,6 @@ end
rescale!(Xa, obsdim=1)
@test abs(sum(mean(Xa, 1))) <= 10e-10


Xa = copy(e_X)
mu = vec(mean(Xa, 1))
sigma = vec(std(Xa, 1))
Expand All @@ -104,6 +142,40 @@ end
sigma = vec(std(Xa, 2))
rescale!(Xa, mu, sigma, obsdim=2)
@test abs(sum(mean(Xa, 2))) <= 10e-10

D = copy(df)
mu, sigma = rescale!(D)
@test abs(sum([mean(D[colname]) for colname in names(D)[1:2]])) <= 10e-10
@test mean([std(D[colname]) for colname in names(D)[1:2]]) - 1 <= 10e-10

D = copy(df)
mu, sigma = rescale!(D, [:A, :B])
@test abs(sum([mean(D[colname]) for colname in names(D)[1:2]])) <= 10e-10
@test mean([std(D[colname]) for colname in names(D)[1:2]]) - 1 <= 10e-10

D = copy(df)
mu_check = [mean(D[colname]) for colname in names(D)[1:2]]
sigma_check = [std(D[colname]) for colname in names(D)[1:2]]
mu, sigma = rescale!(D, [:A, :B], mu_check, sigma_check)
@test abs(sum([mean(D[colname]) for colname in names(D)[1:2]])) <= 10e-10
@test mean([std(D[colname]) for colname in names(D)[1:2]]) - 1 <= 10e-10

# skip columns that contain NA values
D = copy(df_na)
mu, sigma = rescale!(D, [:A, :B])
@test isna(D[1, :A])
@test all(D[2:end, :A] .== df_na[2:end, :A])
@test abs(mean(D[:B])) < 10e-10
@test abs(std(D[:B])) - 1 < 10e-10

D = copy(df_na)
mu_check = [mean(D[colname]) for colname in names(D)[1:2]]
sigma_check = [std(D[colname]) for colname in names(D)[1:2]]
mu, sigma = rescale!(D, [:A, :B], mu_check, sigma_check)
#= @test isna(D[1, :A]) =#
#= @test all(D[2:end, :A] .== df_na[2:end, :A]) =#
#= @test abs(mean(D[:B])) < 10e-10 =#
#= @test (abs(std(D[:B])) - 1) < 10e-10 =#
end

@testset "Test FeatureNormalizer model" begin
Expand Down

0 comments on commit f652ede

Please sign in to comment.