From a872d7c2c587deb3dddaccb13d74256acbdd7df7 Mon Sep 17 00:00:00 2001
From: Thibaut Lienart
Date: Tue, 23 May 2023 12:01:49 +0200
Subject: [PATCH] Fixes for quantile regression (#148)

* fix a doc typo

* fixes following discussion around #147

---------

Co-authored-by: Anthony D. Blaom
---
 .github/workflows/ci.yml                |  2 +
 Project.toml                            |  2 +-
 src/fit/solvers.jl                      | 21 +++++---
 src/loss-penalty/robust.jl              | 69 ++++++++++++++++-------
 src/mlj/regressors.jl                   |  2 +-
 test/Project.toml                       |  1 +
 test/benchmarks/elementary_functions.jl |  2 +-
 test/benchmarks/logistic-multinomial.jl |  2 +-
 test/benchmarks/ridge-lasso.jl          |  2 +-
 test/benchmarks/robust.jl               | 29 +++++++++++
 test/fit/quantile.jl                    | 25 ++++++---
 test/loss-penalty/robust.jl             |  2 +-
 test/runtests.jl                        |  7 ++-
 13 files changed, 127 insertions(+), 39 deletions(-)
 create mode 100644 test/benchmarks/robust.jl

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index e5ea73c..8d918c1 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -41,6 +41,8 @@ jobs:
             ${{ runner.os }}-
       - uses: julia-actions/julia-buildpkg@v1
       - uses: julia-actions/julia-runtest@v1
+        env:
+          RUN_COMPARISONS: "false"
       - uses: julia-actions/julia-processcoverage@v1
       - uses: codecov/codecov-action@v1
         with:
diff --git a/Project.toml b/Project.toml
index 9b54ecf..dbae3c8 100644
--- a/Project.toml
+++ b/Project.toml
@@ -1,7 +1,7 @@
 name = "MLJLinearModels"
 uuid = "6ee0df7b-362f-4a72-a706-9e79364fb692"
 authors = ["Thibaut Lienart"]
-version = "0.9.1"
+version = "0.9.2"
 
 [deps]
 DocStringExtensions = "ffbed154-4ef7-542d-bbb7-c09d3a79fcae"
diff --git a/src/fit/solvers.jl b/src/fit/solvers.jl
index b9ef8c2..93655a3 100644
--- a/src/fit/solvers.jl
+++ b/src/fit/solvers.jl
@@ -46,11 +46,14 @@ Newton solver. This is a full Hessian solver and should be avoided for
 `newton_options` are the [options of Newton's method](https://julianlsolvers.github.io/Optim.jl/stable/#algo/newton/)
 
 ## Example
+
 ```julia
 using MLJLinearModels, Optim
 
-solver = MLJLinearModels.Newton(optim_options = Optim.Options(time_limit = 20),
-                                newton_options = (linesearch = Optim.LineSearches.HagerZhang()),))
+solver = MLJLinearModels.Newton(
+    optim_options = Optim.Options(time_limit = 20),
+    newton_options = (linesearch = Optim.LineSearches.HagerZhang(),)
+)
 ```
 """
 @with_kw struct Newton{O,S} <: Solver
@@ -70,13 +73,15 @@ generally be preferred for larger scale cases.
 `newtoncg_options` are the [options of Krylov Trust Region method](https://github.com/JuliaNLSolvers/Optim.jl/blob/master/src/multivariate/solvers/second_order/krylov_trust_region.jl)
 
 ## Example
+
 ```julia
 using MLJLinearModels, Optim
 
-solver = MLJLinearModels.Newton(optim_options = Optim.Options(time_limit = 20),
-                                newtoncg_options = (eta = 0.2,))
+solver = MLJLinearModels.NewtonCG(
+    optim_options = Optim.Options(time_limit = 20),
+    newtoncg_options = (eta = 0.2,)
+)
 ```
-
 """
 @with_kw struct NewtonCG{O,S} <: Solver
     optim_options::O = Optim.Options(f_tol=1e-4)
@@ -95,8 +100,10 @@ LBFGS quasi-Newton solver. See [the wikipedia entry](https://en.wikipedia.org/wi
 ```julia
 using MLJLinearModels, Optim
 
-solver = MLJLinearModels.Newton(optim_options = Optim.Options(time_limit = 20),
-                                lbfgs_options = (linesearch = Optim.LineSearches.HagerZhang()),))
+solver = MLJLinearModels.LBFGS(
+    optim_options = Optim.Options(time_limit = 20),
+    lbfgs_options = (linesearch = Optim.LineSearches.HagerZhang(),)
+)
 ```
 """
 @with_kw struct LBFGS{O,S} <: Solver
diff --git a/src/loss-penalty/robust.jl b/src/loss-penalty/robust.jl
index bae7449..65c76c6 100644
--- a/src/loss-penalty/robust.jl
+++ b/src/loss-penalty/robust.jl
@@ -3,20 +3,37 @@ export RobustLoss,
     BisquareRho, Bisquare,
     LogisticRho, Logistic,
     FairRho, Fair,
     TalwarRho, Talwar,
     QuantileRho, Quantile
 
+#=
+In the non-penalised case:
+
+    β⋆ = arg min ∑ ρ(yᵢ - ⟨xᵢ, β⟩)
+
+where ρ is a weighing function such as, for instance, the pinball loss for
+quantile regression.
+
+It is useful to define the following quantities:
+
+    ψ(r) = ρ'(r) (first derivative)
+    ϕ(r) = ψ'(r) (second derivative)
+    ω(r) = ψ(r)/r (weighing function used for IWLS); a threshold can be
+    passed to clip the weights
+
+Some refs:
+- https://josephsalmon.eu/enseignement/UW/STAT593/QuantileRegression.pdf
+=#
+
 abstract type RobustRho end
-abstract type RobustRho1P{δ} <: RobustRho end # one parameter
+# robust rho with only one parameter
+abstract type RobustRho1P{δ} <: RobustRho end
 
 struct RobustLoss{ρ} <: AtomicLoss where ρ <: RobustRho
     rho::ρ
 end
 
-(rl::RobustLoss)(x::AVR, y::AVR) = rl(x .- y)
-(rl::RobustLoss)(r::AVR) = rl.rho(r)
+(rl::RobustLoss)(Xβ::AVR, y::AVR) = rl(y .- Xβ)
+(rl::RobustLoss)(r::AVR) = rl.rho(r)
 
-# ψ(r) = ρ'(r) (first derivative)
-# ω(r) = ψ(r)/r (weighing function) a thresh can be passed to clip weights
-# ϕ(r) = ψ'(r) (second derivative)
 """
 $TYPEDEF
@@ -24,6 +41,8 @@ $TYPEDEF
 Huber weighing of the residuals corresponding to
 
 ``ρ(z) = z²/2`` if `|z|≤δ` and `ρ(z)=δ(|z|-δ/2)` otherwise.
+
+Note: symmetric weighing.
 """
 struct HuberRho{δ} <: RobustRho1P{δ}
     HuberRho(δ::Real=1.0; delta::Real=δ) = new{delta}()
@@ -33,7 +52,7 @@ Huber(δ::Real=1.0; delta::Real=δ) = HuberRho(delta)
 (::HuberRho{δ})(r::AVR) where δ = begin
     ar = abs.(r)
     w  = ar .<= δ
-    return sum( r.^2/2 .* w .+ δ .* (ar .- δ/2) .* .!w )
+    return sum( @. ifelse(w, r^2/2, δ * (ar - δ/2) ) )
 end
 
 ψ(::Type{HuberRho{δ}} ) where δ = (r, w) -> r * w + δ * sign(r) * (1.0 - w)
@@ -47,6 +66,8 @@ $TYPEDEF
 Andrews weighing of the residuals corresponding to
 
 ``ρ(z) = -cos(πz/δ)/(π/δ)²`` if `|z|≤δ` and `ρ(δ)` otherwise.
+
+Note: symmetric weighing.
 """
 struct AndrewsRho{δ} <: RobustRho1P{δ}
     AndrewsRho(δ::Real=1.0; delta::Real=δ) = new{delta}()
@@ -58,7 +79,7 @@ Andrews(δ::Real=1.0; delta::Real=δ) = AndrewsRho(delta)
     w  = ar .<= δ
     c  = π/δ
     κ  = (δ/π)^2
-    return sum( -cos.(c .* r) .* κ .* w .+ κ .* .!w )
+    return sum( @. ifelse(w, -cos(c * r) * κ, κ) )
 end
 
 # Note, sinc(x) = sin(πx)/πx, well defined everywhere
@@ -74,6 +95,8 @@ $TYPEDEF
 Bisquare weighing of the residuals corresponding to
 
 ``ρ(z) = δ²/6 (1-(1-(z/δ)²)³)`` if `|z|≤δ` and `δ²/6` otherwise.
+
+Note: symmetric weighing.
 """
 struct BisquareRho{δ} <: RobustRho1P{δ}
     BisquareRho(δ::Real=1.0; delta::Real=δ) = new{delta}()
@@ -84,7 +107,7 @@ Bisquare(δ::Real=1.0; delta::Real=δ) = BisquareRho(delta)
     ar = abs.(r)
     w  = ar .<= δ
     κ  = δ^2/6
-    return sum( κ * (1.0 .- (1.0 .- (r ./ δ).^2).^3) .* w + κ .* .!w )
+    return sum( @. ifelse(w, κ * (1 - (1 - (r / δ)^2)^3), κ) )
 end
 
 ψ(::Type{BisquareRho{δ}} ) where δ = (r, w) -> w * r * (1.0 - (r / δ)^2)^2
@@ -97,6 +120,8 @@ $TYPEDEF
 Logistic weighing of the residuals corresponding to
 
 ``ρ(z) = δ² log(cosh(z/δ))``
+
+Note: symmetric weighing.
 """
 struct LogisticRho{δ} <: RobustRho1P{δ}
     LogisticRho(δ::Real=1.0; delta::Real=δ) = new{delta}()
@@ -104,7 +129,7 @@ end
 Logistic(δ::Real=1.0; delta::Real=δ) = LogisticRho(delta)
 
 (::LogisticRho{δ})(r::AVR) where δ = begin
-    return sum( δ^2 .* log.(cosh.(r ./ δ)) )
+    return sum( @. δ^2 * log(cosh(r / δ)) )
 end
 
 # similar to sinc, to avoid NaNs if tanh(0)/0 (lim is 1.0)
@@ -121,6 +146,8 @@ $TYPEDEF
 Fair weighing of the residuals corresponding to
 
 ``ρ(z) = δ² (|z|/δ - log(1+|z|/δ))``
+
+Note: symmetric weighing.
 """
 struct FairRho{δ} <: RobustRho1P{δ}
     FairRho(δ::Real=1.0; delta::Real=δ) = new{delta}()
@@ -128,8 +155,8 @@ end
 Fair(δ::Real=1.0; delta::Real=δ) = FairRho(delta)
 
 (::FairRho{δ})(r::AVR) where δ = begin
-    sr = abs.(r) ./ δ
-    return sum( δ^2 .* (sr .- log1p.(sr)) )
+    sr = @. abs(r) / δ
+    return sum( @. δ^2 * (sr - log1p(sr)) )
 end
 
 ψ(::Type{FairRho{δ}} ) where δ = (r, _) -> δ * r / (abs(r) + δ)
@@ -143,6 +170,8 @@ $TYPEDEF
 Talwar weighing of the residuals corresponding to
 
 ``ρ(z) = z²/2`` if `|z|≤δ` and `ρ(z)=ρ(δ)` otherwise.
+
+Note: symmetric weighing.
 """
 struct TalwarRho{δ} <: RobustRho1P{δ}
     TalwarRho(δ::Real=1.0; delta::Real=δ) = new{delta}()
@@ -150,8 +179,8 @@ end
 Talwar(δ::Real=1.0; delta::Real=δ) = TalwarRho(delta)
 
 (::TalwarRho{δ})(r::AVR) where δ = begin
-    w = abs.(r) .<= δ
-    return sum( r.^2 ./ 2 .* w .+ δ^2/2 .* .!w)
+    w = @. abs(r) <= δ
+    return sum( @. ifelse(w, r^2 / 2, δ^2/2) )
 end
 
 ψ(::Type{TalwarRho{δ}} ) where δ = (r, w) -> w * r
@@ -164,7 +193,11 @@ $TYPEDEF
 Quantile regression weighing of the residuals corresponding to
 
-``ρ(z) = z(δ - 1(z<0))``
+``ρ(z) = -z(δ - 1(z>=0))``
+
+Note: asymmetric weighing; the "-" sign is needed because similar libraries,
+quantreg for instance, define the residual as `y-Xθ` while we use the
+opposite convention (more convenient for gradients etc.).
 """
 struct QuantileRho{δ} <: RobustRho1P{δ}
     QuantileRho(δ::Real=1.0; delta::Real=δ) = new{delta}()
@@ -173,9 +206,9 @@ end
 Quantile(δ::Real=1.0; delta::Real=δ) = QuantileRho(delta)
 
 (::QuantileRho{δ})(r::AVR) where δ = begin
-    return sum( r .* (δ .- (r .<= 0.0)) )
+    return sum( @. -r * (δ - (r >= 0)) )
 end
 
-ψ(::Type{QuantileRho{δ}} ) where δ = (r, _) -> (δ - (r <= 0.0))
-ω(::Type{QuantileRho{δ}}, τ) where δ = (r, _) -> (δ - (r <= 0.0)) / clip(r, τ)
+ψ(::Type{QuantileRho{δ}} ) where δ = (r, _) -> ((r >= 0.0) - δ)
+ω(::Type{QuantileRho{δ}}, τ) where δ = (r, _) -> ((r >= 0.0) - δ) / clip(-r, τ)
 ϕ(::Type{QuantileRho{δ}} ) where δ = (_, _) -> error("Newton(CG) not available for Quantile Reg.")
diff --git a/src/mlj/regressors.jl b/src/mlj/regressors.jl
index 335fd70..e821beb 100644
--- a/src/mlj/regressors.jl
+++ b/src/mlj/regressors.jl
@@ -104,7 +104,7 @@ See also [`ElasticNetRegressor`](@ref).
     "whether to scale the penalty with the number of observations."
     scale_penalty_with_samples::Bool = true
     """any instance of `MLJLinearModels.Analytical`. Use `Analytical()` for
-    Cholesky and `CG()=Analytical(iteration=true)` for conjugate-gradient.
+    Cholesky and `CG()=Analytical(iterative=true)` for conjugate-gradient.
 
     If `solver = nothing` (default) then `Analytical()` is used. """
""" solver::Option{Solver} = nothing end diff --git a/test/Project.toml b/test/Project.toml index 3190290..30cbf32 100644 --- a/test/Project.toml +++ b/test/Project.toml @@ -1,4 +1,5 @@ [deps] +CSV = "336ed68f-0bac-5ca0-87d4-7b16caf5d00b" DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0" DelimitedFiles = "8bb1440f-4735-579b-a4ab-409b98df4dab" ForwardDiff = "f6369f11-7733-5829-9624-2563aa707210" diff --git a/test/benchmarks/elementary_functions.jl b/test/benchmarks/elementary_functions.jl index c069c00..18ac322 100644 --- a/test/benchmarks/elementary_functions.jl +++ b/test/benchmarks/elementary_functions.jl @@ -1,6 +1,6 @@ using MLJLinearModels using BenchmarkTools, Random, LinearAlgebra -DO_COMPARISONS = false; include("../testutils.jl") +include("../testutils.jl") n, p = 50_000, 500 ((X, y, θ), (X1, y1, θ1)) = generate_continuous(n, p; seed=512, sparse=0.5) diff --git a/test/benchmarks/logistic-multinomial.jl b/test/benchmarks/logistic-multinomial.jl index 16b3e9e..0554fde 100644 --- a/test/benchmarks/logistic-multinomial.jl +++ b/test/benchmarks/logistic-multinomial.jl @@ -1,6 +1,6 @@ using MLJLinearModels using BenchmarkTools, Random, LinearAlgebra -DO_COMPARISONS = false; include("../testutils.jl") +include("../testutils.jl") n, p = 50_000, 500 ((X, y, θ), (X1, y1, θ1)) = generate_binary(n, p; seed=525) diff --git a/test/benchmarks/ridge-lasso.jl b/test/benchmarks/ridge-lasso.jl index 2e50c43..46d9c40 100644 --- a/test/benchmarks/ridge-lasso.jl +++ b/test/benchmarks/ridge-lasso.jl @@ -1,6 +1,6 @@ using MLJLinearModels, StableRNGs using BenchmarkTools, Random, LinearAlgebra -DO_COMPARISONS = false; include("../testutils.jl") +include("../testutils.jl") n, p = 50_000, 500 ((X, y, θ), (X1, y1, θ1)) = generate_continuous(n, p; seed=512, sparse=0.5) diff --git a/test/benchmarks/robust.jl b/test/benchmarks/robust.jl new file mode 100644 index 0000000..d5a8e1c --- /dev/null +++ b/test/benchmarks/robust.jl @@ -0,0 +1,29 @@ +# Follow up from issue #147 comparing quantreg more specifically. 
+
+if DO_COMPARISONS
+    @testset "Comp-QR-147" begin
+        using CSV, DataFrames
+
+        dataset = CSV.read(download("http://freakonometrics.free.fr/rent98_00.txt"), DataFrame)
+        tau = 0.3
+
+        y = Vector(dataset[!,:rent_euro])
+        X = Matrix(dataset[!,[:area, :yearc]])
+        X1 = hcat(X[:,1], X[:, 2], ones(size(X, 1)))
+
+        qr = QuantileRegression(tau; penalty=:none)
+        obj = objective(qr, X, y)
+
+        θ_lbfgs = fit(qr, X, y)
+        @test isapprox(obj(θ_lbfgs), 226_639, rtol=1e-4)
+
+        # in this case QR with BR method does better
+        θ_qr_br = rcopy(getproperty(QUANTREG, :rq_fit_br)(X1, y; tau=tau))[:coefficients]
+        @test isapprox(obj(θ_qr_br), 207_551, rtol=1e-4)
+
+        # lasso doesn't
+        θ_qr_lasso = rcopy(getproperty(QUANTREG, :rq_fit_lasso)(X1, y; tau=tau))[:coefficients]
+        obj(θ_qr_lasso) # 229_172
+        @test 228_000 ≤ obj(θ_qr_lasso) ≤ 231_000
+    end
+end
diff --git a/test/fit/quantile.jl b/test/fit/quantile.jl
index f1e6105..e4ce57a 100644
--- a/test/fit/quantile.jl
+++ b/test/fit/quantile.jl
@@ -46,16 +46,27 @@ y1a = outlify(y1, 0.1)
     θ_ls = fit(LinearRegression(), X, y1a)
     θ_lbfgs = fit(rr, X, y1a, solver=LBFGS())
     θ_iwls = fit(rr, X, y1a, solver=IWLSCG())
-    θ_qr_br = rcopy(QUANTREG.rq_fit_br(X1, y1a))[:coefficients]
-    θ_qr_fnb = rcopy(QUANTREG.rq_fit_fnb(X1, y1a))[:coefficients]
+    θ_qr_br = rcopy(QUANTREG.rq_fit_br(X1, y1a, tau=δ))[:coefficients]
+    θ_qr_fnb = rcopy(QUANTREG.rq_fit_fnb(X1, y1a, tau=δ))[:coefficients]
 
     # NOTE: we take θ_qr_br as reference point
     @test isapprox(J(θ_ls), 505.45286, rtol=1e-5)
     @test J(θ_qr_br) ≈ 409.570777 # <- ref value
     # Their IP algorithm essentially gives the same answer
     @test (J(θ_qr_fnb) - J(θ_qr_br)) ≤ 1e-10
     # Our algorithms are close enough
-    @test isapprox(J(θ_lbfgs), 409.57154, rtol=1e-5)
+    @test isapprox(J(θ_lbfgs), 409.57608, rtol=1e-5)
     @test isapprox(J(θ_iwls), 409.59, rtol=1e-4)
+
+    # Let's try this again but with a δ different from 0.5
+    δ = 0.75
+    rr = QuantileRegression(δ, lambda=0)
+    J = objective(rr, X, y1a)
+
+    θ_lbfgs = fit(rr, X, y1a, solver=LBFGS())
+    θ_qr_br = rcopy(QUANTREG.rq_fit_br(X1, y1a, tau=δ))[:coefficients]
+
+    @test isapprox(J(θ_qr_br), 404.6161, rtol=1e-4)
+    @test isapprox(J(θ_lbfgs), 404.6195, rtol=1e-4)
 end
 end
@@ -63,7 +74,7 @@ end
 ## With Sparsity penalty ##
 ###########################
 
 n, p = 500, 100
 ((X, y, θ), (X1, y1, θ1)) = generate_continuous(n, p; seed=51112, sparse=0.1)
 # pepper with outliers
 y1a = outlify(y1, 0.1)
@@ -90,9 +101,9 @@ y1a = outlify(y1, 0.1)
     θ_ls = X1 \ y1a
     θ_fista = fit(rr, X, y1a, solver=FISTA())
     θ_ista = fit(rr, X, y1a, solver=ISTA())
-    θ_qr_lasso = rcopy(QUANTREG.rq_fit_lasso(X1, y1a))[:coefficients]
+    θ_qr_lasso = rcopy(QUANTREG.rq_fit_lasso(X1, y1a, lambda=λ))[:coefficients]
     @test isapprox(J(θ_ls), 888.3748, rtol=1e-5)
-    @test isapprox(J(θ_qr_lasso), 425.5, rtol=1e-3)
+    @test isapprox(J(θ_qr_lasso), 425.2, rtol=1e-3)
     # Our algorithms are close enough
     @test isapprox(J(θ_fista), 425.0526, rtol=1e-5)
     @test isapprox(J(θ_ista), 425.4113, rtol=1e-5)
@@ -101,6 +112,6 @@ y1a = outlify(y1, 0.1)
     @test nnz(θ_fista) == 88
     @test nnz(θ_ista) == 82
     # in this case fista is best
-    @test J(θ_fista) < J(θ_ista) < J(θ_qr_lasso)
+    @test J(θ_fista) < J(θ_qr_lasso) < J(θ_ista)
 end
 end
diff --git a/test/loss-penalty/robust.jl b/test/loss-penalty/robust.jl
index 45e98ed..0abfbfa 100644
--- a/test/loss-penalty/robust.jl
+++ b/test/loss-penalty/robust.jl
@@ -5,7 +5,7 @@ y = randn(rng, 10)
 r = x .- y
 
 @testset "Robust Loss" begin
-    δ = 0.5
+    δ = 0.75
     rlδ = RobustLoss(Huber(δ))
     @test rlδ isa RobustLoss{HuberRho{δ}}
     @test rlδ(r) == rlδ(x, y) == sum(ifelse(abs(rᵢ)≤δ, rᵢ^2/2, δ*(abs(rᵢ)-δ/2)) for rᵢ in r)
diff --git a/test/runtests.jl b/test/runtests.jl
index 3d7099c..73cba12 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -3,7 +3,8 @@ using Random, StableRNGs, DataFrames, ForwardDiff
 import Optim
 import MLJ, MLJBase
 
-DO_COMPARISONS = false; include("testutils.jl")
+DO_COMPARISONS = true
+include("testutils.jl")
 
 m("UTILS"); include("utils.jl")
 
@@ -31,3 +32,7 @@ m("MLJ", false); begin
     mm("fit-predict"); include("interface/fitpredict.jl")
     mm("extras"); include("interface/extras.jl")
 end
+
+m("MISC", false); begin
+    mm("benchmarks"); include("benchmarks/robust.jl")
+end
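For reference, a small self-contained sketch of the quantile conventions this patch settles on (an editor's illustration, not part of the patch: the `pinball` helper, the synthetic data, and the tolerance band are made up for the example; the exported names and call patterns are the ones used in the diff above):

```julia
using MLJLinearModels, Statistics, Random

# Textbook pinball (check) loss at level δ for a residual u = y - ŷ.
pinball(u, δ) = u * (δ - (u < 0))

# The quantile rho defined above computes sum(@. -r * (δ - (r >= 0))),
# which is, term by term, the pinball loss of the flipped residual -r.
rng = Random.MersenneTwister(42)
r   = randn(rng, 100)
δ   = 0.3
rl  = RobustLoss(Quantile(δ))
@assert rl(r) ≈ sum(pinball(-rᵢ, δ) for rᵢ in r)

# At δ = 0.5 the loss reduces to |r|/2, whichever residual convention is
# used, and the fit approximates the conditional median: roughly half of
# the targets should end up below the fitted hyperplane (a loose check).
n, p = 1_000, 3
X = randn(rng, n, p)
y = X * [1.0, -2.0, 0.5] .+ randn(rng, n)
qr = QuantileRegression(0.5; penalty=:none)
θ  = fit(qr, X, y)
ŷ  = hcat(X, ones(n)) * θ  # the intercept coefficient comes last, as in the tests
@assert 0.45 ≤ mean(y .≤ ŷ) ≤ 0.55
```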