From a872d7c2c587deb3dddaccb13d74256acbdd7df7 Mon Sep 17 00:00:00 2001
From: Thibaut Lienart
Date: Tue, 23 May 2023 12:01:49 +0200
Subject: [PATCH] Fixes for quantile regression (#148)

* fix a doc typo

* fixes following discussion around #147

---------

Co-authored-by: Anthony D. Blaom
---
 .github/workflows/ci.yml                |  2 +
 Project.toml                            |  2 +-
 src/fit/solvers.jl                      | 21 +++++---
 src/loss-penalty/robust.jl              | 69 ++++++++++++++++-------
 src/mlj/regressors.jl                   |  2 +-
 test/Project.toml                       |  1 +
 test/benchmarks/elementary_functions.jl |  2 +-
 test/benchmarks/logistic-multinomial.jl |  2 +-
 test/benchmarks/ridge-lasso.jl          |  2 +-
 test/benchmarks/robust.jl               | 29 +++++++++++
 test/fit/quantile.jl                    | 25 ++++++---
 test/loss-penalty/robust.jl             |  2 +-
 test/runtests.jl                        |  7 ++-
 13 files changed, 127 insertions(+), 39 deletions(-)
 create mode 100644 test/benchmarks/robust.jl

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index e5ea73c..8d918c1 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -41,6 +41,8 @@ jobs:
             ${{ runner.os }}-
       - uses: julia-actions/julia-buildpkg@v1
       - uses: julia-actions/julia-runtest@v1
+        env:
+          RUN_COMPARISONS: "false"
       - uses: julia-actions/julia-processcoverage@v1
       - uses: codecov/codecov-action@v1
         with:
diff --git a/Project.toml b/Project.toml
index 9b54ecf..dbae3c8 100644
--- a/Project.toml
+++ b/Project.toml
@@ -1,7 +1,7 @@
 name = "MLJLinearModels"
 uuid = "6ee0df7b-362f-4a72-a706-9e79364fb692"
 authors = ["Thibaut Lienart"]
-version = "0.9.1"
+version = "0.9.2"
 
 [deps]
 DocStringExtensions = "ffbed154-4ef7-542d-bbb7-c09d3a79fcae"
diff --git a/src/fit/solvers.jl b/src/fit/solvers.jl
index b9ef8c2..93655a3 100644
--- a/src/fit/solvers.jl
+++ b/src/fit/solvers.jl
@@ -46,11 +46,14 @@ Newton solver. This is a full Hessian solver and should be avoided for
 `newton_options` are the [options of Newton's method](https://julianlsolvers.github.io/Optim.jl/stable/#algo/newton/)
 
 ## Example
+
 ```julia
 using MLJLinearModels, Optim
 
-solver = MLJLinearModels.Newton(optim_options = Optim.Options(time_limit = 20),
-                                newton_options = (linesearch = Optim.LineSearches.HagerZhang()),))
+solver = MLJLinearModels.Newton(
+    optim_options = Optim.Options(time_limit = 20),
+    newton_options = (linesearch = Optim.LineSearches.HagerZhang(),)
+)
 ```
 """
 @with_kw struct Newton{O,S} <: Solver
@@ -70,13 +73,15 @@ generally be preferred for larger scale cases.
 `newtoncg_options` are the [options of Krylov Trust Region method](https://github.com/JuliaNLSolvers/Optim.jl/blob/master/src/multivariate/solvers/second_order/krylov_trust_region.jl)
 
 ## Example
+
 ```julia
 using MLJLinearModels, Optim
 
-solver = MLJLinearModels.Newton(optim_options = Optim.Options(time_limit = 20),
-                                newtoncg_options = (eta = 0.2,))
+solver = MLJLinearModels.NewtonCG(
+    optim_options = Optim.Options(time_limit = 20),
+    newtoncg_options = (eta = 0.2,)
+)
 ```
-
 """
 @with_kw struct NewtonCG{O,S} <: Solver
     optim_options::O = Optim.Options(f_tol=1e-4)
@@ -95,8 +100,10 @@ LBFGS quasi-Newton solver. See [the wikipedia entry](https://en.wikipedia.org/wi
 ```julia
 using MLJLinearModels, Optim
 
-solver = MLJLinearModels.Newton(optim_options = Optim.Options(time_limit = 20),
-                                lbfgs_options = (linesearch = Optim.LineSearches.HagerZhang()),))
+solver = MLJLinearModels.LBFGS(
+    optim_options = Optim.Options(time_limit = 20),
+    lbfgs_options = (linesearch = Optim.LineSearches.HagerZhang(),)
+)
 ```
 """
 @with_kw struct LBFGS{O,S} <: Solver
diff --git a/src/loss-penalty/robust.jl b/src/loss-penalty/robust.jl
index bae7449..65c76c6 100644
--- a/src/loss-penalty/robust.jl
+++ b/src/loss-penalty/robust.jl
@@ -3,20 +3,37 @@ export RobustLoss,
     BisquareRho, Bisquare,
     LogisticRho, Logistic,
     FairRho, Fair,
     TalwarRho, Talwar,
     QuantileRho, Quantile
 
+#=
+In the non-penalised case:
+
+    β⋆ = arg min ∑ ρ(yᵢ - ⟨xᵢ, β⟩)
+
+where ρ is a weighing function such as, for instance, the pinball loss for
+quantile regression.
+
+It is useful to define the following quantities:
+
+    ψ(r) = ρ'(r) (first derivative)
+    ϕ(r) = ψ'(r) (second derivative)
+    ω(r) = ψ(r)/r (weighing function used for IWLS); a threshold can be
+    passed to clip the weights
+
+Some refs:
+- https://josephsalmon.eu/enseignement/UW/STAT593/QuantileRegression.pdf
+=#
+
 abstract type RobustRho end
-abstract type RobustRho1P{δ} <: RobustRho end # one parameter
+# robust rho with only one parameter
+abstract type RobustRho1P{δ} <: RobustRho end
 
 struct RobustLoss{ρ} <: AtomicLoss where ρ <: RobustRho
     rho::ρ
 end
 
-(rl::RobustLoss)(x::AVR, y::AVR) = rl(x .- y)
-(rl::RobustLoss)(r::AVR) = rl.rho(r)
+(rl::RobustLoss)(Xβ::AVR, y::AVR) = rl(y .- Xβ)
+(rl::RobustLoss)(r::AVR) = rl.rho(r)
 
-# ψ(r) = ρ'(r) (first derivative)
-# ω(r) = ψ(r)/r (weighing function) a thresh can be passed to clip weights
-# ϕ(r) = ψ'(r) (second derivative)
 """
 $TYPEDEF
@@ -24,6 +41,8 @@ $TYPEDEF
 Huber weighing of the residuals corresponding to
 
 ``ρ(z) = z²/2`` if `|z|≤δ` and `ρ(z)=δ(|z|-δ/2)` otherwise.
+
+Note: symmetric weighing.
 """
 struct HuberRho{δ} <: RobustRho1P{δ}
     HuberRho(δ::Real=1.0; delta::Real=δ) = new{delta}()
@@ -33,7 +52,7 @@ Huber(δ::Real=1.0; delta::Real=δ) = HuberRho(delta)
 (::HuberRho{δ})(r::AVR) where δ = begin
     ar = abs.(r)
     w  = ar .<= δ
-    return sum( r.^2/2 .* w .+ δ .* (ar .- δ/2) .* .!w )
+    return sum( @. ifelse(w, r^2/2, δ * (ar - δ/2) ) )
 end
 
 ψ(::Type{HuberRho{δ}} ) where δ = (r, w) -> r * w + δ * sign(r) * (1.0 - w)
@@ -47,6 +66,8 @@ $TYPEDEF
 Andrews weighing of the residuals corresponding to
 
 ``ρ(z) = -cos(πz/δ)/(π/δ)²`` if `|z|≤δ` and `ρ(δ)` otherwise.
+
+Note: symmetric weighing.
 """
 struct AndrewsRho{δ} <: RobustRho1P{δ}
     AndrewsRho(δ::Real=1.0; delta::Real=δ) = new{delta}()
@@ -58,7 +79,7 @@ Andrews(δ::Real=1.0; delta::Real=δ) = AndrewsRho(delta)
     w  = ar .<= δ
     c  = π/δ
     κ  = (δ/π)^2
-    return sum( -cos.(c .* r) .* κ .* w .+ κ .* .!w )
+    return sum( @. ifelse(w, -cos(c * r) * κ, κ) )
 end
 
 # Note, sinc(x) = sin(πx)/πx, well defined everywhere
@@ -74,6 +95,8 @@ $TYPEDEF
 Bisquare weighing of the residuals corresponding to
 
 ``ρ(z) = δ²/6 (1-(1-(z/δ)²)³)`` if `|z|≤δ` and `δ²/6` otherwise.
+
+Note: symmetric weighing.
 """
 struct BisquareRho{δ} <: RobustRho1P{δ}
     BisquareRho(δ::Real=1.0; delta::Real=δ) = new{delta}()
@@ -84,7 +107,7 @@ Bisquare(δ::Real=1.0; delta::Real=δ) = BisquareRho(delta)
     ar = abs.(r)
     w  = ar .<= δ
     κ  = δ^2/6
-    return sum( κ * (1.0 .- (1.0 .- (r ./ δ).^2).^3) .* w + κ .* .!w )
+    return sum( @. ifelse(w, κ * (1 - (1 - (r / δ)^2)^3), κ) )
 end
 
 ψ(::Type{BisquareRho{δ}} ) where δ = (r, w) -> w * r * (1.0 - (r / δ)^2)^2
@@ -97,6 +120,8 @@ $TYPEDEF
 Logistic weighing of the residuals corresponding to
 
 ``ρ(z) = δ² log(cosh(z/δ))``
+
+Note: symmetric weighing.
 """
 struct LogisticRho{δ} <: RobustRho1P{δ}
     LogisticRho(δ::Real=1.0; delta::Real=δ) = new{delta}()
@@ -104,7 +129,7 @@ end
 Logistic(δ::Real=1.0; delta::Real=δ) = LogisticRho(delta)
 
 (::LogisticRho{δ})(r::AVR) where δ = begin
-    return sum( δ^2 .* log.(cosh.(r ./ δ)) )
+    return sum( @. δ^2 * log(cosh(r / δ)) )
 end
 
 # similar to sinc, to avoid NaNs if tanh(0)/0 (lim is 1.0)
@@ -121,6 +146,8 @@ $TYPEDEF
 Fair weighing of the residuals corresponding to
 
 ``ρ(z) = δ² (|z|/δ - log(1+|z|/δ))``
+
+Note: symmetric weighing.
 """
 struct FairRho{δ} <: RobustRho1P{δ}
     FairRho(δ::Real=1.0; delta::Real=δ) = new{delta}()
@@ -128,8 +155,8 @@ end
 Fair(δ::Real=1.0; delta::Real=δ) = FairRho(delta)
 
 (::FairRho{δ})(r::AVR) where δ = begin
-    sr = abs.(r) ./ δ
-    return sum( δ^2 .* (sr .- log1p.(sr)) )
+    sr = @. abs(r) / δ
+    return sum( @. δ^2 * (sr - log1p(sr)) )
 end
 
 ψ(::Type{FairRho{δ}} ) where δ = (r, _) -> δ * r / (abs(r) + δ)
@@ -143,6 +170,8 @@ $TYPEDEF
 Talwar weighing of the residuals corresponding to
 
 ``ρ(z) = z²/2`` if `|z|≤δ` and `ρ(z)=ρ(δ)` otherwise.
+
+Note: symmetric weighing.
 """
 struct TalwarRho{δ} <: RobustRho1P{δ}
     TalwarRho(δ::Real=1.0; delta::Real=δ) = new{delta}()
@@ -150,8 +179,8 @@ end
 Talwar(δ::Real=1.0; delta::Real=δ) = TalwarRho(delta)
 
 (::TalwarRho{δ})(r::AVR) where δ = begin
-    w = abs.(r) .<= δ
-    return sum( r.^2 ./ 2 .* w .+ δ^2/2 .* .!w)
+    w = @. abs(r) <= δ
+    return sum( @. ifelse(w, r^2 / 2, δ^2/2) )
 end
 
 ψ(::Type{TalwarRho{δ}} ) where δ = (r, w) -> w * r
@@ -164,7 +193,11 @@ $TYPEDEF
 Quantile regression weighing of the residuals corresponding to
 
-``ρ(z) = z(δ - 1(z<0))``
+``ρ(z) = -z(δ - 1(z>=0))``
+
+Note: asymmetric weighing; the "-" sign is needed because similar libraries,
+quantreg for instance, define the residual as `y-Xθ` while we use the
+opposite convention (more convenient for gradients etc.).
 """
 struct QuantileRho{δ} <: RobustRho1P{δ}
     QuantileRho(δ::Real=1.0; delta::Real=δ) = new{delta}()
@@ -173,9 +206,9 @@ end
 Quantile(δ::Real=1.0; delta::Real=δ) = QuantileRho(delta)
 
 (::QuantileRho{δ})(r::AVR) where δ = begin
-    return sum( r .* (δ .- (r .<= 0.0)) )
+    return sum( @. -r * (δ - (r >= 0)) )
 end
 
-ψ(::Type{QuantileRho{δ}} ) where δ = (r, _) -> (δ - (r <= 0.0))
-ω(::Type{QuantileRho{δ}}, τ) where δ = (r, _) -> (δ - (r <= 0.0)) / clip(r, τ)
+ψ(::Type{QuantileRho{δ}} ) where δ = (r, _) -> ((r >= 0.0) - δ)
+ω(::Type{QuantileRho{δ}}, τ) where δ = (r, _) -> ((r >= 0.0) - δ) / clip(-r, τ)
 ϕ(::Type{QuantileRho{δ}} ) where δ = (_, _) -> error("Newton(CG) not available for Quantile Reg.")
diff --git a/src/mlj/regressors.jl b/src/mlj/regressors.jl
index 335fd70..e821beb 100644
--- a/src/mlj/regressors.jl
+++ b/src/mlj/regressors.jl
@@ -104,7 +104,7 @@ See also [`ElasticNetRegressor`](@ref).
     "whether to scale the penalty with the number of observations."
     scale_penalty_with_samples::Bool = true
     """any instance of `MLJLinearModels.Analytical`. Use `Analytical()` for
-    Cholesky and `CG()=Analytical(iteration=true)` for conjugate-gradient.
+    Cholesky and `CG()=Analytical(iterative=true)` for conjugate-gradient.
 
     If `solver = nothing` (default) then `Analytical()` is used. """
""" solver::Option{Solver} = nothing end diff --git a/test/Project.toml b/test/Project.toml index 3190290..30cbf32 100644 --- a/test/Project.toml +++ b/test/Project.toml @@ -1,4 +1,5 @@ [deps] +CSV = "336ed68f-0bac-5ca0-87d4-7b16caf5d00b" DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0" DelimitedFiles = "8bb1440f-4735-579b-a4ab-409b98df4dab" ForwardDiff = "f6369f11-7733-5829-9624-2563aa707210" diff --git a/test/benchmarks/elementary_functions.jl b/test/benchmarks/elementary_functions.jl index c069c00..18ac322 100644 --- a/test/benchmarks/elementary_functions.jl +++ b/test/benchmarks/elementary_functions.jl @@ -1,6 +1,6 @@ using MLJLinearModels using BenchmarkTools, Random, LinearAlgebra -DO_COMPARISONS = false; include("../testutils.jl") +include("../testutils.jl") n, p = 50_000, 500 ((X, y, θ), (X1, y1, θ1)) = generate_continuous(n, p; seed=512, sparse=0.5) diff --git a/test/benchmarks/logistic-multinomial.jl b/test/benchmarks/logistic-multinomial.jl index 16b3e9e..0554fde 100644 --- a/test/benchmarks/logistic-multinomial.jl +++ b/test/benchmarks/logistic-multinomial.jl @@ -1,6 +1,6 @@ using MLJLinearModels using BenchmarkTools, Random, LinearAlgebra -DO_COMPARISONS = false; include("../testutils.jl") +include("../testutils.jl") n, p = 50_000, 500 ((X, y, θ), (X1, y1, θ1)) = generate_binary(n, p; seed=525) diff --git a/test/benchmarks/ridge-lasso.jl b/test/benchmarks/ridge-lasso.jl index 2e50c43..46d9c40 100644 --- a/test/benchmarks/ridge-lasso.jl +++ b/test/benchmarks/ridge-lasso.jl @@ -1,6 +1,6 @@ using MLJLinearModels, StableRNGs using BenchmarkTools, Random, LinearAlgebra -DO_COMPARISONS = false; include("../testutils.jl") +include("../testutils.jl") n, p = 50_000, 500 ((X, y, θ), (X1, y1, θ1)) = generate_continuous(n, p; seed=512, sparse=0.5) diff --git a/test/benchmarks/robust.jl b/test/benchmarks/robust.jl new file mode 100644 index 0000000..d5a8e1c --- /dev/null +++ b/test/benchmarks/robust.jl @@ -0,0 +1,29 @@ +# Follow up from issue #147 comparing quantreg more specifically. 
+
+if DO_COMPARISONS
+    @testset "Comp-QR-147" begin
+        using CSV, DataFrames
+
+        dataset = CSV.read(download("http://freakonometrics.free.fr/rent98_00.txt"), DataFrame)
+        tau = 0.3
+
+        y = Vector(dataset[!,:rent_euro])
+        X = Matrix(dataset[!,[:area, :yearc]])
+        X1 = hcat(X[:,1], X[:, 2], ones(size(X, 1)))
+
+        qr = QuantileRegression(tau; penalty=:none)
+        obj = objective(qr, X, y)
+
+        θ_lbfgs = fit(qr, X, y)
+        @test isapprox(obj(θ_lbfgs), 226_639, rtol=1e-4)
+
+        # in this case QR with BR method does better
+        θ_qr_br = rcopy(getproperty(QUANTREG, :rq_fit_br)(X1, y; tau=tau))[:coefficients]
+        @test isapprox(obj(θ_qr_br), 207_551, rtol=1e-4)
+
+        # lasso doesn't
+        θ_qr_lasso = rcopy(getproperty(QUANTREG, :rq_fit_lasso)(X1, y; tau=tau))[:coefficients]
+        obj(θ_qr_lasso) # 229_172
+        @test 228_000 ≤ obj(θ_qr_lasso) ≤ 231_000
+    end
+end
diff --git a/test/fit/quantile.jl b/test/fit/quantile.jl
index f1e6105..e4ce57a 100644
--- a/test/fit/quantile.jl
+++ b/test/fit/quantile.jl
@@ -46,16 +46,27 @@ y1a = outlify(y1, 0.1)
     θ_ls = fit(LinearRegression(), X, y1a)
     θ_lbfgs = fit(rr, X, y1a, solver=LBFGS())
     θ_iwls = fit(rr, X, y1a, solver=IWLSCG())
-    θ_qr_br = rcopy(QUANTREG.rq_fit_br(X1, y1a))[:coefficients]
-    θ_qr_fnb = rcopy(QUANTREG.rq_fit_fnb(X1, y1a))[:coefficients]
+    θ_qr_br = rcopy(QUANTREG.rq_fit_br(X1, y1a, tau=δ))[:coefficients]
+    θ_qr_fnb = rcopy(QUANTREG.rq_fit_fnb(X1, y1a, tau=δ))[:coefficients]
 
     # NOTE: we take θ_qr_br as reference point
     @test isapprox(J(θ_ls), 505.45286, rtol=1e-5)
     @test J(θ_qr_br) ≈ 409.570777 # <- ref value
     # Their IP algorithm essentially gives the same answer
     @test (J(θ_qr_fnb) - J(θ_qr_br)) ≤ 1e-10
     # Our algorithms are close enough
-    @test isapprox(J(θ_lbfgs), 409.57154, rtol=1e-5)
+    @test isapprox(J(θ_lbfgs), 409.57608, rtol=1e-5)
     @test isapprox(J(θ_iwls), 409.59, rtol=1e-4)
+
+    # Let's try this again but with a δ different from 0.5
+    δ = 0.75
+    rr = QuantileRegression(δ, lambda=0)
+    J = objective(rr, X, y1a)
+
+    θ_lbfgs = fit(rr, X, y1a, solver=LBFGS())
+    θ_qr_br = rcopy(QUANTREG.rq_fit_br(X1, y1a, tau=δ))[:coefficients]
+
+    @test isapprox(J(θ_qr_br), 404.6161, rtol=1e-4)
+    @test isapprox(J(θ_lbfgs), 404.6195, rtol=1e-4)
 end
 end
@@ -63,7 +74,7 @@ end
 ## With Sparsity penalty ##
 ###########################
 
 n, p = 500, 100
 ((X, y, θ), (X1, y1, θ1)) = generate_continuous(n, p; seed=51112, sparse=0.1)
 # pepper with outliers
 y1a = outlify(y1, 0.1)
@@ -90,9 +101,9 @@ y1a = outlify(y1, 0.1)
     θ_ls = X1 \ y1a
     θ_fista = fit(rr, X, y1a, solver=FISTA())
     θ_ista = fit(rr, X, y1a, solver=ISTA())
-    θ_qr_lasso = rcopy(QUANTREG.rq_fit_lasso(X1, y1a))[:coefficients]
+    θ_qr_lasso = rcopy(QUANTREG.rq_fit_lasso(X1, y1a, lambda=λ))[:coefficients]
     @test isapprox(J(θ_ls), 888.3748, rtol=1e-5)
-    @test isapprox(J(θ_qr_lasso), 425.5, rtol=1e-3)
+    @test isapprox(J(θ_qr_lasso), 425.2, rtol=1e-3)
     # Our algorithms are close enough
     @test isapprox(J(θ_fista), 425.0526, rtol=1e-5)
     @test isapprox(J(θ_ista), 425.4113, rtol=1e-5)
@@ -101,6 +112,6 @@ y1a = outlify(y1, 0.1)
     @test nnz(θ_fista) == 88
     @test nnz(θ_ista) == 82
     # in this case fista is best
-    @test J(θ_fista) < J(θ_ista) < J(θ_qr_lasso)
+    @test J(θ_fista) < J(θ_qr_lasso) < J(θ_ista)
 end
 end
diff --git a/test/loss-penalty/robust.jl b/test/loss-penalty/robust.jl
index 45e98ed..0abfbfa 100644
--- a/test/loss-penalty/robust.jl
+++ b/test/loss-penalty/robust.jl
@@ -5,7 +5,7 @@ y = randn(rng, 10)
 r = x .- y
 
 @testset "Robust Loss" begin
-    δ = 0.5
+    δ = 0.75
     rlδ = RobustLoss(Huber(δ))
     @test rlδ isa RobustLoss{HuberRho{δ}}
     @test rlδ(r) == rlδ(x, y) == sum(ifelse(abs(rᵢ)≤δ, rᵢ^2/2, δ*(abs(rᵢ)-δ/2)) for rᵢ in r)
diff --git a/test/runtests.jl b/test/runtests.jl
index 3d7099c..73cba12 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -3,7 +3,8 @@ using Random, StableRNGs, DataFrames, ForwardDiff
 import Optim
 import MLJ, MLJBase
 
-DO_COMPARISONS = false; include("testutils.jl")
+DO_COMPARISONS = true
+include("testutils.jl")
 
 m("UTILS"); include("utils.jl")
 
@@ -31,3 +32,7 @@ m("MLJ", false); begin
     mm("fit-predict"); include("interface/fitpredict.jl")
     mm("extras"); include("interface/extras.jl")
 end
+
+m("MISC", false); begin
+    mm("benchmarks"); include("benchmarks/robust.jl")
+end
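For reference, a small self-contained sketch of the quantile conventions this patch settles on (an editor's illustration, not part of the patch: the `pinball` helper, the synthetic data, and the tolerance band are made up for the example; the exported names and call patterns are the ones used in the diff above):

```julia
using MLJLinearModels, Statistics, Random

# Textbook pinball (check) loss at level δ for a residual u = y - ŷ.
pinball(u, δ) = u * (δ - (u < 0))

# The quantile rho defined above computes sum(@. -r * (δ - (r >= 0))),
# which is, term by term, the pinball loss of the flipped residual -r.
rng = Random.MersenneTwister(42)
r   = randn(rng, 100)
δ   = 0.3
rl  = RobustLoss(Quantile(δ))
@assert rl(r) ≈ sum(pinball(-rᵢ, δ) for rᵢ in r)

# At δ = 0.5 the loss reduces to |r|/2, whichever residual convention is
# used, and the fit approximates the conditional median: roughly half of
# the targets should end up below the fitted hyperplane (a loose check).
n, p = 1_000, 3
X = randn(rng, n, p)
y = X * [1.0, -2.0, 0.5] .+ randn(rng, n)
qr = QuantileRegression(0.5; penalty=:none)
θ  = fit(qr, X, y)
ŷ  = hcat(X, ones(n)) * θ  # the intercept coefficient comes last, as in the tests
@assert 0.45 ≤ mean(y .≤ ŷ) ≤ 0.55
```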