Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

trim and winsor now maintain original order #546

Merged
merged 8 commits into from
Feb 17, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
107 changes: 57 additions & 50 deletions src/robust.jl
Original file line number Diff line number Diff line change
Expand Up @@ -7,96 +7,103 @@
#############################

# Trimmed set
"Return the upper and lower bound elements used by `trim` and `winsor`"
function uplo(x::AbstractVector; prop::Real=0.0, count::Integer=0)
n = length(x)
n > 0 || throw(ArgumentError("x can not be empty."))

if count == 0
0 <= prop < 0.5 || throw(ArgumentError("prop must satisfy 0 ≤ prop < 0.5."))
count = floor(Int, n * prop)
else
prop == 0 || throw(ArgumentError("prop and count can not both be > 0."))
0 <= count < n/2 || throw(ArgumentError("count must satisfy 0 ≤ count < length(x)/2."))
end

# indices for lowest count values
x2 = copy(x)
lo = partialsort!(x2, 1:count+1)[end]
# indices for largest count values
up = partialsort!(x2, n-count:n)[1]

up, lo
end

"""
trim(x; prop=0.0, count=0)
trim(x::AbstractVector; prop=0.0, count=0)

Return an iterator of all elements of `x` that omits either `count` or proportion
`prop` of the highest and lowest elements.

Return a copy of `x` with either `count` or proportion `prop` of the highest
and lowest elements removed. To compute the trimmed mean of `x` use
`mean(trim(x))`; to compute the variance use `trimvar(x)` (see [`trimvar`](@ref)).
The number of trimmed elements could be smaller than specified if several
elements equal the lower or upper bound.

To compute the trimmed mean of `x` use `mean(trim(x))`;
to compute the variance use `trimvar(x)` (see [`trimvar`](@ref)).

# Example
```julia
julia> trim([1,2,3,4,5], prop=0.2)
julia> collect(trim([5,2,4,3,1], prop=0.2))
3-element Array{Int64,1}:
2
3
4
3
```
"""
function trim(x::AbstractVector; prop::Real=0.0, count::Integer=0)
trim!(copy(x); prop=prop, count=count)
up, lo = uplo(x; prop=prop, count=count)

(xi for xi in x if lo <= xi <= up)
end

"""
trim!(x; prop=0.0, count=0)
trim!(x::AbstractVector; prop=0.0, count=0)

A variant of [`trim`](@ref) that modifies `x` in place.
"""
function trim!(x::AbstractVector; prop::Real=0.0, count::Integer=0)
n = length(x)
n > 0 || throw(ArgumentError("x can not be empty."))

if count == 0
0 <= prop < 0.5 || throw(ArgumentError("prop must satisfy 0 ≤ prop < 0.5."))
count = floor(Int, n * prop)
else
prop == 0 || throw(ArgumentError("prop and count can not both be > 0."))
0 <= count < n/2 || throw(ArgumentError("count must satisfy 0 ≤ count < length(x)/2."))
end

partialsort!(x, (n-count+1):n)
partialsort!(x, 1:count)
deleteat!(x, (n-count+1):n)
deleteat!(x, 1:count)

up, lo = uplo(x; prop=prop, count=count)
ix = (i for (i,xi) in enumerate(x) if lo > xi || xi > up)
deleteat!(x, ix)
return x
end

"""
winsor(x; prop=0.0, count=0)
winsor(x::AbstractVector; prop=0.0, count=0)

Return a copy of `x` with either `count` or proportion `prop` of the lowest
elements of `x` replaced with the next-lowest, and an equal number of the
highest elements replaced with the previous-highest. To compute the Winsorized
mean of `x` use `mean(winsor(x))`.
Return an iterator of all elements of `x` that replaces either `count` or
proportion `prop` of the highest elements with the previous-highest element
and an equal number of the lowest elements with the next-lowest element.

The number of replaced elements could be smaller than specified if several
elements equal the lower or upper bound.

To compute the Winsorized mean of `x` use `mean(winsor(x))`.

# Example
```julia
julia> winsor([1,2,3,4,5], prop=0.2)
julia> collect(winsor([5,2,3,4,1], prop=0.2))
5-element Array{Int64,1}:
2
4
2
3
4
4
2
```
"""
function winsor(x::AbstractVector; prop::Real=0.0, count::Integer=0)
winsor!(copy(x); prop=prop, count=count)
up, lo = uplo(x; prop=prop, count=count)

(clamp(xi, lo, up) for xi in x)
end

"""
winsor!(x; prop=0.0, count=0)
winsor!(x::AbstractVector; prop=0.0, count=0)

A variant of [`winsor`](@ref) that modifies vector `x` in place.
"""
function winsor!(x::AbstractVector; prop::Real=0.0, count::Integer=0)
n = length(x)
n > 0 || throw(ArgumentError("x can not be empty."))

if count == 0
0 <= prop < 0.5 || throw(ArgumentError("prop must satisfy 0 ≤ prop < 0.5."))
count = floor(Int, n * prop)
else
prop == 0 || throw(ArgumentError("prop and count can not both be > 0."))
0 <= count < n/2 || throw(ArgumentError("count must satisfy 0 ≤ count < length(x)/2."))
end

partialsort!(x, (n-count+1):n)
partialsort!(x, 1:count)
x[1:count] .= x[count+1]
x[n-count+1:end] .= x[n-count]

copyto!(x, winsor(x; prop=prop, count=count))
return x
end

Expand Down
41 changes: 21 additions & 20 deletions test/robust.jl
Original file line number Diff line number Diff line change
Expand Up @@ -3,39 +3,40 @@ using Test

### Trimming outliers

@test trim([1,2,3,4,5,6,7,8], prop=0.1) == [1,2,3,4,5,6,7,8]
@test trim([1,2,3,4,5,6,7,8], prop=0.2) == [2,3,4,5,6,7]
@test trim([1,2,3,4,5,6,7,8,9], prop=0.4) == [4,5,6]
@test trim([1,2,3,4,5,6,7,8], count=1) == [2,3,4,5,6,7]
@test trim([1,2,3,4,5,6,7,8,9], count=3) == [4,5,6]
@test collect(trim([8,2,3,4,5,6,7,1], prop=0.1)) == [8,2,3,4,5,6,7,1]
@test collect(trim([8,2,3,4,5,6,7,1], prop=0.2)) == [2,3,4,5,6,7]
@test collect(trim([1,2,3,4,5,6,7,8,9], prop=0.4)) == [4,5,6]
@test collect(trim([8,7,6,5,4,3,2,1], count=1)) == [7,6,5,4,3,2]
@test collect(trim([1,2,3,4,5,6,7,8,9], count=3)) == [4,5,6]


@test_throws ArgumentError trim([])
@test_throws ArgumentError trim([1,2,3,4,5], prop=0.5)

@test trim!([1,2,3,4,5,6,7,8], prop=0.1) == [1,2,3,4,5,6,7,8]
@test trim!([1,2,3,4,5,6,7,8], prop=0.2) == [2,3,4,5,6,7]
@test trim!([1,2,3,4,5,6,7,8,9], prop=0.4) == [4,5,6]
@test trim!([1,2,3,4,5,6,7,8], count=1) == [2,3,4,5,6,7]
@test trim!([1,2,3,4,5,6,7,8,9], count=3) == [4,5,6]
@test collect(trim!([8,2,3,4,5,6,7,1], prop=0.1)) == [8,2,3,4,5,6,7,1]
@test collect(trim!([8,2,3,4,5,6,7,1], prop=0.2)) == [2,3,4,5,6,7]
@test collect(trim!([1,2,3,4,5,6,7,8,9], prop=0.4)) == [4,5,6]
@test collect(trim!([8,7,6,5,4,3,2,1], count=1)) == [7,6,5,4,3,2]
@test collect(trim!([1,2,3,4,5,6,7,8,9], count=3)) == [4,5,6]

@test_throws ArgumentError trim!([])
@test_throws ArgumentError trim!([1,2,3,4,5], prop=0.5)

@test winsor([1,2,3,4,5,6,7,8], prop=0.1) == [1,2,3,4,5,6,7,8]
@test winsor([1,2,3,4,5,6,7,8], prop=0.2) == [2,2,3,4,5,6,7,7]
@test winsor([1,2,3,4,5,6,7,8,9], prop=0.4) == [4,4,4,4,5,6,6,6,6]
@test winsor([1,2,3,4,5,6,7,8], count=1) == [2,2,3,4,5,6,7,7]
@test winsor([1,2,3,4,5,6,7,8,9], count=3) == [4,4,4,4,5,6,6,6,6]
@test collect(winsor([8,2,3,4,5,6,7,1], prop=0.1)) == [8,2,3,4,5,6,7,1]
@test collect(winsor([8,2,3,4,5,6,7,1], prop=0.2)) == [7,2,3,4,5,6,7,2]
@test collect(winsor([1,2,3,4,5,6,7,8,9], prop=0.4)) == [4,4,4,4,5,6,6,6,6]
@test collect(winsor([1,2,3,4,5,6,7,8], count=1)) == [2,2,3,4,5,6,7,7]
@test collect(winsor([8,7,6,5,4,3,2,1], count=1)) == [7,7,6,5,4,3,2,2]
@test collect(winsor([1,2,3,4,5,6,7,8,9], count=3)) == [4,4,4,4,5,6,6,6,6]

@test_throws ArgumentError winsor([])
@test_throws ArgumentError winsor([1,2,3,4,5], prop=0.5)

@test winsor!([1,2,3,4,5,6,7,8], prop=0.1) == [1,2,3,4,5,6,7,8]
@test winsor!([1,2,3,4,5,6,7,8], prop=0.2) == [2,2,3,4,5,6,7,7]
@test winsor!([1,2,3,4,5,6,7,8,9], prop=0.4) == [4,4,4,4,5,6,6,6,6]
@test winsor!([1,2,3,4,5,6,7,8], count=1) == [2,2,3,4,5,6,7,7]
@test winsor!([1,2,3,4,5,6,7,8,9], count=3) == [4,4,4,4,5,6,6,6,6]
@test collect(winsor!([8,2,3,4,5,6,7,1], prop=0.1)) == [8,2,3,4,5,6,7,1]
@test collect(winsor!([8,2,3,4,5,6,7,1], prop=0.2)) == [7,2,3,4,5,6,7,2]
@test collect(winsor!([1,2,3,4,5,6,7,8,9], prop=0.4)) == [4,4,4,4,5,6,6,6,6]
@test collect(winsor!([8,7,6,5,4,3,2,1], count=1)) == [7,7,6,5,4,3,2,2]
@test collect(winsor!([1,2,3,4,5,6,7,8,9], count=3)) == [4,4,4,4,5,6,6,6,6]

@test_throws ArgumentError winsor!([])
@test_throws ArgumentError winsor!([1,2,3,4,5], prop=0.5)
Expand Down