Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Outliernicer #41

Merged
merged 5 commits into from
May 22, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
516 changes: 190 additions & 326 deletions docs/StatifierNotebook.jl.ipynb

Large diffs are not rendered by default.

35 changes: 3 additions & 32 deletions src/TSML.jl
Original file line number Diff line number Diff line change
@@ -1,59 +1,31 @@
module TSML

#if "LOAD_SK_CARET" in keys(ENV) && ENV["LOAD_SK_CARET"] == "true" # to disable precompile for binary libs
# __precompile__(false)
#elseif "LOAD_SK_CARET" in keys(ENV) && ENV["LOAD_SK_CARET"] == "false"
# __precompile__(true) # no binary libs
#else
# __precompile__(false) # assume default has binary libs
#end

__precompile__(false) # assume default has binary libs

export greet
export testall
export mrun,prun # from DataProc
export mergedict
export multirun
export matrifyrun, dateifierrun
export datevalgatorrun, datevalizerrun, datevalnnerrun
using Dates

greet() = print("Hello World!")

include("system.jl")
using .System

include("types.jl")
using .TSMLTypes
#export typerun


include("utils.jl")
using .Utils

include("dataproc.jl")
using .DataProc
#export mrun
#export prun

include("transformers.jl")
using .TSMLTransformers

include("baseline.jl")
using .BaselineAlgos
#export baselinerun

if LIB_SKL_AVAILABLE # from System module
include("scikitlearn.jl")
using .SKLearners
#export skkrun
end

if LIB_CRT_AVAILABLE # from System module
include("caret.jl")
using .CaretLearners
#export caretrun
end

include("multilearner.jl")
Expand All @@ -64,19 +36,15 @@ using .DecisionTreeLearners

include("datareader.jl")
using .DataReaders
#export datareaderrun

include("datawriter.jl")
using .DataWriters
#export datawriterrun

include("statifier.jl")
using .Statifiers
#export fullstat, statifierrun

include("monotonicer.jl")
using .Monotonicers
#export monotonicerrun

include("cliwrapper.jl")
using .CLIWrappers
Expand All @@ -85,4 +53,7 @@ export tsmlrun
include("tsclassifier.jl")
using .TSClassifiers

include("outliernicer.jl")
using .Outliernicers

end # module
4 changes: 2 additions & 2 deletions src/datareader.jl
Original file line number Diff line number Diff line change
Expand Up @@ -41,14 +41,14 @@ end
"""
    fit!(dtr::DataReader, x=[], y=[])

Validate the reader configuration and store it as the fitted model.

Reads `:filename` and `:dateformat` from `dtr.args`. Raises an error when either
is an empty string, and a second, more detailed error when the filename does not
point to an existing file. On success `dtr.args` is assigned to `dtr.model`.
The `x` and `y` arguments are not used by this method.
"""
function fit!(dtr::DataReader,x::T=[],y::Vector=[]) where {T<:Union{DataFrame,Vector,Matrix}}
    fname = dtr.args[:filename]
    fmt = dtr.args[:dateformat]
    # Stage 1: both settings must be non-empty.
    if fname == "" || fmt == ""
        error("missing filename or date format")
    end
    # Stage 2: the file has to exist; report the offending values.
    isfile(fname) || error("missing filename or date format: (",fname,"), (",fmt,")")
    dtr.model = dtr.args
end

function transform!(dtr::DataReader,x::T=[]) where {T<:Union{DataFrame,Vector,Matrix}}
fullname = dtr.args[:filename]
fmt = dtr.args[:dateformat]
(fullname != "" && fmt != "") || error("missing filename or date format")
(fullname != "" && isfile(fullname) && fmt != "") || error("missing filename or date format: (",fullname,"), (",fmt,")")
fname = basename(fullname)
fname != "" || error("filename is empty")
fn,ext=split(fname,".")
Expand Down
72 changes: 72 additions & 0 deletions src/outliernicer.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
module Outliernicers

using Dates
using DataFrames
using Random
using Statistics
using StatsBase: iqr, quantile, sample

export fit!,transform!
export Outliernicer

import TSML.TSMLTypes.fit! # to overload
import TSML.TSMLTypes.transform! # to overload

using TSML.TSMLTypes
using TSML.TSMLTransformers
using TSML.Utils

"""
Outliernicer(Dict())

Detects values that fall outside the range (q25-iqr, q75+iqr),
replaces them with `missing`, and lets ValNNer fill those
missing entries from their nearest neighbors.
"""
mutable struct Outliernicer <: Transformer
    # Set to `nothing` by the constructor; `fit!` stores the argument dictionary here.
    model
    # Result of combining the built-in defaults with the caller's `args` via `mergedict`.
    args

    # Build the transformer from an optional dictionary of settings.
    Outliernicer(args=Dict()) = new(
        nothing,
        mergedict(
            Dict(
                :missdirection => :symmetric,
                :nnsize => 1,
                :dateinterval => Dates.Hour(1)
            ),
            args
        )
    )
end

"""
    fit!(st::Outliernicer, features, labels=[])

Check the shape of the input and record the settings as the fitted model.

`features` must be a `DataFrame` with exactly two columns; otherwise an error is
raised. Column names are not inspected here. On success `st.args` is assigned to
`st.model`. `labels` is not used.
"""
function fit!(st::Outliernicer, features::T, labels::Vector=[]) where {T<:Union{Vector,Matrix,DataFrame}}
    if !(features isa DataFrame)
        error("Outliernicer.fit!: data should be a dataframe: Date,Val ")
    end
    if ncol(features) != 2
        error("dataframe must have 2 columns: Date, Val")
    end
    st.model = st.args
end

"""
    transform!(st::Outliernicer, features)

Blank out outliers in the `:Value` column and let `DateValNNer` fill them in.

A non-missing value is an outlier when it is below `q25 - iqr` or above
`q75 + iqr`, with the quartiles taken over the non-missing values of the column.
Outliers are set to `missing` in a deep copy of `features` (the input is not
modified by this function), then a `DateValNNer` built from `st.args` is fitted
and applied to that copy.

# Arguments
- `st`: the transformer; `st.args` is passed unchanged to `DateValNNer`.
- `features`: a `DataFrame` with exactly the columns `:Date` and `:Value`.
  Passing `[]` short-circuits and returns an empty `DataFrame()`.

# Returns
The data frame produced by `DateValNNer`. When no `missing` remains in its
`:Value` column, the column is narrowed to a vector without `Missing` in its
element type; otherwise the column is returned as produced.

# Throws
`ErrorException` if `features` is not a data frame, does not have two columns,
or the column names are not `:Date`, `:Value`.
"""
function transform!(st::Outliernicer, features::T) where {T<:Union{Vector,Matrix,DataFrame}}
    features != [] || return DataFrame()
    features isa DataFrame || error("Outliernicer.transform!: data should be a dataframe: Date,Val ")
    ncol(features) == 2 || error("dataframe must have 2 columns: Date, Val")
    sum(names(features) .== (:Date,:Value)) == 2 || error("wrong column names")

    mfeatures = deepcopy(features)
    rvals = mfeatures[:Value]

    # Working copy whose element type can hold both numbers and `missing`.
    mvals = Vector{Union{Missing,eltype(rvals)}}(rvals)

    # Quartiles over the non-missing values; the IQR is their difference,
    # so a single quantile call is enough.
    q25, q75 = quantile(skipmissing(rvals), [0.25, 0.75])
    miqr = q75 - q25
    lower = q25 - miqr
    upper = q75 + miqr

    # Mark everything outside [lower, upper] as missing.
    missindx = findall(x -> !ismissing(x) && (x > upper || x < lower), rvals)
    mvals[missindx] .= missing
    mfeatures[:Value] = mvals

    # Use ValNNer to replace the missings.
    valnner = DateValNNer(st.args)
    fit!(valnner, mfeatures)
    resdf = transform!(valnner, mfeatures)

    # Narrow the column type only when every gap was filled. If the imputer
    # left some `missing` behind, dropping them would shorten the vector and
    # make the column assignment fail, so the column is kept as is.
    filled = resdf[:Value]
    if !any(ismissing, filled)
        resdf[:Value] = collect(skipmissing(filled))
    end
    return resdf
end

end
1 change: 1 addition & 0 deletions test/runtests.jl
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ include("test_statifier.jl")
include("test_monotonicer.jl")
include("test_cliwrapper.jl")
include("test_tsclassifier.jl")
include("test_outliernicer.jl")

if LIB_SKL_AVAILABLE
include("test_scikitlearn.jl")
Expand Down
120 changes: 120 additions & 0 deletions test/test_outliernicer.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,120 @@
module TestOutliernicer

using TSML
using TSML.Utils
using TSML.TSMLTypes
using TSML.TSMLTransformers

using TSML.Outliernicers
using TSML.Monotonicers
using TSML.DataReaders
using TSML.Statifiers

using DataFrames
using Dates
using Random
using StatsBase: sample, mean
using Test

# Build an hourly synthetic series, overwrite roughly a tenth of it with values
# drawn from a much wider pool, run it through Outliernicer and compare the
# mean of the cleaned column with the recorded reference value.
function test_artificialdata()
    Random.seed!(123)
    stamps = DateTime(2017,1,1):Dates.Hour(1):DateTime(2017,6,1)
    npoints = length(stamps)
    series = rand(1:1000, npoints)
    # replacement values come from 500:10000 and -10000:500
    pool = vcat(500:10000, -10000:500)
    injected = rand(pool, div(npoints, 10))
    # positions are sampled with the default `sample` settings
    positions = sample(1:npoints, length(injected))
    series[positions] = injected
    input = DataFrame(Date=stamps, Value=series)
    cleaner = Outliernicer(Dict(:dateinterval => Dates.Hour(1)))
    fit!(cleaner, input)
    cleaned = transform!(cleaner, input)
    @test round(mean(cleaned[:Value]), digits=2) == 510.32
end

@testset "Outliernicer: using artificial data" begin
    test_artificialdata()
end


# Run three pipelines over data/testdata.csv that differ only in where (or
# whether) the nearest-neighbour imputer sits relative to the other stages,
# and compare a rounded sum of the first statistics row with recorded values.
function test_basicoutlier()
    Random.seed!(123)
    hourly = Dates.Hour(1)
    fname = joinpath(dirname(pathof(TSML)),"../data/testdata.csv")
    csvfilter = DataReader(Dict(:filename=>fname,:dateformat=>"dd/mm/yyyy HH:MM"))
    valgator = DateValgator(Dict(:dateinterval=>hourly))
    valnner = DateValNNer(Dict(:dateinterval=>hourly))
    stfier = Statifier(Dict(:processmissing=>true))
    mono = Monotonicer(Dict())
    outliernicer = Outliernicer(Dict(:dateinterval=>hourly))

    # Fit and apply a pipeline made of `stages`, then reduce columns 3:20 of
    # the first result row to a single rounded number.
    function rowstat(stages)
        pipeline = Pipeline(Dict(:transformers => stages))
        fit!(pipeline)
        result = transform!(pipeline)
        return round(sum(result[1,3:20]))
    end

    # the transformer objects are shared, so the pipelines run in this order
    @test rowstat([csvfilter,valgator,mono,valnner,outliernicer,stfier]) == -213862.0
    @test rowstat([csvfilter,valgator,mono,outliernicer,stfier]) == -219595.0
    @test rowstat([csvfilter,valgator,valnner,mono,outliernicer,stfier]) == -213862.0
end

@testset "Outliernicer: readcsv |> valgator |> valnner |> mono |> outliernicer |> stfier" begin
    test_basicoutlier()
end

# Run the same processing chain over the three files in data/typedetection
# (regular, monotonic, dailymonotonic) and compare a rounded sum of the first
# statistics row with the recorded value for each file.
function test_typesoutliernicer()
    # CSV reader for a path given relative to the package source directory
    csvreader(relpath) = DataReader(Dict(
        :filename => joinpath(dirname(pathof(TSML)), relpath),
        :dateformat => "dd/mm/yyyy HH:MM"
    ))
    regularfilecsv = csvreader("../data/typedetection/regular.csv")
    monofilecsv = csvreader("../data/typedetection/monotonic.csv")
    dailymonofilecsv = csvreader("../data/typedetection/dailymonotonic.csv")

    hourly = Dates.Hour(1)
    valgator = DateValgator(Dict(:dateinterval=>hourly))
    valnner = DateValNNer(Dict(:dateinterval=>hourly))
    stfier = Statifier(Dict(:processmissing=>true))
    mono = Monotonicer(Dict())
    outliernicer = Outliernicer(Dict(:dateinterval=>hourly))

    # Put `reader` in front of the shared stages, fit and apply the pipeline,
    # then reduce columns 3:20 of the first result row to a rounded number.
    function rowstat(reader)
        pipeline = Pipeline(Dict(
            :transformers => [reader,valgator,valnner,mono,outliernicer,stfier]
        ))
        fit!(pipeline)
        result = transform!(pipeline)
        return round(sum(result[1,3:20]))
    end

    @test rowstat(regularfilecsv) == -61184.0
    @test rowstat(monofilecsv) == -890049.0
    @test rowstat(dailymonofilecsv) == -294446.0
end

@testset "Outliernicer: monotonic type and outlier detections" begin
    test_typesoutliernicer()
end

end