In [2]:
using Pkg
Pkg.activate(@__DIR__)
Pkg.instantiate()

using DataFrames
using MLJTransforms     # dev ..
using Plots
using CSV
using HTTP
using MLJ
using ScientificTypes
using Random

[32m[1m  Activating[22m[39m project at `~/Documents/GitHub/MLJTransforms/examples`


In [3]:
HTTP.download("https://raw.githubusercontent.com/JuliaAI/Imbalance.jl/dev/docs/src/examples/smoten_mushroom/mushrooms.csv", ".")
df = CSV.read("./mushrooms.csv", DataFrame)

# Display the first 5 rows with DataFrames
first(df, 5)

│ Use: io = BufferedInputStream(http::HTTP.Stream) instead.
│ See: https://github.com/BioJulia/BufferedStreams.jl
└ @ HTTP.Streams /Users/essam/.julia/packages/HTTP/sJD5V/src/Streams.jl:240
┌ Info: Downloading
│   source = https://raw.githubusercontent.com/JuliaAI/Imbalance.jl/dev/docs/src/examples/smoten_mushroom/mushrooms.csv
│   dest = ./mushrooms.csv
│   progress = NaN
│   time_taken = 0.31 s
│   time_remaining = NaN s
│   average_speed = 1.143 MiB/s
│   downloaded = 365.237 KiB
│   remaining = ∞ B
│   total = ∞ B
└ @ HTTP /Users/essam/.julia/packages/HTTP/sJD5V/src/download.jl:132


Row,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,stalk-shape,stalk-root,stalk-surface-above-ring,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
Unnamed: 0_level_1,String1,String1,String1,String1,String1,String1,String1,String1,String1,String1,String1,String1,String1,String1,String1,String1,String1,String1,String1,String1,String1,String1,String1
1,p,x,s,n,t,p,f,c,n,k,e,e,s,s,w,w,p,w,o,p,k,s,u
2,e,x,s,y,t,a,f,c,b,k,e,c,s,s,w,w,p,w,o,p,n,n,g
3,e,b,s,w,t,l,f,c,b,n,e,c,s,s,w,w,p,w,o,p,n,n,m
4,p,x,y,w,t,p,f,c,n,n,e,e,s,s,w,w,p,w,o,p,k,s,u
5,e,x,s,g,f,n,f,w,b,k,t,e,s,s,w,w,p,w,o,e,n,a,g


In [4]:
df = coerce(df, autotype(df, :few_to_finite))
ScientificTypes.schema(df)

┌──────────────────────────┬────────────────┬───────────────────────────────────
│[22m names                    [0m│[22m scitypes       [0m│[22m types                           [0m ⋯
├──────────────────────────┼────────────────┼───────────────────────────────────
│ class                    │ Multiclass{2}  │ CategoricalValue{String1, UInt32 ⋯
│ cap-shape                │ Multiclass{6}  │ CategoricalValue{String1, UInt32 ⋯
│ cap-surface              │ Multiclass{4}  │ CategoricalValue{String1, UInt32 ⋯
│ cap-color                │ Multiclass{10} │ CategoricalValue{String1, UInt32 ⋯
│ bruises                  │ Multiclass{2}  │ CategoricalValue{String1, UInt32 ⋯
│ odor                     │ Multiclass{9}  │ CategoricalValue{String1, UInt32 ⋯
│ gill-attachment          │ Multiclass{2}  │ CategoricalValue{String1, UInt32 ⋯
│ gill-spacing             │ Multiclass{2}  │ CategoricalValue{String1, UInt32 ⋯
│ gill-size                │ Multiclass{2}  │ CategoricalValue{String1, UInt32 ⋯
│

In [5]:
y, X = unpack(df, ==(:class); rng=123);
first(X, 5)

Row,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,stalk-shape,stalk-root,stalk-surface-above-ring,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
Unnamed: 0_level_1,Cat…,Cat…,Cat…,Cat…,Cat…,Cat…,Cat…,Cat…,Cat…,Cat…,Cat…,Cat…,Cat…,Cat…,Cat…,Cat…,Cat…,Cat…,Cat…,Cat…,Cat…,Cat…
1,f,f,n,t,n,f,c,b,w,t,b,s,s,g,g,p,w,o,p,k,v,d
2,f,f,n,t,n,f,c,b,w,t,b,s,s,w,p,p,w,o,p,n,y,d
3,b,s,y,t,l,f,c,b,k,e,c,s,s,w,w,p,w,o,p,k,s,g
4,f,y,e,f,m,f,c,b,w,e,c,k,y,c,c,p,w,n,n,w,c,d
5,x,y,n,f,n,f,w,n,w,e,b,f,f,w,n,p,w,o,e,w,v,l


In [6]:
train_inds, test_inds = partition(eachindex(y), 0.8, shuffle=true, stratify=y, rng=Random.Xoshiro(42))
X_train, X_test = X[train_inds, :], X[test_inds, :]
y_train, y_test = y[train_inds], y[test_inds]

(CategoricalArrays.CategoricalValue{String1, UInt32}[String1("p"), String1("p"), String1("e"), String1("p"), String1("p"), String1("e"), String1("p"), String1("e"), String1("p"), String1("e")  …  String1("p"), String1("e"), String1("e"), String1("p"), String1("p"), String1("p"), String1("e"), String1("e"), String1("e"), String1("p")], CategoricalArrays.CategoricalValue{String1, UInt32}[String1("p"), String1("p"), String1("e"), String1("p"), String1("p"), String1("e"), String1("p"), String1("e"), String1("e"), String1("e")  …  String1("p"), String1("p"), String1("e"), String1("e"), String1("p"), String1("p"), String1("p"), String1("e"), String1("e"), String1("e")])

In [10]:
RandomForestClassifier = @load RandomForestClassifier pkg=DecisionTree

encoder = TargetEncoder(encode_ordinal = false, lambda = 0.1, m=0)
clf = RandomForestClassifier()
pipe = encoder |> clf

import MLJDecisionTreeInterface ✔


┌ Info: For silent loading, specify `verbosity=0`. 
└ @ Main /Users/essam/.julia/packages/MLJModels/Lwfeb/src/loading.jl:159


ProbabilisticPipeline(
  target_encoder = TargetEncoder(
        cols = Symbol[], 
        exclude_cols = true, 
        encode_ordinal = false, 
        lambda = 0.1, 
        m = 0), 
  random_forest_classifier = RandomForestClassifier(
        max_depth = -1, 
        min_samples_leaf = 1, 
        min_samples_split = 2, 
        min_purity_increase = 0.0, 
        n_subfeatures = -1, 
        n_trees = 100, 
        sampling_fraction = 0.7, 
        feature_importance = :impurity, 
        rng = Random._GLOBAL_RNG()), 
  cache = true)

In [11]:
mach = fit!(machine(pipe, X_train, y_train))

┌ Info: Training machine(ProbabilisticPipeline(target_encoder = TargetEncoder(cols = Symbol[], …), …), …).
└ @ MLJBase /Users/essam/.julia/packages/MLJBase/qETMX/src/machines.jl:499
┌ Info: Training machine(:target_encoder, …).
└ @ MLJBase /Users/essam/.julia/packages/MLJBase/qETMX/src/machines.jl:499
┌ Error: Problem fitting the machine machine(:target_encoder, …). 
└ @ MLJBase /Users/essam/.julia/packages/MLJBase/qETMX/src/machines.jl:694
┌ Info: Running type checks... 
└ @ MLJBase /Users/essam/.julia/packages/MLJBase/qETMX/src/machines.jl:699
│ supports. Suppress this type check by specifying `scitype_check_level=0`.
│ 
│ Run `@doc MLJTransforms.TargetEncoder` to learn more about your model's requirements.
│ 
│ Commonly, but non exclusively, supervised models are constructed using the syntax
│ `machine(model, X, y)` or `machine(model, X, y, w)` while most other models are
│ constructed with `machine(model, X)`.  Here `X` are features, `y` a target, and `w`
│ sample or class weights.

CompositeException: TaskFailedException

    nested task error: MethodError: no method matching fit(::TargetEncoder{Float64, Int64, Vector{Symbol}}, ::Int64, ::DataFrame)
    
    Closest candidates are:
      fit(::TargetEncoder, ::Int64, ::Any, !Matched::Any)
       @ MLJTransforms ~/Documents/GitHub/MLJTransforms/src/target_encoding/interface_mlj.jl:65
      fit(!Matched::MLJModels.DeterministicConstantRegressor, ::Int64, ::Any, !Matched::Any)
       @ MLJModels ~/.julia/packages/MLJModels/Lwfeb/src/builtins/Constant.jl:32
      fit(!Matched::MLJDecisionTreeInterface.RandomForestClassifier, ::Int64, ::Any, !Matched::Any, !Matched::Any, !Matched::Any)
       @ MLJDecisionTreeInterface ~/.julia/packages/MLJDecisionTreeInterface/CXTSl/src/MLJDecisionTreeInterface.jl:127
      ...
    
    Stacktrace:
     [1] fit_only!(mach::Machine{Symbol, Any, true}, wait_on_upstream::Bool; kwargs::@Kwargs{verbosity::Int64, composite::MLJBase.ProbabilisticPipeline{@NamedTuple{target_encoder::Unsupervised, random_forest_classifier::Probabilistic}, MLJModelInterface.predict}})
       @ MLJBase ~/.julia/packages/MLJBase/qETMX/src/machines.jl:767
     [2] fit_only!
       @ MLJBase ~/.julia/packages/MLJBase/qETMX/src/machines.jl:746 [inlined]
     [3] (::MLJBase.var"#80#82"{@Kwargs{verbosity::Int64, composite::MLJBase.ProbabilisticPipeline{@NamedTuple{target_encoder::Unsupervised, random_forest_classifier::Probabilistic}, MLJModelInterface.predict}}, Machine{Symbol, Any, true}})()
       @ MLJBase ~/.julia/packages/MLJBase/qETMX/src/composition/learning_networks/nodes.jl:237
    
    caused by: MethodError: no method matching fit(::TargetEncoder{Float64, Int64, Vector{Symbol}}, ::Int64, ::DataFrame)
    
    Closest candidates are:
      fit(::TargetEncoder, ::Int64, ::Any, !Matched::Any)
       @ MLJTransforms ~/Documents/GitHub/MLJTransforms/src/target_encoding/interface_mlj.jl:65
      fit(!Matched::MLJModels.DeterministicConstantRegressor, ::Int64, ::Any, !Matched::Any)
       @ MLJModels ~/.julia/packages/MLJModels/Lwfeb/src/builtins/Constant.jl:32
      fit(!Matched::MLJDecisionTreeInterface.RandomForestClassifier, ::Int64, ::Any, !Matched::Any, !Matched::Any, !Matched::Any)
       @ MLJDecisionTreeInterface ~/.julia/packages/MLJDecisionTreeInterface/CXTSl/src/MLJDecisionTreeInterface.jl:127
      ...
    
    Stacktrace:
     [1] fit_only!(mach::Machine{Symbol, Any, true}; rows::Nothing, verbosity::Int64, force::Bool, composite::MLJBase.ProbabilisticPipeline{@NamedTuple{target_encoder::Unsupervised, random_forest_classifier::Probabilistic}, MLJModelInterface.predict})
       @ MLJBase ~/.julia/packages/MLJBase/qETMX/src/machines.jl:692
     [2] fit_only!
       @ ~/.julia/packages/MLJBase/qETMX/src/machines.jl:617 [inlined]
     [3] fit_only!(mach::Machine{Symbol, Any, true}, wait_on_upstream::Bool; kwargs::@Kwargs{verbosity::Int64, composite::MLJBase.ProbabilisticPipeline{@NamedTuple{target_encoder::Unsupervised, random_forest_classifier::Probabilistic}, MLJModelInterface.predict}})
       @ MLJBase ~/.julia/packages/MLJBase/qETMX/src/machines.jl:763
     [4] fit_only!
       @ MLJBase ~/.julia/packages/MLJBase/qETMX/src/machines.jl:746 [inlined]
     [5] (::MLJBase.var"#80#82"{@Kwargs{verbosity::Int64, composite::MLJBase.ProbabilisticPipeline{@NamedTuple{target_encoder::Unsupervised, random_forest_classifier::Probabilistic}, MLJModelInterface.predict}}, Machine{Symbol, Any, true}})()
       @ MLJBase ~/.julia/packages/MLJBase/qETMX/src/composition/learning_networks/nodes.jl:237

In [9]:
mach = machine(clf, X_train_enc, y_train)
fit!(mach)
cv=CV(nfolds=5)
evaluate!(mach, resampling=cv, measure=accuracy)

┌ Info: Training machine(RandomForestClassifier(max_depth = -1, …), …).
└ @ MLJBase /Users/essam/.julia/packages/MLJBase/qETMX/src/machines.jl:499


PerformanceEvaluation object with these fields:
  model, measure, operation,
  measurement, per_fold, per_observation,
  fitted_params_per_fold, report_per_fold,
  train_test_rows, resampling, repeats
Extract:
┌────────────┬──────────────┬─────────────┐
│[22m measure    [0m│[22m operation    [0m│[22m measurement [0m│
├────────────┼──────────────┼─────────────┤
│ Accuracy() │ predict_mode │ 1.0         │
└────────────┴──────────────┴─────────────┘
┌───────────────────────────┬─────────┐
│[22m per_fold                  [0m│[22m 1.96*SE [0m│
├───────────────────────────┼─────────┤
│ [1.0, 1.0, 1.0, 1.0, 1.0] │ 0.0     │
└───────────────────────────┴─────────┘
