# Gradient Boosting Example

In [2]:
# ScikitLearn.jl supplies `@sk_import` and the `train_test_split` helper used below.
using ScikitLearn
using ScikitLearn.CrossValidation: train_test_split

# Load the boosting-tree implementation from the project source tree.
# NOTE(review): this is an absolute, machine-specific path; the notebook will not run
# elsewhere until it is adjusted. RegressionTree, GradientBoostingTrees,
# TreeStoppingRule, BoostingStoppingRule, MeanSquaredError and predict are presumably
# defined in this file -- confirm.
include("/Users/jiangtao.fu/OneDrive/code/julia/algorithms_in_julia/src/boostingtree.jl")

# Import the Python function `load_diabetes` from scikit-learn's `datasets` module.
@sk_import datasets: load_diabetes

PyObject <function load_diabetes at 0x7fb2d8a3adc0>

## Gradient Boosting Regression Example

In [3]:
# Load the diabetes dataset via the Python function imported with `@sk_import`;
# the result is indexed by string keys such as "data" and "target" below.
diabetes = load_diabetes()

Dict{Any,Any} with 7 entries:
  "feature_names"   => ["age", "sex", "bmi", "bp", "s1", "s2", "s3", "s4", "s5"…
  "target_filename" => "/Users/jiangtao.fu/.julia/conda/3/lib/python3.8/site-pa…
  "data_filename"   => "/Users/jiangtao.fu/.julia/conda/3/lib/python3.8/site-pa…
  "frame"           => nothing
  "data"            => [0.0380759 0.0506801 … 0.0199084 -0.0176461; -0.00188202…
  "target"          => [151.0, 75.0, 141.0, 206.0, 135.0, 97.0, 138.0, 63.0, 11…
  "DESCR"           => ".. _diabetes_dataset:\n\nDiabetes dataset\n------------…

In [4]:
# Pull out the feature matrix (`X`) and the regression target vector (`y`).
X, y = diabetes["data"], diabetes["target"]

([0.0380759064334241 0.0506801187398187 … 0.0199084208763183 -0.0176461251598052; -0.00188201652779104 -0.044641636506989 … -0.0683297436244215 -0.09220404962683; … ; -0.0454724779400257 -0.044641636506989 … 0.0445283740214053 -0.0259303389894746; -0.0454724779400257 -0.044641636506989 … -0.00421985970694603 0.00306440941436832], [151.0, 75.0, 141.0, 206.0, 135.0, 97.0, 138.0, 63.0, 110.0, 310.0  …  173.0, 72.0, 49.0, 64.0, 48.0, 178.0, 104.0, 132.0, 220.0, 57.0])

In [5]:
# Dimensions of the feature matrix.
size(X)

(442, 10)

In [6]:
# Split into training and test sets: `test_size=0.1` requests a 10% hold-out and
# `random_state=13` fixes the seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=13)

4-element Array{Array{Float64,N} where N,1}:
 [0.0671362140415805 0.0506801187398187 … 0.0233748412798208 0.0817644407962278; 0.0562385986885218 0.0506801187398187 … -0.0295276227417736 -0.0590671943081523; … ; 0.0453409833354632 0.0506801187398187 … -0.003303712578677 0.0196328370737072; -0.0636351701951234 -0.044641636506989 … -0.0225121719296605 -0.0590671943081523]
 [-0.00551455497881059 -0.044641636506989 … 0.0507815133629732 0.0569117993072195; -0.0200447087828888 -0.044641636506989 … -0.0823814832581028 -0.0466408735636482; … ; -0.0382074010379866 -0.044641636506989 … -0.0181182673078967 -0.0176461251598052; -0.034574862586967 0.0506801187398187 … 0.0267142576335128 0.0734802269665584]
 [172.0, 91.0, 48.0, 116.0, 185.0, 170.0, 129.0, 104.0, 85.0, 97.0  …  209.0, 67.0, 71.0, 187.0, 272.0, 220.0, 166.0, 85.0, 237.0, 214.0]
 [272.0, 90.0, 86.0, 232.0, 281.0, 88.0, 235.0, 151.0, 152.0, 98.0  …  161.0, 178.0, 154.0, 144.0, 179.0, 53.0, 140.0, 182.0, 97.0, 292.0]

In [7]:
# Evaluation metric: a MeanSquaredError object, called below as
# `mse(y_test, predictions)` to score each fitted model.
mse = MeanSquaredError()

MeanSquaredError()

Single regression tree as baseline

In [8]:
# Baseline: grid-search a single regression tree over the two TreeStoppingRule
# arguments (`n` in 2:2:20, `t` in 1:4), printing each configuration that improves
# on the best mean squared error seen so far.
# NOTE(review): the meaning of `n` and `t` is defined by TreeStoppingRule, which is
# not visible here -- confirm before interpreting the printed values.
# NOTE: configurations are ranked by their error on the held-out test split, so the
# reported minimum is an optimistic estimate of generalization error.
minerror = Inf
for n = 2:2:20, t=1:4
    # Declare `minerror` global so the update below targets the top-level variable
    # even when this code runs from a script file, where top-level loops do not get
    # the interactive soft-scope behavior.
    global minerror
    regtree = RegressionTree(X_train, y_train, TreeStoppingRule(n, t));
    currenterror = mse(y_test, predict(regtree, X_test))
    if minerror > currenterror
        minerror = currenterror
        @show "new minimal: ", t, n,  currenterror
    end
end

("new minimal: ", t, n, currenterror) = ("new minimal: ", 1, 2, 4085.026526389429)
("new minimal: ", t, n, currenterror) = ("new minimal: ", 3, 4, 3972.9084861160845)
("new minimal: ", t, n, currenterror) = ("new minimal: ", 1, 6, 3809.7870322885387)


Boosting model

In [9]:
# Gradient boosting: grid-search over the TreeStoppingRule arguments (`n` in 2:5,
# `t` in [1, 2, 4]) and the first BoostingStoppingRule argument (`m` in 1:3:50).
# Every configuration's test-set MSE is printed, followed by an extra
# "new minimal" line whenever it beats the best error seen so far.
# NOTE(review): `m` is presumably the number of boosting rounds and `n`/`t` the
# per-tree stopping parameters -- confirm against boostingtree.jl.
# NOTE: configurations are ranked by their error on the held-out test split, so the
# reported minimum is an optimistic estimate of generalization error.
minerror = Inf
for  n=2:5, m = 1:3:50, t=[1,2,4]
    # Declare `minerror` global so the update below targets the top-level variable
    # even when this code runs from a script file, where top-level loops do not get
    # the interactive soft-scope behavior.
    global minerror
    regboost = GradientBoostingTrees(X_train, y_train, MeanSquaredError(), BoostingStoppingRule(m, TreeStoppingRule(n, t)))
    currenterror = mse(y_test, predict(regboost, X_test))
    @show n, t, m,  currenterror
    if minerror > currenterror
        minerror = currenterror
        @show "new minimal: ", n, t, m,  currenterror
    end
end

(n, t, m, currenterror) = (2, 1, 1, 5109.637931635549)
("new minimal: ", n, t, m, currenterror) = ("new minimal: ", 2, 1, 1, 5109.637931635549)
(n, t, m, currenterror) = (2, 2, 1, 5109.637931635549)
(n, t, m, currenterror) = (2, 4, 1, 5109.637931635549)
(n, t, m, currenterror) = (2, 1, 4, 4552.85065120202)
("new minimal: ", n, t, m, currenterror) = ("new minimal: ", 2, 1, 4, 4552.85065120202)
(n, t, m, currenterror) = (2, 2, 4, 4552.85065120202)
(n, t, m, currenterror) = (2, 4, 4, 4552.85065120202)
(n, t, m, currenterror) = (2, 1, 7, 4123.988887603011)
("new minimal: ", n, t, m, currenterror) = ("new minimal: ", 2, 1, 7, 4123.988887603011)
(n, t, m, currenterror) = (2, 2, 7, 4123.988887603011)
(n, t, m, currenterror) = (2, 4, 7, 4123.988887603011)
(n, t, m, currenterror) = (2, 1, 10, 3914.3301493486106)
("new minimal: ", n, t, m, currenterror) = ("new minimal: ", 2, 1, 10, 3914.3301493486106)
(n, t, m, currenterror) = (2, 2, 10, 3914.3301493486106)
(n, t, m, currenterror) = (2, 4, 10, 

(n, t, m, currenterror) = (4, 2, 19, 3223.5674257777064)
(n, t, m, currenterror) = (4, 4, 19, 3162.10769222611)
(n, t, m, currenterror) = (4, 1, 22, 3228.5259087451554)
(n, t, m, currenterror) = (4, 2, 22, 3279.495361612583)
(n, t, m, currenterror) = (4, 4, 22, 3165.4427970422885)
(n, t, m, currenterror) = (4, 1, 25, 3267.696270150221)
(n, t, m, currenterror) = (4, 2, 25, 3238.4692000948326)
(n, t, m, currenterror) = (4, 4, 25, 3210.6011549105624)
(n, t, m, currenterror) = (4, 1, 28, 3257.2797595784264)
(n, t, m, currenterror) = (4, 2, 28, 3222.852800649885)
(n, t, m, currenterror) = (4, 4, 28, 3239.626042870638)
(n, t, m, currenterror) = (4, 1, 31, 3266.633414489298)
(n, t, m, currenterror) = (4, 2, 31, 3222.852800649885)
(n, t, m, currenterror) = (4, 4, 31, 3249.752798965787)
(n, t, m, currenterror) = (4, 1, 34, 3266.633414489298)
(n, t, m, currenterror) = (4, 2, 34, 3222.852800649885)
(n, t, m, currenterror) = (4, 4, 34, 3266.1076709713425)
(n, t, m, currenterror) = (4, 1, 37, 3266.