In [1]:
using Optim
using Plots

In [2]:
default(size = (300, 200)) # plot size

# Generate data

In [3]:
"""
    f1(x)

Target function to be fitted: a shallow parabola `x^2/20` with the oscillation
`sin(2x)` superimposed on it.
"""
f1(x) = sin(2x) + x^2 / 20

f1 (generic function with 1 method)

In [4]:
# Sample the target on a uniform grid over [-10, 10] with step 0.1.
# `x` is stored as a row matrix: shape = [n_input, n_sample].
x = reshape(collect(-10:0.1:10), 1, :)
y_true = map(f1, x)
print(size(x), size(y_true))
plot(vec(x), vec(y_true))

# Build NN

In [5]:
"""
    init_weights(; n_in=1, n_hidden=10, n_out=1)

Create the parameters of a network with a single hidden layer.

Returns the vector `[W1, b1, W2, b2]`:

- `W1`: `n_hidden × n_in` matrix of standard-normal draws (used as `W1 * x`),
- `b1`: zero vector of length `n_hidden`,
- `W2`: `n_out × n_hidden` matrix of standard-normal draws,
- `b2`: zero vector of length `n_out`.
"""
function init_weights(; n_in=1, n_hidden=10, n_out=1)
    hidden_weights = randn(n_hidden, n_in)
    hidden_bias = zeros(n_hidden)
    output_weights = randn(n_out, n_hidden)
    output_bias = zeros(n_out)
    return [hidden_weights, hidden_bias, output_weights, output_bias]
end

params = init_weights()

4-element Array{Array{Float64,N} where N,1}:
 [-0.0157935; 1.74168; … ; 0.184103; -1.53751]     
 [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
 [-0.110587 -0.37593 … 1.14 -1.31584]              
 [0.0]                                             

In [6]:
# Concatenate every parameter array (in column-major order) into one flat vector,
# and remember each array's shape so the flat vector can be split up again later.
params_flat = reduce(vcat, map(vec, params))
sizes = [size(p) for p in params]

4-element Array{Tuple{Int64,Vararg{Int64,N} where N},1}:
 (10, 1)
 (10,)  
 (1, 10)
 (1,)   

In [34]:
"""
    predict(params_flat, x; act=tanh)

Evaluate the one-hidden-layer network on `x`.

`params_flat` is the flat parameter vector; it is split back into
`W1, b1, W2, b2` by `unflatten`. `x` is left-multiplied by `W1`, so samples are
expected along its columns. `act` is the hidden-layer activation, applied
element-wise. Returns `W2 * act.(W1 * x .+ b1) .+ b2`.
"""
function predict(params_flat, x; act=tanh)
    hidden_w, hidden_b, output_w, output_b = unflatten(params_flat)

    # Hidden layer followed by a linear output layer.
    hidden = act.(hidden_w * x .+ hidden_b)
    return output_w * hidden .+ output_b
end

predict (generic function with 1 method)

In [35]:
"""
    unflatten(params_flat, shapes=sizes)

Split the flat vector `params_flat` into one array per entry of `shapes`.

Each returned array is a reshaped *view* into `params_flat` (no data is copied),
so it shares memory with the flat vector. `shapes` is a collection of dimension
tuples; when omitted, the global `sizes` is used.

Throws a `DimensionMismatch` if `shapes` does not account for exactly
`length(params_flat)` elements, instead of silently ignoring trailing entries.
"""
function unflatten(params_flat, shapes=sizes)
    expected = 0
    for shape in shapes
        expected += prod(shape) # size -> length
    end
    if length(params_flat) != expected
        throw(DimensionMismatch(
            "shapes describe $expected parameters but params_flat has $(length(params_flat))"
        ))
    end

    params = AbstractArray[]
    start = 1
    for shape in shapes
        stop = start + prod(shape) - 1
        # A view avoids allocating a copy of the slice on every call.
        push!(params, reshape(view(params_flat, start:stop), shape))
        start = stop + 1
    end
    return params
end

unflatten (generic function with 1 method)

In [36]:
# Forward pass with the randomly initialised (not yet optimised) parameters.
y_pred = predict(params_flat, x)

1×201 Array{Float64,2}:
 3.82045  3.8178  3.81506  3.81224  3.80932  …  -3.81506  -3.8178  -3.82045

In [37]:
# Plot the prediction of the not-yet-optimised network over the input grid.
plot(x[:], y_pred[:])

In [38]:
"""
    loss_func(params_flat, x, y_true)

Return the mean squared error between `predict(params_flat, x)` and `y_true`.

The mean is computed as `sum(abs2, r) / length(r)` instead of calling `mean`, so
the function relies on `Base` only: from Julia 0.7 on, `mean` lives in the
`Statistics` standard library, which this file does not load.
"""
function loss_func(params_flat, x, y_true)
    residual = predict(params_flat, x) - y_true
    return sum(abs2, residual) / length(residual)
end

loss_func (generic function with 1 method)

In [39]:
# Mean squared error of the randomly initialised parameters against y_true.
loss_func(params_flat, x, y_true)

15.497626341831506

# Optim.jl

Optim.jl expects an objective function that takes a single argument (the flat parameter vector).

In [40]:
# Single-argument objective for the optimizer: the data are taken from the
# globals `x` and `y_true` for now.
loss_wrap(params_flat) = loss_func(params_flat, x, y_true)

loss_wrap (generic function with 1 method)

In [41]:
# Sanity check: the wrapper should return the same value as the loss_func call above.
loss_wrap(params_flat)

15.497626341831506

## Built-in Forward autodiff

- http://julianlsolvers.github.io/Optim.jl/stable/algo/autodiff/

Enabling forward-mode autodiff seems to speed up the optimizer by about 200%.

In [42]:
# Wrap the objective in a OnceDifferentiable with forward-mode automatic
# differentiation requested; params_flat is passed as the seed vector.
od = OnceDifferentiable(loss_wrap, params_flat; autodiff =:forward);
typeof(od)

NLSolversBase.OnceDifferentiable{Float64,Array{Float64,1},Array{Float64,1},Val{false}}

Run a few iterations of momentum gradient descent to get close to a minimum, then switch to BFGS to converge to it.

In [43]:
# option1: short first stage (50 iterations, trace printed every 10 iterations).
# option2: long second stage (up to 1000 iterations, trace printed every 100).
option1 = Optim.Options(iterations = 50, show_trace=true, show_every=10)
option2 = Optim.Options(iterations = 1000, show_trace=true, show_every=100)

Optim.Options{Float64,Void}(1.0e-32, 1.0e-32, 1.0e-8, 0, 0, 0, false, 1000, false, true, false, 100, nothing, NaN)

In [44]:
# Re-initialize the weights and flatten them again before optimizing.
# NOTE: `sizes` is not recomputed here, so `n_hidden` must match the value that
# was used when `sizes` was built, otherwise `unflatten` splits the vector wrongly.
params = init_weights(n_hidden=10)
params_flat = reduce(vcat, map(vec, params));

In [45]:
# Stage 1: momentum gradient descent from the fresh parameters (settings: option1).
# Stage 2: BFGS started from the stage-1 minimizer (settings: option2).
# `opt` is overwritten, so afterwards it holds the BFGS result only.
@time opt = optimize(od, params_flat, MomentumGradientDescent(), option1)
@time opt = optimize(od, opt.minimizer, BFGS(), option2)

Iter     Function value   Gradient norm 
     0     4.134882e+01     1.155393e+01
    10     2.568268e+00     3.800745e-01
    20     2.109855e+00     8.083726e-01
    30     1.429373e+00     6.105281e-01
    40     1.192013e+00     3.153947e-01
    50     1.101046e+00     2.498291e-01
  0.395069 seconds (146.48 k allocations: 235.890 MiB, 11.95% gc time)
Iter     Function value   Gradient norm 
     0     1.101046e+00     2.498291e-01
   100     3.008080e-01     5.794716e-01
   200     1.440287e-01     2.874443e-01
   300     9.251543e-03     2.839818e-02
   400     7.666477e-03     9.637681e-02
   500     5.909647e-03     8.224074e-02
   600     5.348834e-03     1.395455e-01
   700     4.784281e-03     3.617157e-02
   800     4.637510e-03     2.918410e-02
   900     4.552170e-03     8.914301e-02
  1000     4.173698e-03     4.986889e-01
  4.412954 seconds (1.75 M allocations: 3.145 GiB, 11.76% gc time)


Results of Optimization Algorithm
 * Algorithm: BFGS
 * Starting Point: [-0.2153182189715147,-0.004952294170471341, ...]
 * Minimizer: [0.8758730144350223,1.1856263133077198, ...]
 * Minimum: 4.173698e-03
 * Iterations: 1000
 * Convergence: false
   * |x - x'| < 1.0e-32: false 
     |x - x'| = 2.35e-01 
   * |f(x) - f(x')| / |f(x)| < 1.0e-32: false
     |f(x) - f(x')| / |f(x)| = 2.43e-03 
   * |g(x)| < 1.0e-08: false 
     |g(x)| = 4.99e-01 
   * stopped by an increasing objective: false
   * Reached Maximum Number of Iterations: true
 * Objective Calls: 2664
 * Gradient Calls: 2664

In [46]:
# Loss at the parameters returned by the optimizer.
loss_wrap(opt.minimizer)

0.0041736981848988965

In [47]:
# Compare the fit with the target: the true curve is drawn as a line, the
# prediction from the optimised parameters as markers only (line width 0).
y_pred = predict(opt.minimizer, x)
plot(x[:], y_true[:],lw=2)
plot!(x[:], y_pred[:],lw=0,
    marker=:circle,markerstrokewidth = 0,markersize=3)