# Testing auto-differentiation of 'predict' method with Zygote.jl

### Model training

In [1]:
using Flux
using Flux: gradient
using LaplaceRedux
using LinearAlgebra
using Plots
using Statistics
using Zygote

In [2]:
# Draw 200 samples from the non-linear toy problem: `xs` holds the feature
# vectors and `ys` the matching binary labels.
xs, ys = LaplaceRedux.Data.toy_data_non_linear(200)
# Bring into tabular format (one column per sample). `reduce(hcat, xs)` builds
# the same matrix as `hcat(xs...)` without splatting every sample into varargs.
X = reduce(hcat, xs)
# Lazily pair each feature vector with its label for training and fitting.
data = zip(xs, ys)

zip([[2.4177043251742587, 3.3822464490006237], [3.344195795224136, 1.7956153312866672], [2.431025263630579, 1.2883291750343913], [3.9303446068532852, 2.913062292981182], [2.6335060160501667, 0.6855425646071571], [0.9062028080279618, 4.264888516133536], [4.264443590829248, 3.6939625467394523], [1.9358462289445266, 1.7102901247254312], [2.311307115529128, 3.578834692139374], [2.212254835201132, 0.5513742316947359]  …  [-3.623518811438702, 0.989759235246586], [-1.500649952385222, 2.6954184532806806], [-4.339850006249561, 4.551783996179579], [-2.838156550164357, 1.2674937551668863], [-3.197823528615125, 2.417845746426911], [-2.5268154328821404, 3.269254255270586], [-4.913119736316474, 2.340094758991834], [-3.626175607034501, 2.411300658133167], [-3.43273893161647, 2.9706734590606994], [-4.325570267317931, 0.9129242368778023]], [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0  …  0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0])

In [3]:
# Network architecture: one sigmoid hidden layer followed by a single raw
# (un-squashed) output, so the loss below works directly on logits.
n_hidden = 10
D = size(X, 1)  # input dimension = number of rows of X
nn = Chain(Dense(D => n_hidden, σ), Dense(n_hidden => 1))

# Binary cross-entropy evaluated on the logit that `nn` returns for `x`.
function loss(x, y)
    return Flux.Losses.logitbinarycrossentropy(nn(x), y)
end

loss (generic function with 1 method)

In [4]:
using Flux.Optimise: update!, Adam

opt = Adam(1e-3)
epochs = 100
# Mean per-sample loss over the whole dataset; used for progress reporting only.
avg_loss(data) = mean(map(d -> loss(d[1], d[2]), data))
# Report ten times per run. Integer arithmetic keeps the modulus test exact and
# `max` prevents a zero interval (and a DivideError) when `epochs < 10`.
show_every = max(epochs ÷ 10, 1)
# Collect the trainable parameters once instead of twice per sample.
ps = Flux.params(nn)

for epoch in 1:epochs
    # One optimiser step per sample.
    for (x, y) in data
        gs = gradient(() -> loss(x, y), ps)
        update!(opt, ps, gs)
    end
    if epoch % show_every == 0
        println("Epoch ", epoch)
        @show avg_loss(data)
    end
end

Epoch 10

│   The input will be converted, but any earlier layers may be very slow.
│   layer = Dense(2 => 10, σ)
│   summary(x) = 2-element Vector{Float64}
└ @ Flux C:\Users\marka\.julia\packages\Flux\EHgZm\src\layers\stateless.jl:60



avg_loss(data) = 0.6086040003970266


Epoch 20
avg_loss(data) = 0.49154122805222866


Epoch 30
avg_loss(data) = 0.36414230475202203


Epoch 40
avg_loss(data) = 0.2641954865306616


Epoch 50
avg_loss(data) = 0.1967995231412351


Epoch 60
avg_loss(data) = 0.1526863269135356


Epoch 70
avg_loss(data) = 0.12315487658604979
Epoch 80
avg_loss(data) = 0.10255790121853352


Epoch 90
avg_loss(data) = 0.08756980057340115


Epoch 100
avg_loss(data) = 0.07624356346204877


In [5]:
# Wrap the trained network in a Laplace approximation configured for
# classification over the full set of weights.
la = Laplace(
    nn;
    likelihood = :classification,
    subset_of_weights = :all,
)
# Fit the approximation on the same (x, y) pairs used for training.
fit!(la, data)

200

### Auto-differentiation testing

In [6]:
# Function under test: the Laplace predictive as a function of the input only.
function f(x)
    return predict(la, x)
end
# Ask Zygote for the Jacobian of the predictions with respect to the inputs.
# This call is the one that raises the MethodError recorded below.
J = jacobian(f, X)
println(J)

MethodError: MethodError: no method matching size(::Params{Zygote.Buffer{Any, Vector{Any}}})
Closest candidates are:
  size(!Matched::Union{QR, LinearAlgebra.QRCompactWY, QRPivoted}) at C:\Users\marka\AppData\Local\Programs\Julia-1.8.5\share\julia\stdlib\v1.8\LinearAlgebra\src\qr.jl:581
  size(!Matched::Union{QR, LinearAlgebra.QRCompactWY, QRPivoted}, !Matched::Integer) at C:\Users\marka\AppData\Local\Programs\Julia-1.8.5\share\julia\stdlib\v1.8\LinearAlgebra\src\qr.jl:580
  size(!Matched::Union{Cholesky, CholeskyPivoted}) at C:\Users\marka\AppData\Local\Programs\Julia-1.8.5\share\julia\stdlib\v1.8\LinearAlgebra\src\cholesky.jl:514
  ...

Through testing it has been determined that the issue arises from the nested use of Zygote: the call to `jacobian` inside `jacobians` causes the failure. `jacobian` modifies its input arguments and therefore cannot be auto-differentiated by Zygote.

Cloning the input does not solve the issue, since Zygote operates on the principle of reducing the function to rules it can differentiate. The [issue](https://github.com/FluxML/Zygote.jl/issues/953) of nested use of Zygote is still open.