In [60]:
# code taken from Widmann et al. (2021)
# https://github.com/devmotion/Calibration_ICLR2021

using Base.Filesystem

using Arrow
using CairoMakie
using CalibrationErrors
using CalibrationErrorsDistributions
using CalibrationTests
using CSV
using DataFrames
using Distributions
using Flux
using Random
using ProgressLogging
using Query
using Optim

using Logging: with_logger
using TerminalLoggers: TerminalLogger
using ColorSchemes: Dark2_8

pwd()

"/Users/fm03-mb03/Repositories/proper_calibration_errors"

In [74]:
"""
    friedman1_var(x)

Evaluate the noise-free regression function at the input `x`.

Only `x[1]` through `x[5]` are read; any further entries of `x` are ignored.
"""
function friedman1_var(x)
    interaction = 10 * sinpi(x[1] * x[2])
    curvature = 20 * (x[3] - 1//2)^2
    slope4 = 10 * x[4]
    slope5 = 5 * x[5]
    return interaction + curvature + slope4 + slope5
end

"""
    sample_data(n::Int)

Draw `n` input/target pairs.

Returns the tuple `(xs, ys)`: `xs` is a `10 × n` matrix filled by `rand` (one sample per
column) and `ys` holds one target per column, computed as `friedman1_var` of that column
plus Gaussian noise whose standard deviation is `0.5 + x[6]`.
"""
function sample_data(n::Int)
    ## inputs: one column per sample
    inputs = rand(10, n)

    ## targets: mean function plus heteroscedastic noise,
    ## epsilon ~ N(0, sigma^2) with sigma = 0.5 + x[6]
    targets = [
        friedman1_var(col) + randn() * (0.5 + col[6]) for col in eachcol(inputs)
    ]

    return inputs, targets
end

sample_data (generic function with 1 method)

In [75]:
# number of networks that are trained and evaluated in the cells below
n_models = 5

# Training, validation and test sets hold 100 samples each and are sampled according
# to the same law. Every split is drawn right after seeding the global RNG with its
# own fixed seed.

# training data
train_data = begin
    Random.seed!(100)
    sample_data(100)
end;

# validation data
val_data = begin
    Random.seed!(200)
    sample_data(100)
end;

# test data, used for the evaluation of the models
test_data = begin
    Random.seed!(300)
    sample_data(100)
end;

In [76]:
# Small positive offset for numerical stability: it is added to every predicted
# variance before that variance is used as a divisor, inside `log`, or under `sqrt`,
# and it also serves as the lower bound of the recalibration weights.
# NOTE(review): this global has the same name as `Base.eps`; the functions in this
# file that mention `eps` read this value — confirm the shadowing is intended.
eps = 1e-10

"""
    glorot_uniform(nout::Int, nin::Int)

Return an `nout × nin` matrix of weights obtained by shifting `rand` draws to be centred
at zero and scaling them by `sqrt(24 / (nout + nin))`, i.e. every entry has magnitude at
most `sqrt(6 / (nout + nin))`.
"""
function glorot_uniform(nout::Int, nin::Int)
    ## total width of the sampling interval
    width = sqrt(24 / (nout + nin))
    centred = rand(nout, nin) .- 0.5
    return centred .* width
end

"""
    nn_model()

Build a freshly initialised network with 10 inputs, hidden layers of width 200 and 50
(both `relu`) and 2 linear outputs. All weight matrices are initialised with
`glorot_uniform`.

The loss and metric functions in this file read the first output as the predicted mean
and exponentiate the second one, i.e. they treat it as the log of the predicted variance.
"""
function nn_model()
    hidden1 = Dense(10 => 200, relu; init=glorot_uniform)
    hidden2 = Dense(200 => 50, relu; init=glorot_uniform)
    ## the variance output lives in log space and is transformed inside each loss
    output = Dense(50 => 2; init=glorot_uniform)
    return Chain(hidden1, hidden2, output)
end

"""
    pmcc(ps, ys)

Return the average of `(mean - y)^2 / var + log(var)` over all samples.

`ps` is indexed as a two-row matrix: row 1 holds the predicted means and row 2 the log of
the predicted variances. The variance used is `exp(ps[2, i]) + eps`, where `eps` is the
global stability offset. `ys` holds the targets.
"""
function pmcc(ps, ys)
    mu = ps[1, :]
    sigma2 = exp.(ps[2, :]) .+ eps
    sqerr = (mu .- ys) .^ 2
    return mean(sqerr ./ sigma2 .+ log.(sigma2))
end

"""
    skce_biased(ps, ys)

Evaluate the `BiasedSKCE` estimator for the predictions `ps` and targets `ys`.

Every column `i` of `ps` is turned into `Normal(ps[1, i], sqrt(exp(ps[2, i]) + eps))`;
the kernel is `WassersteinExponentialKernel() ⊗ SqExponentialKernel()`.
"""
function skce_biased(ps, ys)
    ## one Gaussian per column: mean from row 1, variance exp(row 2) + eps
    predictions = map(axes(ps, 2)) do j
        return Normal(ps[1, j], sqrt(exp(ps[2, j]) + eps))
    end

    kernel = WassersteinExponentialKernel() ⊗ SqExponentialKernel()
    return calibrationerror(BiasedSKCE(kernel), predictions, ys)
end

"""
    skce_unbiased(ps, ys)

Evaluate the `UnbiasedSKCE` estimator for the predictions `ps` and targets `ys`.

Every column `i` of `ps` is turned into `Normal(ps[1, i], sqrt(exp(ps[2, i]) + eps))`;
the kernel is `WassersteinExponentialKernel() ⊗ SqExponentialKernel()`.
"""
function skce_unbiased(ps, ys)
    ## one Gaussian per column: mean from row 1, variance exp(row 2) + eps
    predictions = map(axes(ps, 2)) do j
        return Normal(ps[1, j], sqrt(exp(ps[2, j]) + eps))
    end

    kernel = WassersteinExponentialKernel() ⊗ SqExponentialKernel()
    return calibrationerror(UnbiasedSKCE(kernel), predictions, ys)
end

"""
    mse(ps, ys)

Return the mean squared difference between the first row of `ps` (the predicted means)
and the targets `ys`. The remaining rows of `ps` are not read.
"""
function mse(ps, ys)
    residuals = ps[1, :] .- ys
    return mean(residuals .^ 2)
end

"""
    mean_var(ps)

Return the average predicted variance, where the variance of sample `i` is
`exp(ps[2, i]) + eps` (row 2 of `ps` is exponentiated, `eps` is the global offset).
"""
function mean_var(ps)
    log_vars = ps[2, :]
    return mean(exp.(log_vars) .+ eps)
end

mean_var (generic function with 1 method)

In [77]:
"""
    recal(f, val_xs, val_ys)

Fit an affine recalibration of the variances predicted by `f` and return it as a function.

`f(val_xs)` is indexed as a two-row matrix (row 1: means, row 2: log variances). Two
weights `w` are chosen by box-constrained optimisation (both bounded below by `eps`,
starting at `[1.0, 2 * eps]`) so that `pmcc` on the validation targets `val_ys` is minimal
when every variance `v` is replaced by `w[1] * v + w[2]`.

The returned function takes a two-row prediction matrix `x` and gives back the lazy
transpose of `hcat(means, log.(w[1] .* vars .+ w[2]))`: the means of `x` unchanged and
the variances of `x` rescaled, again in log space.
"""
function recal(f, val_xs, val_ys)
    preds = f(val_xs)
    val_means = preds[1, :]
    ## variances on the natural scale; constant during the optimisation
    val_vars = exp.(preds[2, :])

    ## rebuild a two-row prediction matrix with affinely rescaled variances
    rescale(means, vars, coef) = transpose(hcat(means, log.(vars .* coef[1] .+ coef[2])))

    objective(coef) = pmcc(rescale(val_means, val_vars, coef), val_ys)

    lower = [eps, eps]
    upper = [Inf, Inf]
    res = optimize(objective, lower, upper, [1.0, 2 * eps], Fminbox(LBFGS()))
    w = Optim.minimizer(res)

    ## Rescale the variances of the predictions that are passed in. Using the captured
    ## validation variances here would pair the caller's means with unrelated variances
    ## and fail for inputs with a different number of columns.
    return x -> rescale(x[1, :], exp.(x[2, :]), w)
end

recal (generic function with 1 method)

In [78]:
# ## Training
#
# We use a maximum likelihood approach and train the parameters $\theta$ of the model
# for 1000 iterations by minimizing the Dawid–Sebastiani score (DSS, implemented by
# the function `pmcc`) on the training data set using ADAM.
#
# We train 5 models and compute the predicted distributions on the training and test data sets
# in each iteration step.
#
# The initial values of the weight matrices of the neural networks are sampled from the
# [uniform Glorot initialization](http://proceedings.mlr.press/v9/glorot10a/glorot10a.pdf)
# and the offset vectors are initialized with zeros. The model parameters are learnt by
# iteratively minimizing the DSS on the training data set.
# The parameters of the neural networks are trained by gradient descent with the
# [Adam optimization algorithm](https://arxiv.org/pdf/1412.6980.pdf) (default
# settings in [Flux.jl](https://github.com/FluxML/Flux.jl)).

"""
    train(id, (train_xs, train_ys), (val_xs, val_ys), (test_xs, _))

Train one network for run `id` and store its predictions after every optimisation step.

The network from `nn_model()` is trained for 1000 ADAM steps on `pmcc` of the training
data. Before the first step and after every step three prediction matrices are recorded:
the training predictions, the test predictions, and the test predictions passed through
a recalibration map that `recal` fits on the validation data for the current network.
The three series are written as columns `train_preds`, `test_rc_preds` and `test_preds`
to `data/friedman1_var/predictions_id=<id>.arrow`.

If that file already exists the function returns `nothing` without training; otherwise
it returns whatever `Arrow.write` returns. The test targets are not used.
"""
function train(id, (train_xs, train_ys), (val_xs, val_ys), (test_xs, _))
    ## skip runs whose predictions are already on disk
    filename = joinpath("data", "friedman1_var", "predictions_id=$(id).arrow")
    isfile(filename) && return nothing

    ## compute the predictions of the initial neural network; the forward pass on the
    ## test inputs is done once and reused for the recalibrated predictions
    f = nn_model()
    train_preds = f(train_xs)
    test_preds = f(test_xs)
    g = recal(f, val_xs, val_ys)
    test_rc_preds = g(test_preds)

    ## save the initial model and its predictions
    niters = 1000
    train_predss = Vector{typeof(train_preds)}(undef, niters + 1)
    test_rc_predss = Vector{typeof(test_rc_preds)}(undef, niters + 1)
    test_predss = Vector{typeof(test_preds)}(undef, niters + 1)
    train_predss[1] = train_preds
    test_rc_predss[1] = test_rc_preds
    test_predss[1] = test_preds

    ## train with ADAM
    params = Flux.Params(Flux.params(f))
    opt = ADAM()
    @progress name = "training (id = $id)" for i in 2:(niters + 1)
        ## compute gradients
        gradients = gradient(params) do
            return pmcc(f(train_xs), train_ys)
        end

        ## update the parameters
        Flux.Optimise.update!(opt, params, gradients)

        ## save the predictions of the updated model; the recalibration map is refitted
        ## on the validation data because the network has changed
        train_predss[i] = f(train_xs)
        current_test_preds = f(test_xs)
        g = recal(f, val_xs, val_ys)
        test_rc_predss[i] = g(current_test_preds)
        test_predss[i] = current_test_preds
    end

    ## save the predictions
    mkpath(dirname(filename))
    preds = (train_preds=train_predss, test_rc_preds=test_rc_predss, test_preds=test_predss)
    return Arrow.write(filename, preds)
end

train (generic function with 1 method)

In [79]:
# Draw one seed per model after seeding the global RNG with a fixed value, then reseed
# with that per-model seed immediately before each training run.
Random.seed!(100)
foreach(enumerate(rand(UInt, n_models))) do (id, seed)
    @info "training NN model: run $id"
    Random.seed!(seed)
    train(id, train_data, val_data, test_data)
end

┌ Info: training NN model: run 1
└ @ Main In[79]:3
┌ Info: training NN model: run 2
└ @ Main In[79]:3
┌ Info: training NN model: run 3
└ @ Main In[79]:3
┌ Info: training NN model: run 4
└ @ Main In[79]:3
┌ Info: training NN model: run 5
└ @ Main In[79]:3


In [80]:
# ## Evaluations
#
# SKCE (biased & unbiased), DSS (written to the CSV as "PMCC"), MSE, avg predicted variance

"""
    evaluate_models(dataset, id, ys)

Compute the evaluation statistics of run `id` for the prediction series named `dataset`.

The predictions are read from column `<dataset>_preds` of
`data/friedman1_var/predictions_id=<id>.arrow`, copied into plain vectors and handed to
`evaluate_stats` together with the targets `ys`; the statistics are written to
`data/friedman1_var/statistics_id=<id>_dataset=<dataset>.csv`.

Returns `nothing` immediately if the CSV file already exists, and raises an error if the
predictions file is missing.
"""
function evaluate_models(dataset, id, ys)
    dir = joinpath("data", "friedman1_var")

    ## output file: nothing to do if it was written by an earlier call
    out = joinpath(dir, "statistics_id=$(id)_dataset=$(dataset).csv")
    if isfile(out)
        return nothing
    end

    ## load data
    filename = joinpath(dir, "predictions_id=$(id).arrow")
    if !isfile(filename)
        error("predictions for run ", id, " not found")
    end
    column = getproperty(Arrow.Table(filename), Symbol(dataset, :_preds))

    ## element-wise copy of every stored prediction entry
    predictionss = map(entry -> map(identity, entry), column)

    return evaluate_stats(out, predictionss, ys)
end

"""
    evaluate_stats(file, predictionss, ys)

Write a CSV file `file` with the columns `iteration,statistic,estimate`.

`predictionss` is a sequence of flat prediction vectors, one per iteration; each is
reshaped to two rows before use. For every iteration (numbered from 0) one row is written
per statistic, in this order: `MSE`, `PMCC`, `SKCE (unbiased)`, `SKCE (biased)` and
`Avg Var`, all evaluated against the targets `ys`. Returns `nothing`.
"""
function evaluate_stats(file, predictionss, ys)
    ## CSV label (including the surrounding separators) => statistic of (preds, ys)
    statistics = (
        ",MSE," => mse,
        ",PMCC," => pmcc,
        ",SKCE (unbiased)," => skce_unbiased,
        ",SKCE (biased)," => skce_biased,
        ",Avg Var," => (ps, _) -> mean_var(ps),
    )

    mkpath(dirname(file))
    open(file, "w") do io
        ## print headers
        println(io, "iteration,statistic,estimate")

        @progress name = "iterations" for (i, predictions) in enumerate(predictionss)
            ## restore the two-row layout of the stored predictions
            preds = reshape(predictions, 2, length(predictions) ÷ 2)

            ## each statistic is computed and written before the next one is started
            for (label, statistic) in statistics
                println(io, i - 1, label, statistic(preds, ys))
            end
        end
    end

    return nothing
end

evaluate_stats (generic function with 1 method)

In [81]:
# Draw one seed per run after seeding the global RNG with a fixed value; the global RNG
# is reset to that per-run seed immediately before each of the three evaluations.
Random.seed!(300)
foreach(enumerate(rand(UInt, n_models))) do (id, seed)
    ## reseed, then evaluate one prediction series of the current run
    function reseeded_evaluation(dataset, ys)
        Random.seed!(seed)
        return evaluate_models(dataset, id, ys)
    end

    ## evaluate models on training data set
    @info "evaluating training statistics: run $id"
    reseeded_evaluation("train", train_data[2])

    ## evaluate models on test data set
    @info "evaluating test statistics: run $id"
    reseeded_evaluation("test", test_data[2])

    ## evaluate models on recalibrated test data set
    @info "evaluating test rc statistics: run $id"
    reseeded_evaluation("test_rc", test_data[2])
end

┌ Info: evaluating training statistics: run 1
└ @ Main In[81]:5
┌ Info: evaluating test statistics: run 1
└ @ Main In[81]:10
┌ Info: evaluating test rc statistics: run 1
└ @ Main In[81]:15
┌ Info: evaluating training statistics: run 2
└ @ Main In[81]:5
┌ Info: evaluating test statistics: run 2
└ @ Main In[81]:10
┌ Info: evaluating test rc statistics: run 2
└ @ Main In[81]:15
┌ Info: evaluating training statistics: run 3
└ @ Main In[81]:5
┌ Info: evaluating test statistics: run 3
└ @ Main In[81]:10
┌ Info: evaluating test rc statistics: run 3
└ @ Main In[81]:15
┌ Info: evaluating training statistics: run 4
└ @ Main In[81]:5
┌ Info: evaluating test statistics: run 4
└ @ Main In[81]:10
┌ Info: evaluating test rc statistics: run 4
└ @ Main In[81]:15
┌ Info: evaluating training statistics: run 5
└ @ Main In[81]:5
┌ Info: evaluating test statistics: run 5
└ @ Main In[81]:10
┌ Info: evaluating test rc statistics: run 5
└ @ Main In[81]:15
