## German credit data

In [None]:
using CSV;
using DataFrames;
using LinearAlgebra;
using Distributions;
include("../julia/utils.jl");
include("../julia/bayes_logreg.jl");
include("../julia/optimization.jl");
include("../julia/gradients.jl");
include("../julia/generate_recourse.jl");
include("../julia/experiments.jl")

In [None]:
# Load the German credit data and coerce every column to Float64.
df = CSV.read("../data/credit.csv", DataFrame);
mapcols!(column -> convert.(Float64, column), df);

In [None]:
# Separate the response column `y` from the remaining columns, which form the
# feature matrix.
y = copy(df.y);
X = Matrix(select(df, Not(:y)));
N = size(X, 1);  # one row of X per element of y

# Fit the Bayesian logistic regression and keep its `μ` field for later use.
model = bayes_logreg(X, y);
w = model.μ;

In [None]:
using EvalMetrics, Plots

# Score the data the model was fitted on and plot the scores against the labels.
y_hat = predict(model, X)
rocplot(y, y_hat)

In [None]:
# Recourse generators to compare, keyed by a short name.
generators = (
    wachter = generate_recourse_wachter,
    schut = generate_recourse_schut,
);

# Keyword arguments for each generator, using the same keys as `generators`.
generator_args = (
    wachter = (λ = 0.01,),
    schut = (T = 1000, δ = 0.05),
);

# Class label that recourse should move individuals towards.
target = 1;

In [None]:
# Run the dynamic recourse experiment once per value in `props` and stack the
# per-proportion tables into `results`, tagging every row with its `prop`.
#
# The tables are collected with `map` and concatenated once with `reduce(vcat, …)`.
# This avoids reassigning the global `results` inside a top-level `for` loop (which
# only works under the notebook's soft-scope rules and breaks when the code is run
# as a script) and avoids re-copying the accumulated table on every iteration.
#
# NOTE(review): `prop` is only recorded as a column; it is never passed to
# `run_experiment`, so every proportion currently runs with identical arguments.
# Presumably it should be forwarded to the experiment — confirm against the
# signature of `run_experiment` before relying on differences between proportions.
props = [0.01, 0.05, 0.1, 0.25]
results = reduce(vcat, map(props) do prop
    results_prop = run_experiment(
        X, y, bayes_logreg, target, generators, generator_args, experiment_dynamic
    )
    # `insertcols!` adds the column in place and returns the same data frame.
    insertcols!(results_prop, :prop => prop)
end);

These results may be driven by the following:

- The classifier is biased towards the target class (try random over-/undersampling).

In [None]:
# Aggregate validity per (period, generator, prop): group mean and standard deviation.
dt_plot = combine(
    groupby(results, [:period, :generator, :prop]),
    :validity .=> [mean, std] .=> [:mean, :std],
)
# Error-bar limits: mean ± one standard deviation.
dt_plot.ymin = dt_plot.mean .- dt_plot.std
dt_plot.ymax = dt_plot.mean .+ dt_plot.std
# Build the plot faceted by `prop`, write it to disk, then load the saved image.
ggsave(
    "www/german_dynamic_validity.png",
    plot = ggplot(data=dt_plot, aes(x=:period, y=:mean, colour=:generator)) +
        geom_line() +
        geom_errorbar(aes(ymin=:ymin, ymax=:ymax), width=.2) +
        geom_point() +
        facet_wrap(R".~prop") +
        theme_bw()
);
load("www/german_dynamic_validity.png")

Interestingly, there are some clear trends in the cost of recourse over time:

- What could be the reason for these dynamics?

In [None]:
# Aggregate cost per (period, generator, prop): group mean and standard deviation.
dt_plot = combine(
    groupby(results, [:period, :generator, :prop]),
    :cost .=> [mean, std] .=> [:mean, :std],
)
# Error-bar limits: mean ± one standard deviation.
dt_plot.ymin = dt_plot.mean .- dt_plot.std
dt_plot.ymax = dt_plot.mean .+ dt_plot.std
# Build the plot faceted by `prop`, write it to disk, then load the saved image.
ggsave(
    "www/german_dynamic_cost.png",
    plot = ggplot(data=dt_plot, aes(x=:period, y=:mean, colour=:generator)) +
        geom_line() +
        geom_errorbar(aes(ymin=:ymin, ymax=:ymax), width=.2) +
        geom_point() +
        facet_wrap(R".~prop") +
        theme_bw()
);
load("www/german_dynamic_cost.png")

### Adjusting for imbalance

In [None]:
using MLDataUtils

# `oversample` is given the transposed feature matrix together with the labels;
# the resampled features are transposed back before refitting.
X_train, y_train = let (X_resampled, y_resampled) = oversample((transpose(X), y))
    transpose(X_resampled), y_resampled
end;

# Refit on the resampled data; this replaces the previously fitted `model`.
model = bayes_logreg(X_train, y_train);

In [None]:
# Score the original (non-resampled) data with the refitted model and plot the
# scores against the labels.
y_hat = predict(model, X)
rocplot(y, y_hat)