In [1]:
import sys
sys.path.append('..')
# %env CUDA_VISIBLE_DEVICES=0,

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import random
%matplotlib inline

import torch
import torch.nn as nn
import os
from collections import defaultdict

from torch.distributions import MultivariateNormal

from src.mrartemev_nflib.flows import NormalizingFlowModel, InvertiblePermutation, Invertible1x1Conv, ActNorm, NSF_AR
from src.mrartemev_nflib.flows import MAF, AffineHalfFlow
from src.mrartemev_nflib.nn import ARMLP, MLP

from torch.utils.data import Dataset, DataLoader, TensorDataset
from itertools import repeat

from catboost import CatBoostClassifier
from sklearn.metrics import roc_auc_score

from src.nf import CalibratedModel, neg_log_likelihood
from src.nf.classifiers import train_catboost_clf
from scipy.special import logsumexp, expit
from scipy.stats import norm


# NOTE(review): presumably works around the "duplicate OpenMP runtime" abort that can occur
# when several libraries each bundle their own OpenMP -- confirm it is still required.
os.environ['KMP_DUPLICATE_LIB_OK'] = 'True'
# Use the first CUDA GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Show the selected device as the cell output.
device

device(type='cuda', index=0)

In [2]:
class AffineHalfFlow(nn.Module):
    """
    Affine coupling layer in the style of RealNVP: ``z1 = x1 * exp(s) + t``.

    The even-indexed features (``x[:, ::2]``) pass through unchanged and condition the
    transform; the odd-indexed features (``x[:, 1::2]``) are scaled and shifted.
    - scale=True, shift=True: scale and shift, as in RealNVP (default)
    - scale=False: shift only, as in NICE

    ``base_network`` is called as ``base_network(n_in, n_out, **base_network_kwargs)`` and the
    resulting object is called as ``net(x, context)``.
    """

    def __init__(self, dim, base_network, scale=True, shift=True, **base_network_kwargs):
        super().__init__()
        self.dim = dim

        def zero_conditioner(x, context):
            # Stand-in for a disabled scale/shift network: contributes nothing to the transform.
            return x.new_zeros(x.size(0), self.dim // 2, device=x.device)

        # The pass-through half holds the extra feature when ``dim`` is odd.
        n_in = self.dim - (self.dim // 2)
        n_out = self.dim // 2

        # Build s_cond before t_cond so sub-module registration order stays s_cond, t_cond.
        self.s_cond = base_network(n_in, n_out, **base_network_kwargs) if scale else zero_conditioner
        self.t_cond = base_network(n_in, n_out, **base_network_kwargs) if shift else zero_conditioner

    def forward(self, x, context=None):
        """Map ``x`` to ``z``; returns ``(z, log_det)`` with ``z = [even features, transformed odd features]``."""
        passthrough = x[:, ::2]
        active = x[:, 1::2]
        log_scale = self.s_cond(passthrough, context=context)
        offset = self.t_cond(passthrough, context=context)
        transformed = torch.exp(log_scale) * active + offset
        z = torch.cat([passthrough, transformed], dim=1)
        return z, torch.sum(log_scale, dim=1)

    def inverse(self, z, context=None):
        """Undo ``forward``; returns ``(x, log_det)`` with features re-interleaved into their original order."""
        n_features = z.shape[1]
        # The first ceil(n_features / 2) columns are the untouched half produced by ``forward``.
        split = n_features // 2 + n_features % 2
        kept = z[:, :split]
        moved = z[:, split:]
        log_scale = self.s_cond(kept, context)
        offset = self.t_cond(kept, context)
        x = torch.empty_like(z)
        x[:, ::2] = kept
        x[:, 1::2] = (moved - offset) * torch.exp(-log_scale)
        return x, torch.sum(-log_scale, dim=1)

In [3]:
! ls dumps_20200602/GAS/SPLINE-AR_2_ind1

checkpoints  final_model.checkpoint


In [4]:
def fix_act_norm(layer):
    """Flag ``ActNorm`` layers as already initialised; every other layer is left untouched.

    Written to be passed to ``model.apply(...)``.
    NOTE(review): presumably this stops ActNorm from re-running its data-dependent
    initialisation on the first batch after weights were loaded -- confirm in the library.
    """
    if not isinstance(layer, ActNorm):
        return
    layer.data_dep_init_done = True

In [5]:
def get_model(model_type, data, num_layers, dump_path):
    """Rebuild a flow architecture and restore its weights from a checkpoint file.

    Args:
        model_type: one of 'MAF', 'SPLINE-AR', 'GLOW', 'RealNVP'.
        data: dataset object; only ``data.n_dims`` is read.
        num_layers: how many times the per-architecture group of flows is repeated.
        dump_path: checkpoint path; the loaded object must hold the weights under the
            key ``'model.state_dict()'``.

    Returns:
        A ``NormalizingFlowModel`` on the notebook-level ``device`` with weights loaded and
        every ``ActNorm`` layer flagged as initialised.

    Raises:
        ValueError: if ``model_type`` is not one of the supported names (previously this
            silently produced a model with no flows).
        FileNotFoundError: propagated from ``torch.load`` when ``dump_path`` does not exist.
    """
    supported = ('MAF', 'SPLINE-AR', 'GLOW', 'RealNVP')
    if model_type not in supported:
        raise ValueError(f'Unknown model_type {model_type!r}; expected one of {supported}')

    dim = data.n_dims
    flows = []
    for _ in range(num_layers):
        if model_type == 'MAF':
            flows.append(MAF(dim=dim, base_network=ARMLP))
            flows.append(InvertiblePermutation(dim=dim))
        elif model_type == 'SPLINE-AR':
            flows.append(ActNorm(dim=dim))
            flows.append(Invertible1x1Conv(dim=dim))
            flows.append(NSF_AR(dim=dim, K=8, B=3, hidden_features=32, depth=1, base_network=MLP))
        elif model_type == 'GLOW':
            flows.append(ActNorm(dim=dim))
            flows.append(Invertible1x1Conv(dim=dim))
            flows.append(AffineHalfFlow(dim=dim, hidden_features=32, base_network=MLP))
            flows.append(InvertiblePermutation(dim=dim))
        else:  # 'RealNVP'
            flows.append(AffineHalfFlow(dim=dim, base_network=MLP))
            flows.append(InvertiblePermutation(dim=dim))

    prior = MultivariateNormal(torch.zeros(dim).to(device), torch.eye(dim).to(device))
    model = NormalizingFlowModel(prior, flows).to(device)

    # map_location lets a checkpoint saved on a GPU be restored on whatever `device` is in use,
    # including a CPU-only machine.
    checkpoint = torch.load(dump_path, map_location=device)
    model.load_state_dict(checkpoint['model.state_dict()'])
    model.apply(fix_act_norm)

    return model

In [6]:
def to_device(model, device):
    """Move ``model`` to ``device`` in place and rebuild its standard-normal prior there.

    The prior's dimensionality is taken from the model's current prior instead of a
    notebook-level dataset variable, so the function depends only on its arguments.

    Args:
        model: module with a ``prior`` attribute that is a torch distribution.
        device: target device (string or ``torch.device``).

    NOTE(review): the prior is rebuilt presumably because ``model.to`` does not move a plain
    distribution attribute -- confirm against ``NormalizingFlowModel``.
    """
    model.to(device)
    n_dims = model.prior.event_shape[0]
    model.prior = MultivariateNormal(torch.zeros(n_dims).to(device),
                                     torch.eye(n_dims).to(device))

In [7]:
def repeater(data_loader):
    """Yield items from ``data_loader`` forever, restarting iteration each time it is exhausted."""
    while True:
        for batch in data_loader:
            yield batch

In [8]:
def batched_sample(model, n, batch_size=14000):
    """Draw ``n`` samples from ``model`` in chunks of at most ``batch_size`` and return them on the CPU.

    Sampling runs under ``torch.no_grad()`` so no autograd graph is built for tensors that
    are detached immediately afterwards.

    Args:
        model: object exposing ``sample(k)`` that returns a tensor with ``k`` rows.
        n: total number of samples; must be positive (``torch.cat`` rejects an empty list).
        batch_size: maximum number of samples requested per ``model.sample`` call.

    Returns:
        A detached CPU tensor with exactly ``n`` rows.

    Raises:
        ValueError: if ``batch_size`` is not positive.
    """
    if batch_size <= 0:
        raise ValueError(f'batch_size must be positive, got {batch_size}')

    generated = []
    with torch.no_grad():
        for start in range(0, n, batch_size):
            # The final chunk is shorter when batch_size does not divide n.
            chunk_size = min(batch_size, n - start)
            generated.append(model.sample(chunk_size).cpu().detach())
    generated = torch.cat(generated, dim=0)
    assert n == len(generated)
    return generated

In [9]:
def logloss_with_logits(y_pred_logits, y_true):
    """Return the NEGATED mean binary cross-entropy computed directly from logits.

    ``np.logaddexp(0, -z)`` is the per-sample loss for a positive label and
    ``np.logaddexp(0, z)`` for a negative one. Because of the leading minus sign the
    result is <= 0; callers that want the usual positive log-loss must negate it.
    """
    positive_part = y_true * np.logaddexp(0, -y_pred_logits)
    negative_part = (1 - y_true) * np.logaddexp(0, y_pred_logits)
    return -np.mean(positive_part + negative_part)

In [74]:
def compute_error(clf_preds, calibration_constant, confidence=0.9):
    """Find the smallest relative error ``eps`` whose probability bound reaches ``confidence``.

    With ``theta = exp(calibration_constant)`` and ``var`` the empirical variance of
    ``exp(clf_preds)`` around ``theta``, each ``eps`` on the grid 0.01, 0.02, ..., 0.49 gets

        Phi((e^eps - 1) * sqrt(n) * theta / sqrt(var))
        - Phi((e^-eps - 1) * sqrt(n) * theta / sqrt(var))
        - correction

    where ``correction`` does not depend on ``eps`` and is therefore computed once.
    NOTE(review): the correction (constant 0.48, third absolute moment over
    ``var**1.5 * sqrt(n)``) looks like a Berry-Esseen-style term -- confirm against the derivation.

    Args:
        clf_preds: 1-D array of classifier log-ratios (logits) on generated samples.
        calibration_constant: log of the mean of ``exp(clf_preds)``.
        confidence: bound at which the search stops (default 0.9, the previous hard-coded value).

    Returns:
        ``(res, eps)``: the bound and the grid value at which it first reached ``confidence``.
        If no grid value reaches it, the last grid value (0.49) and its bound are returned.
        A non-positive ``var`` (e.g. constant predictions) yields NaN bounds.
    """
    n = len(clf_preds)
    theta = np.exp(calibration_constant)
    var = np.exp(logsumexp(2 * clf_preds) - np.log(n)) - theta**2

    # Loop-invariant quantities, hoisted out of the eps search.
    sqrt_n = np.sqrt(n)
    sqrt_var = np.sqrt(var)
    correction = 2 * 0.48 * np.mean(np.abs(np.exp(clf_preds) - theta)**3) / var**1.5 / sqrt_n

    for eps in np.arange(0.01, 0.5, 0.01):
        upper = norm.cdf((np.exp(eps) - 1) * sqrt_n * theta / sqrt_var)
        lower = norm.cdf((np.exp(-eps) - 1) * sqrt_n * theta / sqrt_var)
        res = upper - lower - correction

        if res >= confidence:
            break

    return res, eps

In [75]:
def batch_sample(model, n, batch_size):
    """Draw exactly ``n`` samples via ``model.sample_n`` in chunks of at most ``batch_size``.

    The model is switched to eval mode and sampling runs without gradient tracking.
    The previous version drew a full batch for every step of ``range(0, n, batch_size)``
    and then an additional remainder batch, returning more than ``n`` rows whenever
    ``batch_size`` did not divide ``n``.

    Args:
        model: object exposing ``eval()`` and ``sample_n(k)``.
        n: total number of samples; must be positive (``torch.cat`` rejects an empty list).
        batch_size: maximum chunk size; must be positive.

    Returns:
        The chunks concatenated along dimension 0 (``n`` rows).
    """
    model.eval()
    samples = []
    with torch.no_grad():
        for start in range(0, n, batch_size):
            # Shrink the final chunk so the total is exactly n.
            samples.append(model.sample_n(min(batch_size, n - start)))

    return torch.cat(samples, dim=0)

In [76]:
def _stack_real_and_generated(real, generated):
    """Stack real rows (label 1) above generated rows (label 0); the label is the last column."""
    return np.vstack([
        np.column_stack([real, np.ones(len(real)).reshape(-1, 1)]),
        np.column_stack([generated, np.zeros(len(generated)).reshape(-1, 1)]),
    ]).astype(np.float32)


def train_cb(model, X_train_tensor, X_test_tensor, clips, iters,
             n_calibration_samples=None, task_type='GPU', devices='1'):
    """Train CatBoost real-vs-generated classifiers and evaluate the calibrated flow.

    For every tree count in ``iters`` a classifier is fitted on real training rows versus
    an equal number of model samples. For every ``clip`` its raw logits are clipped to
    ``[-100, log(clip)]``, wrapped in a ``CalibratedModel`` and scored on the test set.

    Args:
        model: flow exposing ``sample_n(k)``.
        X_train_tensor, X_test_tensor: real data tensors (moved to CPU internally).
        clips: upper bounds on the density ratio (the logit is clipped at ``log(clip)``).
        iters: CatBoost iteration counts to try.
        n_calibration_samples: number of model samples used to estimate the calibration
            constant; defaults to ``len(X_train_tensor)``. (This used to be read from a
            notebook-level variable ``n``, which the experiment cells set to the training
            set size.)
        task_type, devices: forwarded to ``CatBoostClassifier``; the defaults keep the
            previously hard-coded GPU '1'.

    Returns:
        ``(clf_ds_train, clf_ds_test, metrics)`` where the datasets are float32 arrays with
        the label in the last column and ``metrics`` is a list of dicts with keys
        'clip', 'iters', 'll', 'auc_roc', 'logloss', 'overhead', 'calibration_constant',
        'perc', 'eps'.
    """
    # Sampling order (train, test, calibration) is kept so random draws line up as before.
    clf_ds_train = _stack_real_and_generated(
        X_train_tensor.cpu().detach().numpy(),
        model.sample_n(len(X_train_tensor)).cpu().detach().numpy(),
    )
    clf_ds_test = _stack_real_and_generated(
        X_test_tensor.cpu().detach().numpy(),
        model.sample_n(len(X_test_tensor)).cpu().detach().numpy(),
    )

    if n_calibration_samples is None:
        n_calibration_samples = len(X_train_tensor)
    samples = model.sample_n(n_calibration_samples).cpu().detach().numpy()

    X_test_cpu = X_test_tensor.cpu().detach()
    test_features, test_labels = clf_ds_test[:, :-1], clf_ds_test[:, -1]

    metrics = []
    for n_iters in iters:
        clf = CatBoostClassifier(n_iters, verbose=0, task_type=task_type, devices=devices).fit(
            clf_ds_train[:, :-1], clf_ds_train[:, -1],
        )
        for clip in clips:
            log_clip = np.log(clip)

            # Defaults bind the current classifier and bound, so the wrapper stays correct
            # even if it is called after the loop variables have moved on.
            def clf_wrap(x, clf=clf, log_clip=log_clip):
                return np.clip(clf.predict(x, prediction_type='RawFormulaVal'), -100, log_clip)

            calibrated_model = CalibratedModel(
                clf_wrap,
                model,
                logit=True
            )
            clf_preds = clf_wrap(samples)
            assert np.all(clf_preds <= log_clip)
            # log of the mean clipped density ratio over generated samples.
            calibration_constant = logsumexp(clf_preds) - np.log(len(clf_preds))
            logits = clf_preds - calibration_constant
            ll = -neg_log_likelihood(calibrated_model, X_test_cpu) - calibration_constant

            perc, eps = compute_error(clf_preds, calibration_constant)

            # Score the test set once and reuse the logits for every metric.
            test_logits = clf_wrap(test_features)
            assert np.all(test_logits <= log_clip)
            auc_roc = roc_auc_score(test_labels, test_logits)
            log_loss = logloss_with_logits(test_logits, test_labels)
            metrics.append({
                'clip': clip,
                'iters': n_iters,
                'll': ll,
                'auc_roc': auc_roc,
                'logloss': log_loss,
                'overhead': np.max(logits),
                'calibration_constant': calibration_constant,
                'perc': perc,
                'eps': eps
            })

    return clf_ds_train, clf_ds_test, metrics

In [77]:
from utils import data_utils

# Dataset name -> loader class of the same name in ``utils.data_utils``.
data_mapping = {
    dataset_name: getattr(data_utils, dataset_name)
    for dataset_name in ('BSDS300', 'GAS', 'MINIBOONE', 'POWER', 'HEPMASS')
}

In [78]:
def get_best_model(model_type, data, num_layers, dumps_path, max_test_rows=100000):
    """Load every checkpoint under ``dumps_path`` and keep the one with the highest test log-likelihood.

    Candidates are ``<dumps_path>/final_model.checkpoint`` followed by the files in
    ``<dumps_path>/checkpoints`` in sorted order (so ties are broken deterministically).
    A missing ``checkpoints`` directory is treated as "no intermediate checkpoints"
    instead of raising.

    Args:
        model_type, data, num_layers: forwarded to ``get_model``.
        dumps_path: directory holding the dumps of one training run.
        max_test_rows: number of leading test rows used for scoring (default 100000).

    Returns:
        ``(best_model, best_ll, best_dump)``. When no candidate could be loaded this is
        ``(None, float('-inf'), None)``.
    """
    best_ll, best_model, best_dump = float('-inf'), None, None
    X_test_tensor = torch.from_numpy(data.tst.x[:max_test_rows]).to(device)

    candidates = [dumps_path + '/final_model.checkpoint']
    checkpoints_dir = os.path.join(dumps_path, 'checkpoints')
    if os.path.isdir(checkpoints_dir):
        candidates += [
            os.path.join(checkpoints_dir, name) for name in sorted(os.listdir(checkpoints_dir))
        ]

    for dump_path in candidates:
        try:
            model = get_model(model_type, data, num_layers, dump_path)
        except FileNotFoundError:
            print(f'Not found {dump_path}')
            continue
        ll = -neg_log_likelihood(model, X_test_tensor)
        # Strict '>' keeps the earliest candidate on ties and skips NaN likelihoods.
        if ll > best_ll:
            best_ll = ll
            best_model = model
            best_dump = dump_path
    return best_model, best_ll, best_dump

In [79]:
! nvidia-smi

Fri Jun  5 20:41:10 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 440.64       Driver Version: 440.64       CUDA Version: 10.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|   0  GeForce GTX 108...  Off  | 00000000:01:00.0 Off |                  N/A |
| 20%   53C    P8    10W / 250W |   5711MiB / 11178MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
|   1  GeForce GTX 108...  Off  | 00000000:02:00.0 Off |                  N/A |
| 20%   45C    P8     9W / 250W |    161MiB / 11178MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
                                                                            

In [80]:
# Single-configuration run (BSDS300 / GLOW): pick the best checkpoint, calibrate it with a
# 1000-tree CatBoost classifier and record the result in `arr`.
# NOTE(review): `n`, `data` and `model` are module-level names here; helper functions defined
# in earlier cells may read them as globals, so do not rename them without checking.
# NOTE(review): this cell rebinds the module-level name `metrics` to the list returned by
# train_cb, replacing any metrics table built by another cell.
arr = []
for data_name in ('BSDS300', ):
    data = data_mapping[data_name]()
    dim = data.n_dims
    # Cap the number of rows used; the same cap `n` is applied to the test slice below.
    n = min(4000000, data.trn.x.shape[0])
    X_train_tensor = torch.from_numpy(data.trn.x[:n]).to(device)
    X_test_tensor = torch.from_numpy(data.tst.x[:n]).to(device)

    for model_type in ('GLOW', ):
        # SPLINE-AR dumps use 2 layers, every other architecture 5.
        num_layers = 2 if model_type == 'SPLINE-AR' else 5
        model_name = f"{model_type}_{num_layers}"

        dumps_path = f'dumps_20200605/{data_name}/{model_type}_{num_layers}_ind1'
        model, ll, dump_path = get_best_model(model_type, data, num_layers, dumps_path)
        
        model.eval()
        # The lambda looks `model` up when it is called, i.e. it always samples from the
        # model currently bound to that name.
        model.sample_n = lambda n: batched_sample(model, n)
        # Sampling and classifier scoring run on the CPU from here on.
        to_device(model, 'cpu')

        clf_ds_train, clf_ds_test, metrics = train_cb(model, X_train_tensor, X_test_tensor, [10000], [1000])
        
        arr.append({
            'data_name': data_name,
            'model_type': model_type,
            'll': ll,
            'metrics': metrics,
            'dump_path': dump_path,
        })
        # Prints: dataset, model, flow log-likelihood, [(calibrated ll, eps, perc), ...].
        print(data_name, model_type, ll, [(x['ll'], x['eps'], x['perc']) for x in metrics])

a
b
BSDS300 GLOW 152.59567260742188 [(152.87609124418796, 0.49, 0.6849422549530044)]


In [81]:
# Full sweep over all datasets and architectures: pick the best checkpoint of each run,
# calibrate it with a 5000-tree CatBoost classifier (clip 10000) and collect results in `arr`.
# NOTE(review): `n`, `data` and `model` are module-level names here; helper functions defined
# in earlier cells may read them as globals, so do not rename them without checking.
arr = []
for data_name in ('MINIBOONE', 'BSDS300', 'GAS', 'HEPMASS', 'POWER'):
    data = data_mapping[data_name]()
    dim = data.n_dims
    # Cap the number of rows used; the same cap `n` is applied to the test slice below.
    n = min(1000000, data.trn.x.shape[0])
    X_train_tensor = torch.from_numpy(data.trn.x[:n]).to(device)
    X_test_tensor = torch.from_numpy(data.tst.x[:n]).to(device)

    for model_type in ('GLOW', 'MAF', 'RealNVP', 'SPLINE-AR'):
        # SPLINE-AR dumps use 2 layers, every other architecture 5.
        num_layers = 2 if model_type == 'SPLINE-AR' else 5
        model_name = f"{model_type}_{num_layers}"

        dumps_path = f'dumps_20200605/{data_name}/{model_type}_{num_layers}_ind1'
        model, ll, dump_path = get_best_model(model_type, data, num_layers, dumps_path)
        
        model.eval()
        # The lambda looks `model` up when it is called, i.e. it always samples from the
        # model currently bound to that name.
        model.sample_n = lambda n: batched_sample(model, n)
        # Sampling and classifier scoring run on the CPU from here on.
        to_device(model, 'cpu')

        # Alternative (disabled): full grid over clips and tree counts.
#         clf_ds_train, clf_ds_test, metrics = train_cb(model, X_train_tensor, X_test_tensor, [10000, 2, 1.5], [1, 20, 100, 500, 1000, 5000])
        clf_ds_train, clf_ds_test, metrics_ = train_cb(model, X_train_tensor, X_test_tensor, [10000], [5000])
        
        arr.append({
            'data_name': data_name,
            'model_type': model_type,
            'll': ll,
            'metrics': metrics_,
            'dump_path': dump_path,
        })
        # Prints: dataset, model, flow log-likelihood, [(calibrated ll, eps), ...].
        print(data_name, model_type, ll, [(x['ll'], x['eps']) for x in metrics_])

a
b
MINIBOONE GLOW -14.054896354675293 [(-12.705804747625656, 0.02)]
a
b
MINIBOONE MAF -12.341054916381836 [(-11.098869323690618, 0.02)]
a
b
MINIBOONE RealNVP -13.369071960449219 [(-11.96748363173535, 0.03)]
Not found dumps_20200605/MINIBOONE/SPLINE-AR_2_ind1/final_model.checkpoint
a
b
MINIBOONE SPLINE-AR -18.08390235900879 [(-16.758931155987096, 0.03)]
a
b
BSDS300 GLOW 152.59567260742188 [(152.9087058825719, 0.49)]
a
b
BSDS300 MAF 146.6654052734375 [(147.87186315895826, 0.01)]
a
b
BSDS300 RealNVP 144.85086059570312 [(149.61812754833477, 0.05)]
Not found dumps_20200605/BSDS300/SPLINE-AR_2_ind1/final_model.checkpoint
a
b
BSDS300 SPLINE-AR 155.3041229248047 [(155.5172718495732, 0.01)]
a
b
GAS GLOW 9.409331321716309 [(10.312951525369805, 0.01)]
a
b
GAS MAF 7.933399677276611 [(9.274445711543388, 0.01)]
a
b
GAS RealNVP 8.754535675048828 [(9.958358646698514, 0.49)]
a
b
GAS SPLINE-AR 10.029468536376953 [(10.642059318755612, 0.01)]
a
b
HEPMASS GLOW -18.733779907226562 [(-18.01399387829084, 0.0

In [71]:
# Print calibrated log-likelihood, eps and perc of the unclipped ('inf') calibrated model
# for every (dataset, architecture) pair in the `metrics` table.
# NOTE(review): `y` is not defined in this cell. It is whatever the last loop over
# x['metrics'] left behind in another cell, so `y['iters']` silently selects the tree count
# of the most recently processed run -- replace with an explicit constant once confirmed.
for data_name in ('MINIBOONE', 'BSDS300', 'GAS', 'HEPMASS', 'POWER'):
    for model_type in ('GLOW', 'MAF', 'RealNVP', 'SPLINE-AR'):
        # Row key of the metrics table: (architecture, kind, CatBoost iterations, clip).
        model_name = (model_type, 'calibrated', y['iters'], 'inf', )
        eps = metrics[(data_name, 'eps', )][model_name]
        ll = metrics[(data_name, 'll', )][model_name]
        perc = metrics[(data_name, 'perc', )][model_name]
        
        print(data_name, model_type, ll, eps, perc)

MINIBOONE GLOW -12.710433792768075 0.49 0.9411102858708786
MINIBOONE MAF -11.107489330818982 0.49 0.9439379848399874
MINIBOONE RealNVP -11.954679347796157 0.49 0.923603221802306
MINIBOONE SPLINE-AR -16.750466825706305 0.49 0.945051835605533
BSDS300 GLOW 152.91626243047367 0.49 0.4198647855095282
BSDS300 MAF 147.87036413498066 0.49 0.9818233410821551
BSDS300 RealNVP 149.6272578739821 0.49 0.9439881981148065
BSDS300 SPLINE-AR 155.51475302450916 0.01 0.9963383496823432
GAS GLOW 10.313494415761932 0.49 0.24092418457280673
GAS MAF 9.272813639711524 0.49 0.9891715193004614
GAS RealNVP 9.961206730031982 0.01 0.993927210379924
GAS SPLINE-AR 10.639237580766473 0.01 0.9974078474341092
HEPMASS GLOW -18.012617169289598 0.01 0.9926114847842756
HEPMASS MAF -18.603928064005267 0.49 0.9713923517402725
HEPMASS RealNVP -18.302366353334303 0.01 0.9922052666571896
HEPMASS SPLINE-AR -15.945156065002017 0.01 0.9968283765374505
POWER GLOW 0.4617525650115884 0.01 0.9960127753167991
POWER MAF 0.435084807656458

In [72]:
# Emit LaTeX table rows comparing the plain flow with the unclipped ('inf') calibrated model:
# per architecture one row of flow log-likelihoods, one of calibrated log-likelihoods
# (+/- eps) and one of log overheads, with one column per dataset.
# Backslashes are written as '\\' throughout: '\p', '\s' and '\m' are not valid escape
# sequences and trigger a warning in non-raw string literals.
# NOTE(review): `y` is not defined in this cell; `y['iters']` relies on the loop variable
# left behind by the cell that builds `metrics`.
for model_type in ('RealNVP', 'MAF', 'GLOW', 'SPLINE-AR'):
    a = []
    for data_name in ('POWER', 'GAS', 'HEPMASS', 'MINIBOONE', 'BSDS300'):
        model_name = (model_type, 'normalizing flow', '', '', )
        ll_nf = metrics[(data_name, 'll', )][model_name]
        ll_nf = f'$ {str(round(ll_nf, 3))} $'

        model_name = (model_type, 'calibrated', y['iters'], 'inf', )
        eps = metrics[(data_name, 'eps', )][model_name]
        ll = metrics[(data_name, 'll', )][model_name]
        overhead = metrics[(data_name, 'log overhead', )][model_name]

        ll = '$ \\textbf{' + str(round(ll, 3)) + '} \\pm ' + str(eps) + ' $'
        overhead = str(round(overhead, 3))

        a.append([ll_nf, ll, overhead])
    # zip(*a) transposes per-dataset triples into the three table rows.
    for x, model_name in zip(zip(*a), [model_type, f'{model_type} $\\star$', f'{model_type} $\\star$ log overhead', ]):
        print(model_name + ' &')
        print(' & '.join(x) + ' \\\\')
    print()
    print('\\midrule[0em]')
    print()

RealNVP &
$ -0.608 $ & $ 8.755 $ & $ -19.193 $ & $ -13.369 $ & $ 144.851 $ \\
RealNVP $\star$ &
$ \textbf{0.375} \pm 0.01 $ & $ \textbf{9.961} \pm 0.01 $ & $ \textbf{-18.302} \pm 0.01 $ & $ \textbf{-11.955} \pm 0.49 $ & $ \textbf{149.627} \pm 0.49 $ \\
RealNVP $\star$ log overhead &
5.188 & 5.09 & 4.051 & 4.518 & 7.779 \\

\midrule[0em]

MAF &
$ 0.128 $ & $ 7.933 $ & $ -19.643 $ & $ -12.341 $ & $ 146.665 $ \\
MAF $\star$ &
$ \textbf{0.435} \pm 0.01 $ & $ \textbf{9.273} \pm 0.49 $ & $ \textbf{-18.604} \pm 0.49 $ & $ \textbf{-11.107} \pm 0.49 $ & $ \textbf{147.87} \pm 0.49 $ \\
MAF $\star$ log overhead &
3.625 & 5.515 & 5.319 & 3.947 & 5.24 \\

\midrule[0em]

GLOW &
$ 0.243 $ & $ 9.409 $ & $ -18.734 $ & $ -14.055 $ & $ 152.596 $ \\
GLOW $\star$ &
$ \textbf{0.462} \pm 0.01 $ & $ \textbf{10.313} \pm 0.49 $ & $ \textbf{-18.013} \pm 0.01 $ & $ \textbf{-12.71} \pm 0.49 $ & $ \textbf{152.916} \pm 0.49 $ \\
GLOW $\star$ log overhead &
4.027 & 7.846 & 3.891 & 4.523 & 7.746 \\

\midrule[0em]

SPL

In [55]:
# Emit LaTeX table rows comparing the plain flow with the calibrated model clipped at 1.5:
# per architecture one row of flow log-likelihoods, one of calibrated log-likelihoods
# (+/- eps) and one of log overheads, with one column per dataset.
# Backslashes are written as '\\' throughout: '\p', '\s' and '\m' are not valid escape
# sequences and trigger a warning in non-raw string literals.
# NOTE(review): `y` is not defined in this cell; `y['iters']` relies on the loop variable
# left behind by the cell that builds `metrics`.
for model_type in ('RealNVP', 'MAF', 'GLOW', 'SPLINE-AR'):
    a = []
    for data_name in ('POWER', 'GAS', 'HEPMASS', 'MINIBOONE', 'BSDS300'):
        model_name = (model_type, 'normalizing flow', '', '', )
        ll_nf = metrics[(data_name, 'll', )][model_name]
        ll_nf = f'$ {str(round(ll_nf, 3))} $'

        model_name = (model_type, 'calibrated', y['iters'], '1.5', )
        eps = metrics[(data_name, 'eps', )][model_name]
        ll = metrics[(data_name, 'll', )][model_name]
        overhead = metrics[(data_name, 'log overhead', )][model_name]

        ll = '$ \\textbf{' + str(round(ll, 3)) + '} \\pm ' + str(eps) + ' $'
        overhead = str(round(overhead, 3))

        a.append([ll_nf, ll, overhead])
    # zip(*a) transposes per-dataset triples into the three table rows.
    for x, model_name in zip(zip(*a), [model_type, f'{model_type} $\\star$', f'{model_type} $\\star$ log overhead', ]):
        print(model_name + ' &')
        print(' & '.join(x) + ' \\\\')
    print()
    print('\\midrule[0em]')
    print()

RealNVP &
$ -0.608 $ & $ 8.755 $ & $ -19.193 $ & $ -13.369 $ & $ 144.851 $ \\
RealNVP $\star$ &
$ \textbf{0.065} \pm 0.01 $ & $ \textbf{9.59} \pm 0.01 $ & $ \textbf{-18.536} \pm 0.01 $ & $ \textbf{-12.652} \pm 0.02 $ & $ \textbf{147.738} \pm 0.02 $ \\
RealNVP $\star$ log overhead &
0.956 & 0.993 & 0.797 & 0.914 & 3.075 \\

\midrule[0em]

MAF &
$ 0.128 $ & $ 7.933 $ & $ -19.643 $ & $ -12.341 $ & $ 146.665 $ \\
MAF $\star$ &
$ \textbf{0.374} \pm 0.01 $ & $ \textbf{8.806} \pm 0.01 $ & $ \textbf{-18.946} \pm 0.01 $ & $ \textbf{-11.728} \pm 0.02 $ & $ \textbf{147.144} \pm 0.01 $ \\
MAF $\star$ log overhead &
0.56 & 1.026 & 0.887 & 0.824 & 0.911 \\

\midrule[0em]

GLOW &
$ 0.243 $ & $ 9.409 $ & $ -18.734 $ & $ -14.055 $ & $ 152.596 $ \\
GLOW $\star$ &
$ \textbf{0.426} \pm 0.01 $ & $ \textbf{10.041} \pm 0.01 $ & $ \textbf{-18.213} \pm 0.01 $ & $ \textbf{-13.352} \pm 0.02 $ & $ \textbf{152.656} \pm 0.01 $ \\
GLOW $\star$ log overhead &
0.512 & 0.83 & 0.741 & 0.894 & 0.605 \\

\midrule[0em]

SP

In [None]:
# Print calibrated log-likelihood and eps of the unclipped ('inf') calibrated model for
# every (dataset, architecture) pair in the `metrics` table.
# NOTE(review): `y` is not defined in this cell; `y['iters']` relies on the loop variable
# left behind by the cell that builds `metrics` -- replace with an explicit constant.
for data_name in ('MINIBOONE', 'BSDS300', 'GAS', 'HEPMASS', 'POWER'):
    for model_type in ('GLOW', 'MAF', 'RealNVP', 'SPLINE-AR'):
        # Row key of the metrics table: (architecture, kind, CatBoost iterations, clip).
        model_name = (model_type, 'calibrated', y['iters'], 'inf', )
        eps = metrics[(data_name, 'eps', )][model_name]
        ll = metrics[(data_name, 'll', )][model_name]
        
        print(data_name, model_type, ll, eps)

In [70]:
# Restore the metrics table from IPython's output cache (the displayed result of cell 20).
# NOTE(review): `_20` only exists while that output is still cached in the running kernel;
# this line fails after a restart -- rebuild the table from `arr` instead.
metrics = _20

In [20]:
# Collect the per-run results in `arr` into one table:
#   columns -> (dataset, metric name)
#   rows    -> (architecture, 'normalizing flow' | 'calibrated', CatBoost iterations, clip)
# NOTE(review): the loop variables `x` and `y` stay defined after this cell and other cells
# read `y['iters']`, so they are intentionally not renamed here.
metrics = defaultdict(dict)
for x in arr:
    # Baseline row: the plain flow has no classifier, hence empty iteration/clip fields
    # and zero log overhead.
    model_name = (x['model_type'], 'normalizing flow', '', '', )
    metrics[(x['data_name'], 'll', )][model_name] = x['ll']
    metrics[(x['data_name'], 'log overhead', )][model_name] = 0
    for y in x['metrics']:
        # A clip of 10000 is reported as 'inf' (effectively unclipped).
        if y['clip'] == 10000:
            model_name = (x['model_type'], 'calibrated', y['iters'], 'inf', )
        else:
            model_name = (x['model_type'], 'calibrated', y['iters'], str(round(y['clip'], 2)), )
        metrics[(x['data_name'], 'll', )][model_name] = y['ll']
        metrics[(x['data_name'], 'log overhead', )][model_name] = y['overhead']
        metrics[(x['data_name'], 'AUC-ROC', )][model_name] = y['auc_roc']
        # The stored 'logloss' is negated here before it goes into the table.
        metrics[(x['data_name'], 'Logloss', )][model_name] = -y['logloss']
        metrics[(x['data_name'], 'eps', )][model_name] = y['eps']
        metrics[(x['data_name'], 'perc', )][model_name] = y['perc']
metrics = pd.DataFrame(metrics)
# Show the whole table: rows are bounded by shape[0], columns by shape[1]
# (the column limit was previously derived from the row count).
pd.set_option('display.max_rows', metrics.shape[0] + 1)
pd.set_option('display.max_columns', metrics.shape[1] + 1)
metrics

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,MINIBOONE,MINIBOONE,MINIBOONE,MINIBOONE,MINIBOONE,MINIBOONE,BSDS300,BSDS300,BSDS300,BSDS300,BSDS300,BSDS300,GAS,GAS,GAS,GAS,GAS,GAS,HEPMASS,HEPMASS,HEPMASS,HEPMASS,HEPMASS,HEPMASS,POWER,POWER,POWER,POWER,POWER,POWER
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,ll,log overhead,AUC-ROC,Logloss,eps,perc,ll,log overhead,AUC-ROC,Logloss,eps,perc,ll,log overhead,AUC-ROC,Logloss,eps,perc,ll,log overhead,AUC-ROC,Logloss,eps,perc,ll,log overhead,AUC-ROC,Logloss,eps,perc
GLOW,calibrated,1.0,1.5,-13.74246,0.289202,0.672418,0.597582,0.01,0.992134,152.438782,0.356008,0.541041,0.690453,0.01,0.997971,9.442208,0.416994,0.572542,0.684557,0.01,0.997823,-18.652043,0.095536,0.549436,0.662477,0.01,0.995201,0.263122,0.219922,0.557511,0.685335,0.01,0.997394
GLOW,calibrated,1.0,2.0,-13.714622,0.289202,0.672418,0.59251,0.01,0.992134,152.438782,0.356008,0.541041,0.690453,0.01,0.997971,9.442438,0.473729,0.572542,0.68451,0.01,0.997788,-18.652043,0.095536,0.549436,0.662477,0.01,0.995201,0.263122,0.219922,0.557511,0.685335,0.01,0.997394
GLOW,calibrated,1.0,inf,-13.386456,0.289202,0.672418,0.574111,0.01,0.992134,152.438782,0.356008,0.541041,0.690453,0.01,0.997971,9.442438,0.473729,0.572542,0.68451,0.01,0.997788,-18.652043,0.095536,0.549436,0.662477,0.01,0.995201,0.263122,0.219922,0.557511,0.685335,0.01,0.997394
GLOW,calibrated,20.0,1.5,-13.481016,0.790391,0.81185,0.53818,0.02,0.993487,152.493143,0.468366,0.602018,0.67775,0.01,0.99853,9.58358,0.497558,0.681686,0.648318,0.01,0.998535,-18.418748,0.544562,0.730353,0.599294,0.01,0.997696,0.348416,0.443315,0.636487,0.662079,0.01,0.998431
GLOW,calibrated,20.0,2.0,-13.414965,0.998542,0.821948,0.521299,0.02,0.992646,152.508464,0.732827,0.603522,0.674745,0.01,0.998122,9.602289,0.753426,0.683658,0.644534,0.01,0.998316,-18.385771,0.777669,0.736124,0.592153,0.01,0.997561,0.354326,0.717705,0.636906,0.660932,0.01,0.998327
GLOW,calibrated,20.0,inf,-12.951804,3.515324,0.830069,0.485862,0.49,0.968927,152.531294,3.080643,0.603887,0.671814,0.01,0.99639,9.613303,2.52049,0.683837,0.642892,0.01,0.997732,-18.363267,2.41301,0.736927,0.588625,0.01,0.996906,0.355348,1.464231,0.636911,0.660784,0.01,0.998238
GLOW,calibrated,100.0,1.5,-13.384208,0.882603,0.831458,0.518902,0.02,0.993417,152.558603,0.534022,0.65137,0.662284,0.01,0.998598,9.780492,0.646835,0.763885,0.598129,0.01,0.998654,-18.300681,0.677232,0.768347,0.572494,0.01,0.997878,0.39712,0.488375,0.661048,0.649074,0.01,0.998635
GLOW,calibrated,100.0,2.0,-13.312769,1.074738,0.841065,0.499419,0.02,0.99244,152.58872,0.789399,0.654692,0.656141,0.01,0.998306,9.831714,0.869612,0.770904,0.586496,0.01,0.998463,-18.253448,0.875468,0.777715,0.560409,0.01,0.997671,0.411208,0.73219,0.663446,0.646041,0.01,0.998508
GLOW,calibrated,100.0,inf,-12.65714,5.409341,0.852507,0.458337,0.49,0.786368,152.688994,6.477358,0.655937,0.646324,0.49,0.679897,9.904682,5.313637,0.773396,0.576002,0.49,0.982003,-18.171816,4.1286,0.782362,0.548176,0.01,0.993016,0.420347,3.052201,0.663781,0.644733,0.01,0.997735
GLOW,calibrated,500.0,1.5,-13.36707,0.880258,0.845358,0.514252,0.02,0.993322,152.629054,0.589631,0.693809,0.645412,0.01,0.998598,9.982615,0.793421,0.823565,0.549224,0.01,0.998642,-18.239569,0.723065,0.791625,0.558633,0.01,0.997871,0.420763,0.509258,0.673092,0.642913,0.01,0.998663


In [None]:
# Preview of the three-panel figure (log-likelihood, log overhead, log-loss versus the log
# of the number of CatBoost trees) for a single dataset / architecture.
# Create the output directory without failing when it already exists.
os.makedirs('plots', exist_ok=True)

# (clip key in the metrics table, line colour) pairs drawn on every panel.
clip_colors = (
    ('1.5', 'green'),
    ('2', 'blue'),
    ('inf', 'pink'),
)

# One-element tuples need the trailing comma: ('MINIBOONE') is just a string and the loop
# would iterate over its characters. Widen the tuples to
# ('MINIBOONE', 'BSDS300', 'GAS', 'HEPMASS', 'POWER') / ('GLOW', 'MAF', 'RealNVP', 'SPLINE-AR')
# to plot everything.
for data_name in ('MINIBOONE',):
    for model_type in ('RealNVP',):
        # Keep only integer tree counts from the index level.
        # NOTE(review): presumably this drops the '' placeholder of the baseline row -- confirm
        # the level really holds ints (not floats), otherwise `iters` ends up empty.
        iters = np.array([x for x in metrics[(data_name, 'll')][model_type]['calibrated'].index.levels[0] if isinstance(x, int)])
        log_iters = np.log(iters)

        fig, axs = plt.subplots(1, 3, figsize=(18, 4))

        # Panel 0: log-likelihood, with the plain flow as a horizontal reference line.
        nf_ll = metrics[(data_name, 'll')][model_type]['normalizing flow']
        axs[0].plot(log_iters, [nf_ll] * len(log_iters), label='Model likelihood', color='red')

        for clip, color in clip_colors:
            calibrated_lls = [metrics[(data_name, 'll')][model_type]['calibrated'][i][clip] for i in iters]
            axs[0].plot(log_iters, calibrated_lls, label=f'Calibrated model clip={clip}', color=color)

        axs[0].set_xlabel('Catboost log trees', fontsize=14)
        axs[0].set_ylabel('Log likelihood', fontsize=14)
        axs[0].legend()
        axs[0].grid()

        # Panel 1: log overhead; the plain flow has zero overhead.
        axs[1].plot(log_iters, [0] * len(log_iters), label='Model log overhead', color='red')

        for clip, color in clip_colors:
            calibrated_overheads = [metrics[(data_name, 'log overhead')][model_type]['calibrated'][i][clip] for i in iters]
            axs[1].plot(log_iters, calibrated_overheads, label=f'Calibrated model clip={clip}', color=color)

        axs[1].set_xlabel('Catboost log trees', fontsize=14)
        axs[1].set_ylabel('Log overhead', fontsize=14)
        axs[1].legend()
        axs[1].grid()

        # Panel 2: classifier log-loss on the test set.
        for clip, color in clip_colors:
            calibrated_logloss = [metrics[(data_name, 'Logloss')][model_type]['calibrated'][i][clip] for i in iters]
            axs[2].plot(log_iters, calibrated_logloss, label=f'Calibrated model clip={clip}', color=color)

        axs[2].set_xlabel('Catboost log trees', fontsize=14)
        axs[2].set_ylabel('Logloss', fontsize=14)
        axs[2].legend()
        axs[2].grid()

        # The middle panel carries the figure title.
        axs[1].set_title(f'{data_name} {model_type}')

#         plt.savefig(f'plots/{data_name}_{model_type}.png')

In [None]:
# Older variant of the metrics-table cell: same layout (columns = (dataset, metric),
# rows = (architecture, kind, CatBoost iterations, clip)) but without the 'eps' and 'perc'
# columns.
# NOTE(review): running this cell replaces `metrics`; cells that read the 'eps' / 'perc'
# columns will then fail with a KeyError.
# NOTE(review): the loop variables `x` and `y` stay defined after this cell and other cells
# read `y['iters']`.
metrics = defaultdict(dict)
for x in arr:
    # Baseline row: the plain flow has empty iteration/clip fields and zero log overhead.
    model_name = (x['model_type'], 'normalizing flow', '', '', )
    metrics[(x['data_name'], 'll', )][model_name] = x['ll']
    metrics[(x['data_name'], 'log overhead', )][model_name] = 0
    for y in x['metrics']:
        # A clip of 10000 is reported as 'inf' (effectively unclipped).
        if y['clip'] == 10000:
            model_name = (x['model_type'], 'calibrated', y['iters'], 'inf', )
        else:
            model_name = (x['model_type'], 'calibrated', y['iters'], str(round(y['clip'], 2)), )
        metrics[(x['data_name'], 'll', )][model_name] = y['ll']
        metrics[(x['data_name'], 'log overhead', )][model_name] = y['overhead']
        metrics[(x['data_name'], 'AUC-ROC', )][model_name] = y['auc_roc']
        # The stored 'logloss' is negated before it goes into the table.
        metrics[(x['data_name'], 'Logloss', )][model_name] = -y['logloss']
metrics = pd.DataFrame(metrics)
# Allow every row of the table to be displayed.
pd.set_option('display.max_rows', metrics.shape[0] + 1)
metrics

In [None]:
# Three-panel figure (log-likelihood, log overhead, log-loss versus the log of the number of
# CatBoost trees) for every dataset / architecture, saved to plots/<dataset>_<model>.png.
# Create the output directory without failing when it already exists.
os.makedirs('plots', exist_ok=True)

# (clip key in the metrics table, line colour) pairs drawn on every panel.
clip_colors = (
    ('1.5', 'green'),
    ('2', 'blue'),
    ('inf', 'pink'),
)

for data_name in ('MINIBOONE', 'BSDS300', 'GAS', 'HEPMASS', 'POWER'):
    for model_type in ('GLOW', 'MAF', 'RealNVP', 'SPLINE-AR'):
        # Keep only integer tree counts from the index level.
        # NOTE(review): presumably this drops the '' placeholder of the baseline row -- confirm
        # the level really holds ints (not floats), otherwise `iters` ends up empty.
        iters = np.array([x for x in metrics[(data_name, 'll')][model_type]['calibrated'].index.levels[0] if isinstance(x, int)])
        log_iters = np.log(iters)

        fig, axs = plt.subplots(1, 3, figsize=(18, 4))

        # Panel 0: log-likelihood, with the plain flow as a horizontal reference line.
        nf_ll = metrics[(data_name, 'll')][model_type]['normalizing flow']
        axs[0].plot(log_iters, [nf_ll] * len(log_iters), label='Model likelihood', color='red')

        for clip, color in clip_colors:
            calibrated_lls = [metrics[(data_name, 'll')][model_type]['calibrated'][i][clip] for i in iters]
            axs[0].plot(log_iters, calibrated_lls, label=f'Calibrated model clip={clip}', color=color)

        axs[0].set_xlabel('Catboost log trees', fontsize=14)
        axs[0].set_ylabel('Log likelihood', fontsize=14)
        axs[0].legend()
        axs[0].grid()

        # Panel 1: log overhead; the plain flow has zero overhead.
        axs[1].plot(log_iters, [0] * len(log_iters), label='Model log overhead', color='red')

        for clip, color in clip_colors:
            calibrated_overheads = [metrics[(data_name, 'log overhead')][model_type]['calibrated'][i][clip] for i in iters]
            axs[1].plot(log_iters, calibrated_overheads, label=f'Calibrated model clip={clip}', color=color)

        axs[1].set_xlabel('Catboost log trees', fontsize=14)
        axs[1].set_ylabel('Log overhead', fontsize=14)
        axs[1].legend()
        axs[1].grid()

        # Panel 2: classifier log-loss on the test set.
        for clip, color in clip_colors:
            calibrated_logloss = [metrics[(data_name, 'Logloss')][model_type]['calibrated'][i][clip] for i in iters]
            axs[2].plot(log_iters, calibrated_logloss, label=f'Calibrated model clip={clip}', color=color)

        axs[2].set_xlabel('Catboost log trees', fontsize=14)
        axs[2].set_ylabel('Logloss', fontsize=14)
        axs[2].legend()
        axs[2].grid()

        # The middle panel carries the figure title.
        axs[1].set_title(f'{data_name} {model_type}')

        plt.savefig(f'plots/{data_name}_{model_type}.png')

In [None]:
! ls plots