In [1]:
import pandas as pd
import numpy as np

from sklearn.cross_validation import train_test_split
import sys
sys.path.append('utils/')
from evaluation import compute_ks, compute_cvm, roc_auc_truncated

import theano
import theano.tensor as T

import lasagne

SEED = 1
KS_THRESHOLD = 0.09
CVM_THRESHOLD = 0.002



# Preparing data

In [2]:
label_prediction = pd.read_csv('../datasets/training.csv.zip').drop('id', 1)
features = label_prediction.drop(['signal', 'mass', 'min_ANNmuon', 'production'], 1).columns
print(label_prediction.shape)
label_prediction.head()

(67553, 50)


Unnamed: 0,LifeTime,dira,FlightDistance,FlightDistanceError,IP,IPSig,VertexChi2,pt,DOCAone,DOCAtwo,...,p1_p,p2_p,p0_eta,p1_eta,p2_eta,SPDhits,production,signal,mass,min_ANNmuon
0,0.001578,0.999999,14.033335,0.681401,0.016039,0.451886,1.900433,1482.037476,0.066667,0.060602,...,12290.760742,39264.398438,3.076006,4.0038,4.031514,458,-99,0,1866.300049,0.277559
1,0.000988,0.999705,5.536157,0.302341,0.142163,9.564503,0.865666,3050.720703,0.024022,0.019245,...,16562.667969,7341.257812,3.228553,2.786543,2.975564,406,-99,0,1727.095947,0.225924
2,0.000877,0.999984,6.117302,0.276463,0.034746,1.970751,10.975849,3895.908691,0.055044,0.047947,...,22695.388672,10225.30957,3.536903,2.865686,3.05281,196,-99,0,1898.588013,0.36863
3,0.000854,0.999903,5.228067,0.220739,0.076389,4.271331,3.276358,4010.781738,0.053779,0.006417,...,16909.515625,9141.426758,3.087461,3.218034,2.375592,137,-99,0,1840.410034,0.246045
4,0.001129,0.999995,39.069534,1.898197,0.120936,4.984982,0.468348,4144.546875,0.004491,0.037326,...,97612.804688,47118.785156,4.632295,4.711155,4.296878,477,-99,0,1899.793945,0.22206


In [3]:
test = pd.read_csv('../datasets/test.csv.zip')
print(test.shape)

(855819, 47)


In [4]:
private_dataset = pd.merge(test, pd.read_csv('../datasets/private_eval.csv'), on='id').drop_duplicates()
print(private_dataset.shape)
private_dataset.head()

(28497, 48)


Unnamed: 0,id,LifeTime,dira,FlightDistance,FlightDistanceError,IP,IPSig,VertexChi2,pt,DOCAone,...,p1_pt,p2_pt,p0_p,p1_p,p2_p,p0_eta,p1_eta,p2_eta,SPDhits,signal
0,4369817,0.002109,0.999998,14.665833,0.134259,0.029963,2.091192,0.446515,6850.274414,0.008145,...,3163.742188,2060.64917,11679.78125,14723.319336,14744.435547,2.620276,2.219076,2.656073,353,1
1,10213012,0.000886,0.999992,9.180307,0.472137,0.037329,2.02565,7.784008,4024.888916,0.070422,...,1814.38147,1084.320923,22417.171875,20466.382812,18283.640625,3.456087,3.114216,3.51732,52,1
2,17806160,0.001313,0.999966,17.477598,1.382421,0.157643,6.020585,1.043844,1584.708984,0.010628,...,1422.916504,317.407227,12352.144531,59287.277344,6400.515137,3.913197,4.422689,3.69648,202,1
3,641074,0.000827,0.999993,5.873784,0.162675,0.02219,2.182303,4.583155,7745.390137,0.011685,...,2413.23584,4147.405762,6593.108887,15263.637695,20328.482422,2.289592,2.531343,2.272115,211,1
4,11383293,0.001149,0.999999,9.548421,0.38493,0.014353,0.766637,0.499251,2880.133545,0.017162,...,1645.024414,1098.795898,3348.355713,19929.757812,25797.041016,2.834224,3.185898,3.848738,251,1


In [5]:
public_dataset = pd.merge(test, pd.read_csv('../datasets/public_eval.csv'), on='id').drop_duplicates()
print(public_dataset.shape)
public_dataset.head()

(12302, 48)


Unnamed: 0,id,LifeTime,dira,FlightDistance,FlightDistanceError,IP,IPSig,VertexChi2,pt,DOCAone,...,p1_pt,p2_pt,p0_p,p1_p,p2_p,p0_eta,p1_eta,p2_eta,SPDhits,signal
0,12200521,0.001825,0.999989,8.118148,0.198013,0.038373,2.368293,3.995679,3088.533447,0.01694,...,1385.601074,612.74408,8325.249023,8831.709961,9389.864258,2.50815,2.539151,3.42152,433,1
1,2491177,0.000535,0.999934,8.240436,1.095218,0.083408,4.210687,8.162792,3235.502197,0.09586,...,1778.51355,352.61795,45959.210938,41672.800781,6638.411621,4.108746,3.846762,3.627684,369,0
2,14699668,0.00085,0.999996,28.220333,0.677022,0.07794,8.915274,0.035283,13247.209961,0.000482,...,4030.838379,6680.432617,38441.621094,60690.246094,88079.054688,3.380462,3.403851,3.270758,414,0
3,225705,0.001145,1.0,16.196651,0.395021,0.009561,0.717381,2.758028,5904.848633,0.000262,...,2531.938477,2261.802734,14468.316406,27498.492188,41622.230469,3.193202,3.076167,3.60488,149,1
4,4323792,0.001371,0.999999,18.812433,0.529717,0.029653,1.518456,4.937074,4474.023438,0.048851,...,2260.256836,1716.283325,8806.927734,28214.091797,44662.679688,3.456395,3.215882,3.951755,163,1


In [6]:
check_agreement = pd.read_csv('../datasets/check_agreement.csv.zip')

print(check_agreement.shape)
check_agreement.head()

(331147, 49)


Unnamed: 0,id,LifeTime,dira,FlightDistance,FlightDistanceError,IP,IPSig,VertexChi2,pt,DOCAone,...,p2_pt,p0_p,p1_p,p2_p,p0_eta,p1_eta,p2_eta,SPDhits,signal,weight
0,15347063,0.001451,0.999964,6.94503,0.229196,0.058117,2.961298,7.953543,2251.611816,0.082219,...,834.562378,10392.814453,6380.673828,15195.594727,2.666142,3.302978,3.594246,512,0,-0.307813
1,14383299,0.000679,0.999818,9.468235,0.517488,0.189683,14.41306,7.141451,10594.470703,0.007983,...,2861.309814,3174.356934,64480.023438,23134.953125,2.995265,2.834816,2.779366,552,0,-0.331421
2,7382797,0.003027,0.999847,13.280714,0.219291,0.231709,11.973175,4.77888,2502.196289,0.045085,...,932.128235,15219.761719,3921.181641,10180.791016,2.776633,3.204923,3.081832,318,0,-0.382215
3,6751065,0.00081,0.999998,5.166821,0.167886,0.011298,0.891142,5.528002,5097.813965,0.055115,...,2617.248291,4365.08252,13221.149414,24291.875,2.179345,2.769762,2.918251,290,0,1.465194
4,9439580,0.000706,0.999896,10.897236,0.284975,0.160511,16.36755,8.670339,20388.097656,0.015587,...,4763.682617,27463.011719,46903.394531,24241.628906,2.196114,2.262732,2.310401,45,0,-0.477084


In [7]:
check_correlation = pd.read_csv('../datasets/check_correlation.csv.zip')

print(check_correlation.shape)
check_correlation.head()

(5514, 48)


Unnamed: 0,id,LifeTime,dira,FlightDistance,FlightDistanceError,IP,IPSig,VertexChi2,pt,DOCAone,...,p1_pt,p2_pt,p0_p,p1_p,p2_p,p0_eta,p1_eta,p2_eta,SPDhits,mass
0,11120335,0.000703,0.999715,2.927074,0.214014,0.081302,4.259793,1.066585,3108.189941,0.010767,...,1294.450928,1073.97644,9274.671875,7963.914062,6712.897949,2.783731,2.50331,2.519349,280,1723.887939
1,11495369,0.000601,0.99995,15.849142,0.842973,0.182213,13.882857,5.780046,6858.264648,0.007574,...,3981.284912,1859.680542,22844.791016,109955.101562,37051.800781,3.660059,4.011287,3.684429,386,1926.284058
2,7098902,0.002009,0.999984,43.358494,1.323199,0.200158,12.870687,3.460782,3604.347412,0.052849,...,2564.43042,354.095032,45214.070312,77265.429688,12087.007812,4.458619,4.098382,4.223254,433,1830.873047
3,8103692,0.001268,0.99955,6.910733,0.198652,0.16773,8.559438,7.676139,3240.960449,0.101368,...,1178.332031,1285.70166,11245.551758,6770.969238,17003.119141,3.060873,2.434033,3.273807,245,1909.119019
4,10160864,0.001937,0.999996,36.135208,0.508036,0.09717,9.295684,0.42634,6448.445312,0.007005,...,3140.512207,385.343475,48171.457031,43973.835938,7368.522949,3.441606,3.331079,3.6433,489,1600.925049


In [8]:
Xt_train, Xt_test, yt_train, yt_test = train_test_split(label_prediction[features].values, label_prediction.signal.values, train_size=0.8)
domain_adaptation_random = pd.read_csv('../datasets/domain_adaptation_random.csv').sample(Xt_train.shape[0], random_state=SEED)
domain_adaptation_high_weight = pd.read_csv('../datasets/domain_adaptation_high_weight.csv').sample(Xt_train.shape[0], random_state=SEED)
Xd_1, yd_1 = domain_adaptation_high_weight[features].values, domain_adaptation_high_weight.domain.values
Xd_2, yd_2 = domain_adaptation_random[features].values, domain_adaptation_random.domain.values
Xm, ym = check_correlation.sample(Xt_train.shape[0], replace=True)[features].values, check_correlation.sample(Xt_train.shape[0], replace=True).mass.values

X_public, y_public = public_dataset[features].values, public_dataset.signal.values
X_private, y_private = private_dataset[features].values, private_dataset.signal.values

print(Xd_1.shape, yd_1.shape)
print(Xd_2.shape, yd_2.shape)
print(Xm.shape, ym.shape)
print(Xt_train.shape, yt_train.shape)
print(Xt_test.shape, yt_test.shape)
print(X_public.shape, y_public.shape)
print(X_private.shape, y_private.shape)

((54042L, 46L), (54042L,))
((54042L, 46L), (54042L,))
((54042L, 46L), (54042L,))
((54042L, 46L), (54042L,))
((13511L, 46L), (13511L,))
((12302L, 46L), (12302L,))
((28497L, 46L), (28497L,))


In [9]:
import random

def iterate_minibatches(X_train, y_train, batch_size=1000, shuffle=True):
    data = zip(X_train, y_train)
    if shuffle: random.shuffle(data)
    X_train_shuffled, y_train_shuffled = np.array(map(lambda x: x[0], data)), np.array(map(lambda x: x[1], data))
    for i in range(0, X_train_shuffled.shape[0], batch_size):
        yield X_train_shuffled[i:i+batch_size], y_train_shuffled[i:i+batch_size]
    

# Baseline architecture

In [10]:
from lasagne.layers import InputLayer, DenseLayer, DropoutLayer, BatchNormLayer
from lasagne.nonlinearities import softmax, tanh

input_var = T.matrix('input', dtype='float32')
target_var = T.ivector('signal')
lr = T.scalar('learning rate')

# Feature generator
feature_generator = InputLayer(shape=(None, 46), input_var=input_var)
feature_generator = BatchNormLayer(feature_generator)
feature_generator = DenseLayer(feature_generator, 70)
feature_generator = DenseLayer(feature_generator, 35)
feature_generator = DenseLayer(feature_generator, 20)
feature_generator = DropoutLayer(feature_generator, p=0.09)

# Target predictor
target_predictor = DenseLayer(feature_generator, 10)
target_predictor = DenseLayer(target_predictor, 7)
target_predictor = DenseLayer(target_predictor, 2, nonlinearity=softmax)

predictions = lasagne.layers.get_output(target_predictor)
params = lasagne.layers.get_all_params(target_predictor, trainable=True)
loss = lasagne.objectives.categorical_crossentropy(predictions, target_var)
loss = loss.mean()
acc = T.mean(T.eq(T.argmax(predictions, axis=1), target_var),
                  dtype=theano.config.floatX)
updates = lasagne.updates.rmsprop(loss, params, learning_rate=lr)
train_fn = theano.function([input_var, target_var, lr], [loss, acc], updates=updates, allow_input_downcast=True)
test_fn = theano.function([input_var], [predictions], allow_input_downcast=True)

In [11]:
def cross_validation(model, verbose=True):
    agreement_probs = model(check_agreement[features].values)[0][:, 1]
    correlation_probs = model(check_correlation[features].values)[0][:, 1]
    auc_holdout = roc_auc_truncated(yt_test, model(Xt_test)[0][:, 1])
    auc_public = roc_auc_truncated(y_public, model(X_public)[0][:, 1])
    auc_private = roc_auc_truncated(y_private, model(X_private)[0][:, 1])
    ks = compute_ks(agreement_probs[check_agreement['signal'].values == 0],
        agreement_probs[check_agreement['signal'].values == 1],
        check_agreement[check_agreement['signal'] == 0]['weight'].values,
        check_agreement[check_agreement['signal'] == 1]['weight'].values)
    cvm = compute_cvm(correlation_probs, check_correlation['mass'].values)
    ks_threshold = 0.09
    cvm_threshold = 0.002
    if verbose:
        print('AUC (holdout): %.3f' % auc_holdout)
        print('AUC (public leaderboard): %.3f' % auc_public)
        print('AUC (private leaderboard): %.3f' % auc_private)
        print('[%s] KS: %.2f (threshold %.2f)' % ('SUCCESS' if ks < ks_threshold else 'FAILED', ks, ks_threshold))
        print('[%s] CvM: %.4f (threshold %.3f)' % ('SUCCESS' if cvm < cvm_threshold else 'FAILED', cvm, cvm_threshold))
    return auc_holdout, auc_public, auc_private, ks, cvm

In [12]:
N_EPOCH = 20
lr = 1e-2
PATIENCE = 5

for epoch in range(N_EPOCH):
    if epoch and epoch % PATIENCE == 0: lr /= 10.0
    for batch_X, batch_y in iterate_minibatches(Xt_train, yt_train, batch_size=10):
        loss_value, _ = train_fn(batch_X, batch_y, lr)
    auc_holdout, auc_public, auc_private, ks, cvm = cross_validation(test_fn, verbose=False)
    print 'Epoch: %d\tloss: %.4f\tAUC: %.4f\tKS (threshold %.2f): %.4f\tCvM (threshold %.3f): %.4f' % (epoch, loss_value, auc_public, KS_THRESHOLD, ks, CVM_THRESHOLD, cvm)

cross_validation(test_fn)
pass

Epoch: 0	loss: 0.9659	AUC: 0.9764	KS (threshold 0.09): 0.2020	CvM (threshold 0.002): 0.0010
Epoch: 1	loss: 0.0903	AUC: 0.9776	KS (threshold 0.09): 0.1852	CvM (threshold 0.002): 0.0010
Epoch: 2	loss: 0.7781	AUC: 0.9803	KS (threshold 0.09): 0.1660	CvM (threshold 0.002): 0.0011
Epoch: 3	loss: 0.2660	AUC: 0.9809	KS (threshold 0.09): 0.1745	CvM (threshold 0.002): 0.0009
Epoch: 4	loss: 1.9133	AUC: 0.9794	KS (threshold 0.09): 0.1569	CvM (threshold 0.002): 0.0010
Epoch: 5	loss: 1.2263	AUC: 0.9809	KS (threshold 0.09): 0.1696	CvM (threshold 0.002): 0.0011
Epoch: 6	loss: 0.2237	AUC: 0.9822	KS (threshold 0.09): 0.1737	CvM (threshold 0.002): 0.0011
Epoch: 7	loss: 0.3760	AUC: 0.9799	KS (threshold 0.09): 0.1751	CvM (threshold 0.002): 0.0011
Epoch: 8	loss: 0.2009	AUC: 0.9801	KS (threshold 0.09): 0.1795	CvM (threshold 0.002): 0.0011
Epoch: 9	loss: 0.1735	AUC: 0.9803	KS (threshold 0.09): 0.1963	CvM (threshold 0.002): 0.0011
Epoch: 10	loss: 0.0894	AUC: 0.9819	KS (threshold 0.09): 0.1903	CvM (threshold 0.

Now let's implement loss from https://arxiv.org/pdf/1608.04802.pdf (Formula 6) for truncated AUC from https://www.kaggle.com/c/flavours-of-physics#evaluation
$$min_{f, b_1, ..., b_n}max_{\lambda_1...\lambda_n}\Sigma_{t=1}^{k}w_t\Delta_t\big((1 + \lambda_t)\Lambda^+(f, b_t) + \lambda_t{{\alpha_t}\over{1-\alpha_t}}\Lambda^-(f, b_t) - \lambda|Y^+|\big)$$

In out case

$k = 5$, $\alpha_t=0.4 + 0.1*t$, $\Delta_t=0.1$, $w_t=0.5*(t-1)$

$\Lambda^+(f, b_t)=\Sigma_{x\in X}ReLU((b_t - f(x))*y)=Hinge_{x\in X^+}f(b_t)$ ($y \in \{0, 1\}$ или $y \in \{-1, 1\}$)

$\Lambda^-(f, b_t)=\Sigma_{x\in X}ReLU((f(x) - b_y)*(1 - y))=Hinge_{x\in X^-}f(b_t)$ ($y \in \{0, 1\}$ или $y \in \{-1, 1\}$)

There is no $w_t$ in the article since article's AUC is not truncated

In [20]:
from theano.tensor.nnet import relu

def trunc_auc_loss(preds, target, b, l, a, w, d):
    """
    @param preds - predictions (f(x))
    @param target - target label (y)
    @param b - list of b from https://arxiv.org/pdf/1608.04802.pdf (Formula 6)
    @param a - list of \alpha from https://arxiv.org/pdf/1608.04802.pdf (Formula 6)
    @param w - list of weights of regions for truncated AUC
    @param l - list of \lambda from https://arxiv.org/pdf/1608.04802.pdf (Formula 6)
    @param d - list of \Delta from https://arxiv.org/pdf/1608.04802.pdf (Formula 6)
    """
    assert len(b) == len(l) == len(a) == len(w), 'lists of variables must have the same shape'
    loss = 0
    Y_plus = relu(target).mean()
    for b_, l_, a_, w_ in zip(b, l, a, w):
#         l_plus = relu((b_ - preds).T * target).mean()
#         l_minus = relu((preds - b_).T * (1 - target)).mean()
        l_plus = ((b_ - preds).T * target).mean()
        l_minus = ((preds - b_).T * (1 - target)).mean()
        loss += w_ * d * ((1 + l_) * l_plus + l_ * (a_/(1.0 - a_)) * l_minus - l_ * Y_plus).mean()

    return loss.mean()

def cross_validation(model, verbose=True):
    agreement_probs = model(check_agreement[features].values)[0][:, 0]
    correlation_probs = model(check_correlation[features].values)[0][:, 0]
    auc_holdout = roc_auc_truncated(yt_test, model(Xt_test)[0][:, 0])
    auc_public = roc_auc_truncated(y_public, model(X_public)[0][:, 0])
    auc_private = roc_auc_truncated(y_private, model(X_private)[0][:, 0])
    ks = compute_ks(agreement_probs[check_agreement['signal'].values == 0],
        agreement_probs[check_agreement['signal'].values == 1],
        check_agreement[check_agreement['signal'] == 0]['weight'].values,
        check_agreement[check_agreement['signal'] == 1]['weight'].values)
    cvm = compute_cvm(correlation_probs, check_correlation['mass'].values)
    ks_threshold = 0.09
    cvm_threshold = 0.002
    if verbose:
        print('AUC (holdout): %.3f' % auc_holdout)
        print('AUC (public leaderboard): %.3f' % auc_public)
        print('AUC (private leaderboard): %.3f' % auc_private)
        print('[%s] KS: %.2f (threshold %.2f)' % ('SUCCESS' if ks < ks_threshold else 'FAILED', ks, ks_threshold))
        print('[%s] CvM: %.4f (threshold %.3f)' % ('SUCCESS' if cvm < cvm_threshold else 'FAILED', cvm, cvm_threshold))
    return auc_holdout, auc_public, auc_private, ks, cvm

pass

In [25]:
from lasagne.layers import InputLayer, DenseLayer, DropoutLayer, BatchNormLayer
from lasagne.nonlinearities import sigmoid, tanh

input_var = T.matrix('input', dtype='float32')
target_var = T.ivector('signal')
lr = T.scalar('learning rate')

# Feature generator
feature_generator = InputLayer(shape=(None, 46), input_var=input_var)
feature_generator = BatchNormLayer(feature_generator)
feature_generator = DenseLayer(feature_generator, 70)
feature_generator = DenseLayer(feature_generator, 35)
feature_generator = DenseLayer(feature_generator, 20)
feature_generator = DropoutLayer(feature_generator, p=0.09)

# Target predictor
target_predictor = DenseLayer(feature_generator, 10)
target_predictor = DenseLayer(target_predictor, 7, nonlinearity=tanh)
target_predictor = DenseLayer(target_predictor, 1, nonlinearity=sigmoid)

predictions = lasagne.layers.get_output(target_predictor)
params = lasagne.layers.get_all_params(target_predictor, trainable=True)
b = [lasagne.utils.create_param(np.array([0.5]), [1], name='b_%d' % i) for i in range(1, 6)]
l = [lasagne.utils.create_param(np.array([0.5]), [1], name='lambda_%d' % i) for i in range(1, 6)]
a = [0.5, 0.6, 0.7, 0.8, 0.9]
w = [0.0, 0.5, 1.0, 1.5, 2.0]
d = 0.1
loss = trunc_auc_loss(predictions, target_var, b, l, a, w, d)
acc = T.mean(T.eq(T.argmax(predictions, axis=1), target_var),
                  dtype=theano.config.floatX)

updates_l = lasagne.updates.rmsprop(-loss, l, learning_rate=lr) # maximize
updates_b = lasagne.updates.rmsprop(loss, b, learning_rate=lr)
updates_nn = lasagne.updates.rmsprop(loss, params, learning_rate=lr)

train_l = theano.function([input_var, target_var, lr], [], updates=updates_l, allow_input_downcast=True)
train_b = theano.function([input_var, target_var, lr], [], updates=updates_b, allow_input_downcast=True)
train_nn = theano.function([input_var, target_var, lr], [loss, acc], updates=updates_nn, allow_input_downcast=True)

def train(X, y, lr):
    train_l(X, y, lr)
    train_b(X, y, lr)
    loss_value, acc_value = train_nn(X, y, lr)
    return loss_value, acc_value

test_fn = theano.function([input_var], [predictions], allow_input_downcast=True)

In [26]:
N_EPOCH = 20
lr = 1e-4
PATIENCE = 5

for epoch in range(N_EPOCH):
    if epoch and epoch % PATIENCE == 0: lr /= 10.0
    for batch_X, batch_y in iterate_minibatches(Xt_train, yt_train, batch_size=10):
        loss_value, _ = train(batch_X, batch_y, lr)
    auc_holdout, auc_public, auc_private, ks, cvm = cross_validation(test_fn, verbose=False)
    print 'Epoch: %d\tloss: %.4f\tAUC: %.4f\tKS (threshold %.2f): %.4f\tCvM (threshold %.3f): %.4f' % (epoch, loss_value, auc_public, KS_THRESHOLD, ks, CVM_THRESHOLD, cvm)

cross_validation(test_fn)
pass

Epoch: 0	loss: 0.0504	AUC: 0.9574	KS (threshold 0.09): 0.0725	CvM (threshold 0.002): 0.0008
Epoch: 1	loss: -0.5323	AUC: 0.9658	KS (threshold 0.09): 0.0839	CvM (threshold 0.002): 0.0008
Epoch: 2	loss: -0.2862	AUC: 0.9714	KS (threshold 0.09): 0.0870	CvM (threshold 0.002): 0.0009
Epoch: 3	loss: -0.3813	AUC: 0.9733	KS (threshold 0.09): 0.0793	CvM (threshold 0.002): 0.0010
Epoch: 4	loss: -0.5068	AUC: 0.9763	KS (threshold 0.09): 0.0808	CvM (threshold 0.002): 0.0010
Epoch: 5	loss: -0.3081	AUC: 0.9766	KS (threshold 0.09): 0.0751	CvM (threshold 0.002): 0.0010
Epoch: 6	loss: -0.6036	AUC: 0.9765	KS (threshold 0.09): 0.0794	CvM (threshold 0.002): 0.0011
Epoch: 7	loss: -0.4329	AUC: 0.9763	KS (threshold 0.09): 0.0805	CvM (threshold 0.002): 0.0011
Epoch: 8	loss: -0.5009	AUC: 0.9770	KS (threshold 0.09): 0.0713	CvM (threshold 0.002): 0.0012
Epoch: 9	loss: -0.2474	AUC: 0.9768	KS (threshold 0.09): 0.0724	CvM (threshold 0.002): 0.0011
Epoch: 10	loss: -0.4062	AUC: 0.9756	KS (threshold 0.09): 0.0662	CvM (th