## Тестирование моделей аугментации на полной выборке данных

In [None]:
%matplotlib inline
import numpy as np
import pandas as pd
import sklearn
import warnings
# Suppress warnings matching the 'lbfgs failed to converge' message filter
# so that they do not flood the notebook output.
warnings.filterwarnings('ignore', 'lbfgs failed to converge*')

import aug_functions

In [None]:
# Configure matplotlib to render all text through LaTeX with a serif font.
import matplotlib
matplotlib.rcParams['text.usetex'] = True

import matplotlib.pyplot as plt
from matplotlib import rc

# LaTeX preamble passed to matplotlib: UTF-8 input, AMS symbol/font packages
# and the Russian babel option.
latex_preamble = '''
        \\usepackage[utf8]{inputenc}
        \\usepackage{amssymb}
        \\usepackage{amsfonts}
        \\usepackage[russian]{babel}'''

rc('font', family='serif')
rc('text', usetex=True)
rc('text.latex', preamble=latex_preamble)

In [None]:
from scipy.special import expit
from sklearn.metrics import log_loss
from sklearn.metrics import roc_auc_score
from sklearn.metrics import average_precision_score

In [None]:
# Accumulators shared by all test batches: predicted probabilities and the
# corresponding true labels (filled by the evaluation loop).
y_prob_all, y_test_all = [], []

In [None]:
# Augmentation model to evaluate; the evaluation loop accepts
# "GP", "NN_single", "FE" or "MLP" and raises ValueError for anything else.
augmentation_model = 'GP'

In [None]:
# Score the classifier `net` on the PLAsTiCC test-set batches 1..11.
#
# For every batch file: keep only rows with detected == 1, fit the selected
# augmentation model to each object's light curve, call model.augmentation()
# over the observed mjd range with aug_functions.N_OBS points, normalise the
# result with the training-set mean/std and append the predicted probability
# and the true binary label to y_prob_all / y_test_all.
#
# NOTE(review): tqdm_notebook, train_test_split, torch, TensorDataset,
# good_object_ids, X_train and net are not defined in the cells visible here;
# they must already exist in the kernel namespace.
meta_file_test = '../data/plasticc/plasticc_test_metadata.csv.gz'
metadata = pd.read_csv(meta_file_test)

# Build the object_id -> true_target lookup once. Filtering the whole metadata
# frame for every object costs a full scan per light curve. Keeping the first
# row per object_id matches the previous "first matching row" behaviour.
true_target_by_id = metadata.drop_duplicates("object_id").set_index("object_id")["true_target"]

# true_target values that are mapped to class 1; everything else is class 0.
positive_targets = (90, 67, 52)

# Resolve the augmentation class once instead of once per object, so that an
# unknown model name fails before any batch file is read.
if augmentation_model == "GP":
    import gp_aug
    augmentation_class = gp_aug.GaussianProcessesAugmentation
elif augmentation_model == "NN_single":
    import single_layer_aug
    augmentation_class = single_layer_aug.SingleLayerNetAugmentation
elif augmentation_model == "FE":
    import features_aug
    augmentation_class = features_aug.FeaturesEngineeringAugmentation
elif augmentation_model == "MLP":
    import mlp_reg_aug
    augmentation_class = mlp_reg_aug.MLPRegressionAugmentation
else:
    raise ValueError("Unknown augmentation type: {}".format(augmentation_model))

# Normalisation statistics do not change between batches.
train_mean = X_train.mean()
train_std = X_train.std()

for batch_number in range(1, 12):
    file = "../data/plasticc/PLAsTiCC-2018/test_set_batch{}.csv".format(batch_number)
    data = pd.read_csv(file)

    # .copy() so that the column added below is written to an independent
    # frame rather than to a slice of the raw one.
    data = data[data.detected == 1].copy()
    object_ids = np.unique(data.object_id)
    print(batch_number, object_ids.shape)

    # Column-wise lookup instead of a row-wise DataFrame.apply(axis=1).
    data["log_lam"] = data["passband"].map(lambda band: aug_functions.passband2lam[band])

    all_data = []
    all_target_classes = []

    for i in tqdm_notebook(object_ids):
        # NOTE(review): objects listed in good_object_ids are skipped, not
        # kept -- confirm this is the intended selection.
        if i in good_object_ids:
            continue

        anobject = aug_functions.get_object(data, i)

        # The model is fitted on a fixed 64% subsample of the observations;
        # the held-out 36% is not used in this cell.
        anobject_train = train_test_split(anobject, test_size=0.36, random_state=11)[0]

        # A fresh model per object: fit() is called on each light curve separately.
        model = augmentation_class(aug_functions.passband2lam)
        model.fit(anobject_train['mjd'].values, anobject_train['flux'].values,
                  anobject_train['flux_err'].values, anobject_train['passband'].values)

        t_aug, flux_aug, flux_err_aug, passbands_aug = model.augmentation(anobject['mjd'].min(),
                                                                          anobject['mjd'].max(),
                                                                          n_obs=aug_functions.N_OBS)

        # (N_PASSBANDS * N_OBS,) -> (N_OBS, N_PASSBANDS); the extra list level
        # adds a leading axis of size 1 per sample.
        data_array = flux_aug.reshape((aug_functions.N_PASSBANDS, aug_functions.N_OBS)).T
        all_data.append([data_array])
        true_class = 1 if int(true_target_by_id.loc[i]) in positive_targets else 0
        all_target_classes.append(true_class)

    if not all_data:
        # Every object of this batch was skipped: there is nothing to score,
        # and a shuffling DataLoader cannot be built over an empty dataset.
        print(batch_number, "no objects to evaluate")
        del data
        del object_ids
        del all_data
        del all_target_classes
        continue

    all_data = np.array(all_data)
    all_target_classes = np.array(all_target_classes)
    print(batch_number, all_data.shape, all_target_classes.shape)

    X_test_batch_norm = np.array((all_data - train_mean) / train_std, dtype=np.float32)
    X_test_batch_tensor = torch.from_numpy(X_test_batch_norm)

    y_test_batch_tensor = torch.from_numpy(np.array(all_target_classes, dtype=np.float32))

    test_batch_data = TensorDataset(X_test_batch_tensor, y_test_batch_tensor)
    # Shuffling is pointless at inference time and only makes the order of
    # y_prob_all / y_test_all non-reproducible. batch_size stays 1 because
    # .item() below requires a single-element output.
    test_batch_loader = torch.utils.data.DataLoader(test_batch_data, batch_size=1, shuffle=False, num_workers=2)

    # NOTE(review): net is assumed to already be in eval mode -- confirm.
    with torch.no_grad():
        for test_info in test_batch_loader:
            images, test_labels = test_info
            test_outputs = net(images)
            # The network output is passed through the logistic sigmoid to get a probability.
            prob = expit(test_outputs.item())

            y_prob_all.append(prob)
            y_test_all.append(test_labels.item())

    # Release the per-batch objects before the next batch file is loaded.
    del data
    del object_ids
    del all_data
    del all_target_classes
    del X_test_batch_norm
    del X_test_batch_tensor
    del y_test_batch_tensor
    del test_batch_data
    del test_batch_loader

In [None]:
# Hard 0/1 decisions at the 0.5 threshold (kept for any later use).
y_pred_all = np.array(y_prob_all) > 0.5
# Log loss is defined on predicted probabilities. Feeding it the thresholded
# decisions would only measure clipped 0/1 predictions, so the probabilities
# are passed instead.
print("LogLoss = %.4f" % log_loss(y_test_all, y_prob_all))

In [None]:
# Ranking metrics computed from the predicted probabilities of all batches.
roc_auc = roc_auc_score(y_test_all, y_prob_all)
pr_auc = average_precision_score(y_test_all, y_prob_all)
print("Test ROC-AUC: %.4f, test PR-AUC: %.4f" % (roc_auc, pr_auc))