In [1]:
# %%
from pathlib import Path
import pandas as pd
import numpy as np
from custom_keras.helpers_keras import prepare_labels, build_keras_model, prepare_X
from active_gru.my_active_learner import UncertaintyGru
# from numpy.random import seed
from tensorflow.keras import backend as K
import tensorflow as tf
import seaborn as sns
import matplotlib.pyplot as plt
import os
import pickle
import re
from datetime import datetime
import sys
import time
from scipy.stats import pearsonr

Set label here:

In [2]:
# Target to model. Selects the `y_<LABEL>` column of the label CSVs and is
# embedded in the file names of the saved plots.
LABEL = 'valence'

In [3]:
# %%
# Timestamp that keeps the history files of separate runs apart.
# NOTE(review): the ':' in this format is not a valid file-name character on
# Windows -- confirm the target platform before relying on these names there.
dateTimeObj = datetime.now()
time_str = dateTimeObj.strftime("%H:%M_%d_%b_%y")

# Seed for comparable results.
# NOTE(review): only TensorFlow is seeded in this cell.
tf.random.set_seed(22)

# --- Constants ---
INDEX_COLS = ['participant', 'sequence', 'sample']

# Input data: labelled set, unlabelled pool and held-out test set.
_DATA_DIR = Path('AVEC2016')
PATH_X_LABELLED = _DATA_DIR / 'x_labelled.csv'
PATH_Y_LABELLED = _DATA_DIR / 'y_labelled.csv'
PATH_X_POOL = _DATA_DIR / 'x_pool.csv'
PATH_Y_POOL = _DATA_DIR / 'y_pool.csv'
PATH_X_TEST = _DATA_DIR / 'x_test.csv'
PATH_Y_TEST = _DATA_DIR / 'y_test.csv'

# Experiment artefacts. The weights path stays a plain string, the history
# paths are Path objects stamped with the start time of this run.
_EXPERIMENT_DIR = Path('experiment')
PATH_INITIAL_WEIGHTS = os.path.join('experiment', 'init_weights_experiment.h5')
PATH_HISTORY_RANDOM = _EXPERIMENT_DIR / ('hist_rand_{}.pkl'.format(time_str))
PATH_HISTORY_ACTIVE = _EXPERIMENT_DIR / ('hist_active_{}.pkl'.format(time_str))
PATH_HISTORY_UNCER = _EXPERIMENT_DIR / ('hist_uncer_{}.pkl'.format(time_str))

# Template for plot files; the four placeholders are filled where figures are
# saved (label, plot kind, number of queried sequences, data split).
PATH_PLOTS = 'pictures/{}_{}_{}_querries_x_{}.png'

SEQUENCE_LENGTH = 375
SEQ_PER_QUERY = 1

In [4]:
# %% global variables
def _read_indexed(csv_path):
    """Read one data CSV using INDEX_COLS as the (multi-)index."""
    return pd.read_csv(csv_path, index_col=INDEX_COLS)


# Features and labels for the labelled set, the pool and the test set.
X_labelled = _read_indexed(PATH_X_LABELLED)
y_labelled = _read_indexed(PATH_Y_LABELLED)
X_pool = _read_indexed(PATH_X_POOL)
y_pool = _read_indexed(PATH_Y_POOL)
X_test = _read_indexed(PATH_X_TEST)
y_test = _read_indexed(PATH_Y_TEST)

# Number of video / audio feature columns, identified by their column names.
N_FEATURES_VID = X_labelled.filter(regex='vid', axis=1).shape[-1]
N_FEATURES_AUD = X_labelled.filter(regex='aud', axis=1).shape[-1]
pool_size = 25

# Keep only the column of the selected target; from here on each y_* is a
# Series rather than a DataFrame.
_target_column = 'y_%s' % LABEL
y_labelled = y_labelled[_target_column]
y_pool = y_pool[_target_column]
y_test = y_test[_target_column]

In [5]:
# Sanity check: selecting the single `y_<LABEL>` column leaves a 1-D Series.
y_labelled.shape

(6000,)

In [6]:
def load_pickle(path: str):
    """Load and return the object pickled at `path`.

    Parameters
    ----------
    path : str or path-like
        Location of the pickle file.

    Returns
    -------
    object
        Whatever object was stored in the file.

    Notes
    -----
    The file is opened in a ``with`` block so the handle is closed as soon as
    loading finishes, also when unpickling raises.

    SECURITY: unpickling can execute arbitrary code. Only load files written
    by this notebook or another trusted source.
    """
    with open(path, 'rb') as file:
        return pickle.load(file)


def dump_pickle(history: object, path: str):
    """Pickle `history` to `path`, overwriting any existing file.

    Parameters
    ----------
    history : object
        Any picklable object (e.g. a training-history dict).
    path : str or path-like
        Destination file.

    Notes
    -----
    The file is opened in a ``with`` block so the data is flushed and the
    handle closed before the function returns, instead of whenever the
    interpreter happens to finalise the file object.
    """
    with open(path, 'wb') as file:
        pickle.dump(history, file)

In [7]:
def comp_mc_drop(x, n_pool, model_mc_d, model_def, sequence_length, t=100):
    """Predict on `x` with `t` passes of `model_mc_d` and one pass of `model_def`.

    Parameters
    ----------
    x
        Features handed to `prepare_X` for the 'aud' and 'vid' inputs.
    n_pool : int
        Divisor applied to the prepared time axis to get the number of
        predictions per sequence.
    model_mc_d
        Model queried `t` times (the MC-dropout passes).
    model_def
        Model queried once for the regular prediction.
    sequence_length : int
        Sequence length forwarded to `prepare_X`.
    t : int
        Number of MC passes.

    Returns
    -------
    predictions_mc : np.ndarray
        Array of shape (sequences, predictions per sequence, t); slice
        ``[:, :, j]`` holds the j-th pass.
    pred
        Raw output of ``model_def.predict``.

    Note the order of the returned pair: MC predictions first, regular
    prediction second.
    """
    audio_3d = prepare_X(x, 'aud', sequence_length)
    video_3d = prepare_X(x, 'vid', sequence_length)
    model_inputs = [audio_3d, video_3d]

    n_sequences, n_steps = audio_3d.shape[0], audio_3d.shape[1]
    predictions_mc = np.zeros((n_sequences, n_steps // n_pool, t))
    for pass_idx in range(t):
        # The model output carries a trailing axis of size 1; drop it so the
        # pass fits into its slice of the result array.
        mc_pass = model_mc_d.predict(model_inputs)
        predictions_mc[:, :, pass_idx] = mc_pass.squeeze(axis=-1)

    pred = model_def.predict(model_inputs)
    return predictions_mc, pred

In [8]:
def plot_pred_vs_label(y, pred, cols=2, figsize=(20, 200)):
    """Plot label and model output over time, one panel per sequence.

    Parameters
    ----------
    y : array-like, first axis = sequences
        Labels; ``y[k]`` is plotted as the label curve of sequence k.
    pred : array-like, first axis = sequences
        Predictions; ``pred[k]`` is plotted next to ``y[k]``.
    cols : int, default 2
        Number of panel columns.
    figsize : tuple, default (20, 200)
        Figure size in inches, forwarded to ``plt.subplots``.

    Returns
    -------
    matplotlib.figure.Figure

    Notes
    -----
    The grid is sized with a ceiling division so that every sequence gets a
    panel. Previously, an odd number of sequences silently dropped the last
    one and an empty extra row was always drawn. Panels that are not needed
    are hidden.
    """
    n_sequences = y.shape[0]
    # Ceiling division; at least one row so plt.subplots gets a valid grid.
    n_rows = max(1, -(-n_sequences // cols))
    # squeeze=False keeps `ax` two-dimensional even for a single row/column.
    fig, ax = plt.subplots(n_rows, cols, figsize=figsize, squeeze=False)
    panels = ax.ravel()  # row-major: seq 0 -> (0, 0), seq 1 -> (0, 1), ...

    for counter in range(n_sequences):
        panel = panels[counter]
        panel.plot(y[counter], label='label')
        panel.plot(pred[counter], label='prediction')
        panel.set(title='seq_{}'.format(counter),
                  xlabel='time_step',
                  ylabel='label/prediction')
        panel.set_ylim([-0.5, 0.5])
        panel.legend()

    # Hide the panels left over when n_sequences is not a multiple of cols.
    for unused in panels[n_sequences:]:
        unused.set_visible(False)

    fig.tight_layout()
    return fig

In [9]:
def comp_stats(eval_x_labelled=True):
    """Compute per-prediction statistics of the learner on one data split.

    Parameters
    ----------
    eval_x_labelled : bool, default True
        If True use the learner's labelled set (``x_labelled``/``y_labelled``),
        otherwise its pool (``x_pool``/``y_pool``).

    Returns
    -------
    stats_df : pd.DataFrame
        One row per prediction with the float columns 'label', 'pred',
        'pred_mc' (mean over MC passes), 'var_mc' (variance over MC passes),
        'error_mc' (|label - pred_mc|) and 'error_default' (|label - pred|).
    y : np.ndarray
        Labels after `prepare_labels`, trailing axis removed.
    pred : np.ndarray
        Prediction of the regular model, trailing axis removed.
    preds_mc_flat : np.ndarray
        MC predictions reshaped to (number of rows in stats_df, passes).

    Notes
    -----
    Reads the notebook-level `uncert_learner`, `SEQUENCE_LENGTH` and
    `pool_size`.
    NOTE(review): labels are prepared with the notebook-level `pool_size`
    while `comp_mc_drop` receives `uncert_learner.n_pool`; these are
    presumably equal -- confirm.
    """
    if eval_x_labelled:
        x_chosen = uncert_learner.x_labelled
        y_chosen = uncert_learner.y_labelled
    else:
        x_chosen = uncert_learner.x_pool
        y_chosen = uncert_learner.y_pool

    preds_mc, pred = comp_mc_drop(x_chosen, uncert_learner.n_pool,
                                  uncert_learner.model_dropout_test,
                                  uncert_learner.model, SEQUENCE_LENGTH)
    pred_mc = preds_mc.mean(axis=-1)
    var = preds_mc.var(axis=-1)
    # Drop only the trailing axis (as comp_mc_drop does for the MC passes and
    # as is done for the labels below). A bare squeeze() would also remove the
    # sequence axis when the split holds a single sequence.
    pred = pred.squeeze(axis=-1)
    y = prepare_labels(y_chosen, SEQUENCE_LENGTH, pool_size).squeeze(axis=-1)
    error_mc = np.abs(y - pred_mc)
    error_def = np.abs(y - pred)

    column_values = {
        'label': y,
        'pred': pred,
        'pred_mc': pred_mc,
        'var_mc': var,
        'error_mc': error_mc,
        'error_default': error_def,
    }
    stats_df = pd.DataFrame(
        data=np.column_stack(
            [values.reshape(-1) for values in column_values.values()]),
        columns=list(column_values))
    # Builtin float instead of the np.float alias, which was removed from
    # NumPy (it was only ever an alias for the builtin).
    stats_df = stats_df.astype(float)
    return stats_df, y, pred, preds_mc.reshape(-1, preds_mc.shape[-1])

In [10]:
def plot_mc(df_stats, preds_mc, label='arousal'):
    """Plot the distribution of the MC-dropout passes for six selected rows.

    Top row: the rows of `df_stats` with the lowest, approximately median and
    highest 'error_mc'. Bottom row: the rows with the minimum, approximately
    median and maximum 'label'. Each panel shows the distribution of
    ``preds_mc[row]`` plus vertical lines at that row's 'pred_mc' and 'label'.

    Parameters
    ----------
    df_stats : pd.DataFrame
        Needs the columns 'error_mc', 'label' and 'pred_mc'.
        Assumes a default RangeIndex, so that index labels and row positions
        coincide -- TODO confirm for other callers.
    preds_mc : array-like
        Row k holds the stochastic predictions belonging to row k of df_stats.
    label : str
        Target name used in the legend.

    Returns
    -------
    matplotlib.figure.Figure
    """
    def _min_median_max(column):
        """Rows holding the smallest, middle and largest value of `column`."""
        values = df_stats[column]
        # Element at the centre of the sort order: exact median for odd
        # lengths, upper-middle element for even lengths.
        middle = values.to_numpy().argsort()[values.shape[0] // 2]
        return [values.idxmin(), middle, values.idxmax()]

    indices = [_min_median_max('error_mc'), _min_median_max('label')]
    titles = [
        ['lowest prediction error', 'approx. median prediction error',
         'highest prediction error'],
        ['minimum label', 'approx. median label', 'maximum label'],
    ]

    fig, axes = plt.subplots(2, 3, figsize=(20, 8))
    for axes_row, index_row, title_row in zip(axes, indices, titles):
        for panel, row_id, title in zip(axes_row, index_row, title_row):
            sns.distplot(preds_mc[row_id],
                         ax=panel,
                         color='green',
                         label='MC dropout')
            panel.axvline(x=df_stats.loc[row_id, 'pred_mc'],
                          label='MC prediction')
            panel.axvline(x=df_stats.loc[row_id, 'label'],
                          label='label {}'.format(label),
                          color='orange')
            panel.legend()
            panel.set(title=title,
                      xlabel='predicted label',
                      ylabel='density')
    fig.tight_layout()
    return fig

In [11]:
def scatter_plots_mc(df_stats, label='arousal'):
    """Draw four overview panels of the MC-dropout statistics.

    Panels, left to right: distribution of 'var_mc'; 'var_mc' vs. 'error_mc';
    'pred_mc' vs. 'label'; 'label' vs. 'error_mc'.

    Parameters
    ----------
    df_stats : pd.DataFrame
        Needs the columns 'var_mc', 'error_mc', 'pred_mc' and 'label'.
    label : str
        Target name used in the axis labels.

    Returns
    -------
    matplotlib.figure.Figure
    """
    fig, axes = plt.subplots(1, 4, figsize=(15, 5))
    ax_var, ax_var_err, ax_pred_label, ax_label_err = axes
    label_text = 'label {}'.format(label)

    sns.distplot(df_stats['var_mc'], ax=ax_var)
    ax_var.set(title='histogram variance MC dropout',
               ylabel='density',
               xlabel='variance MC dropout')

    sns.scatterplot(data=df_stats, x='var_mc', y='error_mc', ax=ax_var_err)
    ax_var_err.set(title='variance MC dropout vs. prediction error',
                   xlabel='variance MC dropout',
                   ylabel='prediction error')
    ax_var_err.set_xlim(left=-0.005, auto=True)

    sns.scatterplot(data=df_stats, x='pred_mc', y='label', ax=ax_pred_label)
    ax_pred_label.set(title='prediction vs. true label',
                      xlabel='prediction MC',
                      ylabel=label_text)

    sns.scatterplot(data=df_stats, x='label', y='error_mc', ax=ax_label_err)
    # The title has no placeholder, so it is used as a plain string.
    ax_label_err.set(title='label vs. prediction error',
                     ylabel='prediction error',
                     xlabel=label_text)

    fig.tight_layout()
    return fig

# Model Uncertainty

In [12]:
# Hyper-parameters shared by both networks, so the two can exchange weights.
_model_kwargs = dict(
    pool_size=pool_size,
    n_neurons_gru=64,
    n_neurons_hid_aud=44,
    n_neurons_hid_vid=100,
    dropout_rate=0.38,
    rec_dropout_rate=0.04,
    rec_l2=0,
    ker_l2=0,
)

# Regular model.
model_uncert = build_keras_model(SEQUENCE_LENGTH, N_FEATURES_AUD,
                                 N_FEATURES_VID, **_model_kwargs)

# Same configuration built with training_mode=True.
# NOTE(review): presumably this keeps dropout active at prediction time for
# the MC-dropout passes -- confirm in build_keras_model.
model_uncert_drop = build_keras_model(SEQUENCE_LENGTH, N_FEATURES_AUD,
                                      N_FEATURES_VID, training_mode=True,
                                      **_model_kwargs)

# New uncertainty learner holding pool, test and labelled data plus both
# models.
uncert_learner = UncertaintyGru(
    X_pool, y_pool,
    model_uncert, model_uncert_drop,
    SEQUENCE_LENGTH, pool_size,
    X_test, y_test,
    X_labelled, y_labelled,
    t=100,
)

# Store the initial weights of the learner's model and load them into its
# dropout model, so both start from the same parameters.
uncert_learner.model.save_weights(PATH_INITIAL_WEIGHTS)
uncert_learner.model_dropout_test.load_weights(PATH_INITIAL_WEIGHTS)

In [13]:
# Initial fit on x_labelled at t_0, i.e. before any sequence has been queried
# from the pool.
uncert_learner.train_x_labelled(epochs=40)

In [23]:
# Names of the values the model reports on evaluation; the columns of the
# loss table built further down follow this order.
uncert_learner.model.metrics_names

['loss', 'mae', 'neg_ccc']

Plot details for different numbers of queries:

In [15]:
# File-name template for the MC-prediction figures; the placeholders are
# filled with the label name and the number of sequences queried so far.
PATH_PLOTS_MC = 'pictures/{}_mc_predictions_{}_querries.png'

Perform active learning and plot statistics at different stages, as X_labelled grows and X_pool shrinks.

In [16]:
# Active-learning schedule (formerly inline magic numbers).
N_QUERY_ROUNDS = 6      # query / retrain iterations
SEQS_PER_ROUND = 20     # sequences requested from the learner per iteration
EPOCHS_PER_ROUND = 8    # training epochs after each query

# savefig() does not create missing directories, so make sure they exist.
for _template in (PATH_PLOTS, PATH_PLOTS_MC):
    os.makedirs(os.path.dirname(_template) or '.', exist_ok=True)


def _evaluate_split(x, y):
    """Evaluate the learner's regular model on (x, y); returns model.evaluate's result."""
    y_3d = prepare_labels(y, uncert_learner.sequence_length,
                          uncert_learner.n_pool)
    x_aud_3d = prepare_X(x, 'aud', uncert_learner.sequence_length)
    x_vid_3d = prepare_X(x, 'vid', uncert_learner.sequence_length)
    return uncert_learner.model.evaluate([x_aud_3d, x_vid_3d], y_3d,
                                         verbose=0)


# One entry per round: metrics on the labelled set followed by the same
# metrics on the pool.
losses = []
for _ in range(N_QUERY_ROUNDS):
    # Query new sequences (the returned data is not needed here), retrain and
    # evaluate on the test set.
    uncert_learner.query_sequences(SEQS_PER_ROUND)
    uncert_learner.train_x_labelled(epochs=EPOCHS_PER_ROUND)
    uncert_learner.evaluate_on_test_set()
    n_queried = uncert_learner.queried_seq_tot

    # Statistics on the labelled set. The label arrays get their own names:
    # rebinding the notebook-level `y_labelled` / `y_pool` Series here would
    # break a re-run of the cell that builds the learner.
    stats_x_labelled, y_labelled_arr, pred_y_labelled, _ = comp_stats()

    # Over-/under-fitting check: metrics on the labelled set vs. the pool.
    loss_l = _evaluate_split(uncert_learner.x_labelled,
                             uncert_learner.y_labelled)
    loss_p = _evaluate_split(uncert_learner.x_pool, uncert_learner.y_pool)
    losses.append(loss_l + loss_p)

    # Labelled set: general statistics and prediction-vs-label curves.
    fig_stats_labelled = scatter_plots_mc(stats_x_labelled, label=LABEL)
    fig_stats_labelled.savefig(
        PATH_PLOTS.format(LABEL, 'stats', n_queried, 'labelled'))
    pred_vs_label_lab = plot_pred_vs_label(y_labelled_arr, pred_y_labelled)
    pred_vs_label_lab.savefig(
        PATH_PLOTS.format(LABEL, 'preds', n_queried, 'labelled'))

    # Pool: general statistics and prediction-vs-label curves.
    stats_x_pool, y_pool_arr, pred_y_pool, preds_mc_pool = comp_stats(
        eval_x_labelled=False)
    fig_stats_pool = scatter_plots_mc(stats_x_pool, label=LABEL)
    fig_stats_pool.savefig(
        PATH_PLOTS.format(LABEL, 'stats', n_queried, 'pool'))
    pred_vs_label_pool = plot_pred_vs_label(y_pool_arr, pred_y_pool)
    pred_vs_label_pool.savefig(
        PATH_PLOTS.format(LABEL, 'preds', n_queried, 'pool'))

    # Pool: lower-level view of individual MC prediction distributions.
    fig_mc_pool = plot_mc(stats_x_pool, preds_mc_pool, label=LABEL)
    fig_mc_pool.savefig(PATH_PLOTS_MC.format(LABEL, n_queried))

    # Correlation between MC variance and the regular model's error on the
    # pool. The message no longer contains the stray '\b' (backspace) escape.
    print('queried seqs: {}'.format(n_queried))
    r, p = pearsonr(stats_x_pool['error_default'], stats_x_pool['var_mc'])
    print('Correlation MC var - prediction error: pearsons r : {}, p: {}'.
          format(r, p))

    # Release all figures of this round to keep memory bounded.
    plt.close(fig='all')

queried seqs: 20
Correlation MC var - predicition errorr pearsons r : 0.004034555724689486, p: 0.8234652140589979
queried seqs: 40
Correlation MC var - predicition errorr pearsons r : 0.042111169297716244, p: 0.026944630181244482
queried seqs: 60
Correlation MC var - predicition errorr pearsons r : 0.03361129029301091, p: 0.0955761826717641
queried seqs: 80
Correlation MC var - predicition errorr pearsons r : 0.017956041066238014, p: 0.40422150861000095
queried seqs: 100
Correlation MC var - predicition errorr pearsons r : 0.04915234777234231, p: 0.03403264171495935
queried seqs: 120
Correlation MC var - predicition errorr pearsons r : 0.07240887820920641, p: 0.004217936499216891


In [17]:
# One row per query round: the three metrics on the labelled set followed by
# the same three metrics on the pool.
_loss_columns = [
    'labelled loss', 'labelled_mae', 'labelled_neg_ccc',
    'pool loss', 'pool_mae', 'pool_neg_ccc',
]
df_losses = pd.DataFrame(data=np.array(losses), columns=_loss_columns)

In [18]:
# Show the per-round metrics collected during the active-learning loop.
df_losses

Unnamed: 0,labelled loss,labelled_mae,labelled_neg_ccc,pool loss,pool_mae,pool_neg_ccc
0,0.077992,0.036188,0.093411,0.865974,0.105983,0.874507
1,0.026977,0.021966,0.027851,0.745825,0.103358,0.75059
2,0.164709,0.065802,0.19319,0.785723,0.101852,0.804433
3,0.015479,0.017504,0.015479,0.766385,0.097193,0.760919
4,0.183821,0.067742,0.188213,0.824194,0.096883,0.824195
5,0.013129,0.016164,0.014382,0.776075,0.091644,0.823511


In [20]:
# Read the pool 'neg_ccc' of the last round from df_losses instead of a value
# copied by hand from the printed table, so the cell stays correct on re-runs.
# NOTE(review): 1 - neg_ccc is presumably the CCC itself -- confirm against
# the definition of the metric.
1 - df_losses['pool_neg_ccc'].iloc[-1]

0.176489