# Setup

In [None]:
# Run configuration. Several of these constants are not referenced anywhere in
# the visible part of this notebook (DEV, DATA, N_SEEDS, N_THREADS, VERBOSE,
# PREV_LEVEL_GROUPS); they are presumably kept for parity with sibling notebooks.

# Name of the sub-directory under ../models/ that receives the exported artefacts.
VERSION = 'pspfgp-48-nn-pretrain'
# The single fold trained and exported by this run; FOLD == 0 also resets the export directory.
FOLD = 2
DEV = False
DATA = 'ALL'  # allowed values: 'COMP' or 'ALL'
# Number of bags iterated by the training and export loops.
N_BAGS = 1
# Only appears in the commented-out "all folds" loops below.
N_FOLDS = 5
N_SEEDS = 1
# Passed as d_model to ConvNet in the pretraining loop.
D_MODEL = 24
# Batch size of the tf.data pipelines.
BATCH_SIZE = 128
# When True, scipy's expit/logit are imported further down.
EXPIT = True
# Seed applied to random, numpy and TensorFlow.
SEED = 0
# Index written to CUDA_VISIBLE_DEVICES.
GPU = 0
N_THREADS = 8
VERBOSE = True
# Keys used throughout for per-level-group models, inputs and targets.
LEVEL_GROUPS = ['0-4', '5-12', '13-22']
PREV_LEVEL_GROUPS = False
# Input sequence length per level group (used as the Keras Input shape).
LENGTHS = {'0-4': 600, '5-12': 1400, '13-22': 2000}

In [None]:
DISCRETE_FEATURES = ['room_fqid', 'event_name_name', 'text', 'fqid']
CONTINUOUS_FEATURES = ['duration']
FEATURES = DISCRETE_FEATURES + CONTINUOUS_FEATURES

In [None]:
from IPython.display import clear_output

In [None]:
from IPython.display import display, HTML
display(HTML('<style>td{white-space: nowrap !important;}</style>'))

In [None]:
import os
os.environ['CUDA_VISIBLE_DEVICES'] = str(GPU)
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

In [None]:
import warnings
warnings.filterwarnings('ignore')

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
import pandas as pd
import pickle
from tqdm.notebook import tqdm
import tensorflow as tf
import time

In [None]:
if EXPIT:
    from scipy.special import expit, logit

In [None]:
tf.get_logger().setLevel('WARNING')

In [None]:
import random

# Seed every random number generator used by the notebook with the same SEED.
# NOTE(review): PYTHONHASHSEED is normally read at interpreter start-up, so setting
# it here presumably only affects child processes - confirm.
os.environ['PYTHONHASHSEED'] = str(SEED)
random.seed(SEED)
np.random.seed(SEED)
tf.random.set_seed(SEED)
# NOTE(review): both TF_* variables are set after tensorflow has been imported;
# confirm they are still picked up at this point.
os.environ['TF_DETERMINISTIC_OPS'] = '1'
os.environ['TF_GPU_ALLOCATOR'] = 'cuda_malloc_async'  # TF will not use all memory

In [None]:
pd.set_option("display.max_columns", 200)
pd.set_option("display.max_rows", 250)

# Utils

In [None]:
def get_questions(level_group):
    """Return the question numbers that belong to ``level_group``.

    '0-4' maps to questions 1-3 and '5-12' to questions 4-13; any other value
    falls through to questions 14-18.
    """
    if level_group == '0-4':
        return [1, 2, 3]
    if level_group == '5-12':
        return list(range(4, 14))
    return list(range(14, 19))

In [None]:
def optimize_threshold(preds, labels, step_size=0.001):
    """Grid-search the single decision threshold that maximises ``fast_f1_score``.

    Args:
        preds: DataFrame of predicted scores (one column per question).
        labels: DataFrame of 0/1 targets laid out like ``preds``.
        step_size: spacing of the candidate thresholds in [0, 1).

    Returns:
        The best threshold rounded to 3 decimals, or 0 when no candidate
        achieves a score strictly greater than 0.
    """
    # Flattening the frames does not depend on the threshold, so do it once
    # instead of once per candidate.
    flat_preds = preds.melt().drop('variable', axis=1).values
    flat_labels = labels.melt().drop('variable', axis=1).values

    best_score = 0
    best_threshold = 0
    for threshold in np.arange(0, 1, step_size):
        binarized_preds = (flat_preds > threshold).astype('int')
        score = fast_f1_score(flat_labels, binarized_preds)
        # Strict comparison: ties keep the lowest threshold, NaN scores are ignored.
        if score > best_score:
            best_score = score
            best_threshold = np.round(threshold, 3)
    return best_threshold

In [None]:
def fast_f1_score(labels, preds):
    """Closed-form F1 score for binary (0/1) ``labels`` and ``preds`` arrays.

    Only the fractions of true positives and true negatives are needed; the
    expression is algebraically the mean of the positive-class F1 and the
    negative-class F1. The result is NaN when the denominator is zero
    (e.g. every element is a true negative).
    """
    agreement = preds + labels
    tp_rate = (agreement == 2).mean()  # both are 1
    tn_rate = (agreement == 0).mean()  # both are 0
    error_rate = 1 - tn_rate - tp_rate
    imbalance = tn_rate - tp_rate
    return 1 - error_rate / (1 - imbalance ** 2)

In [None]:
def score_questions(preds, labels, questions, thr):
    """Score thresholded predictions per question and overall.

    Args:
        preds: DataFrame with one ``q{n}`` column of scores per question.
        labels: DataFrame with the matching ``q{n}`` columns of 0/1 targets.
        questions: iterable of 1-based question numbers to score.
        thr: a single threshold, or a list indexed by ``question - 1``.

    Returns:
        dict mapping ``'q{n}'`` to its score (rounded to 5 decimals) plus an
        ``'overall'`` entry when more than one question is scored.
    """
    questions = list(questions)
    cols = [f'q{question}' for question in questions]
    per_question_thr = isinstance(thr, list)

    scores = dict()
    preds_binarized = preds.copy()
    for question, col in zip(questions, cols):
        # Look the threshold up directly instead of materialising a fixed
        # 18-element list, so question numbers are not capped at 18.
        threshold = thr[question - 1] if per_question_thr else thr
        preds_binarized[col] = (preds[col].values > threshold).astype('int')
        score = fast_f1_score(labels[col].values, preds_binarized[col].values)
        scores[col] = np.round(score, decimals=5)
    if len(questions) > 1:
        # Restrict both frames to the scored columns, in the same order, so that
        # un-binarized or unrelated columns never leak into the overall score.
        score = fast_f1_score(labels[cols].melt().drop('variable', axis=1).values,
                              preds_binarized[cols].melt().drop('variable', axis=1).values)
        scores['overall'] = np.round(score, decimals=5)
    return scores

In [None]:
def tokenize(df, feats):
    """Replace the values of the ``feats`` columns by positive integer ids.

    Ids start at 1 in order of first appearance; 0 is left free for padding
    (missing values, which ``pd.factorize`` codes as -1, therefore become 0).

    Returns:
        (df_out, tokenizer_map) where ``df_out`` is a tokenized copy of ``df``
        and ``tokenizer_map[feat]`` holds ``'encode'`` (list whose position is
        the id, with ``'<PAD>'`` at 0) and ``'decode'`` (dict id -> value).
    """
    tokenizer_map = {}
    df_out = df.copy()
    for feat in feats:
        codes, uniques = pd.factorize(df[feat])
        df_out[feat] = codes + 1
        tokenizer_map[feat] = {
            'encode': ['<PAD>', *uniques],
            'decode': dict(enumerate(uniques, start=1)),
        }
    return df_out, tokenizer_map

In [None]:
def compute_score(y_pred, y_true, threshold=None):
    """Assemble per-level-group predictions into one frame and score them.

    Args:
        y_pred: mapping level group -> 2-D array of predictions; the arrays are
            stacked horizontally in ``LEVEL_GROUPS`` order to form q1..q18.
        y_true: DataFrame of 0/1 targets whose index labels the rows.
        threshold: decision threshold; optimised on the data when ``None``.

    Returns:
        (scores, df, threshold): the score dict (with an extra ``'thr'`` entry),
        the raw prediction frame and the threshold that was used.
    """
    cols = [f'q{q}' for q in range(1, 18 + 1)]
    # Build the frame in one go rather than enlarging an empty frame through
    # ``.loc`` with a list of not-yet-existing columns.
    df = pd.DataFrame(
        np.hstack([y_pred[level_group] for level_group in LEVEL_GROUPS]),
        index=y_true.index,
        columns=cols,
    )
    if threshold is None:
        threshold = optimize_threshold(df, y_true)
    # Pass predictions and labels in the slots score_questions documents; it
    # applies the threshold itself. (The metric is symmetric, so the scores are
    # the same as with the previously swapped arguments.)
    scores = score_questions(df, y_true, range(1, 18 + 1), threshold)
    scores['thr'] = threshold
    return scores, df, threshold

In [None]:
def build_sequence(df, n_features, length):
    """Return ``df`` as an array with exactly ``length`` rows.

    Longer frames are truncated to their first ``length`` rows; shorter ones
    are padded at the end with rows of zeros of width ``n_features``.
    """
    n_rows = len(df)
    if n_rows >= length:
        return df[:length].values
    padding = np.zeros((length - n_rows, n_features))
    return np.vstack([df.values, padding])

In [None]:
def extract_targets(dataset):
    """Collect the targets of every batch of ``dataset`` into one 2-D array.

    Each element of ``dataset`` is indexed as ``batch[1][level_group]``. Targets
    are stacked vertically across batches and horizontally across
    ``LEVEL_GROUPS``.
    """
    # Iterate the dataset exactly once: re-iterating per level group repeats the
    # work and, if the dataset reshuffles on every pass, would put the columns
    # of different level groups in different row orders.
    batches = list(dataset)
    targets = [
        np.vstack([batch[1][level_group] for batch in batches])
        for level_group in LEVEL_GROUPS
    ]
    return np.hstack(targets)

In [None]:
def extract_dataset(data, session_ids):
    """Gather model inputs and targets for the given sessions.

    ``data[session_id]`` is indexed as ``[0]`` for the feature sequence and
    ``[1]`` for the target. The stacked feature array is split along its last
    axis into one entry per name in ``FEATURES``.

    Returns:
        (x, y): ``x`` maps feature name -> 2-D slice, ``y`` is the target array.
    """
    stacked = np.array([data[sid][0] for sid in session_ids])
    x = {}
    for position, feature in enumerate(FEATURES):
        x[feature] = stacked[:, :, position]
    y = np.array([data[sid][1] for sid in session_ids])
    return x, y

# Data

In [None]:
%%time
METADATA = pickle.load(open(f'../final/METADATA.pkl', 'rb'))
dataset = pickle.load(open(f'../data/processed/dataset.pkl', 'rb'))
tokenizer_map = pickle.load(open(f'../data/processed/tokenizer_map.pkl', 'rb'))
pretrain_datasets = pickle.load(open(f'../data/processed/pretrain_datasets.pkl', 'rb'))
Y = pickle.load(open(f'../data/processed/Y.pkl', 'rb'))

# Model

## Architecture

In [None]:
class ConvBlock(tf.keras.layers.Layer):
    """Conv1D (kernel 5, GELU) with a skip connection, layer norm and dropout."""

    def __init__(self, d_model, dropout_rate):
        super().__init__()
        self.conv1d = tf.keras.layers.Conv1D(
            d_model, kernel_size=5, padding='same', activation='gelu')
        self.layer_norm = tf.keras.layers.LayerNormalization()
        self.dropout = tf.keras.layers.Dropout(rate=dropout_rate)

    def call(self, inputs):
        # Skip connection: the input is added back onto the convolution output.
        residual = self.conv1d(inputs) + inputs
        normalized = self.layer_norm(residual)
        return self.dropout(normalized)

In [None]:
class TimeEmbedding(tf.keras.layers.Layer):
    """Embed a scalar sequence by running it through a stack of ConvBlocks."""

    def __init__(self, n_blocks, d_model, dropout_rate):
        super().__init__()
        blocks = []
        for _ in range(n_blocks):
            blocks.append(ConvBlock(d_model, dropout_rate=dropout_rate))
        self.conv_blocks = blocks

    def call(self, inputs):
        # Append a trailing axis so the convolutions see one input channel.
        features = tf.expand_dims(inputs, axis=-1)
        for block in self.conv_blocks:
            features = block(features)
        return features

In [None]:
class ConvNet(tf.keras.Model):
    """Sequence encoder: summed categorical embeddings gated by a duration embedding.

    ``input_dims`` supplies the vocabulary size of each categorical feature.
    ``n_outputs`` is stored for ``get_config`` but is not used by any layer here.
    """

    def __init__(self, input_dims, n_outputs, d_model, n_blocks=4, name=None):
        super().__init__(name=name)
        # Constructor arguments are kept so get_config() can echo them back.
        self.input_dims = input_dims
        self.n_outputs = n_outputs
        self.d_model = d_model
        self.n_blocks = n_blocks

        embedding = tf.keras.layers.Embedding
        self.event_embedding = embedding(input_dims['event_name_name'], d_model, mask_zero=True)
        self.room_embedding = embedding(input_dims['room_fqid'], d_model, mask_zero=True)
        self.text_embedding = embedding(input_dims['text'], d_model, mask_zero=True)
        self.fqid_embedding = embedding(input_dims['fqid'], d_model, mask_zero=True)
        self.duration_embedding = TimeEmbedding(n_blocks=n_blocks, d_model=d_model, dropout_rate=0.2)
        self.gap = tf.keras.layers.GlobalAveragePooling1D()

    def call(self, inputs):
        # ``inputs`` is a dict keyed by feature name.
        event_vec = self.event_embedding(inputs['event_name_name'])
        room_vec = self.room_embedding(inputs['room_fqid'])
        text_vec = self.text_embedding(inputs['text'])
        fqid_vec = self.fqid_embedding(inputs['fqid'])
        duration_vec = self.duration_embedding(inputs['duration'])
        token_sum = event_vec + room_vec + text_vec + fqid_vec
        # The duration embedding scales the summed token embeddings element-wise
        # before the sequence is averaged into a single vector.
        return self.gap(duration_vec * token_sum)

    def get_config(self):
        """Return the base config extended with this model's constructor arguments."""
        return {
            **super().get_config(),
            'input_dims': self.input_dims,
            'n_outputs': self.n_outputs,
            'd_model': self.d_model,
            'n_blocks': self.n_blocks,
            'name': self._name,
        }

    @classmethod
    def from_config(cls, config):
        """Rebuild the model by passing ``config`` straight to the constructor."""
        return cls(**config)

In [None]:
class SimpleHead(tf.keras.Model):
    """MLP head: GELU dense layers of widths ``n_units`` then a sigmoid output layer."""

    def __init__(self, n_units, n_outputs, name=None):
        super().__init__(name=name)
        hidden_layers = []
        for width in n_units:
            hidden_layers.append(tf.keras.layers.Dense(width, activation='gelu'))
        self.ffs = hidden_layers
        self.out = tf.keras.layers.Dense(n_outputs, activation='sigmoid')

    def call(self, inputs):
        hidden = inputs
        for layer in self.ffs:
            hidden = layer(hidden)
        return self.out(hidden)

In [None]:
def build_convnet(input_dims, level_group, bag, fold, d_model=None):
    """Create the ConvNet encoder for one level group / bag / fold.

    Args:
        input_dims: mapping level group -> per-feature vocabulary sizes.
        level_group: one of ``LEVEL_GROUPS``.
        bag, fold: indices used only to build the model name.
        d_model: embedding width; defaults to the notebook-level ``D_MODEL``.

    Returns:
        A ``ConvNet`` named ``convnet_{level_group}_b{bag}f{fold}``.
    """
    name = f'convnet_{level_group.replace("-", "_")}_b{bag}f{fold}'
    n_outputs = len(get_questions(level_group))
    if d_model is None:
        d_model = D_MODEL
    # ConvNet requires d_model; omitting it raised a TypeError.
    convnet = ConvNet(input_dims[level_group], n_outputs, d_model=d_model, name=name)
    return convnet

In [None]:
def build_head(level_group, bag, fold):
    """Create the prediction head for one level group / bag / fold.

    The head has hidden widths 512, 128 and 32 and one output per question of
    ``level_group``; it is named ``head_{level_group}_b{bag}f{fold}``.
    """
    group_tag = level_group.replace("-", "_")
    return SimpleHead(
        n_units=[512, 128, 32],
        n_outputs=len(get_questions(level_group)),
        name=f'head_{group_tag}_b{bag}f{fold}',
    )

In [None]:
def build_model(convnets, heads, trainable=False):
    """Wire the per-level-group encoders and heads into one functional model.

    Args:
        convnets: mapping level group -> encoder model.
        heads: mapping level group -> head model.
        trainable: value assigned to every encoder's ``trainable`` flag.

    Returns:
        A ``tf.keras.Model`` named ``pspfgp_model`` whose inputs are nested dicts
        (level group -> feature -> Input) and whose outputs are keyed by level
        group. Later heads also see the encodings of the earlier level groups.
    """
    inputs = {}
    for level_group in LEVEL_GROUPS:
        inputs[level_group] = {}
        for feature in FEATURES:
            name = f'input_{feature}_{level_group.replace("-", "_")}'
            # ``shape`` is given as an explicit 1-tuple (note the trailing comma);
            # ``(n)`` is just the int ``n`` and depends on Keras coercing it.
            inputs[level_group][feature] = tf.keras.Input(shape=(LENGTHS[level_group],), name=name)

    for level_group in LEVEL_GROUPS:
        convnets[level_group].trainable = trainable

    convnet_outputs = {level_group: convnets[level_group](inputs[level_group]) for level_group in LEVEL_GROUPS}

    outputs = {}
    outputs['0-4'] = heads['0-4'](convnet_outputs['0-4'])
    # Each later head receives the concatenated encodings of all level groups so far.
    outputs['5-12'] = heads['5-12'](
        tf.keras.layers.Concatenate(name='concat_5_12')(
            [convnet_outputs['0-4'], convnet_outputs['5-12']]))
    outputs['13-22'] = heads['13-22'](
        tf.keras.layers.Concatenate(name='concat_13_22')(
            [convnet_outputs['0-4'], convnet_outputs['5-12'], convnet_outputs['13-22']]))

    model = tf.keras.Model(inputs=inputs, outputs=outputs, name='pspfgp_model')

    return model

In [None]:
class PreTrainingModel(tf.keras.Model):
    """Chain an encoder (``convnet``) and a ``head`` into one trainable model."""

    def __init__(self, convnet, head):
        super().__init__()
        self.convnet = convnet
        self.head = head

    def call(self, inputs):
        encoded = self.convnet(inputs)
        return self.head(encoded)

## Build

In [None]:
# Per-feature embedding input sizes: the largest value of each discrete feature plus one.
# The expression does not depend on level_group, so every level group gets an equal mapping.
# NOTE(review): assumes `dataset` is a DataFrame of integer token ids - confirm against the preprocessing notebook.
input_dims = {level_group: (dataset[DISCRETE_FEATURES].max() + 1).T.to_dict() for level_group in LEVEL_GROUPS}

In [None]:
input_dims

## Pretrain

In [None]:
class LogCallback(tf.keras.callbacks.Callback):
    """Print one table row per epoch: elapsed time, epoch, LR, loss and val loss.

    Reads ``logs['lr']``, ``logs['loss']`` and ``logs['val_loss']``. In this
    notebook ``'lr'`` is written into ``logs`` by LearningRateSchedulerCallback,
    so that callback has to come before this one in the callback list.
    """

    def __init__(self):
        # Initialise the base Callback so its attributes exist before Keras uses them.
        super().__init__()
        self.history = []
        # Elapsed time is measured from construction of the callback.
        self.t_0 = time.time()

    def on_epoch_end(self, epoch, logs=None):
        self.history.append({
            'time': int(time.time() - self.t_0),
            'epoch': epoch + 1,
            'lr': logs['lr'],
            'loss': logs['loss'],
            'val_loss': logs['val_loss']
        })

        # Header row, printed once before the first epoch's row.
        if epoch == 0:
            print('{:<11}{:<9}{:<8}{:<10}{:<10}'.format('Time', 'Epoch', 'LR', 'Loss', 'Val loss'))

        best_loss = min([h['val_loss'] for h in self.history])
        info = self.history[-1]
        hours = str(info['time'] // 3600).zfill(2)
        minutes = str(info['time'] // 60 % 60).zfill(2)
        seconds = str(info['time'] % 60).zfill(2)
        print('{:<11}{:<9}{:<8}{:<10}{:<10}'.format(
            '{}:{}:{}'.format(hours, minutes, seconds),
            f'{epoch + 1}',
            f"{round(info['lr'], 4):06.4f}",
            f"{round(info['loss'], 5):07.5f}",
            # A trailing '-' marks epochs that match the best validation loss so far.
            f"{round(info['val_loss'], 5):07.5f}" + ('-' if info['val_loss'] == best_loss else ''),
        ))

In [None]:
def schedule_lr(epoch):
    """Piecewise-constant learning rate as a function of the epoch index.

    1e-3 below epoch 20, 5e-4 below 30, 2.5e-4 below 40 and 1e-4 afterwards.
    """
    # (exclusive upper epoch bound, learning rate), checked in order
    steps = ((20, 1e-3), (30, 5e-4), (40, 2.5e-4))
    for upper_bound, rate in steps:
        if epoch < upper_bound:
            return rate
    return 1e-4

In [None]:
class LearningRateSchedulerCallback(tf.keras.callbacks.Callback):
    """Adjust the optimizer learning rate in 1e-4 steps based on validation loss.

    After more than two consecutive improving epochs the rate is raised by 1e-4;
    after more than two epochs without improvement it is lowered by 1e-4. The
    rate is clamped to [1e-4, 1e-3]. The rate used during the finished epoch is
    written to ``logs['lr']`` (rounded to 4 decimals) for downstream callbacks.
    """

    def __init__(self):
        # Initialise the base Callback so its attributes exist before Keras uses them.
        super().__init__()
        # Start from +inf so that any finite first validation loss counts as an improvement.
        self.best = float('inf')
        self.n_steps_since_last_best = 0
        self.n_steps_of_best = 0

    def on_epoch_end(self, epoch, logs=None):
        if logs['val_loss'] < self.best:
            self.best = logs['val_loss']
            self.n_steps_since_last_best = 0
            self.n_steps_of_best += 1
        else:
            self.n_steps_since_last_best += 1
            self.n_steps_of_best = 0
        # Keep the rate that was in effect for this epoch for logging.
        lr_backup = lr = self.model.optimizer.lr.read_value()
        lr = lr + 1e-4 if self.n_steps_of_best > 2 else lr - 1e-4 if self.n_steps_since_last_best > 2 else lr
        # Clamp to the allowed range.
        lr = 1e-3 if lr > 1e-3 else 1e-4 if lr < 1e-4 else lr
        logs['lr'] = np.round(lr_backup, 4)
        self.model.optimizer.lr.assign(lr)
In [None]:
# Stop after 10 epochs without a val_loss improvement and roll the model back to
# its best weights. All other EarlyStopping arguments are left at their defaults
# (min_delta=0, verbose=0, mode='auto', baseline=None).
early_stopping_round_callback = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss', patience=10, restore_best_weights=True)

In [None]:
%%time

# Pretraining driver: for the configured bag(s) and FOLD, train one encoder + head
# per level group, keep the trained sub-models in `models`, write the validation
# predictions into `oof_preds`, then score the fold.

# Re-seed so that this cell is repeatable on its own.
random.seed(SEED)
tf.random.set_seed(SEED)

# models[level_group]['b{bag}f{fold}'] -> {'convnet': ..., 'pretrained_head': ...}
models = {level_group: {} for level_group in LEVEL_GROUPS}
# Sessions with a fold assignment (fold > -1).
session_ids = METADATA[METADATA['fold'] > -1]['session_id'].drop_duplicates().tolist()
# Out-of-fold predictions start at 0; an exact 0 is later treated as "not predicted".
oof_preds = pd.DataFrame(index=session_ids, 
                         data=np.zeros((len(session_ids), 18)), 
                         columns=[f'q{q}' for q in range(1, 18 + 1)])
labels = Y[Y['session_id'].isin(session_ids)].reset_index(drop=True)
# NOTE(review): `logs` is never used in the visible part of this notebook.
logs = []

for b in range(N_BAGS):
#     for f in range(N_FOLDS):
    # Only the configured FOLD is trained in this run.
    for f in [FOLD]:
        print(f'> Bag {b} Fold {f}')
        print()
        
        # Each of the four objects is indexed by level group below.
        (x_train, y_train), (x_val, y_val) = pretrain_datasets[f'f{f}']
        
        for level_group in LEVEL_GROUPS:
            print(f'> Pretrain level_group {level_group}')
            print()
        
            # Validation sessions: rows of this fold where METADATA[level_group] is truthy.
            # NOTE(review): presumably a boolean availability column - confirm.
            metadata = METADATA[(METADATA['fold'] == f) & METADATA[level_group]]
            val_session_ids = metadata['session_id'].drop_duplicates().tolist()
            
            # Batched, unshuffled pipelines for training and validation.
            train_dataset = tf.data.Dataset.from_tensor_slices((x_train[level_group], y_train[level_group]))
            train_dataset = train_dataset.batch(BATCH_SIZE).prefetch(buffer_size=tf.data.AUTOTUNE)
            val_dataset = tf.data.Dataset.from_tensor_slices((x_val[level_group], y_val[level_group]))
            val_dataset = val_dataset.batch(BATCH_SIZE).prefetch(buffer_size=tf.data.AUTOTUNE)
            
            n_questions = len(get_questions(level_group))
            suffix = f'{level_group.replace("-", "_")}_b{b}f{f}'
            convnet = ConvNet(input_dims[level_group], n_questions, name=f'convnet_{suffix}', d_model=D_MODEL)
            # The pretraining head uses hidden widths [512, 128, 64] (build_head uses [512, 128, 32]).
            head = SimpleHead(n_units=[512, 128, 64], n_outputs=n_questions, name=f'pretrained_head_{suffix}')
            
            pretraining_model = PreTrainingModel(convnet, head)
            optimizer = tf.keras.optimizers.Adam(learning_rate=5e-4)
            loss = tf.keras.losses.BinaryCrossentropy()
            pretraining_model.compile(loss=loss, optimizer=optimizer)
            lr_callback = LearningRateSchedulerCallback()
            # Order matters: lr_callback writes logs['lr'], which LogCallback reads.
            # The EarlyStopping instance is shared across all fits of this cell.
            callbacks = [lr_callback, LogCallback(), early_stopping_round_callback]
            # NOTE(review): `history` is not used afterwards in the visible code.
            history = pretraining_model.fit(
                train_dataset, validation_data=val_dataset, epochs=100, verbose=False, callbacks=callbacks)
            
            # Keep the trained sub-models for the export section.
            models[level_group][f'b{b}f{f}'] = {}
            models[level_group][f'b{b}f{f}']['convnet'] = pretraining_model.convnet
            models[level_group][f'b{b}f{f}']['pretrained_head'] = pretraining_model.head
            
            preds = pretraining_model.predict(val_dataset, verbose=False)
            # NOTE(review): assumes the rows of x_val[level_group] are in the same order
            # (and number) as val_session_ids - confirm against the preprocessing step.
            for i, q in enumerate(get_questions(level_group)):
                oof_preds.loc[val_session_ids, f'q{q}'] = preds[:, i]
                
            print()
            
        # Fold-level scoring. `val_session_ids` here is the value left over from the
        # last level group of the loop above.
        y_pred = oof_preds.loc[val_session_ids]
        y_true = labels.set_index('session_id').loc[val_session_ids]
        threshold = optimize_threshold(y_pred, y_true)
        y_bin = 1 * (oof_preds.loc[val_session_ids] > threshold)
        # NOTE(review): y_true is passed in the `preds` slot and y_bin in the `labels`
        # slot of score_questions - confirm this ordering is intended.
        scores = score_questions(y_true, y_bin, range(1, 18 + 1), threshold)
        scores['thr'] = threshold
        # Per-question scores are shown with 3 decimals.
        for q in range(1, 18 + 1):
            scores[f'q{q}'] = np.round(scores[f'q{q}'], 3)
        df = pd.DataFrame([scores])
        display(df)
        print()

In [None]:
# Keep only rows without any exact 0. oof_preds is zero-initialised, so this
# presumably selects the sessions that received a prediction for all 18 questions.
p = oof_preds[(oof_preds == 0).sum(axis=1) == 0]
t = Y.set_index('session_id').loc[p.index]
# Overall score at a fixed threshold of 0.625.
# NOTE(review): the origin of 0.625 is not shown in this notebook - confirm.
print(score_questions(p, t, questions=range(1, 18 + 1), thr=0.625)['overall'])

## Export

In [None]:
VERSION

In [None]:
! ls ../models/"$VERSION"

In [None]:
import shutil

# Export directory for this VERSION. Fold 0 starts a fresh export by wiping any
# previous content; every fold then makes sure the directory exists, so that
# exporting a later fold on its own does not fail on a missing directory.
_export_dir = f'../models/{VERSION}'
if FOLD == 0:
    shutil.rmtree(_export_dir, ignore_errors=True)
os.makedirs(_export_dir, exist_ok=True)

In [None]:
# ls -lh ../models

In [None]:
%%time

tf.get_logger().setLevel('ERROR')

for i, level_group in enumerate(LEVEL_GROUPS):
    for b in range(N_BAGS):
#         for f in range(N_FOLDS):
        for f in [FOLD]:
            inputs = {}
            suffix = level_group.replace("-", "_")
            for feature in FEATURES:
                inputs[feature] = tf.keras.Input(shape=(LENGTHS[level_group]), name=f'input_{feature}_{suffix}') 
            outputs = models[level_group][f'b{b}f{f}']['convnet'](inputs)
            convnet = tf.keras.Model(inputs=inputs, outputs=outputs, name=f'convnet_{suffix}')
            convnet.save(f'../models/{VERSION}/convnet_{suffix}_b{b}f{f}.h5')

tf.get_logger().setLevel('WARNING')

In [None]:
# Persist the tokenizer map next to the exported models; the context manager
# guarantees the file is flushed and closed.
with open(f'../models/{VERSION}/tokenizer_map.pkl', 'wb') as tokenizer_file:
    pickle.dump(tokenizer_map, tokenizer_file)

In [None]:
# Persist the out-of-fold pretraining predictions; the context manager
# guarantees the file is flushed and closed.
with open(f'../models/{VERSION}/oof_preds_pretrain.pkl', 'wb') as oof_file:
    pickle.dump(oof_preds, oof_file)

In [None]:
ls ../models/"$VERSION"