## Multiclass relation classification used in tree building

1. prepare train/test sets
2. generate config files for bimpm model
3. generate training/prediction script
4. model evaluation

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import os
import glob
import pandas as pd
import pickle
from utils.file_reading import read_edus, read_gold, read_negative, read_annotation

In [None]:
def _prepare_sequence(sequence):
    """Normalise a whitespace-separated snippet token by token.

    Each token first has every character listed in the substitution table
    replaced by its target (dashes and curly quotes are unified, several
    emoji collapse onto one emoji, several Greek letters onto ``α`` and
    several Japanese characters onto ``尋``).  A token that then contains
    ``www`` or ``http`` is replaced entirely by the placeholder ``_html_``.

    The tokens are re-joined with single spaces, so any run of whitespace
    in the input collapses to one space.
    """
    substitutions = {
        'x': 'х',
        'X': 'X',
        'y': 'у',
        '—': '-',
        '“': '«',
        '‘': '«',
        '”': '»',
        '’': '»',
        '😆': '😄',
        '😊': '😄',
        '😑': '😄',
        '😔': '😄',
        '😉': '😄',
        '❗': '😄',
        '🤔': '😄',
        '😅': '😄',
        '⚓': '😄',
        'ε': 'α',
        'ζ': 'α',
        'η': 'α',
        'μ': 'α',
        'δ': 'α',
        'λ': 'α',
        'ν': 'α',
        'β': 'α',
        'γ': 'α',
        'と': '尋',
        'の': '尋',
        '神': '尋',
        '隠': '尋',
        'し': '尋',
    }
    link_markers = ('www', 'http')

    def _clean(word):
        # Character substitutions run first, in table order.
        for old_char, new_char in substitutions.items():
            word = word.replace(old_char, new_char)
        # Anything that looks like a link becomes a single placeholder token.
        if any(marker in word for marker in link_markers):
            return '_html_'
        return word

    return ' '.join(_clean(word) for word in sequence.split())

In [None]:
def correct_samples(row):
    """Move stray leading punctuation out of a pair of snippets.

    * A leading ``,`` or ``.`` on ``row.snippet_x`` is dropped and the rest
      is stripped of surrounding whitespace.
    * A leading ``,`` or ``.`` on ``row.snippet_y`` is appended to the end of
      ``row.snippet_x`` and removed from ``row.snippet_y`` (which is then
      stripped).

    Empty snippets are left as they are; indexing them with ``[0]`` used to
    raise ``IndexError``.

    Args:
        row: an object with string attributes ``snippet_x`` and ``snippet_y``
            (a DataFrame row when used through ``DataFrame.apply(axis=1)``).

    Returns:
        The same ``row`` object, modified in place.
    """
    leading_punctuation = (',', '.')

    # str.startswith is False for '', so empty snippets are skipped safely.
    if row.snippet_x.startswith(leading_punctuation):
        row.snippet_x = row.snippet_x[1:].strip()
    if row.snippet_y.startswith(leading_punctuation):
        row.snippet_x += row.snippet_y[0]
        row.snippet_y = row.snippet_y[1:].strip()
    return row

### Make a directory

In [None]:
MODEL_PATH = 'models/label_predictor_bmgf'
# Create the model directory together with any missing parents.  Unlike the
# shell `mkdir`, this is a no-op when the directory already exists, so the
# cell can be re-run safely and does not depend on the shell.
os.makedirs(MODEL_PATH, exist_ok=True)

# Tab-separated files written by the data preparation cells below.
TRAIN_FILE_PATH = os.path.join(MODEL_PATH, 'nlabel_cf_train.tsv')
DEV_FILE_PATH = os.path.join(MODEL_PATH, 'nlabel_cf_dev.tsv')
TEST_FILE_PATH = os.path.join(MODEL_PATH, 'nlabel_cf_test.tsv')

### 1. prepare train/test sets 

In [None]:
from utils.train_test_split import split_train_dev_test

# Partition the corpus under ./data into train / dev / test collections.
# Each collection is iterated below, and every entry has its '.edus' part
# stripped before being passed to read_gold.
train, dev, test = split_train_dev_test('./data')

In [None]:
# Exclusive upper bound on len(tokens_x) / len(tokens_y): pairs in which either
# side reaches this length are dropped when the sample sets are built.
MAX_LEN = 100

In [None]:
from tqdm.autonotebook import tqdm

# Column of the gold frame that holds the relation label.
TARGET = 'category_id'
# Seed for the row shuffle applied after concatenation.
random_state = 45
train_samples = []

# Collect one filtered frame of snippet pairs per training file.
for file in tqdm(train):
    gold = read_gold(file.replace('.edus', ''), features=True)
    # Drop pairs in which either side has MAX_LEN entries or more.
    gold['len_x'] = gold.tokens_x.map(len)
    gold = gold[gold.len_x < MAX_LEN]
    gold['len_y'] = gold.tokens_y.map(len)
    gold = gold[gold.len_y < MAX_LEN]
    # Join every token sequence into one space-separated string.
    gold['snippet_x'] = gold.tokens_x.map(lambda row: ' '.join(row))
    gold['snippet_y'] = gold.tokens_y.map(lambda row: ' '.join(row))
    # Tidy leading ',' / '.' characters (see correct_samples).
    gold = gold.apply(correct_samples, axis=1)
    sample = gold[[TARGET, 'snippet_x', 'snippet_y', 'order', 'filename']]
    # Discard pairs in which a snippet is at most one character long.
    sample = sample[sample.snippet_x.map(len) > 1]
    sample = sample[sample.snippet_y.map(len) > 1]
    train_samples.append(sample)

# Shuffle all rows reproducibly, renumber them 0..n-1, then expose that
# numbering as an 'index' column (it is written to the TSV as a sample id).
train_samples = pd.concat(train_samples).sample(frac=1, random_state=random_state).reset_index(drop=True)
train_samples.reset_index(level=0, inplace=True)
# A label / order equal to 0.0 is rewritten to 'same-unit_m' / 'NN'.
train_samples[TARGET] = train_samples[TARGET].replace([0.0], 'same-unit_m')
train_samples['order'] = train_samples['order'].replace([0.0], 'NN')
# Merge selected labels into coarser ones.
train_samples[TARGET] = train_samples[TARGET].replace(['antithesis_r',], 'contrast_m')
train_samples[TARGET] = train_samples[TARGET].replace(['cause_r', 'effect_r'], 'cause-effect_r')
train_samples[TARGET] = train_samples[TARGET].replace(['conclusion_r',], 'restatement_m')
train_samples[TARGET] = train_samples[TARGET].replace(['evaluation_r'], 'interpretation-evaluation_r')
train_samples[TARGET] = train_samples[TARGET].replace(['motivation_r',], 'condition_r')
# Final class name = label without its last character + order,
# e.g. 'cause-effect_r' with order 'NS' becomes 'cause-effect_NS'.
train_samples['relation'] = train_samples[TARGET].map(lambda row: row[:-1]) + train_samples['order']
# Fold selected label/order combinations into other classes.
train_samples['relation'] = train_samples['relation'].replace(['restatement_SN', 'restatement_NS'], 'restatement_NN')
train_samples['relation'] = train_samples['relation'].replace(['contrast_SN', 'contrast_NS'], 'contrast_NN')
train_samples['relation'] = train_samples['relation'].replace(['solutionhood_NS', 'preparation_NS'], 'elaboration_NS')
train_samples['relation'] = train_samples['relation'].replace(['concession_SN', 'evaluation_SN', 
                                                               'elaboration_SN', 'evidence_SN'], 'preparation_SN')
# Last expression of the cell: displays the resulting class distribution.
train_samples['relation'].value_counts()

In [None]:
import numpy as np

# Relative frequency of every relation class, in value_counts() order
# (most frequent class first by default).
counts = train_samples['relation'].value_counts(normalize=True).values
NUMBER_CLASSES = len(counts)
print("number of classes:", NUMBER_CLASSES)
print("class weights:")
# Weight = frequency of the rarest class / frequency of the class, so the
# rarest class gets 1.0 and more frequent classes get smaller weights.
np.round(counts.min() / counts, decimals=2)

In [None]:
# Preview one row in the column order that is written to the training TSV.
train_samples[['relation', 'snippet_x', 'snippet_y', 'index']].head(1)

In [None]:
# Normalise both snippets token by token (see _prepare_sequence).
train_samples['snippet_x'] = train_samples.snippet_x.map(_prepare_sequence)
train_samples['snippet_y'] = train_samples.snippet_y.map(_prepare_sequence)

In [None]:
# Write the training set as a header-less TSV: relation, snippet_x, snippet_y, index.
train_samples[['relation', 'snippet_x', 'snippet_y', 'index']].to_csv(TRAIN_FILE_PATH, sep='\t', header=False, index=False)

train_samples[['relation', 'snippet_x', 'snippet_y', 'index']].iloc[:10].to_csv(TRAIN_FILE_PATH, sep='\t', header=False, index=False)

#### Dev/test sets

In [None]:
# Seed for the row shuffle applied after concatenation.
random_state = 45
dev_samples = []

# Same per-file preparation as for the training set, applied to the dev files.
for file in tqdm(dev):
    gold = read_gold(file.replace('.edus', ''), features=True)
    # Drop pairs in which either side has MAX_LEN entries or more.
    gold['len_x'] = gold.tokens_x.map(len)
    gold = gold[gold.len_x < MAX_LEN]
    gold['len_y'] = gold.tokens_y.map(len)
    gold = gold[gold.len_y < MAX_LEN]
    # Join every token sequence into one space-separated string.
    gold['snippet_x'] = gold.tokens_x.map(lambda row: ' '.join(row))
    gold['snippet_y'] = gold.tokens_y.map(lambda row: ' '.join(row))
    # Tidy leading ',' / '.' characters (see correct_samples).
    gold = gold.apply(correct_samples, axis=1)
    # NOTE: unlike the train and test cells, 'filename' is not kept here.
    sample = gold[[TARGET, 'snippet_x', 'snippet_y', 'order']]
    # Discard pairs in which a snippet is at most one character long.
    sample = sample[sample.snippet_x.map(len) > 1]
    sample = sample[sample.snippet_y.map(len) > 1]
    dev_samples.append(sample)

# Shuffle reproducibly, renumber 0..n-1 and expose the numbering as an 'index' column.
dev_samples = pd.concat(dev_samples).sample(frac=1, random_state=random_state).reset_index(drop=True)
dev_samples.reset_index(level=0, inplace=True)
# A label / order equal to 0.0 is rewritten to 'same-unit_m' / 'NN'.
dev_samples[TARGET] = dev_samples[TARGET].replace([0.0], 'same-unit_m')
dev_samples['order'] = dev_samples['order'].replace([0.0], 'NN')
# Merge selected labels into coarser ones (same mapping as for the training set).
dev_samples[TARGET] = dev_samples[TARGET].replace(['antithesis_r',], 'contrast_m')
dev_samples[TARGET] = dev_samples[TARGET].replace(['cause_r', 'effect_r'], 'cause-effect_r')
dev_samples[TARGET] = dev_samples[TARGET].replace(['conclusion_r',], 'restatement_m')
dev_samples[TARGET] = dev_samples[TARGET].replace(['evaluation_r'], 'interpretation-evaluation_r')
dev_samples[TARGET] = dev_samples[TARGET].replace(['motivation_r',], 'condition_r')

In [None]:
# Final class name = label without its last character + order.
dev_samples['relation'] = dev_samples[TARGET].map(lambda row: row[:-1]) + dev_samples['order']
# Not the last expression of the cell, so this result is computed but not displayed.
dev_samples['relation'].value_counts()
# Fold selected label/order combinations into other classes (same mapping as for the training set).
dev_samples['relation'] = dev_samples['relation'].replace(['restatement_SN', 'restatement_NS'], 'restatement_NN')
dev_samples['relation'] = dev_samples['relation'].replace(['contrast_SN', 'contrast_NS'], 'contrast_NN')
dev_samples['relation'] = dev_samples['relation'].replace(['solutionhood_NS', 'preparation_NS'], 'elaboration_NS')
dev_samples['relation'] = dev_samples['relation'].replace(['concession_SN', 'evaluation_SN', 
                                                           'elaboration_SN', 'evidence_SN'], 'preparation_SN')
# Last expression of the cell: displays the resulting class distribution.
dev_samples['relation'].value_counts()

In [None]:
# Preview the first rows of the dev set.
dev_samples.head()

In [None]:
# Normalise both snippets token by token (see _prepare_sequence).
dev_samples['snippet_x'] = dev_samples.snippet_x.map(_prepare_sequence)
dev_samples['snippet_y'] = dev_samples.snippet_y.map(_prepare_sequence)
# Drop rows in which a snippet is an empty string after normalisation.
dev_samples = dev_samples[dev_samples.snippet_x.map(len) > 0]
dev_samples = dev_samples[dev_samples.snippet_y.map(len) > 0]

In [None]:
# Write the dev set as a header-less TSV: relation, snippet_x, snippet_y, index.
dev_samples[['relation', 'snippet_x', 'snippet_y', 'index']].to_csv(DEV_FILE_PATH, sep='\t', header=False, index=False)

In [None]:
# Seed for the row shuffle applied after concatenation.
random_state = 45
test_samples = []

# Same per-file preparation as for the training set, applied to the test files.
for file in tqdm(test):
    gold = read_gold(file.replace('.edus', ''), features=True)
    # Drop pairs in which either side has MAX_LEN entries or more.
    gold['len_x'] = gold.tokens_x.map(len)
    gold = gold[gold.len_x < MAX_LEN]
    gold['len_y'] = gold.tokens_y.map(len)
    gold = gold[gold.len_y < MAX_LEN]
    # Join every token sequence into one space-separated string.
    gold['snippet_x'] = gold.tokens_x.map(lambda row: ' '.join(row))
    gold['snippet_y'] = gold.tokens_y.map(lambda row: ' '.join(row))
    # Tidy leading ',' / '.' characters (see correct_samples).
    gold = gold.apply(correct_samples, axis=1)
    sample = gold[[TARGET, 'snippet_x', 'snippet_y', 'order', 'filename']]
    # Discard pairs in which a snippet is at most one character long.
    sample = sample[sample.snippet_x.map(len) > 1]
    sample = sample[sample.snippet_y.map(len) > 1]
    test_samples.append(sample)

# Shuffle reproducibly, renumber 0..n-1 and expose the numbering as an 'index' column.
test_samples = pd.concat(test_samples).sample(frac=1, random_state=random_state).reset_index(drop=True)
test_samples.reset_index(level=0, inplace=True)
# A label / order equal to 0.0 is rewritten to 'same-unit_m' / 'NN'.
test_samples[TARGET] = test_samples[TARGET].replace([0.0], 'same-unit_m')
test_samples['order'] = test_samples['order'].replace([0.0], 'NN')
# Merge selected labels into coarser ones (same mapping as for the training set).
test_samples[TARGET] = test_samples[TARGET].replace(['antithesis_r',], 'contrast_m')
test_samples[TARGET] = test_samples[TARGET].replace(['cause_r', 'effect_r'], 'cause-effect_r')
test_samples[TARGET] = test_samples[TARGET].replace(['conclusion_r',], 'restatement_m')
test_samples[TARGET] = test_samples[TARGET].replace(['evaluation_r'], 'interpretation-evaluation_r')
test_samples[TARGET] = test_samples[TARGET].replace(['motivation_r',], 'condition_r')
# Final class name = label without its last character + order.
test_samples['relation'] = test_samples[TARGET].map(lambda row: row[:-1]) + test_samples['order']
# Not the last expression of the cell, so this result is computed but not displayed.
test_samples['relation'].value_counts()
# Fold selected label/order combinations into other classes (same mapping as for the training set).
test_samples['relation'] = test_samples['relation'].replace(['restatement_SN', 'restatement_NS'], 'restatement_NN')
test_samples['relation'] = test_samples['relation'].replace(['contrast_SN', 'contrast_NS'], 'contrast_NN')
test_samples['relation'] = test_samples['relation'].replace(['solutionhood_NS', 'preparation_NS'], 'elaboration_NS')
test_samples['relation'] = test_samples['relation'].replace(['concession_SN', 'evaluation_SN', 
                                                           'elaboration_SN', 'evidence_SN'], 'preparation_SN')
print(test_samples['relation'].value_counts())
# Normalise both snippets, then write the header-less TSV: relation, snippet_x, snippet_y, index.
test_samples['snippet_x'] = test_samples.snippet_x.map(_prepare_sequence)
test_samples['snippet_y'] = test_samples.snippet_y.map(_prepare_sequence)
test_samples[['relation', 'snippet_x', 'snippet_y', 'index']].to_csv(TEST_FILE_PATH, sep='\t', header=False, index=False)

In [None]:
# Preview the first two test rows as a raw array.
test_samples.head(2).values

### 2. Modify model

In [None]:
%%writefile models/customization_package/model/additional_layers.py

import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
#from utils import map_activation_str_to_layer
from torch.nn import Dropout

# Large finite stand-ins for +/- infinity.  _INF is the fill value written into
# masked positions before a softmax (see masked_softmax and MultiHeadAttn).
INF = 1e12
_INF = -1e12

def map_activation_str_to_layer(act_str):
    """Return the activation callable registered under ``act_str``.

    Supported names are ``"none"`` (identity function), ``"relu"``,
    ``"tanh"``, ``"softmax"`` (over the last dimension), ``"sigmoid"``,
    ``"leaky_relu"`` and ``"prelu"``.  A fresh object is returned on every
    call.

    Args:
        act_str: name of the activation.

    Returns:
        A callable; an ``nn.Module`` for every name except ``"none"``.

    Raises:
        NotImplementedError: if ``act_str`` is not a supported name.
    """
    # Factories instead of instances: only the requested layer is built, so
    # e.g. a PReLU (which owns a learnable parameter) is not created and
    # thrown away on every call.
    _act_factories = {
        "none": lambda: (lambda x: x),
        "relu": nn.ReLU,
        "tanh": nn.Tanh,
        "softmax": lambda: nn.Softmax(dim=-1),
        "sigmoid": nn.Sigmoid,
        "leaky_relu": nn.LeakyReLU,
        "prelu": nn.PReLU,
    }
    try:
        factory = _act_factories[act_str]
    except (KeyError, TypeError):
        # KeyError: unknown name; TypeError: unhashable argument.  Catching
        # only these keeps unrelated failures (e.g. KeyboardInterrupt) visible.
        raise NotImplementedError(
            "Error: %s activation fuction is not supported now." % (act_str,)
        ) from None
    return factory()

class MLP(nn.Module):
    """Stack of linear layers with batch normalisation in between.

    With ``num_mlp_layers == 1`` the module is a single
    ``Linear(input_dim, output_dim)``.  Otherwise it is
    ``Linear(input_dim, hidden_dim)``, ``num_mlp_layers - 2`` layers of
    ``Linear(hidden_dim, hidden_dim)`` and a final
    ``Linear(hidden_dim, output_dim)``; every layer except the last is
    followed by ``BatchNorm1d(hidden_dim)`` and, if given, ``activation``.

    Args:
        input_dim: size of the input features.
        hidden_dim: size of the hidden layers (also sets the init scale).
        output_dim: size of the output features.
        num_mlp_layers: number of linear layers, at least 1.
        activation: optional callable applied after each batch norm.

    Raises:
        ValueError: if ``num_mlp_layers`` is smaller than 1.
    """

    def __init__(self, input_dim, hidden_dim, output_dim, num_mlp_layers=2, activation=None):
        super(MLP, self).__init__()

        # Fewer than one layer used to build a two-layer stack whose forward
        # pass silently skipped the first layer; reject it up front instead.
        if num_mlp_layers < 1:
            raise ValueError("num_mlp_layers must be at least 1, got %r" % (num_mlp_layers,))

        self.layers = nn.ModuleList()
        self.bns = nn.ModuleList()
        self.num_mlp_layers = num_mlp_layers
        self.activation = activation
        self.output_dim = output_dim

        if num_mlp_layers == 1:
            self.layers.append(nn.Linear(input_dim, output_dim))
        else:
            self.layers.append(nn.Linear(input_dim, hidden_dim))
            for _ in range(num_mlp_layers - 2):
                self.bns.append(nn.BatchNorm1d(hidden_dim))
                self.layers.append(nn.Linear(hidden_dim, hidden_dim))
            self.bns.append(nn.BatchNorm1d(hidden_dim))
            self.layers.append(nn.Linear(hidden_dim, output_dim))

        # init: zero biases, weights ~ N(0, 1 / hidden_dim)
        scale = 1/hidden_dim**0.5
        for layer in self.layers:
            nn.init.normal_(layer.weight, 0.0, scale)
            nn.init.constant_(layer.bias, 0.0)

    def forward(self, x):
        """Apply all layers to ``x`` and return the output of the last one."""
        # There is exactly one batch norm per non-final linear layer.
        for layer, bn in zip(self.layers[:-1], self.bns):
            x = bn(layer(x))
            # Explicit None check: truthiness of a module can depend on
            # __len__ (e.g. an empty container) and is not what is meant.
            if self.activation is not None:
                x = self.activation(x)
        return self.layers[-1](x)

    def get_output_dim(self):
        """Return the size of the output features."""
        return self.output_dim


class FullyConnectedLayer(nn.Module):
    """Two-layer MLP without activation, exposed as a single layer."""

    def __init__(self, input_dim, hidden_dim, output_dim):
        super().__init__()

        # All of the work is delegated to the wrapped MLP.
        self.mlp = MLP(
            input_dim,
            hidden_dim,
            output_dim,
            num_mlp_layers=2,
            activation=None,
        )

    def forward(self, x):
        """Run ``x`` through the wrapped MLP."""
        projected = self.mlp(x)
        return projected

    def get_output_dim(self):
        """Return the output size of the wrapped MLP."""
        wrapped = self.mlp
        return wrapped.output_dim


class Highway(nn.Module):
    """Stack of highway layers that keep the feature size unchanged.

    Every layer is one ``Linear(input_dim, 2 * input_dim)`` whose output is
    split in two halves: the first half goes through ``activation`` and is
    the candidate, the second half goes through a sigmoid and is the gate.
    The layer returns ``gate * x + (1 - gate) * candidate``.

    Gate biases are initialised to 1 and candidate biases to 0, so the gate
    initially favours carrying the input through.
    """

    def __init__(self, input_dim, num_layers=1, activation="relu"):
        super().__init__()
        self.input_dim = input_dim

        projections = []
        for _ in range(num_layers):
            projections.append(nn.Linear(input_dim, input_dim * 2))
        self.layers = nn.ModuleList(projections)
        self.activation = map_activation_str_to_layer(activation)

        # init: weights ~ N(0, 1 / input_dim); bias 0 for the candidate half
        # and 1 for the gate half
        std = 1 / input_dim ** 0.5
        for projection in self.layers:
            nn.init.normal_(projection.weight, 0.0, std)
            nn.init.constant_(projection.bias[:input_dim], 0.0)
            nn.init.constant_(projection.bias[input_dim:], 1.0)

    def forward(self, x):
        """Apply every highway layer in turn and return the result."""
        for projection in self.layers:
            candidate, gate = torch.chunk(projection(x), 2, dim=-1)
            candidate = self.activation(candidate)
            gate = F.sigmoid(gate)
            x = gate * x + (1 - gate) * candidate
        return x


def multi_perspective_match(vector1, vector2, weight):
    """Cosine-match two batches of vectors, plainly and per perspective.

    Args:
        vector1: tensor of shape ``(batch, seq_len, hidden_size)``.
        vector2: tensor of shape ``(batch, seq_len or 1, hidden_size)``.
        weight: tensor of shape ``(num_perspectives, hidden_size)``.

    Returns:
        A pair ``(similarity_single, similarity_multi)``: the unweighted
        cosine similarity of shape ``(batch, seq_len, 1)`` and the cosine
        similarity of the inputs rescaled by each row of ``weight``, of
        shape ``(batch, seq_len, num_perspectives)``.
    """
    assert vector1.size(0) == vector2.size(0)
    assert weight.size(1) == vector1.size(2)

    # (batch, seq_len) -> (batch, seq_len, 1)
    plain_cosine = F.cosine_similarity(vector1, vector2, dim=2)
    similarity_single = plain_cosine.unsqueeze(2)

    # (1, 1, num_perspectives, hidden_size)
    perspective_weight = weight[None, None]

    # (batch, seq_len, num_perspectives, hidden_size)
    weighted_1 = perspective_weight * vector1.unsqueeze(2)
    weighted_2 = perspective_weight * vector2.unsqueeze(2)

    # cosine over the hidden dimension, one value per perspective
    similarity_multi = F.cosine_similarity(weighted_1, weighted_2, dim=3)

    return similarity_single, similarity_multi


def multi_perspective_match_pairwise(vector1, vector2, weight, eps=1e-8):
    """Per-perspective cosine similarity between all position pairs.

    Args:
        vector1: tensor of shape ``(batch, seq_len1, hidden_size)``.
        vector2: tensor of shape ``(batch, seq_len2, hidden_size)``.
        weight: tensor of shape ``(num_perspectives, hidden_size)``.
        eps: lower bound applied to the product of norms before dividing.

    Returns:
        Tensor of shape ``(batch, seq_len1, seq_len2, num_perspectives)``.
    """
    num_perspectives = weight.size(0)

    # (1, num_perspectives, 1, hidden_size)
    perspective_weight = weight.unsqueeze(0).unsqueeze(2)

    def _rescale(vector):
        # (batch, num_perspectives, seq_len*, hidden_size)
        tiled = vector.unsqueeze(1).expand(-1, num_perspectives, -1, -1)
        return perspective_weight * tiled

    weighted_1 = _rescale(vector1)
    weighted_2 = _rescale(vector2)

    # (batch, num_perspectives, seq_len*, 1)
    norm_1 = weighted_1.norm(p=2, dim=3, keepdim=True)
    norm_2 = weighted_2.norm(p=2, dim=3, keepdim=True)

    # (batch, num_perspectives, seq_len1, seq_len2)
    dot_products = torch.matmul(weighted_1, weighted_2.transpose(2, 3))
    norm_products = norm_1 * norm_2.transpose(2, 3)
    cosine = dot_products / norm_products.clamp(min=eps)

    # (batch, seq_len1, seq_len2, num_perspectives)
    return cosine.permute(0, 2, 3, 1)


def masked_max(vector, mask, dim, keepdim=False, min_val=-1e7):
    """Return the maximum of ``vector`` along ``dim``, ignoring masked entries.

    Entries where ``mask == 0`` are replaced by ``min_val`` before the
    maximum is taken; with ``mask=None`` the plain maximum is returned.
    Only the values are returned, not the indices.
    """
    if mask is None:
        candidates = vector
    else:
        candidates = vector.masked_fill(mask == 0, min_val)
    return candidates.max(dim=dim, keepdim=keepdim)[0]


def masked_mean(vector, mask, dim, keepdim=False, eps=1e-8):
    """Return the mean of ``vector`` along ``dim``, ignoring masked entries.

    Entries where ``mask == 0`` contribute neither to the sum nor to the
    count.  The count is clamped to at least ``eps``, so a fully masked
    slice yields 0 instead of NaN.

    Args:
        vector: tensor to average.
        mask: tensor broadcastable to ``vector`` (non-zero = keep), or
            ``None`` to average over all entries.
        dim: dimension to reduce.
        keepdim: whether the reduced dimension is kept with size 1.
        eps: lower bound for the divisor.

    Returns:
        The (masked) mean along ``dim``.
    """
    # Without a mask there is nothing to count: fall back to the plain mean.
    # (Previously `mask.float()` was reached with mask=None and raised.)
    if mask is None:
        return vector.mean(dim=dim, keepdim=keepdim)

    replaced_vector = vector.masked_fill(mask == 0, 0.0)
    value_sum = torch.sum(replaced_vector, dim=dim, keepdim=keepdim)
    value_count = torch.sum(mask.float(), dim=dim, keepdim=keepdim)
    return value_sum / value_count.clamp(min=eps)


def masked_softmax(vector, mask, dim=-1):
    """Softmax of ``vector`` along ``dim`` with masked entries suppressed.

    With ``mask=None`` this is a plain softmax.  Otherwise the mask gets
    size-1 dimensions inserted at position 1 until it has as many dimensions
    as ``vector``, entries where ``mask == 0`` are filled with ``_INF`` and
    the softmax is taken over the filled tensor.
    """
    if mask is None:
        return F.softmax(vector, dim=dim)

    # Each unsqueeze adds one dimension, so this runs once per missing one
    # (and not at all when the mask already has enough dimensions).
    for _ in range(vector.dim() - mask.dim()):
        mask = mask.unsqueeze(1)

    filled = vector.masked_fill(mask == 0, _INF)
    return F.softmax(filled, dim=dim)


class BiMpmMatching(nn.Module):
    """Multi-perspective matching between two sequences of contextual embeddings.

    For each of the two input sequences the module produces a list of matching
    tensors: unweighted cosine statistics plus, for every enabled strategy
    (full, max-pooling, attentive, max-attentive), cosine similarities computed
    under ``num_perspectives`` learned weight vectors.

    Args:
        hidden_dim: size of the last dimension of both inputs (asserted in
            ``forward``).
        num_perspectives: number of rows of every matching weight matrix.
        share_weights_between_directions: if True, the ``*_reversed`` weights
            are the very same parameters as their forward counterparts;
            otherwise separate parameters are created.
        with_full_match, with_maxpool_match, with_attentive_match,
        with_max_attentive_match: switches for the four matching strategies.

    Raises:
        ValueError: if all four strategies are disabled.
    """

    def __init__(self,
                 hidden_dim,
                 num_perspectives,
                 share_weights_between_directions=True,
                 with_full_match=True,
                 with_maxpool_match=True,
                 with_attentive_match=True,
                 with_max_attentive_match=True):
        super(BiMpmMatching, self).__init__()

        self.hidden_dim = hidden_dim
        self.num_perspectives = num_perspectives

        self.with_full_match = with_full_match
        self.with_maxpool_match = with_maxpool_match
        self.with_attentive_match = with_attentive_match
        self.with_max_attentive_match = with_max_attentive_match

        if not (with_full_match or with_maxpool_match or with_attentive_match or with_max_attentive_match):
            raise ValueError("At least one of the matching method should be enabled")

        def create_parameter():  # utility function to create and initialize a parameter
            param = nn.Parameter(torch.zeros(num_perspectives, hidden_dim))
            nn.init.kaiming_normal_(param)
            return param

        def share_or_create(weights_to_share):  # utility function to create or share the weights
            return weights_to_share if share_weights_between_directions else create_parameter()

        output_dim = 2  # used to calculate total output dimension, 2 is for cosine max and cosine mean
        if with_full_match:
            self.full_forward_match_weights = create_parameter()
            self.full_forward_match_weights_reversed = share_or_create(self.full_forward_match_weights)
            self.full_backward_match_weights = create_parameter()
            self.full_backward_match_weights_reversed = share_or_create(self.full_backward_match_weights)
            # forward and backward, each one single + num_perspectives values
            output_dim += (num_perspectives + 1) * 2

        if with_maxpool_match:
            self.maxpool_match_weights = create_parameter()
            # max and mean, num_perspectives values each
            output_dim += num_perspectives * 2

        if with_attentive_match:
            self.attentive_match_weights = create_parameter()
            self.attentive_match_weights_reversed = share_or_create(self.attentive_match_weights)
            output_dim += num_perspectives + 1

        if with_max_attentive_match:
            self.max_attentive_match_weights = create_parameter()
            self.max_attentive_match_weights_reversed = share_or_create(self.max_attentive_match_weights)
            output_dim += num_perspectives + 1

        self.output_dim = output_dim

    def get_output_dim(self):
        """Return the summed last-dimension size of the tensors produced per sequence."""
        return self.output_dim

    def forward(self, context_1, mask_1, context_2, mask_2):
        """Match ``context_1`` against ``context_2`` and vice versa.

        Args:
            context_1: embeddings of the first sequence; per the shape comments
                below, ``(batch, seq_len1, hidden_dim)``.
            mask_1: mask of the first sequence, ``(batch, seq_len1)``;
                non-zero marks a real position. Must not require gradients.
            context_2: embeddings of the second sequence,
                ``(batch, seq_len2, hidden_dim)``.
            mask_2: mask of the second sequence, ``(batch, seq_len2)``.

        Returns:
            A pair of lists ``(matching_vector_1, matching_vector_2)``; each
            list holds the matching tensors of one sequence, in the order the
            strategies are applied below. The tensors are not concatenated here.
        """
        assert (not mask_2.requires_grad) and (not mask_1.requires_grad)
        assert context_1.size(-1) == context_2.size(-1) == self.hidden_dim

        # (batch,)
        len_1 = mask_1.sum(dim=1).long()
        len_2 = mask_2.sum(dim=1).long()

        # explicitly set masked weights to zero
        # (batch_size, seq_len*, hidden_dim)
        context_1 = context_1 * mask_1.unsqueeze(-1)
        context_2 = context_2 * mask_2.unsqueeze(-1)

        # array to keep the matching vectors for the two sentences
        matching_vector_1 = []
        matching_vector_2 = []

        # Step 0. unweighted cosine
        # First calculate the cosine similarities between each forward
        # (or backward) contextual embedding and every forward (or backward)
        # contextual embedding of the other sentence.

        # (batch, seq_len1, seq_len2)
        cosine_sim = F.cosine_similarity(context_1.unsqueeze(-2), context_2.unsqueeze(-3), dim=3)

        # (batch, seq_len*, 1)
        cosine_max_1 = masked_max(cosine_sim, mask_2.unsqueeze(-2), dim=2, keepdim=True)
        cosine_mean_1 = masked_mean(cosine_sim, mask_2.unsqueeze(-2), dim=2, keepdim=True)
        cosine_max_2 = masked_max(cosine_sim.permute(0, 2, 1), mask_1.unsqueeze(-2), dim=2, keepdim=True)
        cosine_mean_2 = masked_mean(cosine_sim.permute(0, 2, 1), mask_1.unsqueeze(-2), dim=2, keepdim=True)

        matching_vector_1.extend([cosine_max_1, cosine_mean_1])
        matching_vector_2.extend([cosine_max_2, cosine_mean_2])

        # Step 1. Full-Matching
        # Each time step of forward (or backward) contextual embedding of one sentence
        # is compared with the last time step of the forward (or backward)
        # contextual embedding of the other sentence
        if self.with_full_match:
            # (batch, 1, hidden_dim)
            # index of the last unmasked position, clamped so an empty sequence maps to 0
            last_position_1 = (len_1 - 1).clamp(min=0)
            last_position_1 = last_position_1.view(-1, 1, 1).expand(-1, 1, self.hidden_dim)
            last_position_2 = (len_2 - 1).clamp(min=0)
            last_position_2 = last_position_2.view(-1, 1, 1).expand(-1, 1, self.hidden_dim)

            # the "forward last" state is the last unmasked position,
            # the "backward last" state is position 0
            context_1_forward_last = context_1.gather(1, last_position_1)
            context_2_forward_last = context_2.gather(1, last_position_2)
            context_1_backward_last = context_1[:, 0:1, :]
            context_2_backward_last = context_2[:, 0:1, :]

            # (batch, seq_len*, num_perspectives)
            # each call returns a (single, multi) pair of similarity tensors
            matching_vector_1_forward_full = multi_perspective_match(context_1,
                                                                    context_2_forward_last,
                                                                    self.full_forward_match_weights)
            matching_vector_2_forward_full = multi_perspective_match(context_2,
                                                                    context_1_forward_last,
                                                                    self.full_forward_match_weights_reversed)
            matching_vector_1_backward_full = multi_perspective_match(context_1,
                                                                    context_2_backward_last,
                                                                    self.full_backward_match_weights)
            matching_vector_2_backward_full = multi_perspective_match(context_2,
                                                                    context_1_backward_last,
                                                                    self.full_backward_match_weights_reversed)

            # extend() unpacks each (single, multi) pair into the list
            matching_vector_1.extend(matching_vector_1_forward_full)
            matching_vector_1.extend(matching_vector_1_backward_full)
            matching_vector_2.extend(matching_vector_2_forward_full)
            matching_vector_2.extend(matching_vector_2_backward_full)

        # Step 2. Maxpooling-Matching
        # Each time step of forward (or backward) contextual embedding of one sentence
        # is compared with every time step of the forward (or backward)
        # contextual embedding of the other sentence, and only the max value of each
        # dimension is retained.
        if self.with_maxpool_match:
            # (batch, seq_len1, seq_len2, num_perspectives)
            matching_vector_max = multi_perspective_match_pairwise(context_1,
                                                                   context_2,
                                                                   self.maxpool_match_weights)

            # (batch, seq_len*, num_perspectives)
            # both the masked max and the masked mean over the other sequence are kept
            matching_vector_1_max = masked_max(matching_vector_max,
                                               mask_2.unsqueeze(-2).unsqueeze(-1),
                                               dim=2)
            matching_vector_1_mean = masked_mean(matching_vector_max,
                                                 mask_2.unsqueeze(-2).unsqueeze(-1),
                                                 dim=2)
            matching_vector_2_max = masked_max(matching_vector_max.permute(0, 2, 1, 3),
                                               mask_1.unsqueeze(-2).unsqueeze(-1),
                                               dim=2)
            matching_vector_2_mean = masked_mean(matching_vector_max.permute(0, 2, 1, 3),
                                                 mask_1.unsqueeze(-2).unsqueeze(-1),
                                                 dim=2)

            matching_vector_1.extend([matching_vector_1_max, matching_vector_1_mean])
            matching_vector_2.extend([matching_vector_2_max, matching_vector_2_mean])


        # Step 3. Attentive-Matching
        # Each forward (or backward) similarity is taken as the weight
        # of the forward (or backward) contextual embedding, and calculate an
        # attentive vector for the sentence by weighted summing all its
        # contextual embeddings.
        # Finally match each forward (or backward) contextual embedding
        # with its corresponding attentive vector.

        # att_1 / att_2 are computed unconditionally; Steps 3 and 4 both use them.
        # (batch, seq_len1, seq_len2, hidden_dim)
        att_2 = context_2.unsqueeze(-3) * cosine_sim.unsqueeze(-1)

        # (batch, seq_len1, seq_len2, hidden_dim)
        att_1 = context_1.unsqueeze(-2) * cosine_sim.unsqueeze(-1)

        if self.with_attentive_match:
            # (batch, seq_len*, hidden_dim)
            # masked_softmax is called with its default dim=-1, i.e. over hidden_dim
            att_mean_2 = masked_softmax(att_2.sum(dim=2), mask_1.unsqueeze(-1))
            att_mean_1 = masked_softmax(att_1.sum(dim=1), mask_2.unsqueeze(-1))

            # (batch, seq_len*, num_perspectives)
            matching_vector_1_att_mean = multi_perspective_match(context_1,
                                                                 att_mean_2,
                                                                 self.attentive_match_weights)
            matching_vector_2_att_mean = multi_perspective_match(context_2,
                                                                 att_mean_1,
                                                                 self.attentive_match_weights_reversed)
            matching_vector_1.extend(matching_vector_1_att_mean)
            matching_vector_2.extend(matching_vector_2_att_mean)

        # Step 4. Max-Attentive-Matching
        # Pick the contextual embeddings with the highest cosine similarity as the attentive
        # vector, and match each forward (or backward) contextual embedding with its
        # corresponding attentive vector.
        if self.with_max_attentive_match:
            # (batch, seq_len*, hidden_dim)
            att_max_2 = masked_max(att_2, mask_2.unsqueeze(-2).unsqueeze(-1), dim=2)
            att_max_1 = masked_max(att_1.permute(0, 2, 1, 3), mask_1.unsqueeze(-2).unsqueeze(-1), dim=2)

            # (batch, seq_len*, num_perspectives)
            matching_vector_1_att_max = multi_perspective_match(context_1,
                                                                att_max_2,
                                                                self.max_attentive_match_weights)
            matching_vector_2_att_max = multi_perspective_match(context_2,
                                                                att_max_1,
                                                                self.max_attentive_match_weights_reversed)

            matching_vector_1.extend(matching_vector_1_att_max)
            matching_vector_2.extend(matching_vector_2_att_max)

        return matching_vector_1, matching_vector_2


class MultiHeadAttn(nn.Module):
    """Multi-head attention with a configurable score activation.

    Queries, keys and values are projected to ``hidden_dim`` (without bias),
    split into ``num_head`` heads, combined through scaled dot-product
    scores, and projected back to ``query_dim``.

    Args:
        query_dim, key_dim, value_dim: feature sizes of the three inputs.
        hidden_dim: total projection size; must be divisible by ``num_head``.
        num_head: number of attention heads.
        dropatt: dropout probability applied to the attention weights.
        act_func: how scores become weights: ``"softmax"`` (over keys),
            ``"sigmoid"``, ``"tanh"``, ``"relu"``, ``"leaky_relu"``,
            ``"maximum"`` (uniform weight on the arg-max keys), or
            ``None`` / ``"None"`` to use the raw scores.
        add_zero_attn: append one all-zero key/value slot that is never
            masked.
        pre_lnorm: layer-normalise query, key and value before projecting.
        post_lnorm: layer-normalise the output.
    """

    def __init__(self, query_dim, key_dim, value_dim, hidden_dim, num_head,
            dropatt=0.0, 
            act_func="softmax", add_zero_attn=False, 
            pre_lnorm=False, post_lnorm=False):
        super(MultiHeadAttn, self).__init__()
        assert hidden_dim%num_head == 0

        self.query_dim = query_dim
        self.key_dim = key_dim
        self.value_dim = value_dim
        self.hidden_dim = hidden_dim
        self.num_head = num_head
        self.dropatt = nn.Dropout(dropatt)

        head_dim = hidden_dim // num_head

        self.q_net = nn.Linear(query_dim, hidden_dim, bias=False)
        self.k_net = nn.Linear(key_dim, hidden_dim, bias=False)
        self.v_net = nn.Linear(value_dim, hidden_dim, bias=False)
        self.o_net = nn.Linear(hidden_dim, query_dim, bias=False)

        # scores are multiplied by 1 / sqrt(head_dim)
        self.scale = 1 / (head_dim ** 0.5)

        self.act_func = act_func
        self.add_zero_attn = add_zero_attn
        self.pre_lnorm = pre_lnorm
        self.post_lnorm = post_lnorm

        if pre_lnorm:
            self.q_layer_norm = nn.LayerNorm(query_dim)
            self.k_layer_norm = nn.LayerNorm(key_dim)
            self.v_layer_norm = nn.LayerNorm(value_dim)
        if post_lnorm:
            self.o_layer_norm = nn.LayerNorm(query_dim)
        
        # init
        for net in [self.q_net, self.k_net, self.v_net, self.o_net]:
            nn.init.xavier_uniform_(net.weight, 1.0)
            if hasattr(net, "bias") and net.bias is not None:
                nn.init.constant_(net.bias, 0.0)

        if self.pre_lnorm:
            for layer_norm in [self.q_layer_norm, self.k_layer_norm, self.v_layer_norm]:
                if hasattr(layer_norm, "weight"):
                    nn.init.normal_(layer_norm.weight, 1.0, self.scale)
                if hasattr(layer_norm, "bias") and layer_norm.bias is not None:
                    nn.init.constant_(layer_norm.bias, 0.0)
        if self.post_lnorm:
            if hasattr(self.o_layer_norm, "weight"):
                nn.init.normal_(self.o_layer_norm.weight, 1.0, self.scale)
            if hasattr(self.o_layer_norm, "bias") and self.o_layer_norm.bias is not None:
                nn.init.constant_(self.o_layer_norm.bias, 0.0)

    def forward(self, query, key, value, attn_mask=None):
        """Attend from ``query`` over ``key`` / ``value``.

        Args:
            query: tensor of shape ``(bsz, qlen, query_dim)``.
            key: tensor of shape ``(bsz, klen, key_dim)``.
            value: tensor of shape ``(bsz, klen, value_dim)``.
            attn_mask: optional mask, either ``(bsz, klen)`` or
                ``(bsz, qlen, klen)``; positions equal to 0 are excluded.

        Returns:
            Tensor of shape ``(bsz, qlen, query_dim)``.
        """
        ##### multihead attention
        # [bsz x hlen x num_head x head_dim]
        bsz = query.size(0)

        if self.add_zero_attn:
            key = torch.cat([key, 
                torch.zeros((bsz, 1) + key.size()[2:], dtype=key.dtype, device=key.device)], dim=1)
            value = torch.cat([value, 
                torch.zeros((bsz, 1) + value.size()[2:], dtype=value.dtype, device=value.device)], dim=1)
            if attn_mask is not None:
                # Extend the key axis (always the last one) by one visible
                # slot.  Building the padding from the mask's own leading
                # dimensions makes this work for 3-D masks as well; a fixed
                # (bsz, 1) padding only fits 2-D masks.
                zero_slot = torch.ones(tuple(attn_mask.size()[:-1]) + (1,),
                    dtype=attn_mask.dtype, device=attn_mask.device)
                attn_mask = torch.cat([attn_mask, zero_slot], dim=-1)

        qlen, klen, vlen = query.size(1), key.size(1), value.size(1)

        if self.pre_lnorm:
            ##### layer normalization
            query = self.q_layer_norm(query)
            key = self.k_layer_norm(key)
            value = self.v_layer_norm(value)

        head_q = self.q_net(query).view(bsz, qlen, self.num_head, self.hidden_dim//self.num_head)
        head_k = self.k_net(key).view(bsz, klen, self.num_head, self.hidden_dim//self.num_head)
        head_v = self.v_net(value).view(bsz, vlen, self.num_head, self.hidden_dim//self.num_head)

        # [bsz x qlen x klen x num_head]
        attn_score = torch.einsum("bind,bjnd->bijn", (head_q, head_k))
        attn_score.mul_(self.scale)
        if attn_mask is not None:
            # broadcast the mask over queries (2-D case) and over heads
            if attn_mask.dim() == 2:
                attn_score.masked_fill_((attn_mask == 0).unsqueeze(1).unsqueeze(-1), _INF)
            elif attn_mask.dim() == 3:
                attn_score.masked_fill_((attn_mask == 0).unsqueeze(-1), _INF)

        # [bsz x qlen x klen x num_head]
        if self.act_func is None or self.act_func == "None":
            attn_prob = attn_score
        elif self.act_func == "softmax":
            attn_prob = F.softmax(attn_score, dim=2)
        elif self.act_func == "sigmoid":
            # torch.sigmoid / torch.tanh compute the same values as the
            # F.* variants, which warn as deprecated on older PyTorch releases
            attn_prob = torch.sigmoid(attn_score)
        elif self.act_func == "tanh":
            attn_prob = torch.tanh(attn_score)
        elif self.act_func == "relu":
            attn_prob = F.relu(attn_score)
        elif self.act_func == "leaky_relu":
            attn_prob = F.leaky_relu(attn_score)
        elif self.act_func == "maximum":
            # uniform weight over the keys that reach the maximum score
            max_score = torch.max(attn_score, dim=2, keepdim=True)[0]
            max_mask = attn_score == max_score
            cnt = torch.sum(max_mask, dim=2, keepdim=True)
            attn_prob = max_mask.float() / cnt.float()
        else:
            raise NotImplementedError
        attn_prob = self.dropatt(attn_prob)

        # [bsz x qlen x klen x num_head] x [bsz x klen x num_head x head_dim] -> [bsz x qlen x num_head x head_dim]
        attn_vec = torch.einsum("bijn,bjnd->bind", (attn_prob, head_v))
        attn_vec = attn_vec.contiguous().view(bsz, qlen, self.hidden_dim)

        ##### linear projection
        attn_out = self.o_net(attn_vec)
        
        if self.post_lnorm:
            attn_out = self.o_layer_norm(attn_out)

        return attn_out

    def get_output_dim(self):
        """Return the feature size of the output (equal to ``query_dim``)."""
        return self.query_dim


class GatedMultiHeadAttn(nn.Module):
    """Multi-head attention whose output is blended with the query by a learned gate.

    The projected attention output ``o`` and the (optionally layer-normalised)
    query ``q`` are combined as ``gate * q + (1 - gate) * o`` with
    ``gate = sigmoid(g_net([q; o]))``.

    NOTE(review): ``GatedMultiHeadAttn`` is defined more than once in this
    file; the last definition is the one bound when the module is imported.

    Args:
        query_dim: Feature size of ``query``; also the feature size of the output.
        key_dim: Feature size of ``key``.
        value_dim: Feature size of ``value``.
        hidden_dim: Total projection size, split evenly across the heads.
        num_head: Number of heads; must divide ``hidden_dim``.
        dropatt: Dropout probability applied to the attention weights.
        act_func: How scores become weights: ``"softmax"``, ``"sigmoid"``,
            ``"tanh"``, ``"relu"``, ``"leaky_relu"``, ``"maximum"`` or
            ``None``/``"None"`` (use the scaled scores as they are).
        add_zero_attn: Append one all-zero key/value position before attending.
        pre_lnorm: Layer-normalise query, key and value before projecting them.
        post_lnorm: Layer-normalise the gated output.
    """

    def __init__(self, query_dim, key_dim, value_dim, hidden_dim, num_head,
            dropatt=0.0,
            act_func="softmax", add_zero_attn=False,
            pre_lnorm=False, post_lnorm=False):
        super(GatedMultiHeadAttn, self).__init__()
        assert hidden_dim % num_head == 0, "hidden_dim must be divisible by num_head"

        self.query_dim = query_dim
        self.key_dim = key_dim
        self.value_dim = value_dim
        self.hidden_dim = hidden_dim
        self.num_head = num_head
        self.head_dim = hidden_dim // num_head
        self.dropatt = nn.Dropout(dropatt)

        self.q_net = nn.Linear(query_dim, hidden_dim, bias=False)
        self.k_net = nn.Linear(key_dim, hidden_dim, bias=False)
        self.v_net = nn.Linear(value_dim, hidden_dim, bias=False)
        self.o_net = nn.Linear(hidden_dim, query_dim, bias=False)
        # The gate reads the query and the attention output concatenated on the feature axis.
        self.g_net = nn.Linear(2*query_dim, query_dim, bias=True)

        # 1/sqrt(head_dim): multiplies the raw scores and doubles as an init std below.
        self.scale = 1 / (self.head_dim ** 0.5)

        self.act_func = act_func
        self.add_zero_attn = add_zero_attn
        self.pre_lnorm = pre_lnorm
        self.post_lnorm = post_lnorm

        if pre_lnorm:
            self.q_layer_norm = nn.LayerNorm(query_dim)
            self.k_layer_norm = nn.LayerNorm(key_dim)
            self.v_layer_norm = nn.LayerNorm(value_dim)
        if post_lnorm:
            self.o_layer_norm = nn.LayerNorm(query_dim)

        # init (order kept stable so seeded runs stay reproducible)
        for net in (self.q_net, self.k_net, self.v_net, self.o_net):
            nn.init.xavier_uniform_(net.weight, 1.0)
            if net.bias is not None:
                nn.init.constant_(net.bias, 0.0)
        # A gate bias of 1.0 tilts the initial gate towards keeping the query.
        nn.init.normal_(self.g_net.weight, 0.0, self.scale)
        if self.g_net.bias is not None:
            nn.init.constant_(self.g_net.bias, 1.0)

        layer_norms = []
        if self.pre_lnorm:
            layer_norms += [self.q_layer_norm, self.k_layer_norm, self.v_layer_norm]
        if self.post_lnorm:
            layer_norms.append(self.o_layer_norm)
        for layer_norm in layer_norms:
            if getattr(layer_norm, "weight", None) is not None:
                nn.init.normal_(layer_norm.weight, 1.0, self.scale)
            if getattr(layer_norm, "bias", None) is not None:
                nn.init.constant_(layer_norm.bias, 0.0)

    def _attention_weights(self, attn_score):
        """Turn scaled scores ``[bsz x qlen x klen x num_head]`` into weights per ``self.act_func``."""
        if self.act_func is None or self.act_func == "None":
            return attn_score
        if self.act_func == "softmax":
            # normalise over the key axis
            return F.softmax(attn_score, dim=2)
        if self.act_func == "sigmoid":
            return torch.sigmoid(attn_score)
        if self.act_func == "tanh":
            return torch.tanh(attn_score)
        if self.act_func == "relu":
            return F.relu(attn_score)
        if self.act_func == "leaky_relu":
            return F.leaky_relu(attn_score)
        if self.act_func == "maximum":
            # Uniform weight over the key position(s) holding the maximum score.
            max_score = torch.max(attn_score, dim=2, keepdim=True)[0]
            max_mask = attn_score == max_score
            cnt = torch.sum(max_mask, dim=2, keepdim=True)
            return max_mask.float() / cnt.float()
        raise NotImplementedError("unsupported act_func: %r" % (self.act_func,))

    def forward(self, query, key, value, attn_mask=None):
        """Attend from ``query`` over ``key``/``value`` and gate the result with the query.

        Args:
            query: ``[bsz x qlen x query_dim]``.
            key: ``[bsz x klen x key_dim]``.
            value: ``[bsz x klen x value_dim]``.
            attn_mask: Optional ``[bsz x klen]`` or ``[bsz x qlen x klen]`` tensor;
                positions equal to 0 are filled with the module-level ``_INF``
                constant before the activation. Masks of any other rank are ignored.

        Returns:
            Tensor of shape ``[bsz x qlen x query_dim]``.
        """
        bsz = query.size(0)

        if self.add_zero_attn:
            key = torch.cat([key, key.new_zeros((bsz, 1) + tuple(key.size()[2:]))], dim=1)
            value = torch.cat([value, value.new_zeros((bsz, 1) + tuple(value.size()[2:]))], dim=1)
            if attn_mask is not None:
                # The key axis is the LAST axis for both 2-D and 3-D masks, so
                # extend there; the extra zero position is always attendable.
                extra = attn_mask.new_ones(tuple(attn_mask.size()[:-1]) + (1,))
                attn_mask = torch.cat([attn_mask, extra], dim=-1)

        qlen, klen, vlen = query.size(1), key.size(1), value.size(1)

        if self.pre_lnorm:
            ##### layer normalization
            query = self.q_layer_norm(query)
            key = self.k_layer_norm(key)
            value = self.v_layer_norm(value)

        # [bsz x len x num_head x head_dim]
        head_q = self.q_net(query).view(bsz, qlen, self.num_head, self.head_dim)
        head_k = self.k_net(key).view(bsz, klen, self.num_head, self.head_dim)
        head_v = self.v_net(value).view(bsz, vlen, self.num_head, self.head_dim)

        # [bsz x qlen x klen x num_head]
        attn_score = torch.einsum("bind,bjnd->bijn", head_q, head_k)
        attn_score.mul_(self.scale)
        if attn_mask is not None:
            if attn_mask.dim() == 2:
                attn_score.masked_fill_((attn_mask == 0).unsqueeze(1).unsqueeze(-1), _INF)
            elif attn_mask.dim() == 3:
                attn_score.masked_fill_((attn_mask == 0).unsqueeze(-1), _INF)

        # [bsz x qlen x klen x num_head]
        attn_prob = self.dropatt(self._attention_weights(attn_score))

        # [bsz x qlen x klen x num_head] x [bsz x klen x num_head x head_dim] -> [bsz x qlen x num_head x head_dim]
        attn_vec = torch.einsum("bijn,bjnd->bind", attn_prob, head_v)
        attn_vec = attn_vec.reshape(bsz, qlen, self.hidden_dim)

        ##### linear projection
        attn_out = self.o_net(attn_vec)

        ##### gate
        gate = torch.sigmoid(self.g_net(torch.cat([query, attn_out], dim=2)))
        attn_out = gate * query + (1 - gate) * attn_out

        if self.post_lnorm:
            attn_out = self.o_layer_norm(attn_out)

        return attn_out

    def get_output_dim(self):
        """Return the feature size of ``forward``'s output (``query_dim``)."""
        return self.query_dim



class CnnHighway(nn.Module):
    """Convolve over a sequence, max-pool over positions, then apply highway layers and a projection.

    Args:
        input_dim: Channel count fed to every convolution.
        filters: Iterable of ``(kernel_width, num_channels)`` pairs, one convolution each.
        output_dim: Size produced by the projection layer.
        num_highway: Number of highway layers passed to ``Highway``.
        activation: Activation name, resolved by ``map_activation_str_to_layer``.
        projection_location: ``"after_cnn"`` projects before the highway layers,
            ``"after_highway"`` projects after them.
        layer_norm: When true, apply ``nn.LayerNorm(output_dim)`` to the result.
    """

    def __init__(self, input_dim, filters, output_dim, num_highway=1, activation="relu", projection_location="after_highway", layer_norm=False):
        super().__init__()

        assert projection_location in ["after_cnn", "after_highway"]

        self.input_dim = input_dim
        self.output_dim = output_dim
        self.projection_location = projection_location

        self.activation = map_activation_str_to_layer(activation)

        # One Conv1d per (kernel width, channel count) entry of `filters`.
        self.convs = nn.ModuleList([
            nn.Conv1d(in_channels=input_dim, out_channels=n_channels, kernel_size=kernel_width, bias=True)
            for kernel_width, n_channels in filters
        ])

        # The highway stack runs on whatever precedes it: the projected vector
        # when projecting right after the CNN, otherwise the pooled filter outputs.
        num_filters = sum(n_channels for _, n_channels in filters)
        highway_dim = output_dim if projection_location == 'after_cnn' else num_filters
        self.highways = Highway(highway_dim, num_highway, activation=activation)

        # The projection always maps num_filters -> output_dim.
        self.proj = nn.Linear(num_filters, output_dim)

        # Optional normalisation of the final vector.
        self.layer_norm = nn.LayerNorm(output_dim) if layer_norm else None

        # init
        proj_std = 1/num_filters**0.5
        for conv in self.convs:
            nn.init.kaiming_normal_(conv.weight)
            nn.init.constant_(conv.bias, 0.0)
        nn.init.normal_(self.proj.weight, 0.0, proj_std)
        nn.init.constant_(self.proj.bias, 0.0)

    def forward(self, x, mask):
        """Encode ``x`` into one vector per batch element.

        ``mask`` is accepted but not read by this method.
        """
        # Conv1d expects channels on axis 1: (batch_size, input_dim, positions)
        channels_first = x.transpose(1, 2)

        # For each convolution: max over positions, then the activation.
        pooled = [
            self.activation(torch.max(conv(channels_first), dim=-1)[0])
            for conv in self.convs
        ]

        # (batch_size, n_filters)
        features = torch.cat(pooled, dim=-1)

        if self.projection_location == 'after_cnn':
            features = self.proj(features)

        # (batch_size, highway_dim)
        features = self.highways(features)

        if self.projection_location == 'after_highway':
            # (batch_size, output_dim)
            features = self.proj(features)

        if self.layer_norm is not None:
            features = self.layer_norm(features)

        return features

    def get_output_dim(self):
        """Return ``output_dim`` as given to the constructor."""
        return self.output_dim

class GatedMultiHeadAttn(nn.Module):
    """Multi-head attention whose output is blended with the query by a learned gate.

    The projected attention output ``o`` and the (optionally layer-normalised)
    query ``q`` are combined as ``gate * q + (1 - gate) * o`` with
    ``gate = sigmoid(g_net([q; o]))``.

    NOTE(review): ``GatedMultiHeadAttn`` is defined more than once in this
    file; the last definition is the one bound when the module is imported.

    Args:
        query_dim: Feature size of ``query``; also the feature size of the output.
        key_dim: Feature size of ``key``.
        value_dim: Feature size of ``value``.
        hidden_dim: Total projection size, split evenly across the heads.
        num_head: Number of heads; must divide ``hidden_dim``.
        dropatt: Dropout probability applied to the attention weights.
        act_func: How scores become weights: ``"softmax"``, ``"sigmoid"``,
            ``"tanh"``, ``"relu"``, ``"leaky_relu"``, ``"maximum"`` or
            ``None``/``"None"`` (use the scaled scores as they are).
        add_zero_attn: Append one all-zero key/value position before attending.
        pre_lnorm: Layer-normalise query, key and value before projecting them.
        post_lnorm: Layer-normalise the gated output.
    """

    def __init__(self, query_dim, key_dim, value_dim, hidden_dim, num_head,
            dropatt=0.0,
            act_func="softmax", add_zero_attn=False,
            pre_lnorm=False, post_lnorm=False):
        super(GatedMultiHeadAttn, self).__init__()
        assert hidden_dim % num_head == 0, "hidden_dim must be divisible by num_head"

        self.query_dim = query_dim
        self.key_dim = key_dim
        self.value_dim = value_dim
        self.hidden_dim = hidden_dim
        self.num_head = num_head
        self.head_dim = hidden_dim // num_head
        self.dropatt = nn.Dropout(dropatt)

        self.q_net = nn.Linear(query_dim, hidden_dim, bias=False)
        self.k_net = nn.Linear(key_dim, hidden_dim, bias=False)
        self.v_net = nn.Linear(value_dim, hidden_dim, bias=False)
        self.o_net = nn.Linear(hidden_dim, query_dim, bias=False)
        # The gate reads the query and the attention output concatenated on the feature axis.
        self.g_net = nn.Linear(2*query_dim, query_dim, bias=True)

        # 1/sqrt(head_dim): multiplies the raw scores and doubles as an init std below.
        self.scale = 1 / (self.head_dim ** 0.5)

        self.act_func = act_func
        self.add_zero_attn = add_zero_attn
        self.pre_lnorm = pre_lnorm
        self.post_lnorm = post_lnorm

        if pre_lnorm:
            self.q_layer_norm = nn.LayerNorm(query_dim)
            self.k_layer_norm = nn.LayerNorm(key_dim)
            self.v_layer_norm = nn.LayerNorm(value_dim)
        if post_lnorm:
            self.o_layer_norm = nn.LayerNorm(query_dim)

        # init (order kept stable so seeded runs stay reproducible)
        for net in (self.q_net, self.k_net, self.v_net, self.o_net):
            nn.init.xavier_uniform_(net.weight, 1.0)
            if net.bias is not None:
                nn.init.constant_(net.bias, 0.0)
        # A gate bias of 1.0 tilts the initial gate towards keeping the query.
        nn.init.normal_(self.g_net.weight, 0.0, self.scale)
        if self.g_net.bias is not None:
            nn.init.constant_(self.g_net.bias, 1.0)

        layer_norms = []
        if self.pre_lnorm:
            layer_norms += [self.q_layer_norm, self.k_layer_norm, self.v_layer_norm]
        if self.post_lnorm:
            layer_norms.append(self.o_layer_norm)
        for layer_norm in layer_norms:
            if getattr(layer_norm, "weight", None) is not None:
                nn.init.normal_(layer_norm.weight, 1.0, self.scale)
            if getattr(layer_norm, "bias", None) is not None:
                nn.init.constant_(layer_norm.bias, 0.0)

    def _attention_weights(self, attn_score):
        """Turn scaled scores ``[bsz x qlen x klen x num_head]`` into weights per ``self.act_func``."""
        if self.act_func is None or self.act_func == "None":
            return attn_score
        if self.act_func == "softmax":
            # normalise over the key axis
            return F.softmax(attn_score, dim=2)
        if self.act_func == "sigmoid":
            return torch.sigmoid(attn_score)
        if self.act_func == "tanh":
            return torch.tanh(attn_score)
        if self.act_func == "relu":
            return F.relu(attn_score)
        if self.act_func == "leaky_relu":
            return F.leaky_relu(attn_score)
        if self.act_func == "maximum":
            # Uniform weight over the key position(s) holding the maximum score.
            max_score = torch.max(attn_score, dim=2, keepdim=True)[0]
            max_mask = attn_score == max_score
            cnt = torch.sum(max_mask, dim=2, keepdim=True)
            return max_mask.float() / cnt.float()
        raise NotImplementedError("unsupported act_func: %r" % (self.act_func,))

    def forward(self, query, key, value, attn_mask=None):
        """Attend from ``query`` over ``key``/``value`` and gate the result with the query.

        Args:
            query: ``[bsz x qlen x query_dim]``.
            key: ``[bsz x klen x key_dim]``.
            value: ``[bsz x klen x value_dim]``.
            attn_mask: Optional ``[bsz x klen]`` or ``[bsz x qlen x klen]`` tensor;
                positions equal to 0 are filled with the module-level ``_INF``
                constant before the activation. Masks of any other rank are ignored.

        Returns:
            Tensor of shape ``[bsz x qlen x query_dim]``.
        """
        bsz = query.size(0)

        if self.add_zero_attn:
            key = torch.cat([key, key.new_zeros((bsz, 1) + tuple(key.size()[2:]))], dim=1)
            value = torch.cat([value, value.new_zeros((bsz, 1) + tuple(value.size()[2:]))], dim=1)
            if attn_mask is not None:
                # The key axis is the LAST axis for both 2-D and 3-D masks, so
                # extend there; the extra zero position is always attendable.
                extra = attn_mask.new_ones(tuple(attn_mask.size()[:-1]) + (1,))
                attn_mask = torch.cat([attn_mask, extra], dim=-1)

        qlen, klen, vlen = query.size(1), key.size(1), value.size(1)

        if self.pre_lnorm:
            ##### layer normalization
            query = self.q_layer_norm(query)
            key = self.k_layer_norm(key)
            value = self.v_layer_norm(value)

        # [bsz x len x num_head x head_dim]
        head_q = self.q_net(query).view(bsz, qlen, self.num_head, self.head_dim)
        head_k = self.k_net(key).view(bsz, klen, self.num_head, self.head_dim)
        head_v = self.v_net(value).view(bsz, vlen, self.num_head, self.head_dim)

        # [bsz x qlen x klen x num_head]
        attn_score = torch.einsum("bind,bjnd->bijn", head_q, head_k)
        attn_score.mul_(self.scale)
        if attn_mask is not None:
            if attn_mask.dim() == 2:
                attn_score.masked_fill_((attn_mask == 0).unsqueeze(1).unsqueeze(-1), _INF)
            elif attn_mask.dim() == 3:
                attn_score.masked_fill_((attn_mask == 0).unsqueeze(-1), _INF)

        # [bsz x qlen x klen x num_head]
        attn_prob = self.dropatt(self._attention_weights(attn_score))

        # [bsz x qlen x klen x num_head] x [bsz x klen x num_head x head_dim] -> [bsz x qlen x num_head x head_dim]
        attn_vec = torch.einsum("bijn,bjnd->bind", attn_prob, head_v)
        attn_vec = attn_vec.reshape(bsz, qlen, self.hidden_dim)

        ##### linear projection
        attn_out = self.o_net(attn_vec)

        ##### gate
        gate = torch.sigmoid(self.g_net(torch.cat([query, attn_out], dim=2)))
        attn_out = gate * query + (1 - gate) * attn_out

        if self.post_lnorm:
            attn_out = self.o_layer_norm(attn_out)

        return attn_out

    def get_output_dim(self):
        """Return the feature size of ``forward``'s output (``query_dim``)."""
        return self.query_dim


In [None]:
%%writefile models/customization_package/model/multiclass_bmgf.py

"""
BMGF (Bilateral Matching and Gated Fusion) model implementation.
"""

from typing import Dict, Optional, List, Any

from overrides import overrides
import torch
import numpy

from allennlp.common.checks import check_dimensions_match
from allennlp.data import Vocabulary
from allennlp.modules import FeedForward, Seq2SeqEncoder, Seq2VecEncoder, TextFieldEmbedder
from allennlp.models.model import Model
from allennlp.nn import InitializerApplicator, RegularizerApplicator
from allennlp.nn import util
from allennlp.training.metrics import CategoricalAccuracy, F1Measure

from customization_package.model.additional_layers import *
from allennlp.modules.bimpm_matching import BiMpmMatching

from allennlp.nn.util import get_text_field_mask
import torch.nn.functional as F


@Model.register("multiclass_bmgf")
class BMGFModel(Model):
    """
    Relation classifier for a pair of text spans, registered as ``"multiclass_bmgf"``.

    This ``Model`` mimics the BMGF model described in `On the Importance of Word and Sentence Representation
    Learning in Implicit Discourse Relation Classification <https://arxiv.org/pdf/2004.12617v2.pdf>`_
    by Xin Liu et al., 2020, implemented in `BMGF-RoBERTa <https://github.com/HKUST-KnowComp/BMGF-RoBERTa>`_.

    ``forward`` embeds both spans, encodes them with two stacked encoders, collects
    ``BiMpmMatching`` vectors at the word level and at both encoder levels, concatenates
    those vectors to the token embeddings, and passes each span through a shared gated
    self-attention layer and a CNN/highway layer. The two resulting vectors are
    concatenated and classified by ``classifier_feedforward``.

    Parameters
    ----------
    vocab : ``Vocabulary``
    text_field_embedder : ``TextFieldEmbedder``
        Embeds the tokens of both spans.
    matcher_word : ``BiMpmMatching``
        Matching layer applied to the embeddings.
    encoder1, encoder2 : ``Seq2SeqEncoder``
        Stacked encoders; the first/second half of each output is fed to the
        forward/backward matcher of that level.
    matcher_forward1, matcher_backward1, matcher_forward2, matcher_backward2 : ``BiMpmMatching``
        Matching layers for the two halves of each encoder output.
    aggregator : ``Seq2VecEncoder``
        Stored on the model so existing configs keep loading; ``forward`` does not
        use it to produce the logits.
    classifier_feedforward : ``FeedForward``
        Receives the two ``hidden_dim``-sized span vectors concatenated.
    dropout : ``float``
        Dropout probability used between layers and inside the attention layer.
    hidden_dim : ``int``
        Hidden size of the gated attention and output size of the CNN/highway layer.
    num_filters : ``int``
        Number of width-1 convolution filters.
    num_perspectives : ``int``
        Number of heads of the gated attention layer.
    class_weights : ``list``, optional
        Per-class weights for the cross-entropy loss; all ones when omitted or empty.
    encode_together : ``bool``
        Embed premise and hypothesis as one concatenated sequence instead of separately.
    initializer : ``InitializerApplicator``
    regularizer : ``RegularizerApplicator``, optional
    """
    def __init__(self, vocab: Vocabulary,
                 text_field_embedder: TextFieldEmbedder,
                 matcher_word: BiMpmMatching,
                 encoder1: Seq2SeqEncoder,
                 matcher_forward1: BiMpmMatching,
                 matcher_backward1: BiMpmMatching,
                 encoder2: Seq2SeqEncoder,
                 matcher_forward2: BiMpmMatching,
                 matcher_backward2: BiMpmMatching,
                 aggregator: Seq2VecEncoder,
                 classifier_feedforward: FeedForward,
                 dropout: float = 0.1,
                 hidden_dim: int = 128,
                 num_filters: int = 64,
                 num_perspectives: int = 16,
                 class_weights: Optional[list] = None,
                 encode_together: bool = True,
                 initializer: InitializerApplicator = InitializerApplicator(),
                 regularizer: Optional[RegularizerApplicator] = None) -> None:
        super(BMGFModel, self).__init__(vocab, regularizer)

        self.text_field_embedder = text_field_embedder
        self.encode_together = encode_together

        self.matcher_word = matcher_word

        self.encoder1 = encoder1
        self.matcher_forward1 = matcher_forward1
        self.matcher_backward1 = matcher_backward1

        self.encoder2 = encoder2
        self.matcher_forward2 = matcher_forward2
        self.matcher_backward2 = matcher_backward2

        self.aggregator = aggregator

        # The attention layer consumes the token embeddings concatenated with the
        # matching vectors of ALL five matchers (see ``forward``), so its input size
        # is derived from those components instead of being hard-coded.
        matchers = (self.matcher_word,
                    self.matcher_forward1, self.matcher_backward1,
                    self.matcher_forward2, self.matcher_backward2)
        matching_dim = sum(matcher.get_output_dim() for matcher in matchers)
        output_dim = self.text_field_embedder.get_output_dim() + matching_dim

        self.gated_attn_layer = GatedMultiHeadAttn(
            query_dim=output_dim,
            key_dim=output_dim,
            value_dim=output_dim,
            hidden_dim=hidden_dim,
            num_head=num_perspectives,
            dropatt=dropout,
            act_func="softmax",
            add_zero_attn=False,
            pre_lnorm=False,
            post_lnorm=False)

        self.conv_layer = CnnHighway(
            input_dim=self.gated_attn_layer.get_output_dim(),
            output_dim=hidden_dim,
            filters=[(1, num_filters)], # , (2, num_filters)the shortest length is 2
            num_highway=1,
            activation="leaky_relu",
            layer_norm=False)

        self.classifier_feedforward = classifier_feedforward

        self.dropout = torch.nn.Dropout(dropout)

        self.num_perspectives = num_perspectives

        if class_weights:
            self.class_weights = class_weights
        else:
            self.class_weights = [1.] * self.classifier_feedforward.get_output_dim()

        # F1 is tracked separately for label indices 0, 1 and 2.
        self.metrics = {"accuracy": CategoricalAccuracy(),
                        "f1_rel0": F1Measure(0),
                        "f1_rel1": F1Measure(1),
                        "f1_rel2": F1Measure(2)}

        self.loss = torch.nn.CrossEntropyLoss(weight=torch.FloatTensor(self.class_weights))

        initializer(self)

    def _embed_jointly(self, premise, hypothesis):
        """Embed premise and hypothesis as one sequence and split the result back.

        Every indexer tensor of the premise is concatenated with the matching
        hypothesis tensor along the token axis (dim 1) before embedding.
        NOTE(review): ``torch.cat`` needs all other axes to agree between the two
        fields (e.g. the character axis of a character indexer) -- confirm the
        dataset reader pads them to a common size.
        """
        joined_pair: Dict[str, torch.LongTensor] = {
            key: torch.cat([premise[key], hypothesis[key]], dim=1) for key in premise
        }
        embedded = self.dropout(self.text_field_embedder(joined_pair))

        any_key = next(iter(premise))
        premise_len = premise[any_key].size(1)
        hypothesis_len = hypothesis[any_key].size(1)
        return (embedded[:, :premise_len],
                embedded[:, premise_len:premise_len + hypothesis_len])

    @overrides
    def forward(self,  # type: ignore
                premise: Dict[str, torch.LongTensor],
                hypothesis: Dict[str, torch.LongTensor],
                label: torch.LongTensor = None,
                metadata: List[Dict[str, Any]] = None  # pylint:disable=unused-argument
               ) -> Dict[str, torch.Tensor]:
        # pylint: disable=arguments-differ
        """
        Parameters
        ----------
        premise : Dict[str, torch.LongTensor]
            The premise from a ``TextField``
        hypothesis : Dict[str, torch.LongTensor]
            The hypothesis from a ``TextField``
        label : torch.LongTensor, optional (default = None)
            The label for the pair of the premise and the hypothesis
        metadata : ``List[Dict[str, Any]]``, optional, (default = None)
            Additional information about the pair (not used)
        Returns
        -------
        An output dictionary consisting of:
        logits : torch.FloatTensor
            A tensor of shape ``(batch_size, num_labels)`` with the unnormalised
            scores of every label.
        probs : torch.FloatTensor
            Softmax of ``logits`` over the label axis.
        loss : torch.FloatTensor, optional
            A scalar loss to be optimised; present only when ``label`` is given.
        """
        mask_premise = util.get_text_field_mask(premise)
        mask_hypothesis = util.get_text_field_mask(hypothesis)

        if self.encode_together:
            embedded_premise, embedded_hypothesis = self._embed_jointly(premise, hypothesis)
        else:
            embedded_premise = self.dropout(self.text_field_embedder(premise))
            embedded_hypothesis = self.dropout(self.text_field_embedder(hypothesis))

        # embedding and encoding of the premise
        encoded_premise1 = self.dropout(self.encoder1(embedded_premise, mask_premise))
        encoded_premise2 = self.dropout(self.encoder2(encoded_premise1, mask_premise))

        # embedding and encoding of the hypothesis
        encoded_hypothesis1 = self.dropout(self.encoder1(embedded_hypothesis, mask_hypothesis))
        encoded_hypothesis2 = self.dropout(self.encoder2(encoded_hypothesis1, mask_hypothesis))

        matching_vector_premise: List[torch.Tensor] = []
        matching_vector_hypothesis: List[torch.Tensor] = []

        def add_matching_result(matcher, encoded_premise, encoded_hypothesis):
            # utility function to get matching result and add to the result list
            matching_result = matcher(encoded_premise, mask_premise, encoded_hypothesis, mask_hypothesis)
            matching_vector_premise.extend(matching_result[0])
            matching_vector_hypothesis.extend(matching_result[1])

        # calculate matching vectors from word embedding, first layer encoding, and second layer encoding
        add_matching_result(self.matcher_word, embedded_premise, embedded_hypothesis)
        half_hidden_size_1 = self.encoder1.get_output_dim() // 2
        add_matching_result(self.matcher_forward1,
                            encoded_premise1[:, :, :half_hidden_size_1],
                            encoded_hypothesis1[:, :, :half_hidden_size_1])
        add_matching_result(self.matcher_backward1,
                            encoded_premise1[:, :, half_hidden_size_1:],
                            encoded_hypothesis1[:, :, half_hidden_size_1:])

        half_hidden_size_2 = self.encoder2.get_output_dim() // 2
        add_matching_result(self.matcher_forward2,
                            encoded_premise2[:, :, :half_hidden_size_2],
                            encoded_hypothesis2[:, :, :half_hidden_size_2])
        add_matching_result(self.matcher_backward2,
                            encoded_premise2[:, :, half_hidden_size_2:],
                            encoded_hypothesis2[:, :, half_hidden_size_2:])

        # concat the matching vectors
        matching_vector_cat_premise = self.dropout(torch.cat(matching_vector_premise, dim=2))
        matching_vector_cat_hypothesis = self.dropout(torch.cat(matching_vector_hypothesis, dim=2))

        # token embeddings + matching vectors -> gated self-attention (same layer for both spans)
        arg1_self_attned_feats = torch.cat([embedded_premise, matching_vector_cat_premise], dim=2)
        arg2_self_attned_feats = torch.cat([embedded_hypothesis, matching_vector_cat_hypothesis], dim=2)
        arg1_self_attned_feats = self.dropout(self.gated_attn_layer(
            arg1_self_attned_feats, arg1_self_attned_feats, arg1_self_attned_feats, attn_mask=mask_premise))
        arg2_self_attned_feats = self.dropout(self.gated_attn_layer(
            arg2_self_attned_feats, arg2_self_attned_feats, arg2_self_attned_feats, attn_mask=mask_hypothesis))

        # one fixed-size vector per span
        arg1_conv = self.dropout(self.conv_layer(arg1_self_attned_feats, mask_premise))
        arg2_conv = self.dropout(self.conv_layer(arg2_self_attned_feats, mask_hypothesis))

        # the final forward layer
        logits = self.classifier_feedforward(torch.cat([arg1_conv, arg2_conv], dim=1))
        probs = torch.nn.functional.softmax(logits, dim=-1)

        output_dict = {'logits': logits, "probs": probs}

        if label is not None:
            loss = self.loss(logits, label)
            for metric in self.metrics.values():
                metric(logits, label)
            output_dict["loss"] = loss

        return output_dict

    @overrides
    def decode(self, output_dict: Dict[str, torch.Tensor]) -> Dict[str, torch.Tensor]:
        """
        Converts indices to string labels, and adds a ``"label"`` key to the result.
        """
        predictions = output_dict["probs"].cpu().data.numpy()
        argmax_indices = numpy.argmax(predictions, axis=-1)
        labels = [self.vocab.get_token_from_index(x, namespace="labels")
                  for x in argmax_indices]
        output_dict['label'] = labels
        return output_dict

    @overrides
    def get_metrics(self, reset: bool = False) -> Dict[str, float]:
        """
        Returns accuracy, the F1 of label indices 0-2 and their mean (``"f1_top3"``).
        """
        # Element [2] of each F1Measure result is taken as the F1 value
        # (presumably a (precision, recall, f1) tuple -- TODO confirm for the installed AllenNLP).
        metrics = {name: self.metrics[name].get_metric(reset=reset)[2]
                   for name in ("f1_rel0", "f1_rel1", "f1_rel2")}
        metrics["accuracy"] = self.metrics["accuracy"].get_metric(reset=reset)
        metrics["f1_top3"] = numpy.mean([metrics["f1_rel0"], metrics["f1_rel1"], metrics["f1_rel2"]])
        return metrics


### 2. Generate config files

In [None]:
%%writefile $MODEL_PATH/config_elmo.json

// Configuration for the "multiclass_bmgf" relation classifier. Its matching layers follow:
//   Wang, Zhiguo, Wael Hamza, and Radu Florian. "Bilateral multi-perspective matching for natural language sentences."
//   Proceedings of the 26th International Joint Conference on Artificial Intelligence. 2017.

{
  "dataset_reader": {
    "type": "quora_paraphrase",
    "lazy": false,
    "tokenizer": {
      "type": "word",
      "word_splitter": {
        "type": "just_spaces"
      }
    },
    "token_indexers": {
      "token_characters": {
        "type": "characters",
        "min_padding_length": 50
      },
      "elmo": {
        "type": "elmo_characters"
     }
    }
  },
  "train_data_path": "label_predictor_bmgf/nlabel_cf_train.tsv",
  "validation_data_path": "label_predictor_bmgf/nlabel_cf_dev.tsv",
  "test_data_path": "label_predictor_bmgf/nlabel_cf_test.tsv",
  "model": {
    "type": "multiclass_bmgf",
    "dropout": 0.5,
    "class_weights": [
        0.03, 0.03, 0.07, 0.11, 0.12, 0.13, 0.13, 0.15, 0.17, 0.17, 0.18,
        0.19, 0.23, 0.23, 0.33, 0.34, 0.35, 0.52, 0.57, 0.7 , 0.85, 0.87,
        1.0  ],
    "encode_together": true,
    "text_field_embedder": {
        "token_embedders": {
            "elmo": {
                    "type": "elmo_token_embedder",
                    "options_file": "rsv_elmo/options.json",
                    "weight_file": "rsv_elmo/model.hdf5",
                    "do_layer_norm": false,
                    "dropout": 0.1
            },
            "token_characters": {
                "type": "character_encoding",
                "embedding": {
                    "embedding_dim": 20,
                    "padding_index": 0,
                },
                "encoder": {
                    "type": "gru",
                    "input_size": 20,
                    "hidden_size": 50,
                    "num_layers": 1,
                    "bidirectional": true,
                    "dropout": 0.1,
                },
            },
      }
    },
    "matcher_word": {
      "is_forward": true,
      "hidden_dim": 1024+100,
      "num_perspectives": 10,
      "with_full_match": true
    },
    "encoder1": {
      "type": "lstm",
      "bidirectional": true,
      "input_size": 1024+100,
      "hidden_size": 200,
      "num_layers": 1
    },
    "matcher_forward1": {
      "is_forward": true,
      "hidden_dim": 200,
      "num_perspectives": 10
    },
    "matcher_backward1": {
      "is_forward": false,
      "hidden_dim": 200,
      "num_perspectives": 10
    },
    "encoder2": {
      "type": "lstm",
      "bidirectional": true,
      "input_size": 400,
      "hidden_size": 200,
      "num_layers": 1
    },
    "matcher_forward2": {
      "is_forward": true,
      "hidden_dim": 200,
      "num_perspectives": 10
    },
    "matcher_backward2": {
      "is_forward": false,
      "hidden_dim": 200,
      "num_perspectives": 10
    },
    "aggregator":{
      "type": "lstm",
      "bidirectional": true,
      "input_size": 275,
      "hidden_size": 100,
      "num_layers": 1,
    },
    "classifier_feedforward": {
      "input_dim": 256,
      "num_layers": 1,
      "hidden_dims": [23,],
      "activations": ["relu"],
      "dropout": [0.0]
    },
    "initializer": [
      [".*linear_layers.*weight", {"type": "xavier_normal"}],
      [".*linear_layers.*bias", {"type": "constant", "val": 0}],
      [".*weight_ih.*", {"type": "xavier_normal"}],
      [".*weight_hh.*", {"type": "orthogonal"}],
      [".*bias.*", {"type": "constant", "val": 0}],
      [".*matcher.*match_weights.*", {"type": "kaiming_normal"}]
    ],
  },
  "iterator": {
    "type": "bucket",
    "padding_noise": 0,
    "sorting_keys": [["premise", "num_tokens"], ["hypothesis", "num_tokens"]],
    "batch_size": 20,
  },
  "trainer": {
    "num_epochs": 100,
    "patience": 10,
    "cuda_device": 1,
    "grad_norm": 2.0,
    "validation_metric": "+accuracy",
    "optimizer": {
      "type": "adam",
      "lr": 0.0001,
      "weight_decay": 0.0005
    }
  }
}

### 3. Script for training/prediction 

In [None]:
%%writefile models/train_label_predictor_bmgf.sh
# usage:
# $ cd models 
# $ sh train_label_predictor_bmgf.sh {bert|elmo} result_000
#
# Trains the model described by label_predictor_bmgf/config_<METHOD>.json into
# label_predictor_bmgf/<RESULT_DIR>/ and then writes predictions for the dev
# and test files next to the trained model.

# Stop at the first failing command and treat unset variables as errors.
set -eu

# Both arguments are mandatory: with an empty RESULT_DIR the cleanup below
# would remove the whole label_predictor_bmgf/ directory, data included.
export METHOD="${1:?usage: sh train_label_predictor_bmgf.sh METHOD RESULT_DIR}"
export RESULT_DIR="${2:?usage: sh train_label_predictor_bmgf.sh METHOD RESULT_DIR}"
export DEV_FILE_PATH="nlabel_cf_dev.tsv"
export TEST_FILE_PATH="nlabel_cf_test.tsv"

MODEL_DIR="label_predictor_bmgf"
OUT_DIR="${MODEL_DIR}/${RESULT_DIR}"

# predict_split NAME FILE: run the trained model on MODEL_DIR/FILE and write
# OUT_DIR/predictions_NAME.json.
predict_split() {
    allennlp predict --use-dataset-reader --silent \
        --output-file "${OUT_DIR}/predictions_${1}.json" "${OUT_DIR}/model.tar.gz" "${MODEL_DIR}/${2}" \
        --include-package customization_package \
        --predictor textual-entailment
}

# Start from a clean output directory (-f: a missing directory is not an error).
rm -rf -- "${OUT_DIR}/"
allennlp train -s "${OUT_DIR}/" "${MODEL_DIR}/config_${METHOD}.json" \
    --include-package customization_package
predict_split dev "${DEV_FILE_PATH}"
predict_split test "${TEST_FILE_PATH}"

In [None]:
%%writefile models/eval_label_predictor_bmgf.sh
# usage:
# $ cd models 
# $ sh eval_label_predictor_bmgf.sh {bert|elmo} result_000
#
# Re-runs prediction with the model already stored in
# label_predictor_bmgf/<RESULT_DIR>/model.tar.gz on the dev and test files.

# Stop at the first failing command and treat unset variables as errors.
set -eu

# METHOD is exported as before, but nothing in this script reads it.
export METHOD="${1:-}"
# Without RESULT_DIR the paths below would point at the wrong directory.
export RESULT_DIR="${2:?usage: sh eval_label_predictor_bmgf.sh METHOD RESULT_DIR}"
export DEV_FILE_PATH="nlabel_cf_dev.tsv"
export TEST_FILE_PATH="nlabel_cf_test.tsv"

MODEL_DIR="label_predictor_bmgf"
OUT_DIR="${MODEL_DIR}/${RESULT_DIR}"

# predict_split NAME FILE: run the trained model on MODEL_DIR/FILE and write
# OUT_DIR/predictions_NAME.json.
predict_split() {
    allennlp predict --use-dataset-reader --silent \
        --output-file "${OUT_DIR}/predictions_${1}.json" "${OUT_DIR}/model.tar.gz" "${MODEL_DIR}/${2}" \
        --include-package customization_package \
        --predictor textual-entailment
}

predict_split dev "${DEV_FILE_PATH}"
predict_split test "${TEST_FILE_PATH}"

### 4. Evaluate classifier

In [None]:
def load_predictions(path):
    """Read the predicted labels from a JSON-lines predictions file.

    Each non-blank line of the file must be a JSON object with a ``"label"``
    key; the values of that key are collected in file order.

    Args:
        path: Path to the predictions file (one JSON object per line).

    Returns:
        List with the ``"label"`` value of every record.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record has no ``"label"`` key.
    """
    # Local import: the notebook imports json only in a later cell.
    import json

    with open(path, 'r', encoding='utf-8') as file:
        # Stream the file line by line; blank lines (e.g. a trailing newline)
        # are skipped because json.loads would reject them.
        result = [json.loads(line)["label"] for line in file if line.strip()]

    print('length of result:', len(result))
    return result

In [None]:
# Run directory under MODEL_PATH whose prediction files are evaluated below.
RESULT_DIR = 'result_100'

On dev set

In [None]:
import json
import pandas as pd

# Gold labels: first column of the headerless, tab-separated dev file.
_dev_table = pd.read_csv(DEV_FILE_PATH, sep='\t', header=None)
true = _dev_table[0].values.tolist()

# Predicted labels written for the dev set in the selected run directory.
_dev_predictions_path = f'{MODEL_PATH}/{RESULT_DIR}/predictions_dev.json'
pred = load_predictions(_dev_predictions_path)

In [None]:
from sklearn.metrics import classification_report

# Gold labels are cut to the number of predictions so both sequences have equal length.
_dev_report = classification_report(true[:len(pred)], pred, digits=4)
print(_dev_report)

In [None]:
# Number of gold dev labels; compare with the 'length of result' printed by load_predictions.
len(true)

In [None]:
from sklearn.metrics import f1_score, precision_score, recall_score

# Macro-averaged F1 / precision / recall on the dev set, printed as percentages.
_dev_gold = true[:len(pred)]
for _fmt, _scorer in (
    ('f1: %.2f', f1_score),
    ('pr: %.2f', precision_score),
    ('re: %.2f', recall_score),
):
    print(_fmt % (_scorer(_dev_gold, pred, average='macro') * 100))

In [None]:
from utils.plot_confusion_matrix import plot_confusion_matrix
from sklearn.metrics import confusion_matrix

# One sorted label list drives both the matrix layout and the tick names.
labels = sorted(set(true))
# `labels` must be passed by keyword: recent scikit-learn versions reject it as a
# positional argument.
plot_confusion_matrix(
    confusion_matrix(true[:len(pred)], pred, labels=labels),
    target_names=labels,
    normalize=True,
)

In [None]:
# class_mapper = {
#     'background_NS': 'other_NS',
#     'background_SN': 'other_SN',
#     'comparison_NN': 'other_NN',
#     'interpretation-evaluation_SN': 'other_SN',
#     'interpretation-evaluation_NS': 'other_NS',
#     'evidence_NS': 'other_NS',
#     'restatement_NN': 'other_NN',
#     'sequence_NN': 'other_NN',
# #     'solutionhood_SN': 'other_NS',
#     'cause-effect_SN': 'joint_NN',
#     'preparation_SN': 'elaboration_NS',
#     'background_SN': 'joint_NN',
#     'elaboration_NS': 'joint_NN',
    
# }

In [None]:
# Relation classes that keep their own label during evaluation.
top_classes = [
    'attribution_NS',
    'attribution_SN',
    'purpose_NS',
    'purpose_SN',
    'condition_SN',
    'contrast_NN',
    'condition_NS',
    'joint_NN',
    'concession_NS',
    'same-unit_NN',
    'elaboration_NS',
    'cause-effect_NS',
    'solutionhood_SN',
    'cause-effect_SN',
]

# Every label outside top_classes collapses into 'other' plus its last three
# characters (e.g. 'background_NS' -> 'other_NS').
class_mapper = {
    label: f'other{label[-3:]}'
    for label in labels
    if label not in top_classes
}

In [None]:
# numpy is needed here already; the notebook's own `import numpy as np` only
# appears in a later cell.
import numpy as np

# Collapse labels outside top_classes into 'other_*' on both sides.
true = [class_mapper.get(value, value) for value in true]
pred = [class_mapper.get(value, value) for value in pred]

# A predicted 'other_*' label falls back to 'joint_NN'.
pred_mapper = {
    'other_NN': 'joint_NN',
    'other_NS': 'joint_NN',
    'other_SN': 'joint_NN'
}
pred = [pred_mapper.get(value, value) for value in pred]

#pred = [value if not 'other' in value else true[i] for i, value in enumerate(pred)]

# Keep only the samples whose gold label is not one of the 'other_*' classes.
_gold_array = np.array(true)
_to_stay = (_gold_array != 'other_NN') & (_gold_array != 'other_SN') & (_gold_array != 'other_NS')

_true = _gold_array[_to_stay]
# The mask is cut to len(pred) because there may be fewer predictions than gold labels.
_pred = np.array(pred)[_to_stay[:len(pred)]]
# Sorted so the order is deterministic and matches scikit-learn's default label order.
labels = sorted(set(_true))

In [None]:
from sklearn.metrics import f1_score, precision_score, recall_score

# Score only the samples kept by `_to_stay` (gold label is not 'other_*'), exactly as
# the test-set metrics further below do, so dev and test numbers are comparable.
print('f1: %.2f'%(f1_score(_true[:len(_pred)], _pred, average='macro')*100))
print('pr: %.2f'%(precision_score(_true[:len(_pred)], _pred, average='macro')*100))
print('re: %.2f'%(recall_score(_true[:len(_pred)], _pred, average='macro')*100))

In [None]:
# Show the gold labels that remain after the 'other_*' samples were filtered out.
labels

In [None]:
# labels = ['attribution_NS', 'purpose_NS', 'joint_NN', 'attribution_SN', 'purpose_SN', 'condition_SN', 
#           'condition_NS', 'contrast_NN', 'elaboration_NS', 'same-unit_NN', 'cause-effect_NS', 
#           'interpretation-evaluation_NS', 'concession_NS', ]

# labels = ['attribution_NS', 'purpose_NS', 'attribution_SN', 'purpose_SN', 'condition_SN', 'contrast_NN', 
#           'joint_NN', 'solutionhood_SN', 'concession_NS', 
#           'condition_NS', 'cause-effect_NS', 
#           'interpretation-evaluation_NS', 'same-unit_NN', 'elaboration_NS', 'restatement_NN', 
#           'interpretation-evaluation_SN', 'preparation_SN', 'background_SN']

# Without an explicit `labels` argument confusion_matrix orders rows/columns by sorted
# label value, while the tick names may come from an unordered set. Use one sorted
# label list for both so every name sits on its own row/column.
_cm_labels = sorted(set(_true))
plot_confusion_matrix(
    confusion_matrix(_true[:len(_pred)], _pred, labels=_cm_labels),
    target_names=_cm_labels,
    normalize=True,
)

In [None]:
import numpy as np

# Print each distinct gold relation that survived the 'other_*' filter, one per line.
_kept_relations = np.unique(_true)
for rel in _kept_relations:
    print(rel)

On test set

In [None]:
import json
import pandas as pd

# Gold labels: first column of the headerless, tab-separated test file.
_test_table = pd.read_csv(TEST_FILE_PATH, sep='\t', header=None)
true = _test_table[0].values.tolist()

# Predicted labels written for the test set in the selected run directory.
_test_predictions_path = f'{MODEL_PATH}/{RESULT_DIR}/predictions_test.json'
pred = load_predictions(_test_predictions_path)

# Per-class report on the raw (unmapped) labels.
_test_report = classification_report(true[:len(pred)], pred, digits=4)
print(_test_report)

In [None]:
# Number of gold test labels; compare with the 'length of result' printed by load_predictions.
len(true)

In [None]:
# Same label mapping as for the dev set: labels covered by class_mapper become
# 'other_*', and a predicted 'other_*' label is replaced through pred_mapper.
true = [class_mapper.get(value, value) for value in true]
pred = [class_mapper.get(value, value) for value in pred]
pred = [pred_mapper.get(value, value) for value in pred]

# Keep only the samples whose gold label is not one of the 'other_*' classes.
_gold_array = np.array(true)
_to_stay = (_gold_array != 'other_NN') & (_gold_array != 'other_SN') & (_gold_array != 'other_NS')

_true = _gold_array[_to_stay]
# Cut the mask to len(pred), as in the dev cell: a boolean mask longer than the
# prediction array would raise an IndexError when predictions are fewer than gold labels.
_pred = np.array(pred)[_to_stay[:len(pred)]]

In [None]:
# Per-class report on the test samples whose gold label is not an 'other_*' class.
print(classification_report(_true[:len(_pred)], _pred, digits=4))

In [None]:
from sklearn.metrics import f1_score, precision_score, recall_score

# Macro-averaged F1 / precision / recall on the filtered test samples, as percentages.
_test_gold = _true[:len(_pred)]
for _fmt, _scorer in (
    ('f1: %.2f', f1_score),
    ('pr: %.2f', precision_score),
    ('re: %.2f', recall_score),
):
    print(_fmt % (_scorer(_test_gold, _pred, average='macro') * 100))

### Ensemble: (Logreg+Catboost) + BiMPM

In [None]:
import pandas as pd

random_state = 41

# Train samples are not loaded here; the original code is kept for reference.
# train_samples = []
# for file in train:
#     train_samples.append(pd.read_pickle(file.replace('.edus', '.gold.pkl')))
# train_samples = pd.concat(train_samples).sample(
#     frac=1, random_state=random_state).reset_index(drop=True)

# Each '.edus' file has a pickled DataFrame next to it with the '.gold.pkl' suffix.
dev_samples = [pd.read_pickle(file.replace('.edus', '.gold.pkl')) for file in dev]
test_samples = [pd.read_pickle(file.replace('.edus', '.gold.pkl')) for file in test]

# Merge the per-file frames, shuffle reproducibly and renumber the rows.
dev_samples = pd.concat(dev_samples)
dev_samples = dev_samples.sample(frac=1, random_state=random_state)
dev_samples = dev_samples.reset_index(drop=True)

test_samples = pd.concat(test_samples)
test_samples = test_samples.sample(frac=1, random_state=random_state)
test_samples = test_samples.reset_index(drop=True)

In [None]:
TARGET = 'category_id'
MAX_LEN = 100


def _normalize_relations(samples, target, max_len):
    """Merge relation labels, build the 'relation' column and drop long samples.

    The same steps were previously written out twice, once for the dev and once
    for the test frame.

    Args:
        samples: DataFrame with the ``target``, 'order', 'tokens_x' and
            'tokens_y' columns. The ``target`` and 'relation' columns are
            written on this frame in place.
        target: Name of the column holding the raw category label.
        max_len: Rows whose 'tokens_x' or 'tokens_y' has ``len() >= max_len``
            are removed.

    Returns:
        The filtered DataFrame.
    """
    # Replacements are applied one after another, in this order.
    for sources, merged in (
        (['antithesis_r'], 'contrast_m'),
        (['cause_r', 'effect_r'], 'cause-effect_r'),
        (['conclusion_r'], 'restatement_m'),
        (['evaluation_r'], 'interpretation-evaluation_r'),
        (['motivation_r'], 'condition_r'),
    ):
        samples[target] = samples[target].replace(sources, merged)

    # Drop the last character of the category label and append the 'order' value
    # (presumably 'NS' / 'SN' / 'NN', judging by the labels replaced below).
    samples['relation'] = samples[target].map(lambda row: row[:-1]) + samples['order']

    for sources, merged in (
        (['restatement_SN', 'restatement_NS'], 'restatement_NN'),
        (['contrast_SN', 'contrast_NS'], 'contrast_NN'),
        (['solutionhood_NS', 'preparation_NS'], 'elaboration_NS'),
        (['concession_SN', 'evaluation_SN', 'elaboration_SN', 'evidence_SN'], 'preparation_SN'),
    ):
        samples['relation'] = samples['relation'].replace(sources, merged)

    samples = samples[samples.tokens_x.map(len) < max_len]
    return samples[samples.tokens_y.map(len) < max_len]


dev_samples = _normalize_relations(dev_samples, TARGET, MAX_LEN)
test_samples = _normalize_relations(test_samples, TARGET, MAX_LEN)

# From here on the combined relation label is the prediction target.
TARGET = 'relation'

In [None]:
import pickle


def _load_pickle(path):
    """Unpickle and return the object stored at ``path``, closing the file afterwards."""
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# NOTE(review): pickle can execute arbitrary code while loading; only load
# artifacts that come from a trusted source.
fs_catboost_plus_logreg = _load_pickle('models/label_predictor/model.pkl')
lab_encoder = _load_pickle('models/label_predictor/label_encoder.pkl')
scaler = _load_pickle('models/label_predictor/scaler.pkl')
drop_columns = _load_pickle('models/label_predictor/drop_columns.pkl')

In [None]:
# Train split is not needed here; the original code is kept for reference.
# y_train, X_train = train_samples[TARGET].to_frame(), train_samples.drop(TARGET, axis=1).drop(
#     columns=drop_columns + ['category_id'])

# Target as a one-column frame; features are everything except the target, the
# stored drop_columns and the raw 'category_id' label.
y_dev = dev_samples[TARGET].to_frame()
X_dev = dev_samples.drop(columns=TARGET).drop(columns=drop_columns + ['category_id'])

y_test = test_samples[TARGET].to_frame()
X_test = test_samples.drop(columns=TARGET).drop(columns=drop_columns + ['category_id'])

# Apply the already fitted scaler; the result is wrapped in a frame that keeps the
# original row index (column names are not restored).
X_scaled_np = scaler.transform(X_dev)
X_dev = pd.DataFrame(data=X_scaled_np, index=X_dev.index)

X_scaled_np = scaler.transform(X_test)
X_test = pd.DataFrame(data=X_scaled_np, index=X_test.index)

In [None]:
from sklearn import metrics

# Predict encoded classes on the test features and decode them back to label strings.
_encoded_test_pred = fs_catboost_plus_logreg.predict(X_test)
predicted = lab_encoder.inverse_transform(_encoded_test_pred)

_weighted_f1 = metrics.f1_score(y_test.values, predicted, average='weighted')
_macro_f1 = metrics.f1_score(y_test.values, predicted, average='macro')
_accuracy = metrics.accuracy_score(y_test.values, predicted)

print('weighted f1: ', _weighted_f1)
print('macro f1: ', _macro_f1)
print('accuracy: ', _accuracy)
print()
print(metrics.classification_report(y_test, predicted, digits=4))

# Macro precision / recall reported as percentages.
_macro_precision = metrics.precision_score(y_test, predicted, average='macro')
_macro_recall = metrics.recall_score(y_test, predicted, average='macro')
print('macro precision: %.2f'%(_macro_precision*100.))
print('macro recall: %.2f'%(_macro_recall*100.))