## Binary structure classification used in tree building

1. prepare train/test sets
2. generate config file for bimpm model

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import os
import glob
import pandas as pd
import pickle
from utils.file_reading import read_edus, read_gold, read_negative, read_annotation

In [None]:
def _prepare_sequence(sequence):
    symbol_map = {
        'x': 'х',
        'X': 'X',
        'y': 'у',
        '—': '-',
        '“': '«',
        '‘': '«',
        '”': '»',
        '’': '»',
        '😆': '😄',
        '😊': '😄',
        '😑': '😄',
        '😔': '😄',
        '😉': '😄',
        '❗': '😄',
        '🤔': '😄',
        '😅': '😄',
        '⚓': '😄',
        'ε': 'α',
        'ζ': 'α',
        'η': 'α',
        'μ': 'α',
        'δ': 'α',
        'λ': 'α',
        'ν': 'α',
        'β': 'α',
        'γ': 'α',
        'と': '尋',
        'の': '尋',
        '神': '尋',
        '隠': '尋',
        'し': '尋',
    }

    result = []

    for token in sequence.split():

        for key, value in symbol_map.items():
            token = token.replace(key, value)

        for keyword in ['www', 'http']:
            if keyword in token:
                token = '_html_'

        result.append(token)

    return ' '.join(result)

In [None]:
def correct_samples(row):
    """Move stray leading punctuation out of a snippet pair.

    A leading ``,`` or ``.`` on ``snippet_x`` is dropped. A leading ``,`` or
    ``.`` on ``snippet_y`` is appended to ``snippet_x`` and removed from
    ``snippet_y``. Whitespace left at the edges after the removal is stripped.

    ``row`` is any object with string attributes ``snippet_x`` and
    ``snippet_y`` (e.g. a DataFrame row passed by ``apply(..., axis=1)``);
    it is modified in place and returned.
    """
    punctuation = (',', '.')
    # Slicing [:1] yields '' for an empty snippet (never a match), whereas
    # indexing [0] raised IndexError on empty snippets.
    if row.snippet_x[:1] in punctuation:
        row.snippet_x = row.snippet_x[1:].strip()
    if row.snippet_y[:1] in punctuation:
        row.snippet_x += row.snippet_y[0]
        row.snippet_y = row.snippet_y[1:].strip()
    return row

### Make a directory

In [None]:
MODEL_PATH = 'models/structure_predictor_lstm'
# Unlike a plain `mkdir`, this also creates the parent directory when it is
# missing and does not fail when the cell is re-run on an existing directory.
os.makedirs(MODEL_PATH, exist_ok=True)

# TSV files written by the cells below.
TRAIN_FILE_PATH = os.path.join(MODEL_PATH, 'structure_cf4_train.tsv')
DEV_FILE_PATH = os.path.join(MODEL_PATH, 'structure_cf4_dev.tsv')
TEST_FILE_PATH = os.path.join(MODEL_PATH, 'structure_cf4_test.tsv')

###  Generate train/test files

In [None]:
from utils.train_test_split import split_train_dev_test

# Split the corpus under ./data into three collections; the cells below iterate
# over each of them and strip the '.edus' suffix from every entry (presumably
# paths of *.edus files -- verify against utils.train_test_split).
train, dev, test = split_train_dev_test('./data')

In [None]:
# Pairs are kept only when both len(tokens_x) and len(tokens_y) are below MAX_LEN.
MAX_LEN = 250
# NOTE(review): MAX_DOCS is not referenced by any cell visible in this notebook;
# confirm whether it is still needed.
MAX_DOCS = -1

In [None]:
# NOTE(review): `gold` is first assigned inside the loop of the following cell,
# so running the notebook top to bottom raises NameError here. Presumably a
# leftover inspection cell -- confirm whether it should be moved or removed.
gold.keys()

In [None]:
from tqdm.autonotebook import tqdm

random_state = 45
train_samples = []

for file in tqdm(train):
    # Positive pairs (relation = 1); the '.edus' suffix is stripped from the
    # path before it is handed to the reader.
    gold = read_gold(file.replace('.edus', ''), features=True)
    gold['relation'] = 1
    gold['len_x'] = gold.tokens_x.map(len)
    # .copy() after each filter gives an independent frame, so the column
    # assignments that follow do not write into a slice of the unfiltered
    # frame (which triggers pandas' SettingWithCopyWarning).
    gold = gold[gold.len_x < MAX_LEN].copy()
    gold['len_y'] = gold.tokens_y.map(len)
    gold = gold[gold.len_y < MAX_LEN].copy()
    gold['snippet_x'] = gold.tokens_x.map(' '.join)
    gold['snippet_y'] = gold.tokens_y.map(' '.join)
    gold = gold.apply(correct_samples, axis=1)
    sample = gold[['relation', 'snippet_x', 'snippet_y', 'same_sentence']]

    # Negative pairs (relation = 0) go through exactly the same steps.
    negative = read_negative(file.replace('.edus', ''), features=True)
    negative['relation'] = 0
    negative['len_x'] = negative.tokens_x.map(len)
    negative = negative[negative.len_x < MAX_LEN].copy()
    negative['len_y'] = negative.tokens_y.map(len)
    negative = negative[negative.len_y < MAX_LEN].copy()
    negative['snippet_x'] = negative.tokens_x.map(' '.join)
    negative['snippet_y'] = negative.tokens_y.map(' '.join)
    negative = negative.apply(correct_samples, axis=1)

    sample = pd.concat([sample, negative[['relation', 'snippet_x', 'snippet_y', 'same_sentence']]])
    # Sorting by label and keeping the last duplicate means a pair that occurs
    # with both labels survives as a positive example.
    sample = sample.sort_values(['relation'], ascending=True).drop_duplicates(['snippet_x', 'snippet_y'], keep='last')
    train_samples.append(sample)

# Shuffle reproducibly, normalise the text, then expose the row number as an
# 'index' column.
train_samples = pd.concat(train_samples).sample(frac=1, random_state=random_state).reset_index(drop=True)
train_samples['snippet_x'] = train_samples.snippet_x.map(_prepare_sequence)
train_samples['snippet_y'] = train_samples.snippet_y.map(_prepare_sequence)
train_samples.reset_index(level=0, inplace=True)

In [None]:
# Class balance: number of pairs per `relation` label (1 = read_gold, 0 = read_negative).
train_samples.relation.value_counts()

In [None]:
# Preview the columns in the order in which they are written to the TSV file.
train_samples[['relation', 'snippet_x', 'snippet_y', 'same_sentence', 'index']].head()

In [None]:
# Headerless TSV with five fields per row. The column order matters: the
# `custom_pairs_reader` defined below accesses fields by position
# (row[0] = label, row[1] = premise, row[2] = hypothesis, row[3] = same_sentence)
# and only accepts rows with exactly five fields.
train_samples[['relation', 'snippet_x', 'snippet_y', 'same_sentence', 'index']].to_csv(TRAIN_FILE_PATH, sep='\t', header=False, index=False)

#### Make dev set

In [None]:
random_state = 45
dev_samples = []

for file in tqdm(dev):
    # Positive pairs (relation = 1); the '.edus' suffix is stripped from the
    # path before it is handed to the reader.
    gold = read_gold(file.replace('.edus', ''), features=True)
    gold['relation'] = 1
    gold['len_x'] = gold.tokens_x.map(len)
    # .copy() after each filter gives an independent frame, so the column
    # assignments that follow do not write into a slice of the unfiltered
    # frame (which triggers pandas' SettingWithCopyWarning).
    gold = gold[gold.len_x < MAX_LEN].copy()
    gold['len_y'] = gold.tokens_y.map(len)
    gold = gold[gold.len_y < MAX_LEN].copy()
    gold['snippet_x'] = gold.tokens_x.map(' '.join)
    gold['snippet_y'] = gold.tokens_y.map(' '.join)
    gold = gold.apply(correct_samples, axis=1)
    sample = gold[['relation', 'snippet_x', 'snippet_y', 'same_sentence']]

    # Negative pairs (relation = 0) go through exactly the same steps.
    negative = read_negative(file.replace('.edus', ''), features=True)
    negative['relation'] = 0
    negative['len_x'] = negative.tokens_x.map(len)
    negative = negative[negative.len_x < MAX_LEN].copy()
    negative['len_y'] = negative.tokens_y.map(len)
    negative = negative[negative.len_y < MAX_LEN].copy()
    negative['snippet_x'] = negative.tokens_x.map(' '.join)
    negative['snippet_y'] = negative.tokens_y.map(' '.join)
    negative = negative.apply(correct_samples, axis=1)

    sample = pd.concat([sample, negative[['relation', 'snippet_x', 'snippet_y', 'same_sentence']]])
    # Sorting by label and keeping the last duplicate means a pair that occurs
    # with both labels survives as a positive example.
    sample = sample.sort_values(['relation'], ascending=True).drop_duplicates(['snippet_x', 'snippet_y'], keep='last')
    dev_samples.append(sample)

# Shuffle reproducibly, expose the row number as an 'index' column, normalise
# the text and write a headerless five-column TSV.
dev_samples = pd.concat(dev_samples).sample(frac=1, random_state=random_state).reset_index(drop=True)
dev_samples.reset_index(level=0, inplace=True)
dev_samples['snippet_x'] = dev_samples.snippet_x.map(_prepare_sequence)
dev_samples['snippet_y'] = dev_samples.snippet_y.map(_prepare_sequence)
dev_samples[['relation', 'snippet_x', 'snippet_y', 'same_sentence', 'index']].to_csv(DEV_FILE_PATH, sep='\t', header=False, index=False)

#### Make test set

In [None]:
random_state = 45
test_samples = []

for file in tqdm(test):
    # Positive pairs (relation = 1); the '.edus' suffix is stripped from the
    # path before it is handed to the reader.
    gold = read_gold(file.replace('.edus', ''), features=True)
    gold['relation'] = 1
    gold['len_x'] = gold.tokens_x.map(len)
    # .copy() after each filter gives an independent frame, so the column
    # assignments that follow do not write into a slice of the unfiltered
    # frame (which triggers pandas' SettingWithCopyWarning).
    gold = gold[gold.len_x < MAX_LEN].copy()
    gold['len_y'] = gold.tokens_y.map(len)
    gold = gold[gold.len_y < MAX_LEN].copy()
    gold['snippet_x'] = gold.tokens_x.map(' '.join)
    gold['snippet_y'] = gold.tokens_y.map(' '.join)
    gold = gold.apply(correct_samples, axis=1)
    sample = gold[['relation', 'snippet_x', 'snippet_y', 'same_sentence']]

    # Negative pairs (relation = 0) go through exactly the same steps.
    negative = read_negative(file.replace('.edus', ''), features=True)
    negative['relation'] = 0
    negative['len_x'] = negative.tokens_x.map(len)
    negative = negative[negative.len_x < MAX_LEN].copy()
    negative['len_y'] = negative.tokens_y.map(len)
    negative = negative[negative.len_y < MAX_LEN].copy()
    negative['snippet_x'] = negative.tokens_x.map(' '.join)
    negative['snippet_y'] = negative.tokens_y.map(' '.join)
    negative = negative.apply(correct_samples, axis=1)

    sample = pd.concat([sample, negative[['relation', 'snippet_x', 'snippet_y', 'same_sentence']]])
    # Sorting by label and keeping the last duplicate means a pair that occurs
    # with both labels survives as a positive example.
    sample = sample.sort_values(['relation'], ascending=True).drop_duplicates(['snippet_x', 'snippet_y'], keep='last')
    test_samples.append(sample)

# Shuffle reproducibly, expose the row number as an 'index' column, normalise
# the text and write a headerless five-column TSV.
test_samples = pd.concat(test_samples).sample(frac=1, random_state=random_state).reset_index(drop=True)
test_samples.reset_index(level=0, inplace=True)
test_samples['snippet_x'] = test_samples.snippet_x.map(_prepare_sequence)
test_samples['snippet_y'] = test_samples.snippet_y.map(_prepare_sequence)
test_samples[['relation', 'snippet_x', 'snippet_y', 'same_sentence', 'index']].to_csv(TEST_FILE_PATH, sep='\t', header=False, index=False)

### Customize the model with additional inputs

In [None]:
! rm -r models/customization_package
! mkdir models/customization_package
! touch models/customization_package/__init__.py
! mkdir models/customization_package/dataset_readers
! mkdir models/customization_package/model

In [None]:
%%writefile models/customization_package/dataset_readers/__init__.py

from customization_package.dataset_readers.custom_reader import CustomDataReader

In [None]:
%%writefile models/customization_package/dataset_readers/custom_reader.py

from typing import Dict, List
import logging
import csv

from overrides import overrides

from allennlp.common.file_utils import cached_path
from allennlp.data.dataset_readers.dataset_reader import DatasetReader
from allennlp.data.fields import LabelField, TextField, Field, ArrayField
from allennlp.data.instance import Instance
from allennlp.data.tokenizers import Tokenizer
from allennlp.data.token_indexers import TokenIndexer, SingleIdTokenIndexer
from allennlp.data.tokenizers import Token, Tokenizer, CharacterTokenizer, WordTokenizer

import numpy as np

logger = logging.getLogger(__name__)

@Tokenizer.register("simple")
class WhitespaceTokenizer(Tokenizer):
    """Tokenizer that splits text on whitespace and applies no other processing."""

    def __init__(self) -> None:
        super().__init__()

    def _tokenize(self, text):
        # str.split() without arguments splits on runs of whitespace and
        # produces no empty strings.
        return list(map(Token, text.split()))

    @overrides
    def tokenize(self, text: str) -> List[Token]:
        return self._tokenize(text)

@DatasetReader.register("custom_pairs_reader")
class CustomDataReader(DatasetReader):
    """
    Reads tab-separated files with five fields per row
    (``label, premise, hypothesis, same_sentence, index``) and yields one
    instance per row with ``premise`` and ``hypothesis`` text fields, a
    ``metadata`` array field and a ``label`` field.

    # Parameters
    tokenizer : `Tokenizer`, optional
        Tokenizer to use to split the premise and hypothesis into words or other kinds of tokens.
        Defaults to `WhitespaceTokenizer`.
    token_indexers : `Dict[str, TokenIndexer]`, optional
        Indexers used to define input token representations. Defaults to `{"tokens":
        SingleIdTokenIndexer()}`.
    lazy : `bool`, optional (default = `True`)
        Passed through to the `DatasetReader` constructor.
    """

    def __init__(
        self,
        tokenizer: Tokenizer = None,
        token_indexers: Dict[str, TokenIndexer] = None,
        lazy: bool = True,
    ) -> None:
        super().__init__(lazy)
        self._tokenizer = tokenizer or WhitespaceTokenizer()
        self._token_indexers = token_indexers or {"tokens": SingleIdTokenIndexer()}

    @overrides
    def _read(self, file_path):
        logger.info("Reading instances from lines in file at: %s", file_path)
        # Decode explicitly as UTF-8 instead of relying on the platform default
        # encoding; newline="" is the mode the csv module expects so that it
        # can handle line endings inside quoted fields itself.
        with open(cached_path(file_path), "r", encoding="utf-8", newline="") as data_file:
            tsv_in = csv.reader(data_file, delimiter="\t")
            for row in tsv_in:
                if len(row) != 5:
                    # Malformed rows are still skipped, but no longer silently:
                    # a dropped row shifts predictions relative to the gold
                    # labels when both are compared by position.
                    logger.warning(
                        "Skipping line %d of %s: expected 5 fields, got %d",
                        tsv_in.line_num, file_path, len(row),
                    )
                    continue
                yield self.text_to_instance(
                    premise=row[1], hypothesis=row[2], label=row[0], same_sentence=row[3]
                )

    @overrides
    def text_to_instance(
        self,  # type: ignore
        premise: str,
        hypothesis: str,
        label: str,
        same_sentence: float,
    ) -> Instance:
        """
        Build an instance from one premise/hypothesis pair.

        ``label`` may be ``None``, in which case no ``label`` field is added.
        """
        fields: Dict[str, Field] = {}
        tokenized_premise = self._tokenizer.tokenize(premise)
        tokenized_hypothesis = self._tokenizer.tokenize(hypothesis)
        fields["premise"] = TextField(tokenized_premise, self._token_indexers)
        fields["hypothesis"] = TextField(tokenized_hypothesis, self._token_indexers)
        # zip(*x) transposes `same_sentence`. For the string handed over by
        # `_read` this yields a single row holding its characters.
        # NOTE(review): the annotation says `float`, but zip(*...) needs an
        # iterable -- confirm the intended input type with the other callers.
        additional_features = list(map(list, zip(*same_sentence)))
        fields["metadata"] = ArrayField(np.array(additional_features))
        if label is not None:
            fields["label"] = LabelField(label)

        return Instance(fields)

In [None]:
%%writefile models/customization_package/model/__init__.py

from customization_package.model.custom_bimpm import BiMpm

In [None]:
%%writefile models/customization_package/model/custom_bimpm.py

"""
BiMPM (Bilateral Multi-Perspective Matching) model implementation.
"""

from typing import Dict, Optional, List, Any

from overrides import overrides
import torch
import numpy

from allennlp.common.checks import check_dimensions_match
from allennlp.data import Vocabulary
from allennlp.modules import FeedForward, Seq2SeqEncoder, Seq2VecEncoder, TextFieldEmbedder
from allennlp.models.model import Model
from allennlp.nn import InitializerApplicator, RegularizerApplicator
from allennlp.nn import util
from allennlp.training.metrics import CategoricalAccuracy

from allennlp.modules.bimpm_matching import BiMpmMatching

from allennlp.nn.util import get_text_field_mask
import torch.nn.functional as F


@Model.register("custom_bimpm")
class BiMpm(Model):
    """
    BiMPM sentence-pair classifier extended with extra per-pair features.

    This ``Model`` augments with additional features the BiMPM model described in `Bilateral Multi-Perspective
    Matching for Natural Language Sentences <https://arxiv.org/abs/1702.03814>`_ by Zhiguo Wang et al., 2017,
    implemented in `BIMPM-pytorch <https://github.com/galsang/BIMPM-pytorch>`_.
    The additional features are concatenated to the aggregated premise and hypothesis vectors
    right before the feedforward classifier.
    """
    def __init__(self, vocab: Vocabulary,
                 text_field_embedder: TextFieldEmbedder,
                 matcher_word: BiMpmMatching,
                 encoder1: Seq2SeqEncoder,
                 matcher_forward1: BiMpmMatching,
                 matcher_backward1: BiMpmMatching,
                 encoder2: Seq2SeqEncoder,
                 matcher_forward2: BiMpmMatching,
                 matcher_backward2: BiMpmMatching,
                 aggregator: Seq2VecEncoder,
                 classifier_feedforward: FeedForward,
                 dropout: float = 0.1,
                 initializer: InitializerApplicator = InitializerApplicator(),
                 regularizer: Optional[RegularizerApplicator] = None) -> None:
        super(BiMpm, self).__init__(vocab, regularizer)

        self.text_field_embedder = text_field_embedder

        # Matcher applied directly to the embedded tokens.
        self.matcher_word = matcher_word

        # First encoder with one matcher per half of its output.
        self.encoder1 = encoder1
        self.matcher_forward1 = matcher_forward1
        self.matcher_backward1 = matcher_backward1

        # Second encoder, stacked on the output of the first one.
        self.encoder2 = encoder2
        self.matcher_forward2 = matcher_forward2
        self.matcher_backward2 = matcher_backward2

        self.aggregator = aggregator

        # The outputs of all five matchers are concatenated before aggregation,
        # so the aggregator input must be as wide as their sum.
        matching_dim = self.matcher_word.get_output_dim() + \
                       self.matcher_forward1.get_output_dim() + self.matcher_backward1.get_output_dim() + \
                       self.matcher_forward2.get_output_dim() + self.matcher_backward2.get_output_dim()

        check_dimensions_match(matching_dim, self.aggregator.get_input_dim(),
                               "sum of dim of all matching layers", "aggregator input dim")

        self.classifier_feedforward = classifier_feedforward

        self.dropout = torch.nn.Dropout(dropout)

        self.metrics = {"accuracy": CategoricalAccuracy()}

        self.loss = torch.nn.CrossEntropyLoss()

        initializer(self)

    @overrides
    def forward(self,  # type: ignore
                premise: Dict[str, torch.LongTensor],
                hypothesis: Dict[str, torch.LongTensor],
                metadata: torch.Tensor,
                label: torch.LongTensor=None,# pylint:disable=unused-argument
               ) -> Dict[str, torch.Tensor]:
        # pylint: disable=arguments-differ
        """
        Parameters
        ----------
        premise : Dict[str, torch.LongTensor]
            The premise from a ``TextField``
        hypothesis : Dict[str, torch.LongTensor]
            The hypothesis from a ``TextField``
        metadata : torch.Tensor
            Additional features of the pair (required). The tensor is cast to float and
            reshaped to ``(batch_size, -1)`` before it is concatenated to the aggregated
            premise and hypothesis vectors.
        label : torch.LongTensor, optional (default = None)
            The label for the pair of the premise and the hypothesis
        Returns
        -------
        An output dictionary consisting of:
        logits : torch.FloatTensor
            The output of the feedforward classifier.
        probs : torch.FloatTensor
            Softmax of ``logits`` over the last dimension.
        loss : torch.FloatTensor, optional
            Cross-entropy loss, present only when ``label`` is given.
        """

        mask_premise = util.get_text_field_mask(premise)
        mask_hypothesis = util.get_text_field_mask(hypothesis)

        # embedding and encoding of the premise
        embedded_premise = self.dropout(self.text_field_embedder(premise))
        encoded_premise1 = self.dropout(self.encoder1(embedded_premise, mask_premise))
        encoded_premise2 = self.dropout(self.encoder2(encoded_premise1, mask_premise))

        # embedding and encoding of the hypothesis
        embedded_hypothesis = self.dropout(self.text_field_embedder(hypothesis))
        encoded_hypothesis1 = self.dropout(self.encoder1(embedded_hypothesis, mask_hypothesis))
        encoded_hypothesis2 = self.dropout(self.encoder2(encoded_hypothesis1, mask_hypothesis))

        matching_vector_premise: List[torch.Tensor] = []
        matching_vector_hypothesis: List[torch.Tensor] = []

        def add_matching_result(matcher, encoded_premise, encoded_hypothesis):
            # utility function to get matching result and add to the result list
            matching_result = matcher(encoded_premise, mask_premise, encoded_hypothesis, mask_hypothesis)
            matching_vector_premise.extend(matching_result[0])
            matching_vector_hypothesis.extend(matching_result[1])

        # calculate matching vectors from word embedding, first layer encoding, and second layer encoding
        add_matching_result(self.matcher_word, embedded_premise, embedded_hypothesis)
        # The encoder output is split in two halves along the last dimension
        # (assumes a bidirectional encoder whose output concatenates the forward
        # and backward states -- TODO confirm against the config).
        half_hidden_size_1 = self.encoder1.get_output_dim() // 2
        add_matching_result(self.matcher_forward1,
                            encoded_premise1[:, :, :half_hidden_size_1],
                            encoded_hypothesis1[:, :, :half_hidden_size_1])
        add_matching_result(self.matcher_backward1,
                            encoded_premise1[:, :, half_hidden_size_1:],
                            encoded_hypothesis1[:, :, half_hidden_size_1:])

        half_hidden_size_2 = self.encoder2.get_output_dim() // 2
        add_matching_result(self.matcher_forward2,
                            encoded_premise2[:, :, :half_hidden_size_2],
                            encoded_hypothesis2[:, :, :half_hidden_size_2])
        add_matching_result(self.matcher_backward2,
                            encoded_premise2[:, :, half_hidden_size_2:],
                            encoded_hypothesis2[:, :, half_hidden_size_2:])

        # concat the matching vectors
        matching_vector_cat_premise = self.dropout(torch.cat(matching_vector_premise, dim=2))
        matching_vector_cat_hypothesis = self.dropout(torch.cat(matching_vector_hypothesis, dim=2))

        # aggregate the matching vectors
        aggregated_premise = self.dropout(self.aggregator(matching_vector_cat_premise, mask_premise))
        aggregated_hypothesis = self.dropout(self.aggregator(matching_vector_cat_hypothesis, mask_hypothesis))

        # encode additional information: flatten the extra features to one row per batch element
        batch_size, _ = aggregated_premise.size()
        encoded_meta = metadata.float().view(batch_size, -1)

        # the final forward layer
        logits = self.classifier_feedforward(torch.cat([aggregated_premise, aggregated_hypothesis, encoded_meta], dim=-1))
        probs = torch.nn.functional.softmax(logits, dim=-1)

        output_dict = {'logits': logits, "probs": probs}
        if label is not None:
            loss = self.loss(logits, label)
            for metric in self.metrics.values():
                metric(logits, label)
            output_dict["loss"] = loss

        return output_dict

    @overrides
    def decode(self, output_dict: Dict[str, torch.Tensor]) -> Dict[str, torch.Tensor]:
        """
        Converts indices to string labels, and adds a ``"label"`` key to the result.

        The index is the argmax of ``output_dict["probs"]`` over the last axis and is
        looked up in the ``"labels"`` namespace of the vocabulary.
        """
        predictions = output_dict["probs"].cpu().data.numpy()
        argmax_indices = numpy.argmax(predictions, axis=-1)
        labels = [self.vocab.get_token_from_index(x, namespace="labels")
                  for x in argmax_indices]
        output_dict['label'] = labels
        return output_dict

    @overrides
    def get_metrics(self, reset: bool = False) -> Dict[str, float]:
        """Return the current value of every tracked metric, resetting them when ``reset`` is true."""
        return {metric_name: metric.get_metric(reset) for metric_name, metric in self.metrics.items()}

###  Generate config file

In [None]:
%%writefile $MODEL_PATH/config4_bert.json

// Configuration for a sentence matching model based on:
//   Wang, Zhiguo, Wael Hamza, and Radu Florian. "Bilateral multi-perspective matching for natural language sentences."
//   Proceedings of the 26th International Joint Conference on Artificial Intelligence. 2017.

{
  "dataset_reader": {
    "type": "custom_pairs_reader",
    "lazy": false,
    "token_indexers": {
      "bert": {
          "type": "bert-pretrained",
          "pretrained_model": "rubert_cased_L-12_H-768_A-12_pt",
          "do_lowercase": false,
          "use_starting_offsets": true
      },
      "token_characters": {
        "type": "characters",
        "min_padding_length": 3
      }
    }
  },
  "train_data_path": "structure_predictor_lstm/structure_cf4_train.tsv",
  "validation_data_path": "structure_predictor_lstm/structure_cf4_dev.tsv",
  "model": {
    "type": "custom_bimpm",
    "dropout": 0.2,
    "text_field_embedder": {
        "allow_unmatched_keys": true,
        "embedder_to_indexer_map": {
            "bert": ["bert", "bert-offsets"],
            "token_characters": ["token_characters"],
        },
        "token_embedders": {
            "bert": {
                "type": "bert-pretrained",
                "pretrained_model": "rubert_cased_L-12_H-768_A-12_pt",
            },
            "token_characters": {
                "type": "character_encoding",
                "embedding": {
                    "embedding_dim": 20,
                    "padding_index": 0
                },
                "encoder": {
                    "type": "gru",
                    "input_size": 20,
                    "hidden_size": 50,
                    "num_layers": 1,
                    "bidirectional": true
              }
            }
      }
    },
    "matcher_word": {
      "is_forward": true,
      "hidden_dim": 768+100,
      "num_perspectives": 10,
      "with_full_match": false
    },
    "encoder1": {
      "type": "lstm",
      "bidirectional": true,
      "input_size": 768+100,
      "hidden_size": 200,
      "num_layers": 1
    },
    "matcher_forward1": {
      "is_forward": true,
      "hidden_dim": 200,
      "num_perspectives": 10
    },
    "matcher_backward1": {
      "is_forward": false,
      "hidden_dim": 200,
      "num_perspectives": 10
    },
    "encoder2": {
      "type": "lstm",
      "bidirectional": true,
      "input_size": 400,
      "hidden_size": 200,
      "num_layers": 1
    },
    "matcher_forward2": {
      "is_forward": true,
      "hidden_dim": 200,
      "num_perspectives": 10
    },
    "matcher_backward2": {
      "is_forward": false,
      "hidden_dim": 200,
      "num_perspectives": 10
    },
    "aggregator":{
      "type": "lstm",
      "bidirectional": true,
      "input_size": 264,
      "hidden_size": 100,
      "num_layers": 2,
      "dropout": 0.5
    },
    "classifier_feedforward": {
      "input_dim": 200+200+1,
      // One layer per entry of "hidden_dims" / "activations" / "dropout";
      // AllenNLP's FeedForward raises a ConfigurationError when the list
      // lengths differ from "num_layers".
      "num_layers": 2,
      "hidden_dims": [200, 2],
      "activations": ["relu", "linear"],
      "dropout": [0.5, 0.0]
    },
    "initializer": [
      [".*linear_layers.*weight", {"type": "xavier_normal"}],
      [".*linear_layers.*bias", {"type": "constant", "val": 0}],
      [".*weight_ih.*", {"type": "xavier_normal"}],
      [".*weight_hh.*", {"type": "orthogonal"}],
      [".*bias.*", {"type": "constant", "val": 0}],
      [".*matcher.*match_weights.*", {"type": "kaiming_normal"}]
    ]
  },
  "iterator": {
    "type": "basic",
    "batch_size": 2
  },
  "trainer": {
    "num_epochs": 10,
    "patience": 2,
    "cuda_device": 0,
    "grad_norm": 10.0,
    "validation_metric": "+accuracy",
    "optimizer": {
      "type": "bert_adam",
      "lr": 0.001
    }
  }
}

In [None]:
%%writefile $MODEL_PATH/config4_elmo.json

// Configuration for a sentence matching model based on:
//   Wang, Zhiguo, Wael Hamza, and Radu Florian. "Bilateral multi-perspective matching for natural language sentences."
//   Proceedings of the 26th International Joint Conference on Artificial Intelligence. 2017.

{
  "dataset_reader": {
    "type": "custom_pairs_reader",
    "lazy": false,
    "token_indexers": {
      "token_characters": {
        "type": "characters",
        "min_padding_length": 3
      },
      "elmo": {
        "type": "elmo_characters"
     }
    }
  },
  "train_data_path": "structure_predictor_lstm/structure_cf4_train.tsv",
  "validation_data_path": "structure_predictor_lstm/structure_cf4_dev.tsv",
  "model": {
    "type": "custom_bimpm",
    "dropout": 0.2,
    "text_field_embedder": {
        "token_embedders": {
            "elmo": {
                    "type": "elmo_token_embedder",
                    "options_file": "rsv_elmo/options.json",
                    "weight_file": "rsv_elmo/model.hdf5",
                    "do_layer_norm": false,
                    "dropout": 0.0
            },
            "token_characters": {
                "type": "character_encoding",
                "embedding": {
                    "embedding_dim": 20,
                    "padding_index": 0
                },
                "encoder": {
                    "type": "gru",
                    "input_size": 20,
                    "hidden_size": 50,
                    "num_layers": 1,
                    "bidirectional": true
              }
            }
      }
    },
    "matcher_word": {
      "is_forward": true,
      "hidden_dim": 1024+100,
      "num_perspectives": 10,
      "with_full_match": false
    },
    "encoder1": {
      "type": "lstm",
      "bidirectional": true,
      "input_size": 1024+100,
      "hidden_size": 200,
      "num_layers": 1
    },
    "matcher_forward1": {
      "is_forward": true,
      "hidden_dim": 200,
      "num_perspectives": 10
    },
    "matcher_backward1": {
      "is_forward": false,
      "hidden_dim": 200,
      "num_perspectives": 10
    },
    "encoder2": {
      "type": "lstm",
      "bidirectional": true,
      "input_size": 400,
      "hidden_size": 200,
      "num_layers": 1
    },
    "matcher_forward2": {
      "is_forward": true,
      "hidden_dim": 200,
      "num_perspectives": 10
    },
    "matcher_backward2": {
      "is_forward": false,
      "hidden_dim": 200,
      "num_perspectives": 10
    },
    "aggregator":{
      "type": "lstm",
      "bidirectional": true,
      "input_size": 264,
      "hidden_size": 100,
      "num_layers": 1,
      "dropout": 0.2
    },
    "classifier_feedforward": {
      "input_dim": 200+200+1,
      "num_layers": 2,
      "hidden_dims": [200, 2],
      "activations": ["relu", "linear"],
      "dropout": [0.5, 0.0]
    },
    "initializer": [
      [".*linear_layers.*weight", {"type": "xavier_normal"}],
      [".*linear_layers.*bias", {"type": "constant", "val": 0}],
      [".*weight_ih.*", {"type": "xavier_normal"}],
      [".*weight_hh.*", {"type": "orthogonal"}],
      [".*bias.*", {"type": "constant", "val": 0}],
      [".*matcher.*match_weights.*", {"type": "kaiming_normal"}]
    ]
  },
  "iterator": {
    "type": "basic",
    "batch_size": 2
  },
  "trainer": {
    "num_epochs": 50,
    "patience": 5,
    "cuda_device": 0,
    "grad_norm": 10.0,
    "validation_metric": "+accuracy",
    "optimizer": {
      "type": "adam",
      "lr": 0.001
    }
  }
}

### Train classifier 

In [None]:
%%writefile models/train_structure_predictor4.sh
# usage:
# $ cd models
# $ sh train_structure_predictor4.sh {bert|elmo} results_4

# Both arguments are required: with an empty RESULT_DIR the cleanup below would
# remove the whole structure_predictor_lstm/ directory, data files included.
if [ -z "${1}" ] || [ -z "${2}" ]; then
    echo "usage: sh train_structure_predictor4.sh {bert|elmo} RESULT_DIR" >&2
    exit 1
fi

export METHOD="${1}"
export RESULT_DIR="${2}"
export DEV_FILE_PATH="structure_cf4_dev.tsv"
export TEST_FILE_PATH="structure_cf4_test.tsv"

# -f: a result directory that does not exist yet (first run) is not an error.
rm -rf "structure_predictor_lstm/${RESULT_DIR}/"
# Stop here when training fails; there would be no model archive to predict with.
allennlp train -s "structure_predictor_lstm/${RESULT_DIR}/" "structure_predictor_lstm/config4_${METHOD}.json" --include-package customization_package || exit 1
allennlp predict --use-dataset-reader --silent --output-file "structure_predictor_lstm/${RESULT_DIR}/predictions_dev.json" "structure_predictor_lstm/${RESULT_DIR}/model.tar.gz" "structure_predictor_lstm/${DEV_FILE_PATH}"
allennlp predict --use-dataset-reader --silent --output-file "structure_predictor_lstm/${RESULT_DIR}/predictions_test.json" "structure_predictor_lstm/${RESULT_DIR}/model.tar.gz" "structure_predictor_lstm/${TEST_FILE_PATH}"

###  Evaluate classifier

In [None]:
def load_predictions(path):
    """Read predicted labels from a JSON-lines predictions file.

    Every non-blank line of ``path`` must be a JSON object with a ``"label"``
    entry. The labels are converted to ``int`` and returned as a list in file
    order; the number of labels read is printed.

    Raises ``json.JSONDecodeError`` for a malformed line, ``KeyError`` when a
    line has no ``"label"`` and ``ValueError`` when a label is not an integer.
    """
    # Local import so the function does not depend on another cell having
    # imported json first.
    import json

    with open(path, 'r', encoding='utf-8') as file:
        # Iterate the file lazily; whitespace-only lines (e.g. a trailing blank
        # line) are skipped instead of crashing json.loads.
        result = [int(json.loads(line)["label"]) for line in file if line.strip()]

    print('length of result:', len(result))
    return result

On dev set

In [None]:
# Sub-directory of MODEL_PATH that holds predictions_dev.json / predictions_test.json;
# it must equal the second argument given to train_structure_predictor4.sh.
RESULT_DIR = 'results_4'

In [None]:
import pandas as pd
import json

# Column 0 of the headerless TSV is the gold `relation` label.
true = pd.read_csv(DEV_FILE_PATH, sep='\t', header=None)[0].values.tolist()
pred = load_predictions(f'{MODEL_PATH}/{RESULT_DIR}/predictions_dev.json')
# NOTE(review): compare this with the 'length of result' printed above; the
# metric cells below silently truncate `true` to len(pred).
print('length of true labels:', len(true))

In [None]:
from sklearn.metrics import classification_report

# `true` is cut to the number of predictions. NOTE(review): if the lengths
# differ because rows were skipped in the middle of the file, the pairs compared
# here are misaligned -- confirm both lengths match before trusting the report.
print(classification_report(true[:len(pred)], pred))

In [None]:
from sklearn.metrics import f1_score, precision_score, recall_score

# F1, precision and recall in percent, computed on the gold labels cut to the
# number of predictions.
gold_labels = true[:len(pred)]
for tag, scorer in (('f1', f1_score), ('pr', precision_score), ('re', recall_score)):
    print(f'{tag}: {scorer(gold_labels, pred) * 100:.2f}')

On test set

In [None]:
import pandas as pd
import json

# Column 0 of the headerless TSV is the gold `relation` label.
true = pd.read_csv(TEST_FILE_PATH, sep='\t', header=None)[0].values.tolist()
pred = load_predictions(f'{MODEL_PATH}/{RESULT_DIR}/predictions_test.json')
# NOTE(review): compare this with the 'length of result' printed above; the
# metric cells below silently truncate `true` to len(pred).
print('length of true labels:', len(true))

In [None]:
from sklearn.metrics import classification_report

# `true` is cut to the number of predictions. NOTE(review): if the lengths
# differ because rows were skipped in the middle of the file, the pairs compared
# here are misaligned -- confirm both lengths match before trusting the report.
print(classification_report(true[:len(pred)], pred))

In [None]:
from sklearn.metrics import f1_score, precision_score, recall_score

# F1, precision and recall in percent, computed on the gold labels cut to the
# number of predictions.
gold_labels = true[:len(pred)]
for tag, scorer in (('f1', f1_score), ('pr', precision_score), ('re', recall_score)):
    print(f'{tag}: {scorer(gold_labels, pred) * 100:.2f}')