### Generate inputs for SRE models

In [1]:
import os
import io
import re
import sys

import numpy as np
import pandas as pd
from time import time
import matplotlib.pyplot as plt

import pickle
from csv import reader

import tensorflow as tf

from transformers import BertTokenizer, TFBertModel

#### Helper Functions

In [2]:
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')

In [3]:
full_path = '../data/sre_em/sre_em_sample.csv'

In [4]:
def generate_entity_start_mask(snippetTokens, max_length, start1, start2):
    """
    Helper function that generates a mask 
    that picks out the start marker for each entity 
    given a list of snippet tokens
    """
    
    e1_mask = np.zeros(shape=(max_length,), dtype=bool)
    e1_mask[np.argwhere(np.array(snippetTokens) == start1)] = True

    e2_mask = np.zeros(shape=(max_length,), dtype=bool)
    e2_mask[np.argwhere(np.array(snippetTokens) == start2)] = True

    return e1_mask, e2_mask

In [5]:
def generate_entity_mention_mask(snippetTokens, max_length, start1, start2):
    """
    Helper function that generates a mask
    that picks out the tokens for each entity
    between (but not including) the entity markers
    """
    
    em_markers = [start1, '[/E1]', start2, '[/E2]']
    
    e1_mask = np.zeros(shape=(max_length,), dtype=bool)
    e2_mask = np.zeros(shape=(max_length,), dtype=bool)
    in_e1 = False
    in_e2 = False
    
    for (i, t) in enumerate(snippetTokens):
        if t in em_markers:
            if t in [start1, '[/E1]']:
                in_e1 = not in_e1
            elif t in [start2, '[/E2]']:
                in_e2 = not in_e2
        else:
            if in_e1 is True:
                e1_mask[i] = True
            elif in_e2 is True:
                e2_mask[i] = True
                
    return e1_mask, e2_mask

In [6]:
def generate_ner_mention_mask(snippetTokens, max_length, start1, start2):
    """
    Helper function that generates a mask
    that picks out the tokens for each entity
    between the entity markers, including the ner marker
    """
    
    em_markers = [start1, '[/E1]', start2, '[/E2]']
    
    e1_mask = np.zeros(shape=(max_length,), dtype=bool)
    e2_mask = np.zeros(shape=(max_length,), dtype=bool)
    in_e1 = False
    in_e2 = False
    
    for (i, t) in enumerate(snippetTokens):
        if t in em_markers:
            if t in [start1, '[/E1]']:
                in_e1 = not in_e1
            elif t in [start2, '[/E2]']:
                in_e2 = not in_e2
        else:
            if in_e1 is True:
                e1_mask[i] = True
            elif in_e2 is True:
                e2_mask[i] = True
    
    x1 = snippetTokens.index(start1)
    e1_mask[x1] = True
    
    x2 = snippetTokens.index(start2)
    e2_mask[x2] = True
                
    return e1_mask, e2_mask

In [7]:
# test for mask functions
tokens = ['[CLS]', 'The', 'Α', 'toxic', 'compound', '[/E1]', 'was', 'Δ', 'heated', 'to', 'reflux', '[/E2]', 'for', 'one', 'hour', '[SEP]']
start1 = 'Α'
start2 = 'Δ'

mask1, mask2 = generate_entity_start_mask(tokens, len(tokens), start1, start2)
mask3, mask4 = generate_entity_mention_mask(tokens, len(tokens), start1, start2)
mask5, mask6 = generate_ner_mention_mask(tokens, len(tokens), start1, start2)

In [7]:
def generate_input_lists(full_path, marker_type, head_type, max_length=500):
    """
    [Function description here]
    
    marker_type = marker used: 'em' or 'ner'
    head_type = relation representation for classification: 'cls', 'start', 'pool', 'ner'
    """
    
    # lists for BERT input
    bertTokenIDs = []
    bertMasks = []
    bertSeqIDs = []
    
    # list for labels
    origLabels = []
    codedLabels = []

    # lists for entity masks
    entity1Masks = []
    entity2Masks = []
    
    # lists for processing
    snippetLengthList = []
    discardedEntries = []
    
    # dictionary for converting labels to code
    code = {'ARG1': 0, 'ARGM': 1}

    # determine which marker list to use
    if marker_type == 'em':
        markers = ['[E1]', '[/E1]', '[E2]', '[/E2]']
    elif marker_type == 'ner':
        markers = ['Α', 'Β', 'Π', 'Σ', 'Ο', 'Τ', 'Θ', 'Ψ', 'Υ', 'Χ', 'Λ', 'Δ', '[/E1]', '[/E2]']
        
    
    # open file and read lines of text
    # each line is an entry
    with io.open(full_path, 'r', encoding='utf-8', errors='ignore') as f:
        text = f.readlines()

    for line in text:

        parsed_line = line.strip().split('\t')

        snippet_id = parsed_line[0]
        label = parsed_line[1]
        snippet = parsed_line[2].split()

        # generate inputs for BERT
        # convert snippets to tokenIDs, cap snippet length using max_length
        # snippets which are shorter than max_length are padded
        # snippets which are longer are truncated
        # truncated snippets with only one entity are discarded
        # all snippets end with a [SEP] token, padded or not

        # tokenize snippet, except for entity markers
        # identify start markers for each entity
        snippetTokens = ['[CLS]']
        start1 = ''
        start2 = ''
        i = 1

        for word in snippet:
            if word not in markers:
                tokens = tokenizer.tokenize(word)
                snippetTokens.extend(tokens)
            else:
                snippetTokens.append(word)
                if i == 1:
                    start1 = word
                if i == 3:
                    start2 = word
                i += 1

        # check that both entities will make it within max_length
        # by finding the index for [/E2] and comparing it to (max_length - 1)
        check = snippetTokens.index('[/E2]')

        # discard if only one entity will make it
        if check >= (max_length - 1):
            discardedEntries.append(snippet_id)
            continue

        # create space for at least a final [SEP] token
        if len(snippetTokens) >= max_length:
            snippetTokens = snippetTokens[:(max_length - 1)]
        
        # figure out snippet length for padding or truncating
        snippetLength = len(snippetTokens) + 1
        snippetLengthList.append(snippetLength - 2)

        # add [SEP] token and padding
        snippetTokens += ['[SEP]'] + ['[PAD]'] * (max_length - snippetLength)

        # generate BERT input lists
        bertTokenIDs.append(tokenizer.convert_tokens_to_ids(snippetTokens))
        bertMasks.append(([1] * snippetLength) + ([0] * (max_length - snippetLength)))
        bertSeqIDs.append([0] * (max_length))

        # generate label lists
        origLabels.append(label)
        codedLabels.append(code[label])
        
        # generate entity masks
        if head_type == 'start':
            e1_mask, e2_mask = generate_entity_start_mask(snippetTokens, max_length, start1, start2)
            entity1Masks.append(e1_mask)
            entity2Masks.append(e2_mask)
        
        elif head_type == 'pool':
            e1_mask, e2_mask = generate_entity_mention_mask(snippetTokens, max_length, start1, start2)
            entity1Masks.append(e1_mask)
            entity2Masks.append(e2_mask)
        
        elif head_type == 'ner':
            e1_mask, e2_mask = generate_ner_mention_mask(snippetTokens, max_length, start1, start2)
            entity1Masks.append(e1_mask)
            entity2Masks.append(e2_mask)

    all_lists = [bertTokenIDs, bertMasks, bertSeqIDs, 
                 origLabels, codedLabels, 
                 snippetLengthList, discardedEntries, 
                 entity1Masks, entity2Masks]
    
    return all_lists

#### Sample data for building models

In [10]:
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
full_path = '../data/sre_em/sre_em_sample.csv'

##### Sample #1: EM, start head

In [11]:
all_lists = generate_input_lists(full_path, marker_type='em', head_type='start', max_length=500)

In [12]:
bert_inputs = [all_lists[0], all_lists[1], all_lists[2], all_lists[7], all_lists[8]]
codedLabels = all_lists[4]

In [13]:
numSentences = len(bert_inputs[0])
np.random.seed(0)
training_examples = np.random.binomial(1, 0.7, numSentences)

In [14]:
trainSentence_ids = []
trainMasks = []
trainSequence_ids = []
trainE1Masks = []
trainE2Masks = []

testSentence_ids = []
testMasks = []
testSequence_ids = []
testE1Masks = []
testE2Masks = []

labels_train =[]
labels_test = []

for example in range(numSentences):
    if training_examples[example] == 1:
        trainSentence_ids.append(bert_inputs[0][example])
        trainMasks.append(bert_inputs[1][example])
        trainSequence_ids.append(bert_inputs[2][example])
        trainE1Masks.append(bert_inputs[3][example])
        trainE2Masks.append(bert_inputs[4][example])
        labels_train.append(codedLabels[example])
    else:
        testSentence_ids.append(bert_inputs[0][example])
        testMasks.append(bert_inputs[1][example])
        testSequence_ids.append(bert_inputs[2][example])
        testE1Masks.append(bert_inputs[3][example])
        testE2Masks.append(bert_inputs[4][example])
        labels_test.append(codedLabels[example])
        
X_train = np.array([trainSentence_ids, trainMasks, trainSequence_ids, trainE1Masks, trainE2Masks])
X_test = np.array([testSentence_ids, testMasks, testSequence_ids, testE1Masks, trainE2Masks])

reLabels_train = np.array(labels_train)
reLabels_test = np.array(labels_test)

  X_test = np.array([testSentence_ids, testMasks, testSequence_ids, testE1Masks, trainE2Masks])


In [15]:
train_all = [X_train, reLabels_train]
test_all = [X_test, reLabels_test]

In [16]:
with open(r"../pickles/sample/train_em_start_base_cased.pickle", "wb") as output_file:
    pickle.dump(train_all, output_file)
    
with open(r"../pickles/sample/test_em_start_base_cased.pickle", "wb") as output_file:
    pickle.dump(test_all, output_file)

##### Sample #2: EM, pool head

In [17]:
all_lists = generate_input_lists(full_path, marker_type='em', head_type='pool', max_length=500)

In [18]:
bert_inputs = [all_lists[0], all_lists[1], all_lists[2], all_lists[7], all_lists[8]]
codedLabels = all_lists[4]

In [19]:
numSentences = len(bert_inputs[0])
np.random.seed(0)
training_examples = np.random.binomial(1, 0.7, numSentences)

In [20]:
trainSentence_ids = []
trainMasks = []
trainSequence_ids = []
trainE1Masks = []
trainE2Masks = []

testSentence_ids = []
testMasks = []
testSequence_ids = []
testE1Masks = []
testE2Masks = []

labels_train =[]
labels_test = []

for example in range(numSentences):
    if training_examples[example] == 1:
        trainSentence_ids.append(bert_inputs[0][example])
        trainMasks.append(bert_inputs[1][example])
        trainSequence_ids.append(bert_inputs[2][example])
        trainE1Masks.append(bert_inputs[3][example])
        trainE2Masks.append(bert_inputs[4][example])
        labels_train.append(codedLabels[example])
    else:
        testSentence_ids.append(bert_inputs[0][example])
        testMasks.append(bert_inputs[1][example])
        testSequence_ids.append(bert_inputs[2][example])
        testE1Masks.append(bert_inputs[3][example])
        testE2Masks.append(bert_inputs[4][example])
        labels_test.append(codedLabels[example])
        
X_train = np.array([trainSentence_ids, trainMasks, trainSequence_ids, trainE1Masks, trainE2Masks])
X_test = np.array([testSentence_ids, testMasks, testSequence_ids, testE1Masks, trainE2Masks])

reLabels_train = np.array(labels_train)
reLabels_test = np.array(labels_test)

  X_test = np.array([testSentence_ids, testMasks, testSequence_ids, testE1Masks, trainE2Masks])


In [21]:
train_all = [X_train, reLabels_train]
test_all = [X_test, reLabels_test]

In [22]:
with open(r"../pickles/sample/train_em_pool_base_cased.pickle", "wb") as output_file:
    pickle.dump(train_all, output_file)
    
with open(r"../pickles/sample/test_em_pool_base_cased.pickle", "wb") as output_file:
    pickle.dump(test_all, output_file)