In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
print(os.listdir("../input"))
import zipfile
import sys
import time
import gc
import json
from tempfile import TemporaryFile

import seaborn as sns
import spacy
from sklearn import metrics as skm

# Any results you write to the current directory are saved as output.

 # Data Path

In [None]:
# Root input directory; the dataset folders below are joined onto it.
DATA_ROOT = '../input/'
# Folder holding the gap-*.tsv files read in the next cell.
GAP_DATA_FOLDER = os.path.join(DATA_ROOT, 'gap-coreference')
# Folder holding the sample submission read at the end of the notebook.
SUB_DATA_FOLDER = os.path.join(DATA_ROOT, 'gendered-pronoun-resolution')
# NOTE(review): not referenced anywhere else in the visible notebook.
FAST_TEXT_DATA_FOLDER = os.path.join(DATA_ROOT, 'fasttext-crawl-300d-2m')

In [None]:
# NOTE: the variable names intentionally do not match the file names.
# gap-development.tsv is loaded as the held-out test set, while gap-test.tsv and
# gap-validation.tsv are concatenated into the training set further down.
# Presumably this mirrors the competition's stage-1 setup -- TODO confirm.
test_df_path = os.path.join(GAP_DATA_FOLDER, 'gap-development.tsv')
train_df_path = os.path.join(GAP_DATA_FOLDER, 'gap-test.tsv')
dev_df_path = os.path.join(GAP_DATA_FOLDER, 'gap-validation.tsv')

# Clean Text

## Clean up Entity Name

In [None]:
# Temporary placeholders substituted for the A / B entity names while the text
# is being rewritten; they are swapped for A_NAME / B_NAME in the last step.
AT_NAME = "AAAAAAXXXXXXXXX"
BT_NAME = "BBBBBBXXXXXXXXX"
# Final names that every A / B mention is normalised to.
A_NAME = 'John'
B_NAME = 'Bob'

def find_all_substring(a_str, sub):
    """Return the start indices of all non-overlapping occurrences of ``sub``.

    Args:
        a_str: string to search in.
        sub: substring to look for.

    Returns:
        List of start indices in ascending order. The list is empty when
        ``sub`` does not occur or when ``sub`` is empty (an empty pattern
        matches everywhere without advancing, which previously made the scan
        loop forever).
    """
    if not sub:
        return []

    result = []
    start = a_str.find(sub)
    while start != -1:
        result.append(start)
        # Jump past the whole match so that occurrences never overlap.
        start = a_str.find(sub, start + len(sub))
    return result

def _update_offset(text, old_, new_, offset):
    """Shift a character offset to account for replacing ``old_`` with ``new_``.

    Only occurrences of ``old_`` that lie completely inside ``text[:offset]``
    are counted; each one moves the offset by ``len(new_) - len(old_)``.
    """
    growth_per_hit = len(new_) - len(old_)
    hits_before_offset = find_all_substring(text[:offset], old_)
    return offset + growth_per_hit * len(hits_before_offset)
    
def _replace_and_shift(text, offsets, old, new):
    """Replace every ``old`` in ``text`` by ``new`` and shift ``offsets`` to match."""
    if not old:
        # An empty pattern would make str.replace insert `new` between every
        # character and the occurrence scan never advance, so it is ignored.
        return text, offsets
    shifted = [_update_offset(text, old, new, offset) for offset in offsets]
    return text.replace(old, new), shifted


def replace_entity_name(text, a_name, b_name, a_offset, b_offset, p_offset):
    """Normalise the two candidate names in ``text`` to ``A_NAME`` / ``B_NAME``.

    Every occurrence of the full names, and then of each space-separated
    fragment of the names, is first replaced by a placeholder (``AT_NAME`` /
    ``BT_NAME``) and finally by the short canonical name. The three character
    offsets are updated after each replacement so that they keep pointing at
    the same mentions.

    Args:
        text: the passage.
        a_name: surface form of candidate A (surrounding whitespace is stripped).
        b_name: surface form of candidate B (surrounding whitespace is stripped).
        a_offset: character offset of the A mention in ``text``.
        b_offset: character offset of the B mention in ``text``.
        p_offset: character offset of the pronoun in ``text``.

    Returns:
        Tuple ``(text, a_offset, b_offset, p_offset)`` for the rewritten text.
        A is always mapped to ``A_NAME`` and B to ``B_NAME``.
    """
    a_name = a_name.strip()
    b_name = b_name.strip()

    # The longer name is always replaced first; presumably so that a name which
    # contains the other one is not broken up by the shorter replacement.
    # Only the processing order changes: each name keeps its own placeholder.
    if len(a_name) < len(b_name):
        first = (b_name, BT_NAME, B_NAME)
        second = (a_name, AT_NAME, A_NAME)
    else:
        first = (a_name, AT_NAME, A_NAME)
        second = (b_name, BT_NAME, B_NAME)

    # The offsets travel together as [A, B, pronoun]; they are never swapped,
    # so no swap-back is needed at the end.
    offsets = [a_offset, b_offset, p_offset]

    # 1) replace the whole names
    for name, placeholder, _ in (first, second):
        text, offsets = _replace_and_shift(text, offsets, name, placeholder)

    # 2) replace the individual name fragments (empty fragments produced by
    #    consecutive spaces are skipped inside the helper)
    for name, placeholder, _ in (first, second):
        for fragment in name.split(" "):
            text, offsets = _replace_and_shift(text, offsets, fragment, placeholder)

    # 3) swap the placeholders for the final canonical names
    for _, placeholder, final_name in (first, second):
        text, offsets = _replace_and_shift(text, offsets, placeholder, final_name)

    a_offset, b_offset, p_offset = offsets
    return text, a_offset, b_offset, p_offset

In [None]:
def entity_replace_func(row):
    """Row-wise adapter around ``replace_entity_name`` for ``DataFrame.apply``.

    Returns a copy of ``row`` whose text, candidate names and the three
    character offsets reflect the normalised passage; the input row is left
    untouched.
    """
    new_text, new_a_offset, new_b_offset, new_p_offset = replace_entity_name(
        row['Text'], row['A'], row['B'],
        row['A-offset'], row['B-offset'], row['Pronoun-offset'],
    )

    updated = row.copy()
    replacements = (
        ('Text', new_text),
        ('A', A_NAME),
        ('B', B_NAME),
        ('A-offset', new_a_offset),
        ('B-offset', new_b_offset),
        ('Pronoun-offset', new_p_offset),
    )
    for column, value in replacements:
        updated[column] = value

    return updated

# Clip Text

In [None]:
# Load the training file only to inspect how long the passages are.
train_df = pd.read_csv(train_df_path, sep='\t')
# Distribution of passage lengths, measured in space-separated words.
sns.distplot(train_df['Text'].map(lambda ele: len(ele.split(" "))), kde_kws={"label": "text"})

# The frame is reloaded later for the real preprocessing, so free it here.
del train_df
gc.collect()

Setting the maximum length to 150 tokens covers most cases. Texts longer than 150 tokens are clipped.

In [None]:
# Maximum number of tokens kept per passage; also used (plus 10) as BERT's
# max_seq_length and as the padded sequence length fed to the model.
MAX_LEN = 150

In [None]:
def bs(list_, target_):
    """Count the elements of the sorted ``list_`` that are ``<= target_``.

    ``bs(starts, offset) - 1`` is therefore the index of the last element that
    is not greater than ``offset`` -- callers use this to map a character
    offset to the token whose start offset precedes it.

    The previous hand-written search never examined the final candidate, so an
    exact match on the last element (or a target beyond it) came out one too
    small; ``bisect_right`` has the intended semantics for every position.

    Args:
        list_: values sorted in ascending order.
        target_: value to locate.

    Returns:
        Integer in ``[0, len(list_)]``.
    """
    from bisect import bisect_right

    return bisect_right(list_, target_)

In [None]:
def clip_text(text, max_len, char_offset_p, char_offset_a, char_offset_b):
    """Clip ``text`` to at most ``max_len`` tokens while keeping all three mentions.

    The text is tokenised with the module-level ``nlp`` pipeline. Texts that
    already fit are returned unchanged. Otherwise a window of ``max_len``
    tokens is cut out: the leading tokens when the mentions fit there, else a
    window roughly centred on the span covered by the mentions.

    Args:
        text: passage to clip.
        max_len: maximum number of tokens to keep.
        char_offset_p: character offset of the pronoun.
        char_offset_a: character offset of mention A.
        char_offset_b: character offset of mention B.

    Returns:
        Tuple ``(clipped_text, shift)`` where ``shift`` is the number of
        characters removed from the front of ``text``; subtracting it from the
        original character offsets gives offsets into ``clipped_text``.

    Raises:
        ValueError: if the mentions span more than ``max_len`` tokens.
    """
    doc = nlp(text)

    if len(doc) <= max_len:
        return text, 0

    token_starts = [token.idx for token in doc]
    char_offset_min = min(char_offset_p, char_offset_a, char_offset_b)
    char_offset_max = max(char_offset_p, char_offset_a, char_offset_b)

    # char offset to token offset
    mention_offset_min = bs(token_starts, char_offset_min) - 1
    mention_offset_max = bs(token_starts, char_offset_max) - 1
    len_span = mention_offset_max - mention_offset_min + 1

    if len_span > max_len:
        raise ValueError(
            "mentions span %d tokens, which exceeds max_len=%d" % (len_span, max_len))

    if mention_offset_max < max_len - 1:
        # All mentions sit inside the first max_len tokens: keep the head.
        lo_idx, hi_idx = 0, max_len
    else:
        # Centre the window on the mention span. Integer division keeps the
        # token indices integral (plain `/` yields a float in Python 3).
        hi_idx = min((max_len - len_span) // 2 + mention_offset_max + 1, len(doc))
        lo_idx = hi_idx - max_len

    last_token = doc[hi_idx - 1]
    start_char = doc[lo_idx].idx
    # End of the last kept token = its start offset plus its length.
    end_char = last_token.idx + len(last_token)

    window = text[start_char:end_char]
    kept = window.lstrip()
    # Leading whitespace dropped by the strip also moves the offsets.
    shift = start_char + (len(window) - len(kept))
    return kept.rstrip(), shift
    
def text_clip_func(row, max_len):
    """Clip one row's text and return the result as a ``Text`` / ``Shift`` Series."""
    clipped_text, char_shift = clip_text(
        row['Text'],
        max_len,
        row['Pronoun-offset'],
        row['A-offset'],
        row['B-offset'],
    )
    return pd.Series(data=[clipped_text, char_shift], index=['Text', 'Shift'])

def text_clip_update(df, max_len):
    """Clip every text in ``df`` in place and re-base the three character offsets.

    Args:
        df: DataFrame with ``Text``, ``Pronoun-offset``, ``A-offset`` and
            ``B-offset`` columns; it is modified in place.
        max_len: maximum number of tokens kept per text. (This argument was
            previously ignored in favour of the global ``MAX_LEN``.)

    Returns:
        None.
    """
    if df.empty:
        # Row-wise apply on an empty frame produces no Text/Shift columns.
        return

    clip_info = df.apply(lambda row: text_clip_func(row, max_len), axis=1)
    df['Text'] = clip_info['Text']
    # Every offset moves left by the number of characters cut from the front.
    for column in ('Pronoun-offset', 'A-offset', 'B-offset'):
        df[column] = df[column] - clip_info['Shift']

# Encode By BERT

Download the pre-trained BERT-Base, Uncased model. The kernel needs an Internet connection to do this, so make sure it's enabled.

In [None]:
# Download the weights and configuration files of the model, unpack the
# archive into the working directory and list what was extracted.
!wget https://storage.googleapis.com/bert_models/2018_10_18/uncased_L-12_H-768_A-12.zip
with zipfile.ZipFile("uncased_L-12_H-768_A-12.zip","r") as zip_ref:
    zip_ref.extractall()
!ls 'uncased_L-12_H-768_A-12'

Next, in order to feed our data to the model, we'll use some scripts from the bert repo on GitHub.

In [None]:
# Fetch the three scripts from the BERT repository's master branch into the
# working directory so that they can be imported / executed below.
!wget https://raw.githubusercontent.com/google-research/bert/master/modeling.py 
!wget https://raw.githubusercontent.com/google-research/bert/master/extract_features.py 
!wget https://raw.githubusercontent.com/google-research/bert/master/tokenization.py

In [None]:
import modeling
import extract_features
import tokenization
import tensorflow as tf

Next, we feed BERT the data from these three files. For each line, we want to obtain contextual embeddings for the 3 target words (A, B, Pronoun). Here are some helper functions to keep track of the offsets of the target words.

In [None]:
def compute_offset_no_spaces(text, offset):
    """Convert a character offset into an offset that ignores space characters.

    Counts the characters among the first ``offset`` positions of ``text``
    that are not a plain space.
    """
    return sum(1 for pos in range(offset) if text[pos] != " ")


def count_chars_no_special(text):
    """Return the number of characters in ``text`` that are not ``#``."""
    ignored = ("#",)
    return sum(1 for char in text if char not in ignored)


def count_length_no_special(text):
    """Return the number of characters in ``text`` that are neither ``#`` nor a space."""
    ignored = ("#", " ")
    return sum(1 for char in text if char not in ignored)

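The following function writes the texts to a file and runs BERT's `extract_features.py` on them; the per-token embeddings are written to the file named by `embed_file_name`. It then stores the BERT token positions of A, B and the pronoun in the `Pronoun-index`, `A-index` and `B-index` columns of the DataFrame. Below, it is used twice: once for the training data (gap-test plus gap-validation) and once for the test data (gap-development).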

In [None]:
def _find_token_near(features, center, target, times_to_try):
    """Return the position of a BERT token equal to ``target`` near ``center``.

    Positions ``center, center-1, ...`` are tried first (``times_to_try``
    steps, clamped at 0), then ``center+1, center+2, ...`` (one step fewer,
    clamped at the last token). Returns None when no token matches.
    """
    last = len(features) - 1
    backward = (max(0, center - step) for step in range(times_to_try))
    forward = (min(last, center + step + 1) for step in range(times_to_try - 1))
    for candidates in (backward, forward):
        for position in candidates:
            if features.loc[position, "token"].lower().strip() == target:
                return position
    return None


def encode_by_bert(data, embed_file_name):
    '''
    Run BERT feature extraction on the texts and locate the three mentions.

    The texts are written to ``input.txt`` and passed to ``extract_features.py``,
    which writes the last-layer token embeddings to ``embed_file_name`` (the
    file is left on disk for later processing). For every row the BERT token
    positions of the pronoun, A and B are then determined by matching
    space-free character counts, and stored in ``data``.

    Input:
        data: a pandas DataFrame with the columns "Text", "Pronoun", "A", "B",
              "Pronoun-offset", "A-offset" and "B-offset". Rows are matched to
              the BERT output by position.
        embed_file_name: path of the JSON-lines file that receives the BERT output.

    Output:
        None. ``data`` is modified in place; it gains the columns
        "Pronoun-index", "A-index" and "B-index" (0 where no token was found).
    '''
    # Write the texts only; this file is the input of extract_features.py.
    data["Text"].to_csv("input.txt", index=False, header=False)
    # Only the output of the last layer is saved. Change --layers=-1 to save
    # the output of other layers.
    command = " ".join([
        "python3 extract_features.py",
        "--input_file=input.txt",
        "--output_file=" + embed_file_name,
        "--vocab_file=uncased_L-12_H-768_A-12/vocab.txt",
        "--bert_config_file=uncased_L-12_H-768_A-12/bert_config.json",
        "--init_checkpoint=uncased_L-12_H-768_A-12/bert_model.ckpt",
        "--layers=-1",
        "--max_seq_length=" + str(MAX_LEN + 10),
        "--batch_size=8",
    ])
    os.system(command)
    os.remove("input.txt")

    bert_output = pd.read_json(embed_file_name, lines=True)

    mention_columns = ("Pronoun", "A", "B")
    offset_columns = ("Pronoun-offset", "A-offset", "B-offset")
    times_to_try = 7

    # One row per text; columns are the token positions of pronoun, A and B.
    rdata = np.zeros(shape=(len(data), 3))
    for i in range(len(data)):  # For each line in the data file
        row = data.iloc[i]
        text = row["Text"]

        # Offsets not counting spaces, for comparison with the BERT tokens.
        offsets = [compute_offset_no_spaces(text, row[column]) for column in offset_columns]
        targets = [row[column].lower().strip() for column in mention_columns]

        # BERT tokens (and embeddings) of the current line
        features = pd.DataFrame(bert_output["features"].iloc[i])

        # Token 0 is always skipped; token 1 is skipped as well when it is a
        # double quote (presumably the quoting added by to_csv -- TODO confirm).
        first_token = 2 if features.loc[1, "token"] == "\"" else 1

        count_chars = 0
        for j in range(first_token, len(features)):
            # See if the character count up to the current token matches the
            # offset of any of the 3 target words.
            for k in range(3):
                if count_chars != offsets[k]:
                    continue
                position = _find_token_near(features, j, targets[k], times_to_try)
                if position is None:
                    print("TOKEN NOT FOUND!")
                    print(features.loc[max(0, j - times_to_try):min(len(features) - 1, j + times_to_try), "token"])
                    print(row[mention_columns[k]])
                    print(text)
                    print()
                else:
                    rdata[i, k] = position

            # Update the character count
            count_chars += count_length_no_special(features.loc[j, "token"])

    # Assigned once after the loop instead of on every iteration.
    data['Pronoun-index'] = rdata[:, 0]
    data['A-index'] = rdata[:, 1]
    data['B-index'] = rdata[:, 2]


## Split Embedding Files

In [None]:
def batch_file_path(dst_folder, dataset_name, batch_index, file_format):
    """Build the path ``<dst_folder>/<dataset_name>_<batch_index><file_format>``."""
    file_name = dataset_name + '_' + str(batch_index) + file_format
    return "/".join([dst_folder, file_name])

In [None]:
def _stack_batch(matrices):
    """Combine per-text (tokens, dims) matrices into one array for saving.

    Equal-shaped matrices become a regular 3-D array. Matrices with differing
    token counts become a 1-D object array of matrices; this is built
    explicitly because ``np.asarray`` refuses ragged input on current NumPy.
    """
    if len({matrix.shape for matrix in matrices}) == 1:
        return np.asarray(matrices)
    ragged = np.empty(len(matrices), dtype=object)
    for position, matrix in enumerate(matrices):
        ragged[position] = matrix
    return ragged


def split_embed_files(embed_file_name, batch_size, dst_folder, dataset_name):
    """Split a BERT JSON-lines output file into per-batch ``.npy`` files.

    The whole file is read, then deleted. Consecutive groups of ``batch_size``
    texts are written to ``batch_file_path(dst_folder, dataset_name, k, ".npy")``
    for ``k = 0, 1, ...``; the final batch may be smaller.

    Args:
        embed_file_name: JSON-lines file written by ``extract_features.py``.
        batch_size: number of texts per output file.
        dst_folder: existing folder that receives the ``.npy`` files.
        dataset_name: file-name prefix of the batches.

    Returns:
        None.
    """
    bert_output = pd.read_json(embed_file_name, lines=True)
    os.remove(embed_file_name)

    def _save(matrices, index):
        np.save(batch_file_path(dst_folder, dataset_name, index, ".npy"), _stack_batch(matrices))

    batch_matrix = []
    batch_index = 0

    # iterate through texts
    for token_records in bert_output["features"]:
        # one row per token: the values of the first saved layer
        batch_matrix.append(
            np.asarray([record["layers"][0]["values"] for record in token_records]))

        if len(batch_matrix) == batch_size:
            _save(batch_matrix, batch_index)
            batch_matrix = []
            batch_index += 1

    # leftover texts that did not fill a complete batch
    if batch_matrix:
        _save(batch_matrix, batch_index)

In [None]:
def map_to_batch_index(index, batch_size):
    """Return the index of the batch that contains row ``index``.

    Uses floor division so the result is an integer batch number (true
    division returned a float such as 1.03125 under Python 3).
    """
    return index // batch_size

# Process Data

In [None]:
# Folder that receives the per-batch embedding files written by split_embed_files.
! mkdir embs
embed_folder = "embs"

# Tokeniser used by clip_text; the listed pipeline components are disabled.
nlp = spacy.load('en_core_web_sm', disable=['parser', 'tagger', 'ner', 'textcat'])

### Running Tests

In [None]:
# pd.options.display.max_colwidth = 1000

# dev_df = pd.read_csv(dev_df_path, sep='\t').drop(columns=['URL',])
# # clean text
# dev_df_ = dev_df.apply(entity_replace_func, axis=1)
# # clip text
# text_clip_update(dev_df_, MAX_LEN)

# for i in range(dev_df_.shape[0]):
#     row = dev_df_.iloc[i]
#     text = row['Text']
#     if text[row['A-offset']] != 'J':
#         print('J')
#         print(text)
#         print(text[row['A-offset']:row['A-offset']+10] )
#         print(dev_df.iloc[i][['Text', 'A', 'B']])
#     if text[row['B-offset']] != 'B':
#         print('B')
#         print(text)
#         print(text[row['B-offset']:row['B-offset']+10] )
#         print(dev_df.iloc[i][['Text', 'A', 'B']])


# # encode
# dev_embed_file_name = "dev_embed.json"
# encode_by_bert(dev_df_, dev_embed_file_name)
# os.system("rm " + dev_embed_file_name)
# # split
# # split_embed_files(dev_embed_file_name, 32, embed_folder, "dev_embed")
# print("Finished at ", time.ctime())

In [None]:
# Preprocess and encode the training and test sets:
# normalise entity names -> clip long texts -> run BERT and locate mentions.
print("Started at ", time.ctime())

# NOTE(review): the same pipeline was already loaded in the previous cell.
nlp = spacy.load('en_core_web_sm', disable=['parser', 'tagger', 'ner', 'textcat'])

# Files that receive the raw BERT output; they are split into batches later.
train_embed_file_name = "train_embed.json"
test_embed_file_name = "test_embed.json"
# dev_embed_file_name = "dev_embed.json"

# Number of texts per embedding file; reused by the data loaders below.
batch_size =32

# The training set is gap-test.tsv followed by gap-validation.tsv.
train_df = pd.read_csv(train_df_path, sep='\t').drop(columns=['URL',])
dev_df = pd.read_csv(dev_df_path, sep='\t').drop(columns=['URL',])
train_df = pd.concat([train_df, dev_df], axis=0, ignore_index=True)
train_df.reset_index(drop=True, inplace=True)
del dev_df
gc.collect()
# clean text
train_df = train_df.apply(entity_replace_func, axis=1)
# clip text
text_clip_update(train_df, MAX_LEN)
# encode
encode_by_bert(train_df, train_embed_file_name)
print("Finished at ", time.ctime())

# The test set (gap-development.tsv) goes through the same three steps.
test_df = pd.read_csv(test_df_path, sep='\t').drop(columns=['URL',])
# clean text
test_df = test_df.apply(entity_replace_func, axis=1)
# clip text
text_clip_update(test_df, MAX_LEN)
encode_by_bert(test_df, test_embed_file_name)
print("Finished at ", time.ctime())

# dev_df = pd.read_csv(dev_df_path, sep='\t').drop(columns=['URL',])
# # clean text
# dev_df = dev_df.apply(entity_replace_func, axis=1)
# # clip text
# text_clip_update(dev_df, MAX_LEN)
# encode_by_bert(dev_df, dev_embed_file_name)
# # split
# split_embed_files(dev_embed_file_name, 32, embed_folder, "dev_embed")
# os.system("rm " + dev_embed_file_name)
# print("Finished at ", time.ctime())

In [None]:
# Remove the downloaded BERT archive, the extracted model folder and the helper
# scripts from the working directory, then list what is left.
os.system("rm " + "uncased_L-12_H-768_A-12.zip")
os.system("rm -rdf " + "uncased_L-12_H-768_A-12")
os.system("rm " + "bert*")
# NOTE(review): "vocab.text" looks like a typo for "vocab.txt" -- confirm. The
# extracted vocabulary lives inside the model folder removed above.
os.system("rm " + "vocab.text")
os.system("rm " + "extract_features*")
os.system("rm " + "modeling*")
os.system("rm " + "tokenization*")
!ls

In [None]:
# split: turn each raw BERT output file into per-batch .npy files inside
# embed_folder (the raw JSON file is deleted by split_embed_files).
split_embed_files(train_embed_file_name, batch_size, embed_folder, "train_embed")
split_embed_files(test_embed_file_name, batch_size, embed_folder, "test_embed")
print("Finished at ", time.ctime())

In [None]:
def _row_to_y(row):
    if row.loc['A-coref']:
        return 0
    if row.loc['B-coref']:
        return 1
    return 2

# Integer class label per row (0 = A, 1 = B, 2 = neither), see _row_to_y.
train_df['Label'] = train_df.apply(_row_to_y, axis=1)
#dev_df['Label'] = dev_df.apply(_row_to_y, axis=1)
test_df['Label'] = test_df.apply(_row_to_y, axis=1)

## Reduce Memory Space

In [None]:
# Keep only the columns the model needs: the three token positions and the
# label. The embeddings themselves are read from the .npy batch files.
focus_columns = ['Pronoun-index', 'A-index', 'B-index', 'Label']
train_df = train_df[focus_columns]
#dev_df = dev_df[focus_columns]
test_df = test_df[focus_columns]
gc.collect()

Now that we have the embeddings, we pass them to a multi-layer perceptron (i.e. vanilla neural network), which learns to classify the triples of embeddings (emb_A, emb_B,emb_P) as "A", "B" or "NEITHER".

# Modeling

In [None]:
import numpy as np
from keras import backend
from keras import layers
from keras import models

from keras import initializers, regularizers, constraints, activations
from keras.engine import Layer
import keras.backend as K
from keras.layers import merge
from keras import callbacks as kc
from keras import optimizers as ko\

from sklearn.model_selection import cross_val_score, KFold, train_test_split
from sklearn.metrics import log_loss

import tensorflow as tf

# NOTE(review): these three lists are not referenced again in the visible
# part of the notebook.
histories = list()
file_paths = list()
cos = list()

## Define Keras Layers

In [None]:
def _dot_product(x, kernel):
    """
    Backend-agnostic dot product of an input with a weight tensor.

    On the TensorFlow backend the kernel gets an extra trailing axis before
    the product and that axis is squeezed out of the result again; on any
    other backend a plain ``K.dot`` is used.
    Args:
        x (): input
        kernel (): weights
    Returns:
        The product tensor.
    """
    if K.backend() != 'tensorflow':
        return K.dot(x, kernel)

    # todo: check that this is correct
    expanded_kernel = K.expand_dims(kernel)
    product = K.dot(x, expanded_kernel)
    return K.squeeze(product, axis=-1)
    
    
class AttentionWeight(Layer):
    """
        Attention-weight layer for temporal data, driven by an external query.

        This code is a modified version of cbaziotis implementation:  GithubGist cbaziotis/AttentionWithContext.py
        Follows the work of Yang et al. [https://www.cs.cmu.edu/~diyiy/docs/naacl16.pdf]
        "Hierarchical Attention Networks for Document Classification"
        by using a context vector to assist the attention.

        Unlike the original, this layer is called on a list of two tensors
        ``[x, u]`` and returns only the normalised attention weights, not the
        weighted sum. Supports masking; the mask is not propagated.

        # Input shape
            A list of two tensors. The first is a 3D tensor with shape
            `(samples, steps, features)`; the second is the query tensor
            (presumably `(samples, query_features)` -- TODO confirm).
        # Output shape
            2D tensor with shape: `(samples, steps)`.
        :param W_regularizer: regularizer for the weight matrix.
        :param b_regularizer: regularizer for the bias.
        :param W_constraint: constraint for the weight matrix.
        :param b_constraint: constraint for the bias.
        :param bias: whether a bias is added before the tanh.
        :param kwargs: forwarded to the Keras ``Layer`` constructor.
        Example:
            weights = AttentionWeight()([sequence_tensor, query_tensor])

        NOTE(review): the layer is not used by the model built in this notebook.
        """

    def __init__(self,
                 W_regularizer=None, b_regularizer=None,
                 W_constraint=None, b_constraint=None,
                 bias=True, **kwargs):

        self.supports_masking = True
        self.init = initializers.get('glorot_uniform')

        self.W_regularizer = regularizers.get(W_regularizer)
        self.b_regularizer = regularizers.get(b_regularizer)

        self.W_constraint = constraints.get(W_constraint)
        self.b_constraint = constraints.get(b_constraint)

        self.bias = bias
        super(AttentionWeight, self).__init__(**kwargs)

    def build(self, input_shape):
        # input_shape is a pair: the shape of x followed by the shape of u.
        shape1 = input_shape[0]
        shape2 = input_shape[1]

        # W has shape (last dim of u, last dim of x).
        self.W = self.add_weight((shape2[-1], shape1[-1],),
                                 initializer=self.init,
                                 name='{}_W'.format(self.name),
                                 regularizer=self.W_regularizer,
                                 constraint=self.W_constraint)
        if self.bias:
            # One bias value per entry of u's last dimension.
            self.b = self.add_weight((shape2[-1],),
                                     initializer='zero',
                                     name='{}_b'.format(self.name),
                                     regularizer=self.b_regularizer,
                                     constraint=self.b_constraint)

    def compute_mask(self, input, input_mask=None):
        # do not pass the mask to the next layers
        return None

    def call(self, inputs, mask=None):
        """Return the attention weights of ``inputs[0]`` with respect to ``inputs[1]``."""
        x = inputs[0]
        u = inputs[1]
        
        # NOTE(review): _dot_product was written for a 1-D kernel in the
        # original gist; verify that it behaves as intended with the 2-D W.
        uit = _dot_product(x, self.W)

        if self.bias:
            uit += self.b

        uit = K.tanh(uit)
        # Score of each step against the query.
        ait = K.batch_dot(uit, u)

        a = K.exp(ait)

        # apply mask after the exp. will be re-normalized next
        if mask is not None:
            # Cast the mask to floatX to avoid float64 upcasting in theano
            a *= K.cast(mask, K.floatx())

        # in some cases especially in the early stages of training the sum may be almost zero
        # and this results in NaN's. A workaround is to add a very small positive number ε to the sum.
        # a /= K.cast(K.sum(a, axis=1, keepdims=True), K.floatx())
        a /= K.cast(K.sum(a, axis=1, keepdims=True) + K.epsilon(), K.floatx())
        
        return a

    def compute_output_shape(self, input_shape):
        # NOTE(review): the message below names a `Dot` layer although this is
        # AttentionWeight (it appears to be copied from elsewhere).
        if not isinstance(input_shape, list) or len(input_shape) != 2:
            raise ValueError('A `Dot` layer should be called '
                             'on a list of 2 inputs.')
        shape1 = list(input_shape[0])
        shape2 = list(input_shape[1])
        
        # (samples, steps) of the first input.
        return shape1[0], shape1[1]

    def get_config(self):
        """Return the layer configuration, including regularizers, constraints and bias flag."""
        config = {
            'W_regularizer': regularizers.serialize(self.W_regularizer),
            'b_regularizer': regularizers.serialize(self.b_regularizer),
            'W_constraint': constraints.serialize(self.W_constraint),
            'b_constraint': constraints.serialize(self.b_constraint),
            'bias': self.bias
        }
        base_config = super(AttentionWeight, self).get_config()
        return dict(list(base_config.items()) + list(config.items()))
    
    
class FeatureSelection1D(Layer):
    """
        Select (gather) time steps of a sequence tensor by per-sample indices.

        The layer is called on a list ``[sequence, indices]`` and returns, for
        every sample, the rows of ``sequence`` at the given positions (via
        ``tf.batch_gather``). It has no trainable weights.
        Supports Masking; the mask is not propagated.

        # Input shape
            A ND tensor with shape: `(samples, timesteps, features)
            A 2D tensor with shape: [samples, num_selected_features]
        # Output shape
            ND tensor with shape: `(samples, num_selected_features, features)`.
        :param num_selects: number of indices supplied per sample.
        :param kwargs: forwarded to the Keras ``Layer`` constructor.
        """

    def __init__(self, num_selects, **kwargs):

        self.num_selects = num_selects
        self.supports_masking = True
        super(FeatureSelection1D, self).__init__(**kwargs)

    def build(self, input_shape):
        # Nothing to create; defer to the base class.

        super(FeatureSelection1D, self).build(input_shape)

    def compute_mask(self, input, input_mask=None):
        # don't pass the mask to the next layers
        return None

    def call(self, inputs, mask=None):
        """Gather the rows of ``inputs[0]`` addressed by the indices in ``inputs[1]``."""
        if not isinstance(inputs, list) or len(inputs) != 2:
            raise ValueError('FeatureSelection1D layer should be called '
                             'on a list of 2 inputs.')

        # If a mask is supplied, multiply it into the sequence before gathering.
        if mask is not None:
            # Cast the mask to floatX to avoid float64 upcasting in theano
            a = K.cast(mask, K.floatx()) * inputs[0]
        else:
            a = inputs[0]

        b = inputs[1]

        a = tf.batch_gather(
            a, b
        )

        return a

    def compute_output_shape(self, input_shape):
        if not isinstance(input_shape, list) or len(input_shape) != 2:
            raise ValueError('A `FeatureSelection1D` layer should be called '
                             'on a list of 2 inputs.')
        shape1 = list(input_shape[0])
        shape2 = list(input_shape[1])

        if shape2[0] != shape1[0]:
            raise ValueError("batch size must be same")

        if shape2[1] != self.num_selects:
            raise ValueError("must conform to the num_select")

        return (shape1[0], self.num_selects, shape1[2])

    def get_config(self):
        """Return the layer configuration, including ``num_selects``."""
        config = {
            'num_selects': self.num_selects
        }
        base_config = super(FeatureSelection1D, self).get_config()
        return dict(list(base_config.items()) + list(config.items()))

## Define  Model

In [None]:
def build_mapped_mlp_model(
        embed_dim, time_steps, extra_feature_dims, output_dim, model_dim, mlp_dim,
        mlp_depth=1, embed_drop_out=0.5, drop_out=0.5,
        gpu=False, return_customized_layers=False):
    """Build the classifier that works on the embeddings of the three mentions.

    The model takes the token embeddings of a passage plus the token positions
    of the pronoun, A and B. The embedding at each position is selected and
    passed through a shared batch-norm / dropout / dense mapping; the three
    mapped vectors are concatenated, normalised, optionally passed through
    ``mlp_depth - 1`` SELU layers and classified with a softmax layer.

    Args:
        embed_dim: size of each token embedding.
        time_steps: (padded) number of tokens per passage.
        extra_feature_dims: unused.
        output_dim: number of output classes.
        model_dim: width of the shared mapping layer.
        mlp_dim: width of each hidden SELU layer.
        mlp_depth: number of MLP layers including the output layer.
        embed_drop_out: dropout rate applied to the selected embeddings.
        drop_out: dropout rate after concatenation and inside the MLP.
        gpu: unused.
        return_customized_layers: when true, also return the custom-object
            mapping needed to reload the saved model.

    Returns:
        The Keras model, or ``(model, custom_objects)`` when
        ``return_customized_layers`` is true.
    """
    # sequence input: one embedding per token
    token_embeddings = models.Input(shape=(time_steps, embed_dim), dtype='float32', name='input')

    # mention positions in the sentence: pronoun, A, B
    position_inputs = [
        models.Input(shape=(1,), dtype='int32', name=input_name)
        for input_name in ('inputpi', 'inputai', 'inputbi')
    ]

    # layers shared by the three mention branches
    select_layer = FeatureSelection1D(1, name='boundary_selection_layer')
    embed_bn = layers.BatchNormalization(name = 'embed_batch_norm_layer')
    embed_dropout = layers.Dropout(embed_drop_out, name='embed_dropout_layer')
    map_layer = layers.Dense(model_dim, activation="relu", name="embed_map_layer")
    flatten_layer1 = layers.Flatten('channels_first', name="flatten_layer1")

    def _mention_branch(position_input):
        branch = select_layer([token_embeddings, position_input])
        for shared_layer in (flatten_layer1, embed_bn, embed_dropout, map_layer):
            branch = shared_layer(branch)
        return branch

    mention_vectors = [_mention_branch(position_input) for position_input in position_inputs]

    hidden = layers.Concatenate(axis=1, name="concat_feature_layer")(mention_vectors)
    hidden = layers.BatchNormalization(name = 'batch_norm_layer')(hidden)
    hidden = layers.Dropout(drop_out, name='dropout_layer')(hidden)

    # MLP layers (none when mlp_depth is 1)
    for depth in range(mlp_depth - 1):
        hidden = layers.Dense(
            mlp_dim, activation='selu', kernel_initializer='lecun_normal',
            name='selu_layer' + str(depth))(hidden)
        hidden = layers.AlphaDropout(drop_out, name='alpha_layer' + str(depth))(hidden)

    outputs = layers.Dense(output_dim, activation="softmax", name="softmax_layer0")(hidden)

    model = models.Model([token_embeddings,] + position_inputs, outputs)

    if not return_customized_layers:
        return model

    return model, {'FeatureSelection1D': FeatureSelection1D}

## Prepare Data

In [None]:
# Hyper-parameters passed to build_mapped_mlp_model and to the training loop.
embed_dim = 768                  # size of one token embedding in the .npy files
time_steps = MAX_LEN + 10        # same value as BERT's --max_seq_length above
extra_feature_dims = 0           # accepted by the builder but not used
output_dim = 3                   # classes: A, B, neither
model_dim = 20                   # width of the shared mapping layer
mlp_dim = 37                     # width of hidden layers (none with mlp_depth=1)
mlp_depth=1
embed_drop_out = 0.6
drop_out=0.3
gpu = True                       # accepted by the builder but not used
return_customized_layers=True    # builder also returns the custom-object dict
epochs = 40                      # upper bound; early stopping may end sooner

In [None]:
def pad_array(array_, max_len):
    """Zero-pad every 2-D matrix in ``array_`` to ``max_len`` rows and stack them.

    Matrices that already have ``max_len`` rows are kept as they are; a matrix
    with more rows raises ``ValueError``.
    """
    padded = []
    for sub_array in (array_[i] for i in range(array_.shape[0])):
        n_rows = sub_array.shape[0]
        if n_rows > max_len:
            raise ValueError("max_len: " + str(max_len) + ", actual: " + str(n_rows))
        if n_rows < max_len:
            filler = np.zeros(shape=(max_len - n_rows, sub_array.shape[1]))
            sub_array = np.concatenate((sub_array, filler))
        padded.append(sub_array)

    return np.asarray(padded)

In [None]:
def data_generator(data_df, data_folder, dataset_name, batch_size, time_steps, batch_indices, return_label=True, shuffle=False):
    """Endlessly yield model inputs (and labels) batch by batch.

    Each batch index selects the matching slice of rows of ``data_df`` and the
    embedding file ``batch_file_path(data_folder, dataset_name, index, ".npy")``.

    Args:
        data_df: DataFrame with ``Pronoun-index``, ``A-index``, ``B-index``
            and (when ``return_label``) ``Label`` columns.
        data_folder: folder containing the per-batch ``.npy`` files.
        dataset_name: file-name prefix of the batches.
        batch_size: number of rows per batch file.
        time_steps: number of rows every embedding matrix is padded to.
        batch_indices: indices of the batches to cycle through.
        return_label: yield ``(inputs, labels)`` instead of ``inputs`` only.
        shuffle: reshuffle the batch order at the start of every pass.

    Yields:
        ``[embeddings, pronoun_idx, a_idx, b_idx]`` and, if requested, labels.
    """
    # Work on a private copy so that shuffling does not reorder the caller's array.
    batch_indices = np.array(batch_indices)

    while True:
        if shuffle:
            np.random.shuffle(batch_indices)

        for batch_index in batch_indices:
            start = batch_index * batch_size
            stop = min(start + batch_size, data_df.shape[0])
            tmp_df = data_df.iloc[start:stop]

            # Batches whose texts differ in length are stored as object
            # arrays, which np.load only reads with allow_pickle=True.
            tmp_embed = np.load(
                batch_file_path(data_folder, dataset_name, batch_index, ".npy"),
                allow_pickle=True)
            tmp_embed = pad_array(tmp_embed, time_steps)

            inputs = [tmp_embed, tmp_df['Pronoun-index'].values, tmp_df['A-index'].values, tmp_df['B-index'].values]
            if return_label:
                yield inputs, tmp_df['Label'].values
            else:
                yield inputs


In [None]:
def load_data(data_df, data_folder, dataset_name, batch_size, time_steps, batch_indices):
    """Load the given batches completely into memory.

    Args:
        data_df: DataFrame with ``Pronoun-index``, ``A-index``, ``B-index``
            and ``Label`` columns.
        data_folder: folder containing the per-batch ``.npy`` files.
        dataset_name: file-name prefix of the batches.
        batch_size: number of rows per batch file.
        time_steps: number of rows every embedding matrix is padded to.
        batch_indices: indices of the batches to load, in the desired order.

    Returns:
        Tuple ``([embeddings, pronoun_idx, a_idx, b_idx], labels)``.
    """
    frames = []
    embeds = []

    for batch_index in batch_indices:
        start = batch_index * batch_size
        stop = min(start + batch_size, data_df.shape[0])
        frames.append(data_df.iloc[start:stop])

        # Batches whose texts differ in length are stored as object arrays,
        # which np.load only reads with allow_pickle=True.
        tmp_embed = np.load(
            batch_file_path(data_folder, dataset_name, batch_index, ".npy"),
            allow_pickle=True)
        embeds.append(pad_array(tmp_embed, time_steps))

    # ignore_index already yields a fresh 0..n-1 index.
    X_df = pd.concat(frames, axis=0, ignore_index=True)
    embeds = np.concatenate(tuple(embeds), axis=0)

    return [embeds, X_df['Pronoun-index'].values, X_df['A-index'].values, X_df['B-index'].values], X_df['Label'].values

In [None]:
def measure_log_loss(ground, preds):
    """Return the multi-class log loss of ``preds`` against the labels ``ground``.

    ``preds`` must offer ``tolist()``; the classes are fixed to 0, 1 and 2.
    """
    class_ids = [0, 1, 2]
    return skm.log_loss(ground, preds.tolist(), labels=class_ids, eps=10**-15)

## Train Model

In [None]:
# Number of cross-validation folds; also the divisor when the per-fold test
# predictions are averaged.
n_fold = 5

In [None]:
# Training and cross-validation
num_batches = int(np.ceil(float(train_df.shape[0]) / batch_size))
dummy_X = np.zeros(shape=(num_batches, 1))

folds = KFold(n_splits=n_fold, shuffle=True, random_state=3)
preds = None
for fold_n, (train_batch_index, dev_batch_index) in enumerate(folds.split(dummy_X)):
    # split training and validation data
    print('Fold', fold_n, 'started at', time.ctime())
    
    # train and dev data
    train_generator = data_generator(train_df, embed_folder, "train_embed", batch_size, time_steps, train_batch_index, return_label=True, shuffle=True)
    X_dev, y_dev = load_data(train_df, embed_folder, "train_embed", batch_size, time_steps, dev_batch_index)
    # test data
    num_test_batches = int(np.ceil(float(test_df.shape[0]) / batch_size))
    test_generator = data_generator(test_df, embed_folder, "test_embed", batch_size, time_steps, np.arange(num_test_batches), return_label=False)
    
    model, co = build_mapped_mlp_model(
        embed_dim, time_steps, extra_feature_dims, output_dim, model_dim, mlp_dim,
        mlp_depth=mlp_depth, embed_drop_out=embed_drop_out, drop_out=drop_out,
        gpu=gpu, return_customized_layers=return_customized_layers)
    
    adam = ko.Nadam()
    model.compile(adam, loss="sparse_categorical_crossentropy", metrics=["sparse_categorical_accuracy"])
    
    model_path = "best_bert_atten_model.hdf5"
    check_point = kc.ModelCheckpoint(model_path, monitor = "val_loss", verbose = 1, save_best_only = True, mode = "min")
    early_stop = kc.EarlyStopping(monitor = "val_loss", mode = "min", patience=3)
    model.fit_generator(train_generator, steps_per_epoch=train_batch_index.shape[0], epochs=epochs, validation_data=(X_dev, y_dev), callbacks = [check_point, early_stop])
    del model
    gc.collect()
    
    print("load best model: " + model_path)
    model = models.load_model(model_path, co)
    preds_tmp = model.predict_generator(test_generator, num_test_batches)
    
    print("single model accuracy: ")
    print(measure_log_loss(test_df['Label'].values, preds_tmp))
    
    if preds is None:
        preds = preds_tmp
    else:
        preds += preds_tmp
    
preds /= n_fold
print("cv model accuracy: ")
print(measure_log_loss(test_df['Label'].values, preds))

# Save Results

In [None]:
# Fill the sample submission with the fold-averaged class probabilities.
# The Series are aligned on the default integer index, which presumably
# matches the row order of test_df -- TODO confirm.
sub_df_path = os.path.join(SUB_DATA_FOLDER, 'sample_submission_stage_1.csv')
sub_df = pd.read_csv(sub_df_path)
sub_df.loc[:, 'A'] = pd.Series(preds[:, 0])
sub_df.loc[:, 'B'] = pd.Series(preds[:, 1])
sub_df.loc[:, 'NEITHER'] = pd.Series(preds[:, 2])

sub_df.head()

In [None]:
# Write the submission file to the working directory, without the index column.
sub_df.to_csv("submission.csv", index=False)