This kernel implements 4 DL models for coreference resolution. All the model in this kernel are Non-RNN Based DL models.

Features extraction used in this kernel follows Clark and Mannings work: https://nlp.stanford.edu/pubs/clark2016improving.pdf
If you are interested in RNN based End2End coreference solution model, please check this kernel: https://www.kaggle.com/keyit92/end2end-coref-resolution-by-attention-rnn.

In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
import gc
print(os.listdir("../input"))

# Any results you write to the current directory are saved as output.

  return f(*args, **kwds)


['.DS_Store', 'gendered-pronoun-resolution', 'gap-coreference']


  return f(*args, **kwds)


In [2]:
DATA_ROOT = '../input/'
GAP_DATA_FOLDER = os.path.join(DATA_ROOT, 'gap-coreference')
SUB_DATA_FOLDER = os.path.join(DATA_ROOT, 'gendered-pronoun-resolution')
FAST_TEXT_DATA_FOLDER = os.path.join(DATA_ROOT, 'fasttext-crawl-300d-2m')

# Import Data

In [3]:
test_df_path = os.path.join(GAP_DATA_FOLDER, 'gap-development.tsv')
train_df_path = os.path.join(GAP_DATA_FOLDER, 'gap-test.tsv')
dev_df_path = os.path.join(GAP_DATA_FOLDER, 'gap-validation.tsv')

train_df = pd.read_csv(train_df_path, sep='\t')
test_df = pd.read_csv(test_df_path, sep='\t')
dev_df = pd.read_csv(dev_df_path, sep='\t')

# pd.options.display.max_colwidth = 1000

In [4]:
train_df.head()

Unnamed: 0,ID,Text,Pronoun,Pronoun-offset,A,A-offset,A-coref,B,B-offset,B-coref,URL
0,test-1,Upon their acceptance into the Kontinental Hoc...,His,383,Bob Suter,352,False,Dehner,366,True,http://en.wikipedia.org/wiki/Jeremy_Dehner
1,test-2,"Between the years 1979-1981, River won four lo...",him,430,Alonso,353,True,Alfredo Di St*fano,390,False,http://en.wikipedia.org/wiki/Norberto_Alonso
2,test-3,Though his emigration from the country has aff...,He,312,Ali Aladhadh,256,True,Saddam,295,False,http://en.wikipedia.org/wiki/Aladhadh
3,test-4,"At the trial, Pisciotta said: ``Those who have...",his,526,Alliata,377,False,Pisciotta,536,True,http://en.wikipedia.org/wiki/Gaspare_Pisciotta
4,test-5,It is about a pair of United States Navy shore...,his,406,Eddie,421,True,Rock Reilly,559,False,http://en.wikipedia.org/wiki/Chasers


# Explore Features for Building Mention-Pair Distributed Representation

In [5]:
from spacy.lang.en import English
from spacy.pipeline import DependencyParser
import spacy
from nltk import Tree
from category_encoders.one_hot import OneHotEncoder

In [41]:
nlp = spacy.load('en_core_web_sm')

## Clean Text

### Clean up Entity Names
Replace Entity Names A and B by Alice and Bob.

In [8]:
A_NAME = 'Alice'
B_NAME = 'Bob'

def find_all_substring(a_str, sub):
    start = 0
    result = list()
    while True:
        start = a_str.find(sub, start)
        if start == -1:
            return result
        result.append(start)
        start += len(sub) # use start += 1 to find overlapping matches

def _update_offset(text, old_, new_, offset):
    len_in = len(new_) - len(old_)
    text_ = text[0:offset]
    return offset + len_in * len(find_all_substring(text_, old_))
    
def replace_entity_name(text, a_name, b_name, a_offset, b_offset, p_offset):
    # replace the whole name
    a_offset = _update_offset(text, a_name, A_NAME, a_offset)
    b_offset = _update_offset(text, a_name, A_NAME, b_offset)
    p_offset = _update_offset(text, a_name, A_NAME, p_offset)
    text = text.replace(a_name, A_NAME)

    a_offset = _update_offset(text, b_name, B_NAME, a_offset)
    b_offset = _update_offset(text, b_name, B_NAME, b_offset)
    p_offset = _update_offset(text, b_name, B_NAME, p_offset)
    text = text.replace(b_name, B_NAME)
    
    # replace sub name
    a_name_list = a_name.strip().split(" ")
    b_name_list = b_name.strip().split(" ")
    for a_subname in a_name_list:
        a_offset = _update_offset(text, a_subname, A_NAME, a_offset)
        b_offset = _update_offset(text, a_subname, A_NAME, b_offset)
        p_offset = _update_offset(text, a_subname, A_NAME, p_offset)
        text = text.replace(a_subname, A_NAME)
    for b_subname in b_name_list:
        a_offset = _update_offset(text, b_subname, B_NAME, a_offset)
        b_offset = _update_offset(text, b_subname, B_NAME, b_offset)
        p_offset = _update_offset(text, b_subname, B_NAME, p_offset)
        text = text.replace(a_subname, B_NAME)
    
    return text, a_offset, b_offset, p_offset

In [9]:
def entity_replace_func(row):
    text, a_offset, b_offset, p_offset = replace_entity_name(
        row['Text'], row['A'], row['B'], row['A-offset'], row['B-offset'], row['Pronoun-offset']
    )
    
    row_ = row.copy()
    row_['Text'] = text
    row_['A'] = A_NAME
    row_['B'] = B_NAME
    row_['A-offset'] = a_offset
    row_['B-offset'] = b_offset
    row_['Pronoun-offset'] = p_offset
    
    return row_

train_df = train_df.apply(entity_replace_func, axis=1)
test_df = test_df.apply(entity_replace_func, axis=1)
dev_df = dev_df.apply(entity_replace_func, axis=1)

## Train POS Tag Embeddings

In [11]:
from gensim.models import Word2Vec

In [12]:
def pos_tags(text):
    doc = nlp(text)
    
    return [tok_.pos_ for tok_ in doc]

In [13]:
all_texts = list(train_df['Text'].values.tolist() + test_df['Text'].values.tolist() + dev_df['Text'].values.tolist())
pos_tags = [pos_tags(text_) for text_ in all_texts]

In [14]:
pos_emb_size = 16
pos_w2v = Word2Vec(pos_tags, size=pos_emb_size, min_count=1)

## Embedding Features

Follow the idea from the work by Clark and Manning, extract word embedding of head word, dependency parent, first word, last word, two preceding words and two following words of the mention.  Average word embeding of the five preceding words, five following words, all words in the mention, all words in the sentences.

### Parse Text

In [15]:
def bs_(list_, target_):
    lo, hi = 0, len(list_) -1
    
    while lo < hi:
        mid = lo + int((hi - lo) / 2)
        
        if target_ < list_[mid]:
            hi = mid
        elif target_ > list_[mid]:
            lo = mid + 1
        else:
            return mid
    return lo

def ohe_dist(dist, buckets):
    idx = bs_(buckets, dist)
    oh = np.zeros(shape=(len(buckets),), dtype=np.float32)
    oh[idx] = 1
    
    return oh

In [16]:
def to_nltk_tree(node):
    if node.n_lefts + node.n_rights > 0:
        return Tree(node.orth_, [to_nltk_tree(child) for child in node.children])
    else:
        return node.orth_

def bs(list_, target_):
    lo, hi = 0, len(list_) -1
    
    while lo < hi:
        mid = lo + int((hi - lo) / 2)
        
        if target_ < list_[mid]:
            hi = mid
        elif target_ > list_[mid]:
            lo = mid + 1
        else:
            return mid + 1
    return lo

def _get_preceding_words(tokens, offset, k):
    start = offset - k
    
    precedings = [None] * max(0, 0-start)
    start = max(0, start)
    precedings += tokens[start: offset]
    
    return precedings

def _get_following_words(tokens, offset, k):
    end = offset + k
    
    followings = [None] * max(0, end - len(tokens))
    end = min(len(tokens), end)
    followings += tokens[offset: end]
    
    return followings
        

def extrac_embed_features_tokens(text, char_offset):
    doc = nlp(text)
    
    # char offset to token offset
    lens = [token.idx for token in doc]
    mention_offset = bs(lens, char_offset) - 1
    # mention_word
    mention = doc[mention_offset]
    
    # token offset to sentence offset
    lens = [len(sent) for sent in doc.sents]
    acc_lens = [len_ for len_ in lens]
    pre_len = 0
    for i in range(0, len(acc_lens)):
        pre_len += acc_lens[i]
        acc_lens[i] = pre_len
    sent_index = bs(acc_lens, mention_offset)
    # mention sentence
    sent = list(doc.sents)[sent_index]
    
    # dependency parent
    head = mention.head
    
    # last word and first word
    first_word, last_word = sent[0], sent[-2]
    
    assert mention_offset >= 0
    
    # two preceding words and two following words
    tokens = list(doc)
    precedings2 = _get_preceding_words(tokens, mention_offset, 2)
    followings2 = _get_following_words(tokens, mention_offset, 2)
    
    # five preceding words and five following words
    precedings5 = _get_preceding_words(tokens, mention_offset, 5)
    followings5 = _get_following_words(tokens, mention_offset, 5)
    
    # sentence words
    sent_tokens = [token for token in sent]
    
     # buckets
    bucket_pos = [0, 1, 2, 3, 4, 5, 8, 16, 32]
    # absolute position in the sentence
    sent_pos = mention_offset
    if sent_index > 0:
        sent_pos = mention_offset - acc_lens[sent_index-1]
    sent_pos_oh = ohe_dist(sent_pos, bucket_pos)
    sent_pos_inv = len(sent) - sent_pos - 1
    assert sent_pos_inv >= 0
    sent_pos_inv_oh = ohe_dist(sent_pos_inv, bucket_pos)
    
    return mention, head, first_word, last_word, precedings2, followings2, precedings5, followings5, sent_tokens, sent_pos_oh, sent_pos_inv_oh

Example:

In [38]:
print("Texts: ")
text = test_df.iloc[0]['Text']
print(text)

#print("\nDependency parsing trees: ")
doc = nlp(text)
#[to_nltk_tree(sent.root).pretty_print() for sent in doc.sents]

print("\nFeatures:")
mention, parent, first_word, last_word, precedings2, followings2, precedings5, followings5, sent_tokens, sent_pos_oh, sent_pos_inv_oh = extrac_embed_features_tokens(text, test_df.iloc[0]['A-offset'])
features = pd.Series([str(feature) for feature in (mention, parent, first_word, last_word, precedings2, followings2, precedings5, followings5, sent_tokens, sent_pos_oh, sent_pos_inv_oh)], index=['mention', 'parent', 'first_word', 'last_word', 'precedings2', 'followings2', 'precedings5', 'followings5', 'sent_tokens', 'sent_pos_oh', 'sent_pos_inv_oh'])
features

Texts: 
Zoe Telford -- played the police officer girlfriend of Simon, Maggie. Dumped by Simon in the final episode of series 1, after he slept with Jenny, and is not seen again. Phoebe Thomas played Alice, Bob's friend and also a year 11 pupil in Simon's class. Dumped her boyfriend following Simon's advice after he wouldn't have sex with her but later realised this was due to him catching crabs off her friend Bob.

Features:


mention                                                        Alice
parent                                                        played
first_word                                                    Phoebe
last_word                                                      class
precedings2                                         [Thomas, played]
followings2                                               [Alice, ,]
precedings5                       [again, ., Phoebe, Thomas, played]
followings5                              [Alice, ,, Bob, 's, friend]
sent_tokens        [Phoebe, Thomas, played, Alice, ,, Bob, 's, fr...
sent_pos_oh                             [0. 0. 0. 1. 0. 0. 0. 0. 0.]
sent_pos_inv_oh                         [0. 0. 0. 0. 0. 0. 0. 1. 0.]
dtype: object

### Generate Embedding Features

In [18]:
num_embed_features = 11
embed_dim = 384

In [19]:
def spacy_feats(tokens):
    feats = list()
    for token in tokens:
        if token is None:
            feats += ["na", "na", "na", "na"]
            continue
        feats += [token.dep_, token.shape_, str(token.is_alpha), str(token.is_stop)]
    return feats
    
def create_embedding_features(df, text_column, offset_column):
    text_offset_list = df[[text_column, offset_column]].values.tolist()
    num_features = num_embed_features
    
    embed_feature_matrix = np.zeros(shape=(len(text_offset_list), num_features, embed_dim + pos_emb_size))
    other_features = list()
    pos_features = list()
    for text_offset_index in range(len(text_offset_list)):
        text_offset = text_offset_list[text_offset_index]
        mention, parent, first_word, last_word, precedings2, followings2, precedings5, followings5, sent_tokens, sent_pos_oh, sent_pos_inv_oh = extrac_embed_features_tokens(text_offset[0], text_offset[1])
        
        feature_index = 0
        embed_feature_matrix[text_offset_index, feature_index, :] = np.concatenate((mention.vector, pos_w2v[mention.pos_]))
        feature_index += 1
        embed_feature_matrix[text_offset_index, feature_index, :] = np.concatenate((parent.vector, pos_w2v[parent.pos_]))
        feature_index += 1
        embed_feature_matrix[text_offset_index, feature_index, :] = np.concatenate((first_word.vector, pos_w2v[first_word.pos_]))
        feature_index += 1
        embed_feature_matrix[text_offset_index, feature_index, :] = np.concatenate((last_word.vector, pos_w2v[last_word.pos_]))
        feature_index += 1
        embed_feature_matrix[text_offset_index, feature_index:feature_index+2, :] = np.asarray([np.concatenate((token.vector, pos_w2v[token.pos_])) if token is not None else np.zeros((embed_dim+pos_emb_size,)) for token in precedings2])
        feature_index += len(precedings2)
        embed_feature_matrix[text_offset_index, feature_index:feature_index+2, :] = np.asarray([np.concatenate((token.vector, pos_w2v[token.pos_])) if token is not None else np.zeros((embed_dim+pos_emb_size,)) for token in followings2])
        feature_index += len(followings2)
        precedings5 = list(filter(
            lambda token_: token_ is not None,
            precedings5
        ))
        followings5 = list(filter(
            lambda token_: token_ is not None,
            followings5
        ))
        embed_feature_matrix[text_offset_index, feature_index, :] = np.mean(np.asarray([np.concatenate((token.vector, pos_w2v[token.pos_])) if token is not None else np.zeros((embed_dim+pos_emb_size,)) for token in precedings5]), axis=0)
        feature_index += 1
        embed_feature_matrix[text_offset_index, feature_index, :] = np.mean(np.asarray([np.concatenate((token.vector, pos_w2v[token.pos_])) if token is not None else np.zeros((embed_dim+pos_emb_size,)) for token in followings5]), axis=0)
        feature_index += 1
        embed_feature_matrix[text_offset_index, feature_index, :] = np.mean(np.asarray([np.concatenate((token.vector, pos_w2v[token.pos_])) for token in sent_tokens]), axis=0) if len(sent_tokens) > 0 else np.zeros((embed_dim+pos_emb_size,))
        feature_index += 1
        
        other_features.append(list())
        other_features[-1] += spacy_feats([mention, parent, first_word, last_word])
        other_features[-1] += spacy_feats(precedings2)
        other_features[-1] += spacy_feats(followings2)
        
        pos_features.append(np.concatenate((sent_pos_oh, sent_pos_inv_oh)))
    
    return embed_feature_matrix, other_features, np.asarray(pos_features)

 ##  Position Features

Encode the absolute positions in the sentence and the relative position between the pronoun and the entities.

In [20]:
def extrac_positional_features(text, char_offset1, char_offset2):
    doc = nlp(text)
    max_len = 64
    
    # char offset to token offset
    lens = [token.idx for token in doc]
    mention_offset1 = bs(lens, char_offset1) - 1
    mention_offset2 = bs(lens, char_offset2) - 1
    
    # token offset to sentence offset
    lens = [len(sent) for sent in doc.sents]
    acc_lens = [len_ for len_ in lens]
    pre_len = 0
    for i in range(0, len(acc_lens)):
        pre_len += acc_lens[i]
        acc_lens[i] = pre_len
    sent_index1 = bs(acc_lens, mention_offset1)
    sent_index2 = bs(acc_lens, mention_offset2)
    
    sent1 = list(doc.sents)[sent_index1]
    sent2 = list(doc.sents)[sent_index2]
    
    # buckets
    bucket_dist = [1, 2, 3, 4, 5, 8, 16, 32, 64]
    
    # relative distance
    dist = mention_offset2 - mention_offset1
    dist_oh = ohe_dist(dist, bucket_dist)
    
    return dist_oh

In [21]:
num_pos_features = 9

In [22]:
def create_dist_features(df, text_column, pronoun_offset_column, name_offset_column):
    text_offset_list = df[[text_column, pronoun_offset_column, name_offset_column]].values.tolist()
    num_features = num_pos_features
    
    pos_feature_matrix = np.zeros(shape=(len(text_offset_list), num_features))
    for text_offset_index in range(len(text_offset_list)):
        text_offset = text_offset_list[text_offset_index]
        dist_oh = extrac_positional_features(text_offset[0], text_offset[1], text_offset[2])
        
        feature_index = 0
        pos_feature_matrix[text_offset_index, feature_index:feature_index+len(dist_oh)] = np.asarray(dist_oh)
        feature_index += len(dist_oh)
#         pos_feature_matrix[text_offset_index, feature_index:feature_index+len(sent_pos_oh1)] = np.asarray(sent_pos_oh1)
#         feature_index += len(sent_pos_oh1)
#         pos_feature_matrix[text_offset_index, feature_index:feature_index+len(sent_pos_oh2)] = np.asarray(sent_pos_oh2)
#         feature_index += len(sent_pos_oh2)
#         pos_feature_matrix[text_offset_index, feature_index:feature_index+len(sent_pos_inv_oh1)] = np.asarray(sent_pos_inv_oh1)
#         feature_index += len(sent_pos_inv_oh1)
#         pos_feature_matrix[text_offset_index, feature_index:feature_index+len(sent_pos_inv_oh2)] = np.asarray(sent_pos_inv_oh2)
#         feature_index += len(sent_pos_inv_oh2)
    
    return pos_feature_matrix

### Generate Training, Validation and Testing Data

In [23]:
p_emb_tra, p_feats_tra, p_pos_tra = create_embedding_features(train_df, 'Text', 'Pronoun-offset')
p_emb_dev, p_feats_dev, p_pos_dev = create_embedding_features(dev_df, 'Text', 'Pronoun-offset')
p_emb_test, p_feats_test, p_pos_test = create_embedding_features(test_df, 'Text', 'Pronoun-offset')

a_emb_tra, a_feats_tra, a_pos_tra = create_embedding_features(train_df, 'Text', 'A-offset')
a_emb_dev, a_feats_dev, a_pos_dev = create_embedding_features(dev_df, 'Text', 'A-offset')
a_emb_test, a_feats_test, a_pos_test = create_embedding_features(test_df, 'Text', 'A-offset')

b_emb_tra, b_feats_tra, b_pos_tra = create_embedding_features(train_df, 'Text', 'B-offset')
b_emb_dev, b_feats_dev, b_pos_dev = create_embedding_features(dev_df, 'Text', 'B-offset')
b_emb_test, b_feats_test, b_pos_test = create_embedding_features(test_df, 'Text', 'B-offset')

pa_dist_tra = create_dist_features(train_df, 'Text', 'Pronoun-offset', 'A-offset')
pa_dist_dev = create_dist_features(dev_df, 'Text', 'Pronoun-offset', 'A-offset')
pa_dist_test = create_dist_features(test_df, 'Text', 'Pronoun-offset', 'A-offset')

pb_dist_tra = create_dist_features(train_df, 'Text', 'Pronoun-offset', 'B-offset')
pb_dist_dev = create_dist_features(dev_df, 'Text', 'Pronoun-offset', 'B-offset')
pb_dist_test = create_dist_features(test_df, 'Text', 'Pronoun-offset', 'B-offset')



ValueError: could not broadcast input array from shape (316) into shape (400)

In [None]:
# One hot encode other features
feats_encoder = OneHotEncoder(return_df=False)
feats_encoder.fit(
    p_feats_tra + p_feats_dev + p_feats_test + a_feats_tra + a_feats_dev + a_feats_test + b_feats_tra + b_feats_dev + b_feats_test
)

p_encode_tra = np.concatenate((feats_encoder.transform(p_feats_tra), p_pos_tra), axis=1)
a_encode_tra = np.concatenate((feats_encoder.transform(a_feats_tra), a_pos_tra), axis=1)
b_encode_tra = np.concatenate((feats_encoder.transform(b_feats_tra), b_pos_tra), axis=1)

p_encode_test = np.concatenate((feats_encoder.transform(p_feats_test), p_pos_test), axis=1)
a_encode_test = np.concatenate((feats_encoder.transform(a_feats_test), a_pos_test), axis=1)
b_encode_test = np.concatenate((feats_encoder.transform(b_feats_test), b_pos_test), axis=1)

p_encode_dev = np.concatenate((feats_encoder.transform(p_feats_dev), p_pos_dev), axis=1)
a_encode_dev = np.concatenate((feats_encoder.transform(a_feats_dev), a_pos_dev), axis=1)
b_encode_dev = np.concatenate((feats_encoder.transform(b_feats_dev), b_pos_dev), axis=1)

In [None]:
def _row_to_y(row):
    if row.loc['A-coref']:
        return 0
    if row.loc['B-coref']:
        return 1
    return 2

y_tra = train_df.apply(_row_to_y, axis=1)
y_dev = dev_df.apply(_row_to_y, axis=1)
y_test = test_df.apply(_row_to_y, axis=1)

In [None]:
X_train = [p_emb_tra, a_emb_tra, b_emb_tra, p_encode_tra, a_encode_tra, b_encode_tra, pa_dist_tra, pb_dist_tra]
X_dev = [p_emb_dev, a_emb_dev, b_emb_dev, p_encode_dev, a_encode_dev, b_encode_dev, pa_dist_dev, pb_dist_dev]
X_test = [p_emb_test, a_emb_test, b_emb_test, p_encode_test, a_encode_test, b_encode_test, pa_dist_test, pb_dist_test]

# Define DL Models

In [34]:
import numpy as np
from keras import backend
from keras import layers
from keras import models

## Coattention Model

#### Define Co-attention Layer

In [35]:
from keras import initializers, regularizers, constraints, activations
from keras.engine import Layer
import keras.backend as K
from keras.layers import merge

In [36]:
def _dot_product(x, kernel):
    """
    Wrapper for dot product operation, in order to be compatible with both
    Theano and Tensorflow
    Args:
        x (): input
        kernel (): weights
    Returns:
    """
    if K.backend() == 'tensorflow':
        # todo: check that this is correct
        return K.squeeze(K.dot(x, K.expand_dims(kernel)), axis=-1)
    else:
        return K.dot(x, kernel)
    

class CoAttentionWeight(Layer):
    """
        Unnormalized Co-Attention operation for temporal data.
        Supports Masking.
        Follows the work of Ankur et al. [https://aclweb.org/anthology/D16-1244]
        "A Decomposable Attention Model for Natural Language Inference"
        # Input shape
            List of 2 3D tensor with shape: `(samples, steps1, features1)` and `(samples, steps2, features2)`.
        # Output shape
            3D tensor with shape: `(samples, steps1, step2)`.
        :param kwargs:
        """

    def __init__(self, W_regularizer=None, W_constraint=None, **kwargs):

        self.init = initializers.get('glorot_uniform')

        self.W_regularizer = regularizers.get(W_regularizer)
        self.W_constraint = constraints.get(W_constraint)

        super(CoAttentionWeight, self).__init__(**kwargs)

    def build(self, input_shape):
        super(CoAttentionWeight, self).build(input_shape)

        if not isinstance(input_shape, list) or len(input_shape) != 2:
            raise ValueError('A `Coattention` layer should be called '
                             'on a list of 2 inputs.')
        shape1 = list(input_shape[0])
        shape2 = list(input_shape[1])

        if shape1[-1] != shape2[-1]:
            raise ValueError("The last dimention of input tensors must be same. "
                             "Otherwise use RemappedCoattentionWeight instead")

        self.W = self.add_weight((shape1[-1], shape1[-1]),
                                 initializer=self.init,
                                 name='{}_W'.format(self.name),
                                 regularizer=self.W_regularizer,
                                 constraint=self.W_constraint)

    def compute_mask(self, input, input_mask=None):
        # pass the mask to the next layers
        return input_mask

    def call(self, inputs, **kwargs):
        if len(inputs) != 2:
            raise ValueError('A `Subtract` layer should be called '
                             'on exactly 2 inputs')

        x1, x2 = inputs[0], inputs[1]

        if x1.shape[-1] != x2.shape[-1]:
            raise ValueError("The last dimention of input tensors must be same. "
                             "Otherwise use RemappedCoattentionWeight instead")

        # atten = exp(u1 W u2^T)
        atten = _dot_product(x1, self.W)
        atten = K.batch_dot(atten, x2, axes=[2, 2])
        atten = K.exp(atten)

        return atten

    def compute_output_shape(self, input_shape):
        if not isinstance(input_shape, list) or len(input_shape) != 2:
            raise ValueError('A `Coattention` layer should be called '
                             'on a list of 2 inputs.')
        shape1 = list(input_shape[0])
        shape2 = list(input_shape[1])

        if shape1[0] != shape2[0]:
            raise ValueError("batch size must be same")

        return shape1[0], shape1[1], shape2[1]

    def get_config(self):
        config = {
            'W_regularizer': regularizers.serialize(self.W_regularizer),
            'W_constraint': constraints.serialize(self.W_constraint),
        }
        base_config = super(CoAttentionWeight, self).get_config()
        return dict(list(base_config.items()) + list(config.items()))
    
    
class RemappedCoAttentionWeight(Layer):
    """
        Unnormalized Co-Attention operation for temporal data.
        Supports Masking.
        Follows the work of Ankur et al. [https://aclweb.org/anthology/D16-1244]
        "A Decomposable Attention Model for Natural Language Inference"
        # Input shape
            List of 2 3D tensor with shape: `(samples, steps1, features1)` and `(samples, steps2, features2)`.
        # Output shape
            3D tensor with shape: `(samples, steps1, step2)`.
        :param kwargs:
        """

    def __init__(self, model_size, activation='sigmoid',
                 W1_regularizer=None, W2_regularizer=None, b1_regularizer=None, b2_regularizer=None,
                 W1_constraint=None, W2_constraint=None, b1_constraint=None, b2_constraint=None,
                 bias1=True, bias2=True, **kwargs):

        self.model_size = model_size
        self.init = initializers.get('glorot_uniform')

        self.W1_regularizer = regularizers.get(W1_regularizer)
        self.W2_regularizer = regularizers.get(W2_regularizer)
        self.b1_regularizer = regularizers.get(b1_regularizer)
        self.b2_regularizer = regularizers.get(b2_regularizer)

        self.W1_constraint = constraints.get(W1_constraint)
        self.W2_constraint = constraints.get(W2_constraint)
        self.b1_constraint = constraints.get(b1_constraint)
        self.b2_constraint = constraints.get(b2_constraint)

        self.bias1 = bias1
        self.bias2 = bias2
        self.activation = activations.get(activation)
        super(RemappedCoAttentionWeight, self).__init__(**kwargs)

    def build(self, input_shape):

        if len(input_shape) != 2:
            raise ValueError("input must be a size two list which contains two tensors")

        shape1 = list(input_shape[0])
        shape2 = list(input_shape[1])

        self.W1 = self.add_weight((self.model_size, shape1[-1]),
                                  initializer=self.init,
                                  name='{}_W1'.format(self.name),
                                  regularizer=self.W1_regularizer,
                                  constraint=self.W1_constraint)

        self.W2 = self.add_weight((self.model_size, shape2[-1]),
                                  initializer=self.init,
                                  name='{}_W2'.format(self.name),
                                  regularizer=self.W2_regularizer,
                                  constraint=self.W2_constraint)

        if self.bias1:
            self.b1 = self.add_weight((self.model_size,),
                                      initializer='zero',
                                      name='{}_b1'.format(self.name),
                                      regularizer=self.b1_regularizer,
                                      constraint=self.b1_constraint)

        if self.bias2:
            self.b2 = self.add_weight((self.model_size,),
                                      initializer='zero',
                                      name='{}_b2'.format(self.name),
                                      regularizer=self.b2_regularizer,
                                      constraint=self.b2_constraint)

    def compute_mask(self, input, input_mask=None):
        # pass the mask to the next layers
        return input_mask

    def call(self, inputs, **kwargs):
        if len(inputs) != 2:
            raise ValueError('A `Subtract` layer should be called '
                             'on exactly 2 inputs')

        x1, x2 = inputs[0], inputs[1]

        # u = Wx + b
        u1 = _dot_product(x1, self.W1)
        if self.bias1:
            u1 += self.b1

        u2 = _dot_product(x2, self.W2)
        if self.bias2:
            u2 += self.b2

        # u = Activation(Wx + b)
        u1 = self.activation(u1)
        u2 = self.activation(u2)

        # atten = exp(u1 u2^T)
        atten = K.batch_dot(u1, u2, axes=[2, 2])
        atten = K.exp(atten)

        return atten

    def compute_output_shape(self, input_shape):
        if not isinstance(input_shape, list) or len(input_shape) != 2:
            raise ValueError('A `Dot` layer should be called '
                             'on a list of 2 inputs.')
        shape1 = list(input_shape[0])
        shape2 = list(input_shape[1])

        if shape1[0] != shape2[0]:
            raise ValueError("batch size must be same")

        return shape1[0], shape1[1], shape2[1]

    def get_config(self):
        config = {
            'activation': self.activation,
            'model_size': self.model_size,
            'W1_regularizer': regularizers.serialize(self.W1_regularizer),
            'W2_regularizer': regularizers.serialize(self.W2_regularizer),
            'b1_regularizer': regularizers.serialize(self.b1_regularizer),
            'b2_regularizer': regularizers.serialize(self.b2_regularizer),
            'W1_constraint': constraints.serialize(self.W1_constraint),
            'W2_constraint': constraints.serialize(self.W2_constraint),
            'b1_constraint': constraints.serialize(self.b1_constraint),
            'b2_constraint': constraints.serialize(self.b2_constraint),
            'bias1': self.bias1,
            'bias2': self.bias2
        }
        base_config = super(RemappedCoAttentionWeight, self).get_config()
        return dict(list(base_config.items()) + list(config.items()))
    
class FeatureNormalization(Layer):
    """
        Normalize feature along a specific axis.
        Supports Masking.

        # Input shape
            A ND tensor with shape: `(samples, feature1 ... featuresN).
        # Output shape
            ND tensor with shape: `(samples, feature1 ... featuresN)`.
        :param kwargs:
        """

    def __init__(self, axis=-1, **kwargs):

        self.axis = axis
        self.supports_masking = True
        super(FeatureNormalization, self).__init__(**kwargs)

    def build(self, input_shape):

        super(FeatureNormalization, self).build(input_shape)

    def compute_mask(self, input, input_mask=None):
        # don't pass the mask to the next layers
        return None

    def call(self, inputs, mask=None):
        # apply mask after the exp. will be re-normalized next
        if mask is not None:
            # Cast the mask to floatX to avoid float64 upcasting in theano
            a = K.cast(mask, K.floatx()) * inputs
        else:
            a = inputs

        # in some cases especially in the early stages of training the sum may be almost zero
        # and this results in NaN's. A workaround is to add a very small positive number ε to the sum.
        # a /= K.cast(K.sum(a, axis=1, keepdims=True), K.floatx())
        a /= K.cast(K.sum(a, axis=self.axis, keepdims=True) + K.epsilon(), K.floatx())
        
        return a

    def compute_output_shape(self, input_shape):
        return input_shape

    def get_config(self):
        config = {
            'axis': self.axis
        }
        base_config = super(FeatureNormalization, self).get_config()
        return dict(list(base_config.items()) + list(config.items()))

# Build and Train Model

In [32]:
from keras import callbacks as kc
from keras import optimizers as ko
from keras import initializers, regularizers, constraints

import matplotlib.pyplot as plt
from IPython.display import SVG


from numpy.random import seed
from tensorflow import set_random_seed

seed(1)
set_random_seed(2)

histories = list()
cos = list()
model_paths = list()

## Baseline Model MLP

In [None]:
def build_mlp_model(
    num_channels1, num_channels2, num_channels3, 
    num_features1, num_features2, num_features3, 
    feature_dim1, output_dim, mlp_dim, 
    mlp_depth=1, embed_dropout=0.5, drop_out=0.2, 
    return_customized_layers=False):
    """
    Create A Multi-Layer Perceptron Model.
    
    inputs: 
        embeddings: [batch, num_embed_feature, embed_dims] * 3 ## pronoun, A, B
        features: [batch, num_other_feature] * 3 ## pronoun, A, B
        mention_pair_features: [batch, num_mention_pair_feature] * 2 ## pronoun-A, pronoun-B
        
    outputs: 
        [batch, num_classes] # in our case there should be 3 output classes: A, B, None
        
    :param output_dim: the output dimension size
    :param model_dim: rrn dimension size
    :param mlp_dim: the dimension size of fully connected layer
    :param mlp_depth: the depth of fully connected layers
    :param drop_out: dropout rate of fully connected layers
    :param return_customized_layers: boolean, default=False
        If True, return model and customized object dictionary, otherwise return model only
    :return: keras model
    """

    # inputs
    inputs1 = list()
    for fi in range(num_channels1):
        inputs1.append(models.Input(shape=(num_features1, feature_dim1), dtype='float32', name='input1_' + str(fi)))
        
    inputs2 = list()
    for fi in range(num_channels2):
        inputs2.append(models.Input(shape=(num_features2,), dtype='float32', name='input2_' + str(fi)))
        
    inputs3 = list()
    for fi in range(num_channels3):
        inputs3.append(models.Input(shape=(num_features3,), dtype='float32', name='input3_' + str(fi)))
        
    features1_pip = models.Sequential()
    features1_pip.add(layers.TimeDistributed(layers.Dropout(rate=embed_dropout, name="embed_dropout_layer")))
    features1_pip.add(layers.Flatten(name="embed_flatten_layer"))
    
    x1 = [features1_pip(input_) for input_ in inputs1]
    x2 = inputs2
    x3 = inputs3
    
    x = layers.Concatenate(axis=1, name="concate_layer")(x1+x2+x3)
    
    # MLP Layers
    x = layers.BatchNormalization(name='batch_norm_layer')(x)
    x = layers.Dropout(rate=drop_out, name="dropout_layer")(x)
        
    for i in range(mlp_depth - 1):
        x = layers.Dense(mlp_dim, activation='selu', kernel_initializer='lecun_normal', name='selu_layer' + str(i))(x)
        x = layers.AlphaDropout(drop_out, name='alpha_layer' + str(i))(x)

    outputs = layers.Dense(output_dim, activation="softmax", name="softmax_layer0")(x)

    model = models.Model(inputs1 + inputs2 + inputs3, outputs)

    if return_customized_layers:
        return model, {}

    return model

### Build Model 

In [None]:
num_channels1 = 3
num_channels2 = 3
num_channels3 = 2
num_features1 = p_emb_tra.shape[1]
num_features2 = p_encode_tra.shape[1]
#num_features2 = p_pos_tra.shape[1]
num_features3 = pa_dist_tra.shape[1]
feature_dim1 = p_emb_tra.shape[2]
output_dim = 3
mlp_dim = 10
mlp_depth=2
embed_dropout=0.5
drop_out=0.2
return_customized_layers=True

model, co = build_mlp_model(
    num_channels1, num_channels2, num_channels3, 
    num_features1, num_features2, num_features3, 
    feature_dim1, output_dim, mlp_dim, 
    mlp_depth=mlp_depth, embed_dropout=embed_dropout, drop_out=drop_out, 
    return_customized_layers=return_customized_layers
)

cos.append(co)

In [None]:
print(model.summary())

### Train Model

In [None]:
adam = ko.Nadam()
model.compile(adam, loss="sparse_categorical_crossentropy", metrics=["sparse_categorical_accuracy"])

file_path = "best_mlp_model.hdf5"
check_point = kc.ModelCheckpoint(file_path, monitor = "val_loss", verbose = 1, save_best_only = True, mode = "min")
early_stop = kc.EarlyStopping(monitor = "val_loss", mode = "min", patience=3)
history = model.fit(X_train, y_tra, batch_size=20, epochs=20, validation_data=(X_dev, y_dev), callbacks = [check_point, early_stop])

histories.append(np.min(np.asarray(history.history['val_loss'])))
model_paths.append(file_path)

del model, history
gc.collect()

## Remapped Model MLP

In [None]:
def build_remap_mlp_model(
    num_channels1, num_channels2, num_channels3, 
    num_features1, num_features2, num_features3, 
    feature_dim1, output_dim, model_dim1, model_dim2, model_dim3, mlp_dim, 
    mlp_depth=1, embed_dropout=0.5, drop_out=0.2, 
    return_customized_layers=False):
    """
    Create A Multi-Layer Perceptron Model.
    
    inputs: 
        embeddings: [batch, num_embed_feature, embed_dims] * 3 ## pronoun, A, B
        features: [batch, num_other_feature] * 3 ## pronoun, A, B
        mention_pair_features: [batch, num_mention_pair_feature] * 2 ## pronoun-A, pronoun-B
        
    outputs: 
        [batch, num_classes] # in our case there should be 3 output classes: A, B, None
        
    :param output_dim: the output dimension size
    :param model_dim: rrn dimension size
    :param mlp_dim: the dimension size of fully connected layer
    :param mlp_depth: the depth of fully connected layers
    :param drop_out: dropout rate of fully connected layers
    :param return_customized_layers: boolean, default=False
        If True, return model and customized object dictionary, otherwise return model only
    :return: keras model
    """

        # inputs
    inputs1 = list()
    for fi in range(num_channels1):
        inputs1.append(models.Input(shape=(num_features1, feature_dim1), dtype='float32', name='input1_' + str(fi)))
        
    inputs2 = list()
    for fi in range(num_channels2):
        inputs2.append(models.Input(shape=(num_features2,), dtype='float32', name='input2_' + str(fi)))
        
    inputs3 = list()
    for fi in range(num_channels3):
        inputs3.append(models.Input(shape=(num_features3,), dtype='float32', name='input3_' + str(fi)))
        
    features1_pip = models.Sequential()
    features1_pip.add(layers.TimeDistributed(layers.Dropout(rate=embed_dropout, name="embed_dropout_layer")))
    features1_pip.add(layers.TimeDistributed(layers.Dense(model_dim1, name="feature_map_layer1", activation="relu")))
    features1_pip.add(layers.Flatten(name="embed_flatten_layer"))
    
    features2_pip = models.Sequential()
    features2_pip.add(layers.Dropout(rate=embed_dropout, name="dropout_layer2"))
    features2_pip.add(layers.Dense(model_dim2, name="feature_map_layer2", activation="relu"))
    
    features3_pip = models.Sequential()
    features3_pip.add(layers.Dense(model_dim3, name="feature_map_layer3", activation="relu"))
    
    x1 = [features1_pip(input_) for input_ in inputs1]
    x2 = [features2_pip(input_) for input_ in inputs2]
    x3 = [features3_pip(input_) for input_ in inputs3]
    
    x = layers.Concatenate(axis=1, name="concate_layer")(x1+x2+x3)
    
    # MLP Layers
    x = layers.BatchNormalization(name='batch_norm_layer')(x)
    x = layers.Dropout(rate=drop_out, name="dropout_layer")(x)
        
    for i in range(mlp_depth - 1):
        x = layers.Dense(mlp_dim, activation='selu', kernel_initializer='lecun_normal', name='selu_layer' + str(i))(x)
        x = layers.AlphaDropout(drop_out, name='alpha_layer' + str(i))(x)

    outputs = layers.Dense(output_dim, activation="softmax", name="softmax_layer0")(x)

    model = models.Model(inputs1 + inputs2 + inputs3, outputs)

    if return_customized_layers:
        return model, {}

    return model

### Build Model 

In [None]:
num_channels1 = 3
num_channels2 = 3
num_channels3 = 2
num_features1 = p_emb_tra.shape[1]
num_features2 = p_encode_tra.shape[1]
#num_features2 = p_pos_tra.shape[1]
num_features3 = pa_dist_tra.shape[1]
feature_dim1 = p_emb_tra.shape[2]
output_dim = 3
mlp_dim = 60
model_dim1 = 10
model_dim2 = 10
model_dim3 = 10
mlp_depth=1
embed_dropout=0.5
drop_out=0.5
return_customized_layers=True

model, co = build_remap_mlp_model(
    num_channels1, num_channels2, num_channels3, 
    num_features1, num_features2, num_features3, 
    feature_dim1, output_dim, model_dim1, model_dim2, model_dim3, mlp_dim, 
    mlp_depth=mlp_depth, embed_dropout=embed_dropout, drop_out=drop_out, 
    return_customized_layers=return_customized_layers
)

cos.append(co)

In [None]:
print(model.summary())

### Train Model

In [None]:
adam = ko.Nadam()
model.compile(adam, loss="sparse_categorical_crossentropy", metrics=["sparse_categorical_accuracy"])

file_path = "best_remap_mlp_model.hdf5"
check_point = kc.ModelCheckpoint(file_path, monitor = "val_loss", verbose = 1, save_best_only = True, mode = "min")
early_stop = kc.EarlyStopping(monitor = "val_loss", mode = "min", patience=3)
history = model.fit(X_train, y_tra, batch_size=20, epochs=20, validation_data=(X_dev, y_dev), callbacks = [check_point, early_stop])

histories.append(np.min(np.asarray(history.history['val_loss'])))
model_paths.append(file_path)

del model, history
gc.collect()

## Multi-Channel CNN 

In [None]:
def build_multi_channel_cnn_model(
    num_channels1, num_channels2, num_channels3, 
    num_features1, num_features2, num_features3, 
    feature_dim1, output_dim, model_dim1, model_dim2, model_dim3, mlp_dim, 
    num_filters, filter_sizes, padding, pooling,
    mlp_depth=1, embed_dropout=0.5, drop_out=0.2, 
    return_customized_layers=False):
    """
    Create A Multi-Layer Perceptron Model.
    
    inputs: 
        embeddings: [batch, num_embed_feature, embed_dims] * 3 ## pronoun, A, B
        features: [batch, num_other_feature] * 3 ## pronoun, A, B
        mention_pair_features: [batch, num_mention_pair_feature] * 2 ## pronoun-A, pronoun-B
        
    outputs: 
        [batch, num_classes] # in our case there should be 3 output classes: A, B, None
        
    :param output_dim: the output dimension size
    :param model_dim: rrn dimension size
    :param mlp_dim: the dimension size of fully connected layer
    :param mlp_depth: the depth of fully connected layers
    :param drop_out: dropout rate of fully connected layers
    :param return_customized_layers: boolean, default=False
        If True, return model and customized object dictionary, otherwise return model only
    :return: keras model
    """

    # inputs
    inputs1 = list()
    for fi in range(num_channels1):
        inputs1.append(models.Input(shape=(num_features1, feature_dim1), dtype='float32', name='input1_' + str(fi)))
        
    inputs2 = list()
    for fi in range(num_channels2):
        inputs2.append(models.Input(shape=(num_features2,), dtype='float32', name='input2_' + str(fi)))
        
    inputs3 = list()
    for fi in range(num_channels3):
        inputs3.append(models.Input(shape=(num_features3,), dtype='float32', name='input3_' + str(fi)))
        
    features1_pip = models.Sequential()
    features1_pip.add(layers.TimeDistributed(layers.Dropout(rate=embed_dropout, name="embed_dropout_layer")))
    features1_pip.add(layers.TimeDistributed(layers.Dense(model_dim1, name="feature_map_layer1", activation="relu")))
    
    features2_pip = models.Sequential()
    features2_pip.add(layers.Dropout(rate=embed_dropout, name="dropout_layer2"))
    features2_pip.add(layers.Dense(model_dim2, name="feature_map_layer2", activation="relu"))
    
    features3_pip = models.Sequential()
    features3_pip.add(layers.Dense(model_dim3, name="feature_map_layer3", activation="relu"))
    
    x1 = [features1_pip(input_) for input_ in inputs1]
    x2 = [features2_pip(input_) for input_ in inputs2]
    x3 = [features3_pip(input_) for input_ in inputs3]
    
    # cnn layers
    cnns = []
    for i in range(len(filter_sizes)):
        cnns.append(models.Sequential())
        cnns[-1].add(layers.Conv1D(num_filters[i], kernel_size=filter_sizes[i], padding=padding, activation='relu', name="cc_layer1" + str(i)))
        if pooling == 'max':
            cnns[-1].add(layers.GlobalMaxPooling1D(name='global_pooling_layer' + str(i)))
        else:
            cnns[-1].add(layers.GlobalAveragePooling1D(name='global_pooling_layer' + str(i)))
    
    x1s = list()
    for x1_ in x1:
        x1s += [cnn_(x1_) for cnn_ in cnns]
    x1 = x1s
    
    x = layers.Concatenate(axis=1, name="concate_layer")(x1+x2+x3)
    
    # MLP Layers
    x = layers.BatchNormalization(name='batch_norm_layer')(x)
    x = layers.Dropout(rate=drop_out, name="dropout_layer")(x)
        
    for i in range(mlp_depth - 1):
        x = layers.Dense(mlp_dim, activation='selu', kernel_initializer='lecun_normal', name='selu_layer' + str(i))(x)
        x = layers.AlphaDropout(drop_out, name='alpha_layer' + str(i))(x)

    outputs = layers.Dense(output_dim, activation="softmax", name="softmax_layer0")(x)

    model = models.Model(inputs1 + inputs2 + inputs3, outputs)

    if return_customized_layers:
        return model, {}

    return model

### Build Model

In [None]:
num_channels1 = 3
num_channels2 = 3
num_channels3 = 2
num_features1 = p_emb_tra.shape[1]
num_features2 = p_encode_tra.shape[1]
#num_features2 = p_pos_tra.shape[1]
num_features3 = pa_dist_tra.shape[1]
feature_dim1 = p_emb_tra.shape[2]
output_dim = 3
mlp_dim = 60
model_dim1 = 10
model_dim2 = 10
model_dim3 = 10

filter_sizes = [1, num_features1]
num_filters = [5] * len(filter_sizes)
pooling='max'
padding='valid'

mlp_depth=1
embed_dropout=0.5
drop_out=0.5
return_customized_layers=True

model, co = build_multi_channel_cnn_model(
    num_channels1, num_channels2, num_channels3, 
    num_features1, num_features2, num_features3, 
    feature_dim1, output_dim, model_dim1, model_dim2, model_dim3, mlp_dim, 
    num_filters, filter_sizes, padding, pooling,
    mlp_depth=mlp_depth, embed_dropout=embed_dropout, drop_out=drop_out, 
    return_customized_layers=return_customized_layers
)

cos.append(co)

In [None]:
print(model.summary())

### Train Model

In [None]:
adam = ko.Nadam()
model.compile(adam, loss="sparse_categorical_crossentropy", metrics=["sparse_categorical_accuracy"])

file_path = "best_mc_cnn_model.hdf5"
check_point = kc.ModelCheckpoint(file_path, monitor = "val_loss", verbose = 1, save_best_only = True, mode = "min")
early_stop = kc.EarlyStopping(monitor = "val_loss", mode = "min", patience=3)
history = model.fit(X_train, y_tra, batch_size=20, epochs=20, validation_data=(X_dev, y_dev), callbacks = [check_point, early_stop])

histories.append(np.min(np.asarray(history.history['val_loss'])))
model_paths.append(file_path)

del model, history
gc.collect()

## Intra-Mention-Pair Coattention Model

In [None]:
def build_intra_coattention_cnn_model(
    num_channels1, num_channels2, num_channels3, 
    num_features1, num_features2, num_features3, 
    feature_dim1, output_dim, atten_dim, model_dim1, model_dim2, model_dim3, mlp_dim, 
    num_filters, filter_sizes, padding, pooling,
    mlp_depth=1, embed_dropout=0.5, drop_out=0.2, 
    return_customized_layers=False):
    """
    Create A Multi-Layer Perceptron Model with Coattention Mechanism.
    
    inputs: 
        embeddings: [batch, num_embed_feature, embed_dims] * 3 ## pronoun, A, B
        positional_features: [batch, num_pos_feature] * 2 ## pronoun-A, pronoun-B
        
    outputs: 
        [batch, num_classes] # in our case there should be 3 output classes: A, B, None
        
    :param output_dim: the output dimension size
    :param model_dim: rrn dimension size
    :param mlp_dim: the dimension size of fully connected layer
    :param mlp_depth: the depth of fully connected layers
    :param drop_out: dropout rate of fully connected layers
    :param return_customized_layers: boolean, default=False
        If True, return model and customized object dictionary, otherwise return model only
    :return: keras model
    """
    
    # inputs
    inputs1 = list()
    for fi in range(num_channels1):
        inputs1.append(models.Input(shape=(num_features1, feature_dim1), dtype='float32', name='input1_' + str(fi)))
        
    inputs2 = list()
    for fi in range(num_channels2):
        inputs2.append(models.Input(shape=(num_features2,), dtype='float32', name='input2_' + str(fi)))
        
    inputs3 = list()
    for fi in range(num_channels3):
        inputs3.append(models.Input(shape=(num_features3,), dtype='float32', name='input3_' + str(fi)))
        
    features1_pip = models.Sequential()
    features1_pip.add(layers.TimeDistributed(layers.Dropout(rate=embed_dropout, name="embed_dropout_layer")))
    
    features2_pip = models.Sequential()
    features2_pip.add(layers.Dropout(rate=embed_dropout, name="dropout_layer2"))
    features2_pip.add(layers.Dense(model_dim2, name="feature_map_layer2", activation="relu"))
    
    features3_pip = models.Sequential()
    features3_pip.add(layers.Dense(model_dim3, name="feature_map_layer3", activation="relu"))
    
    x1 = [features1_pip(input_) for input_ in inputs1]
    x2 = [features2_pip(input_) for input_ in inputs2]
    x3 = [features3_pip(input_) for input_ in inputs3]
    
    # From mention-pair embeddings
#     reshape_layer = layers.Reshape((1, feature_dim1), name="reshape_layer")
#     x2_ = [reshape_layer(x_) for x_ in x2]
    
    feature_concat_layer = layers.Concatenate(axis=1, name="concate_pair_layer")
    coatten_layer = RemappedCoAttentionWeight(atten_dim, name="coattention_weights_layer")
    featnorm_layer1 = FeatureNormalization(name="normalized_coattention_weights_layer1", axis=1)
    featnorm_layer2 = FeatureNormalization(name="normalized_coattention_weights_layer2", axis=2)
    focus_layer1 = layers.Dot((1, 1), name="focus_layer1")
    focus_layer2 = layers.Dot((2, 1), name="focus_layer2")
    pair_layer1 = layers.Concatenate(axis=-1, name="pair_layer1")
    pair_layer2 = layers.Concatenate(axis=-1, name="pair_layer2")
    
    def coatten_compare(
        feature_concat_layer, coatten_layer, 
        featnorm_layer1, featnorm_layer2, 
        focus_layer1, focus_layer2, 
        pair_layer1, pair_layer2, 
        mention1_x1, mention2_x1):
        
        _x1 = mention1_x1
        _x2 = mention2_x1
#         _x1 = feature_concat_layer([mention1_x1, mention1_x2])
#         _x2 = feature_concat_layer([mention2_x1, mention2_x2])
        
        # attention
        attens = coatten_layer([_x1, _x2])
        attens1 = featnorm_layer1(attens)
        attens2 = featnorm_layer2(attens)
        # compare
        focus1 = focus_layer1([attens1, _x1])
        focus2 = focus_layer2([attens2, _x2])
        _x1 = pair_layer1([_x1, focus2])
        _x2 = pair_layer2([_x2, focus1])
        
        return _x1, _x2
    
    pairs = list()
    pairs += list(coatten_compare(
        feature_concat_layer, coatten_layer,
        featnorm_layer1, featnorm_layer2, 
        focus_layer1, focus_layer2, 
        pair_layer1, pair_layer2, 
        x1[0], x1[1]))
    pairs += list(coatten_compare(
        feature_concat_layer, coatten_layer,
        featnorm_layer1, featnorm_layer2, 
        focus_layer1, focus_layer2, 
        pair_layer1, pair_layer2, 
        x1[0], x1[2]))
    
#     x1 = layers.Concatenate(axis=1, name="atten_concate_layer")(pairs)
#     x1 = layers.TimeDistributed(layers.Dropout(rate=drop_out, name="pair_dropout_layer"))(x1)
#     x1 = layers.TimeDistributed(layers.Dense(model_dim1, name="pair_feature_map_layer", activation="relu"))(x1)
#     x1 = layers.Flatten(name="pair_feature_flatten_layer1")(x1)

    # cnn layers
    cnns = []
    for i in range(len(filter_sizes)):
        cnns.append(models.Sequential())
        cnns[-1].add(layers.Conv1D(num_filters[i], kernel_size=filter_sizes[i], padding=padding, activation='relu', name="cc_layer1" + str(i)))
        if pooling == 'max':
            cnns[-1].add(layers.GlobalMaxPooling1D(name='global_pooling_layer' + str(i)))
        else:
            cnns[-1].add(layers.GlobalAveragePooling1D(name='global_pooling_layer' + str(i)))
    
    x1 = layers.Concatenate(axis=1, name="cnn_concate_layer")(pairs)
    x1 = layers.TimeDistributed(layers.Dropout(rate=drop_out, name="pair_dropout_layer"))(x1)
    x1 = [cnn_(x1) for cnn_ in cnns]
    
    x = layers.Concatenate(axis=1, name="concate_layer")(x1+x2+x3)
    
    # MLP Layers
    x = layers.BatchNormalization(name='batch_norm_layer')(x)
    x = layers.Dropout(rate=drop_out, name="dropout_layer")(x)
        
    for i in range(mlp_depth - 1):
        x = layers.Dense(mlp_dim, activation='selu', kernel_initializer='lecun_normal', name='selu_layer' + str(i))(x)
        x = layers.AlphaDropout(drop_out, name='alpha_layer' + str(i))(x)

    outputs = layers.Dense(output_dim, activation="softmax", name="softmax_layer0")(x)
    
    model = models.Model(inputs1 + inputs2 + inputs3, outputs)

    if return_customized_layers:
        return model, {'RemappedCoAttentionWeight': RemappedCoAttentionWeight,
                       "FeatureNormalization": FeatureNormalization}

    return model

### Build Model

In [None]:
num_channels1 = 3
num_channels2 = 3
num_channels3 = 2
num_features1 = p_emb_tra.shape[1]
num_features2 = p_encode_tra.shape[1]
#num_features2 = p_pos_tra.shape[1]
num_features3 = pa_dist_tra.shape[1]
feature_dim1 = p_emb_tra.shape[2]
output_dim = 3
mlp_dim = 60
atten_dim = 10
model_dim1 = 2
model_dim2 = 10
model_dim3 = 10

filter_sizes = [1]
num_filters = [10] * len(filter_sizes)
pooling='average'
padding='valid'

mlp_depth=1
embed_dropout=0.5
drop_out=0.5
return_customized_layers=True

model, co = build_intra_coattention_cnn_model(
    num_channels1, num_channels2, num_channels3, 
    num_features1, num_features2, num_features3, 
    feature_dim1, output_dim, atten_dim, model_dim1, model_dim2, model_dim3, mlp_dim, 
    num_filters, filter_sizes, padding, pooling,
    mlp_depth=mlp_depth, embed_dropout=embed_dropout, drop_out=drop_out, 
    return_customized_layers=return_customized_layers
)

cos.append(co)

In [None]:
print(model.summary())

### Train Model

In [None]:
adam = ko.Nadam()
model.compile(adam, loss="sparse_categorical_crossentropy", metrics=["sparse_categorical_accuracy"])

file_path = "best_intra_coatt_cnn_model.hdf5"
check_point = kc.ModelCheckpoint(file_path, monitor = "val_loss", verbose = 1, save_best_only = True, mode = "min")
early_stop = kc.EarlyStopping(monitor = "val_loss", mode = "min", patience=3)
history = model.fit(X_train, y_tra, batch_size=30, epochs=40, validation_data=(X_dev, y_dev), callbacks = [check_point, early_stop])

histories.append(np.min(np.asarray(history.history['val_loss'])))
model_paths.append(file_path)

del model, history
gc.collect()

###  Make Prediction

In [None]:
print("load best model: " + str(model_paths[np.argmin(histories)]))
model = models.load_model(
    model_paths[np.argmin(histories)], cos[np.argmin(histories)])

In [None]:
print("load best model: " + str(model_paths[2]))
model = models.load_model(
    model_paths[2], cos[2])
y_preds = model.predict(X_test, batch_size = 1024, verbose = 1)

sub_df_path = os.path.join(SUB_DATA_FOLDER, 'sample_submission_stage_1.csv')
sub_df = pd.read_csv(sub_df_path)
sub_df.loc[:, 'A'] = pd.Series(y_preds[:, 0])
sub_df.loc[:, 'B'] = pd.Series(y_preds[:, 1])
sub_df.loc[:, 'NEITHER'] = pd.Series(y_preds[:, 2])

sub_df.head()

# Measure Test Set

In [29]:
from sklearn import metrics as skm
from keras import activations, constraints

Using TensorFlow backend.


In [30]:
def measure_log_loss(ground, preds):
    preds = preds.tolist()
    return skm.log_loss(ground, preds, labels=[0, 1, 2], eps=10**-15)

In [31]:
print(measure_log_loss(y_test, y_preds))

NameError: name 'y_test' is not defined

# Ensemble Models

In [27]:
from scipy.optimize import differential_evolution

In [28]:
y_dev_preds_list = list()
y_test_preds_list = list()

for i in range(0, len(model_paths)):
    model_path = model_paths[i]
    model = models.load_model(
        model_path, cos[i])
    y_dev_preds_list.append(model.predict(X_dev, batch_size = 1024, verbose = 1))
    y_test_preds_list.append(model.predict(X_test, batch_size = 1024, verbose = 1))

NameError: name 'models' is not defined

In [None]:
lo = 0
hi = len(y_dev_preds_list) - 1
y_dev_preds_list1 = y_dev_preds_list[lo:hi]
y_test_preds_list1 = y_test_preds_list[lo:hi]

len(y_test_preds_list1)

## Ensemble Baseline

In [None]:
y_preds = None

num_models = 0
for i in range(lo, hi):
    y_preds_ = y_test_preds_list[i]
    
    if y_preds is None:
        y_preds = y_preds_
    else:
        y_preds += y_preds_
    
    num_models += 1

y_preds /= num_models

In [None]:
print(measure_log_loss(y_test, y_preds))

## Weighted Average By Stochastic Global Search 

In [None]:
# normalize a vector to have unit norm
def normalize(weights):
    # calculate l1 vector norm
    #print(weights.shape)
    result = np.linalg.norm(weights, 1)
    # check for a vector of all zeros
    if result == 0.0:
        return weights
    # return normalized vector (unit norm)
    return weights / result

def wa_model(weights, X_meta_list):
    # normalize weights
    normalized = normalize(weights)
    
    X_meta = np.array(X_meta_list)
    # weighted sum across ensemble members
    summed = np.tensordot(X_meta, normalized, axes=((0),(0)))
    return summed
    
def loss_function(weights, X_meta_list, y):
    y_pred_ = wa_model(weights, X_meta_list)
    
    return measure_log_loss(y, y_pred_)

In [None]:
def ensemble(X_meta_list, y, X_test_meta_list):
    # define bounds on each weight
    bound_w = [(0.0, 1.0)  for _ in range(hi-lo)]
    # arguments to the loss function
    search_arg = (X_meta_list, y)
    # global optimization of ensemble weights
    weights = differential_evolution(loss_function, bound_w, search_arg, maxiter=1000, tol=1e-7)['x']
    
    print(normalize(weights))
    
    return wa_model(weights, X_test_meta_list)

In [None]:
y_preds = ensemble(y_dev_preds_list1, y_dev, y_test_preds_list1)

In [None]:
print(measure_log_loss(y_test, y_preds))

## Weighted Average By Keras

In [None]:
class WeightedAverage(Layer):

    def __init__(self,**kwargs):

        self.supports_masking = False
        self.init = initializers.get('lecun_normal')
        
        self.w_regularizer = regularizers.get(None)
        self.w_constraint = constraints.get(None)
        
        super(WeightedAverage, self).__init__(**kwargs)

    def build(self, input_shape):

        self.w = self.add_weight((input_shape[1], ),
                                 initializer=self.init,
                                 name='{}_ww'.format(self.name),
                                 regularizer=self.w_regularizer,
                                 constraint=self.w_constraint)

    def compute_mask(self, input, input_mask=None):
        # do not pass the mask to the next layers
        return None

    def call(self, inputs, mask=None):
        x = inputs
        
        w = activations.sigmoid(self.w)
        w = w / (K.sum(w) + K.epsilon())
        w = K.expand_dims(w)
        print(w.shape)
        weighted_input = x * w
        return K.sum(weighted_input, axis=1)

    def compute_output_shape(self, input_shape):
        return input_shape[0], input_shape[-1]

    def get_config(self):
        config = {
        }
        base_config = super(WeightedAverage, self).get_config()
        return dict(list(base_config.items()) + list(config.items()))

In [None]:
def build_ensemble_model(num_models, input_dims, output_dims):
    
    # inputs
    inputs = models.Input(shape=(num_models, input_dims), dtype='float32', name='input')
    
    x = inputs
    
    outputs = WeightedAverage(name="softmax_layer")(x)
    
    model = models.Model(inputs, outputs)
    return model

In [None]:
def ensemble(X_meta_list, y, X_test_meta_list):
    en_model = build_ensemble_model(hi-lo, 3, 3)
    adam = ko.Adam(lr=0.001)
    early_stop = kc.EarlyStopping(monitor = "loss", mode = "min", patience=5)
    en_model.compile(adam, loss="sparse_categorical_crossentropy", metrics=["sparse_categorical_accuracy"])
    
    print(en_model.summary())

    X_ = np.stack(X_meta_list, axis=1)
    X_test_ = np.stack(X_test_meta_list, axis=1)
    
    en_model.fit(X_, y, batch_size=10000, epochs=300, callbacks = [early_stop])
    
    return en_model.predict(X_test_)

In [None]:
y_preds = ensemble(y_test_preds_list1, y_test, y_test_preds_list1)

In [None]:
print(measure_log_loss(y_test, y_preds))

# Save Results

In [None]:
sub_df_path = os.path.join(SUB_DATA_FOLDER, 'sample_submission_stage_1.csv')
sub_df = pd.read_csv(sub_df_path)
sub_df.loc[:, 'A'] = pd.Series(y_preds[:, 0])
sub_df.loc[:, 'B'] = pd.Series(y_preds[:, 1])
sub_df.loc[:, 'NEITHER'] = pd.Series(y_preds[:, 2])

sub_df.head()

In [None]:
sub_df.to_csv("submission.csv", index=False)