# Initialization

In [1]:
%load_ext autoreload
%autoreload 2

# Use only one GPU
#import os
#os.environ['CUDA_VISIBLE_DEVICES'] = '0'

import sys
sys.path.append('../')
sys.path.append('../../')
sys.path.append('../../isanlp/src/')
sys.path.append('../../src/isanlp_srl_framebank/')
sys.path.append('../../libs/')
sys.path.append('../../libs/pylingtools/')

In [2]:
# Suppress tensorflow memory appetites

import tensorflow as tf

config = tf.ConfigProto()
#config.gpu_options.allow_growth=True
sess = tf.Session(config=config)

from tensorflow.python.keras import backend as K
K.set_session(sess)

In [3]:
# Check available GPUs

from tensorflow.python.client import device_lib

def get_available_gpus():
    """Return the names of all local devices whose ``device_type`` is ``'GPU'``."""
    gpu_names = []
    for device in device_lib.list_local_devices():
        if device.device_type == 'GPU':
            gpu_names.append(device.name)
    return gpu_names

get_available_gpus()

[]

In [4]:
import os
import time
import isanlp
import json
import pickle

import numpy as np
np.random.seed(31)

import pandas as pd

# Data loading

In [5]:
input_data_path = '../../data/preprocessed_framebank/annotated_corpus.json'

with open(input_data_path, 'r') as f:
    data = json.load(f)
    
print('Number of examples: ', len(data))

Number of examples:  59861


In [6]:
from convert_corpus_to_brat import make_text, create_verb_example_index

min_n_examples = 10

verb_index = create_verb_example_index(data)
print('Original number of verbs: ', len(verb_index))

stat = sorted([(verb, len(examples)) for verb, examples in verb_index.items()], 
              key = lambda x: x[1], reverse=True)

verbs_to_keep = [verb for verb, count in stat if count >= min_n_examples]
print('Number of left verbs: ', len(verbs_to_keep))

Original number of verbs:  803
Number of left verbs:  572


In [7]:
# Collect (example id, example) pairs for every verb that passed the frequency
# filter, following the order of `verbs_to_keep` and of each verb's index list.
examples = [
    (example_id, data[example_id])
    for kept_verb in verbs_to_keep
    for example_id in verb_index[kept_verb]
]

print('Number of framebank examples left: ', len(examples))

Number of framebank examples left:  32612


In [8]:
cleared_corpus_path = '../../data/cleared_corpus.json'

In [13]:
with open(cleared_corpus_path, 'w') as f:
    json.dump(examples, f)

In [14]:
with open(cleared_corpus_path, 'r') as f:
    examples = json.load(f)

In [9]:
ling_data_path = '../../data/results_final_fixed.pckl'
# NOTE(review): pickle.load executes arbitrary code from the file; only load
# annotation dumps that come from a trusted source.
with open(ling_data_path, 'rb') as ling_file:
    ling_data = pickle.load(ling_file)

# `ling_data` is iterated as (key, value) pairs; index them by key for lookup.
ling_data_cache = dict(ling_data)

In [10]:
error_examples = {}

# Feature construction

In [11]:
from isanlp.annotation_repr import CSentence
from convert_corpus_to_brat import make_text


def find_address_by_offset(offset, ling_ann):
    """Map a character offset to a (sentence index, word index in sentence) pair.

    Args:
        offset: position compared against each token's ``begin``/``end``.
        ling_ann: mapping with ``'tokens'`` and ``'sentences'`` sequences whose
            items expose ``begin`` and ``end``. Sentence bounds are compared
            against token indexes.

    Returns:
        ``(sentence_index, token_index - sentence.begin)``.

    Note:
        When no token (or sentence) contains the requested position, the loop
        runs to completion and the *last* token (or sentence) is used.
    """
    for token_index, token in enumerate(ling_ann['tokens']):
        if token.begin <= offset < token.end:
            break

    for sentence_index, sentence in enumerate(ling_ann['sentences']):
        if sentence.begin <= token_index < sentence.end:
            break

    return sentence_index, token_index - sentence.begin


def _record_error_example(ex_id, lens, arg_ling_sent, reason):
    """Store a diagnostic record for a skipped argument in the global ``error_examples``.

    A copy of ``lens`` extended with the offending sentence index is stored, so
    records of different arguments never share one dict.
    """
    details = dict(lens)
    details['len_arg_ling_sent'] = arg_ling_sent
    error_examples.setdefault(ex_id, []).append((ex_id, details, reason))


def process_arg_pred(feature_extractor, ling_cache, ex_id, pred, args, example):
    """Extract one feature set per (predicate, argument) pair of a single example.

    Args:
        feature_extractor: object exposing ``extract_features(pred_word, arg_word,
            postag, morph, lemma, syntax_dep_tree)``.
        ling_cache: mapping from example id to its annotation dict; the keys
            ``'tokens'``, ``'sentences'``, ``'postag'``, ``'morph'``, ``'lemma'``
            and ``'syntax_dep_tree'`` are read.
        ex_id: id of the example, used as key into ``ling_cache``.
        pred: ``(sentence index, word index)`` of the predicate in ``example``.
        args: iterable of ``(sentence index, word index)`` argument addresses.
        example: sentences of the example; argument words carry ``'rolepred1'``.

    Returns:
        List of ``(features, role, ex_id, arg)`` tuples for the arguments that
        could be processed.

    Side effects:
        Skipped arguments are logged in the global ``error_examples``; arguments
        located in a different sentence than the predicate increment the global
        ``num_of_errors`` and print a message.
    """
    global num_of_errors

    feature_sets = list()

    # Only the offset index is needed here; the reconstructed text is unused.
    _, offset_index = make_text(example, 0)
    ling_ann = ling_cache[ex_id]

    pred_offset = offset_index[(pred[0], pred[1])]
    pred_ling_sent, pred_ling_word = find_address_by_offset(pred_offset, ling_ann)

    # Per-sentence annotation layers; they do not depend on the argument, so
    # they are measured once instead of once per argument.
    lens = {
        'len_postags' : len(ling_ann['postag']),
        'len_morph' : len(ling_ann['morph']),
        'len_lemma' : len(ling_ann['lemma']),
        'len_syntax' : len(ling_ann['syntax_dep_tree'])
    }
    layers_consistent = len(set(lens.values())) == 1
    n_sentences = min(lens.values())

    for arg in args:
        arg_offset = offset_index[(arg[0], arg[1])]
        arg_ling_sent, arg_ling_word = find_address_by_offset(arg_offset, ling_ann)

        # A sentence index equal to the layer length is already out of range,
        # hence `>=` (the previous `>` let that case through to the extractor).
        if not layers_consistent or arg_ling_sent >= n_sentences:
            _record_error_example(ex_id, lens, arg_ling_sent, "length mismatch")
            continue

        role = example[arg[0]][arg[1]]['rolepred1']

        if arg_ling_sent != pred_ling_sent:
            num_of_errors += 1
            # We miss some examples due to mistakes in framebank or discrepancy in
            # automatic annotation of sentences.
            print('Error #{}'.format(num_of_errors))
            continue

        try:
            features = feature_extractor.extract_features(pred_ling_word,
                                                          arg_ling_word,
                                                          ling_ann['postag'][arg_ling_sent],
                                                          ling_ann['morph'][arg_ling_sent],
                                                          ling_ann['lemma'][arg_ling_sent],
                                                          ling_ann['syntax_dep_tree'][arg_ling_sent])
        except Exception as e:
            # Best effort: a failing argument is logged and skipped so that one
            # bad annotation does not abort the whole corpus pass.
            _record_error_example(ex_id, lens, arg_ling_sent, str(e))
            continue

        feature_sets.append((features, role, ex_id, arg))

    return feature_sets


def process_example(feature_extractor, ling_cache, ex_id, sentences):
    """Locate the predicate and argument words of one example and extract features.

    A word whose ``'rank'`` equals ``'Предикат'`` is the predicate (the last such
    word wins); any other word that has a ``'rolepred1'`` key is an argument.
    The addresses are handed to ``process_arg_pred`` and its result is returned.
    """
    predicate_address = None
    argument_addresses = []

    for sentence_idx, sentence in enumerate(sentences):
        for word_idx, token in enumerate(sentence):
            address = (sentence_idx, word_idx)
            is_predicate = 'rank' in token and token['rank'] == 'Предикат'
            if is_predicate:
                predicate_address = address
            elif 'rolepred1' in token:
                argument_addresses.append(address)

    return process_arg_pred(feature_extractor, ling_cache, ex_id,
                            predicate_address, argument_addresses, sentences)


num_of_errors = 0
def prepare_train_data(examples, ling_data_cache, feature_extractor):
    """Run ``process_example`` over all examples and concatenate the feature sets.

    Args:
        examples: sequence of ``(example id, example)`` pairs.
        ling_data_cache: mapping passed through to ``process_example``.
        feature_extractor: feature model passed through to ``process_example``.

    Returns:
        One flat list with the feature sets of every example. Progress is
        printed as a percentage every 100 examples, and the final count at the end.
    """
    collected = []
    total = len(examples)

    for position, (ex_id, sentences) in enumerate(examples):
        if position % 100 == 0:
            print('{0:.2f}%'.format((position / total) * 100.))

        collected.extend(process_example(feature_extractor, ling_data_cache, ex_id, sentences))

    print('Number of training examples:', len(collected))
    return collected

In [12]:
main_model_path_root = '../../data/models_new/'

In [13]:
#!!!: Choose feature model here
from isanlp_srl_framebank.processor_srl_framebank import FeatureModelDefault
feature_model = FeatureModelDefault()
main_model_path = os.path.join(main_model_path_root, 'known_preds')

# from isanlp_srl_framebank.processor_srl_framebank import FeatureModelUnknownPredicates
# feature_model = FeatureModelUnknownPredicates()
# main_model_path = os.path.join(main_model_path_root, 'unknown_preds')

#with open(os.path.join(main_model_path, 'feature_model.pckl'), 'wb') as f:
#    pickle.dump(feature_model, f)

In [14]:
feature_sets = prepare_train_data(examples, ling_data_cache, feature_model)

# Flatten every (features, role, ex_id, arg_address) tuple into one flat record:
# the nested feature dicts are merged into the record, None entries are skipped.
data_for_pandas = []
for features, role, ex_id, arg_address in feature_sets:
    record = {'role': role, 'ex_id': ex_id, 'arg_address': arg_address}
    for feature_group in features:
        for feature_dict in feature_group:
            if feature_dict is not None:
                record.update(feature_dict)
    data_for_pandas.append(record)

# Build the frame and shuffle its rows (sample(frac=1) returns every row once).
pd_data = pd.DataFrame(data_for_pandas).sample(frac=1)
pd_data[:10]
del data_for_pandas

0.00%
Error #1
0.31%
0.61%
0.92%
1.23%
1.53%
1.84%
2.15%
2.45%
2.76%
3.07%
3.37%
3.68%
3.99%
4.29%
4.60%
4.91%
5.21%
5.52%
5.83%
6.13%
Error #2
Error #3
Error #4
6.44%
6.75%
7.05%
7.36%
7.67%
7.97%
8.28%
8.59%
8.89%
9.20%
9.51%
Error #5
9.81%
Error #6
10.12%
10.43%
Error #7
Error #8
10.73%
Error #9
11.04%
11.35%
11.65%
11.96%
12.27%
12.57%
12.88%
13.19%
13.49%
13.80%
14.11%
Error #10
Error #11
Error #12
Error #13
Error #14
Error #15
Error #16
Error #17
14.41%
Error #18
14.72%
15.03%
15.33%
15.64%
15.95%
Error #19
16.25%
16.56%
16.86%
17.17%
17.48%
17.78%
18.09%
18.40%
18.70%
19.01%
19.32%
19.62%
19.93%
20.24%
20.54%
20.85%
Error #20
21.16%
21.46%
21.77%
22.08%
22.38%
22.69%
23.00%
23.30%
23.61%
23.92%
24.22%
24.53%
24.84%
25.14%
25.45%
25.76%
26.06%
26.37%
26.68%
26.98%
27.29%
27.60%
27.90%
Error #21
Error #22
Error #23
Error #24
Error #25
Error #26
28.21%
28.52%
28.82%
29.13%
29.44%
29.74%
30.05%
Error #27
Error #28
30.36%
Error #29
Error #30
30.66%
30.97%
31.28%
31.58%
31.89%
32.20%


# Preprocessing

In [15]:
y_stat = pd_data.loc[:, 'role'].value_counts()
drop_ys = y_stat[y_stat < 180].index
clear_data = pd_data.drop(pd_data[pd_data.loc[:, 'role'].isin(drop_ys)].index)

In [16]:
repl_roles = {
    'агенс - субъект восприятия' : 'субъект восприятия',
    'агенс - субъект ментального состояния' : 'субъект ментального состояния',
    'результат / цель' : 'результат',
    'место - пациенс' : 'место',
    'говорящий - субъект психологического состояния' : 'субъект психологического состояния'
}


def normalize_single_region(data, rep, val):
    """Replace ``rep`` with ``val`` inside the ``'role'`` column of ``data``, in place.

    The replacement goes through ``Series.str.replace``, so it also applies when
    ``rep`` is only a part of a role name. Nothing is returned.
    """
    normalized_roles = data.loc[:, 'role'].str.replace(rep, val)
    data.loc[:, 'role'] = normalized_roles


for rep, val in repl_roles.items():
    normalize_single_region(clear_data, rep, val)
    
number_of_roles = len(clear_data.loc[:, 'role'].value_counts().index)
print('Number of roles: ', number_of_roles)
clear_data.loc[:, 'role'].value_counts()

Number of roles:  34


агенс                                 3775
пациенс                               3309
тема                                  2436
субъект перемещения                   1886
субъект психологического состояния    1723
причина                               1572
место                                 1472
говорящий                             1159
содержание действия                   1136
конечная точка                        1130
содержание мысли                      1105
содержание высказывания                973
пациенс перемещения                    867
стимул                                 790
результат                              775
субъект ментального состояния          601
адресат                                531
контрагент                             499
эффектор                               492
субъект восприятия                     431
предмет высказывания                   382
субъект социального отношения          376
начальная точка                        366
способ     

In [17]:
y_orig = clear_data.loc[:, 'role']
X_orig = clear_data.drop('role', axis = 1)
X_orig.shape

(30371, 18)

In [21]:
from sklearn.preprocessing import LabelBinarizer
import pickle

label_encoder = LabelBinarizer()
y = label_encoder.fit_transform(y_orig)

with open(main_model_path + '/label_encoder.pckl', 'wb') as f:
    pickle.dump(label_encoder, f)

## Embedding

In [23]:
from gensim.models import KeyedVectors

embeddings_path = '../../data/embeddings/ruscorpora_upos_skipgram_300_5_2018.vec'
embeddings = KeyedVectors.load_word2vec_format(embeddings_path, binary=False)
print('Embedding size: ', embeddings.vector_size)

Embedding size:  300


In [24]:
import multiprocessing as mp


def make_embeded_form(word):
    """Format an indexable ``word`` as ``"<word[1]>_<word[0]>"``.

    Falsy values (``None``, empty string, empty tuple, ...) are returned unchanged.
    """
    if not word:
        return word

    first, second = word[0], word[1]
    return u"{}_{}".format(second, first)


class Embedder_map:
    """Callable that turns one row of a context column into a 2-D embedding array.

    Instances are handed to ``multiprocessing.Pool.map``, so all state lives in
    plain attributes.

    Attributes are set from the constructor arguments:
        embeddings: object supporting ``in``, item lookup and ``vector_size``.
        X: indexable collection of rows; each row is a sequence of words.
           NOTE(review): rows are assumed to all have the same length, as the
           output is sized from the first row -- confirm against the data.
    """

    def __init__(self, embeddings, X):
        self.X_ = X
        self.embeddings_ = embeddings

    def __call__(self, i):
        """Return a ``(context length, vector_size)`` array for the row with key ``i``.

        Positions holding the special tag are filled with ones, known words with
        their embedding vector, everything else stays zero.
        """
        # Size the output from the first row *by position*: after shuffling or
        # filtering, the label 0 is not guaranteed to exist in a pandas index.
        first_row = self.X_.iloc[0] if hasattr(self.X_, 'iloc') else self.X_[0]
        context_length = len(first_row)
        vector_size = self.embeddings_.vector_size
        result = np.zeros((context_length, vector_size))

        for j in range(context_length):
            word = self.X_[i][j]
            tag = word[0] if word else str()

            # NOTE(review): the original condition tested ARG_SPECIAL_TAG twice;
            # the second operand was presumably meant to be a predicate tag.
            # ARG_SPECIAL_TAG is not defined in the visible part of the notebook.
            if tag == ARG_SPECIAL_TAG:
                result[j, :] = np.ones(vector_size)
            # Look words up in the instance's own embeddings rather than in the
            # module-level `embeddings`, so the object works with any model.
            elif word and word in self.embeddings_:
                result[j, :] = self.embeddings_[word]

        return result


def embed(X):
    """Embed every row of ``X`` in parallel and stack the results into one array.

    Uses 4 worker processes and a chunk size of 1000, reading the module-level
    ``embeddings``. ``X`` must expose ``.index`` with the keys passed to the mapper.

    Returns:
        ``numpy`` array built from the per-row results, in ``X.index`` order.
    """
    pool = mp.Pool(4)
    try:
        result = pool.map(Embedder_map(embeddings, X), X.index, 1000)
    finally:
        # Always release the workers, including when a worker raises; join()
        # waits for them to exit so no processes are left behind.
        pool.close()
        pool.join()
    return np.asarray(result)


In [None]:
%%time

arg_context_embedded = embed(X_orig.loc[:, 'arg_context_lemmas'])

In [None]:
%%time

pred_context_embedded = embed(X_orig.loc[:, 'pred_context_lemmas'])

In [27]:
class Embedder_single_map:
    """Callable that looks up the embedding of a single word per row.

    ``X`` maps a row key to a word; ``embeddings`` supports ``in``, item lookup
    and ``vector_size``.
    """

    def __init__(self, embeddings, X):
        self.embeddings_ = embeddings
        self.X_ = X

    def __call__(self, i):
        """Return the vector of the word stored under key ``i``, or zeros if unknown."""
        token = self.X_[i]
        if token not in self.embeddings_:
            return np.zeros((self.embeddings_.vector_size,))
        return self.embeddings_[token]

        
def embed_single(embeddings, X):
    """Embed one word per row of ``X`` in parallel and stack the vectors.

    Uses 4 worker processes and a chunk size of 1000. ``X`` must expose
    ``.index`` with the keys passed to ``Embedder_single_map``.

    Returns:
        ``numpy`` array with one embedding vector per row, in ``X.index`` order.
    """
    pool = mp.Pool(4)
    try:
        result = pool.map(Embedder_single_map(embeddings, X), X.index, 1000)
    finally:
        # Always release the workers, including when a worker raises; join()
        # waits for them to exit so no processes are left behind.
        pool.close()
        pool.join()

    return np.asarray(result)

In [28]:
%%time

# embedded_verbs = embed_single(pd.Series(list(zip(X_orig.pred_pos, X_orig.pred_lemma)), 
#                                         index = X_orig.index))
embedded_verbs = embed_single(embeddings, X_orig.pred_lemma)
print(embedded_verbs.shape)
print((np.linalg.norm(embedded_verbs, axis = 1) < 0.001).sum())
print(clear_data[(np.linalg.norm(embedded_verbs, axis = 1) < 0.001)].pred_lemma.value_counts().shape)

(30371, 300)
362
(41,)
CPU times: user 23.2 s, sys: 14.8 s, total: 38 s
Wall time: 45.7 s


In [29]:
%%time

# embedded_args = embed_single(pd.Series(list(zip(X_orig.arg_pos, X_orig.arg_lemma)), 
#                                        index = X_orig.index))
embedded_args = embed_single(embeddings, X_orig.arg_lemma)
print(embedded_args.shape)
print((np.linalg.norm(embedded_args, axis = 1) < 0.001).sum())

(30371, 300)
10342
CPU times: user 21.9 s, sys: 14.7 s, total: 36.7 s
Wall time: 42.9 s


## Vectorizing categorical features

In [30]:
X_orig.columns

Index(['Animacy_arg', 'Aspect_arg', 'Gender_arg', 'Number_arg', 'Tense_arg',
       'Valency_arg', 'VerbForm_arg', 'arg_address', 'arg_case', 'arg_lemma',
       'arg_pos', 'dist', 'ex_id', 'pred_lemma', 'pred_pos', 'prepos',
       'rel_pos', 'syn_link_name'],
      dtype='object')

In [31]:
from sklearn.feature_extraction import DictVectorizer

#morph_feats = ['pos', 'case', 'anim', 'vform', 'zform', 'shform', 'pform', 'vvform', 'nform', 'time']

# all_feats = (['pred_lemma', 'rel_pos'] + 
#              ['arg_' + e for e in morph_feats] + 
#              ['pred_' + e for e in morph_feats])

# all_feats = (['pred_lemma', 'rel_pos', 'arg_prep'] + 
#              ['arg_' + e for e in morph_feats] + 
#              ['pred_' + e for e in morph_feats])

# all_feats = (['pred_lemma', 'rel_pos', 'arg_prep', 'link_name'] + 
#              ['arg_' + e for e in morph_feats] + 
#              ['pred_' + e for e in morph_feats])

#all_feats = ['pred_lemma', 'rel_pos', 'pred_pos', 'arg_case', 'syn_link_name', 'arg_pos', 'prepos', 'dist']

#categ_feats = [e for e in all_feats if X_orig[e].dtype in [str, object]]
#not_categ = [e for e in all_feats if e not in categ_feats]

#pred_lemma_vectorizer.fit_transform(X_orig.loc[:, ['pred_lemma']].to_dict(orient = 'records'))

not_categ_features = {'arg_address', 'ex_id', 'rel_pos', 'arg_lemma'}
categ_feats = [name for name in X_orig.columns if name not in not_categ_features] 
not_categ = ['rel_pos']
print('Category features:\n', categ_feats)
print('Not category features:\n', not_categ)

vectorizer = DictVectorizer(sparse = False)
one_hot_feats = vectorizer.fit_transform(X_orig.loc[:, categ_feats].to_dict(orient = 'records'))
print(one_hot_feats.shape)

with open(main_model_path + '/feature_encoder.pckl', 'wb') as f:
    pickle.dump(vectorizer, f)

Category features:
 ['Animacy_arg', 'Aspect_arg', 'Gender_arg', 'Number_arg', 'Tense_arg', 'Valency_arg', 'VerbForm_arg', 'arg_case', 'arg_pos', 'dist', 'pred_lemma', 'pred_pos', 'prepos', 'syn_link_name']
Not category features:
 ['rel_pos']
(30371, 941)


In [32]:
# Stack the non one-hot columns side by side, one column per feature.
# `.values` replaces the deprecated `Series.as_matrix()` (removed in pandas 1.0).
not_categ_columns = np.concatenate(
    tuple(X_orig.loc[:, e].values.reshape(-1, 1) for e in not_categ), axis=1)
plain_features = np.concatenate((one_hot_feats, not_categ_columns), axis=1)
plain_features.shape

  """Entry point for launching an IPython kernel.


(30371, 942)

In [33]:
del not_categ_columns
del one_hot_feats

# Model construction

In [34]:
from tensorflow.python.keras.models import Sequential
from tensorflow.python.keras.layers import Dense, LSTM, Convolution1D, Dropout, MaxPooling1D
from tensorflow.python.keras.preprocessing import sequence
from tensorflow.python.keras.layers import Flatten
from tensorflow.python.keras.layers import Input
from tensorflow.python.keras.layers import TimeDistributed
from tensorflow.python.keras.layers import Activation
from tensorflow.python.keras.layers import RepeatVector
from tensorflow.python.keras.layers import Permute
from tensorflow.python.keras.layers import Lambda
from tensorflow.python.keras.models import Model
from tensorflow.python.keras.callbacks import EarlyStopping
from tensorflow.python.keras.layers import BatchNormalization
from tensorflow.python.keras.layers import Concatenate
from tensorflow.python.keras.layers import Bidirectional
from tensorflow.python.keras.layers import Masking
from gensim.models import Word2Vec

In [35]:
def construct_simple_model():
    """Build and compile a Conv1D -> LSTM -> Dense softmax classifier.

    Reads the module-level names ``seq_embeded``, ``embeddings`` and
    ``number_of_roles`` and calls ``get_embeddings_length``.

    NOTE(review): ``seq_embeded`` and ``get_embeddings_length`` are not defined
    in the visible part of this notebook -- confirm where they come from.
    NOTE(review): ``nb_filter`` / ``filter_length`` / ``border_mode`` look like
    Keras 1 argument names; confirm the imported ``Convolution1D`` accepts them.

    Returns:
        The compiled ``Sequential`` model; its summary is printed as a side effect.
    """
    model = Sequential()
    model.add(Convolution1D(nb_filter=128, 
                            filter_length=2, 
                            border_mode='same', 
                            activation='relu', 
                            input_shape = (seq_embeded.shape[1], 
                                           get_embeddings_length(embeddings))))

    #model.add(MaxPooling1D(pool_length=2))
    model.add(LSTM(80))
    model.add(Dropout(0.1))
    model.add(Dense(60, activation='tanh'))
    # One softmax output unit per role.
    model.add(Dense(number_of_roles, activation='softmax'))
    model.compile(loss='categorical_crossentropy', 
                  optimizer='adam',
                  metrics=['accuracy'])
    print(model.summary())
    
    return model

In [36]:
def construct_simple_attentional_model():
    """Build and compile a Conv1D -> LSTM model with attention over time steps.

    Reads the module-level names ``arg_context_embedded``, ``embeddings`` and
    ``number_of_roles`` and calls ``get_embeddings_length``.

    NOTE(review): ``merge`` and ``get_embeddings_length`` are not imported or
    defined in the visible part of this notebook -- confirm they exist.
    NOTE(review): ``nb_filter`` / ``filter_length`` / ``border_mode`` and
    ``Model(input=..., output=...)`` look like Keras 1 spellings; confirm the
    imported classes accept them.

    Returns:
        The compiled functional ``Model``.
    """
    units = 80
    _input = Input(shape = (arg_context_embedded.shape[1], 
                            get_embeddings_length(embeddings)), dtype = 'float')

    conv = Convolution1D(nb_filter=128, 
                        filter_length=2, 
                        border_mode='same', 
                        activation='relu')(_input)

    # Keep the LSTM output of every time step so attention can weight them.
    activations = LSTM(units, return_sequences=True)(conv)

    # compute importance for each step
    attention = TimeDistributed(Dense(1, activation='tanh'))(activations) 
    #attention = Dense(6, activation='tanh')(activations) 
    attention = Flatten()(attention)
    attention = Activation('softmax')(attention)
    # Repeat the per-step weights `units` times and swap the axes so they can
    # be multiplied element-wise with `activations`.
    attention = RepeatVector(units)(attention)
    attention = Permute([2, 1])(attention)

    # apply the attention
    sent_representation = merge([activations, attention], mode='mul')
    # Sum the weighted activations over axis 1.
    sent_representation = Lambda(lambda xin: K.sum(xin, axis=1))(sent_representation)

    #dn = Dense(100, activation = 'tanh')(sent_representation)
    #probabilities = Dense(number_of_roles, activation='softmax')(dn)
    probabilities = Dense(number_of_roles, activation='softmax')(sent_representation)

    model = Model(input=_input, output=probabilities)
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    return model

In [37]:
def construct_graph_bidirectional_model():
    """Build and compile a two-branch classifier: context BiLSTM + plain features.

    One branch applies a convolution and a bidirectional LSTM to the argument
    context, the other a dense layer to the plain features; both are
    concatenated and fed to a dense softmax head.

    Reads the module-level names ``arg_context_embedded``, ``plain_features``,
    ``embeddings`` and ``number_of_roles`` and calls ``get_embeddings_length``.

    NOTE(review): ``Merge`` and ``get_embeddings_length`` are not imported or
    defined in the visible part of this notebook -- confirm they exist.
    NOTE(review): ``nb_filter`` / ``filter_length`` / ``border_mode`` look like
    Keras 1 argument names; confirm the imported ``Convolution1D`` accepts them.

    Returns:
        The compiled ``Sequential`` model.
    """
    print('Bidirectional model.')
    
    arg_context_model = Sequential()
    arg_context_model.add(Convolution1D(nb_filter=150, 
                                        filter_length=2, 
                                        border_mode='same', 
                                        activation='relu',
                                        input_shape = (arg_context_embedded.shape[1], 
                                                       get_embeddings_length(embeddings))))
    arg_context_model.add(Bidirectional(LSTM(100), merge_mode = 'sum'))
    
    ###############################
    
    plain_model = Sequential()
    plain_model.add(Dense(700, 
                          input_shape=(plain_features.shape[1],), 
                          activation = 'relu'))
    
    ###############################
    
    final = Sequential()
    # Inputs follow the order of the merged sub-models: context first, then plain.
    final.add(Merge([arg_context_model, plain_model], mode = 'concat', concat_axis=1))
    final.add(Dropout(0.3))
    
    #final.add(Dense(300, activation = 'relu'))
    final.add(Dense(300))
    final.add(BatchNormalization())
    final.add(Activation('relu'))
    final.add(Dropout(0.3))
    
    final.add(Dense(number_of_roles))
    final.add(BatchNormalization())
    final.add(Activation('softmax'))
    
    final.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    
    return final

In [38]:
def construct_graph_attentional_model():
    """Build and compile a three-branch classifier with attentional context branches.

    The argument context and the predicate context each go through their own
    Conv1D -> LSTM -> attention sub-model; the plain features go through a dense
    layer. The three outputs are concatenated and fed to a dense softmax head.

    Reads the module-level names ``arg_context_embedded``,
    ``pred_context_embedded``, ``plain_features``, ``embeddings`` and
    ``number_of_roles`` and calls ``get_embeddings_length``.

    NOTE(review): ``merge``, ``Merge`` and ``get_embeddings_length`` are not
    imported or defined in the visible part of this notebook -- confirm they exist.
    NOTE(review): ``nb_filter`` / ``filter_length`` / ``border_mode`` and
    ``Model(input=..., output=...)`` look like Keras 1 spellings; confirm the
    imported classes accept them.

    Returns:
        The compiled ``Sequential`` model.
    """
    print('Context attentional model.')
    
    def construct_attentional_part(context_length):
        """Return a sub-model mapping a context of ``context_length`` steps to one vector."""
        _input = Input(shape = (context_length, 
                                get_embeddings_length(embeddings)), dtype = 'float')

        conv = Convolution1D(nb_filter=200, 
                            filter_length=2, 
                            border_mode='same', 
                            activation='relu')(_input)

        units = 100
        activations = LSTM(units, return_sequences=True)(conv)

        # compute importance for each step
        attention = TimeDistributed(Dense(1, activation='tanh'))(activations)  
        attention = Flatten()(attention)
        attention = Activation('softmax')(attention)
        attention = RepeatVector(units)(attention)
        attention = Permute([2, 1])(attention)

        # apply the attention
        seq_repr = merge([activations, attention], mode='mul')
        seq_repr = Lambda(lambda xin: K.sum(xin, axis=1))(seq_repr)
        seq_model = Model(input=_input, output=seq_repr)
        
        return seq_model
    
    arg_context_model = construct_attentional_part(arg_context_embedded.shape[1])
    pred_context_model = construct_attentional_part(pred_context_embedded.shape[1])
    
    ###############################
    
    plain_model = Sequential()
    plain_model.add(Dense(800, 
                          input_shape=(plain_features.shape[1],), 
                          activation = 'relu'))
    
    
    ###############################
    
    final = Sequential()
    # Inputs follow the merged order: argument context, predicate context, plain.
    final.add(Merge([arg_context_model, pred_context_model, plain_model], 
                    mode = 'concat', concat_axis=1))
    final.add(Dropout(0.3))
    
    #final.add(Dense(300, activation = 'relu'))
    final.add(Dense(400))
    final.add(BatchNormalization())
    final.add(Activation('relu'))
    final.add(Dropout(0.3))
    
    final.add(Dense(number_of_roles))
    final.add(BatchNormalization())
    final.add(Activation('softmax'))
    #final.add(Dense(number_of_roles, activation = 'softmax'))
#    final.add(BatchNormalization())
    #final.add(Activation('softmax'), W_regularizer=l2(0.01))
    #final.add(Dense(number_of_roles, activation='softmax', W_regularizer = l2(0.01)))
    
    final.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    return final

In [39]:
def construct_graph_lstm_model(plain_features_shape):
    """Build and compile a classifier over word embeddings, plain features and predicate context.

    Three dense sub-models (argument embedding, predicate embedding, plain
    features) are concatenated into ``final1``; its output is concatenated with
    a Conv1D -> BiLSTM sub-model over the predicate context and fed to a
    softmax head. The argument-context branch is commented out.

    Reads the module-level names ``pred_context_embedded``, ``embeddings`` and
    ``number_of_roles`` and calls ``get_embeddings_length``.

    NOTE(review): ``Merge`` and ``get_embeddings_length`` are not imported or
    defined in the visible part of this notebook -- confirm they exist.
    NOTE(review): ``nb_filter`` / ``filter_length`` / ``border_mode`` look like
    Keras 1 argument names; confirm the imported ``Convolution1D`` accepts them.

    Args:
        plain_features_shape: ``input_shape`` tuple of the plain-feature branch.

    Returns:
        The compiled ``Sequential`` model.
    """
    print('Context model.')
    
    def create_embed_model():
        """Return a Dense(100) -> BatchNormalization -> ReLU sub-model for one embedding."""
        embed_model = Sequential()
        embed_model.add(Dense(100, input_shape = (get_embeddings_length(embeddings), )))
        embed_model.add(BatchNormalization())
        embed_model.add(Activation('relu'))
        return embed_model
    
    def construct_attentional_part(context_length):
        """Return a Conv1D -> BiLSTM -> Dense sub-model for a context of ``context_length`` steps.

        Despite its name, this variant contains no attention layers.
        """
        seq_model = Sequential()
        seq_model.add(Convolution1D(nb_filter=50, 
                                    filter_length=1, 
                                    border_mode='same', 
                                    activation='relu',
                                    input_shape = (context_length, 
                                                   get_embeddings_length(embeddings))))
#         seq_model.add(Masking(mask_value=0., input_shape = (context_length, 
#                                                             get_embeddings_length(embeddings))))
        #seq_model.add(Masking(mask_value=1.))
        seq_model.add(Bidirectional(LSTM(50), merge_mode='sum'))
        #seq_model.add(LSTM(100))
        seq_model.add(Dense(50))
        seq_model.add(BatchNormalization())
        seq_model.add(Activation('relu'))
        
        return seq_model
    
    ###############################
    
    #arg_context_model = construct_attentional_part(arg_context_embedded.shape[1])
    pred_context_model = construct_attentional_part(pred_context_embedded.shape[1])
    
    ###############################
    
    plain_model = Sequential()
    plain_model.add(Dense(400, input_shape = plain_features_shape))
    plain_model.add(BatchNormalization())
    plain_model.add(Activation('relu'))
    
    ###############################
    
    arg_embed_model = create_embed_model()
    pred_embed_model = create_embed_model()
    
    ###############################
    
    final1 = Sequential()
    # Merged order: argument embedding, predicate embedding, plain features.
    final1.add(Merge([
  #              arg_context_model, 
                     #pred_context_model,
                     arg_embed_model,
                     pred_embed_model,
                     plain_model], 
                    mode = 'concat', concat_axis=1))
    final1.add(Dropout(0.3))
    
    final1.add(Dense(400))
    final1.add(BatchNormalization())
    final1.add(Activation('relu'))
    final1.add(Dropout(0.3))
    
    final = Sequential()
    # The predicate-context branch is appended after the inputs of `final1`.
    final.add(Merge([final1, pred_context_model], mode = 'concat', concat_axis = 1))
    
    final.add(Dense(number_of_roles))
    final.add(BatchNormalization())
    final.add(Activation('softmax'))
    
    final.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    return final

In [40]:
def construct_plain_model(input_shape):
    """Build and compile a feed-forward softmax classifier over plain features.

    Args:
        input_shape: ``input_shape`` tuple of the first dense layer.

    Returns:
        The compiled ``Sequential`` model with ``number_of_roles`` outputs
        (``number_of_roles`` is read from module level).
    """
    print('Plain model.')

    # Layers are created in the same order in which they are stacked.
    layer_stack = [
        Dense(600, input_shape=input_shape, activation='relu'),
        Dropout(0.3),
        Dense(400),
        BatchNormalization(),
        Activation('relu'),
        Dropout(0.3),
        Dense(number_of_roles),
        BatchNormalization(),
        Activation('softmax'),
    ]

    classifier = Sequential(layer_stack)
    classifier.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

    return classifier

In [41]:
def construct_plain_model_sparse(categ_size, emb_size, number_of_roles):
    """Build and compile a classifier over plain features and two word embeddings.

    Args:
        categ_size: width of the plain (categorical) feature vector.
        emb_size: width of the predicate and argument embedding vectors.
        number_of_roles: number of softmax outputs.

    Returns:
        The compiled ``Model``. Its inputs are ordered
        ``[arg_embed, pred_embed, input_categorical]``.
    """
    def dense_bn_relu(tensor, units):
        """Apply Dense(units) -> BatchNormalization -> ReLU to ``tensor``."""
        hidden = Dense(units)(tensor)
        hidden = BatchNormalization()(hidden)
        return Activation('relu')(hidden)

    input_plain = Input(shape=(categ_size,), name='input_categorical')
    input_pred_embed = Input(shape=(emb_size,), name='pred_embed')
    input_arg_embed = Input(shape=(emb_size,), name='arg_embed')

    plain_branch = dense_bn_relu(input_plain, 400)
    pred_branch = dense_bn_relu(input_pred_embed, 100)
    arg_branch = dense_bn_relu(input_arg_embed, 100)

    merged = Concatenate(axis=1)([pred_branch, arg_branch, plain_branch])
    merged = Dropout(0.3)(merged)
    merged = dense_bn_relu(merged, 400)
    merged = Dropout(0.3)(merged)

    scores = Dense(number_of_roles)(merged)
    scores = BatchNormalization()(scores)
    probabilities = Activation('softmax')(scores)

    model = Model([input_arg_embed, input_pred_embed, input_plain], probabilities)
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

    return model

# Experiments

## Experiments with in-domain test

In [42]:
# Use only for experiments with in-domain test. Do not use for training

### Plain model

In [43]:
model = construct_plain_model((plain_features.shape[1],))
print(model.summary())
early_stopping = EarlyStopping(monitor='val_loss', min_delta=0, patience=2, verbose=0, mode='auto')
model.fit(plain_features, y, epochs=15, batch_size=300, validation_split = 0.1, 
          shuffle=True, callbacks = [early_stopping])

Plain model.
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 600)               565800    
_________________________________________________________________
dropout_1 (Dropout)          (None, 600)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 400)               240400    
_________________________________________________________________
batch_normalization_1 (Batch (None, 400)               1600      
_________________________________________________________________
activation_1 (Activation)    (None, 400)               0         
_________________________________________________________________
dropout_2 (Dropout)          (None, 400)               0         
_________________________________________________________________
dense_3 (Dense)              (None, 34)                13634   

<tensorflow.python.keras._impl.keras.callbacks.History at 0x7fe978b41668>

### Sparse

In [44]:
model = construct_plain_model_sparse(plain_features.shape[1], embeddings.vector_size, y.shape[1])
early_stopping = EarlyStopping(monitor='val_loss', min_delta=0, patience=2, verbose=0, mode='auto')
model.fit([embedded_args, embedded_verbs, plain_features], y, epochs=16, batch_size=300, 
          validation_split = 0.1, shuffle=True, callbacks = [early_stopping])
model.save(os.path.join(main_model_path, 'neural_model.h5'))

Train on 27333 samples, validate on 3038 samples
Epoch 1/16
Epoch 2/16
Epoch 3/16
Epoch 4/16
Epoch 5/16
Epoch 6/16
Epoch 7/16
Epoch 8/16
Epoch 9/16
Epoch 10/16
Epoch 11/16
Epoch 12/16
Epoch 13/16
Epoch 14/16
Epoch 15/16
Epoch 16/16


In [45]:
from tensorflow.python.keras.backend import clear_session
clear_session()

### Context

In [46]:
%%time

# Train the context model on the full data set with early stopping on val_loss.
# NOTE(review): construct_graph_lstm_model merges four sub-models (argument
# embedding, predicate embedding, plain features, predicate context -- the
# argument-context branch is commented out), while five arrays are passed
# below; confirm the intended input list and its order.
# NOTE(review): `arg_context_embedded` and `pred_context_embedded` are only
# assigned in cells without an execution count above, so this cell fails with
# NameError unless those cells are run first.
model = construct_graph_lstm_model((plain_features.shape[1],))
early_stopping = EarlyStopping(monitor='val_loss', min_delta=0, patience=2, verbose=0, mode='auto')
model.fit([arg_context_embedded, pred_context_embedded, embedded_args, embedded_verbs, plain_features], y, 
          epochs=15, batch_size=64, validation_split = 0.1, 
          shuffle=True, callbacks = [early_stopping])

Context model.


NameError: name 'pred_context_embedded' is not defined

## Experiments with out-of-domain test

In [None]:
# Use only for out of domain experiments. Do not use for training

In [47]:
def evaluate_out_of_domain(model, X_train, y_train, X_test, y_test):
    """Repeatedly fit ``model`` on the training data and evaluate it on the test data.

    Args:
        model: compiled Keras model.
        X_train, y_train: in-domain training inputs and one-hot targets.
        X_test, y_test: out-of-domain test inputs and one-hot targets.

    Returns:
        Array with one row per iteration, holding the values returned by
        ``evaluate_model``: Keras loss, Keras accuracy, micro-F1, macro-F1, accuracy.

    Note:
        The same ``model`` object is fitted in every iteration, so each round
        continues from the weights of the previous one.
    """
    final_res = list()
    N_ITERATIONS = 5
    # `range` instead of the Python 2 only `xrange`.
    for i in range(N_ITERATIONS):
        print('Eval iter:', i + 1, '/', N_ITERATIONS)
        early_stopping = EarlyStopping(monitor='val_loss', min_delta=0,
                                       patience=2, verbose=0, mode='auto')
        # `epochs` matches the spelling used by the other training cells.
        model.fit(X_train, y_train, epochs=15,
                  batch_size=64, validation_split=0.1,
                  shuffle=True, callbacks=[early_stopping],
                  verbose=0)

        ev_res = evaluate_model(model, X_test, y_test)
        print()
        # Model.evaluate reports the loss first and the compiled metrics after
        # it, so the loss column has to come before the accuracy column.
        print(pd.DataFrame([ev_res], columns=['keras_loss', 'keras_accur', 'f1_micro', 'f1_macro', 'accur']))
        final_res.append(ev_res)

    return np.array(final_res)

In [48]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score

def evaluate_model(model, X_test, y_test):
    """Score a fitted model with both Keras and scikit-learn metrics.

    Args:
        model: fitted model exposing `evaluate` and `predict`.
        X_test: inputs, passed unchanged to `model.evaluate` and `model.predict`.
        y_test: 2-D target array; the class index of each row is taken with
            `argmax(axis=1)` (presumably one-hot encoded - confirm).

    Returns:
        np.ndarray: the values returned by `model.evaluate`, followed by
        micro-averaged F1, macro-averaged F1 and accuracy.
    """
    keras_eval = model.evaluate(X_test, y_test)

    # Compute the class indices once instead of once per metric.
    y_true = y_test.argmax(axis=1)
    y_pred = model.predict(X_test).argmax(axis=1)

    # scikit-learn metrics take (y_true, y_pred), in that order.
    f1_micro = f1_score(y_true, y_pred, average='micro')
    f1_macro = f1_score(y_true, y_pred, average='macro')
    accur = accuracy_score(y_true, y_pred)

    return np.array(list(keras_eval) + [f1_micro, f1_macro, accur])

### Simple

In [49]:
# Out-of-domain experiment, plain model: fit on the in-domain (`ind_*`) arrays,
# then score on the out-of-domain (`ood_*`) arrays.
# NOTE(review): the recorded run failed with
# `NameError: name 'ind_plain_features' is not defined`; the `ind_*` / `ood_*`
# arrays have to be built by an earlier cell.
model = construct_plain_model((ind_plain_features.shape[1],))
early_stopping = EarlyStopping(monitor='val_loss', min_delta=0, patience=2, verbose=0, mode='auto')
# `epochs` is the current name of the deprecated `nb_epoch` argument.
model.fit(ind_plain_features, ind_y, epochs=15, batch_size=64, validation_split=0.1,
          shuffle=True, callbacks=[early_stopping])

ev_res = evaluate_model(model, [ood_plain_features], ood_y)
print()
# evaluate_model returns the Keras loss before the Keras accuracy.
print(pd.DataFrame([ev_res], columns=['keras_loss', 'keras_accur', 'f1_micro', 'f1_macro', 'accur']))

NameError: name 'ind_plain_features' is not defined

In [50]:
# Repeated out-of-domain evaluation of the plain model.
# NOTE(review): `describe_cv_result` is defined further down, in the
# "Evaluation" section; on a fresh kernel that cell has to run before this one.
model = construct_plain_model((ind_plain_features.shape[1],))
model_eval = evaluate_out_of_domain(model, ind_plain_features, ind_y, ood_plain_features, ood_y)
# describe_cv_result prints the raw result array itself before the mean/std
# tables, so it is not printed separately here.
describe_cv_result(model_eval)

NameError: name 'ind_plain_features' is not defined

### Complex

In [51]:
# Out-of-domain experiment, sparse model (argument embedding, predicate
# embedding and plain features as separate inputs).
# NOTE(review): here `construct_plain_model_sparse` receives a single shape
# tuple, while the "Sparse" cross-validation cell and the final training cell
# call it with three arguments (feature count, embedding size, class count).
# Confirm which signature is current.
model = construct_plain_model_sparse((ind_plain_features.shape[1],))
model.summary()
early_stopping = EarlyStopping(monitor='val_loss', min_delta=0, patience=2, verbose=0, mode='auto')
# `epochs` is the current name of the deprecated `nb_epoch` argument.
model.fit([ind_arg_embed, ind_pred_embed, ind_plain_features], ind_y, epochs=20, batch_size=64,
          validation_split=0.1, shuffle=True, callbacks=[early_stopping])

ev_res = evaluate_model(model, [ood_arg_embed, ood_pred_embed, ood_plain_features], ood_y)
print()
# evaluate_model returns the Keras loss before the Keras accuracy.
print(pd.DataFrame([ev_res], columns=['keras_loss', 'keras_accur', 'f1_micro', 'f1_macro', 'accur']))

NameError: name 'ind_plain_features' is not defined

In [52]:
# Repeated out-of-domain evaluation of the sparse model.
# NOTE(review): `construct_plain_model_sparse` is called with three arguments
# elsewhere in this notebook; confirm that the single-tuple form is still valid.
model = construct_plain_model_sparse((ind_plain_features.shape[1],))
model.summary()
model_eval = evaluate_out_of_domain(model,
                                    [ind_arg_embed, ind_pred_embed, ind_plain_features], ind_y,
                                    [ood_arg_embed, ood_pred_embed, ood_plain_features], ood_y)
# describe_cv_result prints the raw result array itself before the mean/std
# tables, so it is not printed separately here.
describe_cv_result(model_eval)

NameError: name 'ind_plain_features' is not defined

### Context

In [53]:
# Out-of-domain experiment, context model.
# NOTE(review): the inputs here are (arg embedding, predicate embedding, plain
# features, predicate context), whereas the in-domain context cells pass
# (arg context, predicate context, arg embedding, verb embedding, plain
# features). Confirm which layout `construct_graph_lstm_model` expects.
model = construct_graph_lstm_model((ind_plain_features.shape[1],))
model.summary()
early_stopping = EarlyStopping(monitor='val_loss', min_delta=0, patience=2, verbose=0, mode='auto')
# `epochs` is the current name of the deprecated `nb_epoch` argument.
model.fit([ind_arg_embed, ind_pred_embed, ind_plain_features, ind_pred_context],
          ind_y, epochs=6, batch_size=64, validation_split=0.1,
          shuffle=True, callbacks=[early_stopping])

model.evaluate([ood_arg_embed, ood_pred_embed, ood_plain_features, ood_pred_context], ood_y)

NameError: name 'ind_plain_features' is not defined

In [54]:
# Repeated out-of-domain evaluation of the context model.
model = construct_graph_lstm_model((ind_plain_features.shape[1],))
model.summary()
model_eval = evaluate_out_of_domain(model,
                                    [ind_arg_embed, ind_pred_embed, ind_plain_features, ind_pred_context], ind_y,
                                    [ood_arg_embed, ood_pred_embed, ood_plain_features, ood_pred_context], ood_y)
# describe_cv_result prints the raw result array itself before the mean/std
# tables, so it is not printed separately here.
describe_cv_result(model_eval)

NameError: name 'ind_plain_features' is not defined

# Evaluation

In [55]:
# Use only for model comparison. Do not use for training.

In [56]:
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score


def train_and_evaluate_model(model, X_train, y_train, X_test, y_test, *args, **kwargs):
    """Fit `model` on the training data and score it on the test data.

    Args:
        model: compiled model exposing `fit`, `evaluate` and `predict`; it is
            fitted in place.
        X_train: training inputs, passed unchanged to `model.fit`.
        y_train: training targets, passed unchanged to `model.fit`.
        X_test: inputs used for `model.evaluate` and `model.predict`.
        y_test: 2-D target array; the class index of each row is taken with
            `argmax(axis=1)` (presumably one-hot encoded - confirm).
        *args, **kwargs: forwarded unchanged to `model.fit`.

    Returns:
        list: the values returned by `model.evaluate`, followed by
        micro-averaged F1, macro-averaged F1 and accuracy.
    """
    model.fit(X_train, y_train, *args, **kwargs)

    keras_eval = model.evaluate(X_test, y_test)

    # Compute the class indices once instead of once per metric.
    y_true = y_test.argmax(axis=1)
    y_pred = model.predict(X_test).argmax(axis=1)

    # scikit-learn metrics take (y_true, y_pred), in that order.
    f1_micro = f1_score(y_true, y_pred, average='micro')
    f1_macro = f1_score(y_true, y_pred, average='macro')
    accur = accuracy_score(y_true, y_pred)

    return list(keras_eval) + [f1_micro, f1_macro, accur]
    

def custom_cross_val(cr_f, X, y, cv, *args, **kwargs):
    """Cross-validate freshly built models and collect the per-fold metrics.

    Args:
        cr_f: zero-argument factory returning a new model.
        X: sequence of input arrays; every array is indexed with the same
            train/test indices.
        y: target array, indexed with the same train/test indices.
        cv: splitter exposing `split(y)` and `n_splits`.
        *args, **kwargs: forwarded to `train_and_evaluate_model`, which hands
            them on to `model.fit`.

    Returns:
        np.ndarray with one row of metrics per fold.
    """
    # A throwaway model is built only to print the architecture once.
    cr_f().summary()

    fold_results = []
    for fold_no, (train_idx, test_idx) in enumerate(cv.split(y), start=1):
        fold_model = cr_f()
        print('Running Fold', fold_no, '/', cv.n_splits)

        fold_metrics = train_and_evaluate_model(
            fold_model,
            [features[train_idx] for features in X], y[train_idx],
            [features[test_idx] for features in X], y[test_idx],
            *args, **kwargs
        )

        print()
        print('Fold result: ', fold_metrics)
        fold_results.append(fold_metrics)

    return np.array(fold_results)


def describe_cv_result(cv_res):
    """Print the per-run metrics, then their column-wise mean and standard deviation.

    Args:
        cv_res: 2-D array with one row per run and five metric columns
            (loss, Keras accuracy, micro F1, macro F1, accuracy).
    """
    metric_names = ['loss', 'keras_accur', 'micro_f1', 'macro_f1', 'accur']

    print(cv_res)
    summaries = (
        ('Mean', cv_res.mean(axis=0)),
        ('Std', cv_res.std(axis=0)),
    )
    for title, values in summaries:
        print(title)
        print(pd.DataFrame([values], columns=metric_names))
    
    
from sklearn.model_selection import KFold
# `random_state` only has an effect when shuffling is enabled. Without
# `shuffle=True` older scikit-learn releases silently ignore it and newer ones
# reject the combination with a ValueError.
# NOTE(review): shuffling changes the fold composition compared with the
# previous contiguous folds, so earlier cross-validation numbers are not
# directly comparable.
cv = KFold(n_splits=5, shuffle=True, random_state=42)

## Simple

In [57]:
# Cross-validate the plain model on `no_lemma_plain_features` joined column-wise
# with `embedded_verbs`.
# NOTE(review): the recorded run failed with
# `NameError: name 'no_lemma_plain_features' is not defined`; that array has to
# be built by an earlier cell.
curr_features = np.concatenate([no_lemma_plain_features, embedded_verbs], axis=1)
cv_res = custom_cross_val(
    lambda: construct_plain_model((curr_features.shape[1],)),
    [curr_features],
    y,
    cv=cv,
    epochs=13,
    batch_size=64,
    validation_split=0.,
    shuffle=True,
    verbose=0,
)

describe_cv_result(cv_res)

NameError: name 'no_lemma_plain_features' is not defined

In [None]:
# Cross-validate the plain model on the plain features only.
cv_res = custom_cross_val(
    lambda: construct_plain_model((plain_features.shape[1],)),
    [plain_features],
    y,
    cv=cv,
    epochs=13,
    batch_size=64,
    validation_split=0.,
    shuffle=True,
    verbose=0,
)

describe_cv_result(cv_res)

Plain model.
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 600)               565800    
_________________________________________________________________
dropout_1 (Dropout)          (None, 600)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 400)               240400    
_________________________________________________________________
batch_normalization_1 (Batch (None, 400)               1600      
_________________________________________________________________
activation_1 (Activation)    (None, 400)               0         
_________________________________________________________________
dropout_2 (Dropout)          (None, 400)               0         
_________________________________________________________________
dense_3 (Dense)              (None, 34)                13634   

In [None]:
# Cross-validate the plain model on one flat matrix: argument embeddings, verb
# embeddings and plain features joined column-wise.
single_chunk = np.concatenate([embedded_args, embedded_verbs, plain_features], axis=1)
cv_res = custom_cross_val(
    lambda: construct_plain_model((single_chunk.shape[1],)),
    [single_chunk],
    y,
    cv=cv,
    epochs=13,
    batch_size=64,
    validation_split=0.,
    shuffle=True,
    verbose=0,
)

describe_cv_result(cv_res)

## Sparse

In [89]:
# Cross-validate the sparse model; embeddings and plain features are fed as
# three separate inputs.
# NOTE(review): this cell carries execution count 89, i.e. it was last run out
# of order relative to its neighbours.
cv_res = custom_cross_val(
    lambda: construct_plain_model_sparse(
        plain_features.shape[1],
        embeddings.vector_size,
        y.shape[1],
    ),
    [embedded_args, embedded_verbs, plain_features],
    y,
    cv=cv,
    epochs=13,
    batch_size=300,
    validation_split=0.,
    shuffle=True,
    verbose=0,
)

describe_cv_result(cv_res)

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
pred_embed (InputLayer)         (None, 300)          0                                            
__________________________________________________________________________________________________
arg_embed (InputLayer)          (None, 300)          0                                            
__________________________________________________________________________________________________
input_categorical (InputLayer)  (None, 990)          0                                            
__________________________________________________________________________________________________
dense_92 (Dense)                (None, 100)          30100       pred_embed[0][0]                 
__________________________________________________________________________________________________
dense_93 (

## Context

In [None]:
# Cross-validate the context model on context embeddings, argument/verb
# embeddings and plain features.
# NOTE(review): `pred_context_embedded` was reported as undefined by the
# in-domain context cell earlier in the notebook; it must exist before this
# cell can run.
cv_res = custom_cross_val(
    lambda: construct_graph_lstm_model((plain_features.shape[1],)),
    [
        arg_context_embedded,
        pred_context_embedded,
        embedded_args,
        embedded_verbs,
        plain_features,
    ],
    y,
    cv=cv,
    epochs=6,
    batch_size=64,
    validation_split=0.,
    shuffle=True,
)

describe_cv_result(cv_res)

# Training and predicting

In [None]:
# Training model and saving

In [74]:
from sklearn.model_selection import train_test_split

# Split the unique example ids 80/20 and keep each side as a set of plain
# Python values; the row selectors in the next cell are built from these sets.
train_ids, test_ids = (
    set(id_array.tolist())
    for id_array in train_test_split(X_orig.ex_id.unique(), test_size=0.2, random_state=42)
)

In [75]:
# Boolean row masks over X_orig: the pandas form (`*_selector_pd`) and the raw
# numpy form used to index the feature arrays.
train_selector_pd, test_selector_pd = (
    X_orig.ex_id.isin(id_set) for id_set in (train_ids, test_ids)
)
train_selector, test_selector = train_selector_pd.values, test_selector_pd.values

In [76]:
# Collect the raw corpus entries of each split, keyed by example id, and store
# them next to the model.
# NOTE(review): the values are the same objects as in `data` (no copy is made).
train_data = dict((k, data[k]) for k in train_ids)
test_data = dict((k, data[k]) for k in test_ids)

train_data_path = os.path.join(main_model_path, 'train_data.json')
with open(train_data_path, 'w') as f:
    json.dump(train_data, f)

test_data_path = os.path.join(main_model_path, 'test_data.json')
with open(test_data_path, 'w') as f:
    json.dump(test_data, f)

In [77]:
def select_from_nparray_list(nparray_list, selector):
    """Index every array in `nparray_list` with the same `selector`.

    Args:
        nparray_list: iterable of indexable arrays.
        selector: index object (for example a boolean mask) applied to each array.

    Returns:
        list with one selected array per input array, in the same order.
    """
    selected = []
    for array in nparray_list:
        selected.append(array[selector])
    return selected

# Train the sparse model on the training rows, score it on the held-out rows
# and save it.
# NOTE(review): the file name 'neural_model.h5' under `main_model_path` is also
# used by an earlier `model.save` call in this notebook, so this save
# overwrites that file.
model = construct_plain_model_sparse(
    plain_features.shape[1],
    embeddings.vector_size,
    y.shape[1],
)
model.fit(
    select_from_nparray_list([embedded_args, embedded_verbs, plain_features], train_selector),
    select_from_nparray_list([y], train_selector),
    epochs=10,
    batch_size=64,
    validation_split=0.1,
    shuffle=True,
)

model.evaluate(
    select_from_nparray_list([embedded_args, embedded_verbs, plain_features], test_selector),
    select_from_nparray_list([y], test_selector),
)
model.save(os.path.join(main_model_path, 'neural_model.h5'))

Train on 37893 samples, validate on 4211 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


[0.6926666886275118, 0.8026800037746532]

In [81]:
# Hold-out evaluation.

# Score the trained model on the held-out rows; the result is displayed as the
# cell output.
evaluate_model(
    model,
    select_from_nparray_list([embedded_args, embedded_verbs, plain_features], test_selector),
    select_from_nparray_list([y], test_selector)[0],
)



array([0.69266669, 0.80268   , 0.80268   , 0.77108236, 0.80268   ])

In [84]:
# Predict roles for the held-out rows and write gold and predicted roles back
# onto the corresponding tokens of the test examples.
pred = model.predict(select_from_nparray_list([embedded_args, embedded_verbs, plain_features], test_selector))
# Select the held-out targets once; doing this inside the loop repeated the
# full selection for every row.
y_test_selected = select_from_nparray_list([y], test_selector)[0]

# Pick the rows with the boolean mask directly, so they come out in the same
# positional order as the numpy selections above.
test_examples_to_store = X_orig.loc[test_selector_pd, ['arg_address', 'ex_id']]
# NOTE(review): the entries are the same objects as in `data`, so the token
# updates below also modify `data`.
test_data = {k : data[k] for k in test_ids}

# Predictions, targets and stored rows are matched purely by position.
assert len(pred) == len(y_test_selected) == len(test_examples_to_store)

for index, (_, row) in enumerate(test_examples_to_store.iterrows()):
    ex = test_data[row['ex_id']]
    # `arg_address` is indexed as (sentence position, token position) here.
    arg_addr = row['arg_address']
    sent = ex[arg_addr[0]]
    token = sent[arg_addr[1]]

    predicted_role = label_encoder.inverse_transform(np.array([pred[index]]))[0]
    actual_role = label_encoder.inverse_transform(np.array([y_test_selected[index]]))[0]

    # 'rolepred1' carries the gold role, 'rolepred2' the model prediction.
    token['rolepred1'] = actual_role
    token['rolepred2'] = predicted_role

In [85]:
# Store the test examples, including the 'rolepred1' / 'rolepred2' token fields
# added above, for the brat conversion step below.
with open('./test_data_ann_1.json', 'w') as f:
    json.dump(test_data, f)

# Brat conversion

In [None]:
# Converts results to brat annotation for inspecting.
# Needs framebank_preprocessing from http://nlp.isa.ru/framebank_parser/

In [None]:
# Run the converter script under Python 2.7 on the annotated test data and
# write the result to ./brat_ann2/.
!python2.7 ./framebank_preprocessing/convert_corpus_to_brat.py --inputFile=./test_data_ann_1.json --outputDir=./brat_ann2/

In [None]:
# Same conversion with `--converter=syn`, written to ./syntaxnet_1/.
!python2.7 ./framebank_preprocessing/convert_corpus_to_brat.py --inputFile=./test_data_ann_1.json --outputDir=./syntaxnet_1/ --converter=syn

In [54]:
# Variant that runs the converter copy in the current directory with
# PYTHONPATH set to the parent directory.
# NOTE(review): it writes to the same ./brat_ann2/ directory as the first
# conversion cell above.
!export PYTHONPATH=../ && python2.7 ./convert_corpus_to_brat.py --inputFile=./test_data_ann_1.json --outputDir=./brat_ann2/

2018-03-11 20:56:20 - Loading corpus data...
2018-03-11 20:56:22 - Done.
2018-03-11 20:56:22 - Creating verb-example index...
2018-03-11 20:56:22 - Done.
2018-03-11 20:56:22 - Converting and saving...
2018-03-11 20:56:23 - Done.
2018-03-11 20:56:23 - Generating brat configuration files...
2018-03-11 20:56:23 - Done.
