# Initialization

In [1]:
%load_ext autoreload
%autoreload 2

# Use only one GPU
import os
os.environ['CUDA_VISIBLE_DEVICES'] = '0,1'

import sys
sys.path.append('../')
sys.path.append('../../')
sys.path.append('../../isanlp/src/')
sys.path.append('../../src/isanlp_srl_framebank/')
sys.path.append('../../libs/')
sys.path.append('../../libs/pylingtools/')

In [2]:

# Supress tensorflow memory appetites

import tensorflow as tf

config = tf.ConfigProto()
config.gpu_options.allow_growth=True
config.log_device_placement=True
sess = tf.Session(config=config)

from tensorflow.python.keras import backend as K
K.set_session(sess)

In [3]:
from deeppavlov.models.embedders.elmo_embedder import ELMoEmbedder

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package perluniprops to /root/nltk_data...
[nltk_data]   Package perluniprops is already up-to-date!
[nltk_data] Downloading package nonbreaking_prefixes to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package nonbreaking_prefixes is already up-to-date!
W0516 08:00:38.633061 139896968771328 __init__.py:56] Some hub symbols are not available because TensorFlow version is less than 1.14


In [4]:
# Check available GPUs

from tensorflow.python.client import device_lib

def get_available_gpus():
    local_device_protos = device_lib.list_local_devices()
    return [x.name for x in local_device_protos if x.device_type == 'GPU']

get_available_gpus()

['/device:GPU:0', '/device:GPU:1']

In [5]:
device_lib.list_local_devices()

[name: "/device:CPU:0"
 device_type: "CPU"
 memory_limit: 268435456
 locality {
 }
 incarnation: 18336177674727777836, name: "/device:XLA_GPU:0"
 device_type: "XLA_GPU"
 memory_limit: 17179869184
 locality {
 }
 incarnation: 10953945507323820453
 physical_device_desc: "device: XLA_GPU device", name: "/device:XLA_CPU:0"
 device_type: "XLA_CPU"
 memory_limit: 17179869184
 locality {
 }
 incarnation: 1494602712782881369
 physical_device_desc: "device: XLA_CPU device", name: "/device:GPU:0"
 device_type: "GPU"
 memory_limit: 5534449664
 locality {
   bus_id: 1
   links {
   }
 }
 incarnation: 17497484290762556171
 physical_device_desc: "device: 0, name: Tesla K20Xm, pci bus id: 0000:03:00.0, compute capability: 3.5", name: "/device:GPU:1"
 device_type: "GPU"
 memory_limit: 3224567808
 locality {
   bus_id: 2
   numa_node: 1
   links {
   }
 }
 incarnation: 7068813728541990903
 physical_device_desc: "device: 1, name: Tesla K20Xm, pci bus id: 0000:84:00.0, compute capability: 3.5"]

In [6]:
import os
import time
import isanlp
import json
import pickle

import numpy as np
np.random.seed(31)

import pandas as pd

# Data loading

In [7]:
cleared_corpus_path = '../../data/cleared_corpus.json'

In [8]:
with open(cleared_corpus_path, 'r') as f:
    examples = json.load(f)

In [9]:
ling_data_path = '../../data/results_final_fixed.pckl'
with open(ling_data_path, 'rb') as f:
    ling_data = pickle.load(f)

ling_data_cache = {k: v for k,v in ling_data}

In [10]:
error_examples = {}

# Feature construction

In [11]:
from isanlp.annotation_repr import CSentence
from convert_corpus_to_brat import make_text


def find_address_by_offset(offset, ling_ann):
    for tok_num, tok in enumerate(ling_ann['tokens']):
        if tok.begin <= offset and offset < tok.end:
            break
    
    for sent_num, sent in enumerate(ling_ann['sentences']):
        if sent.begin <= tok_num and tok_num < sent.end:
            break
    
    return sent_num, tok_num - sent.begin


def process_arg_pred(feature_extractor, ling_cache, ex_id, pred, args, example):
    feature_sets = list()
    
    text, offset_index = make_text(example, 0)
    ling_ann = ling_cache[ex_id]
    
    pred_offset = offset_index[(pred[0], pred[1])]
    pred_ling_sent, pred_ling_word = find_address_by_offset(pred_offset, ling_ann)
    
    for arg in args:
        arg_offset = offset_index[(arg[0], arg[1])]
        arg_ling_sent, arg_ling_word = find_address_by_offset(arg_offset, ling_ann)
        
        fb_pred_word = example[pred[0]][pred[1]]
        fb_arg_word = example[arg[0]][arg[1]]
        
        role = fb_arg_word['rolepred1']

        if arg_ling_sent != pred_ling_sent:
            global num_of_errors
            num_of_errors += 1
            # We miss some examples due to mistakes in framebank or discrepancy in 
            # automatica annotation of sentences.
            print('Error #{}'.format(num_of_errors))
            continue
        features = feature_extractor.extract_features(pred_ling_word, 
                                                      arg_ling_word, 
                                                      ling_ann['postag'][arg_ling_sent],
                                                      ling_ann['morph'][arg_ling_sent],
                                                      ling_ann['lemma'][arg_ling_sent],
                                                      ling_ann['syntax_dep_tree'][arg_ling_sent])

                    
        feature_sets.append((features, role, ex_id, [tok.text for tok in ling_ann['tokens']], arg, pred, offset_index))
    
    return feature_sets


def process_example(feature_extractor, ling_cache, ex_id, sentences):
    pred = None
    args = list()
    for sent_num, sent in enumerate(sentences):
        for word_num, word in enumerate(sent):
            if 'rank' in word and word['rank'] == 'Предикат':
                pred = (sent_num, word_num)
            elif 'rolepred1' in word:
                args.append((sent_num, word_num))
    
    return process_arg_pred(feature_extractor, ling_cache, ex_id, pred, args, sentences)


num_of_errors = 0
def prepare_train_data(examples, ling_data_cache, feature_extractor):
    feature_sets = []
    for ex_num, (ex_id, ex) in enumerate(examples):    
        if ex_num % 100 == 0:
            print('{0:.2f}%'.format((ex_num / len(examples)) * 100.))
            
        feature_sets += process_example(feature_extractor, ling_data_cache, ex_id, ex)

    print('Number of training examples:', len(feature_sets))
    return feature_sets

In [12]:
main_model_path_root = '../../data/models_new/'

In [13]:
#!!!: Choose feature model here
from isanlp_srl_framebank.processor_srl_framebank import FeatureModelDefault
feature_model = FeatureModelDefault()
main_model_path = os.path.join(main_model_path_root, 'known_preds')

# from isanlp_srl_framebank.processor_srl_framebank import FeatureModelUnknownPredicates
# feature_model = FeatureModelUnknownPredicates()
# main_model_path = os.path.join(main_model_path_root, 'unknown_preds')

#with open(os.path.join(main_model_path, 'feature_model.pckl'), 'wb') as f:
#    pickle.dump(feature_model, f)

In [39]:
feature_sets = prepare_train_data(examples, ling_data_cache, feature_model)

data_for_pandas = []
for example in feature_sets:
    data_for_pandas_ex = {}
    data_for_pandas_ex['role'] = example[1]
    data_for_pandas_ex['ex_id'] = example[2]
    data_for_pandas_ex['tokens'] = example[3]
    idxmapping = {v:i for i,v in enumerate(example[6].keys())}
    data_for_pandas_ex['offsets'] = idxmapping
    data_for_pandas_ex['arg_address'] = idxmapping[example[4]]
    data_for_pandas_ex['pred_offset'] = idxmapping[example[5]]
    for elem in example[0]:
        for subelem in elem:
            if subelem is not None:
                data_for_pandas_ex.update(subelem)
    
    data_for_pandas.append(data_for_pandas_ex)
    
pd_data = pd.DataFrame(data_for_pandas)
pd_data = pd_data.sample(frac=1)
pd_data[:10]
del data_for_pandas

0.00%
Error #415
0.31%
Error #416
0.61%
0.92%
1.23%
1.53%
1.84%
2.15%
Error #417
2.45%
2.76%
3.07%
3.37%
3.68%
3.99%
4.29%
4.60%
4.91%
Error #418
Error #419
5.21%
5.52%
5.83%
Error #420
Error #421
Error #422
Error #423
Error #424
Error #425
Error #426
Error #427
Error #428
Error #429
Error #430
Error #431
Error #432
Error #433
Error #434
Error #435
Error #436
Error #437
Error #438
Error #439
Error #440
Error #441
Error #442
Error #443
Error #444
Error #445
Error #446
Error #447
Error #448
6.13%
Error #449
Error #450
Error #451
Error #452
Error #453
Error #454
Error #455
Error #456
Error #457
Error #458
Error #459
Error #460
Error #461
Error #462
Error #463
Error #464
Error #465
Error #466
Error #467
Error #468
Error #469
Error #470
Error #471
Error #472
Error #473
Error #474
Error #475
Error #476
Error #477
Error #478
Error #479
Error #480
Error #481
Error #482
Error #483
Error #484
Error #485
Error #486
Error #487
Error #488
Error #489
Error #490
Error #491
Error #492
Error #493
Error

In [43]:
print(pd_data.iloc[5])

Animacy_arg                                                   Anim
Aspect_arg                                                        
Gender_arg                                                    Masc
Number_arg                                                    Sing
Tense_arg                                                         
Valency_arg                                                       
VerbForm_arg                                                      
arg_address                                                    163
arg_case                                                       Acc
arg_lemma                                              старик_NOUN
arg_pos                                                       NOUN
dist                                                             1
ex_id                                                       104470
offsets          {(0, 0): 0, (0, 1): 1, (0, 2): 2, (0, 3): 3, (...
pred_lemma                                           радовать_

In [44]:
from pprint import pprint as print_
print_(pd_data.iloc[5].offsets)

{(0, 0): 0,
 (0, 1): 1,
 (0, 2): 2,
 (0, 3): 3,
 (0, 4): 4,
 (0, 5): 5,
 (0, 6): 6,
 (0, 7): 7,
 (0, 8): 8,
 (0, 9): 9,
 (0, 10): 10,
 (0, 11): 11,
 (0, 12): 12,
 (0, 13): 13,
 (0, 14): 14,
 (0, 15): 15,
 (0, 16): 16,
 (0, 17): 17,
 (0, 18): 18,
 (1, 0): 19,
 (1, 1): 20,
 (1, 2): 21,
 (1, 3): 22,
 (1, 4): 23,
 (1, 5): 24,
 (1, 6): 25,
 (1, 7): 26,
 (1, 8): 27,
 (1, 9): 28,
 (1, 10): 29,
 (1, 11): 30,
 (1, 12): 31,
 (1, 13): 32,
 (1, 14): 33,
 (1, 15): 34,
 (1, 16): 35,
 (1, 17): 36,
 (1, 18): 37,
 (1, 19): 38,
 (1, 20): 39,
 (1, 21): 40,
 (1, 22): 41,
 (1, 23): 42,
 (1, 24): 43,
 (1, 25): 44,
 (1, 26): 45,
 (1, 27): 46,
 (1, 28): 47,
 (1, 29): 48,
 (1, 30): 49,
 (1, 31): 50,
 (1, 32): 51,
 (1, 33): 52,
 (1, 34): 53,
 (1, 35): 54,
 (1, 36): 55,
 (1, 37): 56,
 (1, 38): 57,
 (1, 39): 58,
 (1, 40): 59,
 (1, 41): 60,
 (1, 42): 61,
 (1, 43): 62,
 (1, 44): 63,
 (1, 45): 64,
 (1, 46): 65,
 (1, 47): 66,
 (1, 48): 67,
 (1, 49): 68,
 (1, 50): 69,
 (1, 51): 70,
 (1, 52): 71,
 (2, 0): 72,
 (2, 1): 

In [45]:
print_(list(enumerate(pd_data.iloc[5].tokens)))

[(0, 'В'),
 (1, 'течение'),
 (2, 'многих'),
 (3, 'лет'),
 (4, 'у'),
 (5, 'него'),
 (6, 'служил'),
 (7, 'камердинером'),
 (8, 'и'),
 (9, 'заведовал'),
 (10, 'его'),
 (11, 'домашним'),
 (12, 'хозяйством'),
 (13, 'честный'),
 (14, 'и'),
 (15, 'усердный'),
 (16, 'курляндский'),
 (17, 'уроженец'),
 (18, '.'),
 (19, 'В'),
 (20, 'конце'),
 (21, 'шестидесятых'),
 (22, 'годов'),
 (23, 'он'),
 (24, 'умер'),
 (25, 'скоропостижно'),
 (26, ','),
 (27, 'и'),
 (28, 'Иван'),
 (29, 'Александрович'),
 (30, ','),
 (31, 'соболезнуя'),
 (32, 'положению'),
 (33, 'его'),
 (34, 'вдовы'),
 (35, 'с'),
 (36, 'тремя'),
 (37, 'малолетними'),
 (38, 'детьми'),
 (39, ','),
 (40, 'оставил'),
 (41, 'её'),
 (42, 'служить'),
 (43, 'у'),
 (44, 'себя'),
 (45, ','),
 (46, 'предоставив'),
 (47, 'ей'),
 (48, 'маленькое'),
 (49, 'помещение'),
 (50, 'через'),
 (51, 'площадку'),
 (52, 'лестницы'),
 (53, 'своей'),
 (54, 'квартиры'),
 (55, ','),
 (56, 'и'),
 (57, 'заменил'),
 (58, 'ею'),
 (59, 'умершего'),
 (60, 'её'),
 (61, 'мужа

# Preprocessing

In [18]:
y_stat = pd_data.loc[:, 'role'].value_counts()
drop_ys = y_stat[y_stat < 180].index
clear_data = pd_data.drop(pd_data[pd_data.loc[:, 'role'].isin(drop_ys)].index)

In [19]:
repl_roles = {
    'агенс - субъект восприятия' : 'субъект восприятия',
    'агенс - субъект ментального состояния' : 'субъект ментального состояния',
    'результат / цель' : 'результат',
    'место - пациенс' : 'место',
    'говорящий - субъект психологического состояния' : 'субъект психологического состояния'
}


def normalize_single_region(data, rep, val):
    data.loc[:, 'role'] = data.loc[:, 'role'].str.replace(rep, val)


for rep, val in repl_roles.items():
    normalize_single_region(clear_data, rep, val)
    
number_of_roles = len(clear_data.loc[:, 'role'].value_counts().index)
print('Number of roles: ', number_of_roles)
clear_data.loc[:, 'role'].value_counts()

Number of roles:  44


агенс                                 6147
пациенс                               5362
тема                                  3656
субъект психологического состояния    3250
субъект перемещения                   3011
причина                               2502
говорящий                             2365
место                                 2185
содержание действия                   1874
содержание мысли                      1817
содержание высказывания               1792
конечная точка                        1772
результат                             1452
пациенс перемещения                   1356
стимул                                1271
субъект ментального состояния         1223
адресат                                941
субъект восприятия                     901
контрагент                             831
эффектор                               739
субъект социального отношения          598
начальная точка                        588
предмет высказывания                   548
способ     

In [20]:
list(clear_data.loc[:, 'role'].drop_duplicates().values)

['содержание высказывания',
 'говорящий',
 'субъект социального отношения',
 'субъект психологического состояния',
 'содержание действия',
 'агенс',
 'тема',
 'конечная точка',
 'сфера',
 'контрагент',
 'субъект перемещения',
 'причина',
 'субъект поведения',
 'ситуация в фокусе',
 'исходный посессор',
 'субъект физиологической реакции',
 'адресат',
 'пациенс',
 'срок',
 'источник звука',
 'место',
 'признак',
 'потенциальная угроза',
 'субъект ментального состояния',
 'конечный посессор',
 'результат',
 'стимул',
 'субъект восприятия',
 'эффектор',
 'траектория',
 'содержание мысли',
 'пациенс перемещения',
 'каузатор',
 'предмет высказывания',
 'начальная точка',
 'способ',
 'пациенс социального отношения',
 'статус',
 'предмет мысли',
 'цель',
 'потенциальный пациенс',
 'контрагент социального отношения',
 'эталон',
 'признак действия']

In [21]:
y_orig = clear_data.loc[:, 'role']
X_orig = clear_data.drop('role', axis = 1)
X_orig.shape

(52751, 20)

In [22]:
from sklearn.preprocessing import LabelBinarizer
import pickle

label_encoder = LabelBinarizer()
y = label_encoder.fit_transform(y_orig)

with open(main_model_path + '/label_encoder.pckl', 'wb') as f:
    pickle.dump(label_encoder, f)

In [23]:
y.shape

(52751, 44)

## Embedding

In [58]:
from gensim.models import KeyedVectors

embeddings_path = '../../data/embeddings/ruscorpora_upos_skipgram_300_5_2018.vec'
embeddings = KeyedVectors.load_word2vec_format(embeddings_path, binary=False)
print('Embedding size: ', embeddings.vector_size)

Embedding size:  300


In [59]:
import multiprocessing as mp


def make_embeded_form(word):
    if word:
        #return word[1].encode('utf8')
        return u"{}_{}".format(word[1], word[0])
    else:
        return word


class Embedder_map:
    def __init__(self, embeddings, X):
        self.X_ = X
        self.embeddings_ = embeddings

    def __call__(self, i):  
        result = np.zeros((len(self.X_[0]), 
                           self.embeddings_.vector_size))

        for j in range(len(self.X_[0])):
            word = self.X_[i][j]
            tag = word[0] if word else str()
            
            if tag == ARG_SPECIAL_TAG or tag == ARG_SPECIAL_TAG:
                result[j, :] = np.ones(self.embeddings_.vector_size)
            elif word and word in embeddings:
                result[j, :] = self.embeddings_[word]

        return result


def embed(X):
    pool = mp.Pool(4)
    result = pool.map(Embedder_map(embeddings, X), X.index, 1000)
    pool.close()
    return np.asarray(result)


In [60]:
X_orig.columns

Index(['Animacy_arg', 'Aspect_arg', 'Gender_arg', 'Number_arg', 'Tense_arg',
       'Valency_arg', 'VerbForm_arg', 'arg_address', 'arg_case', 'arg_lemma',
       'arg_pos', 'dist', 'ex_id', 'pred_lemma', 'pred_pos', 'prepos',
       'rel_pos', 'syn_link_name'],
      dtype='object')

In [22]:
%%time

arg_context_embedded = embed(X_orig.loc[:, 'arg_context_lemmas'])

KeyError: 'the label [arg_context_lemmas] is not in the [columns]'

In [None]:
%%time

pred_context_embedded = embed(X_orig.loc[:, 'pred_context_lemmas'])

In [61]:
class Embedder_single_map:
    def __init__(self, embeddings, X):
        self.X_ = X
        self.embeddings_ = embeddings

    def __call__(self, i):
        #word = make_embeded_form(self.X_[i])
        word = self.X_[i]
        if word in self.embeddings_:
            return self.embeddings_[word]
        else:
            return np.zeros((self.embeddings_.vector_size,))

        
def embed_single(embeddings, X):
    pool = mp.Pool(4)
    result = pool.map(Embedder_single_map(embeddings, X), X.index, 1000)
    pool.close()
        
    return np.asarray(result)

In [24]:
elmo = ELMoEmbedder("http://files.deeppavlov.ai/deeppavlov_data/elmo_ru-wiki_600k_steps.tar.gz", elmo_output_names=['elmo'])

Using TensorFlow backend.


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


I0516 08:04:56.965393 139896968771328 tf_logging.py:115] Saver not created because there are no variables in the graph to restore


In [33]:
def elmo_embed(embeddings, tokens, word_idx):
    embedded = embeddings([tokens])[0]
    return embedded[min(word_idx, len(tokens)-1)]

In [34]:
from tqdm import tqdm as tqdm

In [35]:
def get_embedding(obj):
    verb_idx = obj.pred_offset
    tokens = obj.tokens
    return elmo_embed(elmo, tokens, verb_idx)

In [36]:
e_verbs = np.stack(embedded_verbs)

In [37]:
np.save("../../data/elmo_verbs.npy", e_verbs)

In [38]:
%%time
embedded_verbs = []
for i in tqdm(range(len(X_orig))):
    obj = X_orig.iloc[i]
    verb_idx = obj.pred_offset
    tokens = obj.tokens
    embedded_verbs.append(elmo_embed(elmo, tokens, verb_idx))

 33%|███▎      | 17335/52751 [2:31:41<5:09:53,  1.90it/s]


KeyboardInterrupt: 

In [62]:
%%time

embedded_verbs = embed_single(pd.Series(list(zip(X_orig.pred_pos, X_orig.pred_lemma)), 
                                         index = X_orig.index))

print(embedded_verbs.shape)
print((np.linalg.norm(embedded_verbs, axis = 1) < 0.001).sum())
print(clear_data[(np.linalg.norm(embedded_verbs, axis = 1) < 0.001)].pred_lemma.value_counts().shape)

(30371, 300)
362
(41,)
CPU times: user 32 ms, sys: 68 ms, total: 100 ms
Wall time: 99.3 ms


In [63]:
%%time

# embedded_args = embed_single(pd.Series(list(zip(X_orig.arg_pos, X_orig.arg_lemma)), 
#                                        index = X_orig.index))
embedded_args = embed_single(embeddings, X_orig.arg_lemma)
print(embedded_args.shape)
print((np.linalg.norm(embedded_args, axis = 1) < 0.001).sum())

(30371, 300)
10342
CPU times: user 22 s, sys: 14.3 s, total: 36.3 s
Wall time: 42.3 s


## Vectorizing categorial features

In [64]:
X_orig.columns

Index(['Animacy_arg', 'Aspect_arg', 'Gender_arg', 'Number_arg', 'Tense_arg',
       'Valency_arg', 'VerbForm_arg', 'arg_address', 'arg_case', 'arg_lemma',
       'arg_pos', 'dist', 'ex_id', 'pred_lemma', 'pred_pos', 'prepos',
       'rel_pos', 'syn_link_name'],
      dtype='object')

In [65]:
from sklearn.feature_extraction import DictVectorizer

#morph_feats = ['pos', 'case', 'anim', 'vform', 'zform', 'shform', 'pform', 'vvform', 'nform', 'time']

# all_feats = (['pred_lemma', 'rel_pos'] + 
#              ['arg_' + e for e in morph_feats] + 
#              ['pred_' + e for e in morph_feats])

# all_feats = (['pred_lemma', 'rel_pos', 'arg_prep'] + 
#              ['arg_' + e for e in morph_feats] + 
#              ['pred_' + e for e in morph_feats])

# all_feats = (['pred_lemma', 'rel_pos', 'arg_prep', 'link_name'] + 
#              ['arg_' + e for e in morph_feats] + 
#              ['pred_' + e for e in morph_feats])

#all_feats = ['pred_lemma', 'rel_pos', 'pred_pos', 'arg_case', 'syn_link_name', 'arg_pos', 'prepos', 'dist']

#categ_feats = [e for e in all_feats if X_orig[e].dtype in [str, object]]
#not_categ = [e for e in all_feats if e not in categ_feats]

#pred_lemma_vectorizer.fit_transform(X_orig.loc[:, ['pred_lemma']].to_dict(orient = 'records'))

not_categ_features = {'arg_address', 'ex_id', 'rel_pos', 'arg_lemma'}
categ_feats = [name for name in X_orig.columns if name not in not_categ_features] 
not_categ = ['rel_pos']
print('Category features:\n', categ_feats)
print('Not category features:\n', not_categ)

vectorizer = DictVectorizer(sparse = False)
one_hot_feats = vectorizer.fit_transform(X_orig.loc[:, categ_feats].to_dict(orient = 'records'))
print(one_hot_feats.shape)

with open(main_model_path + '/feature_encoder.pckl', 'wb') as f:
    pickle.dump(vectorizer, f)

Category features:
 ['Animacy_arg', 'Aspect_arg', 'Gender_arg', 'Number_arg', 'Tense_arg', 'Valency_arg', 'VerbForm_arg', 'arg_case', 'arg_pos', 'dist', 'pred_lemma', 'pred_pos', 'prepos', 'syn_link_name']
Not category features:
 ['rel_pos']
(30371, 941)


In [66]:
not_categ_columns = np.concatenate(tuple(X_orig.loc[:, e].as_matrix().reshape(-1, 1) for e in not_categ), axis =1)
plain_features = np.concatenate((one_hot_feats, not_categ_columns), axis = 1)
plain_features.shape

  """Entry point for launching an IPython kernel.


(30371, 942)

In [67]:
del not_categ_columns
del one_hot_feats

# Model construction

In [68]:
from tensorflow.python.keras.models import Sequential
from tensorflow.python.keras.layers import Dense, LSTM, Convolution1D, Dropout, MaxPooling1D
from tensorflow.python.keras.preprocessing import sequence
from tensorflow.python.keras.layers import Flatten
from tensorflow.python.keras.layers import Input
from tensorflow.python.keras.layers import TimeDistributed
from tensorflow.python.keras.layers import Activation
from tensorflow.python.keras.layers import RepeatVector
from tensorflow.python.keras.layers import Permute
from tensorflow.python.keras.layers import Lambda
from tensorflow.python.keras.models import Model
from tensorflow.python.keras.callbacks import EarlyStopping
from tensorflow.python.keras.layers import BatchNormalization
from tensorflow.python.keras.layers import Concatenate
from tensorflow.python.keras.layers import Bidirectional
from tensorflow.python.keras.layers import Masking
from gensim.models import Word2Vec

In [74]:
def construct_plain_model(input_shape):
    print('Plain model.')
    
    plain_model = Sequential()
    plain_model.add(Dense(600, 
                          #input_shape=(plain_features.shape[1],), 
                          input_shape = input_shape,
                          activation = 'relu'))
    plain_model.add(Dropout(0.3))
    
    plain_model.add(Dense(400))
    plain_model.add(BatchNormalization())
    plain_model.add(Activation('relu'))
    plain_model.add(Dropout(0.3))
    
    plain_model.add(Dense(number_of_roles))
    plain_model.add(BatchNormalization())
    plain_model.add(Activation('softmax'))
    
    plain_model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    
    return plain_model

In [75]:
def construct_plain_model_sparse(categ_size, emb_size, number_of_roles):    
    input_plain = Input(shape=(categ_size,), name = 'input_categorical')
    input_pred_embed = Input(shape=(emb_size,), name = 'pred_embed')
    input_arg_embed = Input(shape=(emb_size,), name = 'arg_embed')
    
    plain = Dense(400)(input_plain)
    plain = BatchNormalization()(plain)
    plain = Activation('relu')(plain)
    
    def embed_submodel(inpt):
        embed = Dense(100)(inpt)
        embed = BatchNormalization()(embed)
        embed = Activation('relu')(embed)
        return embed
    
    embed_pred = embed_submodel(input_pred_embed)
    embed_arg = embed_submodel(input_arg_embed)
    
    final = Concatenate(axis = 1)([embed_pred, embed_arg, plain])
    final = Dropout(0.3)(final)
    final = Dense(400)(final)
    final = BatchNormalization()(final)
    final = Activation('relu')(final)
    final = Dropout(0.3)(final)
    final = Dense(number_of_roles)(final)
    final = BatchNormalization()(final)
    final = Activation('softmax')(final)
    
    model = Model([input_arg_embed, input_pred_embed, input_plain], final)
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    
    return model

# Experiments

## Experiments with in-domain test

In [35]:
# Use only for experiments with in-domain test. Do not use for training

### Plain model

In [76]:
model = construct_plain_model((plain_features.shape[1],))
print(model.summary())
early_stopping = EarlyStopping(monitor='val_loss', min_delta=0, patience=2, verbose=0, mode='auto')
model.fit(plain_features, y, epochs=15, batch_size=300, validation_split = 0.1, 
          shuffle=True, callbacks = [early_stopping])

Plain model.
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 600)               565800    
_________________________________________________________________
dropout_1 (Dropout)          (None, 600)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 400)               240400    
_________________________________________________________________
batch_normalization_1 (Batch (None, 400)               1600      
_________________________________________________________________
activation_1 (Activation)    (None, 400)               0         
_________________________________________________________________
dropout_2 (Dropout)          (None, 400)               0         
_________________________________________________________________
dense_3 (Dense)              (None, 34)                13634   

<tensorflow.python.keras._impl.keras.callbacks.History at 0x7f8b5e515470>

### Sparse

In [77]:
model = construct_plain_model_sparse(plain_features.shape[1], embeddings.vector_size, y.shape[1])
early_stopping = EarlyStopping(monitor='val_loss', min_delta=0, patience=2, verbose=0, mode='auto')
model.fit([embedded_args, embedded_verbs, plain_features], y, epochs=16, batch_size=300, 
          validation_split = 0.1, shuffle=True, callbacks = [early_stopping])
model.save(os.path.join(main_model_path, 'neural_model.h5'))

Train on 27333 samples, validate on 3038 samples
Epoch 1/16
Epoch 2/16
Epoch 3/16
Epoch 4/16
Epoch 5/16
Epoch 6/16
Epoch 7/16
Epoch 8/16
Epoch 9/16
Epoch 10/16
Epoch 11/16
Epoch 12/16
Epoch 13/16
Epoch 14/16
Epoch 15/16
Epoch 16/16


In [78]:
from tensorflow.python.keras.backend import clear_session
clear_session()

### Context

In [79]:
%%time

model = construct_graph_lstm_model((plain_features.shape[1],))
early_stopping = EarlyStopping(monitor='val_loss', min_delta=0, patience=2, verbose=0, mode='auto')
model.fit([arg_context_embedded, pred_context_embedded, embedded_args, embedded_verbs, plain_features], y, 
          epochs=15, batch_size=64, validation_split = 0.1, 
          shuffle=True, callbacks = [early_stopping])

Context model.


NameError: name 'pred_context_embedded' is not defined

## Experiements with out-of-domain test

In [None]:
# Use only for out of domain experiments. Do not use for training

In [80]:
def evaluate_out_of_domain(model, X_train, y_train, X_test, y_test):
    final_res = list()
    N_ITERATIONS = 5
    for i in xrange(N_ITERATIONS):
        print('Eval iter:', i + 1, '/', N_ITERATIONS)
        early_stopping = EarlyStopping(monitor='val_loss', min_delta=0, 
                                       patience=2, verbose=0, mode='auto')
        model.fit(X_train, y_train, nb_epoch=15, 
                  batch_size=64, validation_split = 0.1, 
                  shuffle=True, callbacks = [early_stopping],
                 verbose = 0)

        ev_res = evaluate_model(model, X_test, y_test)
        print()
        print(pd.DataFrame([ev_res], columns = ['keras_accur', 'keras_loss', 'f1_micro', 'f1_macro', 'accur']))
        final_res.append(ev_res)
    
    return np.array(final_res)

In [81]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score

def evaluate_model(model, X_test, y_test):
    keras_eval = model.evaluate(X_test, y_test)
    pred = model.predict(X_test).argmax(axis = 1)
    f1_micro = f1_score(pred, y_test.argmax(axis = 1), average = 'micro')
    f1_macro = f1_score(pred, y_test.argmax(axis = 1), average = 'macro')
    accur = accuracy_score(pred, y_test.argmax(axis = 1))
    
    return np.array(list(keras_eval) + [f1_micro, f1_macro, accur])

### Simple

In [82]:
model = construct_plain_model((ind_plain_features.shape[1],))
early_stopping = EarlyStopping(monitor='val_loss', min_delta=0, patience=2, verbose=0, mode='auto')
model.fit(ind_plain_features, ind_y, nb_epoch=15, batch_size=64, validation_split = 0.1, 
          shuffle=True, callbacks = [early_stopping])

#model.evaluate(ood_plain_features, ood_y)
ev_res = evaluate_model(model, [ood_plain_features], ood_y)
print()
print(pd.DataFrame([ev_res], columns = ['keras_accur', 'keras_loss', 'f1_micro', 'f1_macro', 'accur']))

NameError: name 'ind_plain_features' is not defined

In [50]:
model = construct_plain_model((ind_plain_features.shape[1],))
model_eval = evaluate_out_of_domain(model, ind_plain_features, ind_y, ood_plain_features, ood_y)
print(model_eval)
describe_cv_result(model_eval)

NameError: name 'ind_plain_features' is not defined

### Complex

In [51]:
model = construct_plain_model_sparse((ind_plain_features.shape[1],))
model.summary()
early_stopping = EarlyStopping(monitor='val_loss', min_delta=0, patience=2, verbose=0, mode='auto')
model.fit([ind_arg_embed, ind_pred_embed, ind_plain_features], ind_y, nb_epoch=20, batch_size=64, validation_split = 0.1, 
          shuffle=True, callbacks = [early_stopping])
#model.evaluate([ood_arg_embed, ood_pred_embed, ood_plain_features], ood_y)

ev_res = evaluate_model(model, [ood_arg_embed, ood_pred_embed, ood_plain_features], ood_y)
print()
print(pd.DataFrame([ev_res], columns = ['keras_accur', 'keras_loss', 'f1_micro', 'f1_macro', 'accur']))

NameError: name 'ind_plain_features' is not defined

In [52]:
model = construct_plain_model_sparse((ind_plain_features.shape[1],))
model.summary()
model_eval = evaluate_out_of_domain(model, 
                                    [ind_arg_embed, ind_pred_embed, ind_plain_features], ind_y, 
                                    [ood_arg_embed, ood_pred_embed, ood_plain_features], ood_y)
print(model_eval)
describe_cv_result(model_eval)

NameError: name 'ind_plain_features' is not defined

### Context

In [53]:
model = construct_graph_lstm_model((ind_plain_features.shape[1],))
model.summary()
early_stopping = EarlyStopping(monitor='val_loss', min_delta=0, patience=2, verbose=0, mode='auto')
model.fit([
           #ind_arg_context, 
        #ind_pred_context,   
        ind_arg_embed, 
        ind_pred_embed, 
        ind_plain_features,
        ind_pred_context], 
#model.fit([ind_arg_context, ind_pred_context, ind_arg_embed, ind_pred_embed, ind_plain_features], 
           ind_y, nb_epoch=6, batch_size=64, validation_split = 0.1, 
          shuffle=True, callbacks = [early_stopping])

#model.evaluate([ood_arg_context, ood_pred_context, ood_arg_embed, ood_pred_embed, ood_plain_features], ood_y)
model.evaluate([
    #    ood_arg_context, 
    #    ood_pred_context,
        ood_arg_embed, 
        ood_pred_embed,
        ood_plain_features,
        ood_pred_context
    ], ood_y)

NameError: name 'ind_plain_features' is not defined

In [54]:
model = construct_graph_lstm_model((ind_plain_features.shape[1],))
model.summary()
model_eval = evaluate_out_of_domain(model, 
                                    [ind_arg_embed, ind_pred_embed, ind_plain_features, ind_pred_context], ind_y, 
                                    [ood_arg_embed, ood_pred_embed, ood_plain_features, ood_pred_context], ood_y)
print(model_eval)
describe_cv_result(model_eval)

NameError: name 'ind_plain_features' is not defined

# Evaluation

In [55]:
# Use only for model comparision. Do not use for training.

In [83]:
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score


def train_and_evaluate_model(model, X_train, y_train, X_test, y_test, *args, **kwargs):
    model.fit(X_train, y_train, *args, **kwargs)
    
    keras_eval = model.evaluate(X_test, y_test)
    
    pred = model.predict(X_test).argmax(axis = 1)
    f1_micro = f1_score(pred, y_test.argmax(axis = 1), average = 'micro')
    f1_macro = f1_score(pred, y_test.argmax(axis = 1), average = 'macro')
    accur = accuracy_score(pred, y_test.argmax(axis = 1))
    
    return list(keras_eval) + [f1_micro, f1_macro, accur]
    

def custom_cross_val(cr_f, X, y, cv, *args, **kwargs):
    cr_f().summary()
    eval_res = list()
    for i, (train, test) in enumerate(cv.split(y)):
        model = cr_f()
        print('Running Fold', i+1, '/', cv.n_splits)
        eval1 = train_and_evaluate_model(model, 
                                         [X[j][train] for j in range(len(X))], y[train], 
                                         [X[j][test] for j in range(len(X))], y[test], 
                                         *args, **kwargs)
        
        print()
        print('Fold result: ', eval1)
        eval_res.append(eval1)
    
    return np.array(eval_res)


def describe_cv_result(cv_res):
    print(cv_res)
    mean_cv_res = cv_res.mean(axis = 0)
    std_cv_res = cv_res.std(axis = 0)
    print('Mean')
    print(pd.DataFrame([mean_cv_res], columns = ['loss', 'keras_accur', 'micro_f1', 'macro_f1', 'accur']))
    print('Std')
    print(pd.DataFrame([std_cv_res], columns = ['loss', 'keras_accur', 'micro_f1', 'macro_f1', 'accur']))
    
    
from sklearn.model_selection import KFold
cv = KFold(n_splits=5, random_state=42)

## Simple

In [84]:
curr_features = np.concatenate((no_lemma_plain_features, embedded_verbs), axis = 1)
cv_res = custom_cross_val(lambda : construct_plain_model((curr_features.shape[1],)), 
                          [curr_features], 
                          y, cv = cv, epochs=13, batch_size=64,
                          validation_split = 0., shuffle=True, verbose = 0)

describe_cv_result(cv_res)

NameError: name 'no_lemma_plain_features' is not defined

In [85]:
cv_res = custom_cross_val(lambda : construct_plain_model((plain_features.shape[1],)), 
                          [plain_features], 
                          y, cv = cv, epochs=13, batch_size=64,
                          validation_split = 0., shuffle=True, verbose = 0)

describe_cv_result(cv_res)

Plain model.
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 600)               565800    
_________________________________________________________________
dropout_1 (Dropout)          (None, 600)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 400)               240400    
_________________________________________________________________
batch_normalization_1 (Batch (None, 400)               1600      
_________________________________________________________________
activation_1 (Activation)    (None, 400)               0         
_________________________________________________________________
dropout_2 (Dropout)          (None, 400)               0         
_________________________________________________________________
dense_3 (Dense)              (None, 34)                13634   

In [86]:
single_chunk = np.concatenate((embedded_args, embedded_verbs, plain_features), axis = 1)
cv_res = custom_cross_val(lambda : construct_plain_model((single_chunk.shape[1],)), 
                          [single_chunk], 
                          y, cv = cv, epochs=13, batch_size=64,
                          validation_split = 0., shuffle=True, verbose = 0)

describe_cv_result(cv_res)

Plain model.
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_19 (Dense)             (None, 600)               925800    
_________________________________________________________________
dropout_13 (Dropout)         (None, 600)               0         
_________________________________________________________________
dense_20 (Dense)             (None, 400)               240400    
_________________________________________________________________
batch_normalization_13 (Batc (None, 400)               1600      
_________________________________________________________________
activation_13 (Activation)   (None, 400)               0         
_________________________________________________________________
dropout_14 (Dropout)         (None, 400)               0         
_________________________________________________________________
dense_21 (Dense)             (None, 34)                13634   

## Sparse

In [89]:
cv_res = custom_cross_val(lambda : construct_plain_model_sparse(plain_features.shape[1], 
                                                                embeddings.vector_size, 
                                                                y.shape[1]), 
                          [embedded_args, embedded_verbs, plain_features], y, 
                          cv = cv, epochs=13, batch_size=300,
                          validation_split = 0., shuffle=True, verbose = 0)

describe_cv_result(cv_res)

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
pred_embed (InputLayer)         (None, 300)          0                                            
__________________________________________________________________________________________________
arg_embed (InputLayer)          (None, 300)          0                                            
__________________________________________________________________________________________________
input_categorical (InputLayer)  (None, 990)          0                                            
__________________________________________________________________________________________________
dense_92 (Dense)                (None, 100)          30100       pred_embed[0][0]                 
__________________________________________________________________________________________________
dense_93 (

## Context

In [None]:
cv_res = custom_cross_val(lambda : construct_graph_lstm_model((plain_features.shape[1],)), 
                          [arg_context_embedded, 
                           pred_context_embedded, 
                           embedded_args, 
                           embedded_verbs,
                           plain_features], y, 
                          cv = cv, epochs=6, batch_size=64, validation_split = 0., 
                          shuffle=True)

describe_cv_result(cv_res)

# Training and predicting

In [None]:
# Training model and saving

In [87]:
from sklearn.model_selection import train_test_split

train_ids, test_ids = train_test_split(X_orig.ex_id.unique(), test_size=0.2, random_state=42)
train_ids = set(train_ids.tolist())
test_ids = set(test_ids.tolist())

In [88]:
train_selector_pd = X_orig.ex_id.isin(train_ids)
test_selector_pd = X_orig.ex_id.isin(test_ids)
train_selector = train_selector_pd.values
test_selector = test_selector_pd.values

In [89]:
train_data = {k : data[k] for k in train_ids}
test_data = {k : data[k] for k in test_ids}

with open(os.path.join(main_model_path, 'train_data.json'), 'w') as f:
    json.dump(train_data, f)

with open(os.path.join(main_model_path, 'test_data.json'), 'w') as f:
    json.dump(test_data, f)

In [77]:
def select_from_nparray_list(nparray_list, selector):
    return [e[selector] for e in nparray_list]

model = construct_plain_model_sparse(plain_features.shape[1], embeddings.vector_size, y.shape[1])
model.fit(select_from_nparray_list([embedded_args, embedded_verbs, plain_features], train_selector),
          select_from_nparray_list([y], train_selector), 
          epochs=10, batch_size=64, validation_split = 0.1, shuffle=True)

model.evaluate(select_from_nparray_list([embedded_args, embedded_verbs, plain_features], test_selector), 
               select_from_nparray_list([y], test_selector))
model.save(os.path.join(main_model_path, 'neural_model.h5'))

Train on 37893 samples, validate on 4211 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


[0.6926666886275118, 0.8026800037746532]

In [81]:
# Hold-out evaluation.

evaluate_model(model,
               select_from_nparray_list([embedded_args, embedded_verbs, plain_features], test_selector), 
               select_from_nparray_list([y], test_selector)[0])



array([0.69266669, 0.80268   , 0.80268   , 0.77108236, 0.80268   ])

In [84]:
pred = model.predict(select_from_nparray_list([embedded_args, embedded_verbs, plain_features], test_selector))

test_examples_to_store = X_orig.loc[test_selector_pd[test_selector_pd].index, :].loc[:, ['arg_address', 'ex_id']]
test_data = {k : data[k] for k in test_ids}


for index, (pd_index, row) in enumerate(test_examples_to_store.iterrows()):
    ex = test_data[row['ex_id']]
    arg_addr = row['arg_address']
    sent = ex[arg_addr[0]]
    token = sent[arg_addr[1]]
    cl = pred[index]
    predicted_role = label_encoder.inverse_transform(np.array([cl]))[0]
    actual_role = label_encoder.inverse_transform(np.array([select_from_nparray_list([y], test_selector)[0][index]]))[0]
    
    token['rolepred1'] = actual_role
    token['rolepred2'] = predicted_role

In [85]:
with open('./test_data_ann_1.json', 'w') as f:
    json.dump(test_data, f)

# Brat convertion

In [None]:
# Converts results to brat annotation for inspecting.
# Needs framebank_preprocessing from http://nlp.isa.ru/framebank_parser/

In [None]:
!python2.7 ./framebank_preprocessing/convert_corpus_to_brat.py --inputFile=./test_data_ann_1.json --outputDir=./brat_ann2/

In [None]:
!python2.7 ./framebank_preprocessing/convert_corpus_to_brat.py --inputFile=./test_data_ann_1.json --outputDir=./syntaxnet_1/ --converter=syn

In [54]:
!export PYTHONPATH=../ && python2.7 ./convert_corpus_to_brat.py --inputFile=./test_data_ann_1.json --outputDir=./brat_ann2/

2018-03-11 20:56:20 - Loading corpus data...
2018-03-11 20:56:22 - Done.
2018-03-11 20:56:22 - Creating verb-example index...
2018-03-11 20:56:22 - Done.
2018-03-11 20:56:22 - Converting and saving...
2018-03-11 20:56:23 - Done.
2018-03-11 20:56:23 - Generating brat configuration files...
2018-03-11 20:56:23 - Done.
