In [1]:
# Use only one GPU.
# The variable is set before TensorFlow is imported further down in this cell.
import os
os.environ['CUDA_VISIBLE_DEVICES'] = '0'

# Make the project sources and bundled libraries importable from this notebook.
import sys
sys.path.append('../')
sys.path.append('../../')
sys.path.append('../../isanlp/src/')
sys.path.append('../../src/isanlp_srl_framebank/')
sys.path.append('../../libs/')
sys.path.append('../../libs/pylingtools/')

# Suppress tensorflow memory appetites

import tensorflow as tf
print(tf.__version__)

# TensorFlow 1.x session configuration (the printed version below is 1.12.0).
config = tf.ConfigProto()
config.gpu_options.allow_growth=True  # grow GPU memory usage on demand
config.log_device_placement=True      # log the device each op is placed on
sess = tf.Session(config=config)

# Register the configured session as the one Keras uses.
from tensorflow.python.keras import backend as K
K.set_session(sess)

# Check available GPUs

from tensorflow.python.client import device_lib

def get_available_gpus():
    """Return the names of the local devices whose device_type is 'GPU'."""
    gpu_names = []
    for device in device_lib.list_local_devices():
        if device.device_type == 'GPU':
            gpu_names.append(device.name)
    return gpu_names

get_available_gpus()

1.12.0


['/device:GPU:0']

# Load the data and make the train/test split 

In [2]:
import os
import time
import isanlp
import json
import pickle

import numpy as np
np.random.seed(42)

import pandas as pd
from tqdm import tqdm_notebook as tqdm

In [3]:
cleared_corpus_path = '../../data/cleared_corpus.json'

# Read the corpus with an explicit encoding: without it, open() falls back to the
# locale's default encoding, which makes loading non-ASCII text platform dependent.
with open(cleared_corpus_path, 'r', encoding='utf-8') as f:
    examples = json.load(f)

In [4]:
# Load the cached linguistic annotations.
# NOTE: pickle.load can execute arbitrary code; only load files from a trusted source.
ling_data_path = '../../data/results_final_fixed.pckl'
with open(ling_data_path, 'rb') as f:
    ling_data_cache = pickle.load(f)

# The pickle is iterated as (key, value) pairs; turn it into a dict for lookup by key.
ling_data_cache = {k: v for k,v in ling_data_cache}

In [5]:
from sklearn.model_selection import train_test_split

# Split the annotated example ids 80/20 with a fixed seed.
train_ids, test_ids = train_test_split(list(ling_data_cache.keys()), test_size=0.2, random_state=42)
# Dict keys are already unique, so this removes nothing.
# NOTE(review): the round trip through set() may reorder the ids -- confirm that is acceptable.
train_ids = list(set(train_ids))
test_ids = list(set(test_ids))

# Root folders for data files and for saved models.
data_path = '../../data/'
main_model_path_root = '../../data/models_new/'

In [6]:
# `train_ids` / `test_ids` are lists, so `x in train_ids` is a linear scan and the
# comprehensions below were quadratic overall. Sets give constant-time membership
# tests with the same result.
train_id_set = set(train_ids)
test_id_set = set(test_ids)

# example[0] is compared against the split ids to assign each example to a split.
train_data = [example for example in examples if example[0] in train_id_set]
test_data = [example for example in examples if example[0] in test_id_set]

with open(os.path.join(data_path, 'train_data.json'), 'w') as f:
    json.dump(train_data, f)

with open(os.path.join(data_path, 'test_data.json'), 'w') as f:
    json.dump(test_data, f)

# Construct the features

In [7]:
import sys
sys.path.append('../')
sys.path.append('../../')
sys.path.append('../../isanlp/src/')
sys.path.append('../../src/isanlp_srl_framebank/')
sys.path.append('../../libs/')
sys.path.append('../../libs/pylingtools/')


from isanlp.annotation_repr import CSentence
from convert_corpus_to_brat import make_text


def find_address_by_offset(offset, ling_ann):
    """Translate an offset into a (sentence index, token index inside the sentence) pair.

    Args:
        offset: value compared against each token's ``begin``/``end`` bounds.
        ling_ann: dict with ``'tokens'`` (objects with ``begin``/``end``) and
            ``'sentences'`` (objects whose ``begin``/``end`` are token indices).

    Returns:
        Tuple ``(sentence_index, token_index - sentence.begin)``.

    If no token (or no sentence) contains the position, the loop variables keep
    the values of the last element, so the last token / sentence is used.
    """
    for token_idx, token in enumerate(ling_ann['tokens']):
        if token.begin <= offset < token.end:
            break

    for sentence_idx, sentence in enumerate(ling_ann['sentences']):
        if sentence.begin <= token_idx < sentence.end:
            break

    return sentence_idx, token_idx - sentence.begin


# Examples that could not be processed, keyed by example id.
error_examples = {}

def process_arg_pred(feature_extractor, ling_cache, ex_id, pred, args, example):
    """Build one feature record for every argument of a predicate.

    Args:
        feature_extractor: object exposing ``extract_features(pred_word, arg_word,
            postag, morph, lemma, syntax_dep_tree)``.
        ling_cache: mapping from example id to its annotation dict (keys used here:
            'tokens', 'sentences', 'postag', 'morph', 'lemma', 'syntax_dep_tree').
        ex_id: id of the example; used to index ``ling_cache`` and stored in the output.
        pred: ``(sentence index, word index)`` of the predicate inside ``example``.
        args: iterable of ``(sentence index, word index)`` pairs, one per argument.
        example: the example as nested sentences of word dicts; argument words
            carry the role under the 'rolepred1' key.

    Returns:
        List of tuples ``(features, role, ex_id, tokens, arg_word, pred_word)`` where
        ``tokens`` are the token texts of the predicate's sentence and the two word
        positions are relative to that sentence.

    Side effects:
        An argument resolved to a different sentence than the predicate is skipped
        and recorded in the module-level ``error_examples`` (one entry per example
        id, so a later mismatch overwrites an earlier one).
    """
    feature_sets = list()

    # Only the offset index is needed; the reconstructed text itself is unused.
    _, offset_index = make_text(example, 0)
    ling_ann = ling_cache[ex_id]

    pred_offset = offset_index[(pred[0], pred[1])]
    pred_ling_sent, pred_ling_word = find_address_by_offset(pred_offset, ling_ann)

    # The sentence tokens depend only on the predicate, so compute them once
    # instead of rebuilding the full token list for every argument.
    sentence = ling_ann['sentences'][pred_ling_sent]
    all_token_texts = [tok.text for tok in ling_ann['tokens']]
    sentence_tokens = all_token_texts[sentence.begin:sentence.end]

    for arg in args:
        arg_offset = offset_index[(arg[0], arg[1])]
        arg_ling_sent, arg_ling_word = find_address_by_offset(arg_offset, ling_ann)

        if arg_ling_sent != pred_ling_sent:
            error_examples[ex_id] = {
                'reason': 'sent_mismatch',
                'arg': arg_ling_sent,
                'pred': pred_ling_sent
            }
            continue

        role = example[arg[0]][arg[1]]['rolepred1']

        features = feature_extractor.extract_features(pred_ling_word,
                                                      arg_ling_word,
                                                      ling_ann['postag'][arg_ling_sent],
                                                      ling_ann['morph'][arg_ling_sent],
                                                      ling_ann['lemma'][arg_ling_sent],
                                                      ling_ann['syntax_dep_tree'][arg_ling_sent])

        # Each record gets its own token list, as before.
        feature_sets.append((features, role, ex_id, list(sentence_tokens), arg_ling_word, pred_ling_word))

    return feature_sets


def process_example(feature_extractor, ling_cache, ex_id, sentences):
    """Locate the predicate and its arguments in an example and extract their features.

    Args:
        feature_extractor: forwarded to ``process_arg_pred``.
        ling_cache: forwarded to ``process_arg_pred``.
        ex_id: id of the example.
        sentences: nested sentences of word dicts. A word whose 'rank' equals
            'Предикат' is the predicate (the last one wins if there are several);
            any other word with a 'rolepred1' key is an argument.

    Returns:
        The feature records produced by ``process_arg_pred``, or an empty list
        when the example has no predicate.
    """
    import warnings

    pred = None
    args = list()
    for sent_num, sent in enumerate(sentences):
        for word_num, word in enumerate(sent):
            if 'rank' in word and word['rank'] == 'Предикат':
                pred = (sent_num, word_num)
            elif 'rolepred1' in word:
                args.append((sent_num, word_num))

    # Without a predicate there is nothing to relate the arguments to. Previously
    # this fell through with pred=None and failed later with an opaque TypeError.
    if pred is None:
        warnings.warn(f"Example {ex_id}: no predicate found, example skipped")
        return []

    return process_arg_pred(feature_extractor, ling_cache, ex_id, pred, args, sentences)


num_of_errors = 0
def prepare_train_data(examples, ling_data_cache, feature_extractor):
    """Run ``process_example`` over all examples and concatenate the results.

    ``examples`` is iterated as ``(example id, sentences)`` pairs. The total
    number of produced records is printed before the list is returned.
    """
    collected = []
    indexed_examples = list(enumerate(examples))
    for _, (example_id, sentences) in tqdm(indexed_examples):
        collected.extend(process_example(feature_extractor, ling_data_cache, example_id, sentences))

    print('Number of examples:', len(collected))
    return collected


def construct_features(examples, ling_data_cache, feature_model):
    """Extract features for all examples and return them as a shuffled DataFrame.

    Each record from ``prepare_train_data`` becomes one row holding the role, the
    example id, the sentence tokens, the argument / predicate positions, and
    every key of every non-None feature dict nested in the record's feature groups.
    """
    rows = []
    records = prepare_train_data(examples, ling_data_cache, feature_model)
    for features, role, ex_id, tokens, arg_address, prd_address in records:
        row = {
            'role': role,
            'ex_id': ex_id,
            'tokens': tokens,
            'arg_address': arg_address,
            'prd_address': prd_address,
        }
        # Flatten the nested feature dicts into the row.
        for group in features:
            for feature_dict in group:
                if feature_dict is not None:
                    row.update(feature_dict)
        rows.append(row)

    # sample(frac=1) returns all rows in random order.
    return pd.DataFrame(rows).sample(frac=1)

In [8]:
# Switch read by the next cell: True selects FeatureModelDefault,
# False selects FeatureModelUnknownPredicates.
known_preds = True  # Choose feature model here

In [9]:
# Pick the feature model and the folder its artifacts are stored in.
if known_preds:
    from isanlp_srl_framebank.processor_srl_framebank import FeatureModelDefault
    feature_model = FeatureModelDefault()
    main_model_path = os.path.join(main_model_path_root, 'known_preds')
else:
    from isanlp_srl_framebank.processor_srl_framebank import FeatureModelUnknownPredicates
    feature_model = FeatureModelUnknownPredicates()
    main_model_path = os.path.join(main_model_path_root, 'unknown_preds')

pd_data = construct_features(examples, ling_data_cache, feature_model)
if not known_preds:
    # The unknown-predicates setting must not see the predicate lemma.
    del pd_data['pred_lemma']

# Create the output folder if needed; open(..., 'wb') fails when it does not exist.
os.makedirs(main_model_path, exist_ok=True)
with open(os.path.join(main_model_path, 'feature_model.pckl'), 'wb') as f:
    pickle.dump(feature_model, f)

HBox(children=(IntProgress(value=0, max=32612), HTML(value='')))


Number of examples: 57552


In [10]:
# Sanity check: (rows, columns) of the extracted feature table.
pd_data.shape

(57552, 21)

In [11]:
# Print N_verify randomly chosen rows for a manual check that the argument and
# predicate positions line up with the sentence tokens.
N_verify = 1
for i in np.random.choice(len(pd_data), size=N_verify):
    print("-"*60)
    obj = pd_data.iloc[i]
    print(f"Argument position: {obj.arg_address}")
    print(f"Argument lemma: {obj.arg_lemma}")
    print("--")
    print(f"Predicat position: {obj.prd_address}")
    # .get() is used because the 'pred_lemma' column is deleted in the unknown-predicates setting.
    print(f"Predicat lemma: {obj.get('pred_lemma')}")
    print("--")
    print(f"Distance {int(obj.dist)}")
    print("--")
    print(f"Sentence tokens: {list(enumerate(obj.tokens))}")

------------------------------------------------------------
Argument position: 7
Argument lemma: контрабас_NOUN
--
Predicat position: 4
Predicat lemma: извлекать_VERB
--
Distance 3
--
Sentence tokens: [(0, 'Зубря'), (1, 'заклинания'), (2, ','), (3, 'Таня'), (4, 'извлекла'), (5, 'из'), (6, 'футляра'), (7, 'контрабас'), (8, ','), (9, 'села'), (10, 'на'), (11, 'него'), (12, 'и'), (13, 'взяла'), (14, 'в'), (15, 'руку'), (16, 'смычок'), (17, '.')]


In [12]:
# List the columns of the feature table.
pd_data.keys()

Index(['Animacy_arg', 'Aspect_arg', 'Gender_arg', 'Number_arg', 'Tense_arg',
       'Valency_arg', 'VerbForm_arg', 'arg_address', 'arg_case', 'arg_lemma',
       'arg_pos', 'dist', 'ex_id', 'prd_address', 'pred_lemma', 'pred_pos',
       'prepos', 'rel_pos', 'role', 'syn_link_name', 'tokens'],
      dtype='object')

In [13]:
# Show a single row as an example of the table content.
pd_data.head(1)

Unnamed: 0,Animacy_arg,Aspect_arg,Gender_arg,Number_arg,Tense_arg,Valency_arg,VerbForm_arg,arg_address,arg_case,arg_lemma,...,dist,ex_id,prd_address,pred_lemma,pred_pos,prepos,rel_pos,role,syn_link_name,tokens
8788,,,,Plur,,,,0,Nom,они_PRON,...,1.0,40335,1,появляться_VERB,VERB,,1.0,тема,nsubj,"[Они, появлялись, и, уходили, .]"


# Preprocess

In [14]:
# Remove every row whose role occurs fewer than 180 times in the whole table.
# NOTE(review): 180 is an unexplained threshold -- consider a named constant.
y_stat = pd_data.role.value_counts()
drop_ys = y_stat[y_stat < 180].index
# Rows are dropped by index label.
pd_data = pd_data.drop(pd_data[pd_data.role.isin(drop_ys)].index)

In [15]:
# Compound role labels that are folded into a single simpler role.
# NOTE(review): this runs after the rare-role filter in the previous cell, so a
# compound label that was itself below the threshold has already been dropped
# rather than merged -- confirm this order is intended.
repl_roles = {
    'агенс - субъект восприятия' : 'субъект восприятия',
    'агенс - субъект ментального состояния' : 'субъект ментального состояния',
    'результат / цель' : 'результат',
    'место - пациенс' : 'место',
    'говорящий - субъект психологического состояния' : 'субъект психологического состояния'
}

pd_data['role'] = pd_data['role'].replace(repl_roles)
    
# Number of distinct roles left after filtering and merging.
number_of_roles = len(pd_data.role.unique())
print('Number of roles: ', number_of_roles)
pd_data.loc[:, 'role'].value_counts()

Number of roles:  44


агенс                                 6147
пациенс                               5362
тема                                  3656
субъект психологического состояния    3250
субъект перемещения                   3011
причина                               2502
говорящий                             2365
место                                 2185
содержание действия                   1874
содержание мысли                      1817
содержание высказывания               1792
конечная точка                        1772
результат                             1452
пациенс перемещения                   1356
стимул                                1271
субъект ментального состояния         1223
адресат                                941
субъект восприятия                     901
контрагент                             831
эффектор                               739
субъект социального отношения          598
начальная точка                        588
предмет высказывания                   548
способ     

In [16]:
# Separate the target column (role) from the feature columns.
y_orig = pd_data.loc[:, 'role']
X_orig = pd_data.drop('role', axis = 1)
X_orig.shape

(52751, 20)

In [17]:
# Boolean row masks over X_orig: a row belongs to a split when its example id
# is in that split's id list.
train_selector_pd = X_orig.ex_id.isin(train_ids)
test_selector_pd = X_orig.ex_id.isin(test_ids)
train_selector = train_selector_pd.values
test_selector = test_selector_pd.values

# Apply the same selector to every element of a list and return the results as a list.
# NOTE: a function with the same name is defined again in the training section
# (that version converts each element with np.array first) and shadows this one.
def select_from_nparray_list(nparray_list, selector):
    return [e[selector] for e in nparray_list]

X_train = select_from_nparray_list([X_orig], train_selector)[0]
y_train = select_from_nparray_list([y_orig], train_selector)[0]

In [18]:
# Same selection as for the training split, using the test mask.
X_test = select_from_nparray_list([X_orig], test_selector)[0]
y_test = select_from_nparray_list([y_orig], test_selector)[0]

In [19]:
from sklearn.preprocessing import LabelBinarizer
import pickle

# Fit the label binarizer on the training roles only and replace y_train by its
# encoded form.
label_encoder = LabelBinarizer()
y_train = label_encoder.fit_transform(y_train)

# Persist the fitted encoder next to the models.
with open(main_model_path + '/label_encoder.pckl', 'wb') as f:
    pickle.dump(label_encoder, f)

In [20]:
# Encode the test roles with the encoder fitted on the training roles.
y_test = label_encoder.transform(y_test)

# Vectorize categorical features

In [21]:
# Columns excluded before the categorical feature names are collected.
columns_to_ommit = ['tokens']

In [27]:
from sklearn.feature_extraction import DictVectorizer
from sklearn.preprocessing import OneHotEncoder

# Earlier hand-written feature lists (morph_feats / all_feats experiments) were
# replaced by the column-based selection below.

# Columns that are not fed to the DictVectorizer; everything else (minus
# columns_to_ommit) is treated as a categorical input.
# NOTE(review): 'dist' and 'prd_address' end up in categ_feats but look numeric;
# DictVectorizer keeps numeric values as-is instead of one-hot encoding them -- confirm intended.
not_categ_features = {'arg_address', 'ex_id', 'rel_pos', 'arg_lemma'}
categ_feats = [name for name in X_train.drop(columns=columns_to_ommit).columns if name not in not_categ_features] 
# Columns appended to the feature matrix unchanged in a later cell.
not_categ = ['rel_pos']
print('Category features:\n', categ_feats)
print('Not category features:\n', not_categ)

# Fit the vectorizer on the training rows only, then encode all rows of X_orig.
vectorizer = DictVectorizer(sparse=False)
vectorizer.fit(X_train[categ_feats].to_dict(orient='records'))
one_hot_feats = vectorizer.transform(X_orig[categ_feats].to_dict(orient='records'))
print(one_hot_feats.shape)

# Persist the fitted vectorizer next to the models.
with open(main_model_path + '/feature_encoder.pckl', 'wb') as f:
    pickle.dump(vectorizer, f)

Category features:
 ['Animacy_arg', 'Aspect_arg', 'Gender_arg', 'Number_arg', 'Tense_arg', 'Valency_arg', 'VerbForm_arg', 'arg_case', 'arg_pos', 'dist', 'prd_address', 'pred_lemma', 'pred_pos', 'prepos', 'syn_link_name']
Not category features:
 ['rel_pos']
(52751, 802)


In [28]:
# Save the encoded training labels (training rows only).
# NOTE(review): the file name "labnpnpels.npy" looks like a typo for "labels.npy";
# confirm nothing else loads it under this name before renaming.
np.save("../../data/labnpnpels.npy", y_train)

In [29]:
# Turn each column listed in not_categ into an (n_rows, 1) array and join them side by side.
not_categ_columns = np.concatenate(tuple(X_orig.loc[:, e].values.reshape(-1, 1) for e in not_categ), axis =1)
# Final plain feature matrix: vectorized features followed by the raw columns.
plain_features = np.concatenate((one_hot_feats, not_categ_columns), axis = 1)
plain_features.shape

(52751, 803)

In [30]:
# Cache the plain feature matrix (all rows of X_orig) on disk.
np.save("../../data/plain_features.npy", plain_features)

# Add embedding features 

In [31]:
from gensim.models import KeyedVectors

# Load the word vectors from a text-format (binary=False) word2vec file.
embeddings_path = '../../data/ruscorpora_upos_skipgram_300_5_2018.vec'
embeddings = KeyedVectors.load_word2vec_format(embeddings_path, binary=False)
print('Embedding size: ', embeddings.vector_size)

Embedding size:  300


In [32]:
import multiprocessing as mp


def make_embeded_form(word):
    """Format an indexable pair as "<second>_<first>"; falsy input is returned unchanged."""
    if not word:
        return word
    return u"{}_{}".format(word[1], word[0])


class Embedder_map:
    """Callable that maps a row index to the embedding of the word stored at that index.

    Instances are passed to ``multiprocessing.Pool.map`` by ``embed``.

    Args:
        embeddings: word-vector model exposing ``vocab`` (mapping with ``get``),
            ``vector_size`` and item access by word.
        X: indexable collection of words.
    """
    def __init__(self, embeddings, X):
        self.X_ = X
        self.embeddings_ = embeddings

    def __call__(self, i):
        """Return the vector of ``X[i]``, or a zero vector when the word is not in the vocabulary."""
        word = self.X_[i]
        # Use the model given to the constructor. The previous code read the
        # notebook-global `embeddings` and silently ignored `self.embeddings_`.
        if self.embeddings_.vocab.get(word):
            return self.embeddings_[word]

        return np.zeros(self.embeddings_.vector_size)


def embed(X):
    """Embed every word of ``X`` with the global ``embeddings`` model using 4 worker processes.

    Args:
        X: indexable collection of words with a length.

    Returns:
        ``np.ndarray`` built from the per-word vectors, in the order of ``X``.
    """
    # The context manager releases the worker processes even when map() raises;
    # previously the pool was only closed on the success path.
    with mp.Pool(4) as pool:
        result = pool.map(Embedder_map(embeddings, X), list(range(len(X))), 1000)
    return np.asarray(result)

In [33]:
%%time

# Word-vector lookup for the argument lemma of every row.
arg_embedded = embed(X_orig['arg_lemma'].values)

CPU times: user 17.8 s, sys: 3.6 s, total: 21.4 s
Wall time: 25.4 s


In [34]:
%%time

# Word-vector lookup for the predicate lemma of every row.
pred_embedded = embed(X_orig['pred_lemma'].values)

CPU times: user 18.1 s, sys: 3.72 s, total: 21.8 s
Wall time: 25.8 s


In [35]:
# Cache the word2vec features on disk.
np.save("../../data/w2v_verbs.npy", pred_embedded)
np.save("../../data/w2v_args.npy", arg_embedded)

In [36]:
# Reload the cached word2vec features (lets the embedding cells above be skipped).
pred_embedded = np.load("../../data/w2v_verbs.npy")
arg_embedded = np.load("../../data/w2v_args.npy")

In [37]:
from deeppavlov.models.embedders.elmo_embedder import ELMoEmbedder

# ELMo embedder built from the archive at the given URL; only the 'elmo' output is requested.
elmo = ELMoEmbedder("http://files.deeppavlov.ai/deeppavlov_data/elmo_ru-wiki_600k_steps.tar.gz", elmo_output_names=['elmo'])

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package perluniprops to /root/nltk_data...
[nltk_data]   Package perluniprops is already up-to-date!
[nltk_data] Downloading package nonbreaking_prefixes to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package nonbreaking_prefixes is already up-to-date!
Using TensorFlow backend.


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


In [38]:
def elmo_embed(embeddings, tokens, word_idx1, word_idx2):
    """Embed one tokenized sentence and return the vectors at two token positions.

    ``embeddings`` is called with a one-sentence batch ``[tokens]`` and its first
    result is indexed. Positions beyond the last token are clamped to the last token.
    """
    sentence_vectors = embeddings([tokens])[0]
    last_position = len(tokens) - 1
    first_pos = min(word_idx1, last_position)
    second_pos = min(word_idx2, last_position)
    return sentence_vectors[first_pos], sentence_vectors[second_pos]

In [39]:
# Smoke test: embed the predicate and the argument of one arbitrary row (position 38).
obj = X_orig.iloc[38]
verb_idx = obj.prd_address
arg_idx = obj.arg_address
tokens = obj.tokens
embed_verb, embed_arg = elmo_embed(elmo, tokens, verb_idx, arg_idx)

In [None]:
%%time
# ELMo vectors of the predicate and argument token for every row of X_orig.
# Both lists must keep exactly one entry per row, because they are later indexed
# with boolean masks computed over X_orig.
embedded_verbs = []
embedded_args  = []
for i in tqdm(range(len(X_orig))):
    try:
        if i % 100 == 0:
            # Progress is appended to a log file every 100 rows.
            with open("./log.txt", 'a', encoding='utf-8') as log:
                print(f"Processed {i} examples", file=log)
        obj = X_orig.iloc[i]
        verb_idx = obj.prd_address
        arg_idx = obj.arg_address
        tokens = obj.tokens
        embed_verb, embed_arg = elmo_embed(elmo, tokens, verb_idx, arg_idx)
    except Exception as e:
        with open("./log.txt", 'a', encoding='utf-8') as log:
            print(f"Error while processing example {i}={X_orig.iloc[i]}: {e}", file=log)
        # Keep the row: previously a failure appended nothing, which shifted all
        # following rows and broke the alignment with X_orig. Zero vectors act as
        # the placeholder, like the out-of-vocabulary case of the word2vec features.
        embed_verb = np.zeros(elmo.dim)
        embed_arg = np.zeros(elmo.dim)
    embedded_verbs.append(embed_verb)
    embedded_args.append(embed_arg)

HBox(children=(IntProgress(value=0, max=52751), HTML(value='')))

In [65]:
# Stack the per-row vectors into 2-D arrays with one row per example.
e_verbs = np.stack(embedded_verbs)
e_args  = np.stack(embedded_args)

print(e_verbs.shape, e_args.shape)

(52751, 1024) (52751, 1024)


In [66]:
# Cache the ELMo features on disk.
np.save("../../data/elmo_verbs.npy", e_verbs)
np.save("../../data/elmo_args.npy", e_args)

In [41]:
# Reload the cached ELMo features (lets the slow embedding loop above be skipped).
e_verbs = np.load("../../data/elmo_verbs.npy")
e_args = np.load("../../data/elmo_args.npy")

# Construct the models

In [42]:
from tensorflow.python.keras.models import Sequential
from tensorflow.python.keras.layers import Dense, LSTM, Convolution1D, Dropout, MaxPooling1D
from tensorflow.python.keras.preprocessing import sequence
from tensorflow.python.keras.layers import Flatten
from tensorflow.python.keras.layers import Input
from tensorflow.python.keras.layers import TimeDistributed
from tensorflow.python.keras.layers import Activation
from tensorflow.python.keras.layers import RepeatVector
from tensorflow.python.keras.layers import Permute
from tensorflow.python.keras.layers import Lambda
from tensorflow.python.keras.models import Model
from tensorflow.python.keras.callbacks import EarlyStopping
from tensorflow.python.keras.layers import BatchNormalization
from tensorflow.python.keras.layers import Concatenate
from tensorflow.python.keras.layers import Bidirectional
from tensorflow.python.keras.layers import Masking
from gensim.models import Word2Vec

In [43]:
def construct_plain_model(input_shape, n_roles=None):
    """Build and compile a feed-forward classifier over a single feature vector.

    Architecture: Dense(600, relu) -> Dropout(0.3) -> Dense(400) -> BatchNorm ->
    relu -> Dropout(0.3) -> Dense(n_roles) -> BatchNorm -> softmax.

    Args:
        input_shape: shape tuple of one input sample, passed to the first Dense layer.
        n_roles: size of the output layer. Defaults to the notebook-global
            ``number_of_roles`` (the previous hard-wired behaviour); pass it
            explicitly to match the label matrix, as ``construct_plain_model_sparse`` does.

    Returns:
        The compiled Keras model (adam, categorical cross-entropy, accuracy metric).
    """
    print('Plain model.')

    output_dim = number_of_roles if n_roles is None else n_roles

    plain_model = Sequential()
    plain_model.add(Dense(600,
                          input_shape = input_shape,
                          activation = 'relu'))
    plain_model.add(Dropout(0.3))

    plain_model.add(Dense(400))
    plain_model.add(BatchNormalization())
    plain_model.add(Activation('relu'))
    plain_model.add(Dropout(0.3))

    plain_model.add(Dense(output_dim))
    plain_model.add(BatchNormalization())
    plain_model.add(Activation('softmax'))

    plain_model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

    return plain_model

In [44]:
def construct_plain_model_sparse(categ_size, emb_size, number_of_roles):
    """Build and compile the three-input role classifier.

    Inputs, in the order the model expects them: argument embedding, predicate
    embedding (both of size ``emb_size``) and the plain feature vector
    (size ``categ_size``). Each input goes through its own Dense -> BatchNorm ->
    relu block; the results are concatenated and passed through a 400-unit
    hidden block into a softmax layer of ``number_of_roles`` units.

    Returns:
        The compiled Keras model (adam, categorical cross-entropy, accuracy metric).
    """
    def dense_bn(tensor, units, activation):
        """Dense -> BatchNormalization -> Activation."""
        tensor = Dense(units)(tensor)
        tensor = BatchNormalization()(tensor)
        return Activation(activation)(tensor)

    categorical_in = Input(shape=(categ_size,), name = 'input_categorical')
    pred_in = Input(shape=(emb_size,), name = 'pred_embed')
    arg_in = Input(shape=(emb_size,), name = 'arg_embed')

    # Layers are created in the same order as before: plain, predicate, argument.
    categorical_branch = dense_bn(categorical_in, 400, 'relu')
    pred_branch = dense_bn(pred_in, 100, 'relu')
    arg_branch = dense_bn(arg_in, 100, 'relu')

    merged = Concatenate(axis = 1)([pred_branch, arg_branch, categorical_branch])
    merged = Dropout(0.3)(merged)
    hidden = dense_bn(merged, 400, 'relu')
    hidden = Dropout(0.3)(hidden)
    output = dense_bn(hidden, number_of_roles, 'softmax')

    network = Model([arg_in, pred_in, categorical_in], output)
    network.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

    return network

# Train and save the models 

### For known preds 

In [45]:
def select_from_nparray_list(nparray_list, selector):
    """Convert each element to an ndarray and index it with ``selector``.

    Returns a list with one selected array per input element. This redefines
    (and shadows) the helper of the same name from the train/test split section,
    which indexes its elements without converting them first.
    """
    selected = []
    for item in nparray_list:
        selected.append(np.array(item)[selector])
    return selected

In [46]:
# Train, evaluate and save the classifier on ELMo embeddings + plain features.
MODEL_NAME = 'test_model_elmo.h5'
VEC_SIZE = elmo.dim

model = construct_plain_model_sparse(plain_features.shape[1], VEC_SIZE, y_train.shape[1])
model.summary()
# Inputs are given in the model's input order: argument, predicate, plain features.
# 10% of the training rows are held out by Keras for validation.
model.fit(select_from_nparray_list([e_args, e_verbs, plain_features], train_selector),
          y_train, 
          epochs=10, batch_size=64, validation_split = 0.1, shuffle=True)

# Loss and metrics on the held-out test split.
print(model.metrics_names)
print(model.evaluate(select_from_nparray_list([e_args, e_verbs, plain_features], test_selector), 
               y_test))
model.save(os.path.join(main_model_path, MODEL_NAME))

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
pred_embed (InputLayer)         (None, 1024)         0                                            
__________________________________________________________________________________________________
arg_embed (InputLayer)          (None, 1024)         0                                            
__________________________________________________________________________________________________
input_categorical (InputLayer)  (None, 803)          0                                            
__________________________________________________________________________________________________
dense_1 (Dense)                 (None, 100)          102500      pred_embed[0][0]                 
__________________________________________________________________________________________________
dense_2 (D

In [168]:
# Train, evaluate and save the classifier on word2vec embeddings + plain features.
MODEL_NAME = 'test_model_w2v.h5'
VEC_SIZE = embeddings.vector_size

# VEC_SIZE is assigned above but the call passes embeddings.vector_size directly (same value).
model = construct_plain_model_sparse(plain_features.shape[1], embeddings.vector_size, y_train.shape[1])
# Inputs are given in the model's input order: argument, predicate, plain features.
model.fit(select_from_nparray_list([arg_embedded, pred_embedded, plain_features], train_selector),
          y_train, 
          epochs=15, batch_size=300, validation_split = 0.1, shuffle=True)

# Loss and metrics on the held-out test split.
print(model.metrics_names)
print(model.evaluate(select_from_nparray_list([arg_embedded, pred_embedded, plain_features], test_selector), 
               y_test))
model.save(os.path.join(main_model_path, MODEL_NAME))

Train on 37972 samples, validate on 4220 samples
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15
['loss', 'acc']
[0.7371134754834454, 0.7891845818055445]


### For unknown preds

In [252]:
# Train, evaluate and save the ELMo-based classifier for the unknown-predicates setting.
MODEL_NAME = 'test_model_elmo.h5'
VEC_SIZE = elmo.dim

# This cell used to call `construct_plain_model_sparse_test`, which is not defined
# anywhere in the notebook and fails with NameError on a fresh kernel. The builder
# defined above is used instead.
# NOTE(review): confirm the removed *_test variant had the same architecture.
model = construct_plain_model_sparse(plain_features.shape[1], VEC_SIZE, y_train.shape[1])
model.summary()
# Inputs are given in the model's input order: argument, predicate, plain features.
model.fit(select_from_nparray_list([e_args, e_verbs, plain_features], train_selector),
          y_train,
          epochs=10, batch_size=64, validation_split = 0.1, shuffle=True)

# Loss and metrics on the held-out test split.
print(model.metrics_names)
print(model.evaluate(select_from_nparray_list([e_args, e_verbs, plain_features], test_selector),
               y_test))
model.save(os.path.join(main_model_path, MODEL_NAME))

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
pred_embed (InputLayer)         (None, 1024)         0                                            
__________________________________________________________________________________________________
arg_embed (InputLayer)          (None, 1024)         0                                            
__________________________________________________________________________________________________
input_categorical (InputLayer)  (None, 178)          0                                            
__________________________________________________________________________________________________
dense_210 (Dense)               (None, 100)          102500      pred_embed[0][0]                 
__________________________________________________________________________________________________
dense_211 

In [232]:
# Train, evaluate and save the word2vec-based classifier for the unknown-predicates setting.
MODEL_NAME = 'test_model_w2v.h5'
VEC_SIZE = embeddings.vector_size

model = construct_plain_model_sparse(plain_features.shape[1], VEC_SIZE, y_train.shape[1])
model.summary()
# Inputs are given in the model's input order: argument, predicate, plain features.
model.fit(select_from_nparray_list([arg_embedded, pred_embedded, plain_features], train_selector),
          y_train, 
          epochs=15, batch_size=300, validation_split = 0.1, shuffle=True)

# Loss and metrics on the held-out test split.
print(model.metrics_names)
print(model.evaluate(select_from_nparray_list([arg_embedded, pred_embedded, plain_features], test_selector), 
               y_test))
model.save(os.path.join(main_model_path, MODEL_NAME))

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
pred_embed (InputLayer)         (None, 300)          0                                            
__________________________________________________________________________________________________
arg_embed (InputLayer)          (None, 300)          0                                            
__________________________________________________________________________________________________
input_categorical (InputLayer)  (None, 178)          0                                            
__________________________________________________________________________________________________
dense_156 (Dense)               (None, 100)          30100       pred_embed[0][0]                 
__________________________________________________________________________________________________
dense_157 

# Generate a LaTeX table with per-role performance

#### Map roles as given in https://github.com/olesar/framebank/blob/master/framebank_roles_ru_eng.md 

In [391]:
# Russian role label -> English name used in the generated LaTeX table.
# 'avg' is mapped to itself.
en_vocab = {
    'avg': 'avg',
    'предмет мысли': 'topic of thought',
    'результат': 'result',
    'потенциальная угроза': 'potential threat',
    'контрагент': 'counteragent',
    'агенс': 'agent',
    'каузатор': 'causer',
    'пациенс': 'patient',
    'ситуация в фокусе': 'situation in focus',
    'конечный посессор': 'recipient',
    'тема': 'theme',
    'эффектор': 'effector',
    'способ': 'manner',
    'сфера': 'field',
    'траектория': 'path',
    'цель': 'goal',
    'признак': 'attribute',
    'субъект социального отношения': 'subject of social attitude',
    'пациенс социального отношения': 'patient of social attitude',
    'субъект поведения': 'behaver',
    'статус': 'status',
    'исходный посессор': 'initial possessor',
    'контрагент социального отношения': 'counteragent of social attitude',
    'потенциальный пациенс': 'potential patient',
    'пациенс перемещения': 'patient of motion',
    'содержание мысли': 'content of thought',
    'содержание действия': 'content of action',
    'субъект ментального состояния': 'cognizer',
    'стимул': 'stimulus',
    'признак действия': 'attribute of action',
    'эталон': 'standard',
    'субъект психологического состояния': 'sbj of psychol. state',
    'срок': 'term',
    'субъект перемещения': 'goer',
    'говорящий': 'speaker',
    'конечная точка': 'final destination',
    'причина': 'cause',
    'источник звука': 'source of sound',
    'предмет высказывания': 'topic of speech',
    'адресат': 'addressee',
    'место': 'location',
    'субъект восприятия': 'perceiver',
    'субъект физиологической реакции': 'sbj of physiol. reaction',
    'начальная точка': 'initial point',
    'содержание высказывания': 'content of speech'
}

#### Number of examples per role

In [394]:
# Role -> number of rows with that role in the filtered table (train and test together).
roles_counts = dict(pd_data.loc[:, 'role'].value_counts())

#### Generate report table with sklearn 

In [396]:
from sklearn.metrics import classification_report

# NOTE(review): `model` is whichever model was trained last. It has to be an
# ELMo-based one here, because ELMo features (1024-d per the summaries above) are fed to it.
y_pred = model.predict(select_from_nparray_list([e_args, e_verbs, plain_features], test_selector))

# `target_names` was never defined in this notebook (NameError on a fresh kernel).
# The role names known to the label encoder are used for it, and are also passed
# as `labels` so that every row of the report is tied to its own role name.
target_names = list(label_encoder.classes_)
report = classification_report(label_encoder.inverse_transform(y_test), label_encoder.inverse_transform(y_pred),
                               labels=target_names, target_names=target_names, digits=4)

#### Generate latex table 

In [399]:
import collections


def parse_classification_report(clfreport):
    """Turn the text of a sklearn classification report into an ordered mapping.

    The report must consist of a header line, one line per class and a single
    trailing line whose first word is 'avg'.

    Returns:
        ``OrderedDict`` mapping class name -> ``[precision, recall, f-score, share]``,
        all multiplied by 100 and ordered by ``share`` in descending order. ``share``
        is not the report's support column: it is ``roles_counts[class] / len(pd_data)``,
        taken from the notebook globals. A class named 'avg' is left out.
    """
    # Blank lines carry no information.
    rows = [row for row in clfreport.split('\n') if row.strip()]

    # Layout: header, per-class rows, one average row.
    header, cls_lines, avg_line = rows[0], rows[1:-1], rows[-1]

    assert header.split() == ['precision', 'recall', 'f1-score', 'support']
    assert avg_line.split()[0] == 'avg'

    # Class names may contain spaces, so the name column is cut by width: it is
    # as wide as the indentation in front of the 'precision' header.
    cls_field_width = len(header) - len(header.lstrip())

    def parse_line(l):
        """Split one class row into (name, precision, recall, f-score, share)."""
        cls_name = l[:cls_field_width].strip()
        precision, recall, fscore, _ = l[cls_field_width:].split()
        share = roles_counts[cls_name]/len(pd_data)
        return (cls_name, float(precision), float(recall), float(fscore), share)

    scored = collections.OrderedDict()
    for line in cls_lines:
        parsed = parse_line(line)
        scored[parsed[0]] = [value * 100. for value in parsed[1:]]

    # Most frequent roles first (the last element of each list is the share).
    ranked = sorted(scored.items(), key=lambda item: item[1][-1], reverse=True)

    return collections.OrderedDict(
        (name, values) for name, values in ranked if name != 'avg'
    )

def report_to_latex_table(data, percentage=True):
    """Render parsed per-class scores as a LaTeX ``tabular`` environment.

    Args:
        data: mapping class name -> list of scores; the last element of each list
            is dropped and the remaining ones are printed rounded to one decimal.
        percentage: when True, the class name is followed by the share of that
            role in ``pd_data`` (from the global ``roles_counts``) in percent.

    Returns:
        The table as a string. Class names are translated through the global ``en_vocab``.
    """
    # Backslashes are written as explicit "\\": the former "\hline" and "\%" only
    # worked through Python's invalid-escape fallback, which is deprecated.
    parts = [
        "\\begin{tabular}{l|c|c|c}\n",
        "\\hline\n",
        "\\bf Class & \\bf Precision & \\bf Recall & \\bf F-score \\\\\n",
        "\\hline\n",
    ]
    for cls, scores in data.items():
        label = en_vocab[cls]
        if percentage:
            share = round(roles_counts[cls]/len(pd_data)*100, 1)
            label += f" ({share}\\%)"
        cells = " & ".join(str(round(s, 1)) for s in scores[:-1])
        parts.append(label + " & " + cells + "\\\\\n")
    parts.append("\\end{tabular}")
    return "".join(parts)

# Parse the text report and print it as a LaTeX table.
data = parse_classification_report(report)
print(report_to_latex_table(data))

\begin{tabular}{l|c|c|c}
\hline
\bf Class & \bf Precision & \bf Recall & \bf F-score \\
\hline
agent (11.7\%) & 76.1 & 83.3 & 79.5\\
patient (10.2\%) & 85.1 & 88.7 & 86.9\\
theme (6.9\%) & 84.6 & 71.6 & 77.6\\
sbj of psychol. state (6.2\%) & 86.7 & 83.9 & 85.2\\
goer (5.7\%) & 82.9 & 89.2 & 85.9\\
cause (4.7\%) & 86.2 & 88.6 & 87.4\\
speaker (4.5\%) & 73.5 & 78.3 & 75.8\\
location (4.1\%) & 87.4 & 82.5 & 84.9\\
content of action (3.6\%) & 89.1 & 83.8 & 86.3\\
content of thought (3.4\%) & 74.6 & 79.7 & 77.0\\
content of speech (3.4\%) & 75.9 & 69.5 & 72.6\\
final destination (3.4\%) & 70.3 & 52.0 & 59.8\\
result (2.8\%) & 63.5 & 54.0 & 58.4\\
patient of motion (2.6\%) & 88.8 & 80.4 & 84.4\\
stimulus (2.4\%) & 85.1 & 72.2 & 78.1\\
cognizer (2.3\%) & 85.1 & 76.9 & 80.8\\
addressee (1.8\%) & 75.7 & 79.1 & 77.4\\
perceiver (1.7\%) & 90.5 & 79.0 & 84.3\\
counteragent (1.6\%) & 56.8 & 65.6 & 60.9\\
effector (1.4\%) & 77.0 & 81.0 & 78.9\\
subject of social attitude (1.1\%) & 82.2 & 79.5 & 80.8