In [9]:
import sys
sys.path.append("../")

In [11]:
!ls

embedding.py		 pipeline_eval.ipynb
errors_w2v_x_w2v.pkl	 prepare_dataset.ipynb
evaluate pipeline.ipynb  prepare_dataset.py
extract_features.ipynb	 README.md
extract_features.py	 resplit data & retrain models.ipynb
feature_modeling.py	 sample.log
ling_parse.py		 train_framebank_parser.ipynb
log_idx.txt		 training_utils.py
log.txt			 train_model.ipynb
ModelEvaluation.ipynb	 vectorization.py
models.py		 vectorize_data.ipynb


In [12]:
import json
from pprint import pprint as print_
from collections import OrderedDict
import numpy as np
from numpy.random import RandomState
from tqdm import tqdm_notebook as tqdm

from isanlp_srl_framebank.pipeline_default import PipelineDefault

In [13]:
# Seed used by random_texts / random_predictions when they sub-sample the corpus.
random_seed = 41

In [14]:
# The 44 semantic role labels that are scored by default: compute_metrics uses
# this set as its default `roleset` and ignores gold arguments whose tag is not in it.
roleset44 = {
 'содержание высказывания',
 'говорящий',
 'субъект социального отношения',
 'субъект психологического состояния',
 'содержание действия',
 'агенс',
 'тема',
 'конечная точка',
 'сфера',
 'контрагент',
 'субъект перемещения',
 'причина',
 'субъект поведения',
 'ситуация в фокусе',
 'исходный посессор',
 'субъект физиологической реакции',
 'адресат',
 'пациенс',
 'срок',
 'источник звука',
 'место',
 'признак',
 'потенциальная угроза',
 'субъект ментального состояния',
 'конечный посессор',
 'результат',
 'стимул',
 'субъект восприятия',
 'эффектор',
 'траектория',
 'содержание мысли',
 'пациенс перемещения',
 'каузатор',
 'предмет высказывания',
 'начальная точка',
 'способ',
 'пациенс социального отношения',
 'статус',
 'предмет мысли',
 'цель',
 'потенциальный пациенс',
 'контрагент социального отношения',
 'эталон',
 'признак действия'}

In [15]:
# Part-of-speech tags a gold argument must carry to be scored:
# compute_metrics drops gold arguments whose 'postag' is not in this set.
ARGUMENT_POSTAGS = {
        'NOUN',
        'PRON',
        'ADJ',
        'PROPN'
    }

In [16]:
def get_roles_pred(lemma, role_annot, part_id):
    """Convert the SRL events of one sentence into plain dictionaries.

    Args:
        lemma: per-sentence lists of lemmas; ``lemma[part_id][k]`` is the lemma
            of token ``k``.
        role_annot: per-sentence lists of events; every event exposes ``pred``
            (its first element is used as the predicate token index) and
            ``args`` (objects with ``tag`` and ``begin``).
        part_id: index of the sentence to convert.

    Returns:
        ``(predicates, arguments)``: ``predicates`` maps a predicate token index
        to ``{'lemma': ...}``; ``arguments`` maps the same index to a list of
        ``{'tag', 'lemma', 'idx'}`` dicts, one per argument of the event.
    """
    predicates, arguments = {}, {}
    for event in role_annot[part_id]:
        pred_idx = event.pred[0]
        predicates[pred_idx] = {'lemma': lemma[part_id][pred_idx]}
        # A later event with the same predicate index replaces the earlier entry.
        arguments[pred_idx] = [
            {'tag': arg.tag, 'lemma': lemma[part_id][arg.begin], 'idx': arg.begin}
            for arg in event.args
        ]

    return predicates, arguments


def get_example(corpus, ex_number, part_id):
    """Build the text of one corpus sentence from its token forms.

    Reads ``obj['form']`` of every token in ``corpus[ex_number][1][part_id]``,
    removes a space directly preceding any of ``: ; , . ! ?`` *inside* a form,
    and joins the forms with single spaces.

    Returns:
        The joined text, or ``'_'`` when the sentence has no tokens.
    """
    def _tighten(form):
        # Same left-to-right sequence of replacements for every form.
        for mark in ':;,.!?':
            form = form.replace(' ' + mark, mark)
        return form

    forms = [_tighten(token['form']) for token in corpus[ex_number][1][part_id]]
    return ' '.join(forms) if forms else '_'


def get_roles_true(annot, corpus, ex_number, part_id):
    """Collect gold predicates and arguments of one corpus sentence.

    Args:
        annot: pipeline output for the sentence text; only ``annot['postag']``
            (a list of per-sentence tag lists) is read. It is flattened and
            indexed by the corpus token position.
        corpus: annotated corpus; tokens are the dicts in
            ``corpus[ex_number][1][part_id]``.
        ex_number: index of the example in ``corpus``.
        part_id: index of the sentence inside the example.

    Returns:
        ``(predicates, arguments)``: ``predicates`` maps the position of every
        token whose 'rank' is 'Предикат' to ``{'lemma': ...}``; ``arguments``
        maps each 'fillpred' value to the list of ``{'lemma', 'tag', 'idx',
        'postag'}`` dicts of the other ranked tokens.
    """
    predicates, arguments = {}, {}
    flat_postags = [tag for sentence in annot['postag'] for tag in sentence]

    for position, token in enumerate(corpus[ex_number][1][part_id]):
        if 'rank' not in token:
            continue  # unranked tokens are neither predicates nor arguments

        if token['rank'] == 'Предикат':
            predicates[position] = {'lemma': token['lemma']}
            continue

        argument = {
            # Fall back to the surface form when the token carries no lemma.
            'lemma': token['lemma'] if 'lemma' in token else token['form'],
            'tag': token['rolepred1'],
            'idx': position,
            'postag': flat_postags[position],
        }
        arguments.setdefault(token['fillpred'], []).append(argument)

    return predicates, arguments

In [17]:
def random_texts(corpus, ppl, n_samples=100):
    """Return the texts of a seeded sample of corpus examples.

    Args:
        corpus: annotated corpus, indexed as expected by ``get_example``.
        ppl: unused; kept so the signature mirrors ``random_predictions``.
        n_samples: sample size. When the corpus has more examples than this,
            ``n_samples`` indices are drawn with ``random_seed``; otherwise all
            examples are used in their original order.

    Returns:
        List with the text of sentence 0 of every selected example.
    """
    if len(corpus) > n_samples:
        # A local generator yields the same draw as
        # np.random.seed(random_seed) + np.random.choice(...), but does not
        # reseed the process-wide NumPy RNG as a side effect.
        # NOTE: choice() samples with replacement by default, so an index may repeat.
        rng = np.random.RandomState(random_seed)
        samples_idxs = rng.choice(len(corpus), size=n_samples)
    else:
        samples_idxs = list(range(len(corpus)))

    return [get_example(corpus, ex_num, 0) for ex_num in samples_idxs]

In [18]:
def random_predictions(corpus, ppl, n_samples=100):
    """Run the pipeline on a seeded sample of the corpus and collect roles.

    Args:
        corpus: annotated corpus, indexed as expected by ``get_example`` and
            ``get_roles_true``.
        ppl: callable pipeline; ``ppl(text)`` must return a mapping with at
            least 'lemma', 'srl' and 'postag'.
        n_samples: sample size. When the corpus has more examples than this,
            ``n_samples`` indices are drawn with ``random_seed``; otherwise all
            examples are processed in their original order.

    Returns:
        ``(true_roles, pred_roles, texts)`` — three parallel lists: gold
        ``(predicates, arguments)`` pairs, predicted pairs, and the analysed texts.
        Composite gold role tags are collapsed to a single role (see ``repl_roles``).
    """
    if len(corpus) > n_samples:
        # A local generator yields the same draw as
        # np.random.seed(random_seed) + np.random.choice(...), but does not
        # reseed the process-wide NumPy RNG as a side effect.
        # NOTE: choice() samples with replacement by default, so an index may repeat.
        rng = np.random.RandomState(random_seed)
        samples_idxs = rng.choice(len(corpus), size=n_samples)
    else:
        samples_idxs = list(range(len(corpus)))

    texts = [get_example(corpus, ex_num, 0) for ex_num in samples_idxs]

    annotations = [ppl(text) for text in tqdm(texts, desc='Analyzing texts')]
    pred_roles = [get_roles_pred(res['lemma'], res['srl'], 0) for res in annotations]

    true_roles = [get_roles_true(annotation, corpus, ex_num, 0)
                  for annotation, ex_num in zip(annotations, samples_idxs)]

    # Composite gold labels and the single role each one is scored as.
    repl_roles = {
        'агенс - субъект восприятия' : 'субъект восприятия',
        'агенс - субъект ментального состояния' : 'субъект ментального состояния',
        'результат / цель' : 'результат',
        'место - пациенс' : 'место',
        'говорящий - субъект психологического состояния' : 'субъект психологического состояния'
    }

    for _, gold_arguments in true_roles:
        for args in gold_arguments.values():
            for arg in args:
                for role, val in repl_roles.items():
                    arg['tag'] = arg['tag'].replace(role, val)

    return true_roles, pred_roles, texts


def compute_metrics(y_pred, y_true, report_to=sys.stdout, roleset=roleset44, idxmatching=False):
    """Score predicted arguments against gold arguments and log every comparison.

    Only predicates whose token index occurs in both the gold and the predicted
    predicates of an example are scored. Gold arguments are kept only if their
    'tag' is in ``roleset`` and their 'postag' is in ``ARGUMENT_POSTAGS``. A
    matched predicate without any such gold argument is skipped entirely, so
    its predicted arguments do not count towards precision either.

    Args:
        y_pred: per-example ``(predicates, arguments)`` pairs; ``arguments``
            maps a predicate index to a list of dicts with 'idx' and 'tag'.
        y_true: per-example pairs of the same layout; gold argument dicts must
            also carry 'postag'. ``y_pred[i]`` is compared with ``y_true[i]``.
        report_to: writable text stream that receives the detailed log.
        roleset: collection of role tags that are scored.
        idxmatching: if true, a predicted argument matches a gold one as soon
            as their token indices are equal; otherwise the tags must be equal too.

    Returns:
        dict with 'recall', 'precision', 'f1' and 'errors'. 'errors' lists one
        report (example index, predicate, gold and predicted arguments) per
        scored predicate whose match count differs from its number of gold
        arguments. A metric whose denominator is zero is reported as 0.0.
    """
    def print_func(message):
        print(message, file=report_to)

    true_positive = 0
    condition_positive = 0
    predicted_condition_positive = 0
    error_examples = []

    for i, (true_predicates, true_arguments) in enumerate(y_true):
        print_func(f"Inspecting example {i}")
        print_func(f"Expecting true predicates {true_predicates}")
        print_func(f"Expecting true arguments  {true_arguments}")

        pred_predicates, pred_arguments = y_pred[i]

        print_func(f"Got predicted predicates  {pred_predicates}")
        print_func(f"Got predicted arguments   {pred_arguments}")

        print_func("-"*60)

        for true_pred_idx, true_predicate in true_predicates.items():
            if true_pred_idx not in pred_predicates:
                continue

            print_func(f"Matched predicate {true_pred_idx} = {true_predicate}")

            # A gold predicate may have no argument entry at all; treat that
            # as "no arguments" instead of failing with KeyError.
            gold_candidates = true_arguments.get(true_pred_idx, [])
            pred_arguments_i = pred_arguments.get(true_pred_idx, [])

            true_arguments_i = [
                candidate for candidate in gold_candidates
                if candidate['tag'] in roleset and candidate['postag'] in ARGUMENT_POSTAGS
            ]
            if not true_arguments_i:
                continue

            condition_positive_i = len(true_arguments_i)

            print_func(f"Expecting arguments  {true_arguments_i}")
            print_func(f"Got predicted        {pred_arguments_i}")
            print_func(f"Predicted Condition Positive = {len(pred_arguments_i)}")
            print_func(f"Condition Positive           = {condition_positive_i}")
            condition_positive += condition_positive_i
            predicted_condition_positive += len(pred_arguments_i)

            # Every (gold, predicted) pair that agrees counts once.
            true_positive_i = 0
            for gold in true_arguments_i:
                for obj_pred in pred_arguments_i:
                    if obj_pred['idx'] != gold['idx']:
                        continue
                    if idxmatching or obj_pred['tag'] == gold['tag']:
                        true_positive_i += 1

            print_func(f"True Positive = {true_positive_i}")
            if true_positive_i != condition_positive_i:
                error_examples.append({
                    'example_idx' : i,
                    'predicate': true_predicate,
                    'true_arguments' : true_arguments_i,
                    'predicted_arguments': pred_arguments_i
                })

            true_positive += true_positive_i

        print_func("="*60)

    # Guard the denominators: nothing scored / nothing predicted / nothing matched.
    recall = true_positive/condition_positive if condition_positive else 0.0
    precision = true_positive/predicted_condition_positive if predicted_condition_positive else 0.0
    f1 = 2 * ((precision*recall)/(precision+recall)) if (precision + recall) else 0.0

    return {
        'recall': recall,
        'precision': precision,
        'f1': f1,
        'errors': error_examples
    }

In [19]:
# Alternative inputs, currently disabled:
#corpus_name = 'annotated_corpus_fixed+syntaxnet.json'
#corpus_path = '../../data/cleared_corpus.json'
# Test split; the helper functions index it as corpus[example][1][part] -> token dicts.
corpus_path = '../../data/test_data.json'
with open(corpus_path, 'r', encoding='utf-8') as f:
    corpus = json.load(f)

In [20]:
len(corpus)

6523

In [21]:
import isanlp

In [22]:
! pip install -U git+https://github.com/tchewik/isanlp_srl_framebank.git

Collecting git+https://github.com/tchewik/isanlp_srl_framebank.git
  Cloning https://github.com/tchewik/isanlp_srl_framebank.git to /tmp/pip-req-build-fktpanay
  Running command git clone -q https://github.com/tchewik/isanlp_srl_framebank.git /tmp/pip-req-build-fktpanay
  Running command git submodule update --init --recursive -q
Building wheels for collected packages: isanl-srl-framebank
  Building wheel for isanl-srl-framebank (setup.py) ... [?25ldone
[?25h  Created wheel for isanl-srl-framebank: filename=isanl_srl_framebank-0.0.1-cp36-none-any.whl size=13502 sha256=9267a0b605f1081d98633170c9aa41256a21d7e9ea9fe4205ecb3351f829d31c
  Stored in directory: /tmp/pip-ephem-wheel-cache-tt7lzgos/wheels/53/85/ca/d8697e680129d7344263e11de6f9c43c55721e83038ff90d95
Successfully built isanl-srl-framebank
Installing collected packages: isanl-srl-framebank
  Found existing installation: isanl-srl-framebank 0.0.1
    Uninstalling isanl-srl-framebank-0.0.1:
      Successfully uninstalled isanl-srl-

### Metrics of parser quality

In [23]:
# Remote services used by the pipeline: morphology and syntax run on `host`,
# SRL on `host3`. `host2` is defined but not referenced in this cell.
host = 'vmh2.isa.ru'
host2 = 'echistova.isa.ru'
host3 = 'tsa05.isa.ru'

ppl = PipelineDefault(address_morph=(host, 4333),
                       address_syntax=(host, 4344),
                       address_srl=(host3, 4335))

In [826]:
ppl('Мама мыла раму.')

{'text': 'Мама мыла раму.',
 'tokens': [<isanlp.annotation.Token at 0x7f031ad3e9e8>,
  <isanlp.annotation.Token at 0x7f031ad3ef60>,
  <isanlp.annotation.Token at 0x7f031ad3e860>,
  <isanlp.annotation.Token at 0x7f031ad3e7b8>],
 'sentences': [<isanlp.annotation.Sentence at 0x7f031ad3e470>],
 'mystem_postag': [['S,жен,од=им,ед',
   'V,несов,пе=прош,ед,изъяв,жен',
   'S,жен,неод=вин,ед',
   '']],
 'lemma': [['мама', 'мыть', 'рама', '.']],
 'syntax_dep_tree': [[<isanlp.annotation.WordSynt at 0x7f0317251dd8>,
   <isanlp.annotation.WordSynt at 0x7f0317251f60>,
   <isanlp.annotation.WordSynt at 0x7f03172517f0>,
   <isanlp.annotation.WordSynt at 0x7f0317251cf8>]],
 'morph': [[{'fPOS': 'NOUN',
    'Gender': 'Fem',
    'Animacy': 'Anim',
    'Case': 'Nom',
    'Number': 'Sing'},
   {'fPOS': 'VERB',
    'Aspect': 'Imp',
    'Valency': 'TR',
    'Tense': 'Past',
    'Number': 'Sing',
    'VerbForm': 'Fin',
    'Gender': 'Fem'},
   {'fPOS': 'NOUN',
    'Gender': 'Fem',
    'Animacy': 'Inan',
    'C

In [768]:
res = ppl('- И за это тебя посадят, -- как бы сообразив, прервала его тётя Катя.')

# One line per predicted argument: token index, lemma in parentheses, role tag.
for event in res['srl'][0]:
    for arg in event.args:
        print(arg.begin, f"({res['lemma'][0][arg.begin]})", arg.tag)

3 (это) содержание мысли
4 (ты) субъект психологического состояния
13 (он) пациенс
15 (катя) говорящий


In [15]:
res = ppl('- И за это тебя посадят, -- как бы сообразив, прервала его тётя Катя.')

# One line per predicted argument: token index, lemma in parentheses, role tag.
for event in res['srl'][0]:
    for arg in event.args:
        print(arg.begin, f"({res['lemma'][0][arg.begin]})", arg.tag)

3 (это) цель
4 (ты) пациенс
13 (он) пациенс
15 (катя) говорящий


n_samples=400 (usually)

# All the action goes here

Evaluate the pipeline on all of the test data that was used to evaluate the performance of the SRL models.

**1. Syntaxnet**
- The current SyntaxNet parser cannot process more than ~720 texts in a row: it does not throw an error, it just hangs;
- a chunk of 720 texts takes ~15 min to process;
- the ``random_predictions`` function no longer randomizes the input data if n_samples==len(data);
- the ``get_example`` function now returns '\_' instead of an empty string because of a bug in the pipeline: it hangs on empty texts **(how is it even possible for this function to return nothing? See example #1613 in the test set.)**

In [454]:
from isanlp.processor_remote import ProcessorRemote
from isanlp.processor_syntaxnet_remote import ProcessorSyntaxNetRemote
from isanlp import PipelineCommon
from isanlp.ru.converter_mystem_to_ud import ConverterMystemToUd


class PipelineDefault:
    """Notebook-local SRL pipeline: remote morphology, SyntaxNet syntax, Mystem-to-UD conversion, remote SRL.

    Defining this class rebinds the name ``PipelineDefault`` that the notebook
    imported earlier from ``isanlp_srl_framebank.pipeline_default``.
    """

    def __init__(self, address_morph, address_syntax, address_srl):
        """Assemble the four-stage pipeline.

        Each ``address_*`` argument is indexed as ``[0]`` (host) and ``[1]`` (port).
        Every stage below is a tuple that is handed to ``PipelineCommon``;
        presumably (processor, input annotation names, output-name mapping) —
        confirm against the isanlp documentation.
        """
        morph_stage = (
            ProcessorRemote(address_morph[0], address_morph[1], 'default'),
            ['text'],
            {'tokens' : 'tokens',
             'sentences' : 'sentences',
             'postag' : 'mystem_postag',
             'lemma' : 'lemma'},
        )
        syntax_stage = (
            ProcessorSyntaxNetRemote(address_syntax[0], address_syntax[1]),
            ['tokens', 'sentences'],
            {'syntax_dep_tree' : 'syntax_dep_tree'},
        )
        converter_stage = (
            ConverterMystemToUd(),
            ['mystem_postag'],
            {'morph' : 'morph',
             'postag' : 'postag'},
        )
        srl_stage = (
            ProcessorRemote(address_srl[0], address_srl[1], 'default'),
            ['tokens', 'postag', 'morph', 'lemma', 'syntax_dep_tree'],
            {'srl' : 'srl'},
        )

        self._ppl = PipelineCommon([morph_stage, syntax_stage, converter_stage, srl_stage])
        self._name = 'default'

    def __call__(self, *args, **kwargs):
        """Forward the call unchanged to the wrapped ``PipelineCommon``."""
        return self._ppl(*args, **kwargs)

    def get_processors(self):
        """Return whatever the wrapped pipeline's ``get_processors()`` returns."""
        return self._ppl.get_processors()

In [457]:
# Same hosts as in the earlier setup cell; `host2` is not referenced here.
host = 'vmh2.isa.ru'
host2 = 'echistova.isa.ru'
host3 = 'tsa05.isa.ru'

# Ports of the syntax parser instances; the chunked evaluation loop rotates through them.
syntax_ports = [4340, 4341, 4342, 4343, 4344]

ppls = PipelineDefault(address_morph=(host, 4333),
                       address_syntax=(host, syntax_ports[0]),
                       address_srl=(host3, 4335))

In [None]:
# Process the corpus in chunks and switch to the next syntax parser port for
# every chunk: per the notes above, one parser hangs after ~720 texts in a row.
# The previous fixed count of 9 chunks of len(corpus)//10 examples left the
# tail of the corpus unevaluated; the chunk count is now derived from the
# corpus size so that every example is covered.
chunk_size = max(1, len(corpus) // 10)
number_of_chunks = -(-len(corpus) // chunk_size)  # ceiling division
true_roles, pred_roles, texts = [], [], []

for counter in range(number_of_chunks):
    start = counter * chunk_size
    chunk = corpus[start:start + chunk_size]  # the last chunk may be shorter
    port_idx = counter % len(syntax_ports)
    ppls = PipelineDefault(address_morph=(host, 4333),
                           address_syntax=(host, syntax_ports[port_idx]),
                           address_srl=(host3, 4335))
    print(f"Switch syntax parser to #{port_idx}, compute examples ##{start}-{start+len(chunk)-1}/{len(corpus)}")
    # n_samples == len(chunk) makes random_predictions keep the chunk order.
    tmp_true, tmp_pred, tmp_texts = random_predictions(chunk, ppls, n_samples=len(chunk))
    true_roles += tmp_true
    pred_roles += tmp_pred
    texts += tmp_texts

Switch syntax parser to #0, compute examples ##0-651/6523


HBox(children=(IntProgress(value=0, description='Analyzing texts', max=652), HTML(value='')))

Switch syntax parser to #1, compute examples ##652-1303/6523


HBox(children=(IntProgress(value=0, description='Analyzing texts', max=652), HTML(value='')))

Switch syntax parser to #2, compute examples ##1304-1955/6523


HBox(children=(IntProgress(value=0, description='Analyzing texts', max=652), HTML(value='')))

Switch syntax parser to #3, compute examples ##1956-2607/6523


HBox(children=(IntProgress(value=0, description='Analyzing texts', max=652), HTML(value='')))

Switch syntax parser to #4, compute examples ##2608-3259/6523


HBox(children=(IntProgress(value=0, description='Analyzing texts', max=652), HTML(value='')))

Switch syntax parser to #0, compute examples ##3260-3911/6523


HBox(children=(IntProgress(value=0, description='Analyzing texts', max=652), HTML(value='')))

In [None]:
log_path = 'log_idx.txt'
# The context manager closes (and thereby flushes) the report file as soon as
# the metrics are computed; the handle was previously never closed.
with open(log_path, 'w', encoding='utf-8') as log_file:
    results = compute_metrics(y_pred=pred_roles, y_true=true_roles,
                              report_to=log_file,
                              idxmatching=True)

# Show the scalar metrics only; the error reports stay available in `results`.
copyres = {name: value for name, value in results.items() if name != 'errors'}
print_(copyres)

In [None]:
# NOTE(review): same path as the idxmatching=True run, so this overwrites that
# log — confirm this is intended.
log_path = 'log_idx.txt'
# The context manager closes (and thereby flushes) the report file as soon as
# the metrics are computed; the handle was previously never closed.
with open(log_path, 'w', encoding='utf-8') as log_file:
    results = compute_metrics(y_pred=pred_roles, y_true=true_roles,
                              report_to=log_file,
                              idxmatching=False)

# Show the scalar metrics only; the error reports stay available in `results`.
copyres = {name: value for name, value in results.items() if name != 'errors'}
print_(copyres)

#### 2. UDPipe 

In [None]:
! pip install git+https://github.com/IINemo/isanlp.git@dev

In [445]:
from isanlp import PipelineCommon
from isanlp.processor_remote import ProcessorRemote
from isanlp.ru.converter_mystem_to_ud import ConverterMystemToUd

# Rebinds `ppl` to a pipeline whose syntax stage is a plain ProcessorRemote on
# port 5336 (per the section heading, presumably the UDPipe parser — confirm).
# Stage order: morphology -> Mystem-to-UD conversion -> syntax -> SRL.
ppl = PipelineCommon([(ProcessorRemote(host, 4333, 'default'),
                            ['text'],
                            {'sentences' : 'sentences', 
                             'tokens' : 'tokens',
                             'postag' : 'mystem_postags',
                             'lemma' : 'lemma'}),
                      (ConverterMystemToUd(), 
                                      ['mystem_postags'],
                                      {'morph' : 'morph',
                                       'postag' : 'postag'}),
                      (ProcessorRemote(host, 5336, '0'), 
                            ['tokens', 'sentences'], 
                            {'syntax_dep_tree' : 'syntax_dep_tree'}),
                      (ProcessorRemote(host3, 4335, 'default'),
                            ['tokens', 'postag', 'morph', 'lemma', 'syntax_dep_tree'],
                            {'srl' : 'srl'})])

400 samples: ~8 min on udpipe with w2v, ~20 min with elmo

In [446]:
true_roles, pred_roles, texts = random_predictions(corpus, ppl, n_samples=len(corpus))

HBox(children=(IntProgress(value=0, description='Analyzing texts', max=6523), HTML(value='')))

In [None]:
log_path = 'log_idx.txt'
# The context manager closes (and thereby flushes) the report file as soon as
# the metrics are computed; the handle was previously never closed.
with open(log_path, 'w', encoding='utf-8') as log_file:
    results = compute_metrics(y_pred=pred_roles, y_true=true_roles,
                              report_to=log_file,
                              idxmatching=True)

# Show the scalar metrics only; the error reports stay available in `results`.
copyres = {name: value for name, value in results.items() if name != 'errors'}
print_(copyres)

In [None]:
# NOTE(review): same path as the idxmatching=True run, so this overwrites that
# log — confirm this is intended.
log_path = 'log_idx.txt'
# The context manager closes (and thereby flushes) the report file as soon as
# the metrics are computed; the handle was previously never closed.
with open(log_path, 'w', encoding='utf-8') as log_file:
    results = compute_metrics(y_pred=pred_roles, y_true=true_roles,
                              report_to=log_file,
                              idxmatching=False)

# Show the scalar metrics only; the error reports stay available in `results`.
copyres = {name: value for name, value in results.items() if name != 'errors'}
print_(copyres)

## Errors in the argument extraction

In [None]:
import pandas as pd

ff = pd.DataFrame(results['errors'])

In [None]:
def neatify(text):
    """Tidy the spacing around punctuation in a space-joined token string.

    Applies, in this order: ' ,' -> ',', ' .' -> '.', ' ) ' -> ') ',
    ' ( ' -> ' (' and ' : ' -> ': '.
    """
    spacing_fixes = (
        (' ,', ','),
        (' .', '.'),
        (' ) ', ') '),
        (' ( ', ' ('),
        (' : ', ': '),
    )
    for spaced, tight in spacing_fixes:
        text = text.replace(spaced, tight)
    return text
                
ff['text'] = ff.example_idx.map(lambda idx: neatify(texts[idx]))

In [None]:
# Show full cell contents when displaying the error table.
# NOTE(review): -1 is the legacy "no truncation" value; pandas >= 1.0 deprecates
# it in favour of None — confirm against the installed pandas version.
pd.set_option('display.max_colwidth', -1)

In [None]:
ff

In [None]:
ff.shape

In [None]:
ff.to_pickle('errors_elmo_x_w2v.pkl')

In [None]:
# NOTE(review): rebinds `texts` (until here the evaluated texts) to an empty
# list, so `texts[-1]` in the next cell raises IndexError unless texts are
# added by hand first.
texts = []

In [None]:
annot = ppl(texts[-1])

In [None]:
annot.keys()

In [None]:
vars(annot["syntax_dep_tree"][0][0])

In [None]:
for i, ann in enumerate(annot['morph'][0]):
    print(f'{i}\t{annot["lemma"][0][i]}\t{ann}')

In [None]:
for i, ann in enumerate(annot['morph'][0]):
    print(f'{i}\t{annot["tokens"][i].text}\t{ann}')

In [None]:
for i, ann in enumerate(annot['morph'][0]):
    print(f'{i}\t{annot["tokens"][i].text}\t{ann.get("fPOS")}')

In [None]:
for i, ann in enumerate(annot['syntax_dep_tree'][0]):
    print(i, vars(ann))

In [None]:
for events in annot['srl']:
    for event in events:
        print(event.pred, [vars(arg) for arg in event.args])