# Initialization

In [1]:
%load_ext autoreload
%autoreload 2

import os
#os.environ['CUDA_VISIBLE_DEVICES'] = ''

import sys
sys.path.append('../')
sys.path.append('../../')
sys.path.append('../src/')
sys.path.append('../isanlp_srl_framebank/')
sys.path.append('../../libs/')
sys.path.append('../../libs/pylingtools/')

In [2]:
from tqdm import tqdm_notebook as tqdm

In [3]:
from pprint import pprint as pretty

In [4]:
import json
import pickle
import pandas as pd

# Load data

In [5]:
# Load the cleared corpus. The file contains Cyrillic text (word attributes
# such as 'rank' are compared against Russian strings further down), so the
# encoding is pinned to UTF-8 instead of relying on the platform default.
cleared_corpus_path = '../../data/cleared_corpus.json'
with open(cleared_corpus_path, 'r', encoding='utf-8') as f:
    examples = json.load(f)

In [6]:
# Precomputed linguistic annotations, stored as a pickled iterable of
# (key, annotation) pairs.
# NOTE(review): pickle executes arbitrary code on load; only open trusted files.
ling_data_path = '../../data/results_final_fixed.pckl'
with open(ling_data_path, 'rb') as f:
    ling_data = pickle.load(f)

# Index the pairs by key for direct lookup during feature extraction.
ling_data_cache = dict(ling_data)

# Extract features

In [7]:
# Arguments skipped during feature extraction, filled in by process_arg_pred:
# ex_id -> list of (ex_id, lens, message) tuples.
error_examples = {}

In [8]:
from isanlp.annotation_repr import CSentence
from convert_corpus_to_brat import make_text


def find_address_by_offset(offset, ling_ann):
    """Translate an offset into a (sentence index, word index in sentence) pair.

    Args:
        offset: position compared against the ``begin``/``end`` of each token.
        ling_ann: mapping with
            'tokens'    -- items exposing ``begin``/``end`` (half-open span
                           matched against ``offset``);
            'sentences' -- items exposing ``begin``/``end`` (half-open span
                           matched against the token index).

    Returns:
        Tuple ``(sentence_index, token_index - sentence.begin)``.

    Note:
        When no token (or sentence) span contains the looked-up value the loop
        simply runs to completion, so the last token (or sentence) is used.
    """
    for token_idx, token in enumerate(ling_ann['tokens']):
        if token.begin <= offset < token.end:
            break

    for sentence_idx, sentence in enumerate(ling_ann['sentences']):
        if sentence.begin <= token_idx < sentence.end:
            break

    return sentence_idx, token_idx - sentence.begin


def process_arg_pred(feature_extractor, ling_cache, ex_id, pred, args, example):
    """Extract one feature set for every argument of a single predicate.

    Args:
        feature_extractor: object whose ``extract_features`` is called with the
            predicate word index, the argument word index and the 'postag',
            'morph', 'lemma' and 'syntax_dep_tree' layers of their sentence.
        ling_cache: mapping ex_id -> linguistic annotation (a mapping with the
            keys 'tokens', 'sentences', 'postag', 'morph', 'lemma' and
            'syntax_dep_tree').
        ex_id: identifier of the example, used as key into ``ling_cache`` and
            ``error_examples``.
        pred: ``(sentence index, word index)`` of the predicate inside
            ``example``, or ``None`` when the caller found no predicate.
        args: iterable of ``(sentence index, word index)`` argument addresses.
        example: list of sentences, each a list of word dicts; argument words
            carry their role under the 'rolepred1' key.

    Returns:
        List of ``(features, role, ex_id, arg)`` tuples, one per argument that
        was processed successfully.

    Side effects:
        Skipped arguments are recorded in the module-level ``error_examples``
        as ``(ex_id, lens, message)`` tuples. Arguments that land in a
        different sentence than the predicate increment the module-level
        ``num_of_errors`` counter and print a short notice.
    """
    global num_of_errors

    def record_error(details, reason):
        # Single place for the bookkeeping of skipped arguments.
        error_examples.setdefault(ex_id, []).append((ex_id, details, reason))

    feature_sets = []

    _, offset_index = make_text(example, 0)
    ling_ann = ling_cache[ex_id]

    if pred is None:
        # No predicate was found in the example: nothing can be extracted.
        record_error({}, "predicate not found")
        return feature_sets

    pred_offset = offset_index[(pred[0], pred[1])]
    pred_ling_sent, pred_ling_word = find_address_by_offset(pred_offset, ling_ann)

    for arg in args:
        arg_offset = offset_index[(arg[0], arg[1])]
        arg_ling_sent, arg_ling_word = find_address_by_offset(arg_offset, ling_ann)

        # A fresh dict per argument: it is stored inside the error records.
        lens = {
            'len_postags' : len(ling_ann['postag']),
            'len_morph' : len(ling_ann['morph']),
            'len_lemma' : len(ling_ann['lemma']),
            'len_syntax' : len(ling_ann['syntax_dep_tree'])
        }

        # The sentence index must be valid for every layer (index == length is
        # already out of range) and all layers must cover the same sentences.
        if arg_ling_sent >= min(lens.values()) or len(set(lens.values())) != 1:
            lens['len_arg_ling_sent'] = arg_ling_sent
            record_error(lens, "length mismatch")
            continue

        role = example[arg[0]][arg[1]]['rolepred1']

        if arg_ling_sent != pred_ling_sent:
            num_of_errors += 1
            # We miss some examples due to mistakes in FrameBank or discrepancies
            # in the automatic annotation of sentences.
            print('Error #{}'.format(num_of_errors))
            continue

        try:
            features = feature_extractor.extract_features(
                pred_ling_word,
                arg_ling_word,
                ling_ann['postag'][arg_ling_sent],
                ling_ann['morph'][arg_ling_sent],
                ling_ann['lemma'][arg_ling_sent],
                ling_ann['syntax_dep_tree'][arg_ling_sent])
        except Exception as e:
            # Best effort: keep going, but remember why this argument was skipped.
            lens['len_arg_ling_sent'] = arg_ling_sent
            record_error(lens, str(e))
            continue

        feature_sets.append((features, role, ex_id, arg))

    return feature_sets


def process_example(feature_extractor, ling_cache, ex_id, sentences):
    """Locate the predicate and its arguments in an example and extract features.

    A word is the predicate when its 'rank' equals 'Предикат'; any other word
    carrying a 'rolepred1' key is an argument. If several words qualify as the
    predicate, the last one wins. The collected addresses are handed over to
    ``process_arg_pred`` and its result is returned unchanged.
    """
    predicate_address = None
    argument_addresses = []

    for sentence_idx, sentence in enumerate(sentences):
        for word_idx, token in enumerate(sentence):
            address = (sentence_idx, word_idx)
            is_predicate = 'rank' in token and token['rank'] == 'Предикат'
            if is_predicate:
                predicate_address = address
                continue
            if 'rolepred1' in token:
                argument_addresses.append(address)

    return process_arg_pred(feature_extractor, ling_cache, ex_id,
                            predicate_address, argument_addresses, sentences)


# Running count of arguments found in a different sentence than their
# predicate; incremented by process_arg_pred.
num_of_errors = 0
def prepare_train_data(examples, ling_data_cache, feature_extractor):
    """Run feature extraction over every example and gather the results.

    Args:
        examples: sized iterable of ``(ex_id, example)`` pairs.
        ling_data_cache: mapping passed through to ``process_example``.
        feature_extractor: feature model passed through to ``process_example``.

    Returns:
        Flat list with the feature sets of all examples.

    Progress (in percent) is printed every 100 examples, followed by the total
    number of collected feature sets.
    """
    collected = []
    for position, (example_id, sentences) in enumerate(examples):
        if position % 100 == 0:
            percent_done = (position / len(examples)) * 100.
            print('{0:.2f}%'.format(percent_done))

        collected.extend(
            process_example(feature_extractor, ling_data_cache, example_id, sentences))

    print('Number of training examples:', len(collected))
    return collected

In [10]:
# Root directory under which the per-model output folders are created.
main_model_path_root = '../../data/models_new/'

In [11]:
# Feature model selection: exactly one of the two variants below must be active.
# The chosen variant also determines the output sub-directory (main_model_path).

# Variant 1 (active): default feature model, saved under 'known_preds'.
from isanlp_srl_framebank.processor_srl_framebank import FeatureModelDefault
feature_model = FeatureModelDefault()
main_model_path = os.path.join(main_model_path_root, 'known_preds')   

# Variant 2: feature model for unknown predicates, saved under 'unknown_preds'.
# from isanlp_srl_framebank.processor_srl_framebank import FeatureModelUnknownPredicates
# feature_model = FeatureModelUnknownPredicates()
# main_model_path = os.path.join(main_model_path_root, 'unknown_preds')

In [15]:
# Sanity check: number of examples loaded from the corpus file.
len(examples)

32612

In [13]:
# Ids of the examples with at least one skipped argument. error_examples is a
# dict keyed by ex_id, so its keys are the ids themselves (indexing a key with
# [0] would be wrong). Only meaningful after prepare_train_data has been run.
error_ids = set(error_examples)

In [19]:
# Number of distinct examples that had at least one skipped argument.
len(error_examples)

13492

In [12]:
# Extract features for the whole corpus; prints progress and 'Error #N' notices
# and fills error_examples / num_of_errors as a side effect.
feature_sets = prepare_train_data(examples, ling_data_cache, feature_model)

0.00%
Error #1
0.31%
0.61%
0.92%
1.23%
1.53%
1.84%
2.15%
2.45%
2.76%
3.07%
3.37%
3.68%
3.99%
4.29%
4.60%
4.91%
5.21%
5.52%
5.83%
6.13%
Error #2
Error #3
Error #4
6.44%
6.75%
7.05%
7.36%
7.67%
7.97%
8.28%
8.59%
8.89%
9.20%
9.51%
Error #5
9.81%
Error #6
10.12%
10.43%
Error #7
Error #8
10.73%
Error #9
11.04%
11.35%
11.65%
11.96%
12.27%
12.57%
12.88%
13.19%
13.49%
13.80%
14.11%
Error #10
Error #11
Error #12
Error #13
Error #14
Error #15
Error #16
Error #17
14.41%
Error #18
14.72%
15.03%
15.33%
15.64%
15.95%
Error #19
16.25%
16.56%
16.86%
17.17%
17.48%
17.78%
18.09%
18.40%
18.70%
19.01%
19.32%
19.62%
19.93%
20.24%
20.54%
20.85%
Error #20
21.16%
21.46%
21.77%
22.08%
22.38%
22.69%
23.00%
23.30%
23.61%
23.92%
24.22%
24.53%
24.84%
25.14%
25.45%
25.76%
26.06%
26.37%
26.68%
26.98%
27.29%
27.60%
27.90%
Error #21
Error #22
Error #23
Error #24
Error #25
Error #26
28.21%
28.52%
28.82%
29.13%
29.44%
29.74%
30.05%
Error #27
Error #28
30.36%
Error #29
Error #30
30.66%
30.97%
31.28%
31.58%
31.89%
32.20%


In [20]:
# Vectorization config: one {'name', 'type'} entry per feature column.
# It starts with the target column 'role' and is extended while the feature
# sets are flattened below.
feature_config = {'features' : [{'name' : 'role', 
                                 'type' : ''}]}
# Feature names already registered in feature_config (avoids duplicates).
unique_features = set()
# One flat dict per training example; turned into a DataFrame afterwards.
data_for_pandas = []

def add_features_to_config(feature_config, unique_features, subelem, tp):
    """Register every not-yet-seen feature name of ``subelem`` in the config.

    Args:
        feature_config: dict with a 'features' list that receives new
            ``{'name': ..., 'type': ...}`` entries (mutated in place).
        unique_features: set of names already registered (mutated in place).
        subelem: mapping whose keys are feature names.
        tp: value stored under 'type' for every newly registered name.
    """
    for feature_name in subelem.keys():
        if feature_name not in unique_features:
            entry = {'name' : feature_name, 'type' : tp}
            feature_config['features'].append(entry)
            unique_features.add(feature_name)

        
# 'type' tag given to a feature according to the position of its dict inside
# each element of the extracted feature structure.
FEATURE_TYPE_BY_POSITION = ('', 'embedding', 'cont')

for example in feature_sets:
    # Each entry of feature_sets is a (features, role, ex_id, arg) tuple.
    data_for_pandas_ex = {
        'role': example[1],
        'ex_id': example[2],
        'arg_address': example[3],
    }

    for elem in example[0]:
        for type_num, subelem in enumerate(elem):
            if subelem is None:
                continue

            data_for_pandas_ex.update(subelem)

            # Dicts at positions past the known ones still end up in the row,
            # but their names are not registered in the config.
            if type_num < len(FEATURE_TYPE_BY_POSITION):
                add_features_to_config(feature_config, unique_features, subelem,
                                       FEATURE_TYPE_BY_POSITION[type_num])

    data_for_pandas.append(data_for_pandas_ex)

In [21]:
# Inspect the feature names and types collected from the extracted feature sets.
feature_config

{'features': [{'name': 'role', 'type': ''},
  {'name': 'arg_lemma', 'type': 'embedding'},
  {'name': 'pred_lemma', 'type': 'embedding'},
  {'name': 'dist', 'type': ''},
  {'name': 'arg_case', 'type': ''},
  {'name': 'pred_pos', 'type': ''},
  {'name': 'arg_pos', 'type': ''},
  {'name': 'syn_link_name', 'type': ''},
  {'name': 'prepos', 'type': ''},
  {'name': 'Aspect_arg', 'type': ''},
  {'name': 'Number_arg', 'type': ''},
  {'name': 'Tense_arg', 'type': ''},
  {'name': 'Valency_arg', 'type': ''},
  {'name': 'VerbForm_arg', 'type': ''},
  {'name': 'Animacy_arg', 'type': ''},
  {'name': 'Gender_arg', 'type': ''},
  {'name': 'rel_pos', 'type': 'cont'}]}

In [22]:
pd_data = pd.DataFrame(data_for_pandas)
# Free the intermediate list of dicts. Re-running this cell therefore requires
# re-running the cell that builds data_for_pandas first.
del data_for_pandas

# Shuffle the dataset. The seed is fixed so that the row order (and the
# features.csv written at the end of the notebook) is reproducible.
pd_data = pd_data.sample(frac=1, random_state=42)
pd_data.head(10)

Unnamed: 0,Animacy_arg,Aspect_arg,Gender_arg,Number_arg,Tense_arg,Valency_arg,VerbForm_arg,arg_address,arg_case,arg_lemma,arg_pos,dist,ex_id,pred_lemma,pred_pos,prepos,rel_pos,role,syn_link_name
9733,Inan,,Masc,Sing,,,,"(0, 6)",Acc,прогон_NOUN,NOUN,2.0,70858,проходить_VERB,VERB,,-1.0,субъект перемещения,obj
5774,,,,,,,,"(10, 4)",,что_CONJ,CONJ,7.0,104109,претендовать_VERB,VERB,на,1.0,цель,mark
14606,Inan,,Fem,Sing,,,,"(0, 10)",Ins,рука_NOUN,NOUN,1.0,104651,рубить_VERB,VERB,,-1.0,пациенс перемещения,iobj
23364,Inan,,Masc,Sing,,,,"(2, 6)",Acc,штрафной_ADJ,ADJ,1.0,135514,бить_VERB,VERB,,1.0,содержание действия,acl
11142,Anim,,Masc,Sing,,,,"(0, 0)",Nom,кямал_NOUN,NOUN,1.0,33600,опускать_VERB,VERB,,1.0,агенс,nsubj
34051,Inan,,Masc,Sing,,,,"(0, 41)",Abl,город_NOUN,NOUN,2.0,35582,сохраняться_VERB,VERB,в,1.0,место,obl
21085,Inan,,Fem,Sing,,,,"(7, 4)",Nom,голова_NOUN,NOUN,8.0,37167,болтаться_VERB,VERB,,1.0,субъект перемещения,conj
9996,Inan,,Fem,Plur,,,,"(0, 22)",Gen,рота_NOUN,NOUN,2.0,100595,вытирать_VERB,VERB,,-1.0,пациенс,conj
815,,Perf,Masc,Sing,Past,INTR,Fin,"(1, 12)",,примиряться_VERB,VERB,5.0,61544,мочь_VERB,VERB,,-1.0,ситуация в фокусе,root
5054,Anim,,Masc,Sing,,,,"(5, 0)",Nom,отец_NOUN,NOUN,5.0,105930,ухаживать_VERB,VERB,,1.0,субъект социального отношения,nsubj


# Preprocess features and labels

In [23]:
# Number of examples per role.
y_stat = pd_data['role'].value_counts()
# Roles with fewer than 180 examples are excluded from the dataset.
drop_ys = y_stat[y_stat < 180].index
# Drop every row whose role is in the excluded set; pd_data itself is untouched.
clear_data = pd_data.drop(pd_data[pd_data['role'].isin(drop_ys)].index, axis=0)

In [24]:
# Role label merges applied to the 'role' column: each key (a compound label)
# is replaced by its value (the simpler label it is folded into).
repl_roles = {
    'агенс - субъект восприятия' : 'субъект восприятия',
    'агенс - субъект ментального состояния' : 'субъект ментального состояния',
    'результат / цель' : 'результат',
    'место - пациенс' : 'место',
    'говорящий - субъект психологического состояния' : 'субъект психологического состояния'
}


def normalize_single_region(data, rep, val):
    """Replace every occurrence of ``rep`` with ``val`` in ``data['role']``.

    The frame is modified in place and nothing is returned.

    Args:
        data: DataFrame with a string column 'role'.
        rep: substring to look for; matched literally, not as a regular
            expression.
        val: replacement text.
    """
    # regex=False is spelled out because the pandas default for this flag
    # differs between versions; the labels are plain text, not patterns.
    data.loc[:, 'role'] = data.loc[:, 'role'].str.replace(rep, val, regex=False)


# Fold the compound role labels into their simpler counterparts (in place).
for rep, val in repl_roles.items():
    normalize_single_region(clear_data, rep, val)

# Class distribution after merging; its length is the number of distinct roles.
role_counts = clear_data['role'].value_counts()
number_of_roles = len(role_counts)
print('Number of roles: ', number_of_roles)
role_counts

Number of roles:  34


агенс                                 3775
пациенс                               3309
тема                                  2436
субъект перемещения                   1886
субъект психологического состояния    1723
причина                               1572
место                                 1472
говорящий                             1159
содержание действия                   1136
конечная точка                        1130
содержание мысли                      1105
содержание высказывания                973
пациенс перемещения                    867
стимул                                 790
результат                              775
субъект ментального состояния          601
адресат                                531
контрагент                             499
эффектор                               492
субъект восприятия                     431
предмет высказывания                   382
субъект социального отношения          376
начальная точка                        366
способ     

# Save everything

In [25]:
# Make sure the output directory exists; on a fresh checkout it does not, and
# the first open(..., 'wb') below would fail otherwise.
os.makedirs(main_model_path, exist_ok=True)

# Pickled feature model.
with open(os.path.join(main_model_path, 'feature_model.pckl'), 'wb') as f:
    pickle.dump(feature_model, f)

# Feature names and types for vectorization.
feature_config_path = os.path.join(main_model_path, 'feature_vectorization_config.json')
with open(feature_config_path, 'w') as f:
    json.dump(feature_config, f)

# Filtered and normalized feature table.
clear_data.to_csv(os.path.join(main_model_path, 'features.csv'))