# Initialization

In [1]:
# Enable IPython's autoreload extension in mode 2 so edited modules are re-imported before cells run.
%load_ext autoreload
%autoreload 2

import os
# Empty CUDA_VISIBLE_DEVICES -- presumably to keep CUDA-aware libraries on the CPU; confirm.
os.environ['CUDA_VISIBLE_DEVICES'] = ''

import sys
# Extend the module search path with parent directories and the isanlp sources
# so that the project-local imports further down can be resolved.
sys.path.append('../')
sys.path.append('../../')
sys.path.append('../../isanlp/src/')

In [16]:
import json
import pickle
import pandas as pd

# Load data

In [3]:
# Load the cleaned corpus. The encoding is passed explicitly because the corpus
# carries Cyrillic text and the platform default encoding is not guaranteed to be UTF-8.
cleared_corpus_path = '../../data/cleared_corpus.json'
with open(cleared_corpus_path, 'r', encoding='utf-8') as f:
    examples = json.load(f)

In [4]:
# Precomputed linguistic annotations, stored as an iterable of (key, annotation) pairs.
# NOTE(review): pickle.load can execute arbitrary code -- only load this file from a trusted source.
ling_data_path = '../../data/results_final_fixed.pckl'
with open(ling_data_path, 'rb') as f:
    ling_data = pickle.load(f)

# Index the loaded pairs by key for direct lookup.
ling_data_cache = dict(ling_data)

# Extract features

In [5]:
from isanlp.annotation_repr import CSentence
from convert_corpus_to_brat import make_text


def find_address_by_offset(offset, ling_ann):
    """Translate an offset into a (sentence index, word index) pair.

    Args:
        offset: Position to look up; compared against the ``begin``/``end``
            attributes of every token.
        ling_ann: Mapping with a ``'tokens'`` sequence (items expose ``begin``
            and ``end``) and a ``'sentences'`` sequence whose ``begin``/``end``
            are compared against token indices.

    Returns:
        Tuple ``(sentence_index, token_index_relative_to_sentence_begin)``.

    Note:
        A miss is not signalled: when no token (or sentence) matches, the last
        one visited by the corresponding loop is used.
    """
    # Locate the token whose half-open span [begin, end) covers the offset.
    for token_idx, token in enumerate(ling_ann['tokens']):
        if token.begin <= offset < token.end:
            break

    # Locate the sentence whose half-open token range covers that token.
    for sentence_idx, sentence in enumerate(ling_ann['sentences']):
        if sentence.begin <= token_idx < sentence.end:
            break

    return sentence_idx, token_idx - sentence.begin


def process_arg_pred(feature_extractor, ling_cache, ex_id, pred, args, example):
    """Extract one feature set per argument of a single predicate.

    Args:
        feature_extractor: Object providing ``extract_features(pred_word,
            arg_word, postag, morph, lemma, syntax_dep_tree)``.
        ling_cache: Mapping from example id to its linguistic annotation. The
            annotation is indexed here by ``'postag'``, ``'morph'``,
            ``'lemma'`` and ``'syntax_dep_tree'``, each then by sentence number.
        ex_id: Identifier of the example; key into ``ling_cache``.
        pred: ``(sentence index, word index)`` of the predicate in ``example``.
        args: Iterable of ``(sentence index, word index)`` argument addresses.
        example: Corpus example indexed as ``example[sentence][word]``; every
            argument word must carry a ``'rolepred1'`` entry.

    Returns:
        List of ``(features, role, ex_id, arg)`` tuples, one for every argument
        located in the same annotated sentence as the predicate.

    Side effects:
        For every skipped argument the module-level ``num_of_errors`` counter
        is incremented and a message is printed.
    """
    global num_of_errors

    feature_sets = list()

    # Only the offset index is needed; the reconstructed text is discarded.
    _, offset_index = make_text(example, 0)
    ling_ann = ling_cache[ex_id]

    pred_offset = offset_index[(pred[0], pred[1])]
    pred_ling_sent, pred_ling_word = find_address_by_offset(pred_offset, ling_ann)

    for arg in args:
        arg_offset = offset_index[(arg[0], arg[1])]
        arg_ling_sent, arg_ling_word = find_address_by_offset(arg_offset, ling_ann)

        fb_arg_word = example[arg[0]][arg[1]]
        role = fb_arg_word['rolepred1']

        if arg_ling_sent != pred_ling_sent:
            num_of_errors += 1
            # We miss some examples due to mistakes in framebank or discrepancy in
            # automatic annotation of sentences.
            print('Error #{}'.format(num_of_errors))
            continue

        features = feature_extractor.extract_features(pred_ling_word,
                                                      arg_ling_word,
                                                      ling_ann['postag'][arg_ling_sent],
                                                      ling_ann['morph'][arg_ling_sent],
                                                      ling_ann['lemma'][arg_ling_sent],
                                                      ling_ann['syntax_dep_tree'][arg_ling_sent])

        feature_sets.append((features, role, ex_id, arg))

    return feature_sets


def process_example(feature_extractor, ling_cache, ex_id, sentences):
    """Find the predicate and its arguments in one example and extract features.

    A word whose ``'rank'`` equals ``'Предикат'`` is taken as the predicate (the
    last such word wins); any other word carrying a ``'rolepred1'`` key is
    collected as an argument. The addresses are handed to ``process_arg_pred``.

    Args:
        feature_extractor: Forwarded unchanged to ``process_arg_pred``.
        ling_cache: Forwarded unchanged to ``process_arg_pred``.
        ex_id: Identifier of the example.
        sentences: Example as a sequence of sentences, each a sequence of words.

    Returns:
        Whatever ``process_arg_pred`` returns for this example.
    """
    predicate_address = None
    argument_addresses = []

    for s_idx, sentence in enumerate(sentences):
        for w_idx, token in enumerate(sentence):
            address = (s_idx, w_idx)
            is_predicate = 'rank' in token and token['rank'] == 'Предикат'
            if is_predicate:
                predicate_address = address
            elif 'rolepred1' in token:
                argument_addresses.append(address)

    return process_arg_pred(feature_extractor, ling_cache, ex_id,
                            predicate_address, argument_addresses, sentences)


# Running count of arguments skipped by process_arg_pred.
num_of_errors = 0


def prepare_train_data(examples, ling_data_cache, feature_extractor):
    """Run feature extraction over every example and concatenate the results.

    Args:
        examples: Sized sequence of ``(example id, example)`` pairs.
        ling_data_cache: Forwarded to ``process_example`` as the annotation cache.
        feature_extractor: Forwarded to ``process_example``.

    Returns:
        Flat list with the feature sets of all examples, in input order.

    Progress (as a percentage) is printed on every 100th example and the total
    number of collected feature sets is printed at the end.
    """
    collected = []
    for position, (example_id, example_sentences) in enumerate(examples):
        at_progress_mark = position % 100 == 0
        if at_progress_mark:
            percent_done = (position / len(examples)) * 100.
            print('{0:.2f}%'.format(percent_done))

        collected.extend(
            process_example(feature_extractor, ling_data_cache, example_id, example_sentences)
        )

    print('Number of training examples:', len(collected))
    return collected

In [6]:
# Root directory for model artifacts; a per-feature-model sub-directory name is joined onto it below.
main_model_path_root = '../../data/models_new/'

In [7]:
# !!!: Change feature model here.
# Keep exactly one of the two variants below active: each one picks a feature
# model class together with the sub-directory of main_model_path_root it uses.

from isanlp_srl_framebank.processor_srl_framebank import FeatureModelDefault
feature_model = FeatureModelDefault()
main_model_path = os.path.join(main_model_path_root, 'known_preds')   

# Alternative variant (currently disabled):
# from isanlp_srl_framebank.processor_srl_framebank import FeatureModelUnknownPredicates
# feature_model = FeatureModelUnknownPredicates()
# main_model_path = os.path.join(main_model_path_root, 'unknown_preds')

In [8]:
# Extract feature sets for the whole corpus; progress and skipped-argument messages are printed.
feature_sets = prepare_train_data(examples, ling_data_cache, feature_model)

0.00%
Error #1
0.31%
Error #2
0.61%
0.92%
Error #3
1.23%
Error #4
1.53%
1.84%
2.15%
Error #5
2.45%
2.76%
3.07%
3.37%
3.68%
3.99%
4.29%
4.60%
4.91%
Error #6
Error #7
5.21%
5.52%
5.83%
Error #8
Error #9
Error #10
Error #11
Error #12
Error #13
Error #14
Error #15
Error #16
Error #17
Error #18
Error #19
Error #20
Error #21
Error #22
Error #23
Error #24
Error #25
Error #26
Error #27
Error #28
Error #29
Error #30
Error #31
Error #32
Error #33
Error #34
Error #35
Error #36
6.13%
Error #37
Error #38
Error #39
Error #40
Error #41
Error #42
Error #43
Error #44
Error #45
Error #46
Error #47
Error #48
Error #49
Error #50
Error #51
Error #52
Error #53
Error #54
Error #55
Error #56
Error #57
Error #58
Error #59
Error #60
Error #61
Error #62
Error #63
Error #64
Error #65
Error #66
Error #67
Error #68
Error #69
Error #70
Error #71
Error #72
Error #73
Error #74
Error #75
Error #76
Error #77
Error #78
Error #79
Error #80
Error #81
Error #82
Error #83
Error #84
Error #85
Error #86
Error #87
Error #88
Err

In [36]:
# Vectorization config; the 'role' column is pre-registered with an empty type.
feature_config = {'features' : [{'name' : 'role', 
                                 'type' : ''}]}
# Feature names that have already been added to feature_config.
unique_features = set()
# One flat dict per training example; converted to a DataFrame in a later cell.
data_for_pandas = []

def add_features_to_config(feature_config, unique_features, subelem, tp):
    """Register the feature names of ``subelem`` that have not been seen yet.

    Args:
        feature_config: Dict whose ``'features'`` list receives a
            ``{'name': ..., 'type': ...}`` entry per new feature; modified in place.
        unique_features: Set of names already registered; modified in place.
        subelem: Mapping from feature name to value; only the names are used.
        tp: Type string stored for every newly registered feature.

    Returns:
        None.
    """
    for feature_name, _value in subelem.items():
        if feature_name not in unique_features:
            feature_config['features'].append({'name': feature_name, 'type': tp})
            unique_features.add(feature_name)

        
# Vectorization type assigned to a feature group according to its position
# inside an extracted element.
feature_type_by_position = {0: '', 1: 'embedding', 2: 'cont'}

# Flatten every feature set into one record and register unseen feature names.
for example in feature_sets:
    data_for_pandas_ex = {
        'role': example[1],
        'ex_id': example[2],
        'arg_address': example[3],
    }
    for elem in example[0]:
        for type_num, subelem in enumerate(elem):
            if subelem is None:
                continue

            data_for_pandas_ex.update(subelem)
            # Groups at any other position contribute values but are not registered.
            if type_num in feature_type_by_position:
                add_features_to_config(feature_config, unique_features, subelem,
                                       feature_type_by_position[type_num])

    data_for_pandas.append(data_for_pandas_ex)

In [37]:
feature_config

{'features': [{'name': 'role', 'type': ''},
  {'name': 'arg_lemma', 'type': 'embedding'},
  {'name': 'pred_lemma', 'type': 'embedding'},
  {'name': 'dist', 'type': ''},
  {'name': 'arg_case', 'type': ''},
  {'name': 'pred_pos', 'type': ''},
  {'name': 'arg_pos', 'type': ''},
  {'name': 'syn_link_name', 'type': ''},
  {'name': 'prepos', 'type': ''},
  {'name': 'Aspect_arg', 'type': ''},
  {'name': 'Number_arg', 'type': ''},
  {'name': 'Tense_arg', 'type': ''},
  {'name': 'Valency_arg', 'type': ''},
  {'name': 'VerbForm_arg', 'type': ''},
  {'name': 'Animacy_arg', 'type': ''},
  {'name': 'Gender_arg', 'type': ''},
  {'name': 'rel_pos', 'type': 'cont'}]}

In [31]:
pd_data = pd.DataFrame(data_for_pandas)
# Drop the list of records once the frame exists; re-running this cell therefore
# requires re-running the cell that builds data_for_pandas first.
del data_for_pandas

# Shuffle the dataset; the fixed seed makes the row order reproducible across runs.
pd_data = pd_data.sample(frac=1, random_state=42)
pd_data[:10]

Unnamed: 0,Animacy_arg,Aspect_arg,Gender_arg,Number_arg,Tense_arg,Valency_arg,VerbForm_arg,arg_address,arg_case,arg_lemma,arg_pos,dist,ex_id,pred_lemma,pred_pos,prepos,rel_pos,role,syn_link_name
41441,,,,Plur,,,,"(0, 25)",Nom,они_PRON,PRON,2.0,102474,нападывать_VERB,VERB,,1.0,агенс,nsubj
41535,,,,Plur,,,,"(5, 0)",Nom,мы_PRON,PRON,4.0,102467,нападывать_VERB,VERB,,1.0,агенс,nsubj
9130,Inan,,Fem,Sing,,,,"(21, 26)",Acc,гостиная_NOUN,NOUN,2.0,105443,уводить_VERB,VERB,в,-1.0,конечная точка,nmod
22540,Inan,,Fem,Sing,,,,"(4, 7)",Ins,собственность_NOUN,NOUN,7.0,53914,владеть_VERB,VERB,,-1.0,тема,nmod
56953,,Imp,,Sing,,TR,Fin,"(1, 3)",,вывешивать_VERB,VERB,2.0,55513,говорить_VERB,VERB,,-1.0,содержание высказывания,xcomp
51032,Inan,,Fem,Sing,,,,"(1, 1)",Acc,узость_NOUN,NOUN,14.0,51445,ограничивать_VERB,VERB,,1.0,эффектор,nsubj
19909,,,,,,,,"(0, 4)",Acc,себя_PRON,PRON,1.0,104164,принуждать_VERB,VERB,,-1.0,пациенс,dobj
31091,Inan,,Masc,Plur,,,,"(4, 5)",Acc,опрос_NOUN,NOUN,2.0,88165,отражать_VERB,VERB,,1.0,место,nsubj
20778,Anim,,Masc,Plur,,,,"(1, 23)",Nom,чиновник_NOUN,NOUN,2.0,37674,ведать_VERB,VERB,,-1.0,агенс,nsubj
21405,Inan,,Masc,Sing,,,,"(1, 17)",Gen,стадион_NOUN,NOUN,2.0,91363,прибегать_VERB,VERB,со,-1.0,начальная точка,dobj


# Preprocess features and labels

In [32]:
# Frequency of every role label.
role_column = pd_data['role']
y_stat = role_column.value_counts()

# Labels seen fewer than 180 times are considered too rare to keep.
drop_ys = y_stat[y_stat < 180].index

# Remove all rows that carry one of the rare labels.
rows_with_rare_role = pd_data[role_column.isin(drop_ys)].index
clear_data = pd_data.drop(rows_with_rare_role)

In [33]:
# Role-label text (key) and the text it is replaced with (value) during normalization.
repl_roles = {
    'агенс - субъект восприятия' : 'субъект восприятия',
    'агенс - субъект ментального состояния' : 'субъект ментального состояния',
    'результат / цель' : 'результат',
    'место - пациенс' : 'место',
    'говорящий - субъект психологического состояния' : 'субъект психологического состояния'
}


def normalize_single_region(data, rep, val):
    """Replace every occurrence of the text ``rep`` with ``val`` in ``data['role']``.

    The replacement is written back to the ``role`` column of ``data`` and
    matches substrings, so labels that merely contain ``rep`` are rewritten too.

    Args:
        data: DataFrame with a string-valued ``role`` column; modified in place.
        rep: Text to search for, taken literally.
        val: Replacement text.

    Returns:
        None.
    """
    # regex=False: role labels are plain text, and the default value of `regex`
    # differs between pandas versions, so literal matching is requested explicitly.
    data.loc[:, 'role'] = data.loc[:, 'role'].str.replace(rep, val, regex=False)


# Apply every replacement from repl_roles to the role column.
for rep, val in repl_roles.items():
    normalize_single_region(clear_data, rep, val)

# Distribution of the remaining role labels after normalization.
role_counts = clear_data['role'].value_counts()
number_of_roles = len(role_counts)
print('Number of roles: ', number_of_roles)
role_counts

Number of roles:  44


агенс                                 6142
пациенс                               5357
тема                                  3649
субъект психологического состояния    3249
субъект перемещения                   3008
причина                               2501
говорящий                             2356
место                                 2183
содержание действия                   1873
содержание мысли                      1817
содержание высказывания               1789
конечная точка                        1772
результат                             1451
пациенс перемещения                   1355
стимул                                1271
субъект ментального состояния         1223
адресат                                940
субъект восприятия                     901
контрагент                             829
эффектор                               738
субъект социального отношения          598
начальная точка                        588
предмет высказывания                   547
способ     

# Save everything

In [35]:
# Make sure the output directory exists before any artifact is written into it.
os.makedirs(main_model_path, exist_ok=True)

# Feature model object used for extraction.
with open(os.path.join(main_model_path, 'feature_model.pckl'), 'wb') as f:
    pickle.dump(feature_model, f)

# Feature name / type configuration.
feature_config_path = os.path.join(main_model_path, 'feature_vectorization_config.json')
with open(feature_config_path, 'w') as f:
    json.dump(feature_config, f)

# Filtered and normalized feature table.
clear_data.to_csv(os.path.join(main_model_path, 'features.csv'))