In [2]:
import pandas as pd
from transformers import BertTokenizer, BertPreTrainedModel, AdamW, AutoTokenizer, BertConfig, BertModel
from rbert_model import RBERT
import os
import numpy as np
from rbert_data_loader import load_and_cache_examples
from train_relation_extraction import RelationExtractorTrainer, get_tokenizer, model_id_to_path
from rbert_data_loader import TermFrameProcessor, convert_examples_to_features
import torch
from scipy.special import softmax

2022-05-28 16:49:02.368980: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.11.0


In [3]:
# Prefer the GPU, but fall back to the CPU so the notebook still runs
# (slowly) on machines without CUDA instead of failing at `model.to(device)`.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [4]:
# Display the selected torch device to confirm where the model and tensors will live.
device

device(type='cuda')

In [5]:
# Experiment configuration: the basic hyper-parameters first, then the
# directories derived from the experiment name and the model id.
conf = dict(
    experiment='SL_reg_nonhier+def',
    model_id='EMBEDDIA/crosloengual-bert',
    max_length=128,
    batch_size=4,
    epochs=5,
)

# Everything belonging to this experiment lives under data/experiments/<experiment>.
experiment_dir = os.path.join('data', 'experiments', conf['experiment'])

# The model is stored in a sub-directory named after the model id;
# evaluation artefacts share that same directory.
conf['model_dir'] = os.path.join(experiment_dir, model_id_to_path(conf['model_id']))
conf['eval_dir'] = conf['model_dir']
conf['data_dir'] = experiment_dir

In [6]:
# Build the tokenizer for the configured model id via the project helper
# `get_tokenizer`; it is later passed to `convert_examples_to_features`.
tokenizer = get_tokenizer(conf['model_id'])

In [7]:
# Data processor built from the experiment config. Later cells read its
# `relation_labels` and call its `_create_examples` to wrap raw lines.
processor = TermFrameProcessor(conf)

In [8]:
# Restore the training arguments saved next to the model and rebuild the
# R-BERT model from the stored weights.
# NOTE: torch.load unpickles the file, so only load artefacts from a trusted source.
training_args_path = os.path.join(conf['model_dir'], "training_args.bin")
model_weights_path = os.path.join(conf['model_dir'], 'model.pt')

args = torch.load(training_args_path)
model = RBERT.from_pretrained(model_weights_path, args=args)

# Move the model to the selected device (assignment keeps the cell output empty).
model = model.to(device)

In [9]:
E1_OPEN, E1_CLOSE = '<e1>', '</e1>'


def split_around_e1(text):
    """Split a sentence into the words before, inside and after its <e1> span.

    Args:
        text: Sentence containing exactly one ``<e1> ... </e1>`` marked entity.

    Returns:
        Tuple ``(start, end, before, inside, after)`` where ``start`` / ``end``
        are the character offsets of the opening / closing tag in ``text`` and
        the remaining items are lists of whitespace-separated words.

    Raises:
        ValueError: If either tag is missing or the closing tag comes first.
    """
    start = text.find(E1_OPEN)
    end = text.find(E1_CLOSE)
    if start == -1 or end == -1 or end < start:
        raise ValueError("sentence must contain an '<e1> ... </e1>' span")
    # str.split() without a separator collapses runs of whitespace and never
    # yields empty strings, so no post-hoc removal of '' entries is needed.
    before = text[:start].split()
    inside = text[start + len(E1_OPEN):end].split()
    after = text[end + len(E1_CLOSE):].split()
    return start, end, before, inside, after


sentence = "<e1> Uvala </e1> je večja kraška globel skledaste oblike z neravnim dnom in sklenjenim višjim obodom . Praviloma je manjša od kraškega polja in večja od vrtače ."

# i1 / i2 are the character offsets of the opening / closing entity tag.
i1, i2, words_before, words_inside, words_after = split_around_e1(sentence)

# Tag-free word list. Built from the three parts so that word indices are
# guaranteed to line up with offsets computed from those parts.
pure_sentence = words_before + words_inside + words_after

In [15]:
# Logits below this value are zeroed before accumulation, so only confident
# predictions contribute to a word's score. The value 7 is kept from the
# original experiment; its rationale is not recorded here.
LOGIT_THRESHOLD = 7
# Sizes (in words) of the sliding <e2> window.
WINDOW_SIZES = [1, 2]


def mark_e2_windows(words, window_size):
    """Yield ``(start, marked_words)`` for every sliding window over ``words``.

    ``marked_words`` is a copy of ``words`` with ``<e2>`` / ``</e2>`` inserted
    around ``words[start:start + window_size]``. Every full window is produced,
    including the one ending on the last word; nothing is yielded when
    ``words`` is shorter than ``window_size``.
    """
    for start in range(len(words) - window_size + 1):
        stop = start + window_size
        yield start, words[:start] + ['<e2>'] + words[start:stop] + ['</e2>'] + words[stop:]


# One row per word of the tag-free sentence, one column per relation label.
word_class_scores = np.zeros((len(pure_sentence), len(processor.relation_labels)))

# Inference only: switch to eval mode once, outside the loop.
model.eval()

for window_size in WINDOW_SIZES:
    lines = []       # [label, text] pairs; 'Other' is a placeholder label
    word_masks = []  # for each line, indices into pure_sentence covered by <e2>

    # Candidate <e2> spans located before the <e1> entity.
    for start, marked in mark_e2_windows(words_before, window_size):
        lines.append(['Other', ' '.join(marked) + ' ' + sentence[i1:]])
        word_masks.append(list(range(start, start + window_size)))

    # Candidate <e2> spans located after the <e1> entity. The original loop ran
    # range(window_size, len(words_after)) and therefore skipped the final
    # window; the shared helper makes both sides cover every window.
    offset = len(words_before) + len(words_inside)
    for start, marked in mark_e2_windows(words_after, window_size):
        lines.append(['Other', sentence[:i2 + len('</e1>')] + ' ' + ' '.join(marked)])
        word_masks.append(list(range(start + offset, start + offset + window_size)))

    # Nothing to score for this window size (sentence shorter than the window).
    if not lines:
        continue

    examples = processor._create_examples(lines, 'train')
    features = convert_examples_to_features(
        examples, conf['max_length'], tokenizer, add_sep_token=False
    )

    all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long, device=device)
    all_attention_mask = torch.tensor([f.attention_mask for f in features], dtype=torch.long, device=device)
    all_token_type_ids = torch.tensor([f.token_type_ids for f in features], dtype=torch.long, device=device)
    all_e1_mask = torch.tensor([f.e1_mask for f in features], dtype=torch.long, device=device)  # add e1 mask
    all_e2_mask = torch.tensor([f.e2_mask for f in features], dtype=torch.long, device=device)  # add e2 mask

    # Labels are passed as None: only the logits are needed here.
    with torch.no_grad():
        outputs = model(all_input_ids, all_attention_mask, all_token_type_ids, None, all_e1_mask, all_e2_mask)
    logits = outputs[0].detach().cpu().numpy()

    # Class probabilities of the last batch, kept for interactive inspection;
    # computed before thresholding so they reflect the raw logits.
    probs = softmax(logits, axis=1)

    logits[logits < LOGIT_THRESHOLD] = 0

    # Spread each candidate's logits over the words it covers; dividing by the
    # window size gives each covered word an equal share.
    for mask, row in zip(word_masks, logits):
        word_class_scores[mask] += row / window_size

In [16]:
# Inspect the accumulated score matrix (rows: words, columns: relation labels).
word_class_scores

array([[ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        , 11.80213904],
       [ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        , 15.53942251],
       [ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  3.67651057],
       [ 0.        ,  0.        ,  0.        , 12.64386415,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , 17.60972786,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , 17.61022091,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , 17.70760632,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , 17.67105293,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.    

In [27]:
# For every word pick the relation label with the highest accumulated score.
best_label_indices = word_class_scores.argmax(axis=1)
best_label_scores = word_class_scores.max(axis=1)

# One (word, predicted label, score) tuple per word of the tag-free sentence.
res = [
    (token, processor.relation_labels[label_idx], top_score)
    for token, label_idx, top_score in zip(pure_sentence, best_label_indices, best_label_scores)
]

In [28]:
# Tabulate the per-word predictions: the word, its predicted label and that label's score.
df = pd.DataFrame.from_records(res, columns=["Word", 'non-hierarchical', 'score'])

In [29]:
# Display the per-word prediction table.
df

Unnamed: 0,Word,non-hierarchical,score
0,Uvala,Other,0.0
1,je,HAS_SIZE,11.802139
2,večja,HAS_SIZE,15.539423
3,kraška,HAS_SIZE,3.676511
4,globel,HAS_FORM,12.643864
5,skledaste,HAS_FORM,17.609728
6,oblike,HAS_FORM,17.610221
7,z,HAS_FORM,17.707606
8,neravnim,HAS_FORM,17.671053
9,dnom,HAS_FORM,17.667773


In [30]:
# Load the reference dataset (path is relative to the working directory).
# NOTE(review): presumably the gold annotations used to check the predictions — confirm.
df_true = pd.read_csv('data/full_data_new_SL.csv')

In [31]:
# Show the reference rows whose 'Sentence' column equals 2.
# NOTE(review): presumably this is the sentence scored above — confirm.
df_true[df_true['Sentence'] ==2]

Unnamed: 0,Sentence,Word,category,hierarchical,non-hierarchical,non-hierarchical-definitor
53,2,Uvala,A.1_Surface_landform,DEFINIENDUM,,
54,2,je,,DEFINITOR,,
55,2,večja,,,HAS_SIZE,
56,2,kraška,,GENUS,,
57,2,globel,A.1_Surface_landform,GENUS,,
58,2,skledaste,,,HAS_FORM,
59,2,oblike,,,HAS_FORM,frame_FORM
60,2,z,,,HAS_FORM,
61,2,neravnim,,,HAS_FORM,
62,2,dnom,,,HAS_FORM,
