In [2]:
import pandas as pd
from transformers import BertTokenizer, BertPreTrainedModel, AdamW, AutoTokenizer, BertConfig, BertModel
from rbert_model import RBERT
import os
import numpy as np
from rbert_data_loader import load_and_cache_examples
from train_relation_extraction import RelationExtractorTrainer, get_tokenizer, model_id_to_path
from rbert_data_loader import TermFrameProcessor, convert_examples_to_features
import torch
from scipy.special import softmax

2022-05-28 12:38:47.701750: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.11.0


In [3]:
# Prefer the GPU, but fall back to the CPU so the notebook still runs on
# machines where CUDA is not available.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [4]:
# Show the selected device; the model and all input tensors are placed on it.
device

device(type='cuda')

In [432]:
# Experiment configuration: which experiment/model to load and the
# tokenisation / batching parameters that go with it.
conf = dict(
    experiment='SL_reg_nonhier+def',
    model_id='EMBEDDIA/crosloengual-bert',
    max_length=128,
    batch_size=4,
    epochs=5,
)

# All artefacts of one experiment live under data/experiments/<experiment>;
# the model (and its evaluation output) sit in a sub-directory derived from
# the model id.
experiment_root = os.path.join('data', 'experiments', conf['experiment'])
conf['model_dir'] = os.path.join(experiment_root, model_id_to_path(conf['model_id']))
conf['eval_dir'] = conf['model_dir']
conf['data_dir'] = experiment_root

In [433]:
# Build the tokenizer for the configured model id with the project helper.
tokenizer = get_tokenizer(conf['model_id'])

In [434]:
# Data processor for this experiment; later cells read `relation_labels`
# from it and use it to turn raw [label, sentence] pairs into examples.
processor = TermFrameProcessor(conf)

In [435]:
# Restore the training arguments and the fine-tuned RBERT weights from the
# experiment's model directory, then move the model to the selected device.
# Note: torch.load unpickles the file, so only load checkpoints you trust.
training_args_path = os.path.join(conf['model_dir'], "training_args.bin")
checkpoint_path = os.path.join(conf['model_dir'], 'model.pt')

args = torch.load(training_args_path)
model = RBERT.from_pretrained(checkpoint_path, args=args)
model.to(device);

In [463]:
def split_marked_sentence(text):
    """Split a sentence containing one ``<e1> ... </e1>`` span into word lists.

    Parameters
    ----------
    text : str
        Sentence in which the entity is wrapped in ``<e1>`` / ``</e1>`` tags.

    Returns
    -------
    tuple[list[str], list[str], list[str]]
        The words before the entity, inside it, and after it. Splitting is
        done on any run of whitespace, so repeated spaces never yield empty
        tokens and a tag at the very start or end of the sentence is handled.

    Raises
    ------
    ValueError
        If either tag is missing or the closing tag precedes the opening one.
    """
    start = text.find('<e1>')
    end = text.find('</e1>')
    if start == -1 or end == -1 or end < start:
        raise ValueError("sentence must contain an '<e1> ... </e1>' span")
    before = text[:start].split()
    inside = text[start + len('<e1>'):end].split()
    after = text[end + len('</e1>'):].split()
    return before, inside, after


sentence = "<e1> Uvala </e1> je večja kraška globel skledaste oblike z neravnim dnom in sklenjenim višjim obodom . Praviloma je manjša od kraškega polja in večja od vrtače ."

# Character offsets of the entity tags; the scoring cell slices `sentence`
# with them when it rebuilds the marked-up text.
i1 = sentence.find('<e1>')
i2 = sentence.find('</e1>')

words_before, words_inside, words_after = split_marked_sentence(sentence)

# Tag-free word sequence. Building it from the three parts guarantees that
# word indices computed from those parts address the right row per word.
pure_sentence = words_before + words_inside + words_after

In [481]:
# Slide an <e2> window over the words outside the <e1> entity, classify every
# resulting marked-up sentence with the relation model, and add the surviving
# logits to the rows of the words covered by the window.
WINDOW_SIZES = [1, 2]  # widths (in words) of the <e2> span to try
LOGIT_THRESHOLD = 7    # logits below this value are zeroed before accumulation

# One row per word of the tag-free sentence, one column per relation label.
word_class_scores = np.zeros((len(pure_sentence), len(processor.relation_labels)))

model.eval()  # inference mode; needs to be set only once

for window_size in WINDOW_SIZES:
    lines = []       # [label, marked-up sentence] pairs fed to the processor
    word_masks = []  # for each line: indices into pure_sentence covered by <e2>

    # Candidate spans to the left of the entity.
    for idx1 in range(len(words_before) - window_size + 1):
        idx2 = idx1 + window_size
        e2_before = words_before[:idx1] + ['<e2>'] + words_before[idx1:idx2] + ['</e2>'] + words_before[idx2:]
        lines.append(['Other', ' '.join(e2_before) + ' ' + sentence[i1:]])
        word_masks.append(list(range(idx1, idx2)))

    # Candidate spans to the right of the entity; `offset` converts positions
    # in words_after into positions in pure_sentence.
    offset = len(words_before) + len(words_inside)
    # NOTE(review): this range produces len(words_after) - window_size windows,
    # one fewer than the left-hand loop would, so the last word of the sentence
    # is never inside an <e2> span. Confirm this is intended (e.g. to skip the
    # final punctuation token).
    for idx1 in range(len(words_after) - window_size):
        idx2 = idx1 + window_size
        e2_after = words_after[:idx1] + ['<e2>'] + words_after[idx1:idx2] + ['</e2>'] + words_after[idx2:]
        lines.append(['Other', sentence[:i2 + 5] + ' ' + ' '.join(e2_after)])
        word_masks.append(list(range(idx1 + offset, idx2 + offset)))

    if not lines:
        # Sentence too short for this window size: there is nothing to score,
        # and an empty batch must not be passed to the model.
        continue

    examples = processor._create_examples(lines, 'train')
    features = convert_examples_to_features(
        examples, conf['max_length'], tokenizer, add_sep_token=False
    )

    all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long, device=device)
    all_attention_mask = torch.tensor([f.attention_mask for f in features], dtype=torch.long, device=device)
    all_token_type_ids = torch.tensor([f.token_type_ids for f in features], dtype=torch.long, device=device)
    all_e1_mask = torch.tensor([f.e1_mask for f in features], dtype=torch.long, device=device)  # add e1 mask
    all_e2_mask = torch.tensor([f.e2_mask for f in features], dtype=torch.long, device=device)  # add e2 mask

    with torch.no_grad():
        # The fourth positional argument is passed as None (presumably the
        # labels slot of RBERT.forward — confirm against the model code).
        outputs = model(all_input_ids, all_attention_mask, all_token_type_ids, None, all_e1_mask, all_e2_mask)
    logits = outputs[0].detach().cpu().numpy()

    # Keep only confident predictions, then credit each of them to every word
    # that was inside the corresponding <e2> window.
    logits[logits < LOGIT_THRESHOLD] = 0
    for mask, row in zip(word_masks, logits):
        word_class_scores[mask] += row

In [485]:
# Inspect the accumulated score matrix: one row per word of `pure_sentence`,
# one column per entry of `processor.relation_labels`.
word_class_scores

array([[ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        , 15.75531721],
       [ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        , 23.16911125],
       [ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  7.35302114],
       [ 0.        ,  0.        ,  0.        , 17.06823254,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , 26.40349102,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , 26.41017342,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , 26.54001331,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , 26.49033928,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.    

In [482]:
# For every word keep the label with the highest accumulated score together
# with that score.
res = []
for word, score in zip(pure_sentence, word_class_scores):
    best_label = processor.relation_labels[np.argmax(score)]
    res.append((word, best_label, np.max(score)))

In [483]:
# Tabulate the (word, predicted label, score) triples collected in `res`.
df = pd.DataFrame.from_records(res, columns=["Word", 'non-hierarchical', 'score'])

In [484]:
# Display the per-word prediction table.
df

Unnamed: 0,Word,non-hierarchical,score
0,Uvala,Other,0.0
1,je,HAS_SIZE,15.755317
2,večja,HAS_SIZE,23.169111
3,kraška,HAS_SIZE,7.353021
4,globel,HAS_FORM,17.068233
5,skledaste,HAS_FORM,26.403491
6,oblike,HAS_FORM,26.410173
7,z,HAS_FORM,26.540013
8,neravnim,HAS_FORM,26.490339
9,dnom,HAS_FORM,26.501854


In [468]:
# Load the word-level table from data/full_data_new_SL.csv.
# Note: this rebinds `df`, so any frame previously held under that name is gone.
df = pd.read_csv('data/full_data_new_SL.csv')

In [472]:
# Show the rows whose 'Sentence' column equals 2, for comparison with the
# model's per-word predictions.
df[df['Sentence'] ==2]

Unnamed: 0,Sentence,Word,category,hierarchical,non-hierarchical,non-hierarchical-definitor
53,2,Uvala,A.1_Surface_landform,DEFINIENDUM,,
54,2,je,,DEFINITOR,,
55,2,večja,,,HAS_SIZE,
56,2,kraška,,GENUS,,
57,2,globel,A.1_Surface_landform,GENUS,,
58,2,skledaste,,,HAS_FORM,
59,2,oblike,,,HAS_FORM,frame_FORM
60,2,z,,,HAS_FORM,
61,2,neravnim,,,HAS_FORM,
62,2,dnom,,,HAS_FORM,


In [121]:
# Wrap the current `logits` array in a DataFrame with one column per relation
# label (rebinds `df`).
df = pd.DataFrame(logits, columns=processor.relation_labels)

In [122]:
# Display the logits table.
df

Unnamed: 0,Other,HAS_CAUSE,HAS_LOCATION,HAS_FORM,COMPOSITION_MEDIUM,HAS_FUNCTION,HAS_SIZE
0,0.0,0.0,0.0,7.875909,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,7.471579
2,0.0,0.0,0.0,0.0,0.0,0.0,5.942061
3,0.0,0.0,0.0,0.0,0.0,0.0,5.583723
4,0.0,0.0,0.0,8.230618,0.0,0.0,0.0
5,0.0,0.0,0.0,9.190335,0.0,0.0,0.0
6,0.0,0.0,0.0,8.36814,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,7.919581,0.0,0.0
9,0.0,0.0,0.0,0.0,5.230893,0.0,0.0


In [120]:
# Print the text of every generated example, prefixed by its index in `lines`.
for line_no, example in enumerate(lines):
    print(line_no, example[1])

0 Geer moraines or <e1> washboard moraines </e1> <e2> are series of small and </e2> roughly parallel ridges of till that are ordinarily associated with lakes or former lakes .
1 Geer moraines or <e1> washboard moraines </e1> are <e2> series of small and roughly </e2> parallel ridges of till that are ordinarily associated with lakes or former lakes .
2 Geer moraines or <e1> washboard moraines </e1> are series <e2> of small and roughly parallel </e2> ridges of till that are ordinarily associated with lakes or former lakes .
3 Geer moraines or <e1> washboard moraines </e1> are series of <e2> small and roughly parallel ridges </e2> of till that are ordinarily associated with lakes or former lakes .
4 Geer moraines or <e1> washboard moraines </e1> are series of small <e2> and roughly parallel ridges of </e2> till that are ordinarily associated with lakes or former lakes .
5 Geer moraines or <e1> washboard moraines </e1> are series of small and <e2> roughly parallel ridges of till </e2> that

In [1]:
# Peek at the <e2> mask of the example at index 7.
# NOTE(review): the recorded run raised NameError because this cell was
# executed before the cell that defines `all_e2_mask`; run the scoring cell
# first.
all_e2_mask[7]

NameError: name 'all_e2_mask' is not defined

In [51]:
# Per-example winning column and the logit value found there.
arg_max = logits.argmax(axis=1)
max_vals = logits.max(axis=1)

In [63]:
# Look at one generated example: a [label, marked-up sentence] pair.
lines[10]

['Other',
 'Geer moraines or <e1> washboard moraines </e1> are series of small and roughly parallel ridges of till that are <e2> ordinarily associated with </e2> lakes or former lakes .']

In [42]:
# List the relation labels; the notebook uses them as column names for the
# logits and indexes them with argmax results.
processor.relation_labels

['Other',
 'HAS_CAUSE',
 'HAS_LOCATION',
 'HAS_FORM',
 'COMPOSITION_MEDIUM',
 'HAS_FUNCTION',
 'HAS_SIZE']

In [56]:
# Load the word-level table from data/full_data_new_EN.csv (rebinds `df`).
df = pd.read_csv('data/full_data_new_EN.csv')

In [61]:
# Show the rows whose 'Sentence' column equals 79.
df[df['Sentence'] == 79]

Unnamed: 0,Sentence,Word,category,hierarchical,non-hierarchical,non-hierarchical-definitor
1703,79,Geer,A.4_Other,DEFINIENDUM,,
1704,79,moraines,A.4_Other,DEFINIENDUM,,
1705,79,or,,,,
1706,79,washboard,A.4_Other,DEFINIENDUM,,
1707,79,moraines,A.4_Other,DEFINIENDUM,,
1708,79,are,,DEFINITOR,,
1709,79,series,,,,
1710,79,of,,,,
1711,79,small,,,HAS_SIZE,
1712,79,and,,,,


In [19]:
# Dump every generated [label, marked-up sentence] example.
lines

[['Other',
  '<e2> Geer moraines or </e2> <e1> washboard moraines </e1> are series of small and roughly parallel ridges of till that are ordinarily associated with lakes or former lakes .'],
 ['Other',
  'Geer moraines or <e1> washboard moraines </e1> are series of <e2> small and roughly </e2> parallel ridges of till that are ordinarily associated with lakes or former lakes .'],
 ['Other',
  'Geer moraines or <e1> washboard moraines </e1> are series of small <e2> and roughly parallel </e2> ridges of till that are ordinarily associated with lakes or former lakes .'],
 ['Other',
  'Geer moraines or <e1> washboard moraines </e1> are series of small and <e2> roughly parallel ridges </e2> of till that are ordinarily associated with lakes or former lakes .'],
 ['Other',
  'Geer moraines or <e1> washboard moraines </e1> are series of small and roughly <e2> parallel ridges of </e2> till that are ordinarily associated with lakes or former lakes .'],
 ['Other',
  'Geer moraines or <e1> washboard