In [None]:
import pandas as pd
from transformers import BertTokenizer, BertPreTrainedModel, AdamW, AutoTokenizer, BertConfig, BertModel
from rbert_model import RBERT
import os
import numpy as np
from rbert_data_loader import load_and_cache_examples
from train_relation_extraction import RelationExtractorTrainer, get_tokenizer, model_id_to_path
from rbert_data_loader import TermFrameProcessor, convert_examples_to_features
import torch
from scipy.special import softmax

In [None]:
# Prefer the GPU when CUDA is usable, otherwise fall back to the CPU so that the
# later `model.to(device)` and `torch.tensor(..., device=device)` calls still
# work on machines without a GPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
# Echo the selected torch device as the cell output.
device

In [None]:
# Experiment configuration. The experiment directory and the model directory
# are computed once and reused for the derived path entries.
experiment = 'EN_reg_nonhier+def'
model_id = 'allenai/scibert_scivocab_cased'
experiment_dir = os.path.join('data', 'experiments', experiment)
model_dir = os.path.join(experiment_dir, model_id_to_path(model_id))

conf = {
    'experiment': experiment,
    'model_id': model_id,
    'max_length': 128,
    'batch_size': 4,
    'epochs': 5,
    # Trained model lives under data/experiments/<experiment>/<model path>.
    'model_dir': model_dir,
    # Evaluation artefacts share the model directory.
    'eval_dir': model_dir,
    # Data directory is the experiment directory itself.
    'data_dir': experiment_dir,
}

In [None]:
# Build the tokenizer for the configured model id using the project helper
# imported from `train_relation_extraction`.
tokenizer = get_tokenizer(conf['model_id'])

In [None]:
# Project data processor built from `conf`; used below for `_create_examples`
# and for the `relation_labels` column names.
processor = TermFrameProcessor(conf)

In [None]:
# Restore the saved training arguments and the model from the model directory,
# then move the model to the selected device.
checkpoint_dir = conf['model_dir']
# NOTE(review): torch.load unpickles the file, so only load checkpoints from a
# trusted source.
args = torch.load(os.path.join(checkpoint_dir, "training_args.bin"))
model = RBERT.from_pretrained(os.path.join(checkpoint_dir, 'model.pt'), args=args)
# Trailing `;` suppresses the cell output.
model.to(device);

In [None]:
def _mark_window(words, start, size):
    """Join `words` with spaces, wrapping `words[start:start + size]` in `<e2> ... </e2>`."""
    stop = start + size
    marked = words[:start] + ['<e2>'] + words[start:stop] + ['</e2>'] + words[stop:]
    return ' '.join(marked)


def build_e2_candidates(sentence, window_size=2, label='Other'):
    """Generate one candidate row per sliding `<e2>` window around the `<e1>` span.

    The text before `<e1>` and the text after `</e1>` are split on whitespace.
    For every window of `window_size` consecutive words in either context, a
    copy of the sentence is produced with that window wrapped in
    `<e2> ... </e2>`. The `<e1> ... </e1>` span itself is never modified.

    Args:
        sentence: Text containing exactly the markers `<e1>` and `</e1>`.
        window_size: Number of words per `<e2>` window (must be >= 1).
        label: Value stored as the first element of every returned row.

    Returns:
        A list of `[label, marked_sentence]` rows: windows before `<e1>` first
        (left to right), then windows after `</e1>` (left to right). A context
        with fewer than `window_size` words contributes no rows.

    Raises:
        ValueError: If `window_size` < 1, or if `<e1>` / `</e1>` is missing or
            out of order.
    """
    e1_open, e1_close = '<e1>', '</e1>'
    if window_size < 1:
        raise ValueError('window_size must be at least 1, got %r' % (window_size,))

    e1_start = sentence.find(e1_open)
    e1_stop = sentence.find(e1_close)
    # str.find returns -1 when the marker is absent; slicing with -1 would
    # silently produce wrong text, so fail loudly instead.
    if e1_start == -1 or e1_stop < e1_start:
        raise ValueError('sentence must contain <e1> followed by </e1>')
    e1_stop += len(e1_close)

    # split() without an argument returns [] for an empty context, so no
    # empty-token window is generated when <e1> starts or ends the sentence.
    words_before = sentence[:e1_start].split()
    words_after = sentence[e1_stop:].split()

    rows = []
    for start in range(len(words_before) - window_size + 1):
        marked = _mark_window(words_before, start, window_size)
        rows.append([label, marked + ' ' + sentence[e1_start:]])
    # Same window count formula as above, so the last window of the trailing
    # context is included as well.
    for start in range(len(words_after) - window_size + 1):
        marked = _mark_window(words_after, start, window_size)
        rows.append([label, sentence[:e1_stop] + ' ' + marked])
    return rows


sentence = "Geer moraines or <e1> washboard moraines </e1> are series of small and roughly parallel ridges of till that are ordinarily associated with lakes or former lakes ."
window_size = 2
lines = build_e2_candidates(sentence, window_size)
# lines.append(['HAS_FUNCTION', '<e1> Grab samplers </e1> are buckets or segments that <e2> drive into the sediment layer and enclose and retain a layer </e2> .'])

In [None]:
# Inspect the generated [label, text] candidate rows.
lines

In [None]:
# Convert the [label, text] rows into processor examples.
# NOTE(review): 'train' is passed as the second argument although this notebook
# only runs inference — presumably a split name; confirm it has no
# training-specific effect.
# NOTE(review): `_create_examples` is underscore-prefixed, i.e. a private method.
examples = processor._create_examples(lines, 'train')

In [None]:
# Convert the examples into model input features using the configured maximum
# length and tokenizer; `add_sep_token` is explicitly disabled.
features = convert_examples_to_features(
    examples, conf['max_length'], tokenizer, add_sep_token=False
)

In [None]:
def _stack_feature(field):
    """Collect `field` from every entry of `features` into one long tensor on `device`."""
    values = [getattr(feature, field) for feature in features]
    return torch.tensor(values, dtype=torch.long, device=device)


# One tensor per model input, each built from the same-named feature attribute.
all_input_ids = _stack_feature('input_ids')
all_attention_mask = _stack_feature('attention_mask')
all_token_type_ids = _stack_feature('token_type_ids')
all_e1_mask = _stack_feature('e1_mask')  # marks the <e1> span
all_e2_mask = _stack_feature('e2_mask')  # marks the <e2> span

all_label_ids = _stack_feature('label_id')

In [None]:
# Call `model.eval()` before inference; the trailing `;` suppresses the cell output.
model.eval();

In [None]:
# Single forward pass over all candidates at once, without gradient tracking.
# The fourth positional argument is passed as None — presumably the labels
# slot; confirm against RBERT.forward.
with torch.no_grad():
    outputs = model(
        all_input_ids,
        all_attention_mask,
        all_token_type_ids,
        None,
        all_e1_mask,
        all_e2_mask,
    )

# First element of the model output as a NumPy array, plus a softmax over axis 1.
logits = outputs[0].detach().cpu().numpy()
probs = softmax(logits, axis=1)

In [None]:
# Display the raw logits array.
logits

In [None]:
# Re-derive `logits` from the stored model outputs (same expression as in the
# inference cell), e.g. to start again after the thresholding cell has modified it.
logits = outputs[0].detach().cpu().numpy()

In [None]:
# Display the re-derived logits array.
logits

In [None]:
# Keep only confident predictions: every logit below the cut-off becomes 0.
# np.where returns a NEW array instead of writing into the existing one, so
# re-running this cell is harmless and the buffer `logits` came from is left
# untouched (per the PyTorch docs, `.numpy()` can share memory with the source
# tensor when that tensor already lives on the CPU).
LOGIT_THRESHOLD = 8
logits = np.where(logits < LOGIT_THRESHOLD, 0, logits)

In [None]:
# Tabulate the (thresholded) logits with one column per relation label.
df = pd.DataFrame(logits, columns=processor.relation_labels)

In [None]:
# Display the logit table.
df

In [None]:
# Print every candidate sentence next to its row index, so that rows of the
# logit table can be matched to their text.
for position, row in enumerate(lines):
    print(position, row[1])

In [None]:
# Per-row maximum logit and the column index where it occurs.
max_vals = logits.max(axis=1)
arg_max = logits.argmax(axis=1)

In [None]:
# Inspect the candidate row at index 10.
lines[10]

In [None]:
# Show the relation label names (used as column names of the logit table).
processor.relation_labels

In [None]:
# Load the annotation CSV.
# NOTE(review): this rebinds `df`, which previously held the logit table; the
# logit table is no longer reachable through `df` after this cell runs.
df = pd.read_csv('data/full_data_new_EN.csv')

In [None]:
# Show the annotation rows whose 'Sentence' column equals 79.
is_sentence_79 = df['Sentence'] == 79
df[is_sentence_79]

In [None]:
# Display the candidate rows again.
lines

In [63]:
logits

array([[-3.3230584 , -0.28777394,  0.3915504 ,  2.6353245 ,  4.0684237 ,
        -2.2714093 , -2.2305045 ],
       [-3.61521   , -0.5822604 , -0.27176338,  2.2229083 ,  4.8831253 ,
        -1.4569333 , -2.4374123 ],
       [-1.7090881 , -1.5794371 , -1.728625  ,  8.812912  , -1.9842335 ,
        -1.9991543 ,  0.09666806],
       [-1.2743081 , -2.2612534 , -2.9260657 ,  3.9110909 , -1.0057275 ,
        -2.1319952 ,  5.9472904 ],
       [-0.54082996, -1.9248059 , -1.7650276 , -1.7264402 , -0.8863953 ,
        -1.4189417 ,  8.554747  ],
       [-0.60751706, -1.9159092 , -1.8468326 , -1.3841116 , -1.2140503 ,
        -1.4515196 ,  8.654597  ],
       [-1.0642627 , -2.164248  , -2.4575984 ,  1.93889   , -2.2918904 ,
        -1.38612   ,  7.6464434 ],
       [-1.6847495 , -1.7833356 , -1.3581494 ,  9.161529  , -2.1036088 ,
        -1.3656468 , -0.9450502 ],
       [-1.6853762 , -1.6389939 , -1.13269   ,  9.09769   , -2.0025904 ,
        -1.4591932 , -1.2642449 ],
       [-1.8404146 , -1.7514

In [64]:
logits = outputs[0].detach().cpu().numpy()

In [65]:
logits

array([[-3.3230584 , -0.28777394,  0.3915504 ,  2.6353245 ,  4.0684237 ,
        -2.2714093 , -2.2305045 ],
       [-3.61521   , -0.5822604 , -0.27176338,  2.2229083 ,  4.8831253 ,
        -1.4569333 , -2.4374123 ],
       [-1.7090881 , -1.5794371 , -1.728625  ,  8.812912  , -1.9842335 ,
        -1.9991543 ,  0.09666806],
       [-1.2743081 , -2.2612534 , -2.9260657 ,  3.9110909 , -1.0057275 ,
        -2.1319952 ,  5.9472904 ],
       [-0.54082996, -1.9248059 , -1.7650276 , -1.7264402 , -0.8863953 ,
        -1.4189417 ,  8.554747  ],
       [-0.60751706, -1.9159092 , -1.8468326 , -1.3841116 , -1.2140503 ,
        -1.4515196 ,  8.654597  ],
       [-1.0642627 , -2.164248  , -2.4575984 ,  1.93889   , -2.2918904 ,
        -1.38612   ,  7.6464434 ],
       [-1.6847495 , -1.7833356 , -1.3581494 ,  9.161529  , -2.1036088 ,
        -1.3656468 , -0.9450502 ],
       [-1.6853762 , -1.6389939 , -1.13269   ,  9.09769   , -2.0025904 ,
        -1.4591932 , -1.2642449 ],
       [-1.8404146 , -1.7514

In [66]:
logits[logits < 8] = 0

In [67]:
df = pd.DataFrame(logits, columns=processor.relation_labels)

In [68]:
df

Unnamed: 0,Other,HAS_CAUSE,HAS_LOCATION,HAS_FORM,COMPOSITION_MEDIUM,HAS_FUNCTION,HAS_SIZE
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,8.812912,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,8.554747
5,0.0,0.0,0.0,0.0,0.0,0.0,8.654597
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,9.161529,0.0,0.0,0.0
8,0.0,0.0,0.0,9.09769,0.0,0.0,0.0
9,0.0,0.0,0.0,9.04278,0.0,0.0,0.0


In [69]:
for l in range(len(lines)):
    print(l, lines[l][1])

0 <e2> Geer moraines </e2> or <e1> washboard moraines </e1> are series of small and roughly parallel ridges of till that are ordinarily associated with lakes or former lakes .
1 Geer <e2> moraines or </e2> <e1> washboard moraines </e1> are series of small and roughly parallel ridges of till that are ordinarily associated with lakes or former lakes .
2 Geer moraines or <e1> washboard moraines </e1> <e2> are series </e2> of small and roughly parallel ridges of till that are ordinarily associated with lakes or former lakes .
3 Geer moraines or <e1> washboard moraines </e1> are <e2> series of </e2> small and roughly parallel ridges of till that are ordinarily associated with lakes or former lakes .
4 Geer moraines or <e1> washboard moraines </e1> are series <e2> of small </e2> and roughly parallel ridges of till that are ordinarily associated with lakes or former lakes .
5 Geer moraines or <e1> washboard moraines </e1> are series of <e2> small and </e2> roughly parallel ridges of till that

In [51]:
max_vals = np.max(logits, axis=1)
arg_max = np.argmax(logits, axis=1)

In [63]:
lines[10]

['Other',
 'Geer moraines or <e1> washboard moraines </e1> are series of small and roughly parallel ridges of till that are <e2> ordinarily associated with </e2> lakes or former lakes .']

In [42]:
processor.relation_labels

['Other',
 'HAS_CAUSE',
 'HAS_LOCATION',
 'HAS_FORM',
 'COMPOSITION_MEDIUM',
 'HAS_FUNCTION',
 'HAS_SIZE']

In [56]:
df = pd.read_csv('data/full_data_new_EN.csv')

In [61]:
df[df['Sentence'] ==79]

Unnamed: 0,Sentence,Word,category,hierarchical,non-hierarchical,non-hierarchical-definitor
1703,79,Geer,A.4_Other,DEFINIENDUM,,
1704,79,moraines,A.4_Other,DEFINIENDUM,,
1705,79,or,,,,
1706,79,washboard,A.4_Other,DEFINIENDUM,,
1707,79,moraines,A.4_Other,DEFINIENDUM,,
1708,79,are,,DEFINITOR,,
1709,79,series,,,,
1710,79,of,,,,
1711,79,small,,,HAS_SIZE,
1712,79,and,,,,


In [19]:
lines

[['Other',
  '<e2> Geer moraines or </e2> <e1> washboard moraines </e1> are series of small and roughly parallel ridges of till that are ordinarily associated with lakes or former lakes .'],
 ['Other',
  'Geer moraines or <e1> washboard moraines </e1> are series of <e2> small and roughly </e2> parallel ridges of till that are ordinarily associated with lakes or former lakes .'],
 ['Other',
  'Geer moraines or <e1> washboard moraines </e1> are series of small <e2> and roughly parallel </e2> ridges of till that are ordinarily associated with lakes or former lakes .'],
 ['Other',
  'Geer moraines or <e1> washboard moraines </e1> are series of small and <e2> roughly parallel ridges </e2> of till that are ordinarily associated with lakes or former lakes .'],
 ['Other',
  'Geer moraines or <e1> washboard moraines </e1> are series of small and roughly <e2> parallel ridges of </e2> till that are ordinarily associated with lakes or former lakes .'],
 ['Other',
  'Geer moraines or <e1> washboard