In [1]:
import pandas as pd
from transformers import BertTokenizer, BertPreTrainedModel, AdamW, AutoTokenizer, BertConfig, BertModel
from rbert_model import RBERT
import os
import numpy as np
from rbert_data_loader import load_and_cache_examples
from train_relation_extraction import RelationExtractorTrainer, get_tokenizer, model_id_to_path
from rbert_data_loader import TermFrameProcessor, convert_examples_to_features
import torch
from scipy.special import softmax

In [2]:
device = torch.device('cuda')

In [3]:
device

device(type='cuda')

In [4]:
conf = {'experiment': 'EN_reg_nonhier+def',
        'model_id': 'allenai/scibert_scivocab_cased',
        'max_length': 128,
        'batch_size': 4,
        'epochs': 5}
conf['model_dir'] = os.path.join('data', 'experiments', conf['experiment'], model_id_to_path(conf['model_id']))
conf['eval_dir'] = conf['model_dir']
conf['data_dir'] = os.path.join('data', 'experiments', conf['experiment'])

In [5]:
tokenizer = get_tokenizer(conf['model_id'])

In [6]:
processor = TermFrameProcessor(conf)

In [54]:
args = torch.load(os.path.join(conf['model_dir'], "training_args.bin"))
model = RBERT.from_pretrained(os.path.join(conf['model_dir'], 'model.pt'), args=args)
model.to(device);
model.eval()

RBERT(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(31116, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      

In [128]:
def predict_line(line) :
    examples = processor._create_examples(line, 'train')
    features = convert_examples_to_features(
        examples, conf['max_length'], tokenizer, add_sep_token=False
    )
    
    all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long, device=device)
    all_attention_mask = torch.tensor([f.attention_mask for f in features], dtype=torch.long, device=device)
    all_token_type_ids = torch.tensor([f.token_type_ids for f in features], dtype=torch.long, device=device)
    all_e1_mask = torch.tensor([f.e1_mask for f in features], dtype=torch.long, device=device)  # add e1 mask
    all_e2_mask = torch.tensor([f.e2_mask for f in features], dtype=torch.long, device=device)  # add e2 mask

    all_label_ids = torch.tensor([f.label_id for f in features], dtype=torch.long, device=device)
    
    # for i in range(len(all_input_ids))
    with torch.no_grad():
        outputs = model(all_input_ids, all_attention_mask, all_token_type_ids, None, all_e1_mask, all_e2_mask)
        logits = outputs[0].detach().cpu().numpy()
        probs = softmax(logits, axis=1)
    detection = 0
    max_val = np.max(logits[0])
    if max_val > 7:
        detection = np.argmax(logits[0])
    
    return detection, max_val

In [129]:
sentence = "Geer moraines or <e1> washboard moraines </e1> are series of small and roughly parallel ridges of till that are ordinarily associated with lakes or former lakes ."
i1 = sentence.find('<e1>')
i2 = sentence.find('</e1>')
window_size_start = 2
window_size = window_size_start
words_before = sentence[:i1].strip().split(' ')
words_after = sentence[i2:].strip().split(' ')
lines = []
preds = []
idx1 = 0
while idx1 < (len(words_before) - window_size + 1):
    idx2 = idx1 + window_size
    e2_before = words_before[:idx1] + ['<e2>'] + words_before[idx1:idx2] + ['</e2>'] + words_before[idx2:]
    line = [['Other', ' '.join(e2_before) + ' ' + sentence[i1:]]]
    prediction, confidence = predict_line(line)
    
    if prediction != 0 :
        max_confidence = confidence
        nu_prediction = prediction
        nu_confidence = confidence
        e2_before_nu = e2_before
        
        window_size += 1
        while prediction == nu_prediction and idx2 < len(words_before) and max_confidence >= nu_confidence - 1 :
            e2_before = e2_before_nu
            
            idx2 = idx1 + window_size
            e2_before_nu = words_before[:idx1] + ['<e2>'] + words_before[idx1:idx2] + ['</e2>'] + words_before[idx2:]
            nu_line = [['Other', ' '.join(e2_before_nu) + ' ' + sentence[i1:]]]
            nu_prediction, nu_confidence = predict_line(nu_line)
            if nu_confidence > max_confidence :
                max_confidence = nu_confidence
            
            window_size += 1
            
        preds.append([str(prediction), ' '.join(e2_before) + ' ' + sentence[i1:]])
        idx1 += window_size - 1
        window_size = window_size_start
    
    idx1 += 1

idx1 = 0
while idx1 < len(words_after):
    idx2 = idx1 + window_size
    e2_after = words_after[:idx1] + ['<e2>'] + words_after[idx1:idx2] + ['</e2>'] + words_after[idx2:]
    line = [['Other', sentence[:i2] + ' ' + ' '.join(e2_after)]]
    prediction, confidence = predict_line(line)
    
    if prediction != 0 :
        max_confidence = confidence
        nu_prediction = prediction
        nu_confidence = confidence
        e2_after_nu = e2_after
        print('Found ' + str(prediction))
        
        window_size += 1
        idx2 = idx1 + window_size
        while prediction == nu_prediction and idx2 < len(words_after) and max_confidence >= nu_confidence - 1 :
            e2_after = e2_after_nu
            
            e2_after_nu = words_after[:idx1] + ['<e2>'] + words_after[idx1:idx2] + ['</e2>'] + words_after[idx2:]
            nu_line = [['Other', sentence[:i2] + ' ' + ' '.join(e2_after_nu)]]
            nu_prediction, nu_confidence = predict_line(nu_line)
            print(nu_prediction)
            if nu_confidence > max_confidence :
                max_confidence = nu_confidence
            
            window_size += 1
            idx2 = idx1 + window_size
            
        preds.append([str(prediction), sentence[:i2] + ' ' + ' '.join(e2_after)])
        idx1 += window_size - 2
        window_size = window_size_start
    else :
        idx1 += 1
# lines.append(['HAS_FUNCTION', '<e1> Grab samplers </e1> are buckets or segments that <e2> drive into the sediment layer and enclose and retain a layer </e2> .'])


[[10.45941   -2.0900204 -1.6167957 -1.8023508 -1.9084367 -1.7309326
  -1.7495626]]
[[10.535376  -1.9328756 -1.6672069 -1.9021527 -1.7403682 -1.823944
  -1.8182349]]
[[10.551659  -2.0295384 -1.7098657 -1.9516482 -1.6418275 -1.88708
  -1.5329087]]
[[10.564585  -2.0932128 -1.9275554 -1.6084161 -1.656087  -2.032623
  -1.5767817]]
[[10.503424  -1.9429697 -1.8709061 -1.9238069 -1.4655601 -2.079242
  -1.521052 ]]
[[-0.26385963 -1.7008317  -2.9521842  -0.9481971  -0.40331048 -2.4300153
   7.942303  ]]
Found 6
[[10.493633  -2.1466563 -2.0746891 -1.593467  -1.5134028 -2.2493997
  -1.2057354]]
0
[[ 5.4637904 -3.012116  -3.4803064  1.7157595 -1.4539253 -3.0508163
   3.5833077]]
[[-0.15626672 -2.3901618  -1.9791044   9.245464   -2.7628362  -2.0311084
  -1.2988751 ]]
Found 3
[[-0.16115616 -2.3991754  -1.7899622   9.399419   -2.651478   -2.1623378
  -1.5740547 ]]
3
[[ 9.049711  -2.2166717 -2.334558   1.3186641 -2.0022838 -2.6791286
  -1.9191375]]
0
[[ 0.9141397 -1.7678968 -1.9254615 -0.6315382  7.201

In [127]:
preds

[['6',
  'Geer moraines or <e1> washboard moraines  </e1> are series <e2> of small </e2> and roughly parallel ridges of till that are ordinarily associated with lakes or former lakes .'],
 ['3',
  'Geer moraines or <e1> washboard moraines  </e1> are series of small and <e2> roughly parallel ridges </e2> of till that are ordinarily associated with lakes or former lakes .'],
 ['4',
  'Geer moraines or <e1> washboard moraines  </e1> are series of small and roughly parallel ridges <e2> of till </e2> that are ordinarily associated with lakes or former lakes .']]

In [29]:
examples = processor._create_examples(lines, 'train')

In [30]:
features = convert_examples_to_features(
    examples, conf['max_length'], tokenizer, add_sep_token=False
)

In [31]:
all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long, device=device)
all_attention_mask = torch.tensor([f.attention_mask for f in features], dtype=torch.long, device=device)
all_token_type_ids = torch.tensor([f.token_type_ids for f in features], dtype=torch.long, device=device)
all_e1_mask = torch.tensor([f.e1_mask for f in features], dtype=torch.long, device=device)  # add e1 mask
all_e2_mask = torch.tensor([f.e2_mask for f in features], dtype=torch.long, device=device)  # add e2 mask

all_label_ids = torch.tensor([f.label_id for f in features], dtype=torch.long, device=device)

In [32]:
model.eval();

In [33]:
# for i in range(len(all_input_ids))
with torch.no_grad():
    outputs = model(all_input_ids, all_attention_mask, all_token_type_ids, None, all_e1_mask, all_e2_mask)
    logits = outputs[0].detach().cpu().numpy()
    probs = softmax(logits, axis=1)

In [34]:
logits

array([[10.540789  , -1.9406198 , -1.4902645 , -1.8921092 , -1.5461885 ,
        -2.1296341 , -1.4779458 ],
       [10.175579  , -1.5989631 , -1.5731035 , -1.9601575 , -1.3176928 ,
        -2.2539062 , -1.5726503 ],
       [10.309549  , -2.1937    , -1.429167  , -1.6343846 , -1.6824181 ,
        -2.1121252 , -1.617209  ],
       [10.440431  , -1.7740446 , -1.6110647 , -1.7429495 , -1.7683027 ,
        -1.989427  , -1.9434831 ],
       [ 8.120712  ,  0.82375026, -1.4609393 , -1.7021438 , -2.2573965 ,
        -2.1428084 , -2.2404277 ],
       [-1.7056692 ,  9.201068  , -1.2181455 , -1.8262587 , -1.939174  ,
        -2.1097791 , -1.4957887 ],
       [-2.1209626 ,  9.431411  , -1.0019859 , -1.8649789 , -1.5667375 ,
        -2.036273  , -1.7351724 ],
       [ 8.027953  ,  0.6024179 , -1.5878698 , -1.6910765 , -2.134036  ,
        -1.9717289 , -2.0567887 ],
       [ 0.6234943 ,  7.865587  , -1.4714093 , -2.1363072 , -1.6687295 ,
        -2.0495079 , -1.5996628 ],
       [ 9.492211  , -0.3523

In [35]:
logits = outputs[0].detach().cpu().numpy()

In [36]:
logits

array([[10.540789  , -1.9406198 , -1.4902645 , -1.8921092 , -1.5461885 ,
        -2.1296341 , -1.4779458 ],
       [10.175579  , -1.5989631 , -1.5731035 , -1.9601575 , -1.3176928 ,
        -2.2539062 , -1.5726503 ],
       [10.309549  , -2.1937    , -1.429167  , -1.6343846 , -1.6824181 ,
        -2.1121252 , -1.617209  ],
       [10.440431  , -1.7740446 , -1.6110647 , -1.7429495 , -1.7683027 ,
        -1.989427  , -1.9434831 ],
       [ 8.120712  ,  0.82375026, -1.4609393 , -1.7021438 , -2.2573965 ,
        -2.1428084 , -2.2404277 ],
       [-1.7056692 ,  9.201068  , -1.2181455 , -1.8262587 , -1.939174  ,
        -2.1097791 , -1.4957887 ],
       [-2.1209626 ,  9.431411  , -1.0019859 , -1.8649789 , -1.5667375 ,
        -2.036273  , -1.7351724 ],
       [ 8.027953  ,  0.6024179 , -1.5878698 , -1.6910765 , -2.134036  ,
        -1.9717289 , -2.0567887 ],
       [ 0.6234943 ,  7.865587  , -1.4714093 , -2.1363072 , -1.6687295 ,
        -2.0495079 , -1.5996628 ],
       [ 9.492211  , -0.3523

In [37]:
logits[logits < 8] = 0

In [38]:
df = pd.DataFrame(logits, columns=processor.relation_labels)

In [39]:
df

Unnamed: 0,Other,HAS_CAUSE,HAS_LOCATION,HAS_FORM,COMPOSITION_MEDIUM,HAS_FUNCTION,HAS_SIZE
0,10.540789,0.0,0.0,0.0,0.0,0.0,0.0
1,10.175579,0.0,0.0,0.0,0.0,0.0,0.0
2,10.309549,0.0,0.0,0.0,0.0,0.0,0.0
3,10.440431,0.0,0.0,0.0,0.0,0.0,0.0
4,8.120712,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,9.201068,0.0,0.0,0.0,0.0,0.0
6,0.0,9.431411,0.0,0.0,0.0,0.0,0.0
7,8.027953,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,9.492211,0.0,0.0,0.0,0.0,0.0,0.0


In [40]:
for l in range(len(lines)):
    print(l, lines[l][1])

0 <e1> Eskers </e1> <e2> are the </e2> chief landform created by subglacial meltwater and form by the infilling of subglacial or englacial channels or by sedimentation in supraglacial channels .
1 <e1> Eskers </e1> are <e2> the chief </e2> landform created by subglacial meltwater and form by the infilling of subglacial or englacial channels or by sedimentation in supraglacial channels .
2 <e1> Eskers </e1> are the <e2> chief landform </e2> created by subglacial meltwater and form by the infilling of subglacial or englacial channels or by sedimentation in supraglacial channels .
3 <e1> Eskers </e1> are the chief <e2> landform created </e2> by subglacial meltwater and form by the infilling of subglacial or englacial channels or by sedimentation in supraglacial channels .
4 <e1> Eskers </e1> are the chief landform <e2> created by </e2> subglacial meltwater and form by the infilling of subglacial or englacial channels or by sedimentation in supraglacial channels .
5 <e1> Eskers </e1> are t

In [41]:
max_vals = np.max(logits, axis=1)
arg_max = np.argmax(logits, axis=1)

In [42]:
lines[10]

['Other',
 '<e1> Eskers </e1> are the chief landform created by subglacial meltwater and form <e2> by the </e2> infilling of subglacial or englacial channels or by sedimentation in supraglacial channels .']

In [43]:
processor.relation_labels

['Other',
 'HAS_CAUSE',
 'HAS_LOCATION',
 'HAS_FORM',
 'COMPOSITION_MEDIUM',
 'HAS_FUNCTION',
 'HAS_SIZE']

In [44]:
df = pd.read_csv('data/full_data_new_EN.csv')

In [122]:
df[df['Sentence'] ==79]

Unnamed: 0,Sentence,Word,category,hierarchical,non-hierarchical,non-hierarchical-definitor
1703,79,Geer,A.4_Other,DEFINIENDUM,,
1704,79,moraines,A.4_Other,DEFINIENDUM,,
1705,79,or,,,,
1706,79,washboard,A.4_Other,DEFINIENDUM,,
1707,79,moraines,A.4_Other,DEFINIENDUM,,
1708,79,are,,DEFINITOR,,
1709,79,series,,,,
1710,79,of,,,,
1711,79,small,,,HAS_SIZE,
1712,79,and,,,,


In [38]:
lines

[['Other',
  '<e2> Geer moraines </e2> or <e1> washboard moraines </e1> are series of small and roughly parallel ridges of till that are ordinarily associated with lakes or former lakes .'],
 ['Other',
  'Geer <e2> moraines or </e2> <e1> washboard moraines </e1> are series of small and roughly parallel ridges of till that are ordinarily associated with lakes or former lakes .'],
 ['Other',
  'Geer moraines or <e1> washboard moraines </e1> <e2> are series </e2> of small and roughly parallel ridges of till that are ordinarily associated with lakes or former lakes .'],
 ['Other',
  'Geer moraines or <e1> washboard moraines </e1> are <e2> series of </e2> small and roughly parallel ridges of till that are ordinarily associated with lakes or former lakes .'],
 ['Other',
  'Geer moraines or <e1> washboard moraines </e1> are series <e2> of small </e2> and roughly parallel ridges of till that are ordinarily associated with lakes or former lakes .'],
 ['Other',
  'Geer moraines or <e1> washboard

In [63]:
logits

array([[-3.3230584 , -0.28777394,  0.3915504 ,  2.6353245 ,  4.0684237 ,
        -2.2714093 , -2.2305045 ],
       [-3.61521   , -0.5822604 , -0.27176338,  2.2229083 ,  4.8831253 ,
        -1.4569333 , -2.4374123 ],
       [-1.7090881 , -1.5794371 , -1.728625  ,  8.812912  , -1.9842335 ,
        -1.9991543 ,  0.09666806],
       [-1.2743081 , -2.2612534 , -2.9260657 ,  3.9110909 , -1.0057275 ,
        -2.1319952 ,  5.9472904 ],
       [-0.54082996, -1.9248059 , -1.7650276 , -1.7264402 , -0.8863953 ,
        -1.4189417 ,  8.554747  ],
       [-0.60751706, -1.9159092 , -1.8468326 , -1.3841116 , -1.2140503 ,
        -1.4515196 ,  8.654597  ],
       [-1.0642627 , -2.164248  , -2.4575984 ,  1.93889   , -2.2918904 ,
        -1.38612   ,  7.6464434 ],
       [-1.6847495 , -1.7833356 , -1.3581494 ,  9.161529  , -2.1036088 ,
        -1.3656468 , -0.9450502 ],
       [-1.6853762 , -1.6389939 , -1.13269   ,  9.09769   , -2.0025904 ,
        -1.4591932 , -1.2642449 ],
       [-1.8404146 , -1.7514

In [64]:
logits = outputs[0].detach().cpu().numpy()

In [65]:
logits

array([[-3.3230584 , -0.28777394,  0.3915504 ,  2.6353245 ,  4.0684237 ,
        -2.2714093 , -2.2305045 ],
       [-3.61521   , -0.5822604 , -0.27176338,  2.2229083 ,  4.8831253 ,
        -1.4569333 , -2.4374123 ],
       [-1.7090881 , -1.5794371 , -1.728625  ,  8.812912  , -1.9842335 ,
        -1.9991543 ,  0.09666806],
       [-1.2743081 , -2.2612534 , -2.9260657 ,  3.9110909 , -1.0057275 ,
        -2.1319952 ,  5.9472904 ],
       [-0.54082996, -1.9248059 , -1.7650276 , -1.7264402 , -0.8863953 ,
        -1.4189417 ,  8.554747  ],
       [-0.60751706, -1.9159092 , -1.8468326 , -1.3841116 , -1.2140503 ,
        -1.4515196 ,  8.654597  ],
       [-1.0642627 , -2.164248  , -2.4575984 ,  1.93889   , -2.2918904 ,
        -1.38612   ,  7.6464434 ],
       [-1.6847495 , -1.7833356 , -1.3581494 ,  9.161529  , -2.1036088 ,
        -1.3656468 , -0.9450502 ],
       [-1.6853762 , -1.6389939 , -1.13269   ,  9.09769   , -2.0025904 ,
        -1.4591932 , -1.2642449 ],
       [-1.8404146 , -1.7514

In [66]:
logits[logits < 8] = 0

In [67]:
df = pd.DataFrame(logits, columns=processor.relation_labels)

In [68]:
df

Unnamed: 0,Other,HAS_CAUSE,HAS_LOCATION,HAS_FORM,COMPOSITION_MEDIUM,HAS_FUNCTION,HAS_SIZE
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,8.812912,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,8.554747
5,0.0,0.0,0.0,0.0,0.0,0.0,8.654597
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,9.161529,0.0,0.0,0.0
8,0.0,0.0,0.0,9.09769,0.0,0.0,0.0
9,0.0,0.0,0.0,9.04278,0.0,0.0,0.0


In [69]:
for l in range(len(lines)):
    print(l, lines[l][1])

0 <e2> Geer moraines </e2> or <e1> washboard moraines </e1> are series of small and roughly parallel ridges of till that are ordinarily associated with lakes or former lakes .
1 Geer <e2> moraines or </e2> <e1> washboard moraines </e1> are series of small and roughly parallel ridges of till that are ordinarily associated with lakes or former lakes .
2 Geer moraines or <e1> washboard moraines </e1> <e2> are series </e2> of small and roughly parallel ridges of till that are ordinarily associated with lakes or former lakes .
3 Geer moraines or <e1> washboard moraines </e1> are <e2> series of </e2> small and roughly parallel ridges of till that are ordinarily associated with lakes or former lakes .
4 Geer moraines or <e1> washboard moraines </e1> are series <e2> of small </e2> and roughly parallel ridges of till that are ordinarily associated with lakes or former lakes .
5 Geer moraines or <e1> washboard moraines </e1> are series of <e2> small and </e2> roughly parallel ridges of till that

In [51]:
max_vals = np.max(logits, axis=1)
arg_max = np.argmax(logits, axis=1)

In [63]:
lines[10]

['Other',
 'Geer moraines or <e1> washboard moraines </e1> are series of small and roughly parallel ridges of till that are <e2> ordinarily associated with </e2> lakes or former lakes .']

In [42]:
processor.relation_labels

['Other',
 'HAS_CAUSE',
 'HAS_LOCATION',
 'HAS_FORM',
 'COMPOSITION_MEDIUM',
 'HAS_FUNCTION',
 'HAS_SIZE']

In [56]:
df = pd.read_csv('data/full_data_new_EN.csv')

In [61]:
df[df['Sentence'] ==79]

Unnamed: 0,Sentence,Word,category,hierarchical,non-hierarchical,non-hierarchical-definitor
1703,79,Geer,A.4_Other,DEFINIENDUM,,
1704,79,moraines,A.4_Other,DEFINIENDUM,,
1705,79,or,,,,
1706,79,washboard,A.4_Other,DEFINIENDUM,,
1707,79,moraines,A.4_Other,DEFINIENDUM,,
1708,79,are,,DEFINITOR,,
1709,79,series,,,,
1710,79,of,,,,
1711,79,small,,,HAS_SIZE,
1712,79,and,,,,


In [19]:
lines

[['Other',
  '<e2> Geer moraines or </e2> <e1> washboard moraines </e1> are series of small and roughly parallel ridges of till that are ordinarily associated with lakes or former lakes .'],
 ['Other',
  'Geer moraines or <e1> washboard moraines </e1> are series of <e2> small and roughly </e2> parallel ridges of till that are ordinarily associated with lakes or former lakes .'],
 ['Other',
  'Geer moraines or <e1> washboard moraines </e1> are series of small <e2> and roughly parallel </e2> ridges of till that are ordinarily associated with lakes or former lakes .'],
 ['Other',
  'Geer moraines or <e1> washboard moraines </e1> are series of small and <e2> roughly parallel ridges </e2> of till that are ordinarily associated with lakes or former lakes .'],
 ['Other',
  'Geer moraines or <e1> washboard moraines </e1> are series of small and roughly <e2> parallel ridges of </e2> till that are ordinarily associated with lakes or former lakes .'],
 ['Other',
  'Geer moraines or <e1> washboard