In [1]:
#REBEL environment details:
!python --version
import torch
print('Pythorch version: ', torch.__version__)
import transformers
print('Transformers version: ', transformers.__version__)
import json
import numpy as np

Python 3.10.9


Pythorch version:  2.0.1
Transformers version:  4.30.1


In [2]:
# It is a post-processing function to shape the triples from the REBEL output
def extract_triples(text):
    triplets = []
    relation, subject, relation, object_ = '', '', '', ''
    text = text.strip()
    current = 'x'
    for token in text.replace("<s>", "").replace("<pad>", "").replace("</s>", "").split():
        if token == "<triplet>":
            current = 't'
            if relation != '':
                triplets.append((subject.strip(), relation.strip(), object_.strip()))
                relation = ''
            subject = ''
        elif token == "<subj>":
            current = 's'
            if relation != '':
                triplets.append((subject.strip(), relation.strip(), object_.strip()))
            object_ = ''
        elif token == "<obj>":
            current = 'o'
            relation = ''
        else:
            if current == 't':
                subject += ' ' + token
            elif current == 's':
                object_ += ' ' + token
            elif current == 'o':
                relation += ' ' + token
    if subject != '' and relation != '' and object_ != '':
        triplets.append((subject.strip(), relation.strip(), object_.strip()))
    return triplets

In [3]:
# we call the tokenizer and the model from the HuggingFace library
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("Babelscape/rebel-large")
model = AutoModelForSeq2SeqLM.from_pretrained("Babelscape/rebel-large").to("cuda")

In [4]:
# we set the generation parameters for the model
gen_kwargs = {
    "max_length": 1024,
    "length_penalty": 0,
    "num_beams": 10, # 10 beams is NOT the default value but we opted for it to get more diverse results
    "num_return_sequences": 10, # 10 sequences is NOT the default value but we opted for it to get long tail triple extraction
    "return_dict_in_generate": True, 
    "output_scores": True
}

In [5]:
sentence = "The first president of the United States was George Washington."

In [6]:
inputs = tokenizer(sentence, max_length=1024, padding=True, truncation=True, return_tensors = 'pt')

In [7]:
outputs = model.generate(
                            inputs["input_ids"].to('cuda'),
                            attention_mask=inputs["attention_mask"].to('cuda'),
                            **gen_kwargs,
                            )

In [8]:
transition_scores = model.compute_transition_scores(
    outputs.sequences, outputs.scores, outputs.beam_indices, normalize_logits=False
)

In [9]:
input_length = 1 if model.config.is_encoder_decoder else inputs.input_ids.shape[1]

In [10]:
output_length = input_length + np.sum(transition_scores.cpu().numpy() < 0, axis=1)

In [11]:
length_penalty = model.generation_config.length_penalty

In [12]:
reconstructed_scores = transition_scores.cpu().sum(axis=1) / (output_length**length_penalty)

In [13]:
print(reconstructed_scores)

tensor([-0.0801, -0.0517, -0.2450, -0.3481, -0.2088, -0.3183, -0.2210, -0.3477,
        -0.2159, -0.2298], dtype=torch.float64)


In [14]:
for s in reconstructed_scores:
    print(s.item())
    print(np.exp(s.item()))
    

-0.08008399109045665
0.9230388160940569
-0.0517070356168245
0.9496070271323342
-0.2450245710519644
0.782685306604258
-0.34805148298090155
0.7060625250751662
-0.2088111952731484
0.8115484453621502
-0.31829203092134917
0.7273903369246306
-0.22099416255950927
0.8017213602772665
-0.3477042638338529
0.7063077260696624
-0.21589133853004092
0.8058228590127368
-0.22976818084716796
0.7947178119607781


In [15]:
for seq, seq_score, prob in zip(outputs.sequences, reconstructed_scores, np.exp(reconstructed_scores)):
    print(f'Extracted triples: {extract_triples(tokenizer.decode(seq, skip_special_tokens=False))}')
    print(f'Log probability: {seq_score}')
    print(f'Probability: {prob}')
    print()

Extracted triples: [('George Washington', 'position held', 'president of the United States')]
Log probability: -0.08008399109045665
Probability: 0.9230388160940569

Extracted triples: [('president of the United States', 'officeholder', 'George Washington'), ('George Washington', 'position held', 'president of the United States')]
Log probability: -0.0517070356168245
Probability: 0.9496070271323342

Extracted triples: [('president of the United States', 'officeholder', 'George Washington')]
Log probability: -0.2450245710519644
Probability: 0.782685306604258

Extracted triples: [('United States', 'founded by', 'George Washington')]
Log probability: -0.34805148298090155
Probability: 0.7060625250751662

Extracted triples: [('first president of the United States', 'officeholder', 'George Washington'), ('George Washington', 'position held', 'president of the United States')]
Log probability: -0.2088111952731484
Probability: 0.8115484453621502

Extracted triples: [('first president of the Uni

In [16]:
generated_tokens = outputs.sequences[:, input_length:]
for tok, score in zip(generated_tokens[0], transition_scores[0]):
    # | token | token string | logits | probability
    print(f"| {tok:5d} | {tokenizer.decode(tok):8s} | {score.cpu().numpy():.3f} | {np.exp(score.cpu().numpy()):.2%}")

| 50267 | <triplet> | -0.000 | 100.00%
|  1655 |  George  | -0.950 | 38.69%
|   663 |  Washington | -0.000 | 100.00%
|  1437 |          | -0.000 | 99.99%
| 50266 | <subj>   | 0.000 | 100.00%
|   394 |  president | -0.004 | 99.59%
|     9 |  of      | -0.006 | 99.37%
|     5 |  the     | -0.000 | 100.00%
|   315 |  United  | -0.000 | 100.00%
|   532 |  States  | 0.000 | 100.00%
|  1437 |          | -0.000 | 100.00%
| 50265 | <obj>    | 0.000 | 100.00%
|   737 |  position | -0.000 | 99.97%
|   547 |  held    | 0.000 | 100.00%
|     2 | </s>     | -0.000 | 99.95%
|     1 | <pad>    | 0.000 | 100.00%
|     1 | <pad>    | 0.000 | 100.00%
|     1 | <pad>    | 0.000 | 100.00%
|     1 | <pad>    | 0.000 | 100.00%
|     1 | <pad>    | 0.000 | 100.00%
|     1 | <pad>    | 0.000 | 100.00%
|     1 | <pad>    | 0.000 | 100.00%
|     1 | <pad>    | 0.000 | 100.00%
|     1 | <pad>    | 0.000 | 100.00%
|     1 | <pad>    | 0.000 | 100.00%
|     1 | <pad>    | 0.000 | 100.00%
|     1 | <pad>    | 0.000

In [17]:
print(np.allclose(outputs.sequences_scores.cpu(), reconstructed_scores))

False
