### Model Training Setup

In [2]:
# Reload edited modules automatically before each cell runs.
%load_ext autoreload
%autoreload 2
# Load the jupyter_black extension (code-cell formatting).
%load_ext jupyter_black

### Load the model

In [184]:
from transformers import (
    BartForConditionalGeneration,
    BartTokenizer,
    DataCollatorForSeq2Seq,
)

import torch

# Prefer the GPU when one is available; otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Fine-tuned checkpoint directory and FB15k-237 data directory (relative paths).
MODEL = "../model/bart_base_colab_DECODE_ONLY_LABEL/checkpoint_final"
PATH_FB15k237 = "../data/datasets_knowledge_embedding/FB15k-237/"
# Load model and tokenizer.
# The model is placed with `.to(device)` only: an unconditional `.cuda()` call
# raises on CPU-only machines and would defeat the fallback chosen above.
model = BartForConditionalGeneration.from_pretrained(MODEL).float().to(device)
tokenizer = BartTokenizer.from_pretrained(MODEL)

### Load Data

In [4]:
from torch.utils.data import Dataset
import copy
import random
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader
from transformers import DataCollatorForLanguageModeling, DataCollatorWithPadding
from tqdm.auto import tqdm


# Seed Python's built-in `random` module only; torch and numpy RNGs are not seeded here.
random.seed(42)


class DatasetKGC(Dataset):
    """Map-style dataset over pre-tokenised ``input_ids`` / ``labels`` rows.

    Args:
        data: Mapping with ``"input_ids"`` and ``"labels"`` entries. The
            ``"input_ids"`` entry must expose ``.shape``; indexing either entry
            must return an object with ``.squeeze`` (e.g. 2-D ``torch.Tensor``).

    Raises:
        KeyError: If either required key is missing from ``data``.
    """

    def __init__(self, data):
        # Look both entries up eagerly so a missing key fails at construction
        # time. The mapping is only read, never written, so read-only mappings
        # are accepted and the caller's object is left untouched.
        input_ids = data["input_ids"]
        _ = data["labels"]

        self.data = data
        self.num_rows = input_ids.shape[0]

    def __len__(self):
        """Return the number of rows in ``input_ids``."""
        return self.num_rows

    def __getitem__(self, idx):
        """Return ``{"input_ids", "labels"}`` for ``idx`` with dim 0 squeezed."""
        return {
            "input_ids": self.data["input_ids"][idx].squeeze(0),
            "labels": self.data["labels"][idx].squeeze(0),
        }


def encode_data(data):
    """Tokenise the ``data_input`` / ``data_label`` columns and batch them.

    Both columns are encoded with the module-level ``tokenizer``, padded and
    truncated to ``MAX_LENGTH``. One example per row is then handed to a
    ``DataCollatorWithPadding``.

    Args:
        data: Object indexable by ``"data_input"`` and ``"data_label"`` and
            supporting ``len()`` (presumably a pandas DataFrame; confirm
            against the caller).

    Returns:
        The collator output for the list of ``{"input_ids", "labels"}`` examples.
    """
    # Encode the input and label strings as BART token sequences; both calls
    # share the same tokenizer settings.
    tokenizer_options = dict(
        padding="max_length",
        truncation=True,
        return_tensors="pt",
        max_length=MAX_LENGTH,
        add_special_tokens=True,
        return_attention_mask=False,
    )
    input_token_ids = tokenizer(list(data["data_input"]), **tokenizer_options)[
        "input_ids"
    ]
    label_token_ids = tokenizer(list(data["data_label"]), **tokenizer_options)[
        "input_ids"
    ]

    # Build one example per row.
    examples = [
        {"input_ids": input_token_ids[row], "labels": label_token_ids[row]}
        for row in range(len(data))
    ]

    # Batch the examples with a padding collator.
    collator = DataCollatorWithPadding(
        tokenizer=tokenizer, padding="max_length", max_length=MAX_LENGTH
    )
    return collator(examples)


def train_valid_split(data):
    """Split ``data`` 80/20 with a fixed seed and encode both parts.

    Returns:
        Tuple ``(encoded_train, encoded_valid)``, each produced by ``encode_data``.
    """
    train_part, valid_part = train_test_split(data, test_size=0.2, random_state=42)
    encoded_train = encode_data(train_part)
    encoded_valid = encode_data(valid_part)
    return encoded_train, encoded_valid


def generate_train_valid_dataloader(data):
    """Build un-shuffled train/validation ``DataLoader`` objects from ``data``.

    The data is split and encoded by ``train_valid_split``; each part is wrapped
    in a ``DatasetKGC``. The batch size is read from the module-level
    ``BATCH_SIZE``.

    Returns:
        Tuple ``(train_loader, valid_loader)``.
    """
    encoded_train, encoded_valid = train_valid_split(data)
    loader_options = {"batch_size": BATCH_SIZE, "shuffle": False}
    return (
        DataLoader(DatasetKGC(encoded_train), **loader_options),
        DataLoader(DatasetKGC(encoded_valid), **loader_options),
    )


def generate_train_valid_dataset(data):
    """Split and encode ``data``, returning the two parts as ``DatasetKGC`` objects.

    Returns:
        Tuple ``(train_dataset, valid_dataset)``.
    """
    encoded_train, encoded_valid = train_valid_split(data)
    return DatasetKGC(encoded_train), DatasetKGC(encoded_valid)


def _decode(tokens, tokenizer=tokenizer, batch=True):
    """Decode token ids to text with special tokens skipped.

    Args:
        tokens: Token ids to decode.
        tokenizer: Tokenizer providing ``decode`` / ``batch_decode``.
        batch: When true, use ``batch_decode``; otherwise use ``decode``.

    Returns:
        Whatever the selected tokenizer method returns.
    """
    decode_fn = tokenizer.batch_decode if batch else tokenizer.decode
    return decode_fn(tokens, skip_special_tokens=True)


def _decode_error(tokens_i, tokens_j, tokenizer=tokenizer):
    """Count position-wise mismatches between two token-id sequences.

    Both sequences are decoded with ``_decode`` in its default batch mode
    (presumably one string per token id for a 1-D sequence; confirm), and the
    decoded entries are compared position by position. A difference in
    sequence length is added to the error count. Lengths are taken from the raw
    id sequences, so any padding ids they contain count toward that difference.

    Args:
        tokens_i: First sequence of token ids (must support ``len()``).
        tokens_j: Second sequence of token ids (must support ``len()``).
        tokenizer: Tokenizer used for decoding.

    Returns:
        Tuple ``(error, total_tokens, error_rate)`` where ``total_tokens`` is
        the longer of the two lengths. ``error_rate`` is ``0.0`` when both
        sequences are empty.
    """
    # Forward the caller's tokenizer rather than silently using the default.
    text_i = _decode(tokens_i, tokenizer=tokenizer)
    text_j = _decode(tokens_j, tokenizer=tokenizer)

    len_i = len(tokens_i)
    len_j = len(tokens_j)
    total_tokens = max(len_i, len_j)

    # Nothing to compare: avoid dividing by zero.
    if total_tokens == 0:
        return (0, 0, 0.0)

    # Every position present in only one sequence counts as an error.
    error = abs(len_i - len_j)

    # Compare the overlapping positions.
    overlap = min(len_i, len_j)
    error += sum(1 for pos in range(overlap) if text_i[pos] != text_j[pos])

    return (error, total_tokens, error / total_tokens)


def _decode_error_epoch(model, valid_dataset, debug=False):
    """Accumulate ``_decode_error`` counts over every batch of ``valid_dataset``.

    Args:
        model: Model exposing ``generate``.
        valid_dataset: Iterable of batches. A batch is either a mapping with
            ``"input_ids"`` and ``"labels"`` entries (what a ``DataLoader`` over
            ``DatasetKGC`` yields) or an ``(input, label)`` pair.
        debug: When true, print the accumulated totals.

    Returns:
        Tuple ``(error_tokens, total_tokens, error_rate)``; ``error_rate`` is
        ``0.0`` when no tokens were analysed.
    """
    from collections.abc import Mapping

    error_tokens = 0
    total_tokens = 0

    with torch.no_grad():
        for batch in tqdm(valid_dataset, desc="Analyzing decoder error"):
            # Unpacking a mapping would yield its keys, so read the two
            # entries explicitly in that case.
            if isinstance(batch, Mapping):
                _input, label = batch["input_ids"], batch["labels"]
            else:
                _input, label = batch

            model_output = model.generate(_input.to(device), max_length=MAX_LENGTH)

            for i in range(len(model_output)):
                errors, tokens, _ = _decode_error(label[i], model_output[i].to("cpu"))
                error_tokens += errors
                total_tokens += tokens

    # Guard against an empty dataset.
    error_rate = error_tokens / total_tokens if total_tokens else 0.0

    if debug:
        print("Total tokens analyzed: %d" % total_tokens)
        print("Total erroneous tokens predicted: %d" % error_tokens)
        print("Percentage of error: %.3f%%" % (error_rate * 100))

    return error_tokens, total_tokens, error_rate

In [5]:
# Load the serialized train/validation datasets from the FB15k-237 data directory.
# NOTE(review): torch.load unpickles the file contents; only load files from a trusted source.
train_ds, valid_ds = (
    torch.load(PATH_FB15k237 + "/procesed_data_DECODE_ONLY_LABEL" + "_train_ds.pth"),
    torch.load(PATH_FB15k237 + "/procesed_data_DECODE_ONLY_LABEL" + "_valid_ds.pth"),
)

### Evaluation

In [6]:
# Maximum sequence length used for tokenizer padding/truncation and for generation.
MAX_LENGTH = 128

In [7]:
valid_ds = valid_ds[0]

In [8]:
with torch.no_grad():
    print("Sample input: ")
    print(tokenizer.decode(valid_ds["input_ids"]).replace("<pad>", ""), "\n")

    print("Expected output: ")
    print(tokenizer.decode(valid_ds["labels"], skip_special_tokens=True), "\n")

    print("Model Output: ")
    print(
        tokenizer.decode(
            model.generate(
                valid_ds["input_ids"].to(device).reshape(1, -1), max_length=MAX_LENGTH
            )[0]
        )
    )

Sample input: 
<s>Geraldine Chaplin has place of birth of Santa Monica. Jerry Siegel has place of birth of Cleveland. John Michael Higgins has place of birth of<mask>.</s> 

Expected output: 
Boston 

Model Output: 
</s><s>Los Angeles</s>


### Hits

This metric measures the cosine similarity between the mean-pooled encoder representations of the label text and of the model's generated output text.

In [117]:
# Pick one validation example to walk through the similarity computation by hand.
data = valid_ds[3]

In [156]:
# Decode the reference label of the selected example back to plain text.
text_label = tokenizer.decode(data["labels"], skip_special_tokens=True)
print(text_label)

# Re-encode the label text, padded to 128 tokens.
label_token_ids = tokenizer.encode(
    text_label, padding="max_length", max_length=128, return_tensors="pt"
)
# Run the model and mean-pool the encoder's last hidden state over dim 1.
embedding_label = model(
    label_token_ids.reshape(1, -1).to(device)
).encoder_last_hidden_state.mean(dim=1)

Jackson


In [171]:
# Generate the model's prediction for the selected example and decode it.
prompt_ids = data["input_ids"].to(device).reshape(1, -1)
token_ids_output = model.generate(prompt_ids, max_length=MAX_LENGTH)[0]
text_output = tokenizer.decode(token_ids_output, skip_special_tokens=True)
print(text_output)

# Re-encode the generated text, padded to 128 tokens.
output_token_ids = tokenizer.encode(
    text_output, padding="max_length", max_length=128, return_tensors="pt"
)
# Run the model and mean-pool the encoder's last hidden state over dim 1.
embeddings_output = model(
    output_token_ids.to(device).reshape(1, -1)
).encoder_last_hidden_state.mean(dim=1)

Jackson


In [143]:
import pandas as pd


def cosine_similarity(embd_i, embd_j):
    """Return the cosine similarity of two tensors along dimension 1.

    Args:
        embd_i: First tensor.
        embd_j: Second tensor, broadcastable against ``embd_i``.

    Returns:
        Tensor of similarities computed with ``dim=1`` and ``eps=1e-6``.
    """
    return torch.nn.functional.cosine_similarity(embd_i, embd_j, dim=1, eps=1e-6)


def _mean_encoder_embedding(model, tokenizer, text, max_length):
    """Encode ``text`` and mean-pool the encoder's last hidden state over dim 1."""
    token_ids = tokenizer.encode(
        text, padding="max_length", max_length=max_length, return_tensors="pt"
    )
    encoder_state = model(token_ids.reshape(1, -1).to(device)).encoder_last_hidden_state
    # NOTE(review): no attention mask is passed, so padded positions are part
    # of the mean.
    return torch.mean(encoder_state, dim=1)


def hits_sim(model, tokenizer, valid_ds: dict):
    """Score each example by cosine similarity between label and generated text.

    For every row, the label ids are decoded to text, the model generates an
    output for the row's ``input_ids``, and both texts are re-encoded and
    embedded with ``_mean_encoder_embedding``. Texts are padded to
    ``MAX_LENGTH``, the same limit used for generation.

    Args:
        model: Model exposing ``generate`` and returning
            ``encoder_last_hidden_state`` from a forward pass.
        tokenizer: Tokenizer exposing ``encode`` and ``decode``.
        valid_ds: Mapping with ``"input_ids"`` and ``"labels"`` entries that can
            be indexed by row; ``"input_ids"`` must expose ``.shape``.

    Returns:
        ``pandas.DataFrame`` with columns ``label``, ``output`` and
        ``similarity``, one row per example.
    """
    records = []
    data_size = valid_ds["input_ids"].shape[0]

    # Evaluation only: skip autograd bookkeeping for the forward passes.
    with torch.no_grad():
        for i in range(data_size):
            text_label = tokenizer.decode(
                valid_ds["labels"][i], skip_special_tokens=True
            )
            embedding_label = _mean_encoder_embedding(
                model, tokenizer, text_label, MAX_LENGTH
            )

            token_ids_output = model.generate(
                valid_ds["input_ids"][i].to(device).reshape(1, -1),
                max_length=MAX_LENGTH,
            )[0]
            text_output = tokenizer.decode(token_ids_output, skip_special_tokens=True)
            embeddings_output = _mean_encoder_embedding(
                model, tokenizer, text_output, MAX_LENGTH
            )

            similarity = cosine_similarity(embedding_label, embeddings_output)[
                0
            ].item()
            records.append((text_label, text_output, similarity))

    return pd.DataFrame(records, columns=["label", "output", "similarity"])

In [144]:
# Number of validation examples to score.
validation_batch = 50
# Compute label/output similarities with `hits_sim` on the first `validation_batch` examples.
results = hits_sim(model, tokenizer, valid_ds[:validation_batch])

In [233]:
# Decode the first validation input with special tokens kept, to inspect the raw prompt text.
_t = tokenizer.decode(valid_ds[0]["input_ids"], skip_special_tokens=False)

### Original Hits Metric

In [280]:
def training_data_to_text(input_ids, tokenizer=tokenizer):
    """Recover the masked prompt text from an encoded example.

    The ids are decoded with special tokens kept. The result is the text from
    just after a leading BOS token (when present) up to and including the mask
    token plus the single character that follows it.

    Args:
        input_ids: Token ids of one example.
        tokenizer: Tokenizer exposing ``decode``, ``mask_token`` and ``bos_token``.

    Returns:
        The prompt substring described above.

    Raises:
        ValueError: If the decoded text contains no mask token.
    """
    text = tokenizer.decode(input_ids, skip_special_tokens=False)

    mask_token = tokenizer.mask_token
    mask_start = text.find(mask_token)
    if mask_start == -1:
        # `find` returns -1 here; slicing with it would return a meaningless
        # fragment instead of a prompt.
        raise ValueError("decoded text does not contain the mask token")

    # Skip the leading BOS token, using its real length instead of a literal 3.
    bos_token = tokenizer.bos_token
    start = len(bos_token) if bos_token and text.startswith(bos_token) else 0

    # Keep the mask token and the one character after it (the trailing ".").
    end = mask_start + len(mask_token) + 1

    return text[start:end]


def generate_beam_search(model, tokenizer, text, beam_size=5, max_length=128):
    """Generate ``beam_size`` candidate completions for ``text`` with beam search.

    Args:
        model: Model exposing ``generate``.
        tokenizer: Tokenizer exposing ``encode`` and ``decode``.
        text: Prompt text, e.g. ``"My name is <mask>."``.
        beam_size: Number of beams and of returned sequences.
        max_length: Padding length for the prompt and generation length limit.

    Returns:
        List of decoded candidate strings with special tokens skipped.
    """
    encoded_prompt = tokenizer.encode(
        text, padding="max_length", max_length=max_length, return_tensors="pt"
    )
    generation_options = dict(
        max_length=max_length,
        num_beams=beam_size,
        no_repeat_ngram_size=2,
        num_return_sequences=beam_size,
        early_stopping=True,
    )
    candidates = model.generate(encoded_prompt.to(device), **generation_options)

    decoded = []
    for candidate in candidates:
        decoded.append(tokenizer.decode(candidate, skip_special_tokens=True))
    return decoded


def compute_hits(model, tokenizer, valid_ds, beam_size=5, max_length=128, debug=False):
    """Count examples whose label appears among the beam-search candidates.

    For each row, the prompt is rebuilt with ``training_data_to_text``, the
    label is decoded, and ``generate_beam_search`` produces the candidates. A
    hit is a case-insensitive exact match between the label and any candidate.

    Args:
        model: Model passed through to ``generate_beam_search``.
        tokenizer: Tokenizer used for decoding and generation.
        valid_ds: Mapping with row-indexable ``"input_ids"`` and ``"labels"``.
        beam_size: Number of candidates generated per example.
        max_length: Length limit passed to ``generate_beam_search``.
        debug: When true, print prompt, label and candidates for every row.

    Returns:
        Tuple ``(frame, hits)``: a DataFrame with columns ``text``, ``label``
        and ``output_list``, and the number of hits.
    """
    rows = []
    hit_count = 0

    for i in range(valid_ds["input_ids"].shape[0]):
        prompt = training_data_to_text(valid_ds["input_ids"][i], tokenizer=tokenizer)
        label = tokenizer.decode(valid_ds["labels"][i], skip_special_tokens=True)
        candidates = generate_beam_search(
            model=model,
            tokenizer=tokenizer,
            text=prompt,
            beam_size=beam_size,
            max_length=max_length,
        )

        # Case-insensitive membership test.
        lowered_candidates = [candidate.lower() for candidate in candidates]
        if label.lower() in lowered_candidates:
            hit_count += 1

        rows.append((prompt, label, candidates))

        if debug:
            print("Text: ", prompt)
            print("Label: ", label)
            print("Output: ", candidates)
            print()

    frame = pd.DataFrame(rows, columns=["text", "label", "output_list"])
    return frame, hit_count

In [283]:
# Results beam size 5
# `hits` receives the count of examples whose label matched a beam candidate;
# `results` receives the per-example DataFrame returned by compute_hits.
batch = 1000 # Max -> len(valid_ds)
results, hits = compute_hits(model, tokenizer, valid_ds[:batch], beam_size=5, debug=False)

In [None]:
# Display the hit count returned by compute_hits.
hits

25

### testes bart

In [182]:
from transformers import BartTokenizer, BartForConditionalGeneration

# Probe prompt with a single masked position.
TXT = "Michelle Obama is wife of <mask>;"

encoded_prompt = tokenizer([TXT], return_tensors="pt")
input_ids = encoded_prompt["input_ids"]
logits = model(input_ids.to(device)).logits

# Index of the mask token in the prompt (`.item()` requires exactly one match).
masked_index = torch.nonzero(input_ids[0] == tokenizer.mask_token_id).item()
# Distribution over the vocabulary at the masked position, then its top 10 ids.
probs = torch.softmax(logits[0, masked_index], dim=0)
values, predictions = torch.topk(probs, 10)

tokenizer.decode(predictions).split()

['President',
 'Barack',
 'the',
 'president',
 'former',
 'first',
 'Obama',
 'First',
 'Donald',
 'Bill']

In [187]:
from transformers import BartTokenizer, BartForConditionalGeneration

import itertools

# Probe prompt with a single masked position.
TXT = "Michelle Obama is wife of <mask>"

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

input_ids = tokenizer([TXT], return_tensors="pt")["input_ids"]
logits = model(input_ids.to(device)).logits

# Index of the mask token in the prompt (`.item()` requires exactly one match).
masked_index = (input_ids[0] == tokenizer.mask_token_id).nonzero().item()
probs = logits[0, masked_index].softmax(dim=0)
values, predictions = probs.topk(5)

predicted_tokens = tokenizer.batch_decode(predictions)

suggested_words = tokenizer.decode(predictions).split()

# Build every unordered pair of the first five suggested words (10 pairs when
# five words are available). Slicing instead of indexing 0..4 avoids an
# IndexError when decoding yields fewer than five words.
combinations = list(itertools.combinations(suggested_words[:5], 2))

print(combinations)

[('President', 'Barack'), ('President', 'the'), ('President', 'president'), ('President', 'former'), ('Barack', 'the'), ('Barack', 'president'), ('Barack', 'former'), ('the', 'president'), ('the', 'former'), ('president', 'former')]


In [212]:
from transformers import BartTokenizer, BartForConditionalGeneration, BartConfig

TXT = "Piano is a type of <mask>."

# Beam-search hyperparameters.
num_beams = 5
num_tokens = 10

# Tokenize the prompt.
input_ids = tokenizer([TXT], return_tensors="pt")["input_ids"]

# Generate fill-ins with beam search, returning one sequence per beam.
generation_kwargs = dict(
    max_length=input_ids.shape[-1] + num_tokens,
    num_beams=num_beams,
    num_return_sequences=num_beams,
    pad_token_id=tokenizer.pad_token_id,
    eos_token_id=tokenizer.eos_token_id,
    do_sample=False,
)
output = model.generate(input_ids=input_ids.to(device), **generation_kwargs)

# Decode the generated sequences and show them.
decoded_outputs = tokenizer.batch_decode(output, skip_special_tokens=True)
print(decoded_outputs)

['piano', 'percussion instrument', 'guitar', 'drum', 'drum kit']
