In [1]:
%load_ext autoreload
%autoreload 2
%load_ext jupyter_black

In [2]:
import os

# Move to the project root so later cells can import from `src/` and read the
# relative `data/` and `models/` paths. The guard keeps this cell idempotent:
# without it, every re-run would climb one directory higher.
if not os.path.isdir("src"):
    os.chdir("../")

In [3]:
### SETTINGS ###

# Processed-dataset directories, keyed by a short dataset tag.
DATASETS = {
    "FB15k-237-DECODE-ONLY-LABEL": "data/data_processed/FB15k-237/decode_only_label/",
    "ALL-DATA-DECODE-ONLY-LABEL": "data/data_processed/FB15k_FB15k237_WN18_WN18RR/",
}

# Base checkpoints, keyed by a short model tag.
MODELS = {
    "bart-small": "lucadiliello/bart-small",
    "bart-base": "facebook/bart-base",
    "bart-large": "facebook/bart-large",
}

# Active dataset / model selection and the paths derived from it.
DATASET = "ALL-DATA-DECODE-ONLY-LABEL"
MODEL = "bart-base"
MODEL_NAME = f"{MODEL}_{DATASET}"
MODEL_PATH = f"models/{MODEL_NAME}/hf_trainer/trained_model/"
MAX_LENGTH = 50

# Training arguments; only the two batch sizes are read by the cells in this notebook.
# NOTE(review): "model/logs" sits outside the "models/" tree used by output_dir —
# confirm this is intended.
params = dict(
    # Dir
    output_dir=f"models/{MODEL_NAME}/",
    # Batch
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    # Learning rate
    learning_rate=5e-5,
    seed=42,
    # Epochs
    num_train_epochs=50,
    # Logging
    logging_dir="model/logs",
    logging_strategy="epoch",
    logging_steps=10,
    # Evaluation
    evaluation_strategy="epoch",
    eval_steps=1,
    # Checkpoint
    save_strategy="epoch",
    save_steps=2,
    save_total_limit=2,
    ddp_find_unused_parameters=False,
    warmup_steps=2,
)

### Load the model

In [4]:
from transformers import (
    BartForConditionalGeneration,
    BartTokenizer,
)

import torch

# Prefer the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the fine-tuned weights as float32 and move them to `device`.
# There is deliberately no unconditional `.cuda()` here: it raises on CPU-only
# machines and would defeat the fallback above.
model = BartForConditionalGeneration.from_pretrained(MODEL_PATH).float().to(device)
# The tokenizer is taken from the base checkpoint listed in MODELS, not from MODEL_PATH.
tokenizer = BartTokenizer.from_pretrained(MODELS[MODEL])

### Load Data

In [5]:
from src.datasetkgc import DatasetKGC

In [6]:
import torch

# Deserialize the pre-built train / validation splits from the selected dataset directory.
# NOTE(review): torch.load unpickles arbitrary objects — only load trusted files.
train_ds = torch.load(DATASETS[DATASET] + "train_ds.pth")
valid_ds = torch.load(DATASETS[DATASET] + "valid_ds.pth")

In [7]:
from torch.utils.data import DataLoader

# Wrap each split in a sequential (unshuffled) DataLoader, using the batch
# sizes configured in `params`.
# NOTE(review): this rebinds `train_ds` / `valid_ds` from datasets to loaders, so
# re-running the cell wraps loaders in loaders — reload the datasets first.
train_ds = DataLoader(
    train_ds, shuffle=False, batch_size=params["per_device_train_batch_size"]
)
valid_ds = DataLoader(
    valid_ds, shuffle=False, batch_size=params["per_device_eval_batch_size"]
)

### Evaluation - Iterative Playground

Define the `idx` variable below to run a standalone prediction on a single validation sample.

In [11]:
# Sample: take the first validation batch and move row `idx` of each field to `device`.
idx = 1
_valid_ds = next(iter(valid_ds))
input_ids, attention_mask, labels = (
    _valid_ds[field][idx].to(device)
    for field in ("input_ids", "attention_mask", "labels")
)

In [12]:
# Print the prompt, the gold label and the model's completion for the selected sample.
with torch.no_grad():
    print("Sample input: ")
    print(tokenizer.decode(input_ids).replace("<pad>", ""), "\n")

    print("Expected output: ")
    print(tokenizer.decode(labels, skip_special_tokens=True), "\n")

    print("Model Output: ")
    # generate() expects a batch dimension, so the single sequence becomes a batch of one.
    generated_ids = model.generate(input_ids.reshape(1, -1), max_length=MAX_LENGTH)
    print(tokenizer.decode(generated_ids[0]))

Sample input: 
<s>musical film has films in this genre of Beauty and the Beast. war film has films in this genre of The Living Daylights. satire has films in this genre of<mask>.</s> 

Expected output: 
The Simpsons Movie 

Model Output: 
</s><s>The Manchurian Candidate</s>


### Hits

This metric measures the cosine similarity between the model's encoder representations of the label text and of the generated output text.

In [13]:
import pandas as pd


def cosine_similarity(embd_i, embd_j):
    """Return the row-wise cosine similarity of two batches of embeddings.

    The similarity is taken along dimension 1 with ``eps=1e-6``.
    """
    return torch.nn.CosineSimilarity(dim=1, eps=1e-6)(embd_i, embd_j)


def hits_sim(model, tokenizer, valid_ds: dict):
    """Score each validation example by label/output embedding similarity.

    For every row the label is decoded to text, the model generates an answer
    from the row's ``input_ids``, both texts are re-encoded and passed through
    the model, and the cosine similarity of their mean-pooled
    ``encoder_last_hidden_state`` is recorded.

    Args:
        model: Model exposing ``generate`` and returning ``encoder_last_hidden_state``.
        tokenizer: Tokenizer used to decode ids and re-encode texts.
        valid_ds: Mapping with ``"input_ids"`` and ``"labels"`` tensors indexed by row.

    Returns:
        A DataFrame with columns ``label``, ``output`` and ``similarity``.

    NOTE(review): both texts are padded to 128 tokens and the mean runs over
    every position, padding included — confirm this pooling is intended.
    """

    def _mean_encoder_embedding(text):
        # Encode `text` (padded to 128 tokens) and average the encoder states over positions.
        token_ids = (
            tokenizer.encode(
                text, padding="max_length", max_length=128, return_tensors="pt"
            )
            .reshape(1, -1)
            .to(device)
        )
        return torch.mean(model(token_ids).encoder_last_hidden_state, dim=1)

    results = []
    data_size = valid_ds["input_ids"].shape[0]

    # Pure inference: skip building autograd graphs for the forward passes.
    with torch.no_grad():
        for i in range(data_size):
            text_label = tokenizer.decode(
                valid_ds["labels"][i], skip_special_tokens=True
            )
            embedding_label = _mean_encoder_embedding(text_label)

            token_ids_output = model.generate(
                valid_ds["input_ids"][i].to(device).reshape(1, -1),
                max_length=MAX_LENGTH,
            )[0]
            text_output = tokenizer.decode(token_ids_output, skip_special_tokens=True)
            embeddings_output = _mean_encoder_embedding(text_output)

            similarity = cosine_similarity(embedding_label, embeddings_output)[
                0
            ].item()
            results.append((text_label, text_output, similarity))

    return pd.DataFrame(results, columns=["label", "output", "similarity"])

#### Analysis of cosine similarity of label and output encoded by model.

In [None]:
# Sample: take the first validation batch and move row `idx` of each field to `device`.
idx = 0
_valid_ds = next(iter(valid_ds))
input_ids, attention_mask, labels = (
    _valid_ds[field][idx].to(device)
    for field in ("input_ids", "attention_mask", "labels")
)

In [None]:
text_label = tokenizer.decode(labels, skip_special_tokens=True)
print(text_label)

# Pad to 128 tokens, the same length used for the generated output and inside
# hits_sim. Padding positions take part in the mean, so the two embeddings are
# only comparable when both texts are padded to the same length.
with torch.no_grad():  # inference only, no autograd graph needed
    embedding_label = model(
        tokenizer.encode(
            text_label, padding="max_length", max_length=128, return_tensors="pt"
        )
        .reshape(1, -1)
        .to(device)
    ).encoder_last_hidden_state
embedding_label = torch.mean(embedding_label, dim=1)

In [None]:
# Generate an answer for the sample, then embed the decoded text with the encoder.
token_ids_output = model.generate(input_ids.reshape(1, -1), max_length=MAX_LENGTH)[0]
text_output = tokenizer.decode(token_ids_output, skip_special_tokens=True)
print(text_output)

with torch.no_grad():  # inference only, no autograd graph needed
    embeddings_output = model(
        tokenizer.encode(
            text_output, padding="max_length", max_length=128, return_tensors="pt"
        )
        .to(device)
        .reshape(1, -1)
    ).encoder_last_hidden_state
# Mean-pool over the sequence dimension to get one vector per text.
embeddings_output = torch.mean(embeddings_output, dim=1)

In [None]:
# Cosine similarity between the mean-pooled label and output embeddings from the cells above.
cosine_similarity(embedding_label, embeddings_output)

In [None]:
validation_batch = 50
# The similarity metric defined in this notebook is `hits_sim`; there is no `hits` function.
# `valid_ds` may have been rebound to a DataLoader by an earlier cell, and a
# DataLoader cannot be sliced — fall back to its underlying dataset in that case.
results = hits_sim(
    model,
    tokenizer,
    (valid_ds.dataset if isinstance(valid_ds, DataLoader) else valid_ds)[
        :validation_batch
    ],
)

In [None]:
# Decode the first validation input with its special tokens kept.
# `valid_ds` may be a DataLoader at this point (not subscriptable), so index
# the underlying dataset when that is the case.
_t = tokenizer.decode(
    (valid_ds.dataset if isinstance(valid_ds, DataLoader) else valid_ds)[0][
        "input_ids"
    ],
    skip_special_tokens=False,
)

### Original Hits Metric

In [20]:
import tqdm
from tqdm.auto import tqdm
import pandas as pd

tqdm.pandas()


def training_data_to_text(input_ids, tokenizer=None):
    """Recover the prompt text (up to and including ``<mask>.``) from token ids.

    The ids are decoded with special tokens kept, a leading BOS token is
    stripped, and everything after the mask token plus the one character that
    follows it (the trailing ".") is dropped.

    Args:
        input_ids: Token ids of one training/validation example.
        tokenizer: Tokenizer to decode with. Defaults to the notebook-level
            ``tokenizer``, looked up at call time so a reloaded tokenizer is
            picked up without redefining this function.

    Returns:
        The prompt text ending in the mask token and the following character.

    Raises:
        ValueError: If the decoded text contains no mask token.
    """
    if tokenizer is None:
        tokenizer = globals()["tokenizer"]

    text = tokenizer.decode(input_ids, skip_special_tokens=False)

    mask_token = tokenizer.mask_token
    mask_start = text.find(mask_token)
    if mask_start == -1:
        # str.find returns -1 here; slicing with it would silently return garbage.
        raise ValueError(f"no {mask_token!r} token found in decoded input: {text!r}")

    # Derive the offsets from the tokens themselves instead of hard-coding 3 and 7.
    bos_token = tokenizer.bos_token or ""
    start = len(bos_token) if bos_token and text.startswith(bos_token) else 0
    end = mask_start + len(mask_token) + 1  # +1 keeps the character after the mask

    return text[start:end]


def generate_beam_search(model, tokenizer, text, beam_size=5, max_length=128):
    """Generate ``beam_size`` candidate completions for ``text`` with beam search.

    Args:
        model: Model exposing ``generate``.
        tokenizer: Tokenizer used to encode the prompt and decode the beams.
        text: Prompt text, e.g. ``"My name is <mask>."``.
        beam_size: Number of beams, and also the number of returned sequences.
        max_length: Padding length for the prompt and length cap for generation.

    Returns:
        A list of decoded candidate strings with special tokens removed.
    """
    prompt_ids = tokenizer.encode(
        text, return_tensors="pt", max_length=max_length, padding="max_length"
    )
    prompt_ids = prompt_ids.to(device)

    generation_kwargs = dict(
        max_length=max_length,
        num_beams=beam_size,
        num_return_sequences=beam_size,
        early_stopping=True,
    )
    candidates = model.generate(prompt_ids, **generation_kwargs)

    decoded = []
    for candidate in candidates:
        decoded.append(tokenizer.decode(candidate, skip_special_tokens=True))
    return decoded


def compute_hits(model, tokenizer, valid_ds, beam_size=5, max_length=128, debug=False):
    """Count examples whose label appears among the beam-search candidates.

    For each row the prompt is rebuilt from ``input_ids``, ``beam_size``
    candidates are generated, and the row counts as a hit when the decoded
    label equals one of them, ignoring case.

    Args:
        model: Model passed through to ``generate_beam_search``.
        tokenizer: Tokenizer used for decoding and generation.
        valid_ds: Mapping with ``"input_ids"`` and ``"labels"`` tensors indexed by row.
        beam_size: Number of candidates generated per example.
        max_length: Length limit forwarded to ``generate_beam_search``.
        debug: When true, print the text, label and candidates of every hit.

    Returns:
        A tuple ``(frame, hits)``: a DataFrame with columns ``text``, ``label``,
        ``output_list`` and ``match``, and the number of hits.
    """
    rows = []
    hit_count = 0
    n_samples = valid_ds["input_ids"].shape[0]

    for row in tqdm(range(n_samples)):
        text = training_data_to_text(valid_ds["input_ids"][row], tokenizer=tokenizer)
        label = tokenizer.decode(valid_ds["labels"][row], skip_special_tokens=True)
        output_list = generate_beam_search(
            text=text,
            model=model,
            tokenizer=tokenizer,
            beam_size=beam_size,
            max_length=max_length,
        )

        # Case-insensitive exact match against any candidate.
        lowered_candidates = [candidate.lower() for candidate in output_list]
        matched = label.lower() in lowered_candidates

        if matched:
            hit_count += 1
            if debug:
                print("Text: ", text)
                print("Label: ", label)
                print("Output: ", output_list)
                print()

        rows.append((text, label, output_list, matched))

    frame = pd.DataFrame(rows, columns=["text", "label", "output_list", "match"])
    return frame, hit_count

In [8]:
# Reload the raw splits so `valid_ds` is the sliceable dataset again, not a DataLoader.
# NOTE(review): torch.load unpickles arbitrary objects — only load trusted files.
train_ds = torch.load(DATASETS[DATASET] + "train_ds.pth")
valid_ds = torch.load(DATASETS[DATASET] + "valid_ds.pth")

In [22]:
# Results beam size 1
batch = 1000  # Max -> len(valid_ds)
results, hits = compute_hits(
    model, tokenizer, valid_ds[:batch], beam_size=1, debug=False
)

print("Hits: ", hits)

  0%|          | 0/1000 [00:00<?, ?it/s]

Hits:  133


In [32]:
# Widen pandas' column display so the long prompt texts in `results` are not truncated.
pd.set_option("max_colwidth", 800)

In [39]:
# Show 10 random examples where the label was not among the candidates.
results.loc[~results["match"]].sample(10)

Unnamed: 0,text,label,output_list,match
983,teacher has people with this profession of Donald McAlpine. screenwriter has people with this profession of Peter Coyote. composer has people with this profession of<mask>.,Bill Wyman,[John Cage],False
496,Scream 2 has music of Marco Beltrami. Red Dragon has music of Danny Elfman. The Da Vinci Code has music of<mask>.,Hans Zimmer,[John Williams],False
142,16th United States Congress has legislative sessions of 13th United States Congress. 95th United States Congress has legislative sessions of 110th United States Congress. New Jersey has legislative sessions of<mask>.,99th United States Congress,[16th United States Congress],False
457,56th Golden Globe Awards has honored for of Elizabeth. George Lucas has honored for of American Graffiti. Dakota Fanning has honored for of<mask>.,War of the Worlds,[The Twilight Saga: Eclipse],False
139,Mortal Kombat: Annihilation has actor of James Remar. Lethal Weapon 4 has actor of Mel Gibson. Hercules has actor of<mask>.,Keith David,[John Cleese],False
238,Star Wars: The Clone Wars has genre of adventure film. A Christmas Carol has genre of fantasy. Joseph Haydn has genre of<mask>.,chamber music,[opera],False
634,Mortal Kombat: Annihilation has actor of James Remar. Lethal Weapon 4 has actor of Mel Gibson. L.A. Confidential has actor of<mask>.,Kevin Spacey,[Michael Caine],False
82,Henry King has cause of death of myocardial infarction. Susan Hayward has cause of death of brain tumor. Edward G. Robinson has cause of death of<mask>.,cancer,[myocardial infarction],False
860,Star Wars: The Clone Wars has genre of adventure film. A Christmas Carol has genre of fantasy. Deep Purple has genre of<mask>.,psychedelic rock,[rock music],False
417,"BAFTA Award for Best Editing has nominated for of Alien. MTV Movie Award for Best Kiss has nominated for of Monster. Golden Raspberry Award for Worst Prequel, Remake, Rip-off or Sequel has nominated for of<mask>.",Sex and the City 2,[The Adventures of Ford Fairlane],False


In [20]:
# Results beam size 3
batch = 20000  # Max -> len(valid_ds)
results, hits = compute_hits(
    model, tokenizer, valid_ds[:batch], beam_size=3, debug=False
)

print("Hits: ", hits)

  0%|          | 0/20000 [00:00<?, ?it/s]

Hits:  3380


In [21]:
# Results beam size 10
batch = 20000  # Max -> len(valid_ds)
results, hits = compute_hits(
    model, tokenizer, valid_ds[:batch], beam_size=10, debug=False
)

print("Hits: ", hits)

  0%|          | 0/20000 [00:00<?, ?it/s]

Hits:  5338


In [24]:
# Results beam size 1
batch = 20000  # Max -> len(valid_ds)
results, hits = compute_hits(
    model, tokenizer, valid_ds[:batch], beam_size=1, debug=False
)

print("Hits: ", hits)

# Results beam size 3
batch = 20000  # Max -> len(valid_ds)
results, hits = compute_hits(
    model, tokenizer, valid_ds[:batch], beam_size=3, debug=False
)

print("Hits: ", hits)

# Results beam size 10
batch = 20000  # Max -> len(valid_ds)
results, hits = compute_hits(
    model, tokenizer, valid_ds[:batch], beam_size=10, debug=False
)

print("Hits: ", hits)

  0%|          | 0/20000 [00:00<?, ?it/s]

Hits:  2600


  0%|          | 0/20000 [00:00<?, ?it/s]

Hits:  3945


  0%|          | 0/20000 [00:00<?, ?it/s]

Hits:  6306


### Beam Search Standalone

In [40]:
# from transformers import BartTokenizer, BartForConditionalGeneration

# Single forward pass: rank the vocabulary at the <mask> position and show the top 10 tokens.
TXT = "Brazil capital is São <mask>."

input_ids = tokenizer([TXT], return_tensors="pt")["input_ids"]
logits = model(input_ids.to(device)).logits

# Position of the single mask token in the prompt.
masked_index = torch.nonzero(input_ids[0] == tokenizer.mask_token_id).item()
probs = torch.softmax(logits[0, masked_index], dim=0)
values, predictions = torch.topk(probs, 10)

tokenizer.decode(predictions).split()

['Paulo',
 'Francisco',
 'Jo',
 'Tom',
 'José',
 'Pedro',
 'Jose',
 'Miguel',
 'Antonio',
 'Juan']

In [41]:
# from transformers import BartTokenizer, BartForConditionalGeneration
import itertools

TXT = "New Zeland has capital of <mask>."

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

input_ids = tokenizer([TXT], return_tensors="pt")["input_ids"]
with torch.no_grad():  # inference only, no autograd graph needed
    logits = model(input_ids.to(device)).logits

# Rank the vocabulary at the single <mask> position and keep the top 5 tokens.
masked_index = (input_ids[0] == tokenizer.mask_token_id).nonzero().item()
probs = logits[0, masked_index].softmax(dim=0)
values, predictions = probs.topk(5)

predicted_tokens = tokenizer.batch_decode(predictions)

suggested_words = tokenizer.decode(predictions).split()

# Build every unordered pair of suggested words (2-token fillers).
# itertools.combinations yields the same (i < j) pairs as the nested index
# loop, but does not raise IndexError when fewer than 5 words are decoded.
combinations = list(itertools.combinations(suggested_words, 2))

print(suggested_words)

['€', '$', '£', 'A', '5']


In [64]:
def _generate(txt, num_beams=10, num_tokens=10):
    """Print the beam-search completions the model generates for ``txt``.

    Args:
        txt: Prompt text, typically containing a ``<mask>`` token.
        num_beams: Number of beams, and also the number of returned sequences.
        num_tokens: Extra tokens allowed beyond the prompt length.

    Returns:
        None. The decoded completions are printed as a list.
    """
    # Tokenize the prompt that was passed in. The original read the global
    # `TXT`, so the argument was silently ignored.
    input_ids = tokenizer([txt], return_tensors="pt")["input_ids"]

    # Deterministic beam search, returning every beam.
    output = model.generate(
        input_ids=input_ids.to(device),
        max_length=input_ids.shape[-1] + num_tokens,
        num_beams=num_beams,
        num_return_sequences=num_beams,
        pad_token_id=tokenizer.pad_token_id,
        eos_token_id=tokenizer.eos_token_id,
        do_sample=False,
    )

    # Decode the generated sequences without special tokens.
    decoded_outputs = tokenizer.batch_decode(output, skip_special_tokens=True)

    print(decoded_outputs)

### Concepts - Languages

In [110]:
# Zero-shot probe: ask for the official language with no in-context examples.
TXT = "Brasil has official language of <mask>."

_generate(TXT)

['English', 'Tamil', 'Urdu', 'French', 'Telugu', 'Arabic', 'Spanish', 'Italian', 'English literature', 'Malay']


In [111]:
# Few-shot probe: two solved language examples precede the masked query.
TXT = "Spain has the official language of Spanish. EUA has official language of English. Brasil has official language of <mask>."

_generate(TXT)

['Spanish', 'English', 'portuguese', 'Latin', 'Brazilian', 'French', 'Italian', 'Brasilian', 'Bolero', 'Arabic']


### Concepts - Game

In [112]:
# Zero-shot probe for the "type" relation.
TXT = "Mortal Kombat has type of <mask>."

_generate(TXT)

['computer animation', 'action film', 'drama film', 'adventure film', 'fantasy', 'video game', 'vampire', 'anime', 'video game music', 'video game game']


In [79]:
# Zero-shot probe for the "capital" relation.
TXT = "Russia has capital of <mask>."

_generate(TXT)

['Moscow', 'Saint Petersburg', 'Kiev', 'St. Petersburg', 'Tbilisi', 'Minsk', 'Alexandria', 'Baku', 'Rome', 'Varna']


In [80]:
# Zero-shot probe for the "capital" relation.
TXT = "Portugal has capital of <mask>."

_generate(TXT)

['Porto', 'Portugal City', 'Rio de Janeiro', 'Lisbon', 'Puerto Rico', 'Sarasota', 'Porto City', 'Amsterdam', 'Madrid', 'Portugal']


In [84]:
# Zero-shot probe for the "married" relation.
TXT = "Barack Obama is married of <mask>."

_generate(TXT)

['Jenna Bush Hager', 'Loretta Devine', 'Geraldine Chaplin', 'Katharine Hepburn', 'Geraldine Somerville', 'Jill Clayburgh', 'Jennifer Aniston', 'Geraldine Page', 'Loretta Young', 'Hillary Clinton']


In [89]:
# Few-shot probe: two solved "married" examples precede the masked query.
TXT = "John Kennedy is married of Jacqueline Kennedy. George Bush is married of Laura Bush. B. Obama is married of <mask>."

_generate(TXT)

['Barack Obama', 'Barbara Walters', 'Barbara Hershey', 'Debra Winger', 'Michelle Obama', 'Lauren Bacall', 'Lauren Holly', 'Michelle Forbes', 'Michelle Branch', 'Lauren Conrad']


In [93]:
# Open-ended probe: a bare category prefix followed by the mask.
TXT = "Films: <mask>."

_generate(TXT)

['Wyatt Earp', 'The Godfather Part III', 'Thelma & Louise', 'The Godfather Part II', 'The Adventures of Ford Fairlane', 'Saving Private Ryan', 'The Adventures of Tintin', 'The Adventures of Pluto Nash', 'Sideways', 'The Quiet American']


In [105]:
# List-continuation probe: two film titles precede the mask.
TXT = "Films: Star Wars, Star Trek, <mask>."

_generate(TXT)

['Star Wars Episode IV: A New Hope', 'Star Trek', 'Star Trek: First Contact', 'Star Trek IV: The Voyage Home', 'Star Trek VI: The Undiscovered Country', 'Star Trek V: The Final Frontier', 'Star Trek: The Original Series', 'Star Trek: The Next Generation', 'Star Trek: Nemesis', 'Star Wars']


In [104]:
# Completion probe: a single leading word followed by the mask.
TXT = "Computer <mask>."

_generate(TXT)

['computer hardware', 'computer science', 'computer animation', 'electronic keyboard', 'artificial intelligence', 'software engineering', 'acoustic guitar', 'programming', 'computer engineering', 'software']


In [102]:
# Completion probe: a first name followed by the mask.
TXT = "Barack <mask>."

_generate(TXT)

['Barack Obama', 'George W. Bush', 'Theodore Bikel', 'Sidney Sheldon', 'Barry Gibb', 'George Lucas', 'Barry Pepper', 'Theodore Roosevelt', 'Michael Moore', 'George Harrison']


In [109]:
# Completion probe: a two-word name followed by the mask.
TXT = "Walt Disney <mask>."

_generate(TXT)

['The Walt Disney Company', 'Walt Disney Pictures', 'Walt Disney', 'Pixar', 'George Lucas', 'DreamWorks Animation', 'Hollywood Squares', 'Hollywood Pictures', 'Hollywood Forever Cemetery', 'DreamWorks']
