## Configuração

In [1]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.9.2-py3-none-any.whl (2.6 MB)
[K     |████████████████████████████████| 2.6 MB 4.1 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.45-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 45.6 MB/s 
[?25hCollecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 31.1 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-5.4.1-cp37-cp37m-manylinux1_x86_64.whl (636 kB)
[K     |████████████████████████████████| 636 kB 47.5 MB/s 
Collecting huggingface-hub==0.0.12
  Downloading huggingface_hub-0.0.12-py3-none-any.whl (37 kB)
Installing collected packages: tokenizers, sacremoses, pyyaml, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Found existing installation: PyYAML 3.13
    Uninstalling PyYAML-3.13:
      Successfully uninsta

In [2]:
from transformers import AutoTokenizer, AutoModel, AdamW, get_linear_schedule_with_warmup

import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

from tqdm import tqdm

import torch
from torch import nn, from_numpy
import torch.nn.functional as F
from torch.nn.functional import cross_entropy, softmax
from torch.utils.data import Dataset, DataLoader

In [3]:
MAX_LEN = 64
TRAIN_BATCH_SIZE = 32
VALID_BATCH_SIZE = 8
EPOCHS = 5
RANDOM_SEED = 2021
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [4]:
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

Downloading:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/213k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/436k [00:00<?, ?B/s]

## Dataset

In [5]:
entities = pd.read_csv("./entities_final.csv")

In [6]:
entities.columns

Index(['question', 'entities'], dtype='object')

In [7]:
entities.head()

Unnamed: 0,question,entities
0,who's the writer of Experiment Perilous,"['0', '0', '0', '0', 'B-movie', 'I-movie']"
1,who's the writer of Games,"['0', '0', '0', '0', 'B-movie']"
2,which person wrote The Wolf Man,"['0', '0', '0', 'B-movie', 'I-movie', 'I-movie']"
3,who is the writer of the film Sweet Charity,"['0', '0', '0', '0', '0', '0', '0', 'B-movie',..."
4,who was the writer of Extract,"['0', '0', '0', '0', '0', 'B-movie']"


In [8]:
entities['entities'] = entities['entities'].apply(lambda x: str([i if len(i) == 1 else i[2:] for i in eval(x)]))

In [9]:
entities['entities'][0]

"['0', '0', '0', '0', 'movie', 'movie']"

In [10]:
word2tag = {"texts": [], "entities": []}

for _, row in entities.iterrows():
    question = row["question"].split()
    entity = eval(row["entities"])

    if len(question) == len(entity):
        word2tag["texts"] += question
        word2tag["entities"] += entity

entities_separeted = pd.DataFrame(word2tag)

In [11]:
word2tag = {"texts": [], "entities": []}

for _, row in entities.iterrows():
    question = row["question"].split()
    entity = eval(row["entities"])

    if len(question) == len(entity):
        word2tag["texts"].append(question)
        word2tag["entities"].append(entity)

entities_list = pd.DataFrame(word2tag)

In [12]:
entities_list.head()

Unnamed: 0,texts,entities
0,"[who's, the, writer, of, Experiment, Perilous]","[0, 0, 0, 0, movie, movie]"
1,"[who's, the, writer, of, Games]","[0, 0, 0, 0, movie]"
2,"[which, person, wrote, The, Wolf, Man]","[0, 0, 0, movie, movie, movie]"
3,"[who, is, the, writer, of, the, film, Sweet, C...","[0, 0, 0, 0, 0, 0, 0, movie, movie]"
4,"[who, was, the, writer, of, Extract]","[0, 0, 0, 0, 0, movie]"


In [13]:
class EntityDataset(Dataset):

    def __init__(self, texts, entities):
        self.texts = texts.to_list() # Lista de palavras
        self.entities = entities.to_list() # Lista de entidades
        self.tokenizer = tokenizer
    def __len__(self):
        return len(self.texts)

    def __getitem__(self, item):
        text = self.texts[item]
        entities = self.entities[item]

        ids = []
        target_entities = []

        for i, s in enumerate(text):
            inputs = self.tokenizer.encode(s, add_special_tokens=False) # Os tokens especiais serão adicionados depois
            input_len = len(inputs)
            ids.extend(inputs)
            # Caso a palavra tenha se quebrado na tokenizaçao
            # Será preciso colocar as entidades para esse novos tokens
            target_entities.extend([entities[i]] * input_len) 
        
        ids = ids[:MAX_LEN - 2]
        target_entities = target_entities[:MAX_LEN-2]

        ids = [101] + ids + [102]
        target_entities = [0] + target_entities + [0]

        mask = [1] * len(ids)
        token_type_ids = [0] * len(ids)

        # Adicionando o padding
        padding_len = MAX_LEN - len(ids)

        ids = ids + ([0] * padding_len)
        mask = mask + ([0] * padding_len)
        token_type_ids = token_type_ids + ([0] * padding_len)
        target_entities = target_entities + ([0] * padding_len)


        return {
            "ids": torch.tensor(ids, dtype=torch.long),
            "mask": torch.tensor(mask, dtype=torch.long),
            "token_type_ids": torch.tensor(token_type_ids, dtype=torch.long),
            "target_entities": torch.tensor(target_entities, dtype=torch.long),
        } 

## Modelo

In [14]:
entities_set = set([label for line in entities_list["entities"] for label in line])
num_entities = len(entities_set)

In [15]:
def loss_fn(output, target, mask, num_entities):
    lfn = nn.CrossEntropyLoss()

    active_loss = mask.view(-1) == 1 # Calcular a loss onde n tem padding
    active_logits = output.view(-1, num_entities)
    active_labels = torch.where(
        active_loss,
        target.view(-1),
        torch.tensor(lfn.ignore_index).type_as(target)
    )

    loss = lfn(active_logits, active_labels)
    return loss

In [16]:
class EntityModel(nn.Module):

    def __init__(self, num_entities):
        super(EntityModel, self).__init__()
        self.num_entities = num_entities
        self.bert = AutoModel.from_pretrained("bert-base-cased")
        self.drop = nn.Dropout(0.3)
        self.out = nn.Linear(768, self.num_entities)

    def forward(self, ids, mask, token_type_ids, target_entities):
        last_hidden_states = self.bert(
            ids,
            attention_mask=mask, 
            token_type_ids=token_type_ids
        )

        bo_entities = self.drop(last_hidden_states[0])
        entities = self.out(bo_entities)

        loss = loss_fn(entities, target_entities, mask, self.num_entities)

        return entities, loss

## Função de treino e avaliação

In [17]:
def train_model(model, data_loader, optimizer, scheduler):
    model.train()

    final_loss = 0
    for data in tqdm(data_loader, total=len(data_loader)):
        for k, v in data.items():
            data[k] = v.to(device)

        optimizer.zero_grad()
        _, loss = model(**data)
        # entities = model(**data)
        # # print(entities)
        # # print(type(entities))
        # # print()
        # loss = loss_fn(entities, data["target_entities"], data["mask"])

        loss.backward()
        optimizer.step()
        scheduler.step()

        final_loss += loss.item()
    
    return final_loss / len(data_loader)

In [18]:
def eval_model(model, data_loader):
    model.eval()

    final_loss = 0
    with torch.no_grad():
        for data in tqdm(data_loader, total=len(data_loader)):
            for k, v in data.items():
                data[k] = v.to(device)

            _, loss = model(**data)
            final_loss += loss.item()
    
    return final_loss / len(data_loader)

## Treino

In [19]:
entity2idx = {u:i for i, u in enumerate(entities_set)}
idx2entity = list(np.array(entities_set).tolist())

In [20]:
def get_idx(entities):
    return [entity2idx[entity] for entity in entities]

In [21]:
entities_list["entities"] = entities_list["entities"].apply(get_idx)

In [22]:
X_train, X_test, y_train, y_test = train_test_split(
         entities_list["texts"], entities_list["entities"], test_size=0.5, random_state=2021)

X_val, X_test, y_val, y_test = train_test_split(
         X_test, y_test, test_size=0.5, random_state=2021)

In [23]:
train_dataset = EntityDataset(X_train, y_train)

val_dataset = EntityDataset(X_val, y_val)

test_dataset = EntityDataset(X_test, y_test)

In [24]:
train_data_loader = torch.utils.data.DataLoader(
    train_dataset, batch_size=TRAIN_BATCH_SIZE, num_workers=2
)

val_data_loader = torch.utils.data.DataLoader(
    val_dataset, batch_size=VALID_BATCH_SIZE, num_workers=1
)

test_data_loader = torch.utils.data.DataLoader(
    test_dataset, batch_size=VALID_BATCH_SIZE, num_workers=1
)

In [25]:
model = EntityModel(num_entities)
model.to(device);

param_optimizer = list(model.named_parameters())
no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
optimizer_parameters = [
    {
        "params": [
            p for n, p in param_optimizer if not any(
                nd in n for nd in no_decay
            )
        ],
        "weight_decay": 0.001,
    },
    {
        "params": [
            p for n, p in param_optimizer if any(
                nd in n for nd in no_decay
            )
        ],
        "weight_decay": 0.0,
    },
]


optimizer = AdamW(optimizer_parameters, lr=3e-4)

num_train_steps = len(X_train) // (TRAIN_BATCH_SIZE * EPOCHS) # == 780

scheduler = get_linear_schedule_with_warmup(
  optimizer,
  num_warmup_steps=0,
  num_training_steps=num_train_steps
)

Downloading:   0%|          | 0.00/436M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [26]:
best_loss = np.inf

for epoch in range(EPOCHS):
    print()
    print(f'Epoch {epoch + 1}/{EPOCHS}')
    print('-' * 20)
    
    train_loss = train_model(model, train_data_loader, optimizer, scheduler)
    print()
    print(f"TRAIN LOSS: {train_loss}")

    val_loss = eval_model(model, val_data_loader)
    print()
    print(f"VAL LOSS: {val_loss}")

    if val_loss <= best_loss:
        torch.save(model.state_dict(), 'best_model_state.bin')
        best_loss = val_loss


Epoch 1/5
--------------------


100%|██████████| 156/156 [01:40<00:00,  1.55it/s]



TRAIN LOSS: 0.15548965475784662


100%|██████████| 311/311 [00:19<00:00, 15.81it/s]



VAL LOSS: 0.10214722976447876

Epoch 2/5
--------------------


100%|██████████| 156/156 [01:42<00:00,  1.53it/s]



TRAIN LOSS: 0.11864268536177966


100%|██████████| 311/311 [00:19<00:00, 15.81it/s]



VAL LOSS: 0.10214722976447876

Epoch 3/5
--------------------


100%|██████████| 156/156 [01:42<00:00,  1.52it/s]



TRAIN LOSS: 0.11779756667331243


100%|██████████| 311/311 [00:19<00:00, 15.75it/s]



VAL LOSS: 0.10214722976447876

Epoch 4/5
--------------------


100%|██████████| 156/156 [01:42<00:00,  1.52it/s]



TRAIN LOSS: 0.11873508299677035


100%|██████████| 311/311 [00:19<00:00, 15.67it/s]



VAL LOSS: 0.10214722976447876

Epoch 5/5
--------------------


100%|██████████| 156/156 [01:42<00:00,  1.52it/s]



TRAIN LOSS: 0.11876060684713033


100%|██████████| 311/311 [00:19<00:00, 15.68it/s]



VAL LOSS: 0.10214722976447876


In [27]:
best_loss

0.10214722976447876

## Inferencia

In [28]:
sentence = "what films did Guillermo del Toro directed"

tokenized_sentence = tokenizer(sentence, add_special_tokens=False)
sentence = sentence.split()

In [29]:
test = EntityDataset(
    texts=pd.Series([sentence]), 
    entities=pd.Series([[0] * len(sentence)]),
)

In [30]:
tokenized_sentence

{'input_ids': [1184, 2441, 1225, 23167, 3687, 27470, 2002], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1]}

In [31]:
with torch.no_grad():
    data = test[0]
    for k, v in data.items():
        data[k] = v.to(device).unsqueeze(0)
    entities, _ = model(**data)
    
    idx = entities.argmax(2).cpu().numpy().reshape(-1)[:len(tokenized_sentence['input_ids'])]
    print([idx2entity[i] for i in idx])
    # print(data["target_entities"])
    # probs = F.softmax(entities, dim=1)
    # print(probs)

['writer', '0', '0', '0', 'director', 'director', 'director']
tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], device='cuda:0')


## Performance

In [None]:
def get_predictions(model, data_loader):
    model.eval()

    predictions = []
    real_values = []

    with torch.no_grad():
        for data in data_loader:
            for k, v in data.items():
                data[k] = v.to(device)

            outputs, _ = model(**data)

            idx = outputs.argmax(2).cpu()
            # print(len(idx[0]))
            # print(idx)
            predictions.extend(idx)
            # print(len(data["target_entities"][0]))
            # print(data["target_entities"])

            real_values.extend(data["target_entities"])

    predictions = torch.stack(predictions).cpu()
    real_values = torch.stack(real_values).cpu()
    return predictions, real_values

In [None]:
y_pred, y_real = get_predictions(model, test_data_loader)

In [None]:
y_pred.reshape(-1)

tensor([0, 0, 0,  ..., 1, 1, 1])

In [None]:
y_real.reshape(-1)

tensor([0, 0, 0,  ..., 0, 0, 0])

In [None]:
print(classification_report(y_real, y_pred))

ValueError: ignored

In [None]:
y_pred

tensor([[ 0,  4,  5,  ..., 63, 11, 62],
        [ 0,  6,  9,  ...,  3,  7, 48],
        [ 0,  8,  8,  ...,  3, 57, 45],
        ...,
        [ 0,  8, 58,  ...,  0, 12, 49],
        [ 0,  3,  8,  ..., 12,  6, 45],
        [15,  9, 11,  ..., 47, 12, 49]])

In [None]:
y_test

tensor([[0, 7, 7,  ..., 0, 0, 0],
        [0, 7, 7,  ..., 0, 0, 0],
        [0, 7, 7,  ..., 0, 0, 0],
        ...,
        [0, 7, 7,  ..., 0, 0, 0],
        [0, 7, 7,  ..., 0, 0, 0],
        [0, 7, 7,  ..., 0, 0, 0]])