## Configuração

In [1]:
!pip install transformers

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/fd/1a/41c644c963249fd7f3836d926afa1e3f1cc234a1c40d80c5f03ad8f6f1b2/transformers-4.8.2-py3-none-any.whl (2.5MB)
[K     |████████████████████████████████| 2.5MB 8.6MB/s 
[?25hCollecting tokenizers<0.11,>=0.10.1
[?25l  Downloading https://files.pythonhosted.org/packages/d4/e2/df3543e8ffdab68f5acc73f613de9c2b155ac47f162e725dcac87c521c11/tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3MB)
[K     |████████████████████████████████| 3.3MB 40.9MB/s 
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/75/ee/67241dc87f266093c533a2d4d3d69438e57d7a90abb216fa076e7d475d4a/sacremoses-0.0.45-py3-none-any.whl (895kB)
[K     |████████████████████████████████| 901kB 36.7MB/s 
Collecting huggingface-hub==0.0.12
  Downloading https://files.pythonhosted.org/packages/2f/ee/97e253668fda9b17e968b3f97b2f8e53aa0127e8807d24a547

In [2]:
from transformers import AutoTokenizer, AutoModel, AdamW, get_linear_schedule_with_warmup

import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

from tqdm import tqdm

import torch
from torch import nn, from_numpy
import torch.nn.functional as F
from torch.nn.functional import cross_entropy, softmax
from torch.utils.data import Dataset, DataLoader

In [3]:
MAX_LEN = 64
TRAIN_BATCH_SIZE = 32
VALID_BATCH_SIZE = 8
EPOCHS = 5
RANDOM_SEED = 2021
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [4]:
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=29.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=570.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=213450.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=435797.0, style=ProgressStyle(descripti…




## Dataset

In [5]:
entities = pd.read_csv("./entities_final.csv")

In [6]:
entities.columns

Index(['question', 'entities'], dtype='object')

In [7]:
word2tag = {"texts": [], "entities": []}

for _, row in entities.iterrows():
    question = row["question"].split()
    entity = eval(row["entities"])

    if len(question) == len(entity):
        word2tag["texts"] += question
        word2tag["entities"] += entity

entities_separeted = pd.DataFrame(word2tag)

In [8]:
word2tag = {"texts": [], "entities": []}

for _, row in entities.iterrows():
    question = row["question"].split()
    entity = eval(row["entities"])

    if len(question) == len(entity):
        word2tag["texts"].append(question)
        word2tag["entities"].append(entity)

entities_list = pd.DataFrame(word2tag)

In [9]:
entities_list.head()

Unnamed: 0,texts,entities
0,"[who's, the, writer, of, Experiment, Perilous]","[0, 0, 0, 0, B-movie, I-movie]"
1,"[who's, the, writer, of, Games]","[0, 0, 0, 0, B-movie]"
2,"[which, person, wrote, The, Wolf, Man]","[0, 0, 0, B-movie, I-movie, I-movie]"
3,"[who, is, the, writer, of, the, film, Sweet, C...","[0, 0, 0, 0, 0, 0, 0, B-movie, I-movie]"
4,"[who, was, the, writer, of, Extract]","[0, 0, 0, 0, 0, B-movie]"


In [10]:
class EntityDataset(Dataset):

    def __init__(self, texts, entities):
        self.texts = texts.to_list() # Lista de palavras
        self.entities = entities.to_list() # Lista de entidades
        self.tokenizer = tokenizer
    def __len__(self):
        return len(self.texts)

    def __getitem__(self, item):
        text = self.texts[item]
        entities = self.entities[item]

        ids = []
        target_entities = []

        for i, s in enumerate(text):
            inputs = self.tokenizer.encode(s, add_special_tokens=False) # Os tokens especiais serão adicionados depois
            input_len = len(inputs)
            ids.extend(inputs)
            # Caso a palavra tenha se quebrado na tokenizaçao
            # Será preciso colocar as entidades para esse novos tokens
            target_entities.extend([entities[i]] * input_len) 
        
        ids = ids[:MAX_LEN - 2]
        target_entities = target_entities[:MAX_LEN-2]

        ids = [101] + ids + [102]
        target_entities = [0] + target_entities + [0]

        mask = [1] * len(ids)
        token_type_ids = [0] * len(ids)

        # Adicionando o padding
        padding_len = MAX_LEN - len(ids)

        ids = ids + ([0] * padding_len)
        mask = mask + ([0] * padding_len)
        token_type_ids = token_type_ids + ([0] * padding_len)
        target_entities = target_entities + ([0] * padding_len)


        return {
            "ids": torch.tensor(ids, dtype=torch.long),
            "mask": torch.tensor(mask, dtype=torch.long),
            "token_type_ids": torch.tensor(token_type_ids, dtype=torch.long),
            "target_entities": torch.tensor(target_entities, dtype=torch.long),
        } 

## Modelo

In [11]:
entities_set = set([label for line in entities_list["entities"] for label in line])
num_entities = len(entities_set)

In [12]:
def loss_fn(output, target, mask, num_entities):
    lfn = nn.CrossEntropyLoss()

    active_loss = mask.view(-1) == 1 # Calcular a loss onde n tem padding
    active_logits = output.view(-1, num_entities)
    active_labels = torch.where(
        active_loss,
        target.view(-1),
        torch.tensor(lfn.ignore_index).type_as(target)
    )

    loss = lfn(active_logits, active_labels)
    return loss

In [13]:
class EntityModel(nn.Module):

    def __init__(self, num_entities):
        super(EntityModel, self).__init__()
        self.num_entities = num_entities
        self.bert = AutoModel.from_pretrained("bert-base-cased")
        self.drop = nn.Dropout(0.3)
        self.out = nn.Linear(768, self.num_entities)

    def forward(self, ids, mask, token_type_ids, target_entities):
        last_hidden_states = self.bert(
            ids,
            attention_mask=mask, 
            token_type_ids=token_type_ids
        )

        bo_entities = self.drop(last_hidden_states[0])
        entities = self.out(bo_entities)

        loss = loss_fn(entities, target_entities, mask, self.num_entities)

        return entities, loss

## Função de treino e avaliação

In [14]:
def train_model(model, data_loader, optimizer, scheduler):
    model = model.train()

    final_loss = 0
    for data in tqdm(data_loader, total=len(data_loader)):
        for k, v in data.items():
            data[k] = v.to(device)

        optimizer.zero_grad()
        _, loss = model(**data)

        loss.backward()
        optimizer.step()
        scheduler.step()

        final_loss += loss.item()
    
    return final_loss / len(data_loader)

In [15]:
def eval_model(model, data_loader):
    model = model.eval()

    final_loss = 0
    with torch.no_grad():
        for data in tqdm(data_loader, total=len(data_loader)):
            for k, v in data.items():
                data[k] = v.to(device)

            _, loss = model(**data)
            final_loss += loss.item()
    
    return final_loss / len(data_loader)

## Treino

In [16]:
entity2idx = {u:i for i, u in enumerate(entities_set)}
idx2entity = list(np.array(entities_set).tolist())

In [17]:
def get_idx(entities):
    return [entity2idx[entity] for entity in entities]

In [18]:
entities_list["entities"] = entities_list["entities"].apply(get_idx)

In [19]:
X_train, X_test, y_train, y_test = train_test_split(
         entities_list["texts"], entities_list["entities"], test_size=0.5, random_state=2021)

X_val, X_test, y_val, y_test = train_test_split(
         X_test, y_test, test_size=0.5, random_state=2021)

In [20]:
train_dataset = EntityDataset(X_train, y_train)

val_dataset = EntityDataset(X_val, y_val)

test_dataset = EntityDataset(X_test, y_test)

In [21]:
train_data_loader = torch.utils.data.DataLoader(
    train_dataset, batch_size=TRAIN_BATCH_SIZE, num_workers=2
)

val_data_loader = torch.utils.data.DataLoader(
    val_dataset, batch_size=VALID_BATCH_SIZE, num_workers=1
)

test_data_loader = torch.utils.data.DataLoader(
    test_dataset, batch_size=VALID_BATCH_SIZE, num_workers=1
)

In [22]:
model = EntityModel(num_entities)
model.to(device);

param_optimizer = list(model.named_parameters())
no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
optimizer_parameters = [
    {
        "params": [
            p for n, p in param_optimizer if not any(
                nd in n for nd in no_decay
            )
        ],
        "weight_decay": 0.001,
    },
    {
        "params": [
            p for n, p in param_optimizer if any(
                nd in n for nd in no_decay
            )
        ],
        "weight_decay": 0.0,
    },
]


optimizer = AdamW(optimizer_parameters, lr=3e-4)

num_train_steps = len(X_train) // (TRAIN_BATCH_SIZE * EPOCHS) # == 780

scheduler = get_linear_schedule_with_warmup(
  optimizer,
  num_warmup_steps=0,
  num_training_steps=num_train_steps
)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=435779157.0, style=ProgressStyle(descri…




Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [23]:
best_loss = np.inf

for epoch in range(EPOCHS):
    print()
    print(f'Epoch {epoch + 1}/{EPOCHS}')
    print('-' * 20)
    
    train_loss = train_model(model, train_data_loader, optimizer, scheduler)
    print()
    print(f"TRAIN LOSS: {train_loss}")

    val_loss = eval_model(model, val_data_loader)
    print()
    print(f"VAL LOSS: {val_loss}")

    if val_loss < best_loss:
        torch.save(model.state_dict(), 'best_model_state.bin')
        best_loss = val_loss

  0%|          | 0/156 [00:00<?, ?it/s]


Epoch 1/5
--------------------


100%|██████████| 156/156 [00:53<00:00,  2.93it/s]
  0%|          | 0/311 [00:00<?, ?it/s]


TRAIN LOSS: 0.25812397357554007


100%|██████████| 311/311 [00:09<00:00, 31.54it/s]



VAL LOSS: 0.1878037149523807


  0%|          | 0/156 [00:00<?, ?it/s]


Epoch 2/5
--------------------


100%|██████████| 156/156 [00:56<00:00,  2.75it/s]
  0%|          | 0/311 [00:00<?, ?it/s]


TRAIN LOSS: 0.20893481794076088


100%|██████████| 311/311 [00:10<00:00, 28.91it/s]
  0%|          | 0/156 [00:00<?, ?it/s]


VAL LOSS: 0.1878037149523807

Epoch 3/5
--------------------


100%|██████████| 156/156 [00:58<00:00,  2.65it/s]
  0%|          | 0/311 [00:00<?, ?it/s]


TRAIN LOSS: 0.20827965853879085


100%|██████████| 311/311 [00:10<00:00, 29.30it/s]
  0%|          | 0/156 [00:00<?, ?it/s]


VAL LOSS: 0.1878037149523807

Epoch 4/5
--------------------


100%|██████████| 156/156 [00:58<00:00,  2.65it/s]
  0%|          | 0/311 [00:00<?, ?it/s]


TRAIN LOSS: 0.20856091547279787


100%|██████████| 311/311 [00:10<00:00, 29.19it/s]
  0%|          | 0/156 [00:00<?, ?it/s]


VAL LOSS: 0.1878037149523807

Epoch 5/5
--------------------


100%|██████████| 156/156 [00:58<00:00,  2.65it/s]
  0%|          | 0/311 [00:00<?, ?it/s]


TRAIN LOSS: 0.2091262170519584


100%|██████████| 311/311 [00:10<00:00, 28.89it/s]


VAL LOSS: 0.1878037149523807





In [24]:
best_loss

0.1878037149523807

## Inferencia

In [25]:
sentence = "who directed the film Blade Runner 2047"

tokenized_sentence = tokenizer(sentence, add_special_tokens=False)
sentence = sentence.split()

In [26]:
test = EntityDataset(
    texts=pd.Series([sentence]), 
    entities=pd.Series([[0] * len(sentence)]), 
)

In [27]:
tokenized_sentence

{'input_ids': [1150, 2002, 1103, 1273, 17360, 11204, 21355, 1559], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1]}

In [74]:
with torch.no_grad():
    data = test[0]
    for k, v in data.items():
        data[k] = v.to(device).unsqueeze(0)
    entities, _ = model(**data)
    idx = entities.argmax(2).cpu().numpy().reshape(-1)[:len(tokenized_sentence['input_ids'])]
    
    print([idx2entity[i] for i in idx])

['I-tag', '0', '0', '0', '0', 'B-movie', 'I-movie', 'I-movie']


## Performance

In [55]:
def get_predictions(model, data_loader):
    model = model.eval()

    predictions = []
    real_values = []

    with torch.no_grad():
        for data in data_loader:
            for k, v in data.items():
                data[k] = v.to(device)

            entitites, _ = model(**data)
            predictions.append(entities.argmax(2).cpu().numpy().reshape(-1))
            
            real_values.append(data["target_entities"].cpu().numpy().reshape(-1))

    predictions = [a.squeeze().tolist() for a in predictions]
    real_values = [a.squeeze().tolist() for a in real_values]
    return predictions, real_values

In [56]:
y_pred, y_test = get_predictions(model, test_data_loader)

In [69]:
len(max(y_pred, key= lambda x: len(x)))

64

In [72]:
y_pred_extended = [pred for i in y_pred for pred in i]
y_test_extended = [pred for i in y_test for pred in i[:64]]

In [73]:
print(classification_report(y_pred_extended, y_test_extended))

              precision    recall  f1-score   support

           0       0.03      0.75      0.05       622
           1       0.00      0.00      0.00         0
           2       0.00      0.00      0.00         0
           3       0.00      0.00      0.00         0
           4       0.68      0.08      0.14     13995
           5       0.00      0.00      0.00         0
           6       0.00      0.00      0.00         0
           7       0.00      0.00      0.00         0
           8       0.00      0.00      0.00         0
           9       0.11      0.02      0.04      1866
          10       0.43      0.06      0.10      3421

    accuracy                           0.09     19904
   macro avg       0.11      0.08      0.03     19904
weighted avg       0.56      0.09      0.12     19904



  _warn_prf(average, modifier, msg_start, len(result))
