In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np
import random

from transformers import AutoTokenizer, AutoModelForTokenClassification

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

In [3]:
OG_LABELS = ['OBJECTIVE', 'NEGATIVE_EFFECT', 'AGENT', 'VICTIM', 'FACILITATOR', 'CAMPAIGNER']

BIO_TAGS = ["O"] + ["B-" + label for label in OG_LABELS] + ["I-" + label for label in OG_LABELS]

id2tag = {i: tag for i, tag in enumerate(BIO_TAGS)}
tag2id = {tag: i for i, tag in enumerate(BIO_TAGS)}

#Add PAD token in the dictionary with the index -100
tag2id["<PAD>"] = -100
id2tag[-100] = "<PAD>"

print(tag2id)

{'O': 0, 'B-OBJECTIVE': 1, 'B-NEGATIVE_EFFECT': 2, 'B-AGENT': 3, 'B-VICTIM': 4, 'B-FACILITATOR': 5, 'B-CAMPAIGNER': 6, 'I-OBJECTIVE': 7, 'I-NEGATIVE_EFFECT': 8, 'I-AGENT': 9, 'I-VICTIM': 10, 'I-FACILITATOR': 11, 'I-CAMPAIGNER': 12, '<PAD>': -100}


In [4]:
import json

def read_data(file):
    with open(file, 'r') as f:
        data = json.load(f)
    return data

def alig_labels_with_tokens(labels, word_ids):
    new_labels = []
    current_word = None
    for word_id in word_ids:
        if word_id != current_word:
            current_word = word_id
            label = -100 if word_id == None else labels[word_id]
            new_labels.append(label)
            
        elif word_id == None:
            new_labels.append(-100)
            
        else:
            label = labels[word_id]
            
            if label%2 == 1:
                label = label
            new_labels.append(label)
        
    return new_labels

def get_labels(data):
    labels = []
    for d in data:
        words = d['text'].split()
        label = ['O'] * len(words)
        for anotation in d['annotations']:
            if anotation['category'] in OG_LABELS:
                category = anotation['category']
                start = anotation['start_spacy_token']
                end = anotation['end_spacy_token']
                if start < len(words):
                    label[start] = 'B-' + category
                if end - start > 1 and end < len(words):
                    for i in range(start + 1, end):
                        label[i] = 'I-' + category
        labels.append(label)
        
    return labels

# def get_labels(data):
#     labels = []
#     for d in data:
#         words = d['text']
#         label = ['O'] * len(d['text'])
#         for a in d['annotations']:
#             if a['category'] in OG_LABELS:
#                 category = a['category']
#                 if category not in OG_LABELS:
#                     continue
#                 start = a['start_char']
#                 end = a['end_char']
#                 prev = words[:start].split()
#                 t = words[start:end].split()
#                 for i in range(len(prev), len(prev) + len(t)):
#                     if i == len(prev):
#                         label[i] = 'B-' + category
#                     else:
#                         label[i] = 'I-' + category
#             labels.append(label)
            
#     return labels

class Dataset_en(torch.utils.data.Dataset):
    def __init__(self, path, tokenizer):
        self.data = read_data(path)
        self.tokenizer = tokenizer
        self.max_len = 512
        
        self.texts = [d['text'] for d in self.data]
        
        self.labels = get_labels(self.data)        
        
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        text = self.texts[idx].split()
        label = self.labels[idx]
        tokens = self.tokenizer(text, is_split_into_words=True, truncation=True, max_length=self.max_len)
        #tokens = self.tokenizer.tokenize(text, add_special_tokens=True, max_length=self.max_len, truncation=True)
        label = [tag2id[l] for l in label]
        
        new_label = alig_labels_with_tokens(label, tokens.word_ids())
        
        input_ids = torch.tensor(tokens.input_ids)
        attention_mask = torch.tensor(tokens.attention_mask)
        
        #Add padding to the labels
        padding_len = self.max_len - len(new_label)
        new_label = new_label + [-100]*padding_len
        
        #Add padding to the input_ids and attention_mask
        padding_len = self.max_len - len(input_ids)
        input_ids = torch.cat((input_ids, torch.tensor([0]*padding_len, dtype=torch.long)))
        attention_mask = torch.cat((attention_mask, torch.tensor([0]*padding_len, dtype=torch.long)))
        
        return input_ids, attention_mask, torch.tensor(new_label)
        
        
        

In [5]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [6]:
dataset = Dataset_en('../data/dataset_en_train.json', tokenizer)

train_dataset, val_dataset = torch.utils.data.random_split(dataset, [int(0.8 * len(dataset)), len(dataset) - int(0.8 * len(dataset))], generator=torch.Generator().manual_seed(42))

print(len(train_dataset), len(val_dataset))

3200 800


In [7]:
dataset[0]

(tensor([  101,  2023,  2003,  5294,  2827,  5205,  8861,  7031, 14451,  2015,
         28991, 15007,  2179,  1999,  1996,  2522, 17258,  1520, 28896,  1521,
          1998,  2758,  2027,  2024, 14052,  1012,  2002,  2003,  1996,  2034,
          3761,  2000, 14451,  2023,   999,  3745, 16770,  1024,  1013,  1013,
          1056,  1012,  2033,  1013,  6874,  7875, 23518,  4819, 11877,   102,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,  

In [8]:
sequence_classifier = AutoModelForTokenClassification.from_pretrained("bert-base-uncased", num_labels=len(BIO_TAGS))

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [9]:
#test the model
input_ids, attention_mask, labels = dataset[0]
input_ids = input_ids.unsqueeze(0)
attention_mask = attention_mask.unsqueeze(0)
labels = labels.unsqueeze(0)

output = sequence_classifier(input_ids, attention_mask=attention_mask, labels=labels)

print(output)

TokenClassifierOutput(loss=tensor(2.7321, grad_fn=<NllLossBackward0>), logits=tensor([[[ 0.2842,  0.3092,  0.2819,  ..., -0.0835,  0.0566, -0.2003],
         [ 0.0285,  0.1695, -0.1788,  ..., -0.1107, -0.1965, -0.1904],
         [-0.0779,  0.0577,  0.1549,  ..., -0.0242, -0.1806, -0.0449],
         ...,
         [ 0.4117,  0.0139,  0.0044,  ...,  0.0460, -0.0512, -0.2815],
         [ 0.2250, -0.3237,  0.2713,  ..., -0.2005,  0.1521, -0.3711],
         [ 0.2023, -0.3613,  0.1973,  ..., -0.3404,  0.1569, -0.4396]]],
       grad_fn=<ViewBackward0>), hidden_states=None, attentions=None)


In [10]:
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=8, shuffle=True)
val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=8, shuffle=False)

In [11]:
optimizer = optim.Adam(sequence_classifier.parameters(), lr=1e-5)

In [12]:
#Training loop save the model with the best validation loss

from tqdm import tqdm

def train(model, train_loader, val_loader, optimizer, epochs, device):
    model.to(device)
    best_val_loss = float('inf')
    for epoch in range(epochs):
        model.train()
        train_loss = 0.0
        for input_ids, attention_mask, labels in tqdm(train_loader):
            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)
            labels = labels.to(device)
            
            optimizer.zero_grad()
            output = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = output.loss
            loss.backward()
            optimizer.step()
            
            train_loss += loss.item()
        
        train_loss /= len(train_loader)
        
        model.eval()
        val_loss = 0.0
        with torch.no_grad():
            for input_ids, attention_mask, labels in tqdm(val_loader):
                input_ids = input_ids.to(device)
                attention_mask = attention_mask.to(device)
                labels = labels.to(device)
                
                output = model(input_ids, attention_mask=attention_mask, labels=labels)
                loss = output.loss
                
                val_loss += loss.item()
                
        val_loss /= len(val_loader)
        
        print(f"Epoch: {epoch+1}/{epochs} Train Loss: {train_loss} Val Loss: {val_loss}")
        
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            torch.save(model.state_dict(), "model.pth")
            print("Model saved")

In [13]:
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
EPOCHS = 10

train(sequence_classifier, train_loader, val_loader, optimizer, EPOCHS, DEVICE)

100%|██████████| 400/400 [03:19<00:00,  2.00it/s]
100%|██████████| 100/100 [00:17<00:00,  5.82it/s]


Epoch: 1/10 Train Loss: 0.9566751609742642 Val Loss: 0.7397705042362213
Model saved


100%|██████████| 400/400 [03:33<00:00,  1.87it/s]
100%|██████████| 100/100 [00:18<00:00,  5.34it/s]


Epoch: 2/10 Train Loss: 0.6758533646166325 Val Loss: 0.6585752719640732
Model saved


100%|██████████| 400/400 [03:40<00:00,  1.82it/s]
100%|██████████| 100/100 [00:18<00:00,  5.38it/s]


Epoch: 3/10 Train Loss: 0.5633992763236165 Val Loss: 0.6318622589111328
Model saved


100%|██████████| 400/400 [03:39<00:00,  1.82it/s]
100%|██████████| 100/100 [00:18<00:00,  5.39it/s]


Epoch: 4/10 Train Loss: 0.47351330481469633 Val Loss: 0.6200276619195938
Model saved


100%|██████████| 400/400 [03:34<00:00,  1.86it/s]
100%|██████████| 100/100 [00:18<00:00,  5.50it/s]


Epoch: 5/10 Train Loss: 0.40428358428180217 Val Loss: 0.6377912746369838


100%|██████████| 400/400 [03:34<00:00,  1.87it/s]
100%|██████████| 100/100 [00:18<00:00,  5.48it/s]


Epoch: 6/10 Train Loss: 0.34821637902408836 Val Loss: 0.6798687529563904


100%|██████████| 400/400 [03:33<00:00,  1.87it/s]
100%|██████████| 100/100 [00:18<00:00,  5.53it/s]


Epoch: 7/10 Train Loss: 0.30441058535128834 Val Loss: 0.6788625597953797


100%|██████████| 400/400 [03:33<00:00,  1.87it/s]
100%|██████████| 100/100 [00:18<00:00,  5.51it/s]


Epoch: 8/10 Train Loss: 0.26470691822469233 Val Loss: 0.7161314772069454


100%|██████████| 400/400 [03:33<00:00,  1.87it/s]
100%|██████████| 100/100 [00:18<00:00,  5.52it/s]


Epoch: 9/10 Train Loss: 0.233928245883435 Val Loss: 0.755239574611187


 72%|███████▏  | 288/400 [02:34<00:59,  1.87it/s]


KeyboardInterrupt: 

In [16]:
sequence_classifier = AutoModelForTokenClassification.from_pretrained("bert-base-uncased", num_labels=len(BIO_TAGS))
sequence_classifier.load_state_dict(torch.load("model.pth"))
sequence_classifier.eval()

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForTokenClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, el

In [17]:
from seqeval.metrics import classification_report

DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

def predict(model, data, device):
    model.to(device)
    model.eval()
    predictions = []
    true_labels = []
    with torch.no_grad():
        for input_ids, attention_mask, labels in tqdm(data):
            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)
            labels = labels.to(device)
            
            output = model(input_ids, attention_mask=attention_mask)
            logits = output.logits
            predictions.extend(torch.argmax(logits, 2).tolist())
            true_labels.extend(labels.tolist())
            
    return predictions, true_labels

pred_labels, true_labels = predict(sequence_classifier, val_loader, DEVICE)

100%|██████████| 100/100 [00:16<00:00,  6.05it/s]


In [18]:
print(true_labels[1])
print(pred_labels[1])

[-100, 3, 9, 9, 9, 9, 9, 9, 9, 0, 0, 0, 0, 0, 0, 0, 1, 7, 4, 7, 7, 7, 7, 0, 3, 9, 9, 9, 9, 9, 9, 0, 0, 2, 8, 8, 8, 8, 8, 8, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 10, 10, 10, 10, 10, 10, 10, 10, 10, 0, 0, 0, 0, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -10

In [19]:
cleaned_true_labels = []
cleaned_pred_labels = []

for i in range(len(true_labels)):
    aux_true = []
    aux_pred = []
    for j in range(len(true_labels[i])):
        if true_labels[i][j] != -100:
            aux_true.append(id2tag[true_labels[i][j]])
            aux_pred.append(id2tag[pred_labels[i][j]])
    cleaned_true_labels.append(aux_true)
    cleaned_pred_labels.append(aux_pred)

In [20]:
print(cleaned_true_labels[1])
print(cleaned_pred_labels[1])

print(len(cleaned_true_labels[1]))
print(len(cleaned_pred_labels[1]))

['B-AGENT', 'I-AGENT', 'I-AGENT', 'I-AGENT', 'I-AGENT', 'I-AGENT', 'I-AGENT', 'I-AGENT', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-OBJECTIVE', 'I-OBJECTIVE', 'B-VICTIM', 'I-OBJECTIVE', 'I-OBJECTIVE', 'I-OBJECTIVE', 'I-OBJECTIVE', 'O', 'B-AGENT', 'I-AGENT', 'I-AGENT', 'I-AGENT', 'I-AGENT', 'I-AGENT', 'I-AGENT', 'O', 'O', 'B-NEGATIVE_EFFECT', 'I-NEGATIVE_EFFECT', 'I-NEGATIVE_EFFECT', 'I-NEGATIVE_EFFECT', 'I-NEGATIVE_EFFECT', 'I-NEGATIVE_EFFECT', 'I-NEGATIVE_EFFECT', 'I-NEGATIVE_EFFECT', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-VICTIM', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-VICTIM', 'I-VICTIM', 'I-VICTIM', 'I-VICTIM', 'I-VICTIM', 'I-VICTIM', 'I-VICTIM', 'I-VICTIM', 'I-VICTIM', 'I-VICTIM', 'O', 'O', 'O', 'O']
['B-AGENT', 'I-AGENT', 'I-AGENT', 'I-AGENT', 'I-AGENT', 'I-AGENT', 'I-AGENT', 'I-AGENT', 'O'

In [21]:
print(classification_report(cleaned_true_labels, cleaned_pred_labels))

                 precision    recall  f1-score   support

          AGENT       0.42      0.52      0.46      1504
     CAMPAIGNER       0.40      0.55      0.47      1738
    FACILITATOR       0.14      0.09      0.11       590
NEGATIVE_EFFECT       0.12      0.14      0.13      1252
      OBJECTIVE       0.09      0.08      0.08       379
         VICTIM       0.28      0.41      0.33       750

      micro avg       0.31      0.37      0.34      6213
      macro avg       0.24      0.30      0.26      6213
   weighted avg       0.29      0.37      0.33      6213

