In [64]:
import platform
import numpy as np
import pandas as pd
import random

import torch 
from torch import optim
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from torch import nn

from tqdm.notebook import tqdm
from transformers import AutoTokenizer

# enable tqdm in pandas
tqdm.pandas()

# select device
if torch.cuda.is_available():
    device = torch.device('cuda')
elif 'arm64' in platform.platform():
    device = torch.device('cpu') # 'mps'
else:
    device = torch.device('cpu')
print(f'device: {device.type}') 

# random seed
seed = 1234

# set random seed
if seed is not None:
    print(f'random seed: {seed}')
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)

# which transformer to use
transformer_name =  'xlm-roberta-base' # 'distilbert-base-cased'
tokenizer = AutoTokenizer.from_pretrained(transformer_name)

device: cpu
random seed: 1234


In [65]:

# map labels to the first token in each word
def align_labels(word_ids, labels, label_to_index):
    # default value for CrossEntropyLoss ignore_index parameter
    ignore_index = -100
    
    label_ids = []
    previous_word_id = None
    for word_id in word_ids:
        if word_id is None or word_id == previous_word_id:
            # ignore if not a word or word id has already been seen
            label_ids.append(ignore_index)
        else:
            # get label id for corresponding word
            label_id = label_to_index[labels[word_id]]
            label_ids.append(label_id)
        # remember this word id
        previous_word_id = word_id
    
    return label_ids
            
# build a set of labels in the dataset            
def read_label_set(fn):
    labels = set()
    with open(fn) as f:
        for index, line in enumerate(f):
            line = line.strip()
            tokens = line.split()
            if tokens != []:
                label = tokens[-1]
                labels.add(label)
    return labels

# converts a two-column file in the basic MTL format ("word \t label") into a dataframe
def read_dataframe(fn, label_to_index):
    # now build the actual dataframe for this dataset
    data = {'words': [], 'labels': [], 'token_ids': [], 'word_ids': [], 'token_labels': []}
    with open(fn) as f:
        sent_words = []
        sent_labels = [] 
        for index, line in tqdm(enumerate(f)):
            line = line.strip()
            tokens = line.split()
            if tokens == []:
                data['words'].append(sent_words)
                data['labels'].append(sent_labels)
                
                # tokenize each sentence
                token_input = tokenizer(sent_words, is_split_into_words = True)  
                token_ids = token_input['input_ids']
                word_ids = token_input.word_ids(batch_index = 0)
                
                # map labels to the first token in each word
                token_labels = align_labels(word_ids, sent_labels, label_to_index)
                
                data['token_ids'].append(token_ids)
                data['word_ids'].append(word_ids)
                data['token_labels'].append(token_labels)
                sent_words = []
                sent_labels = [] 
            else:
                sent_words.append(tokens[0])
                sent_labels.append(tokens[1])
    return pd.DataFrame(data)


In [66]:
class MyDataset(Dataset):
    def __init__(self, x, y):
        self.x = x
        self.y = y
        
    def __len__(self):
        return len(self.y)
    
    def __getitem__(self, index):
        x = torch.tensor(self.x[index])
        y = torch.tensor(self.y[index])
        return x, y

In [67]:
# hyperparameters
lr = 1e-5
weight_decay = 1e-5
batch_size = 2
shuffle = True
n_epochs = 2
hidden_size = 100
dropout = 0.1

In [68]:
from torch.nn.utils.rnn import pad_sequence

def collate_fn(batch):
    # separate xs and ys
    xs, ys = zip(*batch)
    # get lengths
    lengths = [len(x) for x in xs]
    # pad sequences
    x_padded = pad_sequence(xs, batch_first=True, padding_value=0)
    y_padded = pad_sequence(ys, batch_first=True, padding_value=-100)
    # return padded
    return x_padded, y_padded, lengths

In [69]:
labels = read_label_set("data/conll-ner/train.txt")
index_to_label = {i:t for i,t in enumerate(labels)}
label_to_index = {t:i for i,t in enumerate(labels)}
print("index_to_label: ", index_to_label)

train_dataframe = read_dataframe("data/conll-ner/train.txt", label_to_index)
train_ds = MyDataset(train_dataframe['token_ids'], train_dataframe['token_labels'])
train_dl = DataLoader(train_ds, batch_size=batch_size, shuffle=shuffle, collate_fn=collate_fn)

dev_dataframe = read_dataframe("data/conll-ner/dev.txt", label_to_index)
dev_ds = MyDataset(dev_dataframe['token_ids'], dev_dataframe['token_labels'])
dev_dl = DataLoader(dev_ds, batch_size=batch_size, shuffle=shuffle, collate_fn=collate_fn)

index_to_label:  {0: 'B-MISC', 1: 'I-PER', 2: 'I-ORG', 3: 'O', 4: 'B-LOC', 5: 'B-ORG', 6: 'I-LOC', 7: 'I-MISC', 8: 'B-PER'}


0it [00:00, ?it/s]

0it [00:00, ?it/s]

In [70]:
from transformers.modeling_outputs import TokenClassifierOutput
from transformers.models.roberta.modeling_roberta import RobertaModel, RobertaPreTrainedModel

class XLMRobertaForTokenClassification(RobertaPreTrainedModel):    
    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels
        self.roberta = RobertaModel(config, add_pooling_layer=False)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)
        self.init_weights()
        
    def forward(self, input_ids=None, attention_mask=None, token_type_ids=None, labels=None, **kwargs):
        outputs = self.roberta(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            **kwargs,
        )
        sequence_output = self.dropout(outputs[0])
        logits = self.classifier(sequence_output)
        loss = None
        if labels is not None:
            loss_fn = nn.CrossEntropyLoss()
            inputs = logits.view(-1, self.num_labels)
            targets = labels.view(-1)
            loss = loss_fn(inputs, targets)
        return TokenClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )

In [71]:
from transformers import AutoConfig

config = AutoConfig.from_pretrained(
    transformer_name,
    num_labels=len(index_to_label),
)

model = (
    XLMRobertaForTokenClassification
    .from_pretrained(transformer_name, config=config)
)

Some weights of the model checkpoint at xlm-roberta-base were not used when initializing XLMRobertaForTokenClassification: ['lm_head.dense.weight', 'lm_head.decoder.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.bias', 'roberta.pooler.dense.bias', 'roberta.pooler.dense.weight', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing XLMRobertaForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.weight', 'classif

In [79]:
from torch import optim

optimizer = optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay)
loss_func = nn.CrossEntropyLoss()

for epoch in range(n_epochs):
    print(f'starting epoch {epoch}...')
    losses, acc = [], []
    model.train()
    for x, y_true, _ in train_dl:
        # clear gradients
        model.zero_grad()
        # send datum to right device
        x = x.to(device)
        y_true = y_true.to(device)
        
        # predict
        token_outputs = model(x)
        loss = token_outputs.loss
        logits = token_outputs.logits
        
        #print(logits)
        #print(y_true)
        loss = loss_func(logits, y_true)
        
        # backprop and optimize
        #loss.backward()
        #optimizer.step()
        
        # compute accuracy

starting epoch 0...


RuntimeError: Expected target size [2, 9], got [2, 56]