In [67]:
import platform
import numpy as np
import pandas as pd
import random

import torch 
from torch import optim 
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from torch import nn

from tqdm.notebook import tqdm
from transformers import AutoTokenizer

# enable tqdm in pandas
tqdm.pandas()

# select device
if torch.cuda.is_available():
    device = torch.device('cuda')
elif 'arm64' in platform.platform():
    device = torch.device('mps') # 'mps'
else:
    device = torch.device('cpu')
print(f'device: {device.type}') 

# random seed
seed = 1234

# pytorch ignores this label in the loss
ignore_index = -100

# set random seed
if seed is not None:
    print(f'random seed: {seed}')
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)

# which transformer to use
transformer_name = "bert-base-cased" # 'xlm-roberta-base' # 'distilbert-base-cased'
tokenizer = AutoTokenizer.from_pretrained(transformer_name)

device: mps
random seed: 1234


In [68]:

# map labels to the first token in each word
def align_labels(word_ids, labels, label_to_index):
    label_ids = []
    previous_word_id = None
    for word_id in word_ids:
        if word_id is None or word_id == previous_word_id:
            # ignore if not a word or word id has already been seen
            label_ids.append(ignore_index)
        else:
            # get label id for corresponding word
            label_id = label_to_index[labels[word_id]]
            label_ids.append(label_id)
        # remember this word id
        previous_word_id = word_id
    
    return label_ids
            
# build a set of labels in the dataset            
def read_label_set(fn):
    labels = set()
    with open(fn) as f:
        for index, line in enumerate(f):
            line = line.strip()
            tokens = line.split()
            if tokens != []:
                label = tokens[-1]
                labels.add(label)
    return labels

# converts a two-column file in the basic MTL format ("word \t label") into a dataframe
def read_dataframe(fn, label_to_index, task_id):
    # now build the actual dataframe for this dataset
    data = {'words': [], 'str_labels': [], 'input_ids': [], 'word_ids': [], 'labels': [], 'task_ids': []}
    with open(fn) as f:
        sent_words = []
        sent_labels = [] 
        for index, line in tqdm(enumerate(f)):
            line = line.strip()
            tokens = line.split()
            if tokens == []:
                data['words'].append(sent_words)
                data['str_labels'].append(sent_labels)
                
                # tokenize each sentence
                token_input = tokenizer(sent_words, is_split_into_words = True)  
                token_ids = token_input['input_ids']
                word_ids = token_input.word_ids(batch_index = 0)
                
                # map labels to the first token in each word
                token_labels = align_labels(word_ids, sent_labels, label_to_index)
                
                data['input_ids'].append(token_ids)
                data['word_ids'].append(word_ids)
                data['labels'].append(token_labels)
                data['task_ids'].append(task_id)
                sent_words = []
                sent_labels = [] 
            else:
                sent_words.append(tokens[0])
                sent_labels.append(tokens[1])
    return pd.DataFrame(data)


In [69]:
class Task():
    def __init__(self, task_id, train_file_name, dev_file_name, test_file_name):
        self.task_id = task_id
        # we need an index of labels first
        self.labels = read_label_set(train_file_name)
        self.index_to_label = {i:t for i,t in enumerate(self.labels)}
        self.label_to_index = {t:i for i,t in enumerate(self.labels)}
        self.num_labels = len(self.index_to_label)
        # create data frames for the datasets
        self.train_df = read_dataframe(train_file_name, self.label_to_index, self.task_id)
        self.dev_df = read_dataframe(dev_file_name, self.label_to_index, self.task_id)
        self.test_df = read_dataframe(test_file_name, self.label_to_index, self.task_id)
                

In [70]:
ner_task = Task(0, "data/conll-ner/train.txt", "data/conll-ner/dev.txt", "data/conll-ner/test.txt")
pos_task = Task(1, "data/pos/train.txt", "data/pos/dev.txt", "data/pos/test.txt")

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

In [71]:
ner_task.train_df

Unnamed: 0,words,str_labels,input_ids,word_ids,labels,task_ids
0,[-DOCSTART-],[O],"[101, 118, 141, 9244, 9272, 12426, 1942, 118, ...","[None, 0, 0, 0, 0, 0, 0, 0, None]","[-100, 4, -100, -100, -100, -100, -100, -100, ...",0
1,"[EU, rejects, German, call, to, boycott, Briti...","[B-ORG, O, B-MISC, O, O, O, B-MISC, O, O]","[101, 7270, 22961, 1528, 1840, 1106, 21423, 14...","[None, 0, 1, 2, 3, 4, 5, 6, 7, 7, 8, None]","[-100, 1, 4, 8, 4, 4, 4, 8, 4, -100, 4, -100]",0
2,"[Peter, Blackburn]","[B-PER, I-PER]","[101, 1943, 14428, 102]","[None, 0, 1, None]","[-100, 3, 0, -100]",0
3,"[BRUSSELS, 1996-08-22]","[B-LOC, O]","[101, 26660, 13329, 12649, 15928, 1820, 118, 4...","[None, 0, 0, 0, 0, 1, 1, 1, 1, 1, None]","[-100, 6, -100, -100, -100, 4, -100, -100, -10...",0
4,"[The, European, Commission, said, on, Thursday...","[O, B-ORG, I-ORG, O, O, O, O, O, O, B-MISC, O,...","[101, 1109, 1735, 2827, 1163, 1113, 9170, 1122...","[None, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 1...","[-100, 4, 1, 7, 4, 4, 4, 4, 4, 4, 8, 4, 4, 4, ...",0
...,...,...,...,...,...,...
14982,"[on, Friday, :]","[O, O, O]","[101, 1113, 5286, 131, 102]","[None, 0, 1, 2, None]","[-100, 4, 4, 4, -100]",0
14983,"[Division, two]","[O, O]","[101, 1784, 1160, 102]","[None, 0, 1, None]","[-100, 4, 4, -100]",0
14984,"[Plymouth, 2, Preston, 1]","[B-ORG, O, B-ORG, O]","[101, 10033, 123, 8083, 122, 102]","[None, 0, 1, 2, 3, None]","[-100, 1, 4, 1, 4, -100]",0
14985,"[Division, three]","[O, O]","[101, 1784, 1210, 102]","[None, 0, 1, None]","[-100, 4, 4, -100]",0


In [72]:
pos_task.train_df

Unnamed: 0,words,str_labels,input_ids,word_ids,labels,task_ids
0,"[Pierre, Vinken, ,, 61, years, old, ,, will, j...","[NNP, NNP, ,, CD, NNS, JJ, ,, MD, VB, DT, NN, ...","[101, 4855, 25354, 6378, 117, 5391, 1201, 1385...","[None, 0, 1, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11...","[-100, 9, 9, -100, 39, 21, 0, 30, 39, 20, 10, ...",1
1,"[Mr., Vinken, is, chairman, of, Elsevier, N.V....","[NNP, NNP, VBZ, NN, IN, NNP, NNP, ,, DT, NNP, ...","[101, 1828, 119, 25354, 6378, 1110, 3931, 1104...","[None, 0, 0, 1, 1, 2, 3, 4, 5, 5, 5, 6, 6, 6, ...","[-100, 9, -100, 9, -100, 33, 37, 5, 9, -100, -...",1
2,"[Rudolph, Agnew, ,, 55, years, old, and, forme...","[NNP, NNP, ,, CD, NNS, JJ, CC, JJ, NN, IN, NNP...","[101, 19922, 138, 8376, 2246, 117, 3731, 1201,...","[None, 0, 1, 1, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10,...","[-100, 9, 9, -100, -100, 39, 21, 0, 30, 19, 30...",1
3,"[A, form, of, asbestos, once, used, to, make, ...","[DT, NN, IN, NN, RB, VBN, TO, VB, NNP, NN, NNS...","[101, 138, 1532, 1104, 1112, 12866, 11990, 151...","[None, 0, 1, 2, 3, 3, 3, 4, 5, 6, 7, 8, 9, 10,...","[-100, 28, 37, 5, 37, -100, -100, 17, 26, 2, 1...",1
4,"[The, asbestos, fiber, ,, crocidolite, ,, is, ...","[DT, NN, NN, ,, NN, ,, VBZ, RB, JJ, IN, PRP, V...","[101, 1109, 1112, 12866, 11990, 12753, 117, 17...","[None, 0, 1, 1, 1, 2, 3, 4, 4, 4, 4, 4, 5, 6, ...","[-100, 28, 37, -100, -100, 37, 39, 37, -100, -...",1
...,...,...,...,...,...,...
46787,"[Says, Peter, Mokaba, ,, president, of, the, S...","[VBZ, NNP, NNP, ,, NN, IN, DT, NNP, NNP, NNP, ...","[101, 8652, 1116, 1943, 12556, 1968, 2822, 117...","[None, 0, 0, 1, 2, 2, 2, 3, 4, 5, 6, 7, 8, 9, ...","[-100, 33, -100, 9, 9, -100, -100, 39, 37, 5, ...",1
46788,"[They, never, considered, themselves, to, be, ...","[PRP, RB, VBD, PRP, TO, VB, NN, RB, .]","[101, 1220, 1309, 1737, 2310, 1106, 1129, 1625...","[None, 0, 1, 2, 3, 4, 5, 6, 7, 8, None]","[-100, 43, 17, 27, 43, 2, 10, 37, 17, 18, -100]",1
46789,"[At, last, night, 's, rally, ,, they, called, ...","[IN, JJ, NN, POS, NN, ,, PRP, VBD, IN, PRP$, N...","[101, 1335, 1314, 1480, 112, 188, 11158, 117, ...","[None, 0, 1, 2, 3, 3, 4, 5, 6, 7, 8, 9, 10, 11...","[-100, 5, 30, 37, 23, -100, 37, 39, 43, 27, 5,...",1
46790,"[``, We, emphasize, discipline, because, we, k...","[``, PRP, VBP, NN, IN, PRP, VBP, IN, DT, NN, V...","[101, 169, 169, 1284, 19291, 9360, 1272, 1195,...","[None, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11...","[-100, 38, -100, 43, 16, 37, 5, 43, 16, 5, 28,...",1


In [73]:
from transformers.modeling_outputs import TokenClassifierOutput
from transformers import PreTrainedModel
from transformers import AutoConfig, AutoModel

# This class is adapted from: https://towardsdatascience.com/how-to-create-and-train-a-multi-task-transformer-model-18c54a146240
class TokenClassificationModel(PreTrainedModel):    
    def __init__(self, config, tasks):
        super().__init__(config)
        self.encoder = AutoModel.from_pretrained(transformer_name, config=config)

        self.output_heads = nn.ModuleDict()
        for task in tasks:
            head = TokenClassificationHead(self.encoder.config.hidden_size, task.num_labels, config.hidden_dropout_prob)
            # ModuleDict requires keys to be strings
            self.output_heads[str(task.task_id)] = head
        
        self._init_weights()
        
    def _init_weights(self):
        for task_id in self.output_heads:
            self.output_heads[task_id]._init_weights()
        
    def forward(self, input_ids=None, attention_mask=None, token_type_ids=None, labels=None, task_ids=None, **kwargs):
        outputs = self.encoder(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            **kwargs,
        )
        sequence_output = outputs[0]
        
        print(f'batch size = {len(input_ids)}')
        print(f'task_ids in this batch: {task_ids}')
        
        unique_task_ids_list = torch.unique(task_ids).tolist()
        for unique_task_id in unique_task_ids_list:
            task_id_filter = task_ids == unique_task_id
            filtered_sequence_output = sequence_output[task_id_filter]
            print(f'size of batch for task {unique_task_id} is: {len(filtered_sequence_output)}')
                    
        logits = None
        loss = None
        # TODO: add heads here
        return TokenClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )

class TokenClassificationHead(nn.Module):
    def __init__(self, hidden_size, num_labels, dropout_p=0.1):
        super().__init__()
        self.dropout = nn.Dropout(dropout_p)
        self.classifier = nn.Linear(hidden_size, num_labels)
        self.num_labels = num_labels

        self._init_weights()

    def _init_weights(self):
        self.classifier.weight.data.normal_(mean=0.0, std=0.02)
        if self.classifier.bias is not None:
            self.classifier.bias.data.zero_()

    def forward(self, sequence_output, pooled_output, labels=None, attention_mask=None, **kwargs):
        sequence_output_dropout = self.dropout(sequence_output)
        logits = self.classifier(sequence_output_dropout)
        
        loss = None
        if labels is not None:
            loss_fn = nn.CrossEntropyLoss()            
            inputs = logits.view(-1, self.num_labels)
            targets = labels.view(-1)
            loss = loss_fn(inputs, targets)

        return logits, loss

In [74]:
tasks = [ner_task, pos_task]
config = AutoConfig.from_pretrained(transformer_name)
model = TokenClassificationModel(config, tasks)


In [75]:
from transformers import TrainingArguments

num_epochs = 4
batch_size = 128
weight_decay = 0.01
model_name = f'{transformer_name}-ner'

use_mps_device = True if str(device) == 'mps' else False

training_args = TrainingArguments(
    output_dir=model_name,
    log_level='error',
    num_train_epochs=num_epochs,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    evaluation_strategy='epoch',
    weight_decay=weight_decay,
    use_mps_device = use_mps_device
)

training_args

TrainingArguments(
_n_gpu=1,
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
auto_find_batch_size=False,
bf16=False,
bf16_full_eval=False,
data_seed=None,
dataloader_drop_last=False,
dataloader_num_workers=0,
dataloader_pin_memory=True,
ddp_bucket_cap_mb=None,
ddp_find_unused_parameters=None,
ddp_timeout=1800,
debug=[],
deepspeed=None,
disable_tqdm=True,
do_eval=True,
do_predict=False,
do_train=False,
eval_accumulation_steps=None,
eval_delay=0,
eval_steps=None,
evaluation_strategy=epoch,
fp16=False,
fp16_backend=auto,
fp16_full_eval=False,
fp16_opt_level=O1,
fsdp=[],
fsdp_min_num_params=0,
fsdp_transformer_layer_cls_to_wrap=None,
full_determinism=False,
gradient_accumulation_steps=1,
gradient_checkpointing=False,
greater_is_better=None,
group_by_length=False,
half_precision_backend=auto,
hub_model_id=None,
hub_private_repo=False,
hub_strategy=every_save,
hub_token=<HUB_TOKEN>,
ignore_data_skip=False,
include_inputs_for_metrics=False,
jit_mode_eval=False,
label_na

In [76]:
from sklearn.metrics import accuracy_score

def compute_metrics(eval_pred):
    # gold labels
    label_ids = eval_pred.label_ids
    # predictions
    pred_ids = np.argmax(eval_pred.predictions, axis=-1)
    # collect gold and predicted labels, ignoring ignore_index label
    y_true, y_pred = [], []
    batch_size, seq_len = pred_ids.shape
    for i in range(batch_size):
        for j in range(seq_len):
            if label_ids[i, j] != ignore_index:
                y_true.append(index_to_label[label_ids[i][j]])
                y_pred.append(index_to_label[pred_ids[i][j]])
    # return computed metrics
    return {'accuracy': accuracy_score(y_true, y_pred)}

In [77]:
from datasets import Dataset, DatasetDict

ds = DatasetDict()
ds['train'] = Dataset.from_pandas(pd.concat([ner_task.train_df, pos_task.train_df]))
ds['validation'] = Dataset.from_pandas(pd.concat([ner_task.dev_df, pos_task.dev_df]))
ds['test'] = Dataset.from_pandas(pd.concat([ner_task.test_df, pos_task.test_df]))

# these are no longer needed; discard them to save memory
ner_task.train_df = None
ner_task.dev_df = None
pos_task.train_df = None
pos_task.dev_df = None

ds

DatasetDict({
    train: Dataset({
        features: ['words', 'str_labels', 'input_ids', 'word_ids', 'labels', 'task_ids', '__index_level_0__'],
        num_rows: 61779
    })
    validation: Dataset({
        features: ['words', 'str_labels', 'input_ids', 'word_ids', 'labels', 'task_ids', '__index_level_0__'],
        num_rows: 8504
    })
    test: Dataset({
        features: ['words', 'str_labels', 'input_ids', 'word_ids', 'labels', 'task_ids', '__index_level_0__'],
        num_rows: 6101
    })
})

In [78]:
from transformers import Trainer
from transformers import DataCollatorForTokenClassification
from datetime import datetime

data_collator = DataCollatorForTokenClassification(tokenizer)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    train_dataset=ds['train'],
    eval_dataset=ds['validation'],
    tokenizer=tokenizer,
)

now = datetime.now()
print("Current Time =", now.strftime("%H:%M:%S"))

trainer.train()

now = datetime.now()
print("Current Time =", now.strftime("%H:%M:%S"))

Current Time = 14:55:20




batch size = 128
task_ids in this batch: tensor([1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0,
        1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1,
        1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1,
        1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1,
        0, 1, 1, 1, 1, 1, 1, 0], device='mps:0')


NameError: name 'println' is not defined

In [47]:
output = trainer.predict(ds['test'])

In [48]:
from sklearn.metrics import classification_report

num_labels = model.num_labels
label_ids = output.label_ids.reshape(-1)
predictions = output.predictions.reshape(-1, num_labels)
predictions = np.argmax(predictions, axis=-1)
mask = label_ids != ignore_index

y_true = label_ids[mask]
y_pred = predictions[mask]
target_names = [index_to_label.get(ele, "") for ele in range(num_labels)]
print(target_names)

report = classification_report(
    y_true, y_pred,
    target_names=target_names
)
print(report)

['B-MISC', 'O', 'I-PER', 'B-ORG', 'B-PER', 'I-LOC', 'I-MISC', 'B-LOC', 'I-ORG']
              precision    recall  f1-score   support

      B-MISC       0.82      0.84      0.83       702
           O       1.00      0.99      1.00     38554
       I-PER       0.99      0.98      0.99      1156
       B-ORG       0.90      0.92      0.91      1661
       B-PER       0.97      0.95      0.96      1617
       I-LOC       0.87      0.92      0.90       257
      I-MISC       0.65      0.78      0.71       216
       B-LOC       0.94      0.94      0.94      1668
       I-ORG       0.88      0.92      0.90       835

    accuracy                           0.98     46666
   macro avg       0.89      0.92      0.90     46666
weighted avg       0.98      0.98      0.98     46666



In [49]:
f = open("test_out.txt", "w")
for i in range(0, len(y_true)):
    f.write(f"X {index_to_label.get(y_true[i])} {index_to_label.get(y_pred[i])}\n")
f.close()