In [38]:
import platform
import numpy as np
import pandas as pd
import random

import torch 
from torch import optim 
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from torch import nn

from tqdm.notebook import tqdm
from transformers import AutoTokenizer

# enable tqdm in pandas
tqdm.pandas()

# select device
if torch.cuda.is_available():
    device = torch.device('cuda')
elif 'arm64' in platform.platform():
    device = torch.device('mps') # 'mps'
else:
    device = torch.device('cpu')
print(f'device: {device.type}') 

# random seed
seed = 1234

# pytorch ignores this label in the loss
ignore_index = -100

# set random seed
if seed is not None:
    print(f'random seed: {seed}')
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)

# which transformer to use
transformer_name = "bert-base-cased" # 'xlm-roberta-base' # 'distilbert-base-cased'
tokenizer = AutoTokenizer.from_pretrained(transformer_name)

device: mps
random seed: 1234


Downloading:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/213k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/436k [00:00<?, ?B/s]

In [39]:

# map labels to the first token in each word
def align_labels(word_ids, labels, label_to_index):
    label_ids = []
    previous_word_id = None
    for word_id in word_ids:
        if word_id is None or word_id == previous_word_id:
            # ignore if not a word or word id has already been seen
            label_ids.append(ignore_index)
        else:
            # get label id for corresponding word
            label_id = label_to_index[labels[word_id]]
            label_ids.append(label_id)
        # remember this word id
        previous_word_id = word_id
    
    return label_ids
            
# build a set of labels in the dataset            
def read_label_set(fn):
    labels = set()
    with open(fn) as f:
        for index, line in enumerate(f):
            line = line.strip()
            tokens = line.split()
            if tokens != []:
                label = tokens[-1]
                labels.add(label)
    return labels

# converts a two-column file in the basic MTL format ("word \t label") into a dataframe
def read_dataframe(fn, label_to_index):
    # now build the actual dataframe for this dataset
    data = {'words': [], 'str_labels': [], 'input_ids': [], 'word_ids': [], 'labels': []}
    with open(fn) as f:
        sent_words = []
        sent_labels = [] 
        for index, line in tqdm(enumerate(f)):
            line = line.strip()
            tokens = line.split()
            if tokens == []:
                data['words'].append(sent_words)
                data['str_labels'].append(sent_labels)
                
                # tokenize each sentence
                token_input = tokenizer(sent_words, is_split_into_words = True)  
                token_ids = token_input['input_ids']
                word_ids = token_input.word_ids(batch_index = 0)
                
                # map labels to the first token in each word
                token_labels = align_labels(word_ids, sent_labels, label_to_index)
                
                data['input_ids'].append(token_ids)
                data['word_ids'].append(word_ids)
                data['labels'].append(token_labels)
                sent_words = []
                sent_labels = [] 
            else:
                sent_words.append(tokens[0])
                sent_labels.append(tokens[1])
    return pd.DataFrame(data)


In [40]:
# we need an index of labels first
labels = read_label_set("data/conll-ner/train.txt")
index_to_label = {i:t for i,t in enumerate(labels)}
label_to_index = {t:i for i,t in enumerate(labels)}
print("index_to_label: ", index_to_label)

# create data frames for the datasets
train_df = read_dataframe("data/conll-ner/train.txt", label_to_index)
dev_df = read_dataframe("data/conll-ner/dev.txt", label_to_index)
test_df = read_dataframe("data/conll-ner/test.txt", label_to_index)



index_to_label:  {0: 'B-MISC', 1: 'O', 2: 'I-PER', 3: 'B-ORG', 4: 'B-PER', 5: 'I-LOC', 6: 'I-MISC', 7: 'B-LOC', 8: 'I-ORG'}


0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

In [41]:
from transformers.modeling_outputs import TokenClassifierOutput
from transformers import PreTrainedModel
from transformers import AutoConfig, AutoModel

class TokenClassificationModel(PreTrainedModel):    
    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels
        self.encoder = AutoModel.from_pretrained(transformer_name, config=config)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)
        self._init_weights()
        
    def _init_weights(self):
        self.classifier.weight.data.normal_(mean=0.0, std=0.02)
        # torch.nn.init.xavier_normal_(self.classifier.weight.data)
        if self.classifier.bias is not None:
            self.classifier.bias.data.zero_()
        
    def forward(self, input_ids=None, attention_mask=None, token_type_ids=None, labels=None, **kwargs):
        outputs = self.encoder(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            **kwargs,
        )
        sequence_output = self.dropout(outputs[0])
        logits = self.classifier(sequence_output)
        loss = None
        if labels is not None:
            loss_fn = nn.CrossEntropyLoss()
            inputs = logits.view(-1, self.num_labels)
            targets = labels.view(-1)
            loss = loss_fn(inputs, targets)
        return TokenClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )

In [42]:

config = AutoConfig.from_pretrained(
    transformer_name,
    num_labels=len(index_to_label),
)

model = TokenClassificationModel(config)


Downloading:   0%|          | 0.00/436M [00:00<?, ?B/s]

In [43]:
from transformers import TrainingArguments

num_epochs = 4
batch_size = 128
weight_decay = 0.01
model_name = f'{transformer_name}-ner'

no_cuda = True if str(device) == 'cpu' else False
use_mps_device = True if str(device) == 'mps' else False

training_args = TrainingArguments(
    output_dir=model_name,
    log_level='error',
    num_train_epochs=num_epochs,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    evaluation_strategy='epoch',
    weight_decay=weight_decay,
    no_cuda = no_cuda,
    use_mps_device = use_mps_device
)

training_args

TrainingArguments(
_n_gpu=1,
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
auto_find_batch_size=False,
bf16=False,
bf16_full_eval=False,
data_seed=None,
dataloader_drop_last=False,
dataloader_num_workers=0,
dataloader_pin_memory=True,
ddp_bucket_cap_mb=None,
ddp_find_unused_parameters=None,
ddp_timeout=1800,
debug=[],
deepspeed=None,
disable_tqdm=True,
do_eval=True,
do_predict=False,
do_train=False,
eval_accumulation_steps=None,
eval_delay=0,
eval_steps=None,
evaluation_strategy=epoch,
fp16=False,
fp16_backend=auto,
fp16_full_eval=False,
fp16_opt_level=O1,
fsdp=[],
fsdp_min_num_params=0,
fsdp_transformer_layer_cls_to_wrap=None,
full_determinism=False,
gradient_accumulation_steps=1,
gradient_checkpointing=False,
greater_is_better=None,
group_by_length=False,
half_precision_backend=auto,
hub_model_id=None,
hub_private_repo=False,
hub_strategy=every_save,
hub_token=<HUB_TOKEN>,
ignore_data_skip=False,
include_inputs_for_metrics=False,
jit_mode_eval=False,
label_na

In [44]:
from sklearn.metrics import accuracy_score

def compute_metrics(eval_pred):
    # gold labels
    label_ids = eval_pred.label_ids
    # predictions
    pred_ids = np.argmax(eval_pred.predictions, axis=-1)
    # collect gold and predicted labels, ignoring ignore_index label
    y_true, y_pred = [], []
    batch_size, seq_len = pred_ids.shape
    for i in range(batch_size):
        for j in range(seq_len):
            if label_ids[i, j] != ignore_index:
                y_true.append(index_to_label[label_ids[i][j]])
                y_pred.append(index_to_label[pred_ids[i][j]])
    # return computed metrics
    return {'accuracy': accuracy_score(y_true, y_pred)}

In [45]:
from datasets import Dataset, DatasetDict

ds = DatasetDict()
ds['train'] = Dataset.from_pandas(train_df)
ds['validation'] = Dataset.from_pandas(dev_df)
ds['test'] = Dataset.from_pandas(test_df)
ds

DatasetDict({
    train: Dataset({
        features: ['words', 'str_labels', 'input_ids', 'word_ids', 'labels'],
        num_rows: 14987
    })
    validation: Dataset({
        features: ['words', 'str_labels', 'input_ids', 'word_ids', 'labels'],
        num_rows: 3466
    })
    test: Dataset({
        features: ['words', 'str_labels', 'input_ids', 'word_ids', 'labels'],
        num_rows: 3685
    })
})

In [46]:
from transformers import Trainer
from transformers import DataCollatorForTokenClassification
from datetime import datetime

data_collator = DataCollatorForTokenClassification(tokenizer)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    train_dataset=ds['train'],
    eval_dataset=ds['validation'],
    tokenizer=tokenizer,
)

now = datetime.now()
print("Current Time =", now.strftime("%H:%M:%S"))

trainer.train()

now = datetime.now()
print("Current Time =", now.strftime("%H:%M:%S"))

Current Time = 16:54:15




{'eval_loss': 0.010167015716433525, 'eval_accuracy': 0.9869324130443212, 'eval_runtime': 14.4291, 'eval_samples_per_second': 240.209, 'eval_steps_per_second': 1.941, 'epoch': 1.0}
{'eval_loss': 0.008905312977731228, 'eval_accuracy': 0.9890651052774438, 'eval_runtime': 13.3202, 'eval_samples_per_second': 260.206, 'eval_steps_per_second': 2.102, 'epoch': 2.0}
{'eval_loss': 0.008601239882409573, 'eval_accuracy': 0.9900538989491644, 'eval_runtime': 12.6725, 'eval_samples_per_second': 273.505, 'eval_steps_per_second': 2.21, 'epoch': 3.0}
{'eval_loss': 0.008612300269305706, 'eval_accuracy': 0.9903447206173175, 'eval_runtime': 12.4307, 'eval_samples_per_second': 278.825, 'eval_steps_per_second': 2.252, 'epoch': 4.0}
{'train_runtime': 775.38, 'train_samples_per_second': 77.314, 'train_steps_per_second': 0.609, 'train_loss': 0.01545483176991091, 'epoch': 4.0}
Current Time = 17:07:10


In [47]:
output = trainer.predict(ds['test'])

In [48]:
from sklearn.metrics import classification_report

num_labels = model.num_labels
label_ids = output.label_ids.reshape(-1)
predictions = output.predictions.reshape(-1, num_labels)
predictions = np.argmax(predictions, axis=-1)
mask = label_ids != ignore_index

y_true = label_ids[mask]
y_pred = predictions[mask]
target_names = [index_to_label.get(ele, "") for ele in range(num_labels)]
print(target_names)

report = classification_report(
    y_true, y_pred,
    target_names=target_names
)
print(report)

['B-MISC', 'O', 'I-PER', 'B-ORG', 'B-PER', 'I-LOC', 'I-MISC', 'B-LOC', 'I-ORG']
              precision    recall  f1-score   support

      B-MISC       0.82      0.84      0.83       702
           O       1.00      0.99      1.00     38554
       I-PER       0.99      0.98      0.99      1156
       B-ORG       0.90      0.92      0.91      1661
       B-PER       0.97      0.95      0.96      1617
       I-LOC       0.87      0.92      0.90       257
      I-MISC       0.65      0.78      0.71       216
       B-LOC       0.94      0.94      0.94      1668
       I-ORG       0.88      0.92      0.90       835

    accuracy                           0.98     46666
   macro avg       0.89      0.92      0.90     46666
weighted avg       0.98      0.98      0.98     46666



In [49]:
f = open("test_out.txt", "w")
for i in range(0, len(y_true)):
    f.write(f"X {index_to_label.get(y_true[i])} {index_to_label.get(y_pred[i])}\n")
f.close()