Project Baseline Model

In [4]:
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer, AutoTokenizer, AutoConfig, RobertaTokenizerFast, DataCollatorForTokenClassification
import numpy as np
import tqdm as notebook_tqdm
from datasets import Dataset, DatasetDict
import torch

#from bert.bert_topic import ClassModel

#all of the functions from the span_f1 file
from span_f1 import readNlu, toSpans, getBegEnd, getLooseOverlap, getUnlabeled

model_link = 'deepset/roberta-base-squad2'
train_data_source = 'en_ewt-ud-train.iob2'
dev_data_source = 'en_ewt-ud-dev.iob2'
test_data_source = 'en_ewt-ud-test-masked.iob2'

  from .autonotebook import tqdm as notebook_tqdm


Read in raw data (to get sentences)

In [5]:
def read_conll_file(path):
    """
    read in conll file
    
    :param path: path to read from
    :returns: list with sequences of words and labels for each sentence
    """
    data = []
    current_words = []
    current_tags = []

    for line in open(path, encoding='utf-8'):
        line = line.strip()

        if line:
            if line[0] == '#':
                continue # skip comments
            tok = line.split('\t')

            current_words.append(tok[0])
            current_tags.append(tok[1])
        else:
            if current_words:  # skip empty lines
                data.append((current_words, current_tags))
            current_words = []
            current_tags = []

    # check for last one
    if current_tags != []:
        data.append((current_words, current_tags))
    return data

In [6]:
train_data = read_conll_file(train_data_source)
dev_data = read_conll_file(dev_data_source)
test_data = read_conll_file(test_data_source)

train_data_sentences = []

for data_point in train_data:
    train_data_sentences.append(data_point[1])

dev_data_sentences = []

for data_point in dev_data:
    dev_data_sentences.append(data_point[1])

test_data_sentences = []

for data_point in test_data:
    test_data_sentences.append(data_point[1])



Read in the labels for the above sentences

In [7]:
#read in the label data, iob2 format
train_data_labels = readNlu(train_data_source)
dev_data_labels = readNlu(dev_data_source)
test_data_labels = readNlu(test_data_source)

#create the labels set
label_set = set()

for sentence in train_data_labels:
    for label in sentence:
        label_set.add(label)


#not sure why this is useful, will delete later if we find no use
label_list = list(label_set)

nlabels = len(label_set)
print(label_set)


{'I-LOC', 'B-LOC', 'O', 'I-ORG', 'B-ORG', 'I-PER', 'B-PER'}


In [8]:
# label mapping

lab2idx = {lab: idx for idx, lab in enumerate(label_list)}

idx2lab = {idx: lab for lab, idx in lab2idx.items()}
print(lab2idx)
print(idx2lab)
lab2idx['O']

{'I-LOC': 0, 'B-LOC': 1, 'O': 2, 'I-ORG': 3, 'B-ORG': 4, 'I-PER': 5, 'B-PER': 6}
{0: 'I-LOC', 1: 'B-LOC', 2: 'O', 3: 'I-ORG', 4: 'B-ORG', 5: 'I-PER', 6: 'B-PER'}


2

In [9]:
#convert labels to label ids

ner_train_ids = []

for sentence in train_data_labels:
    labels_of_sentence = []
    for label in sentence:
        labels_of_sentence.append(lab2idx[label])
    ner_train_ids.append(labels_of_sentence)
    
ner_dev_ids = []

for sentence in dev_data_labels:
    labels_of_sentence = []
    for label in sentence:
        labels_of_sentence.append(lab2idx[label])
    ner_dev_ids.append(labels_of_sentence)

ner_test_ids = []

for sentence in test_data_labels:
    labels_of_sentence = []
    for label in sentence:
        labels_of_sentence.append(lab2idx[label])
    ner_test_ids.append(labels_of_sentence)

In [10]:
keys = ['sents', 'ner_tags', 'ids']
values_train = [train_data_sentences, train_data_labels, ner_train_ids ]
values_dev = [dev_data_sentences, dev_data_labels, ner_dev_ids]
values_test = [test_data_sentences, test_data_labels, ner_test_ids]

train_dict = dict(zip(keys, values_train))
dev_dict = dict(zip(keys, values_dev))
test_dict = dict(zip(keys, values_test))

In [11]:
train_dataset = Dataset.from_dict(train_dict)
dev_dataset = Dataset.from_dict(dev_dict)
test_dataset = Dataset.from_dict(test_dict)

In [12]:
text_column_name = 'sents'
label_column_name = 'ids'

In [13]:
import pickle

with open('./idx2lab', 'rb') as f:
    idx2lab = pickle.load(f)

with open('./lab2idx', 'rb') as f:
    lab2idx = pickle.load(f)

In [14]:
tokenizer = RobertaTokenizerFast.from_pretrained(model_link, use_fast=True, add_prefix_space=True)
config = AutoConfig.from_pretrained(model_link, num_labels=nlabels, id2label = idx2lab, label2id = lab2idx)

def tokenize_and_align_labels(examples):
    """
    For each example, tokenize the list of tokens and align the original labels 
    to the resulting subwords. Tokens can be split into multiple subwords, so we mark 
    the "extra" subwords with -100 to ignore them in the loss.
    """
    # 'is_split_into_words=True' tells the tokenizer each item in the list is already a separate word/token.
    tokenized_inputs = tokenizer(
        examples[text_column_name],
        max_length=128,             
        padding=False,              
        truncation=True, 
        is_split_into_words=True
    )

    # 2) Prepare a new "labels" list aligned to the subword tokens
    all_labels = []
    
    for batch_index, labels in enumerate(examples[label_column_name]):
        # 'word_ids()' returns a list the same length as the subword-tokens,
        # each entry telling you which 'word' or token it came from
        word_ids = tokenized_inputs.word_ids(batch_index=batch_index)

        label_ids = []
        prev_word_id = None
        
        for word_id in word_ids:
            if word_id is None:
                # e.g. special tokens or padding
                label_ids.append(-100)
            elif word_id == prev_word_id:
                # subword token of the same word => ignore
                label_ids.append(-100)
            else:
                # new subword, so use the label for the original token
                label_ids.append(labels[word_id])
            
            prev_word_id = word_id
        
        all_labels.append(label_ids)

    # 3) Attach the new "labels" to our tokenized inputs
    tokenized_inputs["labels"] = all_labels

    # 4) Return the updated dictionary
    return tokenized_inputs

In [15]:
processed_train_dataset = train_dataset.map(
    tokenize_and_align_labels,
    batched=True,
    remove_columns=train_dataset.column_names,
    desc="Running tokenizer on dataset"
)

processed_dev_dataset = dev_dataset.map(
    tokenize_and_align_labels,
    batched=True,
    remove_columns=dev_dataset.column_names,
    desc="Running tokenizer on dataset"
)

processed_test_dataset = test_dataset.map(
    tokenize_and_align_labels,
    batched=True,
    remove_columns=test_dataset.column_names,
    desc="Running tokenizer on dataset"
)



Running tokenizer on dataset: 100%|██████████| 12543/12543 [00:00<00:00, 19390.10 examples/s]
Running tokenizer on dataset: 100%|██████████| 2001/2001 [00:00<00:00, 25203.69 examples/s]
Running tokenizer on dataset: 100%|██████████| 2077/2077 [00:00<00:00, 23041.18 examples/s]


In [16]:
model = AutoModelForTokenClassification.from_pretrained(model_link, torch_dtype='auto', config=config)
data_collator = DataCollatorForTokenClassification(tokenizer)
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at deepset/roberta-base-squad2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


RobertaForTokenClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
            

In [17]:
training_args = TrainingArguments(output_dir="output_trainer", eval_strategy = 'epoch')

In [18]:
def convert_int_to_labels(preds):
    logits, labels = preds
    predictions = np.argmax(logits, axis = -1)
    true_labels = [[idx2lab[label] for label in label_sequence if label != -100] for label_sequence in labels]
    true_predictions = [[idx2lab[pred] for pred, label in zip(preds_sequence, labels_sequence) if label != -100] for preds_sequence, labels_sequence in zip(predictions, labels)]
    return true_labels, true_predictions

In [19]:
import evaluate
metric = evaluate.load("seqeval")

In [20]:
def compute_metrics(preds):
    true_labels, true_predictions = convert_int_to_labels(preds)
    results = metric.compute(predictions = true_predictions, references = true_labels)
    return {
        "Precision": results["overall_precision"],
        "Recall": results["overall_recall"],
        "F1": results["overall_f1"],
        "Accuracy": results["overall_accuracy"]
    }

In [21]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=processed_train_dataset,
    eval_dataset=processed_dev_dataset,
    compute_metrics= compute_metrics,
    data_collator=data_collator
)

In [22]:
trainer.train()

Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.0448,0.069069,0.987011,0.987492,0.987252,0.984373
2,0.0219,0.080834,0.98503,0.98599,0.98551,0.982942
3,0.0108,0.084337,0.987607,0.987005,0.987306,0.984771


TrainOutput(global_step=4704, training_loss=0.033425377517127666, metrics={'train_runtime': 6299.8874, 'train_samples_per_second': 5.973, 'train_steps_per_second': 0.747, 'total_flos': 813355072895574.0, 'train_loss': 0.033425377517127666, 'epoch': 3.0})

In [23]:
results = trainer.evaluate()

In [24]:
predictions, labels, metrics = trainer.predict(processed_test_dataset)

  _warn_prf(average, modifier, msg_start, len(result))


In [25]:
labels, predictions = convert_int_to_labels((predictions, labels))

In [26]:
final_format = []
for i in range(len(predictions)):
    final_format.append((test_data[i][0], predictions[i]))

In [27]:
def write_conll_file(data, path):
    """
    Write data back to a CoNLL file format.
    :param data: list of tuples with words and labels
    :param path: path to write to
    """
    with open(path, "w", encoding = "utf-8") as f:
        for sentence in data:
            words, labels = sentence
            for idx, (word, label) in enumerate(zip(words, labels), start = 1):
                f.write(f"{idx}\t{word}\t{label}\t-\t-\n")
            f.write("\n")

write_conll_file(final_format, "test_ouput.iob2")

In [28]:
print(train_dataset[0]['sents'])
print(train_dataset[0]['ner_tags'])
print(train_dataset[0]['ids'])  # numerical label IDs


['Where', 'in', 'the', 'world', 'is', 'Iguazu', '?']
['O', 'O', 'O', 'O', 'O', 'B-LOC', 'O']
[2, 2, 2, 2, 2, 1, 2]
