In [1]:
# imports

from typing import List, Dict
import codecs
import torch
import sys
import myutils
from transformers import AutoModel, AutoTokenizer

  from .autonotebook import tqdm as notebook_tqdm


# Reading in the data

In [2]:
myl = [i for i in range(0,11)]
myl

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]

In [3]:
def read_data(path):
    ents = []
    curEnts = []
    with open(path, encoding="utf-8") as file:
        for line in file:
            line = line.strip()
            if line == '':
                ents.append(curEnts)
                curEnts = []
            elif line[0] == '#' and len(line.split('\t')) == 1:
                continue
            else:
                curEnts.append(line.split('\t')[1])
    return ents

def read_labels(path):
    ents = []
    curEnts = []
    with open(path, encoding="utf-8") as file:
        for line in file:
            line = line.strip()
            if line == '':
                ents.append(curEnts)
                curEnts = []
            elif line[0] == '#' and len(line.split('\t')) == 1:
                continue
            else:
                curEnts.append(line.split('\t')[2])
    return ents

def read_index(path):
    ents = []
    curEnts = []
    with open(path, encoding="utf-8") as file:
        for line in file:
            line = line.strip()
            if line == '':
                ents.append(curEnts)
                curEnts = []
            elif line[0] == '#' and len(line.split('\t')) == 1:
                continue
            else:
                curEnts.append(line.split('\t')[0])
    return ents


In [4]:
train_data = read_data("en_ewt-ud-train.iob2")
train_labels = read_labels("en_ewt-ud-train.iob2")
train_index = read_index("en_ewt-ud-train.iob2")

In [5]:
train_data[0:3]

[['Where', 'in', 'the', 'world', 'is', 'Iguazu', '?'],
 ['Iguazu', 'Falls'],
 ['Widely',
  'considered',
  'to',
  'be',
  'one',
  'of',
  'the',
  'most',
  'spectacular',
  'waterfalls',
  'in',
  'the',
  'world',
  ',',
  'the',
  'Iguazu',
  'Falls',
  'on',
  'the',
  'border',
  'of',
  'Argentina',
  'and',
  'Brazil',
  ',',
  'are',
  'a',
  'certainly',
  'must',
  'see',
  'attraction',
  'in',
  'the',
  'area',
  '.']]

In [6]:
train_labels[0]

['O', 'O', 'O', 'O', 'O', 'B-LOC', 'O']

# Preprocessing

Creating a list of unique labels

In [7]:
label_list = []

for labels in train_labels:
    for label in labels:
        if label not in label_list:
            label_list.append(label)

In [8]:
label_list

['O', 'B-LOC', 'I-LOC', 'B-PER', 'B-ORG', 'I-ORG', 'I-PER']

In [9]:
tokenizer = AutoTokenizer.from_pretrained("distilbert/distilbert-base-uncased")

In [10]:
def tokenize_and_align_labels(data, label_list):
    tokenized_inputs = tokenizer(data, truncation=True, is_split_into_words=True)

    labels = []
    for i, label in enumerate(label_list):
        word_ids = tokenized_inputs.word_ids(batch_index=i)  # Map tokens to their respective word.
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:  # Set the special tokens to -100.
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:  # Only label the first token of a given word.
                label_ids.append(label[word_idx])
            else:
                label_ids.append(-100)
            previous_word_idx = word_idx
        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

In [11]:
tokenized_train = tokenize_and_align_labels(train_data, train_labels)


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [12]:
print(tokenized_train[0])

Encoding(num_tokens=11, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])


In [13]:
tokenized_labels = tokenized_train["labels"]


In [14]:
print(tokenized_labels[0])
print(tokenized_train[0].tokens)

[-100, 'O', 'O', 'O', 'O', 'O', 'B-LOC', -100, -100, 'O', -100]
['[CLS]', 'where', 'in', 'the', 'world', 'is', 'i', '##gua', '##zu', '?', '[SEP]']


In [15]:
id2label = {
    0: "O",
    1: "B-LOC",
    2: "I-LOC",
    3: "B-PER",
    4: "B-ORG",
    5: "I-ORG",
    6: "I-PER",
}
label2id = {
    "O": 0,
    'B-LOC': 1,
    'I-LOC': 2,
    'B-PER': 3,
    'B-ORG': 4,
    'I-ORG': 5,
    'I-PER': 6
}

# Copying from the website

In [16]:
from datasets import load_dataset

wnut = load_dataset("wnut_17")

In [17]:
wnut["train"][0]

{'id': '0',
 'tokens': ['@paulwalk',
  'It',
  "'s",
  'the',
  'view',
  'from',
  'where',
  'I',
  "'m",
  'living',
  'for',
  'two',
  'weeks',
  '.',
  'Empire',
  'State',
  'Building',
  '=',
  'ESB',
  '.',
  'Pretty',
  'bad',
  'storm',
  'here',
  'last',
  'evening',
  '.'],
 'ner_tags': [0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  7,
  8,
  8,
  0,
  7,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0]}

In [30]:
label_list = wnut["train"].features[f"ner_tags"].feature.names
label_list

['O',
 'B-corporation',
 'I-corporation',
 'B-creative-work',
 'I-creative-work',
 'B-group',
 'I-group',
 'B-location',
 'I-location',
 'B-person',
 'I-person',
 'B-product',
 'I-product']

In [31]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("distilbert/distilbert-base-uncased")

In [32]:
example = wnut["train"][0]
tokenized_input = tokenizer(example["tokens"], is_split_into_words=True)
tokens = tokenizer.convert_ids_to_tokens(tokenized_input["input_ids"])
tokens

['[CLS]',
 '@',
 'paul',
 '##walk',
 'it',
 "'",
 's',
 'the',
 'view',
 'from',
 'where',
 'i',
 "'",
 'm',
 'living',
 'for',
 'two',
 'weeks',
 '.',
 'empire',
 'state',
 'building',
 '=',
 'es',
 '##b',
 '.',
 'pretty',
 'bad',
 'storm',
 'here',
 'last',
 'evening',
 '.',
 '[SEP]']

In [33]:
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)

    labels = []
    for i, label in enumerate(examples[f"ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)  # Map tokens to their respective word.
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:  # Set the special tokens to -100.
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:  # Only label the first token of a given word.
                label_ids.append(label[word_idx])
            else:
                label_ids.append(-100)
            previous_word_idx = word_idx
        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

In [34]:
tokenized_wnut = wnut.map(tokenize_and_align_labels, batched=True)

Map:   0%|          | 0/1287 [00:00<?, ? examples/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
Map: 100%|██████████| 1287/1287 [00:00<00:00, 3961.10 examples/s]


In [35]:
tokenized_wnut

DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'ner_tags', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 3394
    })
    validation: Dataset({
        features: ['id', 'tokens', 'ner_tags', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 1009
    })
    test: Dataset({
        features: ['id', 'tokens', 'ner_tags', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 1287
    })
})

In [36]:
tokenized_wnut["train"][0]

{'id': '0',
 'tokens': ['@paulwalk',
  'It',
  "'s",
  'the',
  'view',
  'from',
  'where',
  'I',
  "'m",
  'living',
  'for',
  'two',
  'weeks',
  '.',
  'Empire',
  'State',
  'Building',
  '=',
  'ESB',
  '.',
  'Pretty',
  'bad',
  'storm',
  'here',
  'last',
  'evening',
  '.'],
 'ner_tags': [0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  7,
  8,
  8,
  0,
  7,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0],
 'input_ids': [101,
  1030,
  2703,
  17122,
  2009,
  1005,
  1055,
  1996,
  3193,
  2013,
  2073,
  1045,
  1005,
  1049,
  2542,
  2005,
  2048,
  3134,
  1012,
  3400,
  2110,
  2311,
  1027,
  9686,
  2497,
  1012,
  3492,
  2919,
  4040,
  2182,
  2197,
  3944,
  1012,
  102],
 'attention_mask': [1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1],
 'labels': [-100,
  0,
  -100,
  -100,
  0,
  0,
  -100,
  0,
  0,
  0,
  0,
  0,
  0,
  

In [37]:
from transformers import DataCollatorForTokenClassification

data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

In [38]:
from transformers import DataCollatorForTokenClassification

data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer, return_tensors="tf")

In [39]:
import evaluate

seqeval = evaluate.load("seqeval")

In [40]:
import numpy as np

labels = [label_list[i] for i in example[f"ner_tags"]]


def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    true_predictions = [
        [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    results = seqeval.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

In [41]:
id2label = {
    0: "O",
    1: "B-corporation",
    2: "I-corporation",
    3: "B-creative-work",
    4: "I-creative-work",
    5: "B-group",
    6: "I-group",
    7: "B-location",
    8: "I-location",
    9: "B-person",
    10: "I-person",
    11: "B-product",
    12: "I-product",
}
label2id = {
    "O": 0,
    "B-corporation": 1,
    "I-corporation": 2,
    "B-creative-work": 3,
    "I-creative-work": 4,
    "B-group": 5,
    "I-group": 6,
    "B-location": 7,
    "I-location": 8,
    "B-person": 9,
    "I-person": 10,
    "B-product": 11,
    "I-product": 12,
}

In [42]:
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer

model = AutoModelForTokenClassification.from_pretrained(
    "distilbert/distilbert-base-uncased", num_labels=13, id2label=id2label, label2id=label2id
)

Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [49]:
training_args = TrainingArguments(
    output_dir="my_awesome_wnut_model",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=2,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    #push_to_hub=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_wnut["train"],
    eval_dataset=tokenized_wnut["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)
  0%|          | 0/426 [09:36<?, ?it/s]
  0%|          | 0/426 [08:15<?, ?it/s]
  0%|          | 0/426 [00:00<?, ?it/s]

ModuleNotFoundError: No module named 'distutils'