In [None]:
# %%capture
# NOTE: uncomment capture to hide installation logs
# Transformers installation
! pip install transformers datasets evaluate accelerate seqeval

# Task 2 - Named Entity Recognition (NER) with Transformers

Based on https://huggingface.co/docs/transformers/tasks/token_classification

Token classification assigns a label to individual tokens in a sentence. One of the most common token classification tasks is Named Entity Recognition (NER). NER attempts to find a label for each entity in a sentence, such as a person, location, or organization.

There are 3 dimensions to scale in NER tasks:
- select a model
- play with hyperparameters
- experiment with different datasets


Helpful resources:
- For GPU training:
    - Colab
    - Kaggle GPUs (Kaggle provides 30 hours of free GPU time per week)

## Load WNUT 17 dataset

# TODO 1: here you must load the Ukrainian NER dataset in the same way as WNUT 17 is loaded below


```python
sample_item = {
    "id": 0,
    "tokens": ['Атака', 'розпочалась', 'тієї', 'ж', 'миті', ',', 'тільки', 'Христоф', 'та', 'двоє', 'найманців', ',', 'досі', 'виряджених',
  'у', 'монахів', ',', 'лишилися', 'на', 'місці', '.'],
    "ner_tags":[6, 6, 6, 6, 6, 6, 6, 2, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6]
}
```

Your `ner_tags` should correspond to indices of `label_list` below.

## You have to load data from `./data` folder.

In [2]:
label_list = ['B-LOC', 'B-ORG', 'B-PERS', 'I-LOC', 'I-ORG', 'I-PERS', 'O']
label_list

['B-LOC', 'B-ORG', 'B-PERS', 'I-LOC', 'I-ORG', 'I-PERS', 'O']

The letter that prefixes each `ner_tag` indicates the token position of the entity:

- `B-` indicates the beginning of an entity.
- `I-` indicates a token is contained inside the same entity (for example, the `State` token is a part of an entity like
  `Empire State Building`).
- `0` indicates the token doesn't correspond to any entity.

In [None]:
from datasets import load_dataset
from ast import literal_eval


# TODO: you have to create a CSV file yourself
# you can use the JSONl as long as it conforms to the expected format above
# wnut = load_dataset("csv", data_files={"train": "train.csv"})

# in data folder there pair of files: .txt and .ann

# example of .ann file content:
#T1	ORG	0	24	Приазовська селищна рада
#T2	DATE	25	37	13 листопада
#T3	ORG	47	83	ТОВ «Керуюча компанія «Днепросервіс»
#T4	MON	172	184	1,48 млн грн
#T5	ART	201	222	Наші гроші.

# explanation of the format:
# T1 - unique entity ID
# ORG - entity type
# 0 - start character index of the entity in the text
# 24 - end character index of the entity in the text
# Приазовська селищна рада - the actual entity text

# for the assignment, we will use only three entity types: PER, ORG, LOC

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# NOTE: if you're loading from csv, you can use literal_eval to convert string representation of lists back to lists
# wnut = wnut.map(lambda x: {"tokens": literal_eval(x["tokens"]), "ner_tags": literal_eval(x["ner_tags"])}) 
# such problems don't occur when loading from the json or jsonl formats

In [5]:
wnut = wnut["train"].train_test_split(test_size=0.2, seed=42) # NOTE: you can experiment with different test sizes
wnut

DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'ner_tags'],
        num_rows: 294
    })
    test: Dataset({
        features: ['id', 'tokens', 'ner_tags'],
        num_rows: 74
    })
})

Then take a look at an example:

In [6]:
# TODO: uncomment to see an example data point
# wnut["train"][0]

## Preprocess

In [7]:
from transformers import AutoTokenizer

model_name = "distilbert/distilbert-base-uncased"

tokenizer = AutoTokenizer.from_pretrained(model_name)

As you saw in the example `tokens` field above, it looks like the input has already been tokenized. But the input actually hasn't been tokenized yet and you'll need to set `is_split_into_words=True` to tokenize the words into subwords. For example:

More details in practice notebook.  
Here we're realigning the tokens and labels, and truncating sequences to be no longer than model's maximum input length:

In [8]:
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(examples["tokens"], is_split_into_words=True, truncation=True)

    labels = []
    for i, label in enumerate(examples[f"ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)  # Map tokens to their respective word.
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:  # Set the special tokens to -100.
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:  # Only label the first token of a given word.
                label_ids.append(label[word_idx])
            else:
                label_ids.append(-100)
            previous_word_idx = word_idx
        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

In [9]:
wnut

DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'ner_tags'],
        num_rows: 294
    })
    test: Dataset({
        features: ['id', 'tokens', 'ner_tags'],
        num_rows: 74
    })
})

To apply the preprocessing function over the entire dataset, use 🤗 Datasets [map](https://huggingface.co/docs/datasets/main/en/package_reference/main_classes#datasets.Dataset.map) function. You can speed up the `map` function by setting `batched=True` to process multiple elements of the dataset at once:

In [10]:
tokenized_wnut = wnut.map(tokenize_and_align_labels, batched=True)

In [11]:
tokenized_wnut["train"]

Dataset({
    features: ['id', 'tokens', 'ner_tags', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 294
})

In [12]:
print(len(tokenized_wnut["train"][0]["tokens"]),len(tokenized_wnut["train"][0]["labels"]))

769 512


Now create a batch of examples using [DataCollatorWithPadding](https://huggingface.co/docs/transformers/main/en/main_classes/data_collator#transformers.DataCollatorWithPadding). It's more efficient to *dynamically pad* the sentences to the longest length in a batch during collation, instead of padding the whole dataset to the maximum length.

In [13]:
from transformers import DataCollatorForTokenClassification

data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

## Evaluate

Including a metric during training is often helpful for evaluating your model's performance. You can quickly load a evaluation method with the 🤗 [Evaluate](https://huggingface.co/docs/evaluate/index) library. For this task, load the [seqeval](https://huggingface.co/spaces/evaluate-metric/seqeval) framework (see the 🤗 Evaluate [quick tour](https://huggingface.co/docs/evaluate/a_quick_tour) to learn more about how to load and compute a metric). Seqeval actually produces several scores: precision, recall, F1, and accuracy.

In [14]:
import evaluate

seqeval = evaluate.load("seqeval")

Get the NER labels first, and then create a function that passes your true predictions and true labels to [compute](https://huggingface.co/docs/evaluate/main/en/package_reference/main_classes#evaluate.EvaluationModule.compute) to calculate the scores:

In [15]:
import numpy as np
from seqeval.metrics import f1_score, precision_score, recall_score, accuracy_score

example = tokenized_wnut["train"][0]

labels = [label_list[i] for i in example[f"ner_tags"]]


def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    true_predictions = [
        [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    average_method = "macro"
    results = {
        "f1": f1_score(true_labels, true_predictions, average=average_method),
        "precision": precision_score(true_labels, true_predictions, average=average_method),
        "recall": recall_score(true_labels, true_predictions, average=average_method),
        "accuracy": accuracy_score(true_labels, true_predictions),
    }
    return {
        "precision": results["precision"],
        "recall": results["recall"],
        "f1": results["f1"],
        "accuracy": results["accuracy"],
    }

Your `compute_metrics` function is ready to go now, and you'll return to it when you setup your training.

## Train

Before you start training your model, create a map of the expected ids to their labels with `id2label` and `label2id`:

In [16]:
# Get label names from the dataset features
#label_list = wnut["train"].features["ner_tags"].feature.names
print("Label list:", label_list)

# Create id2label mapping
id2label = {i: label for i, label in enumerate(label_list)}
print("\nid2label:")
for i, label in id2label.items():
    print(f"    {i}: \"{label}\",")

# Create label2id mapping  
label2id = {label: i for i, label in enumerate(label_list)}
print("\nlabel2id:")
for label, i in label2id.items():
    print(f"    \"{label}\": {i},")

# Print formatted dictionaries for copying
print("\n# Copy these dictionaries:")
print("id2label = {")
for i, label in id2label.items():
    print(f"    {i}: \"{label}\",")
print("}")

print("\nlabel2id = {")
for label, i in label2id.items():
    print(f"    \"{label}\": {i},")
print("}")

# Get number of labels for model
num_labels = len(label_list)
print(f"\nnum_labels = {num_labels}")

Label list: ['B-LOC', 'B-ORG', 'B-PERS', 'I-LOC', 'I-ORG', 'I-PERS', 'O']

id2label:
    0: "B-LOC",
    1: "B-ORG",
    2: "B-PERS",
    3: "I-LOC",
    4: "I-ORG",
    5: "I-PERS",
    6: "O",

label2id:
    "B-LOC": 0,
    "B-ORG": 1,
    "B-PERS": 2,
    "I-LOC": 3,
    "I-ORG": 4,
    "I-PERS": 5,
    "O": 6,

# Copy these dictionaries:
id2label = {
    0: "B-LOC",
    1: "B-ORG",
    2: "B-PERS",
    3: "I-LOC",
    4: "I-ORG",
    5: "I-PERS",
    6: "O",
}

label2id = {
    "B-LOC": 0,
    "B-ORG": 1,
    "B-PERS": 2,
    "I-LOC": 3,
    "I-ORG": 4,
    "I-PERS": 5,
    "O": 6,
}

num_labels = 7


In [17]:
len(label2id)

7

<Tip>

If you aren't familiar with finetuning a model with the [Trainer](https://huggingface.co/docs/transformers/main/en/main_classes/trainer#transformers.Trainer), take a look at the basic tutorial [here](https://huggingface.co/docs/transformers/main/en/tasks/../training#train-with-pytorch-trainer)!

</Tip>

You're ready to start training your model now! Load DistilBERT with [AutoModelForTokenClassification](https://huggingface.co/docs/transformers/main/en/model_doc/auto#transformers.AutoModelForTokenClassification) along with the number of expected labels, and the label mappings:

In [18]:
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer

model = AutoModelForTokenClassification.from_pretrained(
    model_name, num_labels=len(id2label), id2label=id2label, label2id=label2id, trust_remote_code=True
)

Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


At this point, only three steps remain:

1. Define your training hyperparameters in [TrainingArguments](https://huggingface.co/docs/transformers/main/en/main_classes/trainer#transformers.TrainingArguments). The only required parameter is `output_dir` which specifies where to save your model. You'll push this model to the Hub by setting `push_to_hub=True` (you need to be signed in to Hugging Face to upload your model). At the end of each epoch, the [Trainer](https://huggingface.co/docs/transformers/main/en/main_classes/trainer#transformers.Trainer) will evaluate the seqeval scores and save the training checkpoint.
2. Pass the training arguments to [Trainer](https://huggingface.co/docs/transformers/main/en/main_classes/trainer#transformers.Trainer) along with the model, dataset, tokenizer, data collator, and `compute_metrics` function.
3. Call [train()](https://huggingface.co/docs/transformers/main/en/main_classes/trainer#transformers.Trainer.train) to finetune your model.

In [None]:
training_args = TrainingArguments(
    output_dir="my_awesome_wnut_model",
    learning_rate=2e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=16,
    max_steps=400,
    # num_train_epochs=10,
    weight_decay=0.01,
    eval_strategy="steps",
    save_strategy="steps",
    save_steps=100,
    eval_steps=100,
    load_best_model_at_end=True,
    save_total_limit=6,
    report_to="none", # replace with "wandb" to enable logging
    #warmup_ratio=0.1,
    # lr_scheduler_type="cosine",
    metric_for_best_model="f1",
    # push_to_hub=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_wnut["train"],
    eval_dataset=tokenized_wnut["test"],
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,

)

trainer.train()

Step,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
100,No log,0.369486,0.091241,0.113636,0.101215,0.901918
200,No log,0.31073,0.223047,0.183418,0.178318,0.914057
300,No log,0.291804,0.249517,0.218777,0.219973,0.92261
400,No log,0.288186,0.274238,0.236456,0.241408,0.922748


  _warn_prf(average, modifier, msg_start, len(result))


TrainOutput(global_step=400, training_loss=0.3132986640930176, metrics={'train_runtime': 65.8704, 'train_samples_per_second': 194.321, 'train_steps_per_second': 6.073, 'total_flos': 1536619363614720.0, 'train_loss': 0.3132986640930176, 'epoch': 40.0})

Once training is completed, share your model to the Hub with the [push_to_hub()](https://huggingface.co/docs/transformers/main/en/main_classes/trainer#transformers.Trainer.push_to_hub) method so everyone can use your model:

In [20]:
#trainer.push_to_hub()

## Inference for submission

Great, now that you've finetuned a model, we can inference it for the leaderboard.

Grab some text you'd like to run inference on:

In [21]:
import ast
import torch

def classify_text(text):
    inputs = tokenizer(text, return_tensors="pt", is_split_into_words=True, truncation=True)

    with torch.no_grad():
        logits = model(**inputs.to(model.device)).logits

    predictions = torch.argmax(logits, dim=2)

    token_predictions = predictions[0].tolist()
    
    # Get word IDs to map tokens back to words
    word_ids = inputs.word_ids()
    
    # Extract word-level predictions (first token of each word)
    word_predictions = []
    previous_word_idx = None
    
    for i, word_idx in enumerate(word_ids):
        # Skip special tokens (None word_idx)
        if word_idx is not None:
            # Only take the first token of each word
            if word_idx != previous_word_idx:
                word_predictions.append(token_predictions[i])
            previous_word_idx = word_idx
    

    # predicted_token_class = [model.config.id2label[t.item()] for t in predictions[0]]
    return word_predictions
    # return list(zip(tokenizer.tokenize(text), predicted_token_class))

classify_text(["Американський", "президент", "Дональд", "Трамп", "на", "тлі", "нового", "загострення", "торговельних", "відносин", "з", "Китаєм", "запевнив", "–", "все", "буде", "добре."])


[1, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6]

In [22]:
import pandas as pd
df = pd.read_csv("test.csv")
df["tokens"] = df["tokens"].apply(ast.literal_eval)
df

Unnamed: 0,id,tokens
0,0,"[Агресію, російської, влади, стосовно, України..."
1,1,"[Київський, апеляційний, господарський, суд, в..."
2,2,"[ДП, «Клавдієвське, лісове, господарство», (Ки..."
3,3,"[Метою, запропонованої, статті, є, аналіз, стр..."
4,4,"[Уряд, візьме, кредити, під, держгарантії, на,..."
...,...,...
153,164,"[Віднедавна, ..., І, затримок, із, передачею, ..."
154,165,"[Тарас, Шевченко, —, це, наш, порятунок, від, ..."
155,166,"[Дорогі, у, Христі, !, Великий, піст, –, це, т..."
156,167,"[Черкаське, обласне, територіальне, відділення..."


In [23]:
df['ner_tags'] = df['tokens'].apply(lambda x: classify_text(x))
df["ner_tags"] = df["ner_tags"].apply(lambda x: [label_list[tag] for tag in x])
df

Unnamed: 0,id,tokens,ner_tags
0,0,"[Агресію, російської, влади, стосовно, України...","[B-ORG, I-ORG, I-ORG, O, I-ORG, B-ORG, I-ORG, ..."
1,1,"[Київський, апеляційний, господарський, суд, в...","[B-ORG, I-ORG, I-ORG, I-ORG, O, I-ORG, O, O, O..."
2,2,"[ДП, «Клавдієвське, лісове, господарство», (Ки...","[B-ORG, I-ORG, I-ORG, I-ORG, B-ORG, I-ORG, O, ..."
3,3,"[Метою, запропонованої, статті, є, аналіз, стр...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
4,4,"[Уряд, візьме, кредити, під, держгарантії, на,...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
...,...,...,...
153,164,"[Віднедавна, ..., І, затримок, із, передачею, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
154,165,"[Тарас, Шевченко, —, це, наш, порятунок, від, ...","[O, I-PERS, O, O, O, O, O, O, O, O, O, O, O, O..."
155,166,"[Дорогі, у, Христі, !, Великий, піст, –, це, т...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
156,167,"[Черкаське, обласне, територіальне, відділення...","[B-ORG, I-ORG, I-ORG, O, O, O, O, O, O, O, O, ..."


In [24]:
df.drop(columns=["tokens"]).to_csv("submission.csv", index=False)