<a href="https://colab.research.google.com/github/Muhtasham/tajik-nlp/blob/main/Tajik_Token_classification_(PyTorch).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Token classification (PyTorch)

Install the Transformers, Datasets, and Evaluate libraries to run this notebook.

In [None]:
!pip install datasets evaluate transformers[sentencepiece]
!pip install accelerate
# To run the training on TPU, you will need to uncomment the followin line:
# !pip install cloud-tpu-client==0.10 torch==1.9.0 https://storage.googleapis.com/tpu-pytorch/wheels/torch_xla-1.9-cp37-cp37m-linux_x86_64.whl
!apt install git-lfs

You will need to set up git; adapt your email and name in the following cell.

You will also need to be logged in to the Hugging Face Hub. Execute the following and enter your credentials.

In [None]:
from huggingface_hub import notebook_login

# Prompt for Hugging Face Hub credentials; presumably needed for the
# push_to_hub steps at the end of this notebook.
notebook_login()

In [None]:
from datasets import load_dataset, DatasetDict

# The Tajik ("tg") WikiANN splits are small, so the train and test splits are
# concatenated into one training set; validation is kept separate.
raw_datasets_train_ts = load_dataset("wikiann", "tg", split="train+test")
raw_datasets_valid = load_dataset("wikiann", "tg", split="validation")
raw_datasets = DatasetDict(
    {
        "train": raw_datasets_train_ts,
        "validation": raw_datasets_valid,
    }
)

Downloading builder script:   0%|          | 0.00/3.94k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/12.6k [00:00<?, ?B/s]

Downloading and preparing dataset wikiann/tg (download: 223.17 MiB, generated: 81.20 KiB, post-processed: Unknown size, total: 223.25 MiB) to /root/.cache/huggingface/datasets/wikiann/tg/1.1.0/4bfd4fe4468ab78bb6e096968f61fab7a888f44f9d3371c2f3fea7e74a5a354e...


Downloading data:   0%|          | 0.00/234M [00:00<?, ?B/s]

Generating validation split:   0%|          | 0/100 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/100 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/100 [00:00<?, ? examples/s]

Dataset wikiann downloaded and prepared to /root/.cache/huggingface/datasets/wikiann/tg/1.1.0/4bfd4fe4468ab78bb6e096968f61fab7a888f44f9d3371c2f3fea7e74a5a354e. Subsequent calls will reuse this data.




In [None]:
# Show the available splits with their columns and row counts.
raw_datasets

DatasetDict({
    train: Dataset({
        features: ['tokens', 'ner_tags', 'langs', 'spans'],
        num_rows: 200
    })
    validation: Dataset({
        features: ['tokens', 'ner_tags', 'langs', 'spans'],
        num_rows: 100
    })
})

In [None]:
# NOTE(review): this cell repeats the previous one and shows the same output; it can be removed.
raw_datasets

DatasetDict({
    train: Dataset({
        features: ['tokens', 'ner_tags', 'langs', 'spans'],
        num_rows: 200
    })
    validation: Dataset({
        features: ['tokens', 'ner_tags', 'langs', 'spans'],
        num_rows: 100
    })
})

In [None]:
# Words of the first training example (the dataset is already split into words).
raw_datasets["train"][0]["tokens"]

['Факултаи',
 'табиатшиносию',
 'географияи',
 'ДДОЛ',
 'ба',
 'номи',
 'С.М.Кировро',
 'хатм',
 'намудааст',
 '(',
 '1980',
 ')',
 '.']

In [None]:
# Integer NER tag ids of the same example, one per word.
raw_datasets["train"][0]["ner_tags"]

[0, 0, 0, 3, 4, 4, 4, 0, 0, 0, 0, 0, 0]

In [None]:
# Feature descriptor of the `ner_tags` column; it carries the tag names.
ner_feature = raw_datasets["train"].features["ner_tags"]
ner_feature

Sequence(feature=ClassLabel(num_classes=7, names=['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC'], id=None), length=-1, id=None)

In [None]:
# Tag names as strings; a tag's integer id is its position in this list.
label_names = ner_feature.feature.names
label_names

['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC']

In [None]:
# Print the first training example with each word above its tag name,
# padding every column to the wider of the two strings plus one space.
words = raw_datasets["train"][0]["tokens"]
labels = raw_datasets["train"][0]["ner_tags"]

word_cells = []
label_cells = []
for word, label in zip(words, labels):
    full_label = label_names[label]
    column_width = max(len(word), len(full_label)) + 1
    word_cells.append(word.ljust(column_width))
    label_cells.append(full_label.ljust(column_width))

line1 = "".join(word_cells)
line2 = "".join(label_cells)

print(line1)
print(line2)

Факултаи табиатшиносию географияи ДДОЛ  ба    номи  С.М.Кировро хатм намудааст ( 1980 ) . 
O        O             O          B-ORG I-ORG I-ORG I-ORG       O    O         O O    O O 


In [None]:
from transformers import AutoTokenizer
# Hub checkpoint used for both the tokenizer and, later, the model.
model_checkpoint = "muhtasham/RoBERTa-tg"
# NOTE(review): add_prefix_space=True is presumably required because the tokenizer is
# called below with is_split_into_words=True — confirm against the tokenizer docs.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, add_prefix_space=True)

Downloading config.json:   0%|          | 0.00/641 [00:00<?, ?B/s]

Downloading vocab.json:   0%|          | 0.00/1.83M [00:00<?, ?B/s]

Downloading merges.txt:   0%|          | 0.00/1.49M [00:00<?, ?B/s]

In [None]:
# Check whether the loaded tokenizer reports itself as a fast tokenizer.
tokenizer.is_fast

True

In [None]:
# Tokenize the pre-split words of the first example and list the resulting
# sub-word tokens, including the special tokens added at both ends.
inputs = tokenizer(raw_datasets["train"][0]["tokens"], is_split_into_words=True)
inputs.tokens()

['<s>',
 'ĠÐ',
 '¤',
 'Ð°Ðº',
 'ÑĥÐ»',
 'ÑĤÐ°Ð¸',
 'ĠÑĤÐ°Ð±Ð¸Ð°ÑĤÑĪÐ¸Ð½Ð¾Ñģ',
 'Ð¸Ñİ',
 'ĠÐ³ÐµÐ¾Ð³ÑĢÐ°ÑĦÐ¸ÑıÐ¸',
 'ĠÐ',
 'Ķ',
 'Ð',
 'Ķ',
 'Ð',
 'ŀ',
 'Ð',
 'Ľ',
 'ĠÐ±Ð°',
 'ĠÐ½Ð¾Ð¼Ð¸',
 'ĠÐ',
 '¡',
 '.',
 'Ð',
 'ľ',
 '.',
 'Ð',
 'ļ',
 'Ð¸ÑĢ',
 'Ð¾Ð²ÑĢÐ¾',
 'ĠÑħÐ°ÑĤÐ¼',
 'ĠÐ½Ð°Ð¼ÑĥÐ´Ð°Ð°ÑģÑĤ',
 'Ġ',
 '(',
 'Ġ',
 '1',
 '9',
 '8',
 '0',
 'Ġ',
 ')',
 'Ġ',
 '.',
 '</s>']

In [None]:
# For each token, the index of the word it came from (None for special tokens).
inputs.word_ids()

[None,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 2,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 4,
 5,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 7,
 8,
 9,
 9,
 10,
 10,
 10,
 10,
 10,
 11,
 11,
 12,
 12,
 None]

In [None]:
def align_labels_with_tokens(labels, word_ids):
    """Expand word-level tag ids to one tag id per token.

    Args:
        labels: Tag ids, indexed by word position.
        word_ids: For each token, the index of the word it belongs to, or
            None for a special token.

    Returns:
        A list with one entry per token: -100 for special tokens, the word's
        tag for the first token of a word, and for the remaining tokens of the
        same word the word's tag with odd ids (the B-XXX tags) bumped by one
        to the matching I-XXX id.
    """
    aligned = []
    previous_word = None
    for word_id in word_ids:
        if word_id is None:
            # Special token: ignored label, and the next word starts afresh.
            previous_word = None
            aligned.append(-100)
            continue

        continues_word = word_id == previous_word
        previous_word = word_id

        tag = labels[word_id]
        # Inside a word, a B-XXX tag (odd id) becomes the matching I-XXX tag.
        if continues_word and tag % 2 == 1:
            tag += 1
        aligned.append(tag)

    return aligned

In [None]:
# Compare the word-level tags of the first example with their token-level expansion.
labels = raw_datasets["train"][0]["ner_tags"]
word_ids = inputs.word_ids()
aligned_labels = align_labels_with_tokens(labels, word_ids)
print(labels, aligned_labels, sep="\n")

[0, 0, 0, 3, 4, 4, 4, 0, 0, 0, 0, 0, 0]
[-100, 0, 0, 0, 0, 0, 0, 0, 0, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -100]


In [None]:
def tokenize_and_align_labels(examples):
    """Tokenize a batch of pre-split examples and attach token-level labels.

    Args:
        examples: A batch with a "tokens" column (lists of words) and a
            "ner_tags" column (lists of word-level tag ids).

    Returns:
        The tokenizer output for the batch, truncated to at most 512 tokens,
        with an added "labels" entry holding one aligned label list per example.
    """
    encoded = tokenizer(
        examples["tokens"],
        is_split_into_words=True,
        truncation=True,
        max_length=512,
    )
    # word_ids(i) maps each token of example i back to its source word.
    encoded["labels"] = [
        align_labels_with_tokens(word_labels, encoded.word_ids(batch_index))
        for batch_index, word_labels in enumerate(examples["ner_tags"])
    ]
    return encoded

In [None]:
# Tokenize and align every split in batches. The original columns are dropped,
# so only what tokenize_and_align_labels returns is kept.
tokenized_datasets = raw_datasets.map(
    tokenize_and_align_labels,
    batched=True,
    remove_columns=raw_datasets["train"].column_names,
)

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [None]:
from transformers import DataCollatorForTokenClassification

# Collator used to assemble tokenized examples into batches for training.
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

In [None]:
# Collate the first two training examples and inspect the resulting label tensor;
# in the output, the shorter example is padded with -100.
batch = data_collator([tokenized_datasets["train"][i] for i in range(2)])
batch["labels"]

tensor([[-100,    0,    0,    0,    0,    0,    0,    0,    0,    3,    4,    4,
            4,    4,    4,    4,    4,    4,    4,    4,    4,    4,    4,    4,
            4,    4,    4,    4,    4,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0, -100],
        [-100,    3,    4,    4,    4,    4,    4,    4,    4,    4,    4,    4,
         -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
         -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
         -100, -100, -100, -100, -100, -100, -100]])

In [None]:
# Unpadded labels of the same two examples, for comparison with the collated batch.
for i in range(2):
    print(tokenized_datasets["train"][i]["labels"])

[-100, 0, 0, 0, 0, 0, 0, 0, 0, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -100]
[-100, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, -100]


In [None]:
!pip install seqeval

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
import evaluate

# Load the seqeval metric through the `evaluate` library.
metric = evaluate.load("seqeval")

In [None]:
# Convert the first example's integer tag ids into their string names.
labels = [label_names[tag_id] for tag_id in raw_datasets["train"][0]["ner_tags"]]
labels

['O',
 'O',
 'O',
 'B-ORG',
 'I-ORG',
 'I-ORG',
 'I-ORG',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O']

In [None]:
# Score a copy of the reference labels against the references themselves.
predictions = labels.copy()
# NOTE(review): labels[2] is already "O" for this example, so this assignment changes
# nothing and every score below is 1.0. To demonstrate an actual mistake, alter an
# entity tag instead (e.g. index 3, the "B-ORG") — confirm the intended demo.
predictions[2] = "O"
metric.compute(predictions=[predictions], references=[labels])

{'ORG': {'precision': 1.0, 'recall': 1.0, 'f1': 1.0, 'number': 1},
 'overall_precision': 1.0,
 'overall_recall': 1.0,
 'overall_f1': 1.0,
 'overall_accuracy': 1.0}

In [None]:
import numpy as np


def compute_metrics(eval_preds):
    """Compute overall seqeval scores from model logits and label ids.

    Args:
        eval_preds: A pair ``(logits, labels)``; the predicted tag id is the
            argmax of the logits over the last axis.

    Returns:
        A dict with the overall "precision", "recall", "f1" and "accuracy"
        reported by the seqeval metric.
    """
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)

    # Reference tag names, skipping positions labelled -100 (ignored index).
    true_labels = []
    for label_row in labels:
        true_labels.append(
            [label_names[label_id] for label_id in label_row if label_id != -100]
        )

    # Predicted tag names at exactly the positions kept above.
    true_predictions = []
    for prediction_row, label_row in zip(predictions, labels):
        kept = []
        for predicted_id, label_id in zip(prediction_row, label_row):
            if label_id != -100:
                kept.append(label_names[predicted_id])
        true_predictions.append(kept)

    all_metrics = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        name: all_metrics[f"overall_{name}"]
        for name in ("precision", "recall", "f1", "accuracy")
    }

In [None]:
# Label mappings stored in the model config. Ids are kept as integers so that
# id2label maps int -> name and label2id maps name -> int; string ids would leave
# label2id with string values.
id2label = {i: label for i, label in enumerate(label_names)}
label2id = {label: i for i, label in id2label.items()}

In [None]:
from transformers import AutoModelForTokenClassification

# Load the checkpoint as a token-classification model, passing the label
# mappings so they are recorded in the model config.
model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    id2label=id2label,
    label2id=label2id,
)


Downloading pytorch_model.bin:   0%|          | 0.00/319M [00:00<?, ?B/s]

In [None]:
# Sanity check: the number of labels should match len(label_names) (7 here).
model.config.num_labels

7

In [None]:
from transformers import Trainer, TrainingArguments, EarlyStoppingCallback, IntervalStrategy

# Training configuration. The high epoch count is an upper bound: training is
# expected to end earlier through the early-stopping callback passed to the Trainer.
args = TrainingArguments(
    "tajroberto-ner",
    num_train_epochs=200,
    learning_rate=2e-5,
    evaluation_strategy="steps",
    eval_steps=50,
    # Checkpoint on the same schedule as evaluation: load_best_model_at_end can
    # only restore a model that was actually saved, so with a sparser save
    # schedule most evaluated states could never be selected as the best one.
    save_strategy="steps",
    save_steps=50,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    # Keep disk usage bounded now that checkpoints are written more often.
    save_total_limit=5,
    log_level="error",
    push_to_hub=True,
    hub_private_repo=True,
    disable_tqdm=False,
)

In [None]:
# Assemble the Trainer from the model, arguments, tokenized splits, collator
# and metric function defined above, then start fine-tuning.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
    # Early stopping with a patience of 3; presumably counted in evaluations
    # without improvement of metric_for_best_model — confirm in the callback docs.
    callbacks = [EarlyStoppingCallback(early_stopping_patience=3)]
)
trainer.train()

In [None]:
# Push the trained model to the Hugging Face Hub with this commit message.
trainer.push_to_hub(commit_message="Training complete")