In [1]:
!pip install torch torchtext pytorch-crf torchcrf transformers seqeval datasets huggingface_hub evaluate




In [2]:
import os
import warnings
import numpy as np
import torch
from datasets import load_dataset, DatasetDict
from transformers import (AutoTokenizer, DataCollatorForTokenClassification,
                          TrainingArguments, Trainer, AutoModelForTokenClassification)
from huggingface_hub import login
import evaluate
import ast


In [3]:
# Log in to Hugging Face Hub
login(token="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")


The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: fineGrained).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [4]:
# Disable W&B logs and filter warnings
os.environ["WANDB_DISABLED"] = "true"
warnings.filterwarnings("ignore")

# Set device (GPU if available)
device = 'cuda' if torch.cuda.is_available() else 'cpu'


In [5]:
# Load the dataset
dataset = load_dataset("LocalDoc/azerbaijani-ner-dataset")

# Split the train dataset into train and validation subsets
train_test_split = dataset["train"].train_test_split(test_size=0.2, seed=42)
dataset = DatasetDict({
    "train": train_test_split["train"],
    "validation": train_test_split["test"]
})

# Check the dataset split
print(dataset)


DatasetDict({
    train: Dataset({
        features: ['index', 'tokens', 'ner_tags'],
        num_rows: 79636
    })
    validation: Dataset({
        features: ['index', 'tokens', 'ner_tags'],
        num_rows: 19909
    })
})


In [6]:
def process_ner_tags(example):
    if example["tokens"] and example["ner_tags"]:
        try:
            example["tokens"] = ast.literal_eval(example["tokens"])
            example["ner_tags"] = [int(tag) for tag in ast.literal_eval(example["ner_tags"])]
        except (ValueError, SyntaxError):
            example["tokens"], example["ner_tags"] = [], []
    else:
        example["tokens"], example["ner_tags"] = [], []
    return example

# Apply preprocessing function
dataset = dataset.map(process_ner_tags)


Map:   0%|          | 0/79636 [00:00<?, ? examples/s]

Map:   0%|          | 0/19909 [00:00<?, ? examples/s]

In [7]:
label_list = [
    "O", "PERSON", "LOCATION", "ORGANISATION", "DATE", "TIME", "MONEY", "PERCENTAGE",
    "FACILITY", "PRODUCT", "EVENT", "ART", "LAW", "LANGUAGE", "GPE", "NORP",
    "ORDINAL", "CARDINAL", "DISEASE", "CONTACT", "ADAGE", "QUANTITY", "MISCELLANEOUS",
    "POSITION", "PROJECT"
]
label_to_id = {label: idx for idx, label in enumerate(label_list)}
id_to_label = {idx: label for idx, label in enumerate(label_list)}
print("Label to ID Mapping:", label_to_id)


Label to ID Mapping: {'O': 0, 'PERSON': 1, 'LOCATION': 2, 'ORGANISATION': 3, 'DATE': 4, 'TIME': 5, 'MONEY': 6, 'PERCENTAGE': 7, 'FACILITY': 8, 'PRODUCT': 9, 'EVENT': 10, 'ART': 11, 'LAW': 12, 'LANGUAGE': 13, 'GPE': 14, 'NORP': 15, 'ORDINAL': 16, 'CARDINAL': 17, 'DISEASE': 18, 'CONTACT': 19, 'ADAGE': 20, 'QUANTITY': 21, 'MISCELLANEOUS': 22, 'POSITION': 23, 'PROJECT': 24}


In [8]:
tokenizer = AutoTokenizer.from_pretrained("bert-base-multilingual-cased")


In [9]:
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)
    labels = []
    for i, label in enumerate(examples["ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        aligned_labels = []
        previous_word_id = None
        for word_id in word_ids:
            if word_id is None:
                aligned_labels.append(-100)
            elif word_id != previous_word_id:
                if word_id < len(label):  # Boundary check
                    aligned_labels.append(label[word_id])
                else:
                    aligned_labels.append(-100)
            else:
                aligned_labels.append(-100)
            previous_word_id = word_id
        labels.append(aligned_labels)
    tokenized_inputs["labels"] = labels
    return tokenized_inputs

# Apply tokenization and alignment
tokenized_dataset = dataset.map(tokenize_and_align_labels, batched=True)


Map:   0%|          | 0/79636 [00:00<?, ? examples/s]

Map:   0%|          | 0/19909 [00:00<?, ? examples/s]

In [10]:
# Load the metric for evaluation
metric = evaluate.load("seqeval")

def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)
    true_predictions = [
        [id_to_label[pred] for (pred, label) in zip(prediction, label) if label != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [id_to_label[label] for (pred, label) in zip(prediction, label) if label != -100]
        for prediction, label in zip(predictions, labels)
    ]
    results = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }


In [11]:
model = AutoModelForTokenClassification.from_pretrained("bert-base-multilingual-cased", num_labels=len(label_list))
model.to(device)


Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForTokenClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(119547, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-1

In [12]:
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=64,
    per_device_eval_batch_size=64,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    save_total_limit=2
)


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


In [13]:
data_collator = DataCollatorForTokenClassification(tokenizer)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)


In [14]:
trainer.train()


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.2952,0.265711,0.715424,0.622853,0.665937,0.919136
2,0.2486,0.252083,0.721036,0.637979,0.67697,0.921439
3,0.2068,0.253372,0.704872,0.650684,0.676695,0.920898


TrainOutput(global_step=3735, training_loss=0.27468724043334186, metrics={'train_runtime': 989.0494, 'train_samples_per_second': 241.553, 'train_steps_per_second': 3.776, 'total_flos': 1.15112026864878e+16, 'train_loss': 0.27468724043334186, 'epoch': 3.0})

In [15]:
eval_results = trainer.evaluate()
print(f"Evaluation results: {eval_results}")


Evaluation results: {'eval_loss': 0.25337204337120056, 'eval_precision': 0.7048723772442137, 'eval_recall': 0.6506839057507987, 'eval_f1': 0.6766950472432769, 'eval_accuracy': 0.9208976538986301, 'eval_runtime': 34.0968, 'eval_samples_per_second': 583.896, 'eval_steps_per_second': 9.15, 'epoch': 3.0}


In [18]:
model.save_pretrained("mbert-azerbaijani-ner")
tokenizer.save_pretrained("mbert-azerbaijani-ner")


('mbert-azerbaijani-ner/tokenizer_config.json',
 'mbert-azerbaijani-ner/special_tokens_map.json',
 'mbert-azerbaijani-ner/vocab.txt',
 'mbert-azerbaijani-ner/added_tokens.json',
 'mbert-azerbaijani-ner/tokenizer.json')

In [19]:
model = AutoModelForTokenClassification.from_pretrained("mbert-azerbaijani-ner")
tokenizer = AutoTokenizer.from_pretrained("mbert-azerbaijani-ner")


In [21]:
import torch
from transformers import AutoModelForTokenClassification, AutoTokenizer

# Load the trained model and tokenizer
model = AutoModelForTokenClassification.from_pretrained("mbert-azerbaijani-ner")
tokenizer = AutoTokenizer.from_pretrained("mbert-azerbaijani-ner")

# Sample input text
text = "Azərbaycanın paytaxtı Bakı, Xəzər dənizi sahilində yerləşir."

# Tokenize the input text
inputs = tokenizer(text, return_tensors="pt", truncation=True)

# Make predictions
with torch.no_grad():
    outputs = model(**inputs)

# Extract logits and predicted token class IDs
logits = outputs.logits
predicted_token_class_ids = logits.argmax(dim=-1).squeeze().tolist()
tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"].squeeze())

# Map predictions to label names
label_to_id = {
    "O": 0, "PERSON": 1, "LOCATION": 2, "ORGANISATION": 3, "DATE": 4, "TIME": 5, "MONEY": 6,
    "PERCENTAGE": 7, "FACILITY": 8, "PRODUCT": 9, "EVENT": 10, "ART": 11, "LAW": 12, "LANGUAGE": 13,
    "GPE": 14, "NORP": 15, "ORDINAL": 16, "CARDINAL": 17, "DISEASE": 18, "CONTACT": 19, "ADAGE": 20,
    "QUANTITY": 21, "MISCELLANEOUS": 22, "POSITION": 23, "PROJECT": 24
}
id_to_label = {v: k for k, v in label_to_id.items()}
predicted_labels = [id_to_label[label_id] for label_id in predicted_token_class_ids]

# Combine subwords and print only complete tokens with labels
word_label_pairs = []
current_word = ""
current_label = None

for token, label in zip(tokens, predicted_labels):
    if token.startswith("##"):
        # Append subword to the current word
        current_word += token[2:]
    else:
        # Print the completed word and label before starting a new word
        if current_word:
            word_label_pairs.append((current_word, current_label))
        # Start a new word
        current_word = token
        current_label = label

# Append the last word
if current_word:
    word_label_pairs.append((current_word, current_label))

# Display the combined words and their labels
for word, label in word_label_pairs:
    print(f"{word}: {label}")


[CLS]: O
Azərbaycanın: GPE
paytaxtı: POSITION
Bakı: GPE
,: O
Xəzər: LOCATION
dənizi: LOCATION
sahilində: O
yerləşir: O
.: O
[SEP]: O
