In [None]:
!pip install transformers datasets seqeval huggingface_hub




In [None]:
from huggingface_hub import login
login(token="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")
from datasets import load_dataset
from transformers import AutoTokenizer, DataCollatorForTokenClassification, TrainingArguments, Trainer, AutoModelForTokenClassification, get_linear_schedule_with_warmup, EarlyStoppingCallback
from torch.optim import AdamW
from torch.amp import GradScaler
import torch
import numpy as np
from seqeval.metrics import precision_score, recall_score, f1_score, classification_report
import ast


import os
os.environ["WANDB_DISABLED"] = "true"

import warnings
warnings.filterwarnings("ignore")



The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: fineGrained).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [None]:
dataset = load_dataset("LocalDoc/azerbaijani-ner-dataset")
print(dataset)


DatasetDict({
    train: Dataset({
        features: ['index', 'tokens', 'ner_tags'],
        num_rows: 99545
    })
})


In [None]:
tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-base")

def preprocess_example(example):
    try:
        # Convert string representations of lists into actual lists
        example["tokens"] = ast.literal_eval(example["tokens"])
        example["ner_tags"] = list(map(int, ast.literal_eval(example["ner_tags"])))
    except (ValueError, SyntaxError) as e:
        # Handle the error: skip or provide default values if data is malformed
        print(f"Skipping malformed example: {example['index']} due to error: {e}")
        example["tokens"] = []
        example["ner_tags"] = []
    return example

# Apply the preprocessing step
dataset = dataset.map(preprocess_example)


In [None]:
def tokenize_and_align_labels(example):
    # Tokenize the tokens in the single example
    tokenized_inputs = tokenizer(
        example["tokens"],  # Pass tokens directly
        truncation=True,
        is_split_into_words=True,
        padding="max_length",
        max_length=128,
    )

    labels = []
    word_ids = tokenized_inputs.word_ids()
    previous_word_idx = None
    for word_idx in word_ids:
        if word_idx is None:
            labels.append(-100)
        elif word_idx != previous_word_idx:
            # Ensure word_idx does not exceed the length of ner_tags
            if word_idx < len(example["ner_tags"]):
                labels.append(example["ner_tags"][word_idx])
            else:
                labels.append(-100)
        else:
            labels.append(-100)
        previous_word_idx = word_idx

    tokenized_inputs["labels"] = labels
    return tokenized_inputs


In [None]:
tokenized_datasets = dataset.map(tokenize_and_align_labels, batched=False)

# Create a 90-10 split for training and validation
tokenized_datasets = tokenized_datasets["train"].train_test_split(test_size=0.1)

print(tokenized_datasets)


DatasetDict({
    train: Dataset({
        features: ['index', 'tokens', 'ner_tags', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 89590
    })
    test: Dataset({
        features: ['index', 'tokens', 'ner_tags', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 9955
    })
})


In [17]:
label_list = [
    "O",                   # Outside of any entity
    "B-PERSON", "I-PERSON",           # Person names
    "B-LOCATION", "I-LOCATION",       # Locations
    "B-ORGANISATION", "I-ORGANISATION", # Organizations
    "B-DATE", "I-DATE",               # Dates
    "B-TIME", "I-TIME",               # Times
    "B-MONEY", "I-MONEY",             # Monetary values
    "B-PERCENTAGE", "I-PERCENTAGE",   # Percentages
    "B-FACILITY", "I-FACILITY",       # Facilities
    "B-PRODUCT", "I-PRODUCT",         # Products
    "B-EVENT", "I-EVENT",             # Events
    "B-ART", "I-ART",                 # Artworks
    "B-LAW", "I-LAW",                 # Legal documents
    "B-LANGUAGE", "I-LANGUAGE",       # Languages
    "B-GPE", "I-GPE",                 # Geopolitical entities
    "B-NORP", "I-NORP",               # Nationalities or groups
    "B-ORDINAL", "I-ORDINAL",         # Ordinal numbers
    "B-CARDINAL", "I-CARDINAL",       # Cardinal numbers
    "B-DISEASE", "I-DISEASE",         # Diseases
    "B-CONTACT", "I-CONTACT",         # Contact info
    "B-ADAGE", "I-ADAGE",             # Sayings
    "B-QUANTITY", "I-QUANTITY",       # Quantities
    "B-MISCELLANEOUS", "I-MISCELLANEOUS", # Miscellaneous entities
    "B-POSITION", "I-POSITION",       # Positions
    "B-PROJECT", "I-PROJECT"          # Projects
]

In [None]:

# Initialize the data collator
data_collator = DataCollatorForTokenClassification(tokenizer)

# Load the model with the correct number of labels
model = AutoModelForTokenClassification.from_pretrained(
    "xlm-roberta-base",
    num_labels=25,  # Ensure this matches the number of unique labels in `label_list`
)

# Custom metric function for evaluation
def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    # Map back to the label list, skipping ignored tokens (-100)
    true_labels = [[label_list[l] for l in label if l != -100] for label in labels]
    true_predictions = [
        [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    # Compute metrics
    precision = precision_score(true_labels, true_predictions)
    recall = recall_score(true_labels, true_predictions)
    f1 = f1_score(true_labels, true_predictions)

    # Optionally print a classification report
    print(classification_report(true_labels, true_predictions))

    return {
        "precision": precision,
        "recall": recall,
        "f1": f1,
    }

# Define training arguments with optimized parameters
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=1e-5,                    # Adjusted learning rate for stability
    per_device_train_batch_size=64,        # Adjusted for memory optimization
    per_device_eval_batch_size=64,
    num_train_epochs=8,                    # Adjusted to reduce overfitting
    weight_decay=0.01,
    fp16=True,
    gradient_accumulation_steps=4,         # Higher accumulation for smaller batch size
    logging_dir='./logs',
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    report_to="none"                       # Disables logging integrations (e.g., WANDB)
)

# Initialize optimizer
optimizer = AdamW(model.parameters(), lr=training_args.learning_rate)

# Learning rate scheduler setup
num_training_steps = len(tokenized_datasets["train"]) // training_args.per_device_train_batch_size * training_args.num_train_epochs
lr_scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=int(0.1 * num_training_steps),  # 10% warmup steps
    num_training_steps=num_training_steps
)

# Define the Trainer with early stopping callback
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    optimizers=(optimizer, lr_scheduler),  # Pass optimizer and scheduler
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]  # Early stopping with patience of 2
)

training_metrics = trainer.train()


Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,No log,0.492728,0.716705,0.536168,0.613429
2,1.437600,0.316428,0.750777,0.673112,0.709826
3,0.329100,0.287577,0.759884,0.705648,0.731763
4,0.329100,0.272844,0.746619,0.727007,0.736683
5,0.276700,0.263447,0.760019,0.727704,0.743511
6,0.253900,0.257133,0.760282,0.732646,0.746208
7,0.253900,0.253817,0.787151,0.716427,0.750126
8,0.238800,0.260643,0.74496,0.73782,0.741373


              precision    recall  f1-score   support

         ART       0.00      0.00      0.00      1882
        DATE       0.00      0.00      0.00       835
       EVENT       0.00      0.00      0.00        74
    FACILITY       0.00      0.00      0.00      1183
         LAW       0.00      0.00      0.00      1140
    LOCATION       0.66      0.69      0.67      8965
       MONEY       0.00      0.00      0.00       552
ORGANISATION       0.00      0.00      0.00       498
  PERCENTAGE       0.62      0.81      0.70      3652
      PERSON       0.89      0.71      0.79      7138
     PRODUCT       0.73      0.73      0.73      2598
        TIME       0.00      0.00      0.00      1634

   micro avg       0.72      0.54      0.61     30151
   macro avg       0.24      0.25      0.24     30151
weighted avg       0.54      0.54      0.54     30151

              precision    recall  f1-score   support

         ART       0.52      0.10      0.16      1882
        DATE       0.39 

In [None]:
eval_results = trainer.evaluate()
print(eval_results)


              precision    recall  f1-score   support

         ART       0.47      0.16      0.24      1882
        DATE       0.57      0.37      0.45       835
       EVENT       0.70      0.35      0.47        74
    FACILITY       0.73      0.67      0.70      1183
         LAW       0.62      0.57      0.60      1140
    LOCATION       0.81      0.79      0.80      8965
       MONEY       0.57      0.54      0.56       552
ORGANISATION       0.69      0.68      0.68       498
  PERCENTAGE       0.79      0.81      0.80      3652
      PERSON       0.89      0.81      0.85      7138
     PRODUCT       0.82      0.85      0.83      2598
        TIME       0.61      0.46      0.53      1634

   micro avg       0.79      0.72      0.75     30151
   macro avg       0.69      0.59      0.63     30151
weighted avg       0.77      0.72      0.74     30151

{'eval_loss': 0.25381746888160706, 'eval_precision': 0.7871510822826324, 'eval_recall': 0.7164273158435873, 'eval_f1': 0.750125883353

In [None]:
# Define a path to save the model
save_directory = "./XLM-RoBERTa"

# Save the model
model.save_pretrained(save_directory)

# Save the tokenizer
tokenizer.save_pretrained(save_directory)


('./XLM-RoBERTa/tokenizer_config.json',
 './XLM-RoBERTa/special_tokens_map.json',
 './XLM-RoBERTa/sentencepiece.bpe.model',
 './XLM-RoBERTa/added_tokens.json',
 './XLM-RoBERTa/tokenizer.json')

In [18]:
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
from seqeval.metrics import precision_score, recall_score, f1_score, classification_report
import torch

# Load tokenizer and model
tokenizer = AutoTokenizer.from_pretrained("./XLM-RoBERTa")
model = AutoModelForTokenClassification.from_pretrained("./XLM-RoBERTa")

# Initialize the NER pipeline with GPU if available
device = 0 if torch.cuda.is_available() else -1
nlp_ner = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple", device=device)

# Define label mapping based on the label list
label_mapping = {f"LABEL_{i}": label for i, label in enumerate(label_list) if label != "O"}

# Define the function to prepare test data and evaluate
def evaluate_model(test_texts, true_labels):
    predictions = []
    for text in test_texts:
        pred_entities = nlp_ner(text)

        # Map predicted labels using the label_mapping and filter out non-entities
        pred_labels = [label_mapping.get(entity["entity_group"], "O") for entity in pred_entities if entity["entity_group"] in label_mapping]
        predictions.append(pred_labels)

    # Calculate metrics if lengths match, otherwise notify about inconsistency
    if len(true_labels) == len(predictions):
        precision = precision_score(true_labels, predictions)
        recall = recall_score(true_labels, predictions)
        f1 = f1_score(true_labels, predictions)

        print("Precision:", precision)
        print("Recall:", recall)
        print("F1-Score:", f1)
        print(classification_report(true_labels, predictions))
    else:
        print("Warning: Inconsistent number of samples between true labels and predictions.")
        print(f"True Labels: {true_labels}")
        print(f"Predictions: {predictions}")

# Example test data
test_texts = ["Shahla Khuduyeva və Pasha Sığorta şirkəti haqqında məlumat."]
true_labels = [["B-PERSON", "B-ORGANISATION"]]  # True labels for the sample text
evaluate_model(test_texts, true_labels)


Precision: 0.5
Recall: 0.5
F1-Score: 0.5
              precision    recall  f1-score   support

    LOCATION       0.00      0.00      0.00         0
ORGANISATION       0.00      0.00      0.00         1
      PERSON       1.00      1.00      1.00         1

   micro avg       0.50      0.50      0.50         2
   macro avg       0.33      0.33      0.33         2
weighted avg       0.50      0.50      0.50         2



In [20]:
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
from seqeval.metrics import precision_score, recall_score, f1_score, classification_report
import torch

# Load tokenizer and model
tokenizer = AutoTokenizer.from_pretrained("./XLM-RoBERTa")
model = AutoModelForTokenClassification.from_pretrained("./XLM-RoBERTa")

# Initialize the NER pipeline with GPU if available
device = 0 if torch.cuda.is_available() else -1
nlp_ner = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple", device=device)

# Define label mapping based on the label list
label_list = [
    "O", "B-PERSON", "I-PERSON", "B-LOCATION", "I-LOCATION",
    "B-ORGANISATION", "I-ORGANISATION", "B-DATE", "I-DATE",
    "B-TIME", "I-TIME", "B-MONEY", "I-MONEY", "B-PERCENTAGE",
    "I-PERCENTAGE", "B-FACILITY", "I-FACILITY", "B-PRODUCT",
    "I-PRODUCT", "B-EVENT", "I-EVENT", "B-ART", "I-ART",
    "B-LAW", "I-LAW", "B-LANGUAGE", "I-LANGUAGE", "B-GPE",
    "I-GPE", "B-NORP", "I-NORP", "B-ORDINAL", "I-ORDINAL",
    "B-CARDINAL", "I-CARDINAL", "B-DISEASE", "I-DISEASE",
    "B-CONTACT", "I-CONTACT", "B-ADAGE", "I-ADAGE",
    "B-QUANTITY", "I-QUANTITY", "B-MISCELLANEOUS", "I-MISCELLANEOUS",
    "B-POSITION", "I-POSITION", "B-PROJECT", "I-PROJECT"
]
label_mapping = {f"LABEL_{i}": label for i, label in enumerate(label_list) if label != "O"}

# Define the function to prepare test data and evaluate
def evaluate_model(test_texts, true_labels):
    predictions = []
    for i, text in enumerate(test_texts):
        pred_entities = nlp_ner(text)

        # Map predicted labels using the label_mapping and filter out non-entities
        pred_labels = [label_mapping.get(entity["entity_group"], "O") for entity in pred_entities if entity["entity_group"] in label_mapping]

        # Adjust prediction length to match the true labels
        if len(pred_labels) != len(true_labels[i]):
            print(f"Warning: Inconsistent number of entities in sample {i+1}. Adjusting predicted entities.")
            pred_labels = pred_labels[:len(true_labels[i])]

        predictions.append(pred_labels)

    # Calculate metrics if lengths match, otherwise notify about inconsistency
    if all(len(true) == len(pred) for true, pred in zip(true_labels, predictions)):
        precision = precision_score(true_labels, predictions)
        recall = recall_score(true_labels, predictions)
        f1 = f1_score(true_labels, predictions)

        print("Precision:", precision)
        print("Recall:", recall)
        print("F1-Score:", f1)
        print(classification_report(true_labels, predictions))
    else:
        print("Error: Could not align all samples correctly for evaluation.")
        print(f"True Labels: {true_labels}")
        print(f"Predictions: {predictions}")

# Example test data
test_texts = [
    "Shahla Khuduyeva və Pasha Sığorta şirkəti haqqında məlumat.",
    "İlham Əliyev Bakıda keçirilən konfransda çıxış etdi.",
    "Azərbaycan Respublikası Müdafiə Nazirliyi yeni layihə təqdim etdi.",
    "Neftçala şəhərində 10 milyondan çox pul sərf edildi.",
    "2023-cü il avqust ayında bu qərar elan edildi."
]
true_labels = [
    ["B-PERSON", "B-ORGANISATION"],
    ["B-PERSON", "B-LOCATION"],
    ["B-ORGANISATION"],
    ["B-LOCATION", "B-MONEY"],
    ["B-DATE"]
]

# Evaluate the model
evaluate_model(test_texts, true_labels)


Precision: 0.25
Recall: 0.25
F1-Score: 0.25
              precision    recall  f1-score   support

        DATE       0.00      0.00      0.00         1
    LOCATION       0.00      0.00      0.00         2
       MONEY       0.00      0.00      0.00         1
ORGANISATION       0.00      0.00      0.00         2
  PERCENTAGE       0.00      0.00      0.00         0
      PERSON       1.00      1.00      1.00         2
     PRODUCT       0.00      0.00      0.00         0

   micro avg       0.25      0.25      0.25         8
   macro avg       0.14      0.14      0.14         8
weighted avg       0.25      0.25      0.25         8

