In [1]:
!pip install transformers datasets seqeval accelerate

Collecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/43.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-non

In [6]:
from IPython import get_ipython
from IPython.display import display
# %%
# Dependencies (transformers, datasets, seqeval, accelerate) are installed by the
# install cell at the top of the notebook; they are not reinstalled here.
# %%
import pandas as pd
def load_conll_data(filepath):
    """Read a whitespace-separated CoNLL-style file into parallel token/tag lists.

    Every non-blank line is expected to carry a token in its first column and
    the token's tag in its second column; any further columns are ignored.
    Blank (whitespace-only) lines terminate the current sentence. Lines with
    fewer than two columns are skipped without raising.

    Args:
        filepath: Path of the text file to read.

    Returns:
        A ``(sentences, labels)`` tuple of equal-length lists, where
        ``sentences[i]`` is the list of tokens of the i-th sentence and
        ``labels[i]`` is the list of tags aligned with those tokens.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file content is not valid UTF-8.
    """
    sentences = []
    labels = []
    sentence, label = [], []

    # "utf-8-sig" decodes plain UTF-8 unchanged but also drops a leading
    # byte-order mark, which would otherwise end up glued to the first token.
    with open(filepath, encoding='utf-8-sig') as f:
        for line in f:
            parts = line.split()
            if not parts:
                # Sentence boundary. Consecutive blank lines must not produce
                # empty sentences, hence the guard.
                if sentence:
                    sentences.append(sentence)
                    labels.append(label)
                    sentence, label = [], []
                continue

            # Only the first two columns are used; shorter lines are ignored.
            if len(parts) >= 2:
                sentence.append(parts[0])
                label.append(parts[1])

    # Flush the last sentence when the file does not end with a blank line.
    if sentence:
        sentences.append(sentence)
        labels.append(label)

    return sentences, labels

# Parse the annotated corpus into two parallel lists: one token list and one
# tag list per sentence.
sentences, tags = load_conll_data("amharic_ner_conll.txt")




In [7]:
from transformers import AutoTokenizer

model_checkpoint = "xlm-roberta-base"  # or "Davlan/bert-tiny-amharic"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

# Sort the distinct tags so that every run assigns the same id to the same tag.
# Iterating a bare set of strings can yield a different order from one Python
# process to the next, which would silently change the label <-> id mapping.
label_list = sorted({tag for sent in tags for tag in sent})
label2id = {label: i for i, label in enumerate(label_list)}
id2label = {i: label for label, i in label2id.items()}

def tokenize_and_align_labels(sentences, tags):
    """Tokenize pre-split sentences and attach per-token label ids.

    The module-level ``tokenizer`` is applied to ``sentences`` (lists of words)
    with truncation and padding enabled. For every sentence a label sequence of
    the same length as its token sequence is built: the first token produced
    for a word receives ``label2id`` of that word's tag, and every other
    position (tokens with no source word, and later pieces of an
    already-labelled word) receives ``-100``.

    Args:
        sentences: List of sentences, each a list of word strings.
        tags: List of tag sequences, parallel to ``sentences``.

    Returns:
        The tokenizer output with an additional ``"labels"`` entry holding one
        list of label ids per sentence.
    """
    encoded = tokenizer(sentences, is_split_into_words=True, truncation=True, padding=True)

    def _aligned_ids(batch_index, word_tags):
        # Walk the token -> word mapping and label only the first piece of each word.
        aligned = []
        last_word = None
        for word in encoded.word_ids(batch_index=batch_index):
            starts_new_word = word is not None and word != last_word
            aligned.append(label2id[word_tags[word]] if starts_new_word else -100)
            last_word = word
        return aligned

    encoded["labels"] = [
        _aligned_ids(batch_index, word_tags) for batch_index, word_tags in enumerate(tags)
    ]
    return encoded


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

In [8]:
from datasets import Dataset

# One row per sentence: the word list and its parallel tag list.
df = pd.DataFrame({"tokens": sentences, "ner_tags": tags})
dataset = Dataset.from_pandas(df)

# Tokenize in batches; each batch is handed over as a dict of column lists.
tokenized_dataset = dataset.map(
    lambda batch: tokenize_and_align_labels(batch["tokens"], batch["ner_tags"]),
    batched=True,
)


Map:   0%|          | 0/30 [00:00<?, ? examples/s]

In [18]:
# Keep the evaluation metrics in `results` instead of discarding them, so they
# can be inspected or printed afterwards.
results = trainer.evaluate()

# Persist the fine-tuned model and the tokenizer side by side in one directory.
trainer.save_model("amharic-ner-model")
tokenizer.save_pretrained("amharic-ner-model")



('amharic-ner-model/tokenizer_config.json',
 'amharic-ner-model/special_tokens_map.json',
 'amharic-ner-model/sentencepiece.bpe.model',
 'amharic-ner-model/added_tokens.json',
 'amharic-ner-model/tokenizer.json')

In [16]:
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer
from seqeval.metrics import f1_score # Import f1_score here
import numpy as np # Import numpy for argmax and array handling

# Pretrained encoder with a token-classification head sized to the tag set.
model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    num_labels=len(label_list),
    id2label=id2label,
    label2id=label2id,
)

training_args = TrainingArguments(
    # Where checkpoints and logs go; no external experiment tracker.
    output_dir="./results",
    logging_dir='./logs',
    report_to="none",
    # Evaluate and checkpoint once per epoch, log every 10 steps.
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_steps=10,
    # Optimisation settings.
    num_train_epochs=5,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
)

# Define the compute_metrics function
def compute_metrics(p):
    """Compute the macro-averaged seqeval F1 score for an evaluation pass.

    Args:
        p: Object exposing ``predictions`` (per-token class scores; the argmax
            is taken over axis 2) and ``label_ids`` (gold label ids, with
            ``-100`` marking positions to ignore).

    Returns:
        ``{"f1": value}``. When no sequence has any non-ignored label, a
        warning is printed and ``{"f1": 0.0}`` is returned.
    """
    scores, gold_ids = p.predictions, p.label_ids
    # Highest-scoring class id per token.
    predicted_ids = np.argmax(scores, axis=2)

    gold_tags = []
    predicted_tags = []
    for predicted_row, gold_row in zip(predicted_ids, gold_ids):
        # Keep only positions that carry a real label.
        kept = [(pred, gold) for pred, gold in zip(predicted_row, gold_row) if gold != -100]
        if not kept:
            # Sequences whose labels are all ignored contribute nothing.
            continue
        gold_tags.append([id2label[gold] for _, gold in kept])
        predicted_tags.append([id2label[pred] for pred, _ in kept])

    if not gold_tags:
        print("Warning: No valid labels found to compute metrics.")
        return {"f1": 0.0}

    return {"f1": f1_score(gold_tags, predicted_tags, average="macro")}


# NOTE: training and evaluation use the same data here, so the reported metrics
# measure fit to the training set rather than generalisation.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    eval_dataset=tokenized_dataset,  # use a real split if available
    # `tokenizer=` is deprecated in Trainer.__init__ and emits a FutureWarning;
    # `processing_class=` is its replacement in recent transformers releases.
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
)

Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


In [17]:
# Run fine-tuning with the settings held in `training_args`.
trainer.train()

Epoch,Training Loss,Validation Loss,F1
1,No log,3.41766,0.0
2,No log,3.378611,0.0
3,No log,3.347895,0.0
4,No log,3.328202,0.033333
5,3.403600,3.319797,0.066667




TrainOutput(global_step=10, training_loss=3.403605651855469, metrics={'train_runtime': 664.5698, 'train_samples_per_second': 0.226, 'train_steps_per_second': 0.015, 'total_flos': 1148567445000.0, 'train_loss': 3.403605651855469, 'epoch': 5.0})

In [21]:
# Switch to another checkpoint and load its tokenizer and a token-classification
# model with the same label mapping.
# NOTE(review): this cell rebinds `tokenizer` and `model` only; it does not
# re-tokenize the dataset or build a new Trainer for the new checkpoint.
model_checkpoint = "bert-base-multilingual-cased"  # Swap this with the model name in a loop
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    num_labels=len(label_list),
    label2id=label2id,
    id2label=id2label,
)


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/714M [00:00<?, ?B/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [23]:
# Print every evaluation metric on its own line as "name: value".
# NOTE(review): `results` is presumably the metrics dict returned by
# `trainer.evaluate()` — make sure it has been assigned before running this cell.
print("Evaluation Results:")
for key, value in results.items():
    print(f"{key}: {value}")

Evaluation Results:
eval_loss: 3.319796562194824
eval_f1: 0.06666666666666667
eval_runtime: 2.472
eval_samples_per_second: 12.136
eval_steps_per_second: 0.809
epoch: 5.0
