#### `AutoTokenizer` converts text into the token format the model expects; `AutoModelForTokenClassification` loads a pre-trained model for NER tasks with the specified number of labels.

Step 1: Load the Dataset

In [12]:
import pandas as pd
from sklearn.model_selection import train_test_split

def load_conll_format(file_path):
    """Read a two-column, whitespace-separated CoNLL file into a DataFrame.

    Each non-blank line is expected to hold exactly ``token label``. Blank
    lines mark sentence boundaries. Lines that do not split into exactly two
    fields are reported on stdout and ignored.

    Args:
        file_path: Path of the UTF-8 encoded file to read.

    Returns:
        A ``pandas.DataFrame`` with one row per sentence and two columns,
        ``tokens`` and ``labels``, each holding a list of strings.
    """
    records = []
    current_tokens, current_tags = [], []

    with open(file_path, 'r', encoding='utf-8') as handle:
        for raw_line in handle:
            stripped = raw_line.strip()

            if not stripped:
                # Blank line: close the sentence in progress, if there is one.
                if current_tokens:
                    records.append((current_tokens, current_tags))
                    current_tokens, current_tags = [], []
                continue

            fields = stripped.split()
            if len(fields) != 2:
                # Malformed row; surface it so annotation problems are visible.
                print(f"Skipping line: {stripped}")
                continue

            word, tag = fields
            current_tokens.append(word)
            current_tags.append(tag)

    # The file may end without a trailing blank line.
    if current_tokens:
        records.append((current_tokens, current_tags))

    return pd.DataFrame(records, columns=['tokens', 'labels'])

# Parse the annotated file into one row per sentence.
df = load_conll_format("telegram_data_conll_format_51_101.txt")

# Hold out 20% of the sentences for validation; the fixed seed keeps the
# split identical across runs.
train_df, val_df = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
)

In [13]:
train_df

Unnamed: 0,tokens,labels
12,"[15, 540, ለፀጉሮ, እና, ለፂሞ, ዕድገት, ለፊቶ, ጥራት, የፊት, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
4,"[5, 1, 5, 2, 2, 600, 1, 1, 5500, ብር, ከ, ነፃ, ማድ...","[O, O, O, O, O, O, O, O, B-PRICE, I-PRICE, O, ..."
37,"[10, 450, ብር, ውስን, ፍሬ, ነው, የቀረው, ️ጥራት, ዋስትና, ቅ...","[O, B-PRICE, I-PRICE, O, O, O, O, O, O, O, O, ..."
8,"[️500, ያለን, ግማሽ, ሊትር, 500ሚሊ, ብቻ, ነው, ያለው, ከለር,...","[O, O, O, O, O, O, O, O, O, O, O, B-PRICE, I-P..."
3,"[5, 1, አምስት, በአንድ, ለአጠቃቀም, ቀላል, ውበትን, የሚያላበስ, ...","[O, O, O, O, O, O, O, O, O, O, B-PRODUCT, O, I..."
6,"[100ኦርጂናል, ጋርመንት, ስቲመር, ️የራሱ, መተኮሻ, ቦርድ, የተገጠመ...","[B-PRODUCT, I-PRODUCT, I-PRODUCT, O, O, O, O, ..."
41,"[3, 3, ፍሬ, የልጆች, ጡጦ, 1984, 0, 260, 3, ፍሬ, 1400...","[O, O, B-PRODUCT, I-PRODUCT, I-PRODUCT, O, O, ..."
46,"[⏱⏱, በመጠቀም, የእግሮን, ጤና, እና, ልስላሴ, ይጠብቁ, ️ብዙ, ጊዜ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
47,"[100, ️360°, 250, 15, 220240, 2000, ዋጋ, 2700, ...","[O, O, O, O, O, O, B-PRICE, I-PRICE, I-PRICE, ..."
15,"[34, 1500, 4500, ብር, ከ, ነፃ, ማድረስ, ጋር, ለማዘዝ, ዲሊ...","[O, O, B-PRICE, I-PRICE, O, O, O, O, O, O, O, ..."


In [14]:
# Create a label mapping from string labels to integer IDs.
# The unique labels are sorted because the iteration order of a set of
# strings can change between interpreter runs (string hashing is randomized),
# which would assign different IDs to the same label after a kernel restart
# and make a saved model's predictions impossible to decode reliably.
label_list = sorted({label for sublist in df['labels'] for label in sublist})
label_to_id = {label: i for i, label in enumerate(label_list)}
# Inverse mapping, derived from label_to_id so the two can never disagree.
id_to_label = {i: label for label, i in label_to_id.items()}

Step 2: Tokenize the Data

In [18]:
def tokenize_and_align_labels(dataframe, tokenizer, label_to_id, label_all_tokens=True):
    """Tokenize pre-split sentences and align word-level labels to sub-tokens.

    Args:
        dataframe: Object indexable by ``'tokens'`` and ``'labels'``; each
            entry is a list of words / a list of label strings per sentence.
        tokenizer: Callable invoked with the list of word lists. Its result
            must support ``word_ids(batch_index=i)``, item access by key and
            item assignment (a fast Hugging Face tokenizer is assumed).
        label_to_id: Mapping from label string to integer ID. Labels missing
            from the mapping are encoded as ``-100``.
        label_all_tokens: When ``True`` (default, the historical behaviour)
            every sub-token of a word receives the word's label. When
            ``False`` only the first sub-token of each word is labelled and
            the remaining sub-tokens get ``-100``.

    Returns:
        The tokenizer output with an added ``"labels"`` entry: one list of
        integer IDs per sentence, the same length as that sentence's
        ``input_ids``. Positions with no word (special tokens, padding) and
        positions whose word index has no label are ``-100``.

    Raises:
        ValueError: If ``input_ids``, ``attention_mask`` or ``labels`` is
            missing from the result.
    """
    tokenized_inputs = tokenizer(
        list(dataframe['tokens']),
        truncation=True,
        is_split_into_words=True,
        padding=True,
        return_tensors='pt'  # Use PyTorch tensors
    )

    labels = []
    for i, word_labels in enumerate(dataframe['labels']):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        # Start from "ignore everything" and fill in the positions we can label.
        label_ids = [-100] * len(tokenized_inputs['input_ids'][i])
        previous_word_id = None
        for position, word_id in enumerate(word_ids):
            is_continuation = word_id == previous_word_id
            previous_word_id = word_id
            # Skip positions with no source word or no label for that word.
            if word_id is None or word_id >= len(word_labels):
                continue
            # Optionally leave non-initial sub-tokens of a word unlabelled.
            if is_continuation and not label_all_tokens:
                continue
            label_ids[position] = label_to_id.get(word_labels[word_id], -100)
        labels.append(label_ids)

    tokenized_inputs["labels"] = labels

    # Ensure all expected keys are present
    expected_keys = ['input_ids', 'attention_mask', 'labels']
    for key in expected_keys:
        if key not in tokenized_inputs:
            raise ValueError(f"Key {key} not found in tokenized inputs.")

    return tokenized_inputs

In [23]:
# Ensure this section has been executed first
from transformers import AutoTokenizer, AutoModelForTokenClassification

# Multilingual checkpoints to compare on the NER task.
model_names = [
    "xlm-roberta-base",
    "distilbert-base-multilingual-cased",
    "bert-base-multilingual-cased"
]

# Initialize dictionaries to hold tokenizers and models, keyed by checkpoint name
tokenizers = {}
models = {}

for model_name in model_names:
    tokenizers[model_name] = AutoTokenizer.from_pretrained(model_name)
    # Store the label names in the model config as well as the label count,
    # so a saved model reports real tag names instead of generic LABEL_n ids.
    models[model_name] = AutoModelForTokenClassification.from_pretrained(
        model_name,
        num_labels=len(label_to_id),
        id2label=id_to_label,
        label2id=label_to_id,
    )

ImportError: 
AutoModelForTokenClassification requires the PyTorch library but it was not found in your environment. Checkout the instructions on the
installation page: https://pytorch.org/get-started/locally/ and follow the ones that match your environment.
Please note that you may need to restart your runtime after installation.


Step 3: Prepare the Dataset for Training

Step 4: Set Up Training Arguments

In [22]:
from transformers import TrainingArguments

# Hyper-parameters shared by every model in the comparison.
training_args = TrainingArguments(
    output_dir="./results",
    # Optimisation schedule
    num_train_epochs=3,
    learning_rate=2e-5,
    weight_decay=0.01,
    # Batch sizes (per device)
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # Evaluate once per epoch
    evaluation_strategy="epoch",
)



Step 5: Fine-tune the Model

In [41]:
from transformers import Trainer


class _EncodedNERDataset:
    """Map-style dataset over the output of ``tokenize_and_align_labels``.

    Item ``idx`` is a dict holding the ``idx``-th entry of every field in the
    encoding (``input_ids``, ``attention_mask``, ``labels``, ...).
    """

    def __init__(self, encodings):
        self.encodings = encodings

    def __len__(self):
        return len(self.encodings["input_ids"])

    def __getitem__(self, idx):
        return {key: values[idx] for key, values in self.encodings.items()}


def compute_metrics(eval_pred):
    """Return token-level accuracy over positions whose label is not -100.

    The Trainer prefixes the returned key with ``eval_``, so this shows up as
    ``eval_accuracy`` in the evaluation results.
    """
    logits, gold = eval_pred
    predicted = logits.argmax(axis=-1)
    keep = gold != -100  # padding / special-token positions carry no label
    total = int(keep.sum())
    correct = int((predicted[keep] == gold[keep]).sum())
    return {"accuracy": correct / total if total else 0.0}


results = {}

for model_name in model_names:
    # Each checkpoint has its own vocabulary, so the data must be encoded with
    # the tokenizer that belongs to the model being trained. Feeding token ids
    # from another tokenizer can index past the embedding table, which is a
    # plausible cause of a CUDA device-side assert (not confirmed here).
    tokenizer = tokenizers[model_name]
    train_dataset = _EncodedNERDataset(
        tokenize_and_align_labels(train_df, tokenizer, label_to_id)
    )
    val_dataset = _EncodedNERDataset(
        tokenize_and_align_labels(val_df, tokenizer, label_to_id)
    )

    trainer = Trainer(
        model=models[model_name],
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        compute_metrics=compute_metrics,
    )

    trainer.train()
    eval_result = trainer.evaluate()
    results[model_name] = eval_result

RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


Step 6: Evaluate the Model

In [None]:
import numpy as np

# Report the evaluation metrics collected for each model.
# 'eval_accuracy' only exists when the Trainer was given a compute_metrics
# function, so read both values defensively instead of raising KeyError.
for model_name, metrics in results.items():
    accuracy = metrics.get('eval_accuracy', 'n/a')
    loss = metrics.get('eval_loss', 'n/a')
    print(f"{model_name} - Accuracy: {accuracy}, Loss: {loss}")

{'eval_loss': 0.37177950143814087,
 'eval_runtime': 0.2086,
 'eval_samples_per_second': 47.939,
 'eval_steps_per_second': 4.794,
 'epoch': 30.0}

Step 7: Save the Model

In [None]:
# Pick the model to ship. Prefer the highest evaluation accuracy; if any run
# lacks that metric (no compute_metrics was supplied to the Trainer), fall
# back to the lowest evaluation loss rather than raising KeyError.
if all('eval_accuracy' in metrics for metrics in results.values()):
    best_model_name = max(results, key=lambda x: results[x]['eval_accuracy'])
else:
    best_model_name = min(results, key=lambda x: results[x]['eval_loss'])
print(f"Best model for production: {best_model_name}")

In [None]:
# Write the winning model and its tokenizer to the same directory so the
# pair can be reloaded together.
save_dir = f"./fine_tuned_{best_model_name}"
best_model = models[best_model_name]
best_model.save_pretrained(save_dir)
tokenizers[best_model_name].save_pretrained(save_dir)