In [21]:
import pandas as pd
from sklearn.model_selection import train_test_split
def load_conll_format(file_path):
    with open(file_path, 'r', encoding='utf-8') as f:
        data = []
        sentence = []
        labels = []

        for line in f:
            line = line.strip()
            if line == "":
                if sentence:  # Only append if the sentence list is not empty
                    data.append((sentence, labels))
                    sentence = []
                    labels = []
            else:
                parts = line.split()
                if len(parts) == 2:  # Ensure there are exactly two parts
                    token, label = parts
                    sentence.append(token)
                    labels.append(label)
                else:
                    print(f"Skipping line: {line}")  # Optional: print to debug which lines are problematic

        if sentence:  # Append the last sentence if the file doesn't end with a newline
            data.append((sentence, labels))

    return pd.DataFrame(data, columns=['tokens', 'labels'])

df = load_conll_format("/kaggle/input/datset11/dataset.txt")

Skipping line: ️
Skipping line: ️
Skipping line: ️
Skipping line: ️
Skipping line: ️
Skipping line: ️
Skipping line: ግራንድI-LOC
Skipping line: ️
Skipping line: ️
Skipping line: ️


In [22]:
df

Unnamed: 0,tokens,labels
0,"[2, ሊትር, ፔርሙስ, ለቤት, ለቢሮ, ለሆቴል, አገልግሎት, መዋል, የሚ...","[O, O, O, O, O, O, O, O, O, O, B-PRODUCT, O, O..."
1,"[ቢላ, የስጋ, የአጥንት, 1000, ብር, ለማዘዝ, ውስን, ፍሬ, ነው, ...","[B-PRODUCT, O, O, B-PRICE, I-PRICE, O, O, O, O..."
2,"[ዋስትና, ቅናሽ, አድራሻ, ቁጥር, 1, መገናኛ, ዘፍመሽ, ግራንድ, ሞል...","[O, O, O, O, O, B-LOC, I-LOC, I-LOC, I-LOC, I-..."
3,"[2, ️, አንድ, ብሩሽ, እና, አንድ, ስፓቹላ, ዋጋ, 300, ብር, ለ...","[O, O, O, B-PRODUCT, O, O, B-PRODUCT, B-PRICE,..."
4,"[2, ️, አንድ, ብሩሽ, እና, አንድ, ስፓቹላ, ዋጋ, 300, ብር, ው...","[O, O, O, B-PRODUCT, O, O, B-PRODUCT, B-PRICE,..."
...,...,...
56,"[ዋጋ, 1000, ብር, ለማዘዝ, ውስን, ፍሬ, ነው, የቀረው, ️ጥራት, ...","[B-PRICE, I-PRICE, I-PRICE, O, O, O, O, O, O, ..."
57,"[ለቤቶ, ለስጦታ, በጥራት, በቅናሽ, ውስን, ፍሬ, የቀሩ, ዕቃወች, አሁ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
58,"[ለልጆ, ለስጦታ, በጥራት, በቅናሽ, ውስን, ፍሬ, የቀሩ, ዕቃወች, አሁ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
59,"[4, 1, 400, 10000, ብር, ከ, ነፃ, ማድረስ, ጋር, ለማዘዝ, ...","[O, O, O, B-PRICE, I-PRICE, O, O, O, O, O, O, ..."


In [35]:
from transformers import AutoTokenizer

model_name = 'bert-base-multilingual-cased'  # or "distilbert-base-multilingual-cased", "bert-base-multilingual-cased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
def tokenize_and_align_labels(dataframe):
    # Tokenize the inputs
    tokenized_inputs = tokenizer(
        list(dataframe['tokens']),
        truncation=True,
        is_split_into_words=True,
        padding=True,
        return_tensors="pt"
    )

    labels = []
    for i, label in enumerate(dataframe['labels']):
        # Get word IDs for the current batch
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        # Initialize label_ids with -100 for padding
        label_ids = [-100] * len(tokenized_inputs['input_ids'][i])

        for word_index in range(len(word_ids)):
            if word_ids[word_index] is not None:
                # Get the label for the current token
                current_label = label[word_ids[word_index]]
                if current_label in label_to_id:
                    label_ids[word_index] = label_to_id[current_label]  # Assign the corresponding label id
                else:
                    print(f"Warning: Label '{current_label}' not found in label_to_id. Using -100.")

        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

# Define label mappings
label_to_id = {
    "O": 0,
    "B-PRICE": 1,
    "I-PRICE": 2,
    "B-LOC": 3,
    "I-LOC": 4,
    "B-PRODUCT": 5,
    "I-PRODUCT": 6,
    # Add other labels as needed
}

# Tokenize the data
tokenized_data = tokenize_and_align_labels(df)

In [37]:
try:
    model = AutoModelForTokenClassification.from_pretrained(model_name, num_labels=len(label_to_id))
    print(f"Successfully loaded model: {model_name}")
except Exception as e:
    print(f"Error loading model: {e}")

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Successfully loaded model: bert-base-multilingual-cased


In [38]:
from transformers import Trainer, TrainingArguments, AutoModelForTokenClassification

# Load model
model = AutoModelForTokenClassification.from_pretrained(model_name, num_labels=len(label_to_id))

training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",  # Evaluate after every epoch
    logging_steps=10,  # Log after every 10 steps
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",  # Directory to save logs
    logging_first_step=True,  # Log the first step as well
)


Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [39]:
df = load_conll_format('/kaggle/input/datset11/dataset.txt')
tokenized_data = tokenize_and_align_labels(df)
from datasets import Dataset
# Convert to Hugging Face Dataset
dataset = Dataset.from_dict({
    'input_ids': tokenized_data['input_ids'],
    'attention_mask': tokenized_data['attention_mask'],
    'labels': tokenized_data['labels']
})

print(f"Number of samples in the dataset: {len(dataset)}")

if len(dataset) > 1:
    # Split the dataset for training and validation
    train_dataset, val_dataset = dataset.train_test_split(test_size=0.2).values()
else:
    # Use the entire dataset for training
    train_dataset = dataset
    val_dataset = dataset  # Or create a separate validation dataset if needed

# Set up Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset
)

# Train the model
trainer.train()

Skipping line: ️
Skipping line: ️
Skipping line: ️
Skipping line: ️
Skipping line: ️
Skipping line: ️
Skipping line: ግራንድI-LOC
Skipping line: ️
Skipping line: ️
Skipping line: ️
Number of samples in the dataset: 61


Epoch,Training Loss,Validation Loss
1,1.9738,0.640335
2,1.9738,0.57593
3,1.9738,0.565332


TrainOutput(global_step=9, training_loss=0.8299133910073174, metrics={'train_runtime': 114.8245, 'train_samples_per_second': 1.254, 'train_steps_per_second': 0.078, 'total_flos': 9333615433824.0, 'train_loss': 0.8299133910073174, 'epoch': 3.0})

In [28]:
eval_result = trainer.evaluate()
print(eval_result)

{'eval_loss': 1.445087194442749, 'eval_runtime': 7.7518, 'eval_samples_per_second': 1.677, 'eval_steps_per_second': 0.129, 'epoch': 3.0}


In [40]:
from transformers import AutoTokenizer, AutoModelForTokenClassification

# Model names for fine-tuning
model_names = {
    "xlm-roberta": "xlm-roberta-base",
    "distilbert": "distilbert-base-multilingual-cased",
    "mbert": "bert-base-multilingual-cased"
}

tokenizers = {}
models = {}

# Load tokenizers and models
for model_key, model_name in model_names.items():
    tokenizers[model_key] = AutoTokenizer.from_pretrained(model_name)
    models[model_key] = AutoModelForTokenClassification.from_pretrained(model_name, num_labels=len(label_to_id))


Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [48]:
from datasets import Dataset

# Define function to tokenize and align labels
def tokenize_and_align_labels(dataframe, tokenizer):
    tokenized_inputs = tokenizer(
        list(dataframe['tokens']),
        truncation=True,
        is_split_into_words=True,
        padding=True,
        return_tensors="pt"
    )

    labels = []
    for i, label in enumerate(dataframe['labels']):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        label_ids = [-100] * len(tokenized_inputs['input_ids'][i])  # Use -100 for padding tokens
        
        for word_index in range(len(word_ids)):
            if word_ids[word_index] is not None:  # Check if it's a valid word token
                current_label = label[word_ids[word_index]]
                label_ids[word_index] = label_to_id[current_label]
        
        labels.append(label_ids)
    
    tokenized_inputs["labels"] = labels
    return tokenized_inputs

# Tokenize data for each model
tokenized_data = {}
for model_key, tokenizer in tokenizers.items():
    tokenized_data[model_key] = tokenize_and_align_labels(df, tokenizer)


In [49]:
# Convert the tokenized data into Hugging Face Datasets for training and validation
datasets = {}
for model_key, data in tokenized_data.items():
    dataset = Dataset.from_dict({
        'input_ids': data['input_ids'],
        'attention_mask': data['attention_mask'],
        'labels': data['labels']
    })

    # Split the dataset into training and validation sets (80/20 split)
    datasets[model_key] = dataset.train_test_split(test_size=0.2)


In [50]:
from transformers import Trainer, TrainingArguments

# Training arguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
)

# Fine-tune each model and store the trainers
trainers = {}
for model_key, model in models.items():
    trainers[model_key] = Trainer(
        model=model,
        args=training_args,
        train_dataset=datasets[model_key]['train'],
        eval_dataset=datasets[model_key]['test']
    )

    # Train the model
    print(f"Training model: {model_key}")
    trainers[model_key].train()


Training model: xlm-roberta


Epoch,Training Loss,Validation Loss
1,No log,1.844292
2,No log,1.591909
3,No log,1.445087


Training model: distilbert


Epoch,Training Loss,Validation Loss
1,No log,1.009381
2,No log,0.723416
3,No log,0.665008


Training model: mbert


Epoch,Training Loss,Validation Loss
1,No log,0.755505
2,No log,0.665313
3,No log,0.637359


In [51]:
# Evaluate the models on the validation dataset
results = {}
for model_key, trainer in trainers.items():
    print(f"Evaluating model: {model_key}")
    eval_result = trainer.evaluate()
    results[model_key] = eval_result

# Print the results
for model_key, result in results.items():
    print(f"Results for {model_key}: {result}")


Evaluating model: xlm-roberta


Evaluating model: distilbert


Evaluating model: mbert


Results for xlm-roberta: {'eval_loss': 1.445087194442749, 'eval_runtime': 8.1892, 'eval_samples_per_second': 1.587, 'eval_steps_per_second': 0.122, 'epoch': 3.0}
Results for distilbert: {'eval_loss': 0.6650081872940063, 'eval_runtime': 2.6282, 'eval_samples_per_second': 4.946, 'eval_steps_per_second': 0.38, 'epoch': 3.0}
Results for mbert: {'eval_loss': 0.6373586058616638, 'eval_runtime': 3.0899, 'eval_samples_per_second': 4.207, 'eval_steps_per_second': 0.324, 'epoch': 3.0}


In [52]:
# Evaluate the models on the validation dataset
results = {}
for model_key, trainer in trainers.items():
    print(f"Evaluating model: {model_key}")
    eval_result = trainer.evaluate()
    results[model_key] = eval_result

# Print the evaluation results for each model
for model_key, result in results.items():
    print(f"Results for {model_key}: {result}")


Evaluating model: xlm-roberta
Evaluating model: distilbert
Evaluating model: mbert
Results for xlm-roberta: {'eval_loss': 1.445087194442749, 'eval_runtime': 6.1911, 'eval_samples_per_second': 2.1, 'eval_steps_per_second': 0.162, 'epoch': 3.0}
Results for distilbert: {'eval_loss': 0.6650081872940063, 'eval_runtime': 1.2881, 'eval_samples_per_second': 10.092, 'eval_steps_per_second': 0.776, 'epoch': 3.0}
Results for mbert: {'eval_loss': 0.6373586058616638, 'eval_runtime': 2.5019, 'eval_samples_per_second': 5.196, 'eval_steps_per_second': 0.4, 'epoch': 3.0}


In [57]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    
    # Flatten and filter out padding tokens (-100)
    true_labels = []
    predicted_labels = []
    
    for i in range(len(labels)):
        true_labels.extend([label for label, pred_label in zip(labels[i], preds[i]) if label != -100])
        predicted_labels.extend([pred_label for label, pred_label in zip(labels[i], preds[i]) if label != -100])
    
    # Compute metrics
    accuracy = accuracy_score(true_labels, predicted_labels)
    precision = precision_score(true_labels, predicted_labels, average="macro")
    recall = recall_score(true_labels, predicted_labels, average="macro")
    f1 = f1_score(true_labels, predicted_labels, average="macro")

    return {
        "accuracy": accuracy,
        "precision": precision,
        "recall": recall,
        "f1": f1,
    }

In [58]:
trainers = {}
for model_key, model in models.items():
    trainers[model_key] = Trainer(
        model=model,
        args=training_args,
        train_dataset=datasets[model_key]['train'],
        eval_dataset=datasets[model_key]['test'],
        compute_metrics=compute_metrics  # Use the updated compute_metrics function
    )

# Evaluate the models
results = {}
for model_key, trainer in trainers.items():
    print(f"Evaluating model: {model_key}")
    eval_result = trainer.evaluate()
    results[model_key] = eval_result

# Print results
for model_key, result in results.items():
    print(f"Results for {model_key}: {result}")


Evaluating model: xlm-roberta


  _warn_prf(average, modifier, msg_start, len(result))


Evaluating model: distilbert


  _warn_prf(average, modifier, msg_start, len(result))


Evaluating model: mbert


Results for xlm-roberta: {'eval_loss': 1.445087194442749, 'eval_model_preparation_time': 0.0075, 'eval_accuracy': 0.7983134223471539, 'eval_precision': 0.11460855528652139, 'eval_recall': 0.1419822522184727, 'eval_f1': 0.12683525930888181, 'eval_runtime': 5.2607, 'eval_samples_per_second': 2.471, 'eval_steps_per_second': 0.19}
Results for distilbert: {'eval_loss': 0.6650081872940063, 'eval_model_preparation_time': 0.003, 'eval_accuracy': 0.8673036093418259, 'eval_precision': 0.12390051562026085, 'eval_recall': 0.14285714285714285, 'eval_f1': 0.13270527085194508, 'eval_runtime': 1.772, 'eval_samples_per_second': 7.336, 'eval_steps_per_second': 0.564}
Results for mbert: {'eval_loss': 0.6373586058616638, 'eval_model_preparation_time': 0.0061, 'eval_accuracy': 0.8425806451612903, 'eval_precision': 0.12036866359447004, 'eval_recall': 0.14285714285714285, 'eval_f1': 0.13065226090436174, 'eval_runtime': 2.7103, 'eval_samples_per_second': 4.796, 'eval_steps_per_second': 0.369}


  _warn_prf(average, modifier, msg_start, len(result))
