<a href="https://colab.research.google.com/github/Kalze1/Amharic_Named_Entity_Recognition/blob/task-1/notebook/Fine_Tuning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [47]:
# Import necessary libraries
from google.colab import drive
drive.mount('/content/drive')

!pip install transformers datasets seqeval pandas sklearn

# Import libraries
from transformers import AutoTokenizer, AutoModelForTokenClassification, Trainer, TrainingArguments
from datasets import load_dataset, Dataset
from sklearn.metrics import classification_report
import pandas as pd

# Choose pre-trained model (change model_name as needed)
model_name = "xlm-roberta-base"  # or 'bert-tiny-amharic', 'afroxmlr'
num_labels = 7  # Define based on the number of entity types in the dataset (adjust this)

# Load tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(model_name, num_labels=num_labels)

# Define a function to read the CoNLL formatted file and return as a DataFrame
def read_conll_file(filepath):
    tokens = []
    labels = []
    sentence_tokens = []
    sentence_labels = []

    with open(filepath, 'r', encoding='utf-8') as file:
        for line in file:
            line = line.strip()
            if not line:
                if sentence_tokens:
                    tokens.append(sentence_tokens)
                    labels.append(sentence_labels)
                    sentence_tokens = []
                    sentence_labels = []
            else:
                token, label = line.split()
                sentence_tokens.append(token)
                sentence_labels.append(label)

        if sentence_tokens:
            tokens.append(sentence_tokens)
            labels.append(sentence_labels)

    return pd.DataFrame({'tokens': tokens, 'labels': labels})

# Load the dataset (in CoNLL format)
df = read_conll_file('/content/drive/MyDrive/Amharic_Named_Entity_Recognition/data/labeled_cleaned_tokenized_dataset.conll')

# Tokenize the data and align labels
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True, padding='max_length', max_length=128)

    labels = []
    for i, label in enumerate(examples["labels"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []

        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)  # Ignore subwords
            elif word_idx != previous_word_idx:  # First subword of a word
                label_ids.append(int(label[word_idx]))
            else:  # Subsequent subword of the same word
                label_ids.append(-100)
            previous_word_idx = word_idx

        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

# Convert the DataFrame into a Hugging Face dataset
dataset = Dataset.from_pandas(df)

# Tokenize the dataset
tokenized_datasets = dataset.map(tokenize_and_align_labels, batched=True)

# Set up training arguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="steps",
    eval_steps=100,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    learning_rate=2e-5,
    weight_decay=0.01,
    save_total_limit=2,
)

# Compute metrics function for evaluation
def compute_metrics(eval_pred):
    labels = eval_pred.label_ids
    preds = eval_pred.predictions.argmax(-1)

    # Remove ignored index (-100) from labels and predictions
    true_labels = [[l for l, p in zip(label, pred) if l != -100] for label, pred in zip(labels, preds)]
    true_preds = [[p for l, p in zip(label, pred) if l != -100] for label, pred in zip(labels, preds)]

    report = classification_report(true_labels, true_preds, output_dict=True)

    return {
        "accuracy": report["accuracy"],
        "precision": report["macro avg"]["precision"],
        "recall": report["macro avg"]["recall"],
        "f1": report["macro avg"]["f1-score"],
    }

# Split the dataset into train and test
train_test_split = tokenized_datasets.train_test_split(test_size=0.1)
train_dataset = train_test_split['train']
eval_dataset = train_test_split['test']

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

# Train the model
trainer.train()

# Save the fine-tuned model and tokenizer
trainer.save_model("fine_tuned_ner_model")
tokenizer.save_pretrained("fine_tuned_ner_model")

# Evaluate the model on the validation set
eval_results = trainer.evaluate()
print(f"Evaluation results: {eval_results}")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Collecting sklearn
  Downloading sklearn-0.0.post12.tar.gz (2.6 kB)
  [1;31merror[0m: [1msubprocess-exited-with-error[0m
  
  [31m×[0m [32mpython setup.py egg_info[0m did not run successfully.
  [31m│[0m exit code: [1;36m1[0m
  [31m╰─>[0m See above for output.
  
  [1;35mnote[0m: This error originates from a subprocess, and is likely not a problem with pip.
  Preparing metadata (setup.py) ... [?25l[?25herror
[1;31merror[0m: [1mmetadata-generation-failed[0m

[31m×[0m Encountered error while generating package metadata.
[31m╰─>[0m See above for output.

[1;35mnote[0m: This is an issue with the package mentioned above, not pip.
[1;36mhint[0m: See above for details.


Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/974 [00:00<?, ? examples/s]

ValueError: invalid literal for int() with base 10: 'O'

In [10]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
!pip install transformers datasets seqeval pandas


In [37]:
from transformers import AutoTokenizer, AutoModelForTokenClassification

# Choose pre-trained model
model_name = "xlm-roberta-base"  # Change to "bert-tiny-amharic" or "afroxmlr" if needed

# Load tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(model_name, num_labels=num_labels)


Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [38]:
from transformers import AutoTokenizer, AutoModelForTokenClassification
num_labels = 7
model_name = "xlm-roberta-base"  # Replace with your chosen model
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(model_name, num_labels=num_labels)

Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [39]:
from datasets import load_dataset

#Load the dataset
dataset = load_dataset("israel/Amharic-News-Text-classification-Dataset")

In [None]:
import pandas as pd

# Define a function to read a CoNLL formatted file
def read_conll_file(filepath):
    tokens = []
    labels = []
    sentence_tokens = []
    sentence_labels = []

    # Read the CoNLL file line by line
    with open(filepath, 'r', encoding='utf-8') as file:
        for line in file:
            # Strip the line to remove trailing spaces and newlines
            line = line.strip()

            # If the line is empty, it means the end of a sentence
            if not line:
                if sentence_tokens:  # Add the sentence if it's not empty
                    tokens.append(sentence_tokens)
                    labels.append(sentence_labels)
                    sentence_tokens = []
                    sentence_labels = []
            else:
                # Split the line into token and label (assuming tab or space separation)
                token, label = line.split()
                sentence_tokens.append(token)
                sentence_labels.append(label)

        # Add the last sentence if the file doesn't end with a blank line
        if sentence_tokens:
            tokens.append(sentence_tokens)
            labels.append(sentence_labels)

    # Create a DataFrame with tokens and labels
    df = pd.DataFrame({'tokens': tokens, 'labels': labels})
    return df

# Load the dataset
df = read_conll_file('/content/drive/MyDrive/Amharic_Named_Entity_Recognition/data/labeled_cleaned_tokenized_dataset.conll')

# Display first few rows
df.head()


In [46]:
def tokenize_articles(examples):
    return tokenizer(examples['article'], truncation=True, padding='max_length', is_split_into_words=False)

tokenized_datasets = dataset.map(tokenize_articles, batched=True)


Map:   0%|          | 0/41186 [00:00<?, ? examples/s]

TypeError: TextEncodeInput must be Union[TextInputSequence, Tuple[InputSequence, InputSequence]]

In [44]:
print(dataset)


DatasetDict({
    train: Dataset({
        features: ['Unnamed: 0', 'headline', 'category', 'date', 'views', 'article', 'link', 'word_len'],
        num_rows: 41186
    })
    test: Dataset({
        features: ['Unnamed: 0', 'headline', 'category', 'date', 'views', 'article', 'link', 'word_len'],
        num_rows: 10297
    })
})


In [43]:
from transformers import AutoTokenizer, AutoModelForTokenClassification, Trainer, TrainingArguments
from datasets import load_dataset
from sklearn.metrics import classification_report

# Load dataset (either prebuilt or your own NER dataset)
dataset = load_dataset("israel/Amharic-News-Text-classification-Dataset")  # Adjust to your dataset

# Load pre-trained model and tokenizer
model_name = "xlm-roberta-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(model_name, num_labels=7)  # Update num_labels to match your NER tags

# Tokenize and align labels (use your custom tokenize_and_align_labels function)
tokenized_datasets = dataset.map(tokenize_and_align_labels, batched=True)

# Set up training arguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="steps",
    eval_steps=100,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    learning_rate=2e-5,
    weight_decay=0.01,
    save_total_limit=2,
)

# Define the compute metrics function for NER evaluation
def compute_metrics(eval_pred):
    labels = eval_pred.label_ids
    preds = eval_pred.predictions.argmax(-1)
    report = classification_report(labels, preds, output_dict=True)
    return {
        "accuracy": report["accuracy"],
        "precision": report["macro avg"]["precision"],
        "recall": report["macro avg"]["recall"],
        "f1": report["macro avg"]["f1-score"],
    }

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

# Train the model
trainer.train()

# Save the fine-tuned model and tokenizer
trainer.save_model("fine_tuned_model")
tokenizer.save_pretrained("fine_tuned_model")


Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/41186 [00:00<?, ? examples/s]

KeyError: 'tokens'

In [23]:
from datasets import load_dataset



In [24]:
#Load the dataset
dataset = load_dataset("israel/Amharic-News-Text-classification-Dataset")

README.md:   0%|          | 0.00/1.39k [00:00<?, ?B/s]

train.csv:   0%|          | 0.00/150M [00:00<?, ?B/s]

test.csv:   0%|          | 0.00/37.3M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/41186 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/10297 [00:00<?, ? examples/s]

In [27]:
dataset

DatasetDict({
    train: Dataset({
        features: ['Unnamed: 0', 'headline', 'category', 'date', 'views', 'article', 'link', 'word_len'],
        num_rows: 41186
    })
    test: Dataset({
        features: ['Unnamed: 0', 'headline', 'category', 'date', 'views', 'article', 'link', 'word_len'],
        num_rows: 10297
    })
})

In [25]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="steps",
    eval_steps=100,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    learning_rate=2e-5,
    weight_decay=0.01,
    save_total_limit=2,
)



In [33]:
from transformers import Trainer
from sklearn.metrics import classification_report

def compute_metrics(eval_pred):
    """
    Computes the evaluation metrics for a given evaluation prediction.

    Args:
        eval_pred: The evaluation prediction output.

    Returns:
        A dictionary containing the evaluation metrics.
    """

    labels = eval_pred.label_ids
    preds = eval_pred.predictions.argmax(-1)

    report = classification_report(labels, preds, output_dict=True)

    return {
        "accuracy": report["accuracy"],
        "precision": report["macro_avg"]["precision"],
        "recall": report["macro_avg"]["recall"],
        "f1": report["macro_avg"]["f1-score"],
    }

# Assuming your dataset has "article" and "labels" columns (adjust if needed)
train_dataset = dataset["train"]
eval_dataset = dataset["test"]

# ... rest of your code ...

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

trainer.train()

ValueError: No columns in the dataset match the model's forward method signature. The following columns have been ignored: [Unnamed: 0, link, category, headline, article, views, word_len, date]. Please check the dataset and model. You may need to set `remove_unused_columns=False` in `TrainingArguments`.

In [None]:
trainer.save_model("fine_tuned_model")