In [None]:
pip install --upgrade transformers



In [None]:
# Open Colab's file-upload widget. Later cells read "output_conll.txt" from
# the working directory, so that is the file to upload here.
from google.colab import files
uploaded = files.upload()

Saving output_conll.txt to output_conll.txt


In [None]:
import os
import numpy as np
from datasets import Dataset, DatasetDict, load_metric
from transformers import (
    AutoTokenizer,
    AutoModelForTokenClassification,
    TrainingArguments,
    Trainer
)
from sklearn.model_selection import train_test_split

# Path of the uploaded CoNLL file, relative to the working directory.
file_path = "output_conll.txt"

# Fail early with a clear message if the upload step was skipped.
if not os.path.exists(file_path):
    raise FileNotFoundError(f"CoNLL file not found: {file_path}")

def read_conll_data(file_path):
    """Parse a two-column, whitespace-separated CoNLL file.

    Each non-blank line must hold exactly two fields, ``token tag``. Blank
    lines separate sentences. Lines with any other number of fields are
    reported on stdout and skipped.

    Args:
        file_path: Path of the UTF-8 encoded file to read.

    Returns:
        A list of dicts, one per sentence, each with the keys ``"tokens"``
        and ``"ner_tags"`` holding parallel lists of strings.
    """
    sentences = []
    tokens, tags = [], []

    with open(file_path, 'r', encoding='utf-8') as handle:
        for raw_line in handle:
            stripped = raw_line.strip()

            if not stripped:
                # Blank line: close the sentence collected so far (if any).
                if tokens:
                    sentences.append({"tokens": tokens, "ner_tags": tags})
                    tokens, tags = [], []
                continue

            fields = stripped.split()
            if len(fields) != 2:
                print(f"Skipping malformed line: {stripped}")
                continue

            tokens.append(fields[0])
            tags.append(fields[1])

    # The file may end without a trailing blank line.
    if tokens:
        sentences.append({"tokens": tokens, "ner_tags": tags})
    return sentences

# Load every sentence from the CoNLL file as a list of
# {"tokens": [...], "ner_tags": [...]} dicts.
raw_datasets = read_conll_data(file_path)

# An empty result means no line matched the expected "token tag" layout.
if not raw_datasets:
    raise ValueError("No data loaded. Check your CoNLL format.")

In [None]:
def normalize_tag(tag):
    """Return ``tag`` with its entity part put into a canonical casing.

    A tag containing a hyphen is split at the first hyphen into prefix and
    entity; the entity is passed through ``str.capitalize`` (first character
    upper-cased, the rest lower-cased) and the prefix is kept as-is, e.g.
    ``"B-PER"`` becomes ``"B-Per"``. ``"O"`` and tags without a hyphen are
    returned unchanged.
    """
    if tag != 'O' and '-' in tag:
        prefix, entity = tag.split('-', maxsplit=1)
        return f"{prefix}-{entity.capitalize()}"
    return tag

# Rewrite each sentence's tag list in place with the normalized tag strings.
for example in raw_datasets:
    example['ner_tags'] = list(map(normalize_tag, example['ner_tags']))

In [None]:
# Collect the distinct (normalized) tags in sorted order, then build the two
# lookup tables between tag strings and their integer ids.
label_list = sorted({
    label
    for example in raw_datasets
    for label in example['ner_tags']
})
id_to_label = dict(enumerate(label_list))
label_to_id = {label: idx for idx, label in id_to_label.items()}

In [None]:
# Replace every string tag with its integer id, in place.
# Tags that are already integers are left untouched: without this guard a
# second run of the cell would look up integer ids in `label_to_id` (whose
# keys are tag strings) and fail with a KeyError.
for example in raw_datasets:
    example['ner_tags'] = [
        tag if isinstance(tag, int) else label_to_id[tag]
        for tag in example['ner_tags']
    ]

In [None]:
# Checkpoint name used for both the tokenizer here and the model loaded later.
MODEL_NAME = "xlm-roberta-base"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

def tokenize_and_align_labels(example):
    """Tokenize one pre-split sentence and align its word-level tag ids.

    The sentence's ``"tokens"`` are encoded with the module-level
    ``tokenizer`` (truncated and padded to 128 positions). For every encoded
    position, the label is the tag id of the word it belongs to if it is the
    first position of that word; every other position — later pieces of the
    same word and positions whose word id is ``None`` — is labelled ``-100``.

    Args:
        example: Mapping with ``"tokens"`` (list of words) and ``"ner_tags"``
            (list of tag ids, one per word).

    Returns:
        The tokenizer output with an added ``"labels"`` list.
    """
    encoding = tokenizer(
        example["tokens"],
        is_split_into_words=True,
        truncation=True,
        padding='max_length',
        max_length=128
    )

    aligned = []
    last_word = None
    for current_word in encoding.word_ids():
        starts_new_word = current_word is not None and current_word != last_word
        if starts_new_word:
            aligned.append(example["ner_tags"][current_word])
        else:
            aligned.append(-100)
        last_word = current_word

    encoding["labels"] = aligned
    return encoding

# Wrap the list of sentence dicts in a Dataset and tokenize it one example at
# a time (batched=False), since tokenize_and_align_labels handles a single
# sentence per call.
hf_dataset = Dataset.from_list(raw_datasets)
tokenized_dataset = hf_dataset.map(tokenize_and_align_labels, batched=False)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

Map:   0%|          | 0/1652 [00:00<?, ? examples/s]

In [None]:
# Hold out 20% of the sentences for validation; the fixed seed keeps the
# split the same across runs.
train_test = tokenized_dataset.train_test_split(test_size=0.2, seed=42)
# NOTE: the variable name `datasets` is the same as the `datasets` package
# that the import cell pulls names from.
datasets = DatasetDict({
    'train': train_test['train'],
    'validation': train_test['test']
})

In [None]:
# Step 7: Load model
# The classification head is sized to the number of distinct tags, and the
# id <-> tag maps are passed along to the loaded model.
model = AutoModelForTokenClassification.from_pretrained(
     MODEL_NAME,
    num_labels=len(label_list),
    id2label=id_to_label,
    label2id=label_to_id
)

model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Record which transformers version this run actually used.
import transformers
print(transformers.__version__)

4.52.4


In [None]:
from transformers import TrainingArguments

# Hyper-parameters for fine-tuning. No evaluation strategy is set here, so
# validation metrics come from the explicit trainer.evaluate() calls later.
training_args = TrainingArguments(
    output_dir="./ner_model",  # checkpoints are written under this directory
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=5,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=50,  # training loss is logged every 50 steps
    save_strategy="epoch",  # if using recent version
    report_to="none"
)

In [None]:
!pip install seqeval

Collecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/43.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: seqeval
  Building wheel for seqeval (setup.py) ... [?25l[?25hdone
  Created wheel for seqeval: filename=seqeval-1.2.2-py3-none-any.whl size=16162 sha256=cf09d8c7d70ca21d196f37993e07cc58dbf8eb31fac7605aa80c3c4c3ce80491
  Stored in directory: /root/.cache/pip/wheels/bc/92/f0/243288f899c2eacdfa8c5f9aede4c71a9bad0ee26a01dc5ead
Successfully built seqeval
Installing collected packages: seqeval
Successfully installed seqeval-1.2.2


In [None]:
# Step 9: Define metrics
from datasets import load_metric

# Load the "seqeval" metric, which compute_metrics() below uses.
# NOTE(review): the recorded output shows a warning pointing at this call;
# `load_metric` looks deprecated in the installed `datasets` version, and the
# separate `evaluate` package is presumably the replacement — confirm before
# upgrading `datasets`.
metric = load_metric("seqeval")

def compute_metrics(p):
    """Turn raw model outputs into overall seqeval scores.

    Args:
        p: A ``(predictions, labels)`` pair. The predicted class of each
            position is the argmax over axis 2 of ``predictions``; ``labels``
            holds the reference ids, with ``-100`` marking positions to ignore.

    Returns:
        Dict with the overall ``precision``, ``recall``, ``f1`` and
        ``accuracy`` reported by the module-level ``metric``.
    """
    scores, label_ids = p
    pred_ids = np.argmax(scores, axis=2)

    # Reference tag strings, dropping the ignored (-100) positions.
    true_labels = []
    for label_row in label_ids:
        true_labels.append(
            [id_to_label[label_id] for label_id in label_row if label_id != -100]
        )

    # Predicted tag strings at exactly the positions kept above.
    true_predictions = []
    for pred_row, label_row in zip(pred_ids, label_ids):
        kept = []
        for pred_id, label_id in zip(pred_row, label_row):
            if label_id != -100:
                kept.append(id_to_label[pred_id])
        true_predictions.append(kept)

    results = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

  metric = load_metric("seqeval")


Downloading builder script:   0%|          | 0.00/2.47k [00:00<?, ?B/s]

In [None]:
# Step 10: Trainer
# The tokenizer is passed as `processing_class`: the older `tokenizer=`
# keyword is deprecated in recent transformers releases (this notebook
# upgrades transformers in its first cell) and triggers a warning.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=datasets["train"],
    eval_dataset=datasets["validation"],
    processing_class=tokenizer,
    compute_metrics=compute_metrics
)

# Step 11: Train!
trainer.train()

  trainer = Trainer(


Step,Training Loss
50,0.4355
100,0.0801
150,0.0607
200,0.0307
250,0.0227
300,0.0234
350,0.0111
400,0.0077
450,0.0088
500,0.0071


Step,Training Loss
50,0.4355
100,0.0801
150,0.0607
200,0.0307
250,0.0227
300,0.0234
350,0.0111
400,0.0077
450,0.0088
500,0.0071


TrainOutput(global_step=830, training_loss=0.04302009730992547, metrics={'train_runtime': 14579.3997, 'train_samples_per_second': 0.453, 'train_steps_per_second': 0.057, 'total_flos': 431477972140800.0, 'train_loss': 0.04302009730992547, 'epoch': 5.0})

In [None]:
# Inspect the working directory to confirm the training output folder exists.
!ls -la /content

total 1388
drwxr-xr-x 1 root root    4096 Jun 21 19:18 .
drwxr-xr-x 1 root root    4096 Jun 21 19:09 ..
drwxr-xr-x 4 root root    4096 Jun 18 13:35 .config
drwxr-xr-x 7 root root    4096 Jun 21 23:17 ner_model
-rw-r--r-- 1 root root 1400241 Jun 21 19:16 output_conll.txt
drwxr-xr-x 1 root root    4096 Jun 18 13:35 sample_data


In [None]:
# List the checkpoints written during training.
!ls /content/ner_model

checkpoint-166	checkpoint-332	checkpoint-498	checkpoint-664	checkpoint-830


In [None]:
# Evaluate on the validation split given to the Trainer; as the cell's last
# expression, the returned metrics are shown as its output.
trainer.evaluate()

In [None]:
from transformers import XLMRobertaForTokenClassification

# Reload the weights from the checkpoint at step 830, which is the final
# global step reported by trainer.train(). This rebinds the name `model`.
model = XLMRobertaForTokenClassification.from_pretrained("/content/ner_model/checkpoint-830")

In [None]:
# Write model and tokenizer into one folder so it can be loaded on its own.
export_path = "/content/ner_model/final_model"
model.save_pretrained(export_path)
tokenizer.save_pretrained(export_path)

('/content/ner_model/final_model/tokenizer_config.json',
 '/content/ner_model/final_model/special_tokens_map.json',
 '/content/ner_model/final_model/sentencepiece.bpe.model',
 '/content/ner_model/final_model/added_tokens.json',
 '/content/ner_model/final_model/tokenizer.json')

In [None]:
# Check that the export folder contains both model and tokenizer files.
!ls /content/ner_model/final_model

config.json	   sentencepiece.bpe.model  tokenizer_config.json
model.safetensors  special_tokens_map.json  tokenizer.json


In [None]:
# Bundle the exported folder into final_model.zip in the working directory
# and hand it to the browser as a download.
!zip -r final_model.zip /content/ner_model/final_model
from google.colab import files
files.download("final_model.zip")

  adding: content/ner_model/final_model/ (stored 0%)
  adding: content/ner_model/final_model/special_tokens_map.json (deflated 52%)
  adding: content/ner_model/final_model/model.safetensors (deflated 29%)
  adding: content/ner_model/final_model/tokenizer.json (deflated 76%)
  adding: content/ner_model/final_model/tokenizer_config.json (deflated 76%)
  adding: content/ner_model/final_model/sentencepiece.bpe.model (deflated 49%)
  adding: content/ner_model/final_model/config.json (deflated 52%)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Run evaluation on the validation split again and keep the metrics dict.
metrics = trainer.evaluate()
print(metrics)

{'eval_loss': 0.017773285508155823, 'eval_precision': 0.9748700173310225, 'eval_recall': 0.9731833910034602, 'eval_f1': 0.974025974025974, 'eval_accuracy': 0.9967580480036401, 'eval_runtime': 161.2864, 'eval_samples_per_second': 2.052, 'eval_steps_per_second': 0.26, 'epoch': 5.0}


In [None]:
!cp -r /content/ner_model /content/drive/MyDrive/NER_Project/
!cp final_model.zip /content/drive/MyDrive/NER_Project/

cp: cannot create directory '/content/drive/MyDrive/NER_Project/': No such file or directory
cp: cannot create regular file '/content/drive/MyDrive/NER_Project/': No such file or directory


In [None]:
# Create the target folder (and any missing parents).
# NOTE(review): this only reaches Google Drive if Drive is already mounted at
# /content/drive; otherwise it presumably creates a plain local directory on
# the Colab VM — confirm the mount before relying on this copy.
!mkdir -p /content/drive/MyDrive/NER_Project

In [None]:
# Copy the whole training output folder and the zip archive into the
# NER_Project folder.
!cp -r /content/ner_model /content/drive/MyDrive/NER_Project/
!cp /content/final_model.zip /content/drive/MyDrive/NER_Project/

In [None]:
# Trigger another browser download of the zip archive.
files.download("final_model.zip")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
from transformers import XLMRobertaForTokenClassification, XLMRobertaTokenizerFast

# Load model and tokenizer back from the copied export folder; this rebinds
# `model` and `tokenizer`. Presumably a check that the copy is usable.
model = XLMRobertaForTokenClassification.from_pretrained("/content/drive/MyDrive/NER_Project/ner_model/final_model")
tokenizer = XLMRobertaTokenizerFast.from_pretrained("/content/drive/MyDrive/NER_Project/ner_model/final_model")

In [None]:
# Same download as in the earlier cells, triggered once more.
files.download("final_model.zip")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>