In [16]:
# STEP 1: Install requirements if not already
# !pip install transformers datasets seqeval

# STEP 2: Imports
import os
import numpy as np
from datasets import DatasetDict, Dataset
import evaluate
metric = evaluate.load("seqeval")
from transformers import (
    AutoTokenizer, AutoModelForTokenClassification,
    TrainingArguments, Trainer, DataCollatorForTokenClassification
)

# STEP 3: Load raw .txt files and preprocess
data_dir = "D:/LLM/DATA"

def read_conll(filepath):
    tokens, labels = [], []
    data = {"tokens": [], "ner_tags": []}
    with open(filepath, encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                if tokens:
                    data["tokens"].append(tokens)
                    data["ner_tags"].append(labels)
                    tokens, labels = [], []
            else:
                splits = line.split()
                tokens.append(splits[0])
                labels.append(splits[-1])
    return data

# Load your train/dev/test files
train_data = read_conll(os.path.join(data_dir, "train.txt"))
dev_data = read_conll(os.path.join(data_dir, "dev.txt"))
test_data = read_conll(os.path.join(data_dir, "test.txt"))

# STEP 4: Create HuggingFace Datasets
dataset = DatasetDict({
    "train": Dataset.from_dict(train_data),
    "validation": Dataset.from_dict(dev_data),
    "test": Dataset.from_dict(test_data),
})

# STEP 5: Define labels
unique_labels = list(set(lab for labels in dataset["train"]["ner_tags"] for lab in labels))
label2id = {label: i for i, label in enumerate(sorted(unique_labels))}
id2label = {i: label for label, i in label2id.items()}

# STEP 6: Tokenizer and aligner
model_checkpoint = "xlm-roberta-base"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

def align_labels_with_tokens(labels, word_ids):
    aligned = []
    previous_word_idx = None
    for word_idx in word_ids:
        if word_idx is None:
            aligned.append(-100)
        elif word_idx != previous_word_idx:
            label = labels[word_idx]
            if isinstance(label, list):
                label = label[0]  # if nested list, take the first
            aligned.append(label2id[label])
        else:
            aligned.append(-100)
        previous_word_idx = word_idx
    return aligned

def tokenize_and_align_labels(example):
    tokenized = tokenizer(example["tokens"], truncation=True, padding="max_length", is_split_into_words=True)
    tokenized["labels"] = align_labels_with_tokens(example["ner_tags"], tokenized.word_ids())
    return tokenized

# Tokenize datasets
tokenized_dataset = dataset.map(tokenize_and_align_labels, batched=True)

# STEP 7: Load model
model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint, num_labels=len(label2id), id2label=id2label, label2id=label2id
)

# STEP 8: Training configuration
args = TrainingArguments(
    output_dir="./xlmr_ner_model",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=5,
    weight_decay=0.01,
    logging_dir="./logs",
    save_total_limit=2,
)

# STEP 9: Metric
metric = load_metric("seqeval")
def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)
    true_preds, true_labels = [], []

    for pred, label in zip(predictions, labels):
        pred_labels = [id2label[p] for (p, l) in zip(pred, label) if l != -100]
        true_labels_seq = [id2label[l] for (p, l) in zip(pred, label) if l != -100]
        true_preds.append(pred_labels)
        true_labels.append(true_labels_seq)

    results = metric.compute(predictions=true_preds, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

# STEP 10: Trainer setup
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    tokenizer=tokenizer,
    data_collator=DataCollatorForTokenClassification(tokenizer),
    compute_metrics=compute_metrics
)

# STEP 11: Train!
trainer.train()


Map:   0%|          | 0/713 [00:00<?, ? examples/s]

ArrowInvalid: Column 4 named labels expected length 713 but got length 512