# **Sinhala NER Political Domain**



Devide dataset as

In [1]:
from datasets import Dataset
from sklearn.model_selection import train_test_split

# ========================
# Step 1: Load your dataset
# ========================

def read_conll(filepath):
    tokens = []
    labels = []
    all_tokens = []
    all_labels = []

    with open(filepath, encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:  # Sentence boundary
                if tokens:
                    all_tokens.append(tokens)
                    all_labels.append(labels)
                    tokens, labels = [], []
            else:
                splits = line.split()
                if len(splits) == 2:  # Format: token label
                    token, label = splits
                elif len(splits) >= 3:  # If dataset has extra columns
                    token, label = splits[0], splits[-1]
                else:
                    continue
                tokens.append(token)
                labels.append(label)

        # Append last sentence if exists
        if tokens:
            all_tokens.append(tokens)
            all_labels.append(labels)

    return all_tokens, all_labels


tokens, labels = read_conll("After.conll")

print("Example tokens:", tokens[0])
print("Example labels:", labels[0])

# ========================
# Step 2: Create label mappings
# ========================

unique_labels = sorted(set(label for doc in labels for label in doc))
label2id = {label: idx for idx, label in enumerate(unique_labels)}
id2label = {idx: label for label, idx in label2id.items()}

print("Label2ID:", label2id)
print("ID2Label:", id2label)

# ========================
# Step 3: Convert to HuggingFace Dataset
# ========================

data = [{"tokens": t, "ner_tags": [label2id[l] for l in lab]} for t, lab in zip(tokens, labels)]
train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)

train_dataset = Dataset.from_list(train_data)
test_dataset = Dataset.from_list(test_data)

print(train_dataset)
print(test_dataset)


Example tokens: ['උනා', 'නම්', 'කොච්චර', 'ලස්සනයි', 'ද', 'කියලා', 'හිතෙනවා', 'අපිත්', 'ආදරෙයි', 'සර්', 'ඔබතුමාට', 'ඕක', 'දාලා', 'එන්න', 'එකට', 'චමුදිතට', 'තියෙන්නෙ', 'කටක්', 'නෙමේ', 'ලැට්']
Example labels: ['B-Other', 'B-Other', 'B-Other', 'B-Other', 'B-Other', 'B-Other', 'B-Other', 'B-Other', 'B-Other', 'B-Other', 'B-Other', 'B-Other', 'B-Other', 'B-Other', 'B-Other', 'B-PER', 'B-Other', 'B-Other', 'B-Other', 'B-Other']
Label2ID: {'B-LOC': 0, 'B-ORG': 1, 'B-Other': 2, 'B-PER': 3, 'I-LOC': 4, 'I-ORG': 5, 'I-Other': 6, 'I-PER': 7, 'O': 8}
ID2Label: {0: 'B-LOC', 1: 'B-ORG', 2: 'B-Other', 3: 'B-PER', 4: 'I-LOC', 5: 'I-ORG', 6: 'I-Other', 7: 'I-PER', 8: 'O'}
Dataset({
    features: ['tokens', 'ner_tags'],
    num_rows: 3059
})
Dataset({
    features: ['tokens', 'ner_tags'],
    num_rows: 765
})


### **Preprocess the Dataset**

In [2]:
from torch.utils.data import Dataset

class NERDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)


### Choose a Pretrained Model

In [3]:
from transformers import AutoTokenizer, AutoModelForTokenClassification

model_name = "bert-base-multilingual-cased"  # or "xlm-roberta-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(model_name, num_labels=len(label2id))


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/714M [00:00<?, ?B/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


### **Encode Dataset**

In [14]:
def tokenize_and_align_labels(sentences, labels):
    tokenized_inputs = tokenizer(sentences, truncation=True, padding=True, is_split_into_words=True)
    new_labels = []
    for i, label in enumerate(labels):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        aligned_labels = []
        for word_id in word_ids:
            if word_id is None:
                aligned_labels.append(-100)  # ignore
            else:
                aligned_labels.append(label[word_id])
        new_labels.append(aligned_labels)
    tokenized_inputs["labels"] = new_labels
    return tokenized_inputs


### **Training**

In [16]:
from transformers import TrainingArguments, Trainer, DataCollatorForTokenClassification

# Apply the tokenization and alignment function
train_dataset_encoded = train_dataset.map(lambda examples: tokenize_and_align_labels(examples["tokens"], examples["ner_tags"]), batched=True)
test_dataset_encoded = test_dataset.map(lambda examples: tokenize_and_align_labels(examples["tokens"], examples["ner_tags"]), batched=True)

data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=16,
    num_train_epochs=5,
    weight_decay=0.01,
    remove_unused_columns=False,  # Keep unused columns to avoid this error
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset_encoded, # Use the encoded dataset
    eval_dataset=test_dataset_encoded, # Use the encoded dataset
    tokenizer=tokenizer,
    data_collator=data_collator, # Use the data collator
)

trainer.train()

Map:   0%|          | 0/3059 [00:00<?, ? examples/s]

Map:   0%|          | 0/765 [00:00<?, ? examples/s]

  trainer = Trainer(


ValueError: Unable to create tensor, you should probably activate truncation and/or padding with 'padding=True' 'truncation=True' to have batched tensors with the same length. Perhaps your features (`tokens` in this case) have excessive nesting (inputs type `list` where type `int` is expected).