In [40]:
import os
import torch
from torch.utils.data import Dataset
from transformers import AutoTokenizer, AutoModelForTokenClassification, TrainingArguments, Trainer, AutoConfig
from seqeval.metrics import classification_report
from sklearn.model_selection import train_test_split

In [41]:
# ------------- Step 1: Read CoNLL Data -----------------
def read_conll(filename):
    """Parse a tab-separated, CoNLL-style annotation file.

    Each data line is expected to hold at least two tab-separated columns:
    the token in the first column and its tag in the second. Blank lines and
    lines starting with '*' end the current sentence. Lines with fewer than
    two columns are ignored.

    Args:
        filename: Path of the UTF-8 encoded file to read.

    Returns:
        A ``(sentences, labels)`` pair of parallel lists; ``sentences[i]`` is
        the list of tokens of sentence ``i`` and ``labels[i]`` the matching
        list of tag strings.
    """
    sentences, labels = [], []
    current_words, current_tags = [], []

    def _flush():
        # Close the sentence being built, if any, and start a fresh one.
        nonlocal current_words, current_tags
        if current_words:
            sentences.append(current_words)
            labels.append(current_tags)
            current_words, current_tags = [], []

    with open(filename, encoding='utf-8') as handle:
        for raw_line in handle:
            row = raw_line.strip()
            if row and not row.startswith('*'):
                columns = row.split('\t')
                if len(columns) >= 2:
                    current_words.append(columns[0])
                    current_tags.append(columns[1])
            else:
                # Sentence boundary (blank line or '*' marker line).
                _flush()

    # The file may end without a trailing boundary line.
    _flush()
    return sentences, labels

# Load the whole annotated corpus from a path relative to the working
# directory; it is divided into train and test portions in a later step.
sentences, tags = read_conll('Test.txt')

In [42]:
# ------------- Step 2: Label Mapping -----------------
# Every distinct tag string in the corpus, sorted so the id assignment is
# deterministic for a given data file.
unique_tags = sorted({tag for seq in tags for tag in seq})
# Forward (tag -> id) and reverse (id -> tag) lookup tables.
tag2id = dict(zip(unique_tags, range(len(unique_tags))))
id2tag = dict(enumerate(unique_tags))


In [43]:
# ------------- Step 3: Train/Test Split --------------
# Hold out 20% of the sentences; the fixed random_state keeps the split
# reproducible. Sentences and their tag lists are split in parallel.
train_sents, test_sents, train_tags, test_tags = train_test_split(
    sentences, tags, test_size=0.2, random_state=42
)

In [44]:
# ------------- Step 4: Tokenization & Alignment ------
# Tokenizer of the same checkpoint that is fine-tuned below, so the sub-word
# vocabulary matches the model.
tokenizer = AutoTokenizer.from_pretrained("ai4bharat/indic-bert")

def tokenize_and_align_labels(sentences, tags, tokenizer, tag2id, max_length=128):
    """Tokenize pre-split sentences and build per-token label ids.

    Only the first sub-token of each word carries that word's label id; every
    other position (special tokens, padding, continuation sub-tokens) is set
    to -100.

    Args:
        sentences: List of sentences, each a list of word strings.
        tags: List of tag-string lists, parallel to ``sentences``.
        tokenizer: Callable tokenizer whose result exposes
            ``word_ids(batch_index=...)``.
        tag2id: Mapping from tag string to integer id.
        max_length: Length every sequence is padded/truncated to.

    Returns:
        ``(tokenized_inputs, labels)`` where ``labels`` is a list of lists of
        ints, one inner list per sentence.
    """
    ignore_id = -100

    encoded = tokenizer(
        sentences,
        is_split_into_words=True,
        truncation=True,
        padding='max_length',
        max_length=max_length,
        return_tensors='pt'
    )

    def _align(sentence_tags, word_ids):
        # Walk the token positions, emitting a real label only where a new
        # word begins.
        aligned = []
        previous = None
        for current in word_ids:
            starts_new_word = current is not None and current != previous
            if starts_new_word:
                aligned.append(tag2id[sentence_tags[current]])
            else:
                aligned.append(ignore_id)
            previous = current
        return aligned

    aligned_labels = [
        _align(sentence_tags, encoded.word_ids(batch_index=row))
        for row, sentence_tags in enumerate(tags)
    ]
    return encoded, aligned_labels

# Encode both splits with the default max_length of tokenize_and_align_labels (128).
train_inputs, train_labels = tokenize_and_align_labels(train_sents, train_tags, tokenizer, tag2id)
test_inputs, test_labels = tokenize_and_align_labels(test_sents, test_tags, tokenizer, tag2id)

In [45]:
# ------------- Step 5: Dataset Class -----------------
class NERDataset(Dataset):
    """Dataset pairing tokenizer encodings with aligned label ids.

    ``encodings`` is a mapping from field name to an indexable batch (one row
    per example); ``labels`` is a sequence with one list of label ids per
    example.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        # One example per label sequence.
        return len(self.labels)

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of its encoding fields plus 'labels'."""
        sample = dict((field, batch[idx]) for field, batch in self.encodings.items())
        sample.update({'labels': torch.tensor(self.labels[idx])})
        return sample

# Wrap each split's encodings and aligned labels for use by the Trainer.
train_dataset = NERDataset(train_inputs, train_labels)
test_dataset = NERDataset(test_inputs, test_labels)


In [46]:
# ------------- Step 6: Model Setup -------------------
# Start from the pretrained checkpoint's config, with the number of labels set
# to the size of the tag set and the id<->tag tables passed as
# id2label / label2id.
config = AutoConfig.from_pretrained(
    "ai4bharat/indic-bert",
    num_labels=len(tag2id),
    id2label=id2tag,
    label2id=tag2id
)

# Load the pretrained weights into a token-classification model built from
# `config`. The classifier layer is newly initialized (see the warning printed
# below this cell), so the model must be fine-tuned before use.
model = AutoModelForTokenClassification.from_pretrained(
    "ai4bharat/indic-bert",
    config=config
)

Some weights of AlbertForTokenClassification were not initialized from the model checkpoint at ai4bharat/indic-bert and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [47]:
# ------------- Step 7: Training Arguments -------------
# Fine-tuning configuration. Evaluation and checkpointing both happen once per
# epoch so that, when training ends, the checkpoint selected by eval_loss can
# be reloaded (load_best_model_at_end).
training_args = TrainingArguments(
    output_dir='./assamese-ner-model',
    num_train_epochs=4,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=3e-5,
    logging_dir='./logs',
    logging_steps=10,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss"
)

# ------------- Step 8: Compute Metrics ----------------
import numpy as np
def compute_metrics(p):
    """Compute weighted-average precision/recall/F1 for a batch of predictions.

    Args:
        p: A ``(predictions, labels)`` pair. ``predictions`` holds per-token
            class scores (argmax is taken over axis 2); ``labels`` holds the
            gold label ids, with -100 marking positions to ignore.

    Returns:
        Dict with keys ``"precision"``, ``"recall"`` and ``"f1"`` taken from
        the "weighted avg" row of the seqeval classification report, or all
        zeros when no position carries a real label.
    """
    scores, gold_ids = p
    predicted_ids = np.argmax(scores, axis=2)

    def _as_tag(tag_id):
        # Ids missing from id2tag, and tags whose second character is not
        # '-', are all reported as 'O'.
        tag = id2tag.get(tag_id, 'O')
        if len(tag) >= 2 and tag[1] == '-':
            return tag
        return 'O'

    true_labels, true_preds = [], []
    for predicted_row, gold_row in zip(predicted_ids, gold_ids):
        # Keep only positions with a real gold label.
        kept = [
            (_as_tag(gold), _as_tag(guess))
            for guess, gold in zip(predicted_row, gold_row)
            if gold != -100
        ]
        # Sequences with nothing to score are left out entirely.
        if kept:
            true_labels.append([gold_tag for gold_tag, _ in kept])
            true_preds.append([guess_tag for _, guess_tag in kept])

    # Nothing to score at all.
    if not true_labels:
        return {"precision": 0.0, "recall": 0.0, "f1": 0.0}

    weighted = classification_report(true_labels, true_preds, output_dict=True)["weighted avg"]
    return {
        "precision": weighted["precision"],
        "recall": weighted["recall"],
        "f1": weighted["f1-score"]
    }

# ------------- Step 9: Trainer -----------------------
# NOTE(review): eval_dataset is the held-out test split, and training_args
# reloads the best checkpoint by eval_loss, so the same data is used both for
# checkpoint selection and for the reported metrics. Consider a separate
# validation split.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics
)

# ------------- Step 10: Train! -----------------------
# Runs the full fine-tuning loop; in a notebook the returned TrainOutput is
# displayed as the cell result.
trainer.train()

Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,1.2914,1.017325,0.608474,0.620598,0.604351
2,0.6309,0.866955,0.652309,0.692359,0.660246
3,0.6251,0.766697,0.68799,0.720266,0.697459
4,0.638,0.744875,0.702543,0.726246,0.712793


  _warn_prf(average, modifier, msg_start, len(result))


TrainOutput(global_step=1788, training_loss=0.8832456456201455, metrics={'train_runtime': 7587.1343, 'train_samples_per_second': 1.885, 'train_steps_per_second': 0.236, 'total_flos': 79140636573696.0, 'train_loss': 0.8832456456201455, 'epoch': 4.0})

In [48]:
# ------------- Step 11: Save Model -------------------
# Write the model and the tokenizer files into the same directory (this is
# also the Trainer's output_dir, where the per-epoch checkpoints live).
model.save_pretrained('./assamese-ner-model')
tokenizer.save_pretrained('./assamese-ner-model')

('./assamese-ner-model\\tokenizer_config.json',
 './assamese-ner-model\\special_tokens_map.json',
 './assamese-ner-model\\spiece.model',
 './assamese-ner-model\\added_tokens.json',
 './assamese-ner-model\\tokenizer.json')

In [49]:
# ------------- Step 12: Inference Function -----------
def ner_predict(text, max_length=128):
    """Tag each whitespace-separated word of ``text`` with a predicted label.

    Uses the notebook-level ``tokenizer``, ``model`` and ``id2tag``. The label
    of a word is the prediction at its first sub-token.

    Args:
        text: Raw input string; it is split on whitespace into words.
        max_length: Maximum number of sub-tokens (including special tokens)
            fed to the model. Longer inputs are truncated.

    Returns:
        List of ``(word, tag)`` tuples in input order. Words that received no
        prediction (cut off by truncation, or producing no sub-tokens) are
        omitted; an empty or whitespace-only ``text`` yields ``[]``.

    Warns:
        UserWarning: if one or more words were omitted from the result.
    """
    import warnings

    words = text.strip().split()
    if not words:
        return []

    # A single example needs no padding; truncation still caps the length.
    inputs = tokenizer(
        words,
        is_split_into_words=True,
        return_tensors="pt",
        truncation=True,
        max_length=max_length
    )
    word_ids = inputs.word_ids()
    # The model may live on a different device than the freshly built tensors.
    inputs = inputs.to(model.device)

    # Inference must run with dropout disabled and without autograd tracking;
    # restore the previous mode afterwards so a later training step is unaffected.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            logits = model(**inputs).logits
    finally:
        if was_training:
            model.train()

    # Index the single batch row explicitly instead of relying on squeeze().
    predicted_ids = logits.argmax(dim=-1)[0].tolist()

    # Record the prediction at the first sub-token of every word, keyed by the
    # word's index so that a word without sub-tokens cannot shift the tags of
    # the words that follow it.
    tag_by_word = {}
    for position, word_idx in enumerate(word_ids):
        if word_idx is not None and word_idx not in tag_by_word:
            tag_by_word[word_idx] = id2tag[predicted_ids[position]]

    missing = len(words) - len(tag_by_word)
    if missing:
        warnings.warn(
            f"{missing} of {len(words)} words received no prediction "
            f"(input truncated at max_length={max_length} or word produced no "
            "sub-tokens) and were omitted from the result."
        )

    return [(words[idx], tag_by_word[idx]) for idx in sorted(tag_by_word)]

In [50]:
# Example
# Long multi-sentence passage. ner_predict truncates its input at 128
# sub-tokens, so words past that limit do not appear in the printed result.
example = "কটক চহৰৰ মিছনেৰী স্কুলতেই সুভাষে প্রথম শিক্ষা লাভ কৰে। তীক্ষ্ণ প্রতিভাসম্পন্ন সুভাষে “ৰেভেঞ্চ” কলেজিয়েট স্কুলৰপৰা ১৯১৩ চনত পাটনা বিশ্ববিদ্যালয়ৰ ভিতৰত দ্বিতীয় স্থান লাভ কৰি পৰীক্ষাত উত্তীর্ণ হয়। তাৰ পাছত কলকাতালৈ আহি প্ৰেছিডেন্সী কলেজত ভৰ্ত্তি হয়। সেই সময়তে শ্ৰীৰামকৃষ্ণ পৰমহংস আৰু স্বামী বিবেকানন্দৰ প্ৰেৰণাত সংসাৰৰ মায়া-মোহ ত্যাগ কৰি সন্ন্যাসী জীৱন যাপনৰ অভিলাষেৰে হিমালয়লৈ গৈ গভীৰ আত্ম-সাধনাত ৰত হয়। উপযুক্ত গুৰুৰ সন্ধানত তেওঁ কিছুদিন ভাৰতৰ নানা তীৰ্থ ভ্ৰমণ কৰে। কিন্তু মুক্তি-পথৰ সন্ধান থাকিল বহুত দূৰত। সেয়েহে তেওঁ ফিৰি আহি পুনৰ প্ৰেছিডেন্সি কলেজত ভৰ্ত্তি হ’লহি। ক’বলৈ গ’লে এই প্রেছিডেন্সি কলেজেই তেওঁৰ অন্তৰত ৰূপ খাই থকা দেশাত্মবোধ আৰু বিপ্লৱী ভাৱধাৰাৰ আগ্নেয়গিৰিটো উদগীৰণত বৰঙণি যোগায়।"
print(ner_predict(example))

[('কটক', 'B-LOC'), ('চহৰৰ', 'I-LOC'), ('মিছনেৰী', 'B-ORG'), ('স্কুলতেই', 'B-WOA'), ('সুভাষে', 'I-WOA'), ('প্রথম', 'B-NUM'), ('শিক্ষা', 'B-WOA'), ('লাভ', 'I-WOA'), ('কৰে।', 'I-WOA'), ('তীক্ষ্ণ', 'B-WOA'), ('প্রতিভাসম্পন্ন', 'I-WOA'), ('সুভাষে', 'I-WOA'), ('“ৰেভেঞ্চ”', 'B-WOA'), ('কলেজিয়েট', 'B-WOA'), ('স্কুলৰপৰা', 'I-WOA'), ('১৯১৩', 'B-DATE'), ('চনত', 'I-DATE'), ('পাটনা', 'B-ORG'), ('বিশ্ববিদ্যালয়ৰ', 'I-ORG'), ('ভিতৰত', 'B-WOA'), ('দ্বিতীয়', 'B-WOA'), ('স্থান', 'B-WOA'), ('লাভ', 'B-WOA'), ('কৰি', 'I-WOA'), ('পৰীক্ষাত', 'B-WOA'), ('উত্তীর্ণ', 'I-WOA'), ('হয়।', 'I-WOA'), ('তাৰ', 'B-WOA'), ('পাছত', 'I-WOA'), ('কলকাতালৈ', 'B-WOA'), ('আহি', 'I-WOA'), ('প্ৰেছিডেন্সী', 'B-WOA'), ('কলেজত', 'B-WOA'), ('ভৰ্ত্তি', 'I-WOA'), ('হয়।', 'I-WOA'), ('সেই', 'B-WOA'), ('সময়তে', 'I-WOA'), ('শ্ৰীৰামকৃষ্ণ', 'I-WOA'), ('পৰমহংস', 'B-WOA'), ('আৰু', 'I-WOA'), ('স্বামী', 'I-WOA'), ('বিবেকানন্দৰ', 'I-WOA'), ('প্ৰেৰণাত', 'I-WOA'), ('সংসাৰৰ', 'I-WOA'), ('মায়া-মোহ', 'B-WOA'), ('ত্যাগ', 'B-WOA'), ('কৰি', 'I-WOA'

In [51]:
# Short four-word input as a quick check of ner_predict.
text = "কুমাৰ গাম্বাৰ সঙ্গল মহিধৰ"
print(ner_predict(text)) 

[('কুমাৰ', 'B-LOC'), ('গাম্বাৰ', 'I-ORG'), ('সঙ্গল', 'I-ORG'), ('মহিধৰ', 'I-ORG')]
