In [1]:
import os
import torch
import numpy as np
from torch.utils.data import Dataset
from transformers import AutoTokenizer, AutoModelForTokenClassification, TrainingArguments, Trainer, AutoConfig
from seqeval.metrics import classification_report

In [2]:
# ------------- Step 1: Read CoNLL Data -----------------
def read_conll(filename):
    """Parse a tab-separated CoNLL-style file into parallel word/tag lists.

    A blank line, or a line whose stripped text starts with ``*``, ends the
    current sentence. Every other line is split on tabs; the first field is
    taken as the word and the second as its tag. Lines with fewer than two
    tab-separated fields are ignored without ending the sentence.

    Args:
        filename: Path of the UTF-8 encoded file to read.

    Returns:
        ``(sentences, labels)`` where ``sentences[i]`` is the list of words of
        sentence ``i`` and ``labels[i]`` the list of tags at the same positions.
    """
    sentences, labels = [], []
    current_words, current_tags = [], []

    def flush():
        # Commit the sentence collected so far (if any) and start a new one.
        nonlocal current_words, current_tags
        if current_words:
            sentences.append(current_words)
            labels.append(current_tags)
            current_words, current_tags = [], []

    with open(filename, encoding='utf-8') as handle:
        for raw_line in handle:
            row = raw_line.strip()
            if row == '' or row[0] == '*':
                flush()
            else:
                fields = row.split('\t')
                if len(fields) > 1:
                    current_words.append(fields[0])
                    current_tags.append(fields[1])
        # The file may not end with a separator line.
        flush()
    return sentences, labels

# Load separate train and test files
# Both paths are relative, so they are resolved against the notebook's
# working directory. Each call yields (sentences, tags) as parallel lists.
train_sents, train_tags = read_conll('Train.txt')
test_sents, test_tags = read_conll('Test.txt')

In [3]:
# ------------- Step 2: Label Mapping -----------------
# The label inventory is taken from the training split only; ids are assigned
# in sorted tag order so the mapping is stable between runs.
all_train_tags = [tag for sentence_tags in train_tags for tag in sentence_tags]
unique_tags = sorted(set(all_train_tags))
id2tag = dict(enumerate(unique_tags))
tag2id = {tag: idx for idx, tag in id2tag.items()}

In [4]:
# ------------- Step 3: Tokenization & Alignment ------
# Tokenizer of the "ai4bharat/indic-bert" checkpoint (the same identifier is
# used for the model config and weights in the model-setup cell).
tokenizer = AutoTokenizer.from_pretrained("ai4bharat/indic-bert")

def tokenize_and_align_labels(sentences, tags, tokenizer, tag2id, max_length=128):
    """Tokenize pre-split sentences and build per-token label ids.

    Only the first sub-word token of every word carries that word's label id;
    special/padding tokens (word id ``None``) and continuation sub-words get
    ``-100``. Tags that are missing from ``tag2id`` fall back to the id of
    ``'O'``.

    Args:
        sentences: List of sentences, each a list of words.
        tags: List of tag sequences parallel to ``sentences``.
        tokenizer: Callable tokenizer whose result exposes
            ``word_ids(batch_index=...)``.
        tag2id: Mapping from tag string to integer id.
        max_length: Length every sequence is truncated/padded to.

    Returns:
        ``(tokenized_inputs, labels)`` where ``labels`` is a list of lists of
        ints, one inner list per entry of ``tags``.
    """
    tokenized_inputs = tokenizer(
        sentences,
        is_split_into_words=True,
        truncation=True,
        padding='max_length',
        max_length=max_length,
        return_tensors='pt'
    )

    def encode(tag):
        # Handle unseen tags in test set
        try:
            return tag2id[tag]
        except KeyError:
            return tag2id['O']

    labels = []
    for row, sentence_tags in enumerate(tags):
        aligned = []
        last_word = None
        for word in tokenized_inputs.word_ids(batch_index=row):
            starts_new_word = word is not None and word != last_word
            aligned.append(encode(sentence_tags[word]) if starts_new_word else -100)
            last_word = word
        labels.append(aligned)
    return tokenized_inputs, labels

# Process both datasets
# The same tag2id (built from the training tags) is used for both splits;
# test tags missing from it are mapped to 'O' inside tokenize_and_align_labels.
train_inputs, train_labels = tokenize_and_align_labels(train_sents, train_tags, tokenizer, tag2id)
test_inputs, test_labels = tokenize_and_align_labels(test_sents, test_tags, tokenizer, tag2id)

In [5]:
# ------------- Step 4: Dataset Class -----------------
class NERDataset(Dataset):
    """Map-style dataset pairing tokenizer encodings with aligned label ids.

    ``encodings`` is any mapping whose values can be indexed by sample
    position; ``labels`` is a sequence with one list of label ids per sample.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        # The number of samples is defined by the label list.
        return len(self.labels)

    def __getitem__(self, idx):
        """Return one sample: every encoding field at ``idx`` plus ``'labels'``."""
        sample = {}
        for name, column in self.encodings.items():
            sample[name] = column[idx]
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

# Wrap the tokenized splits; these objects are handed to the Trainer below.
train_dataset = NERDataset(train_inputs, train_labels)
test_dataset = NERDataset(test_inputs, test_labels)

In [6]:
# ------------- Step 5: Model Setup -------------------
# The label maps are stored in the config so that they are written out with
# the model when it is saved.
config = AutoConfig.from_pretrained(
    "ai4bharat/indic-bert",
    num_labels=len(tag2id),
    id2label=id2tag,
    label2id=tag2id
)

# The token-classification head is not part of the checkpoint and is newly
# initialised (see the warning printed by this cell).
# NOTE(review): ignore_mismatched_sizes=True is presumably a safeguard for the
# head size differing from the checkpoint — confirm it is actually needed.
model = AutoModelForTokenClassification.from_pretrained(
    "ai4bharat/indic-bert",
    config=config,
    ignore_mismatched_sizes=True
)

Some weights of AlbertForTokenClassification were not initialized from the model checkpoint at ai4bharat/indic-bert and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [7]:
# ------------- Step 6: Training Arguments -------------
# Evaluation and checkpoint saving are both configured to run once per epoch.
training_args = TrainingArguments(
    output_dir='./assamese-ner-model',
    num_train_epochs=5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=3e-5,
    logging_dir='./logs',
    logging_steps=10,
    load_best_model_at_end=True,
    # NOTE(review): the checkpoint reloaded at the end is chosen by eval_loss,
    # not by the precision/recall/F1 returned from compute_metrics. In the
    # logged run the lowest validation loss is at epoch 3 while F1 is highest
    # at epoch 5 — confirm which criterion is intended.
    metric_for_best_model="eval_loss"
)

# ------------- Step 7: Compute Metrics ----------------
def compute_metrics(p):
    """Compute weighted-average precision/recall/F1 for an evaluation pass.

    Args:
        p: Pair ``(predictions, labels)``; ``predictions`` holds per-token
            class scores (argmax is taken over axis 2) and ``labels`` the gold
            label ids, where ``-100`` marks positions to ignore.

    Returns:
        Dict with keys ``"precision"``, ``"recall"`` and ``"f1"`` read from the
        ``"weighted avg"`` row of the seqeval classification report, or all
        zeros when no position carries a real label.
    """
    scores, gold_ids = p
    pred_ids = np.argmax(scores, axis=2)

    def to_iob(tag_id):
        # Unknown ids, and any tag whose second character is not '-',
        # collapse to 'O'.
        name = id2tag.get(tag_id, 'O')
        return name if len(name) >= 2 and name[1] == '-' else 'O'

    true_labels, true_preds = [], []
    for pred_row, gold_row in zip(pred_ids, gold_ids):
        # Keep only positions that carry a real gold label.
        kept = [(gold, pred) for pred, gold in zip(pred_row, gold_row) if gold != -100]
        if kept:
            true_labels.append([to_iob(gold) for gold, _ in kept])
            true_preds.append([to_iob(pred) for _, pred in kept])

    if not true_labels:
        return {"precision": 0.0, "recall": 0.0, "f1": 0.0}

    report = classification_report(true_labels, true_preds, output_dict=True)
    weighted = report["weighted avg"]
    return {
        "precision": weighted["precision"],
        "recall": weighted["recall"],
        "f1": weighted["f1-score"]
    }

# ------------- Step 8: Trainer -----------------------
# NOTE(review): the test split is passed as eval_dataset, so the per-epoch
# evaluation — and the best-checkpoint selection enabled in training_args —
# is computed on test data; no separate validation split is used here.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics
)

# ------------- Step 9: Train! -----------------------
# Long-running cell: the logged run reports a train_runtime of ~44977 s.
trainer.train()



Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,0.5881,0.788517,0.6599,0.654527,0.640156
2,0.5551,0.740781,0.690955,0.667183,0.659436
3,0.2658,0.632279,0.68994,0.702699,0.691335
4,0.1526,0.771766,0.713484,0.706703,0.704951
5,0.1433,0.841258,0.712494,0.708769,0.705358


  _warn_prf(average, modifier, msg_start, len(result))


TrainOutput(global_step=11680, training_loss=0.475557904465966, metrics={'train_runtime': 44977.3017, 'train_samples_per_second': 2.077, 'train_steps_per_second': 0.26, 'total_flos': 517311861496320.0, 'train_loss': 0.475557904465966, 'epoch': 5.0})

In [8]:
# ------------- Step 10: Save Model -------------------
# Weights/config and tokenizer files go to the same directory that was used
# as the Trainer's output_dir.
model.save_pretrained('./assamese-ner-model')
tokenizer.save_pretrained('./assamese-ner-model')

('./assamese-ner-model\\tokenizer_config.json',
 './assamese-ner-model\\special_tokens_map.json',
 './assamese-ner-model\\spiece.model',
 './assamese-ner-model\\added_tokens.json',
 './assamese-ner-model\\tokenizer.json')

In [9]:
def ner_predict(text):
    """Tag every whitespace-separated word of ``text`` with a predicted NER label.

    Uses the notebook-level ``tokenizer``, ``model`` and ``id2tag``.

    Args:
        text: Raw sentence; it is split on whitespace into words.

    Returns:
        A list of ``(word, tag)`` pairs with exactly one entry per input word,
        or ``[]`` for empty/blank input. A word that received no prediction
        (cut off by truncation at 128 tokens, or producing no sub-word piece)
        is tagged ``'O'``.
    """
    words = text.strip().split()
    if not words:  # Handle empty input
        return []

    # Tokenize with same parameters as training
    inputs = tokenizer(
        words,
        is_split_into_words=True,
        return_tensors="pt",
        truncation=True,
        padding='max_length',
        max_length=128
    )

    # Inference mode: make sure dropout is off and the inputs live on the
    # same device as the model (it may have been moved during training).
    model.eval()
    inputs = inputs.to(model.device)

    # Get predictions
    with torch.no_grad():
        outputs = model(**inputs)
    # Select the single batch row explicitly instead of relying on squeeze().
    predictions = torch.argmax(outputs.logits, dim=2)[0].tolist()

    # Align with original words. Tags are written by word index rather than
    # appended, so a word without any token cannot shift the tags after it.
    word_ids = inputs.word_ids()
    tags = ['O'] * len(words)
    previous_word_idx = None

    for idx, word_idx in enumerate(word_ids):
        # Only the first sub-word token of each word decides the word's tag.
        if word_idx is None or word_idx == previous_word_idx:
            continue
        previous_word_idx = word_idx

        # 1. Handle unknown labels
        tag = id2tag.get(predictions[idx], 'O')

        # 2. Validate IOB format for Assamese
        if len(tag) < 2 or tag[1] != '-':
            tag = 'O'
        elif tag.startswith('I-') and (idx == 0 or word_ids[idx - 1] is None):
            # Convert I- to B- if the previous token was special. The check
            # uses word_ids because predictions are argmax ids and can never
            # equal -100.
            tag = 'B' + tag[1:]

        tags[word_idx] = tag

    return list(zip(words, tags))


In [10]:
# Quick check of ner_predict on one short whitespace-separated input.
text = "কুমাৰ গাম্বাৰ সঙ্গল মহিধৰ"
print(ner_predict(text)) 

[('কুমাৰ', 'B-PER'), ('গাম্বাৰ', 'I-PER'), ('সঙ্গল', 'B-WOA'), ('মহিধৰ', 'I-PER')]


In [11]:
import torch
from transformers import AutoTokenizer, AutoModelForTokenClassification

# 1. Define paths
model_dir = './assamese-ner-model'  # Path to your saved model

# 2. Load model and tokenizer
# This rebinds the notebook-level names `tokenizer` and `model` to the
# objects restored from model_dir.
tokenizer = AutoTokenizer.from_pretrained(model_dir)
model = AutoModelForTokenClassification.from_pretrained(
    model_dir
)


# 3. Inference function with proper alignment
def ner_predict(text, tokenizer, model):
    """Tag every whitespace-separated word of ``text`` using ``model``.

    This definition replaces the earlier one-argument ``ner_predict(text)``.
    Labels are read from ``model.config.id2label`` so the function does not
    depend on any variable created by the training cells.

    Args:
        text: Raw sentence; it is split on whitespace into words.
        tokenizer: Tokenizer matching ``model``; its result must expose
            ``word_ids()``.
        model: Token-classification model whose config carries ``id2label``.

    Returns:
        A list of ``(word, tag)`` pairs with exactly one entry per input word,
        or ``[]`` for empty/blank input. A word that received no prediction
        (cut off by truncation at 128 tokens, or producing no sub-word piece)
        is tagged ``'O'``.
    """
    words = text.split()
    if not words:  # nothing to tag
        return []

    inputs = tokenizer(
        words,
        is_split_into_words=True,
        return_tensors="pt",
        truncation=True,
        padding='max_length',
        max_length=128
    )

    # Inference mode, with inputs on the same device as the model.
    model.eval()
    inputs = inputs.to(model.device)

    with torch.no_grad():
        logits = model(**inputs).logits
    # Select the single batch row explicitly instead of relying on squeeze().
    predictions = torch.argmax(logits, dim=2)[0].tolist()

    id2label = model.config.id2label
    word_ids = inputs.word_ids()
    # Tags are written by word index rather than appended, so a word without
    # any token cannot shift the tags after it.
    tags = ['O'] * len(words)
    current_word = None

    for idx, word_idx in enumerate(word_ids):
        # Only the first sub-word token of each word decides the word's tag.
        if word_idx is None or word_idx == current_word:
            continue
        current_word = word_idx

        tag = id2label.get(predictions[idx], 'O')

        # Convert I- to B- when the previous token was a special token. The
        # check uses word_ids because predictions are argmax ids and can
        # never equal -100.
        if tag.startswith('I-') and (idx == 0 or word_ids[idx - 1] is None):
            tag = 'B' + tag[1:]

        tags[word_idx] = tag

    return list(zip(words, tags))




# Run the reloaded model on a longer multi-entity input.
text = "কেনী দেউৰী বসুমতাৰী প্ৰথমখন ইংৰাজী উপন্যাস চকলেট_গীটাৰ_ম'ম'জ ২০১১ চনত উন্মোচন হৈ যায় দিবাকৰ বেনাৰ্জীৰ চাংহাই নামৰ হিন্দী চলচ্চিত্ৰত পুলিচ অফিচাৰৰ ভূমিকাত ২০১৩ মেৰীকম চলচ্চিত্ৰ ২০১৪ , লোকেল কুং ফু ২০১৩ লোকেল কুং ফু"
print(ner_predict(text, tokenizer, model))



[('কেনী', 'B-PER'), ('দেউৰী', 'I-PER'), ('বসুমতাৰী', 'I-PER'), ('প্ৰথমখন', 'B-NUM'), ('ইংৰাজী', 'B-WOA'), ('উপন্যাস', 'I-WOA'), ("চকলেট_গীটাৰ_ম'ম'জ", 'B-WOA'), ('২০১১', 'B-DATE'), ('চনত', 'I-DATE'), ('উন্মোচন', 'B-WOA'), ('হৈ', 'I-WOA'), ('যায়', 'I-WOA'), ('দিবাকৰ', 'B-WOA'), ('বেনাৰ্জীৰ', 'I-WOA'), ('চাংহাই', 'I-WOA'), ('নামৰ', 'I-WOA'), ('হিন্দী', 'I-WOA'), ('চলচ্চিত্ৰত', 'I-WOA'), ('পুলিচ', 'B-WOA'), ('অফিচাৰৰ', 'I-WOA'), ('ভূমিকাত', 'I-WOA'), ('২০১৩', 'B-DATE'), ('মেৰীকম', 'B-WOA'), ('চলচ্চিত্ৰ', 'I-WOA'), ('২০১৪', 'B-DATE'), (',', 'O'), ('লোকেল', 'B-WOA'), ('কুং', 'I-WOA'), ('ফু', 'I-WOA'), ('২০১৩', 'B-DATE'), ('লোকেল', 'B-WOA'), ('কুং', 'I-WOA'), ('ফু', 'I-WOA')]
