In [None]:
import torch
import os
import zipfile
from transformers import (
    AutoTokenizer,
    AutoModelForTokenClassification,
    TrainingArguments,
    Trainer,
    DataCollatorForTokenClassification
)
from datasets import Dataset, DatasetDict
import numpy as np

# ----------- MOUNT GOOGLE DRIVE -----------
# Mounted before the configuration below because OUTPUT_DIR points inside
# the mounted drive.
from google.colab import drive
drive.mount('/content/drive')

# ----------- CONFIG -----------
# Checkpoint name handed to from_pretrained() for both the tokenizer and the
# token-classification model.
MODEL_NAME = "distilbert-base-cased"  # Smaller model
# Trainer output directory; the final model is saved here and the inference
# pipeline reloads it from the same path.
OUTPUT_DIR = "/content/drive/MyDrive/my_ner_model"  # Save model to Google Drive
# Archive opened by load_conll2003_from_zip(); a relative path, so it is
# resolved against the current working directory.
DATASET_ZIP_PATH = "conll2003.zip"

# Label configuration
# BIO tag inventory; a tag's position in this list is its integer class id.
LABEL_LIST = ["O", "B-MISC", "I-MISC", "B-PER", "I-PER", "B-ORG", "I-ORG", "B-LOC", "I-LOC"]
LABEL_TO_ID = dict(zip(LABEL_LIST, range(len(LABEL_LIST))))
# Collapses the B-/I- variants of every entity type onto one display name.
OUTPUT_LABEL_MAP = {
    **dict.fromkeys(("B-MISC", "I-MISC"), "MISC"),
    **dict.fromkeys(("B-PER", "I-PER"), "PERSON"),
    **dict.fromkeys(("B-ORG", "I-ORG"), "ORG"),
    **dict.fromkeys(("B-LOC", "I-LOC"), "LOC"),
    "O": "O",
}

# ----------- DATASET LOADING FROM LOCAL ZIP -----------
def read_conll_file(file_path, label_to_id=None):
    """Parse one CoNLL-style file into parallel token / tag-id lists.

    Each non-blank line is whitespace-split; the first column is the token
    and the last column is the NER tag. Blank lines end a sentence.
    ``-DOCSTART-`` lines are document separators, not text, so they are
    skipped (and also end the current sentence) instead of being emitted as
    one-token sentences.

    Args:
        file_path: Path of the UTF-8 text file to read.
        label_to_id: Mapping from tag string to integer id. Defaults to the
            module-level ``LABEL_TO_ID``.

    Returns:
        ``{"tokens": [[str, ...], ...], "ner_tags": [[int, ...], ...]}`` with
        one inner list per sentence.
    """
    if label_to_id is None:
        label_to_id = LABEL_TO_ID

    tokens, ner_tags = [], []
    current_tokens, current_tags = [], []

    # Stream the file line by line; there is no need to hold it all in memory.
    with open(file_path, 'r', encoding='utf-8') as f:
        for raw_line in f:
            parts = raw_line.split()
            if parts and parts[0] != "-DOCSTART-":
                current_tokens.append(parts[0])
                # Unknown tags fall back to id 0, keeping the load best-effort.
                current_tags.append(label_to_id.get(parts[-1], 0))
                continue
            # Blank line or document marker: close the sentence in progress.
            if current_tokens:
                tokens.append(current_tokens)
                ner_tags.append(current_tags)
                current_tokens, current_tags = [], []

    # The file may not end with a blank line.
    if current_tokens:
        tokens.append(current_tokens)
        ner_tags.append(current_tags)

    return {"tokens": tokens, "ner_tags": ner_tags}


def load_conll2003_from_zip(zip_path, extract_dir="conll2003_data"):
    """Extract a CoNLL-2003 archive and load its three splits.

    The archive is expected to contain ``train.txt``, ``valid.txt`` and
    ``test.txt`` at its top level.

    Args:
        zip_path: Path to the zip archive.
        extract_dir: Directory the archive is extracted into.

    Returns:
        A ``DatasetDict`` with ``train``, ``validation`` and ``test`` splits,
        each holding ``tokens`` and ``ner_tags`` columns.
    """
    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        zip_ref.extractall(extract_dir)

    split_files = {
        "train": "train.txt",
        "validation": "valid.txt",
        "test": "test.txt",
    }
    return DatasetDict({
        split: Dataset.from_dict(read_conll_file(os.path.join(extract_dir, filename)))
        for split, filename in split_files.items()
    })

# ----------- DATA PREPARATION -----------
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)


def tokenize_and_align(examples, max_length=128):
    """Tokenize pre-split sentences and align word-level tags to sub-word tokens.

    Every sub-word token receives the tag of the word it came from; special
    tokens (whose word id is ``None``) receive ``-100``, the value PyTorch's
    cross-entropy loss ignores by default.

    Sequences are truncated to ``max_length`` but deliberately NOT padded
    here: padding is left to the batch collator so each batch is only as
    long as its longest example. The offset mapping is not requested because
    nothing during training reads it.

    Args:
        examples: A batch with ``tokens`` (list of word lists) and
            ``ner_tags`` (list of tag-id lists), as passed by
            ``Dataset.map(..., batched=True)``.
        max_length: Maximum number of sub-word tokens kept per sentence.

    Returns:
        The tokenizer output with an added ``labels`` entry.
    """
    tokenized = tokenizer(
        examples["tokens"],
        truncation=True,
        is_split_into_words=True,
        max_length=max_length,
    )

    labels = []
    for i, tags in enumerate(examples["ner_tags"]):
        word_ids = tokenized.word_ids(batch_index=i)
        labels.append(
            [-100 if word_idx is None else tags[word_idx] for word_idx in word_ids]
        )

    tokenized["labels"] = labels
    return tokenized

# ----------- MODEL TRAINING -----------
def train_ner_model():
    """Fine-tune MODEL_NAME on the zipped CoNLL-2003 data and save it to OUTPUT_DIR.

    Loads the dataset, tokenizes it, trains for three epochs with per-epoch
    evaluation and checkpointing, then writes the final model to OUTPUT_DIR.
    """
    raw_splits = load_conll2003_from_zip(DATASET_ZIP_PATH)
    encoded_splits = raw_splits.map(tokenize_and_align, batched=True)

    # Both directions of the label mapping are stored in the model config.
    id_to_label = dict(enumerate(LABEL_LIST))
    model = AutoModelForTokenClassification.from_pretrained(
        MODEL_NAME,
        num_labels=len(LABEL_LIST),
        id2label=id_to_label,
        label2id=LABEL_TO_ID,
    )

    # Mixed precision is only switched on when a CUDA device is present.
    use_fp16 = torch.cuda.is_available()
    log_dir = os.path.join(OUTPUT_DIR, "logs")
    training_args = TrainingArguments(
        OUTPUT_DIR,
        eval_strategy="epoch",
        save_strategy="epoch",
        num_train_epochs=3,
        learning_rate=5e-5,
        weight_decay=0.01,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        fp16=use_fp16,
        logging_dir=log_dir,
        report_to=[],
    )

    collator = DataCollatorForTokenClassification(tokenizer)
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=encoded_splits["train"],
        eval_dataset=encoded_splits["validation"],
        tokenizer=tokenizer,
        data_collator=collator,
    )

    print("Starting training...")
    trainer.train()
    trainer.save_model(OUTPUT_DIR)
    print(f"Model saved to {OUTPUT_DIR}")

# ----------- INFERENCE PIPELINE -----------
class CleanNERPipeline:
    """Run a fine-tuned token classifier on raw text and return merged entity spans."""

    def __init__(self, model_path):
        """Load tokenizer and model from ``model_path`` and pick the compute device.

        Args:
            model_path: Directory (or hub id) accepted by ``from_pretrained``.
        """
        self.tokenizer = AutoTokenizer.from_pretrained(model_path)
        self.model = AutoModelForTokenClassification.from_pretrained(model_path)
        # A real torch.device rather than a 0 / -1 index: -1 is not a valid
        # target for ``.to()``, so the CPU path used to fail.
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.model.to(self.device)

    def predict(self, text):
        """Extract entities from a single string.

        The label predicted for the first sub-word piece of a word decides the
        label of the whole word; later pieces of the same word only extend the
        span. Consecutive words of the same type are merged unless the next
        word is tagged ``B-``.

        Args:
            text: The raw input string.

        Returns:
            A list of dicts with ``text`` (the exact substring of the input),
            ``type`` (a value of ``OUTPUT_LABEL_MAP``) and integer character
            offsets ``start`` / ``end``.
        """
        encoding = self.tokenizer(
            text,
            return_tensors="pt",
            return_offsets_mapping=True,
            truncation=True,
        )
        # Offsets and word ids are tokenizer metadata, not model inputs;
        # passing offset_mapping to forward() raises a TypeError.
        offsets = encoding.pop("offset_mapping")[0].tolist()
        word_ids = encoding.word_ids(0)
        model_inputs = {name: tensor.to(self.device) for name, tensor in encoding.items()}

        with torch.no_grad():
            logits = self.model(**model_inputs).logits
        predictions = logits.argmax(dim=-1)[0].tolist()

        entities = []
        current_entity = None
        previous_word_id = None

        for pred, (start, end), word_id in zip(predictions, offsets, word_ids):
            # Special tokens have no word id and a zero-width offset.
            if word_id is None or start == end:
                continue

            if word_id == previous_word_id:
                # Continuation piece: it only stretches an open entity, its own
                # prediction is ignored so a word is never split or relabelled.
                if current_entity is not None:
                    current_entity["end"] = end
                    current_entity["text"] = text[current_entity["start"]:end]
                continue
            previous_word_id = word_id

            token_label = LABEL_LIST[pred]
            simple_label = OUTPUT_LABEL_MAP.get(token_label, "O")

            if simple_label == "O":
                if current_entity is not None:
                    entities.append(current_entity)
                current_entity = None
                continue

            starts_new_entity = (
                token_label.startswith("B-")
                or current_entity is None
                or current_entity["type"] != simple_label
            )
            if starts_new_entity:
                if current_entity is not None:
                    entities.append(current_entity)
                current_entity = {
                    "text": text[start:end],
                    "type": simple_label,
                    "start": start,
                    "end": end,
                }
            else:
                # Slice the source text so the original spacing and punctuation
                # are kept instead of re-joining pieces with spaces.
                current_entity["end"] = end
                current_entity["text"] = text[current_entity["start"]:end]

        if current_entity is not None:
            entities.append(current_entity)

        return entities

# ----------- MAIN EXECUTION -----------
if __name__ == "__main__":
    # Training has to finish first: it writes the model to OUTPUT_DIR, which
    # the inference pipeline below loads from.
    train_ner_model()
    ner = CleanNERPipeline(OUTPUT_DIR)
    # Single-sentence smoke test of the freshly trained model.
    sample_text = "John works at Microsoft in Seattle."
    print("NER Output:", ner.predict(sample_text))

Mounted at /content/drive


Map:   0%|          | 0/14987 [00:00<?, ? examples/s]

Map:   0%|          | 0/3466 [00:00<?, ? examples/s]

Map:   0%|          | 0/3684 [00:00<?, ? examples/s]

Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Starting training...


Epoch,Training Loss,Validation Loss
1,0.2221,0.076589
2,0.0509,0.069201
3,0.022,0.067978


Model saved to /content/drive/MyDrive/my_ner_model


TypeError: DistilBertForTokenClassification.forward() got an unexpected keyword argument 'offset_mapping'

In [None]:
import re
import torch
import gradio as gr
from transformers import AutoTokenizer, AutoModelForTokenClassification, TokenClassificationPipeline

# Path to your local model
# (a directory on the mounted Google Drive; it must already contain a saved
# tokenizer and token-classification model)
model_path = "/content/drive/MyDrive/my_ner_model"

# Load tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForTokenClassification.from_pretrained(model_path)

# Create NER pipeline
# Token-level predictions are requested (no built-in aggregation) and merged
# afterwards by aggregate_entities(). The device argument selects GPU 0 when
# CUDA is available and -1 (CPU) otherwise.
ner_pipeline = TokenClassificationPipeline(
    model=model,
    tokenizer=tokenizer,
    aggregation_strategy=None,  # we'll handle subwords manually
    device=0 if torch.cuda.is_available() else -1
)

# India-specific regex patterns
# Month names are spelled out (full name or usual abbreviation) rather than
# "abbreviation + any letters", which also matched ordinary words such as
# "Mary", "Market" or "Maybe".
_MONTH_NAME = (
    r"(?:Jan(?:uary)?|Feb(?:ruary)?|Mar(?:ch)?|Apr(?:il)?|May|June?|July?"
    r"|Aug(?:ust)?|Sep(?:t(?:ember)?)?|Oct(?:ober)?|Nov(?:ember)?|Dec(?:ember)?)"
)

# Maps an entity type to the list of regexes that detect it. Specific,
# longer patterns are listed before the generic bare-month / bare-year ones so
# that code walking the table in order sees the most specific match first.
INDIA_ENTITY_PATTERNS = {
    "AADHAAR": [
        # 12 digits, optionally grouped 4-4-4, NOT preceded by "+" (so the
        # digits of a "+91..." phone number are not mistaken for an Aadhaar).
        r"(?<!\+)\b\d{4}\s?\d{4}\s?\d{4}\b",
    ],
    "MOBILE": [
        # +91, an optional space/hyphen, then exactly 10 digits not starting
        # with 0.
        r"\+91[\s-]?[1-9]\d{9}(?!\d)",
    ],
    # 5 letters, 4 digits, 1 letter; anchored so it cannot match inside a
    # longer alphanumeric string.
    "PAN": [r"\b[A-Z]{5}\d{4}[A-Z]\b"],
    # State code, district number, series letters, 4-digit number; the groups
    # may be separated by a space or hyphen ("MH12AB1234", "MH 12 AB 1234").
    "VEHICLE_NO": [r"\b[A-Z]{2}[ -]?\d{2}[ -]?[A-Z]{1,2}[ -]?\d{4}\b"],
    # Any domain, not only gmail.com.
    "EMAIL": [r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9-]+(?:\.[a-zA-Z0-9-]+)*\.[a-zA-Z]{2,}"],
    "MONEY": [
        # Currency marker, then digits with optional Indian/Western comma
        # grouping ("5000", "1,50,000"), optional decimals and an optional
        # unit. The unit group owns its leading space so a match never ends in
        # trailing whitespace.
        r"(?:₹|\bRs\.?)\s?\d+(?:,\d{2,3})*(?:\.\d{1,2})?(?:\s?(?:crores?|lakhs?|thousand)\b)?",
    ],
    "DATE": [
        r"\b\d{1,2}[/-]\d{1,2}[/-]\d{4}\b",                # DD/MM/YYYY or DD-MM-YYYY
        r"\b\d{4}[/-]\d{2}[/-]\d{2}\b",                    # YYYY/MM/DD or YYYY-MM-DD
        r"\b\d{1,2}\s" + _MONTH_NAME + r"\s\d{4}\b",       # 12 June 2025
        r"\b" + _MONTH_NAME + r"\s\d{1,2},?\s\d{4}\b",     # June 12, 2025
        r"\b" + _MONTH_NAME + r"\b",                       # Only month
        # Only year: limited to 1800-2099 so arbitrary four-digit numbers
        # (amounts, PINs, room numbers) are not reported as dates.
        r"\b(?:1[89]|20)\d{2}\b",
    ],
}

def aggregate_entities(tokens):
    """Merge WordPiece continuation tokens ("##...") into the word they continue.

    A ``##`` piece is appended to the entity being built only when it starts
    exactly where that entity ends. A piece whose own first part is missing
    from ``tokens`` is therefore not glued onto an unrelated earlier entity;
    it becomes its own entry with the ``##`` marker stripped.

    Args:
        tokens: Token-level predictions in text order, each a dict with
            ``word``, ``entity``, ``start``, ``end`` and ``score``.

    Returns:
        A new list of dicts with the same keys. A merged entry keeps the label
        of its first piece, spans first start to last end, and carries the
        lowest score among its pieces. The input dicts are not modified.
    """
    merged_entities = []
    buffer = None

    for tok in tokens:
        word = tok['word']
        is_continuation = word.startswith("##")

        if is_continuation and buffer is not None and tok['start'] == buffer['end']:
            # Adjacent sub-word: extend the word in progress.
            buffer['word'] += word[2:]
            buffer['end'] = tok['end']
            buffer['score'] = min(buffer['score'], tok['score'])
            continue

        if buffer is not None:
            merged_entities.append(buffer)
        buffer = {
            'word': word[2:] if is_continuation else word,
            'entity': tok['entity'],
            'start': tok['start'],
            'end': tok['end'],
            'score': tok['score'],
        }

    if buffer is not None:
        merged_entities.append(buffer)
    return merged_entities

def india_enhanced_ner(text):
    """Run model NER plus the India-specific regexes and format the result.

    Regex matches are resolved longest-first: a match is dropped when it lies
    entirely inside a model entity or overlaps a regex match already
    accepted. This keeps a short generic hit (a bare year such as "5000")
    from being reported next to the longer specific one ("₹5000") it is
    part of, whatever order the patterns are listed in.

    Args:
        text: The raw input string; empty or whitespace-only input yields an
            empty result without invoking the model.

    Returns:
        One line per entity, ordered by position in the text, in the form
        ``"<word>  ->  <entity>  (confidence: <score>)"``.
    """
    if not text or not text.strip():
        return ""

    # Get NER predictions
    entities = aggregate_entities(ner_pipeline(text))

    # Collect every India-specific regex hit before deciding which to keep.
    candidates = [
        (match, entity_type)
        for entity_type, patterns in INDIA_ENTITY_PATTERNS.items()
        for pattern in patterns
        for match in re.finditer(pattern, text)
    ]
    # Longest match first; the sort is stable, so ties keep table order.
    candidates.sort(key=lambda c: c[0].end() - c[0].start(), reverse=True)

    regex_entities = []
    for match, entity_type in candidates:
        start, end = match.span()
        inside_model_entity = any(
            start >= e['start'] and end <= e['end'] for e in entities
        )
        overlaps_accepted = any(
            start < r['end'] and r['start'] < end for r in regex_entities
        )
        if inside_model_entity or overlaps_accepted:
            continue
        regex_entities.append({
            'word': match.group(),
            'entity': entity_type,
            'score': 0.99,  # fixed pseudo-confidence for rule-based hits
            'start': start,
            'end': end,
        })

    # Sort by position
    entities = sorted(entities + regex_entities, key=lambda x: x['start'])
    return "\n".join(
        f"{e['word']}  ->  {e['entity']}  (confidence: {e['score']:.2f})"
        for e in entities
    )

# Gradio interface
# One text box in, one text box out: the submitted text is passed to
# india_enhanced_ner() and the string it returns is shown as the result.
demo = gr.Interface(
    fn=india_enhanced_ner,
    inputs=gr.Textbox(lines=8, placeholder="Enter your text here..."),
    outputs=gr.Textbox(lines=15, label="NER Results"),
    title="India-Specific NER",
    description="Extract named entities"
)

# Start the web UI only when this file is executed directly.
if __name__ == "__main__":
    demo.launch()

Device set to use cuda:0


It looks like you are running Gradio on a hosted Jupyter notebook, which requires `share=True`. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://632593aff5a7a84001.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


In [None]:
PERSON, LOCATION, ORG, DATE, MONEY, AADHAAR, EMAIL, PAN, VEHICLE_NO, MOBILE.