In [1]:
import json
import random

# Paths: the full annotated dataset and the two split files derived from it.
input_path = "ner_dataset.jsonl"     # your existing full dataset
train_path = "train.jsonl"
val_path = "val.jsonl"

# Split configuration. A fixed seed makes Restart & Run All reproduce the
# exact same train/validation partition.
train_fraction = 0.85
split_seed = 42

# Load all entries. The context manager closes the file handle, and
# whitespace-only lines (e.g. a trailing newline) are skipped instead of
# crashing json.loads.
with open(input_path, "r", encoding="utf-8") as f:
    data = [json.loads(line) for line in f if line.strip()]

# Shuffle with a private RNG so the global `random` state is left untouched.
random.Random(split_seed).shuffle(data)

# 85% train, 15% validation
train_size = int(len(data) * train_fraction)

train_data = data[:train_size]
val_data = data[train_size:]

# Write each split as JSON Lines (one record per line).
for split_path, split_records in ((train_path, train_data), (val_path, val_data)):
    with open(split_path, "w", encoding="utf-8") as f:
        for item in split_records:
            f.write(json.dumps(item) + "\n")

print(f"Total records: {len(data)}")
print(f"Train: {len(train_data)} → saved to {train_path}")
print(f"Validation: {len(val_data)} → saved to {val_path}")


Total records: 100
Train: 85 → saved to train.jsonl
Validation: 15 → saved to val.jsonl


In [3]:
!pip install datasets

Collecting datasets
  Downloading datasets-4.5.0-py3-none-any.whl.metadata (19 kB)
Collecting pyarrow>=21.0.0 (from datasets)
  Downloading pyarrow-23.0.0-cp313-cp313-win_amd64.whl.metadata (3.1 kB)
Collecting dill<0.4.1,>=0.3.0 (from datasets)
  Downloading dill-0.4.0-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.6.0-cp313-cp313-win_amd64.whl.metadata (13 kB)
Collecting multiprocess<0.70.19 (from datasets)
  Downloading multiprocess-0.70.18-py313-none-any.whl.metadata (7.2 kB)
Collecting aiohttp!=4.0.0a0,!=4.0.0a1 (from fsspec[http]<=2025.10.0,>=2023.1.0->datasets)
  Downloading aiohttp-3.13.3-cp313-cp313-win_amd64.whl.metadata (8.4 kB)
Collecting aiohappyeyeballs>=2.5.0 (from aiohttp!=4.0.0a0,!=4.0.0a1->fsspec[http]<=2025.10.0,>=2023.1.0->datasets)
  Downloading aiohappyeyeballs-2.6.1-py3-none-any.whl.metadata (5.9 kB)
Collecting aiosignal>=1.4.0 (from aiohttp!=4.0.0a0,!=4.0.0a1->fsspec[http]<=2025.10.0,>=2023.1.0->datasets)
  Downloading 


[notice] A new release of pip is available: 25.1.1 -> 26.0
[notice] To update, run: python.exe -m pip install --upgrade pip


In [5]:
!pip install seqeval

Collecting seqeval
  Using cached seqeval-1.2.2.tar.gz (43 kB)
  Installing build dependencies: started
  Installing build dependencies: finished with status 'done'
  Getting requirements to build wheel: started
  Getting requirements to build wheel: finished with status 'done'
  Installing backend dependencies: started
  Installing backend dependencies: finished with status 'done'
  Preparing metadata (pyproject.toml): started
  Preparing metadata (pyproject.toml): finished with status 'done'
Building wheels for collected packages: seqeval
  Building wheel for seqeval (pyproject.toml): started
  Building wheel for seqeval (pyproject.toml): finished with status 'done'
  Created wheel for seqeval: filename=seqeval-1.2.2-py3-none-any.whl size=16282 sha256=3cd1d87d0a85f7478c08ccf52854b3e5bfc0ac0575924b7b3a8bfb649f62798c
  Stored in directory: c:\users\hp\appdata\local\pip\cache\wheels\14\cf\a7\8f28ef376d707ff10e3922899482a2f23ef3002f8a952f47ac
Successfully built seqeval
Installing collect


[notice] A new release of pip is available: 25.1.1 -> 26.0
[notice] To update, run: python.exe -m pip install --upgrade pip


In [7]:
import json
from datasets import Dataset
from transformers import (
    DistilBertTokenizerFast,
    DataCollatorForTokenClassification,
    DistilBertForTokenClassification,
    TrainingArguments,
    Trainer
)
import numpy as np
from seqeval.metrics import classification_report


def load_jsonl(path):
    """Read a JSON Lines file into a list of parsed records.

    Args:
        path: Location of a UTF-8 encoded file holding one JSON document
            per line.

    Returns:
        A list with one parsed object per non-blank line, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    with open(path, "r", encoding="utf-8") as f:
        # Whitespace-only lines (such as a trailing blank line) carry no
        # record; skipping them avoids a JSONDecodeError on otherwise
        # valid files.
        return [json.loads(line) for line in f if line.strip()]

# Read both split files back from disk (train first, then validation) ...
train_data_raw, val_data_raw = (
    load_jsonl(split_file) for split_file in ("train.jsonl", "val.jsonl")
)

# ... and wrap each list of records in a Hugging Face Dataset.
train_dataset, val_dataset = (
    Dataset.from_list(records) for records in (train_data_raw, val_data_raw)
)


# BIO tag inventory: "O" first, then a B-/I- pair for every entity type.
# The order matters because a tag's position in this list is its class id.
labels = ["O"] + [
    f"{prefix}-{entity_type}"
    for entity_type in (
        "DURATION",
        "WEATHER",
        "LOCATION",
        "DELAY_REASON",
        "PENALTY",
        "AMOUNT",
        "LIABILITY",
        "CONDITION",
        "PARTY",
        "JURISDICTION",
        "DAMAGE_TYPE",
        "EVENT",
        "SLA",
    )
    for prefix in ("B", "I")
]

# Lookup tables in both directions between tag strings and class ids.
id2label = dict(enumerate(labels))
label2id = {tag: class_id for class_id, tag in id2label.items()}


# A *fast* tokenizer is used because encode_examples asks for
# return_offsets_mapping=True when tokenizing.
tokenizer = DistilBertTokenizerFast.from_pretrained(
    pretrained_model_name_or_path="distilbert-base-uncased",
)



def encode_examples(examples):
    """Tokenize a batch of texts and project span annotations onto tokens.

    Intended for ``Dataset.map(..., batched=True)``. Uses the module-level
    ``tokenizer`` and ``label2id``.

    Args:
        examples: Batch dict with two parallel lists:
            ``"text"`` - the raw strings, and
            ``"entities"`` - for each text, a list of dicts with ``"start"``
            and ``"end"`` (compared against the tokenizer's offset mapping)
            and ``"label"`` (entity type without the B-/I- prefix).

    Returns:
        The tokenizer output with an added ``"labels"`` entry: one list of
        class ids per text, aligned with its tokens. ``"offset_mapping"`` is
        left in place so the caller can drop the column afterwards.

    Raises:
        KeyError: If an entity label has no ``B-``/``I-`` entry in
            ``label2id``.
    """
    # Label id that the token-classification loss and compute_metrics ignore.
    ignore_index = -100

    tokenized = tokenizer(
        examples["text"],
        truncation=True,
        padding=False,
        return_offsets_mapping=True
    )

    labels_out = []

    for i, offsets in enumerate(tokenized["offset_mapping"]):
        # A missing/None annotation list is treated as "no entities".
        entities = examples["entities"][i] or []

        # Zero-width offsets belong to special tokens ([CLS]/[SEP]); they get
        # the ignore id so they neither contribute to the loss nor count as
        # "O" predictions during evaluation. Everything else starts as "O".
        label_ids = [
            ignore_index if tok_start == tok_end else label2id["O"]
            for tok_start, tok_end in offsets
        ]

        for ent in entities:
            start = ent["start"]
            end = ent["end"]
            ent_label = ent["label"]

            # The first token overlapping the span opens the entity with B-,
            # even when the annotated start does not coincide with a token
            # boundary (e.g. it points at leading whitespace or mid-word).
            seen_first_token = False

            for idx, (tok_start, tok_end) in enumerate(offsets):
                if tok_start == tok_end:
                    continue  # special token, keep ignore_index
                if tok_start >= end or tok_end <= start:
                    continue  # no overlap with this entity span
                prefix = "I-" if seen_first_token else "B-"
                label_ids[idx] = label2id[prefix + ent_label]
                seen_first_token = True

        labels_out.append(label_ids)

    tokenized["labels"] = labels_out

    return tokenized


# Tokenize and label each split in batches, then drop the offset column
# that encode_examples leaves in its output.
train_tokenized = (
    train_dataset
    .map(encode_examples, batched=True)
    .remove_columns(["offset_mapping"])
)
val_tokenized = (
    val_dataset
    .map(encode_examples, batched=True)
    .remove_columns(["offset_mapping"])
)



# Collator for token classification, built from the same tokenizer that
# encoded the data (examples were tokenized with padding=False).
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

# DistilBERT with a token-classification head sized to the tag inventory;
# the label maps are stored in the model config.
model = DistilBertForTokenClassification.from_pretrained(
    "distilbert-base-uncased",
    label2id=label2id,
    id2label=id2label,
    num_labels=len(labels),
)



# Training configuration: evaluate and checkpoint once per epoch, log every
# 10 steps, and force CPU execution.
training_args = TrainingArguments(
    output_dir="./ner_model",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=10,
    weight_decay=0.01,
    logging_steps=10,
    # `no_cuda` is deprecated; `use_cpu` is its replacement.
    use_cpu=True,
)



def compute_metrics(p):
    """Compute the micro-averaged F1 for one evaluation pass.

    Args:
        p: A pair ``(predictions, labels)``; ``predictions`` holds per-token
            class scores (arg-maxed over the last axis) and ``labels`` the
            reference class ids, with -100 marking positions to ignore.

    Returns:
        ``{"f1": <micro avg f1-score from classification_report>}``.
    """
    logits, gold_ids = p
    pred_ids = np.argmax(logits, axis=-1)

    true_predictions = []
    true_labels = []
    for pred_row, gold_row in zip(pred_ids, gold_ids):
        # Keep only positions whose reference id is not the ignore marker.
        kept = [
            (pred_id, gold_id)
            for pred_id, gold_id in zip(pred_row, gold_row)
            if gold_id != -100
        ]
        true_predictions.append([id2label[pred_id] for pred_id, _ in kept])
        true_labels.append([id2label[gold_id] for _, gold_id in kept])

    report = classification_report(true_labels, true_predictions, output_dict=True)
    micro_f1 = report["micro avg"]["f1-score"]
    return {"f1": micro_f1}


# Wire the model, both tokenized splits, the collator and the metric
# function into a Trainer.
trainer = Trainer(
    args=training_args,
    model=model,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
    eval_dataset=val_tokenized,
    train_dataset=train_tokenized,
)

# Run fine-tuning with the configuration held in training_args.
trainer.train()

# Save the model and the tokenizer side by side in the same directory.
trainer.save_model(output_dir="./ner_model")
tokenizer.save_pretrained(save_directory="./ner_model")

print("Training complete! Model saved in ./ner_model")


Map: 100%|██████████| 85/85 [00:00<00:00, 1506.91 examples/s]
Map: 100%|██████████| 15/15 [00:00<00:00, 1177.12 examples/s]
Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


ImportError: Using the `Trainer` with `PyTorch` requires `accelerate>=0.26.0`: Please run `pip install transformers[torch]` or `pip install 'accelerate>=0.26.0'`