In [5]:
import warnings
import logging
import sys
import numpy as np
from pathlib import Path

import evaluate
from sklearn.model_selection import train_test_split
from datasets import Dataset, DatasetDict
from transformers import (
    DataCollatorForTokenClassification,
    TrainingArguments, 
    Trainer, 
    AutoTokenizer, 
    AutoModelForTokenClassification, 
    pipeline)

# Suppress all Python warnings so they do not clutter the notebook output.
warnings.filterwarnings('ignore')

# Make the project's `src` directory importable. The membership check keeps
# this cell idempotent: re-running it no longer appends a duplicate entry.
_src_dir = str(Path('..').resolve() / 'src')
if _src_dir not in sys.path:
    sys.path.append(_src_dir)

from models import EncoderNERLoader

# Show INFO-level messages from the Hugging Face libraries and the HTTP client
# so downloads and checkpoint writes are visible in the cell output.
logging.getLogger("transformers").setLevel(logging.INFO)
logging.getLogger("httpx").setLevel(logging.INFO)
logging.getLogger("huggingface_hub").setLevel(logging.INFO)

## 1. Load Model & Tokenizer

In [None]:
# JSON configuration handed to the project's NER loader.
CONFIG_PATH = Path("../config/indobert_config.json")

# Build the loader once, then ask it for the tokenizer and the model.
loader = EncoderNERLoader(CONFIG_PATH)
tokenizer = loader.load_tokenizer()
model = loader.load_model()

# Keep both directions of the label mapping stored on the model config;
# later cells use them to encode tags and to decode predictions.
id2label = model.config.id2label
label2id = model.config.label2id
print("Label schema", id2label)

INFO:models.model_loader:Loading config from: ..\config\indobert_config.json
INFO:models.model_loader:Loading tokenizer: indolem/indobert-base-uncased
INFO:httpx:HTTP Request: HEAD https://huggingface.co/indolem/indobert-base-uncased/resolve/main/config.json "HTTP/1.1 307 Temporary Redirect"
INFO:httpx:HTTP Request: HEAD https://huggingface.co/api/resolve-cache/models/indolem/indobert-base-uncased/7ccb3cd0f5b08ffbaa465aade22328e8600e23eb/config.json "HTTP/1.1 200 OK"
loading configuration file config.json from cache at C:\Users\James\.cache\huggingface\hub\models--indolem--indobert-base-uncased\snapshots\7ccb3cd0f5b08ffbaa465aade22328e8600e23eb\config.json
Model config BertConfig {
  "add_cross_attention": false,
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": null,
  "eos_token_ids": 0,
  "finetuning_task": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_si

Label schema {0: 'O', 1: 'B-BOOK', 2: 'I-BOOK', 3: 'B-CHAPTER', 4: 'I-CHAPTER'}


INFO:httpx:HTTP Request: GET https://huggingface.co/api/models/indolem/indobert-base-uncased/commits/refs%2Fpr%2F2 "HTTP/1.1 200 OK"
Attempting to convert .bin model on the fly to safetensors.
INFO:httpx:HTTP Request: POST https://safetensors-convert.hf.space/call/run "HTTP/1.1 206 Partial Content"
Exception in thread Thread-auto_conversion:
Traceback (most recent call last):
  File "C:\Python312\Lib\threading.py", line 1075, in _bootstrap_inner
    self.run()
  File "C:\Python312\Lib\threading.py", line 1012, in run
    self._target(*self._args, **self._kwargs)
  File "c:\one one\Desktop\bible_reading_recap_nlp\venv\Lib\site-packages\transformers\safetensors_conversion.py", line 117, in auto_conversion
    raise e
  File "c:\one one\Desktop\bible_reading_recap_nlp\venv\Lib\site-packages\transformers\safetensors_conversion.py", line 96, in auto_conversion
    sha = get_conversion_pr_reference(api, pretrained_model_name_or_path, **cached_file_kwargs)
          ^^^^^^^^^^^^^^^^^^^^^^^^^^

In [7]:
def read_conll(filepath: Path) -> tuple[list[list[str]], list[list[str]]]:
    """Parse a CoNLL-format file into parallel ``(sentences, labels)`` lists.

    Each non-blank line is split on whitespace: the first column is the token
    and the last column is its NER tag. Blank (or whitespace-only) lines end
    the current sentence, and ``-DOCSTART-`` header lines are skipped.

    Args:
        filepath: Path of the CoNLL file. It is read as UTF-8; a leading
            byte-order mark, if present, is discarded.

    Returns:
        A ``(sentences, labels)`` tuple where ``sentences[i]`` is the token
        list of the i-th sentence and ``labels[i]`` the matching tag list.

    Raises:
        ValueError: If a token line has fewer than two columns. Without this
            check the token itself would be stored as its own tag and only
            fail much later, when the tag is looked up in the label mapping.
    """
    sentences, labels = [], []
    tokens, ner_tags = [], []

    # "utf-8-sig" reads plain UTF-8 unchanged but strips a BOM, which would
    # otherwise be glued to the first token of the file.
    with open(filepath, "r", encoding="utf-8-sig") as f:
        for line_no, raw_line in enumerate(f, start=1):
            parts = raw_line.split()

            if not parts:
                # Sentence boundary: flush whatever has been collected.
                if tokens:
                    sentences.append(tokens)
                    labels.append(ner_tags)
                    tokens, ner_tags = [], []
                continue

            # Skip -DOCSTART- header lines that CoNLL files often contain.
            if parts[0] == "-DOCSTART-":
                continue

            if len(parts) < 2:
                raise ValueError(
                    f"{filepath}:{line_no}: expected at least 2 columns "
                    f"(token and tag), got {raw_line.strip()!r}"
                )

            tokens.append(parts[0])
            ner_tags.append(parts[-1])   # last column is the NER tag

    # Flush the last sentence if the file doesn't end with a blank line.
    if tokens:
        sentences.append(tokens)
        labels.append(ner_tags)

    return sentences, labels

In [8]:
DATA_PATH = Path("../data/processed/NER_tasks/ner_tasks.conll")

sentences, labels = read_conll(DATA_PATH)

# Fail here with a clear message rather than with an IndexError below or an
# opaque error during the train/eval split.
assert sentences, f"No sentences were parsed from {DATA_PATH}"

# Sample shown as a sanity check. The preferred index is clamped to the last
# available sample so a smaller dataset cannot raise IndexError.
EXAMPLE_IDX = min(144, len(sentences) - 1)

print(f"Total samples : {len(sentences)}")
print(f"Example tokens: {sentences[EXAMPLE_IDX]}")
print(f"Example labels: {labels[EXAMPLE_IDX]}")

Total samples : 200
Example tokens: ['Ibr', '3', '!', '-', '4', 'done']
Example labels: ['B-BOOK', 'B-CHAPTER', 'O', 'O', 'B-CHAPTER', 'O']


In [9]:
# Hold out 20% of the sentences for evaluation. Shuffling uses a fixed seed
# so the same split is produced on every run.
train_sent, eval_sent, train_labels, eval_labels = train_test_split(
    sentences, labels, test_size=0.2, shuffle=True, random_state=42
)

print(f"Train: {len(train_sent)} | Eval: {len(eval_sent)}")

# Wrap both partitions in a DatasetDict with "tokens" / "ner_tags" columns.
raw_dataset = DatasetDict(
    {
        split_name: Dataset.from_dict({"tokens": split_tokens, "ner_tags": split_tags})
        for split_name, split_tokens, split_tags in (
            ("train", train_sent, train_labels),
            ("eval", eval_sent, eval_labels),
        )
    }
)
raw_dataset

Train: 160 | Eval: 40


DatasetDict({
    train: Dataset({
        features: ['tokens', 'ner_tags'],
        num_rows: 160
    })
    eval: Dataset({
        features: ['tokens', 'ner_tags'],
        num_rows: 40
    })
})

In [10]:
def tokenize_and_align_labels(example):
    """
    Tokenise one pre-split sentence and align its NER tags with the subwords.

    Only the first subword of each word receives the word's label id (looked
    up in the module-level ``label2id``). Special tokens and continuation
    subwords get -100 so they are ignored by the loss.

    Expects ``example`` to provide "tokens" and "ner_tags"; returns the
    tokenizer output with an added "labels" entry.
    """
    encoding = tokenizer(
        example["tokens"],
        is_split_into_words=True,
        truncation=True,
    )

    aligned = []
    last_word = None
    for current_word in encoding.word_ids():
        # A position carries a real label only when it belongs to a word
        # (not [CLS]/[SEP]) and that word differs from the previous position's.
        opens_word = current_word is not None and current_word != last_word
        if opens_word:
            aligned.append(label2id[example["ner_tags"][current_word]])
        else:
            aligned.append(-100)
        last_word = current_word

    encoding["labels"] = aligned
    return encoding

In [11]:
# Tokenise example by example, then drop the raw text columns so only the
# fields produced by the tokenizer (plus "labels") remain.
tokenized_dataset = (
    raw_dataset
    .map(tokenize_and_align_labels, batched=False)
    .remove_columns(["tokens", "ner_tags"])
)

# Return torch tensors when rows are accessed.
tokenized_dataset.set_format("torch")
tokenized_dataset

Map: 100%|██████████| 160/160 [00:00<00:00, 1667.37 examples/s]
Map: 100%|██████████| 40/40 [00:00<00:00, 1459.14 examples/s]


DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 160
    })
    eval: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 40
    })
})

In [12]:
# Sequence-labelling metric loaded through the `evaluate` library.
seqeval = evaluate.load("seqeval")


def compute_metrics(eval_preds):
    """
    Turn raw evaluation output into seqeval precision / recall / F1 / accuracy.

    ``eval_preds`` unpacks into ``(logits, labels)``. The predicted class is the
    argmax over the last axis of the logits. Positions whose gold label is -100
    (special / continuation tokens) are dropped before ids are mapped back to
    tag strings through the module-level ``id2label``.
    """
    logits, gold = eval_preds
    predicted = np.argmax(logits, axis=-1)

    true_labels = []
    true_predictions = []
    for pred_row, gold_row in zip(predicted, gold):
        # Keep only the positions that carry a real label.
        kept = [(p, g) for p, g in zip(pred_row, gold_row) if g != -100]
        true_labels.append([id2label[g] for _, g in kept])
        true_predictions.append([id2label[p] for p, _ in kept])

    scores = seqeval.compute(predictions=true_predictions, references=true_labels)

    summary = {}
    summary["precision"] = scores["overall_precision"]
    summary["recall"] = scores["overall_recall"]
    summary["f1"] = scores["overall_f1"]
    summary["accuracy"] = scores["overall_accuracy"]
    return summary

Downloading builder script: 6.34kB [00:00, 5.03MB/s]


In [13]:
train_cfg = loader.config.get("training", {})

# Schedule chosen for this small dataset.
NUM_EPOCHS = 15
TRAIN_BATCH_SIZE = 8

# `warmup_ratio` is deprecated (the library warns it will be removed in v5.2),
# so the intended 10% warmup is expressed as an explicit integer step count,
# which `warmup_steps` accepts regardless of library version.
# NOTE: assumes a single device and no gradient accumulation; otherwise the
# number of optimizer steps per epoch is smaller than computed here.
steps_per_epoch = -(-len(tokenized_dataset["train"]) // TRAIN_BATCH_SIZE)  # ceil division
warmup_steps = (steps_per_epoch * NUM_EPOCHS) // 10

training_args = TrainingArguments(
    output_dir="./indobert-bible-ner",

    # pulled from config (with fallbacks when the key is absent)
    learning_rate=train_cfg.get("learning_rate", 5e-5),
    weight_decay=train_cfg.get("weight_decay", 0.01),

    # adjusted for small data
    num_train_epochs=NUM_EPOCHS,
    per_device_train_batch_size=TRAIN_BATCH_SIZE,
    per_device_eval_batch_size=train_cfg.get("per_device_eval_batch_size", 16),

    # linear warmup over the first 10% of optimizer steps
    warmup_steps=warmup_steps,

    # evaluate and checkpoint once per epoch, then restore the best-F1 checkpoint
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True,

    logging_steps=10,
    seed=42,
)

PyTorch: setting up devices
warmup_ratio is deprecated and will be removed in v5.2. Use `warmup_steps` instead.


In [14]:
# Batch collator for token classification, built from the tokenizer.
data_collator = DataCollatorForTokenClassification(tokenizer)

# Wire the model, the arguments, both dataset splits and the metric function
# into a single Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["eval"],
)

# Left as the last expression so the training summary is displayed.
trainer.train()

The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'bos_token_id': None}.
***** Running training *****
  Num examples = 160
  Num Epochs = 15
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 300
  Number of trainable parameters = 109,971,461


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.861918,0.397911,0.825,0.87766,0.850515,0.916168
2,0.071188,0.041703,0.984127,0.989362,0.986737,0.994012
3,0.014962,0.029642,0.968421,0.978723,0.973545,0.98503
4,0.012948,0.010537,0.989418,0.994681,0.992042,0.997006
5,0.002638,0.006519,0.989418,0.994681,0.992042,0.997006
6,0.00334,0.008486,0.989418,0.994681,0.992042,0.997006
7,0.000784,0.00979,0.989418,0.994681,0.992042,0.997006
8,0.000296,0.00972,0.989418,0.994681,0.992042,0.997006
9,0.000228,0.011972,0.989418,0.994681,0.992042,0.997006
10,0.000348,0.014073,0.989418,0.994681,0.992042,0.997006



***** Running Evaluation *****
  Num examples = 40
  Batch size = 16
Saving model checkpoint to ./indobert-bible-ner\checkpoint-20
Configuration saved in ./indobert-bible-ner\checkpoint-20\config.json
Writing model shards: 100%|██████████| 1/1 [00:00<00:00,  1.09it/s]
Model weights saved in ./indobert-bible-ner\checkpoint-20\model.safetensors
tokenizer config file saved in ./indobert-bible-ner\checkpoint-20\tokenizer_config.json

***** Running Evaluation *****
  Num examples = 40
  Batch size = 16
Saving model checkpoint to ./indobert-bible-ner\checkpoint-40
Configuration saved in ./indobert-bible-ner\checkpoint-40\config.json
Writing model shards: 100%|██████████| 1/1 [00:00<00:00,  1.62it/s]
Model weights saved in ./indobert-bible-ner\checkpoint-40\model.safetensors
tokenizer config file saved in ./indobert-bible-ner\checkpoint-40\tokenizer_config.json

***** Running Evaluation *****
  Num examples = 40
  Batch size = 16
Saving model checkpoint to ./indobert-bible-ner\checkpoint-60


TrainOutput(global_step=300, training_loss=0.09477242743208383, metrics={'train_runtime': 281.0769, 'train_samples_per_second': 8.539, 'train_steps_per_second': 1.067, 'total_flos': 26893881552720.0, 'train_loss': 0.09477242743208383, 'epoch': 15.0})

In [15]:
# Run one evaluation pass with the trainer and print the headline metrics.
# The values are read from the returned dict under `eval_`-prefixed keys and
# formatted to four decimal places.
metrics = trainer.evaluate()
print(f"Eval F1:        {metrics['eval_f1']:.4f}")
print(f"Eval Precision: {metrics['eval_precision']:.4f}")
print(f"Eval Recall:    {metrics['eval_recall']:.4f}")


***** Running Evaluation *****
  Num examples = 40
  Batch size = 16


Eval F1:        0.9920
Eval Precision: 0.9894
Eval Recall:    0.9947


In [16]:
# Destination directory for the final model artefacts; the inference cell
# reloads the tokenizer and model from this same path.
SAVE_PATH = Path("../models/indobert-bible-ner-final")

# Save the trainer's model, then the tokenizer, into the same directory.
# NOTE(review): the recorded log shows the tokenizer config being written
# twice, so trainer.save_model() presumably already saves the tokenizer that
# was passed to the Trainer — confirm before dropping the explicit call below.
trainer.save_model(SAVE_PATH)
tokenizer.save_pretrained(SAVE_PATH)
print(f"Model saved to: {SAVE_PATH}")

Saving model checkpoint to ..\models\indobert-bible-ner-final
Configuration saved in ..\models\indobert-bible-ner-final\config.json
Writing model shards: 100%|██████████| 1/1 [00:00<00:00,  1.64it/s]
Model weights saved in ..\models\indobert-bible-ner-final\model.safetensors
tokenizer config file saved in ..\models\indobert-bible-ner-final\tokenizer_config.json
tokenizer config file saved in ..\models\indobert-bible-ner-final\tokenizer_config.json


Model saved to: ..\models\indobert-bible-ner-final


In [20]:
# Reload the saved artefacts from disk so the demo exercises exactly what was
# written to SAVE_PATH rather than the in-memory training objects.
tokenizer = AutoTokenizer.from_pretrained(SAVE_PATH)
model = AutoModelForTokenClassification.from_pretrained(SAVE_PATH)

# "simple" aggregation merges token-level predictions into entity groups.
ner_pipeline = pipeline(
    task="ner",
    model=model,
    tokenizer=tokenizer,
    aggregation_strategy="simple",
)

test_sentences = [
    "Ul 14 - 15 done Anin,Ul 14 - 15 done",
    "Ul 18 - 19 done Anin,Ul 18 - 19 done",
    "Bil 36 - Ul 1 done",
    "Ul 36 sampai 38 done",
    "17. Jason Kej 1-3 done;\n Kej 4-6 done;\n Kej 7-9 done"
]

for sentence in test_sentences:
    results = ner_pipeline(sentence)
    print(f"\nInput : {sentence}")
    for r in results:
        # `word` is the decoded token text, which the uncased tokenizer has
        # lower-cased ("ul" for "Ul"). Show the exact span of the input
        # instead, falling back to `word` when no character offsets are given.
        start, end = r.get("start"), r.get("end")
        has_offsets = start is not None and end is not None
        surface = sentence[start:end] if has_offsets else r["word"]
        print(f"  {r['entity_group']:12s} | {surface:20s} | score: {r['score']:.4f}")

loading configuration file ..\models\indobert-bible-ner-final\config.json
Model config BertConfig {
  "add_cross_attention": false,
  "architectures": [
    "BertForTokenClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": null,
  "classifier_dropout": null,
  "dtype": "float32",
  "eos_token_id": null,
  "eos_token_ids": 0,
  "finetuning_task": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "O",
    "1": "B-BOOK",
    "2": "I-BOOK",
    "3": "B-CHAPTER",
    "4": "I-CHAPTER"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "is_decoder": false,
  "label2id": {
    "B-BOOK": 1,
    "B-CHAPTER": 3,
    "I-BOOK": 2,
    "I-CHAPTER": 4,
    "O": 0
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_past": true,
  "pad_token_id": 0,
  "pruned_heads": {},
  "tie_word_embeddings": true,
  "t

loading configuration file ..\models\indobert-bible-ner-final\config.json
Model config BertConfig {
  "add_cross_attention": false,
  "architectures": [
    "BertForTokenClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": null,
  "classifier_dropout": null,
  "dtype": "float32",
  "eos_token_id": null,
  "eos_token_ids": 0,
  "finetuning_task": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "O",
    "1": "B-BOOK",
    "2": "I-BOOK",
    "3": "B-CHAPTER",
    "4": "I-CHAPTER"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "is_decoder": false,
  "label2id": {
    "B-BOOK": 1,
    "B-CHAPTER": 3,
    "I-BOOK": 2,
    "I-CHAPTER": 4,
    "O": 0
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_past": true,
  "pad_token_id": 0,
  "pruned_heads": {},
  "tie_word_embeddings": true,
  "t


Input : Ul 14 - 15 done Anin,Ul 14 - 15 done
  BOOK         | ul                   | score: 0.9995
  CHAPTER      | 14                   | score: 0.9996
  CHAPTER      | 15                   | score: 0.9997
  BOOK         | ul                   | score: 0.9996
  CHAPTER      | 14                   | score: 0.9996
  CHAPTER      | 15                   | score: 0.9998

Input : Ul 18 - 19 done Anin,Ul 18 - 19 done
  BOOK         | ul                   | score: 0.9996
  CHAPTER      | 18                   | score: 0.9996
  CHAPTER      | 19                   | score: 0.9998
  BOOK         | ul                   | score: 0.9996
  CHAPTER      | 18                   | score: 0.9996
  CHAPTER      | 19                   | score: 0.9998

Input : Bil 36 - Ul 1 done
  BOOK         | bil                  | score: 0.9991
  CHAPTER      | 36                   | score: 0.9988
  BOOK         | ul                   | score: 0.9891
  CHAPTER      | 1                    | score: 0.9955

Input : Ul 36 s