In [1]:
!pip install -q pandas matplotlib datasets tqdm seqeval scikit-learn

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m193.6/193.6 kB[0m [31m10.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for seqeval (setup.py) ... [?25l[?25hdone
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
cesium 0.12.4 requires numpy<3.0,>=2.0, but you have numpy 1.26.4 which is incompatible.
bigframes 1.42.0 requires rich<14,>=12.4.4, but you have rich 14.0.0 which is incompatible.
torch 2.6.0+cu124 requires nvidia-cublas-cu12==12.4.5.8; platform_system == "Linux" and platform_machine == "x86_64", but you have nvidia-cublas-cu12 12.9.0.13 which is incompatible.
torch 2.6.0+cu124 requires nvidia-cudnn-cu12==9.1.0.70; platform_system == "Linux" and pl

# Preparations

## Import Libraries


In [2]:
import os
import numpy as np
import pandas as pd
import torch
from torch.utils.data import DataLoader 
from transformers import (
    AutoTokenizer, 
    AutoModelForTokenClassification,
    get_linear_schedule_with_warmup,
    AutoConfig,
)
from sklearn.metrics import accuracy_score, f1_score
from seqeval.metrics import classification_report, precision_score, recall_score, f1_score
from datasets import load_dataset, DatasetDict, Dataset
from torch.optim import AdamW
from tqdm import tqdm
from itertools import product
import json


## Set variables and Seed


In [3]:
seed = 42

np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)

In [4]:
NER_TAG = ['B-LOC', 'I-LOC', 'B-ORG', 'I-ORG', 'B-PER', 'I-PER', 'O']
POS_TAG = ['PROPN', 'AUX', 'NUM', 'NOUN', 'ADP', 'PRON', 'VERB', 'ADV', 
           'ADJ', 'PUNCT', 'DET', 'PART', 'SCONJ', 'CCONJ', 'SYM', 'X']

In [5]:
label2id = {label: id for id, label in enumerate(NER_TAG)}
id2label = {id: label for label, id in label2id.items()}


In [6]:
def read_data(file_path):
    with open(file_path, encoding="utf-8") as f:
        lines = f.read().split("\n")
    
    data = []
    sentence = []
    id = 1
    for line in lines:
        if not line.strip():
            if sentence:
                data.append(sentence)
                sentence = []
        else:
            word, pos, tag = line.split()
            sentence.append((word, pos, tag))
            
    if sentence:
        data.append(sentence)

    # Create a dict for dataset
    raw_data_dict = {}
    
    for idx in range(len(data)):
    # for idx in range(20):
        raw_data_dict[idx] = {}
    
        words = []
        ner_tags = []
        pos_tags = []
        
        for i in range(len(data[idx])):
            words.append(data[idx][i][0])
            pos_tags.append(data[idx][i][1])
            ner_tags.append(data[idx][i][2])

        
        raw_data_dict[idx]['words'] = words
        raw_data_dict[idx]['ner_tags'] = ner_tags
        raw_data_dict[idx]['pos_tags'] = pos_tags
        
    # Convert raw_data to a list of dictionaries
    data_list = []
    for idx, data in raw_data_dict.items():
        data_list.append({
            'id': idx,
            'words': data['words'],
            'ner_tags': data['ner_tags'],
            'pos_tags': data['pos_tags'],
            'chunk_tags': []  # Placeholder, as your data doesn't have chunk_tags
        })
    return data_list

## Read Dataset


In [7]:
train_data = read_data('/kaggle/input/idner-news-2k/train.txt')
test_data = read_data('/kaggle/input/idner-news-2k/test.txt')
dev_data = read_data('/kaggle/input/idner-news-2k/valid.txt')
# train_data[:2]


In [8]:
train_dataset = Dataset.from_dict({k: [d[k] for d in train_data] for k in train_data[0]})
test_dataset = Dataset.from_dict({k: [d[k] for d in test_data] for k in test_data[0]})
dev_dataset = Dataset.from_dict({k: [d[k] for d in dev_data] for k in dev_data[0]})

# Create a DatasetDict
raw_data = DatasetDict({"train": train_dataset, "test": test_dataset, "dev": dev_dataset})
raw_data


DatasetDict({
    train: Dataset({
        features: ['id', 'words', 'ner_tags', 'pos_tags', 'chunk_tags'],
        num_rows: 1464
    })
    test: Dataset({
        features: ['id', 'words', 'ner_tags', 'pos_tags', 'chunk_tags'],
        num_rows: 509
    })
    dev: Dataset({
        features: ['id', 'words', 'ner_tags', 'pos_tags', 'chunk_tags'],
        num_rows: 367
    })
})

# Function Definitions

## Config


In [9]:
# Constants and configuration
def get_config():
    return {
        "max_length": 128,
        "batch_size": 16,
        "learning_rate": 2e-5,
        "epochs": 5,
        "warmup_steps": 500,
        "models": ["indobenchmark/indobert-base-p1", "xlm-roberta-base", "cahya/xlm-roberta-base-indonesian-NER"],
        "device": torch.device("cuda" if torch.cuda.is_available() else "cpu")
    }

## Tokenize


In [10]:
# Convert dataset to features
def tokenize_and_align_labels(tokenizer, examples, max_length):
    tokenized_inputs = []
    
    for example in examples:
        words = example["words"]
        labels = [label2id[label] for label in example["ner_tags"]]
        
        # Tokenize word by word to ensure alignment
        word_ids = []
        token_ids = []
        attention_mask = []
        label_ids = []
        
        for word_idx, (word, label) in enumerate(zip(words, labels)):
            # Tokenize the word and count # of tokens
            word_tokens = tokenizer.tokenize(word)
            if not word_tokens:
                word_tokens = [tokenizer.unk_token]
                
            # Add tokenized word to output
            token_ids.extend(tokenizer.convert_tokens_to_ids(word_tokens))
            attention_mask.extend([1] * len(word_tokens))
            
            # Add the word_id for each token
            word_ids.extend([word_idx] * len(word_tokens))
            
        # Handle truncation
        if len(token_ids) > max_length - 2:  # Account for [CLS] and [SEP]
            token_ids = token_ids[:max_length - 2]
            attention_mask = attention_mask[:max_length - 2]
            word_ids = word_ids[:max_length - 2]
        
        # Add special tokens
        token_ids = [tokenizer.cls_token_id] + token_ids + [tokenizer.sep_token_id]
        attention_mask = [1] + attention_mask + [1]
        word_ids = [None] + word_ids + [None]  # None for special tokens
        
        # Create aligned labels
        label_ids = [-100] * len(token_ids)  # Initialize with -100 (ignored in loss)
        
        for idx, word_id in enumerate(word_ids):
            if word_id is not None:
                label_ids[idx] = labels[word_id]
        
        # Pad to max_length
        padding_length = max_length - len(token_ids)
        if padding_length > 0:
            token_ids = token_ids + ([tokenizer.pad_token_id] * padding_length)
            attention_mask = attention_mask + ([0] * padding_length)
            label_ids = label_ids + ([-100] * padding_length)
        
        tokenized_inputs.append({
            "input_ids": token_ids,
            "attention_mask": attention_mask,
            "labels": label_ids,
            "word_ids": word_ids
        })
    
    return tokenized_inputs


## Prepare dataset


In [11]:
def prepare_datasets(dataset, tokenizer, config):
    tokenized_datasets = {}
    
    for split in dataset.keys():
        print(f"Preparing {split} dataset...")
        tokenized_datasets[split] = tokenize_and_align_labels(
            tokenizer, 
            dataset[split], 
            config["max_length"]
        )
    
    return tokenized_datasets


## Dataloaders


In [12]:
def create_dataloaders(tokenized_datasets, config):
    dataloaders = {}
    
    for split, dataset in tokenized_datasets.items():
        # Convert to PyTorch tensors
        tensor_dataset = []
        for item in dataset:
            tensor_dataset.append({
                "input_ids": torch.tensor(item["input_ids"]),
                "attention_mask": torch.tensor(item["attention_mask"]),
                "labels": torch.tensor(item["labels"])
            })
        
        # Create DataLoader
        shuffle = (split == "train")
        dataloaders[split] = DataLoader(
            tensor_dataset,
            batch_size=config["batch_size"],
            shuffle=shuffle
        )
    
    return dataloaders


## Train functions


In [13]:
# Training function
def train_model(model, dataloaders, optimizer, scheduler, config, model_name):
    device = config["device"]
    model.to(device)
    
    print(f"Training on {device}")
    best_f1 = 0
    
    for epoch in range(config["epochs"]):
        print(f"\nEpoch {epoch+1}/{config['epochs']}")
        
        # Training
        model.train()
        train_loss = 0
        progress_bar = tqdm(dataloaders["train"], desc=f"Training")
        
        for batch in progress_bar:
            # Move batch to device
            batch = {k: v.to(device) for k, v in batch.items()}
            
            # Zero gradients
            optimizer.zero_grad()
            
            # Forward pass
            outputs = model(**batch)
            loss = outputs.loss
            
            # Backward pass
            loss.backward()
            
            # Update
            optimizer.step()
            scheduler.step()
            
            # Track loss
            train_loss += loss.item()
            progress_bar.set_postfix({"loss": loss.item()})
        
        avg_train_loss = train_loss / len(dataloaders["train"])
        print(f"Average training loss: {avg_train_loss:.4f}")
        
        # Validation
        val_metrics = evaluate_model(model, dataloaders["dev"], config)
        
        # Save best model
        if val_metrics["f1"] > best_f1:
            best_f1 = val_metrics["f1"]
            print(f"New best F1: {best_f1:.4f} - Saving model")
            if not os.path.exists("models"):
                os.makedirs("models")
            torch.save(model.state_dict(), f"models/best_model_{model_name.replace('/', '_')}.pt")
        
        print(f"Validation Metrics: Precision={val_metrics['precision']:.4f}, Recall={val_metrics['recall']:.4f}, F1={val_metrics['f1']:.4f}, Exact Match= {val_metrics['exact_match']:.4f}")



## Evaluate functions


In [14]:
# Evaluation function
def evaluate_model(model, dataloader, config):
    device = config["device"]
    model.eval()
    
    all_predictions = []
    all_labels = []
    
    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Evaluating"):
            # Move batch to device
            batch = {k: v.to(device) for k, v in batch.items()}
            
            # Forward pass
            outputs = model(**batch)
            predictions = torch.argmax(outputs.logits, dim=2)
            
            # Extract predictions and labels, removing padding and special tokens
            for pred, label, mask in zip(predictions, batch["labels"], batch["attention_mask"]):
                pred = pred.cpu().numpy()
                label = label.cpu().numpy()
                mask = mask.cpu().numpy()
                
                true_labels = []
                true_preds = []
                
                for p, l, m in zip(pred, label, mask):
                    if m == 1 and l != -100:  # Skip padding and special tokens
                        true_labels.append(id2label[l])
                        true_preds.append(id2label[p])
                
                if true_labels:  # Only add non-empty sequences
                    all_labels.append(true_labels)
                    all_predictions.append(true_preds)
    
    # Calculate exact match
    exact_matches = sum(
        1 for preds, labels in zip(all_predictions, all_labels) if preds == labels
    )
    exact_match_score = exact_matches / len(all_labels) if all_labels else 0.0
    
    # Calculate metrics
    metrics = {
        "precision": precision_score(all_labels, all_predictions),
        "recall": recall_score(all_labels, all_predictions),
        "f1": f1_score(all_labels, all_predictions),
        "exact_match": exact_match_score
    }


    
    return metrics


## Predict funtions (single text)


In [15]:
# Inference function for a single text
def predict_entities(text, model, tokenizer, config):
    device = config["device"]
    model.to(device)
    model.eval()
    
    # Tokenize text
    words = text.split()
    word_tokens = []
    word_ids = []
    
    # Tokenize word-by-word
    for i, word in enumerate(words):
        tokens = tokenizer.tokenize(word)
        if not tokens:
            tokens = [tokenizer.unk_token]
        word_tokens.extend(tokens)
        word_ids.extend([i] * len(tokens))
    
    # Handle truncation
    if len(word_tokens) > config["max_length"] - 2:
        word_tokens = word_tokens[:config["max_length"] - 2]
        word_ids = word_ids[:config["max_length"] - 2]
    
    # Add special tokens
    encoded = tokenizer.encode_plus(
        word_tokens,
        is_split_into_words=True,
        add_special_tokens=True,
        max_length=config["max_length"],
        padding="max_length",
        truncation=True,
        return_tensors="pt"
    )
    
    # Predict
    with torch.no_grad():
        encoded = {k: v.to(device) for k, v in encoded.items()}
        outputs = model(**encoded)
        predictions = torch.argmax(outputs.logits, dim=2)[0].cpu().numpy()
    
    # Process predictions
    previous_word_id = None
    entities = []
    current_entity = {"text": "", "label": "", "start": -1, "end": -1}
    
    special_tokens_mask = np.array(
        [1] + [0] * len(word_ids) + [1] + [1] * (config["max_length"] - len(word_ids) - 2)
    )
    
    for i, (prediction, word_id, is_special) in enumerate(
        zip(predictions, [None] + word_ids + [None] + [None] * (config["max_length"] - len(word_ids) - 2), special_tokens_mask)
    ):
        # Skip special tokens and padding
        if is_special or word_id is None:
            continue
        
        label = id2label.get(prediction, "O")
        
        # If we've moved to a new word
        if previous_word_id != word_id:
            # If we were building an entity, finalize it
            if current_entity["label"] and current_entity["label"] != "O":
                entities.append(current_entity.copy())
            
            # Start a new word
            previous_word_id = word_id
            
            # Check if this word starts a new entity
            if label.startswith("B-") or (label != "O" and not current_entity["label"]):
                current_entity = {
                    "text": words[word_id],
                    "label": label[2:] if label.startswith("B-") else label,
                    "start": word_id,
                    "end": word_id
                }
            elif label.startswith("I-") and current_entity["label"] == label[2:]:
                # Continue current entity
                current_entity["text"] += " " + words[word_id]
                current_entity["end"] = word_id
            elif label == "O":
                current_entity = {"text": "", "label": "", "start": -1, "end": -1}
    
    # Add the last entity if it exists
    if current_entity["label"] and current_entity["label"] != "O":
        entities.append(current_entity)
    
    return entities


# Proceed training


In [16]:
# Load configuration
config = get_config()
config

{'max_length': 128,
 'batch_size': 16,
 'learning_rate': 2e-05,
 'epochs': 5,
 'warmup_steps': 500,
 'models': ['indobenchmark/indobert-base-p1',
  'xlm-roberta-base',
  'cahya/xlm-roberta-base-indonesian-NER'],
 'device': device(type='cuda')}

In [17]:
dataset = raw_data

## Train Model


### indobenchmark/indobert-base-p1


In [18]:
MODEL_NAME = config["models"][0]
MODEL_NAME

'indobenchmark/indobert-base-p1'

In [19]:
print(f"\n{'='*50}")
print(f"Training model: {MODEL_NAME}")
print(f"{'='*50}")

# Load tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForTokenClassification.from_pretrained(
    MODEL_NAME, 
    num_labels=len(NER_TAG),
    id2label=id2label,
    label2id=label2id
)



Training model: indobenchmark/indobert-base-p1


tokenizer_config.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.53k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/229k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

2025-05-16 16:40:03.516068: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1747413603.723701      35 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1747413603.785729      35 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


pytorch_model.bin:   0%|          | 0.00/498M [00:00<?, ?B/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at indobenchmark/indobert-base-p1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [20]:
# Prepare datasets
tokenized_datasets = prepare_datasets(dataset, tokenizer, config)
dataloaders = create_dataloaders(tokenized_datasets, config)



Preparing train dataset...


model.safetensors:   0%|          | 0.00/498M [00:00<?, ?B/s]

Preparing test dataset...
Preparing dev dataset...


In [21]:
# Setup optimizer and scheduler
optimizer = AdamW(model.parameters(), lr=config["learning_rate"])
total_steps = len(dataloaders["train"]) * config["epochs"]
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=config["warmup_steps"],
    num_training_steps=total_steps
)


In [22]:
# Train and evaluate
train_model(model, dataloaders, optimizer, scheduler, config, MODEL_NAME)


Training on cuda

Epoch 1/5


Training: 100%|██████████| 92/92 [00:19<00:00,  4.77it/s, loss=0.512]


Average training loss: 1.1031


Evaluating: 100%|██████████| 23/23 [00:01<00:00, 18.05it/s]


New best F1: 0.1341 - Saving model
Validation Metrics: Precision=0.3241, Recall=0.0845, F1=0.1341, Exact Match= 0.2943

Epoch 2/5


Training: 100%|██████████| 92/92 [00:18<00:00,  4.92it/s, loss=0.22] 


Average training loss: 0.3476


Evaluating: 100%|██████████| 23/23 [00:01<00:00, 18.15it/s]


New best F1: 0.5572 - Saving model
Validation Metrics: Precision=0.5471, Recall=0.5676, F1=0.5572, Exact Match= 0.4659

Epoch 3/5


Training: 100%|██████████| 92/92 [00:18<00:00,  4.92it/s, loss=0.147] 


Average training loss: 0.1654


Evaluating: 100%|██████████| 23/23 [00:01<00:00, 18.24it/s]


New best F1: 0.7126 - Saving model
Validation Metrics: Precision=0.6837, Recall=0.7440, F1=0.7126, Exact Match= 0.6131

Epoch 4/5


Training: 100%|██████████| 92/92 [00:18<00:00,  4.92it/s, loss=0.0742]


Average training loss: 0.0902


Evaluating: 100%|██████████| 23/23 [00:01<00:00, 18.22it/s]


New best F1: 0.7948 - Saving model
Validation Metrics: Precision=0.7743, Recall=0.8164, F1=0.7948, Exact Match= 0.7112

Epoch 5/5


Training: 100%|██████████| 92/92 [00:18<00:00,  4.92it/s, loss=0.0409]


Average training loss: 0.0483


Evaluating: 100%|██████████| 23/23 [00:01<00:00, 18.15it/s]


New best F1: 0.8278 - Saving model
Validation Metrics: Precision=0.7888, Recall=0.8708, F1=0.8278, Exact Match= 0.7411


In [23]:
# Load best model
best_model_path = f"models/best_model_{MODEL_NAME.replace('/', '_')}.pt"
if os.path.exists(best_model_path):
    model.load_state_dict(torch.load(best_model_path))


In [24]:
# Evaluate on test set
print("Evaluating on test set...")
test_metrics = evaluate_model(model, dataloaders["test"], config)
print(f"Test Metrics:\nPrecision={test_metrics['precision']:.4f}\nRecall={test_metrics['recall']:.4f}\nF1={test_metrics['f1']:.4f}\nExact Match={test_metrics['exact_match']:.4f}")

Evaluating on test set...


Evaluating: 100%|██████████| 32/32 [00:01<00:00, 18.11it/s]

Test Metrics:
Precision=0.8950
Recall=0.9132
F1=0.9040
Exact Match=0.7878





In [25]:
# Demo predictions
print("\nExample predictions:")
sample_texts = [
    "Presiden Joko Widodo mengunjungi Universitas Indonesia di Depok.",
    "Bank BRI meluncurkan program baru di Jakarta Selatan.",
    "Gubernur DKI Jakarta Anies Baswedan meresmikan MRT Jakarta."
]

for text in sample_texts:
    entities = predict_entities(text, model, tokenizer, config)
    
    print(f"\nText: {text}")
    print("Entities:")
    
    for entity in entities:
        print(f"  - {entity['text']} ({entity['label']})")



Example predictions:

Text: Presiden Joko Widodo mengunjungi Universitas Indonesia di Depok.
Entities:
  - Joko (PER)
  - Joko Widodo (PER)
  - Universitas (LOC)
  - Universitas Indonesia (LOC)
  - Depok. (LOC)

Text: Bank BRI meluncurkan program baru di Jakarta Selatan.
Entities:
  - Bank (ORG)
  - Bank BRI (ORG)
  - Jakarta (LOC)
  - Jakarta Selatan. (LOC)

Text: Gubernur DKI Jakarta Anies Baswedan meresmikan MRT Jakarta.
Entities:
  - Anies (PER)
  - Anies Baswedan (PER)


### xlm-roberta-base


In [26]:
MODEL_NAME = config["models"][1]
MODEL_NAME

'xlm-roberta-base'

In [27]:
print(f"\n{'='*50}")
print(f"Training model: {MODEL_NAME}")
print(f"{'='*50}")

# Load tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForTokenClassification.from_pretrained(
    MODEL_NAME, 
    num_labels=len(NER_TAG),
    id2label=id2label,
    label2id=label2id
)



Training model: xlm-roberta-base


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [28]:
# Setup optimizer and scheduler
optimizer = AdamW(model.parameters(), lr=config["learning_rate"])
total_steps = len(dataloaders["train"]) * config["epochs"]
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=config["warmup_steps"],
    num_training_steps=total_steps
)


In [29]:
# Train and evaluate
train_model(model, dataloaders, optimizer, scheduler, config, MODEL_NAME)


Training on cuda

Epoch 1/5


Training: 100%|██████████| 92/92 [00:21<00:00,  4.36it/s, loss=0.608]


Average training loss: 1.4260


Evaluating: 100%|██████████| 23/23 [00:01<00:00, 18.15it/s]
  _warn_prf(average, modifier, msg_start, len(result))


Validation Metrics: Precision=0.0000, Recall=0.0000, F1=0.0000, Exact Match= 0.3106

Epoch 2/5


Training: 100%|██████████| 92/92 [00:21<00:00,  4.37it/s, loss=0.542]


Average training loss: 0.6660


Evaluating: 100%|██████████| 23/23 [00:01<00:00, 18.18it/s]


Validation Metrics: Precision=0.0000, Recall=0.0000, F1=0.0000, Exact Match= 0.3106

Epoch 3/5


Training: 100%|██████████| 92/92 [00:21<00:00,  4.37it/s, loss=0.815]


Average training loss: 0.6504


Evaluating: 100%|██████████| 23/23 [00:01<00:00, 18.18it/s]


Validation Metrics: Precision=0.0000, Recall=0.0000, F1=0.0000, Exact Match= 0.3106

Epoch 4/5


Training: 100%|██████████| 92/92 [00:21<00:00,  4.38it/s, loss=0.461]


Average training loss: 0.6274


Evaluating: 100%|██████████| 23/23 [00:01<00:00, 18.20it/s]


Validation Metrics: Precision=0.0000, Recall=0.0000, F1=0.0000, Exact Match= 0.3106

Epoch 5/5


Training: 100%|██████████| 92/92 [00:21<00:00,  4.37it/s, loss=0.614]


Average training loss: 0.5867


Evaluating: 100%|██████████| 23/23 [00:01<00:00, 18.09it/s]


New best F1: 0.0071 - Saving model
Validation Metrics: Precision=0.1765, Recall=0.0036, F1=0.0071, Exact Match= 0.3106


In [30]:
# Load best model
best_model_path = f"models/best_model_{MODEL_NAME.replace('/', '_')}.pt"
if os.path.exists(best_model_path):
    model.load_state_dict(torch.load(best_model_path))


In [31]:
# Evaluate on test set
print("Evaluating on test set...")
test_metrics = evaluate_model(model, dataloaders["test"], config)
print(f"Test Metrics:\nPrecision={test_metrics['precision']:.4f}\nRecall={test_metrics['recall']:.4f}\nF1={test_metrics['f1']:.4f}\nExact Match={test_metrics['exact_match']:.4f}")


Evaluating on test set...


Evaluating: 100%|██████████| 32/32 [00:01<00:00, 18.13it/s]

Test Metrics:
Precision=0.1400
Recall=0.0051
F1=0.0098
Exact Match=0.1670





In [32]:
# Demo predictions
print("\nExample predictions:")
sample_texts = [
    "Presiden Joko Widodo mengunjungi Universitas Indonesia di Depok.",
    "Bank BRI meluncurkan program baru di Jakarta Selatan.",
    "Gubernur DKI Jakarta Anies Baswedan meresmikan MRT Jakarta."
]

for text in sample_texts:
    entities = predict_entities(text, model, tokenizer, config)
    
    print(f"\nText: {text}")
    print("Entities:")
    
    for entity in entities:
        print(f"  - {entity['text']} ({entity['label']})")



Example predictions:

Text: Presiden Joko Widodo mengunjungi Universitas Indonesia di Depok.
Entities:
  - Presiden (ORG)
  - Widodo (ORG)

Text: Bank BRI meluncurkan program baru di Jakarta Selatan.
Entities:
  - di (ORG)

Text: Gubernur DKI Jakarta Anies Baswedan meresmikan MRT Jakarta.
Entities:


### cahya/xlm-roberta-base-indonesian-NER


In [33]:
MODEL_NAME = config["models"][2]
MODEL_NAME

'cahya/xlm-roberta-base-indonesian-NER'

In [34]:
print(f"\n{'='*50}")
print(f"Training model: {MODEL_NAME}")
print(f"{'='*50}")

# Load tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

custom_config = AutoConfig.from_pretrained(
    MODEL_NAME,
    num_labels=len(NER_TAG),
    id2label=id2label,
    label2id=label2id
)

# Load model
model = AutoModelForTokenClassification.from_pretrained(
    MODEL_NAME,
    config=custom_config,
    ignore_mismatched_sizes=True
)



Training model: cahya/xlm-roberta-base-indonesian-NER


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/2.05k [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/150 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

Some weights of the model checkpoint at cahya/xlm-roberta-base-indonesian-NER were not used when initializing XLMRobertaForTokenClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing XLMRobertaForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at cahya/xlm-roberta-base-indonesian-NER and are newly initialized because the shapes did not match:
- classifier.bias: found shape torch.Size([39]) in the checkpoint and torch.Size([7]) in the model 

In [35]:
# Setup optimizer and scheduler
optimizer = AdamW(model.parameters(), lr=config["learning_rate"])
total_steps = len(dataloaders["train"]) * config["epochs"]
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=config["warmup_steps"],
    num_training_steps=total_steps
)


In [36]:
# Train and evaluate
train_model(model, dataloaders, optimizer, scheduler, config, MODEL_NAME)


Training on cuda

Epoch 1/5


Training: 100%|██████████| 92/92 [00:21<00:00,  4.37it/s, loss=0.718]


Average training loss: 1.2160


Evaluating: 100%|██████████| 23/23 [00:01<00:00, 18.21it/s]


Validation Metrics: Precision=0.0000, Recall=0.0000, F1=0.0000, Exact Match= 0.3106

Epoch 2/5


Training: 100%|██████████| 92/92 [00:21<00:00,  4.37it/s, loss=0.878]


Average training loss: 0.6771


Evaluating: 100%|██████████| 23/23 [00:01<00:00, 18.27it/s]


Validation Metrics: Precision=0.0000, Recall=0.0000, F1=0.0000, Exact Match= 0.3106

Epoch 3/5


Training: 100%|██████████| 92/92 [00:21<00:00,  4.37it/s, loss=0.949]


Average training loss: 0.6565


Evaluating: 100%|██████████| 23/23 [00:01<00:00, 18.23it/s]


Validation Metrics: Precision=0.0000, Recall=0.0000, F1=0.0000, Exact Match= 0.3106

Epoch 4/5


Training: 100%|██████████| 92/92 [00:21<00:00,  4.37it/s, loss=0.557]


Average training loss: 0.6380


Evaluating: 100%|██████████| 23/23 [00:01<00:00, 18.19it/s]


Validation Metrics: Precision=0.0000, Recall=0.0000, F1=0.0000, Exact Match= 0.3106

Epoch 5/5


Training: 100%|██████████| 92/92 [00:21<00:00,  4.37it/s, loss=0.507]


Average training loss: 0.5995


Evaluating: 100%|██████████| 23/23 [00:01<00:00, 18.20it/s]


Validation Metrics: Precision=0.0000, Recall=0.0000, F1=0.0000, Exact Match= 0.3106


In [37]:
# Load best model
best_model_path = f"models/best_model_{MODEL_NAME}.pt"
if os.path.exists(best_model_path):
    model.load_state_dict(torch.load(best_model_path))


In [38]:
# Evaluate on test set
print("Evaluating on test set...")
test_metrics = evaluate_model(model, dataloaders["test"], config)
print(f"Test Metrics:\nPrecision={test_metrics['precision']:.4f}\nRecall={test_metrics['recall']:.4f}\nF1={test_metrics['f1']:.4f}\nExact Match={test_metrics['exact_match']:.4f}")

Evaluating on test set...


Evaluating: 100%|██████████| 32/32 [00:01<00:00, 18.28it/s]

Test Metrics:
Precision=0.0000
Recall=0.0000
F1=0.0000
Exact Match=0.1670





In [39]:
# Demo predictions
print("\nExample predictions:")
sample_texts = [
    "Presiden Joko Widodo mengunjungi Universitas Indonesia di Depok.",
    "Bank BRI meluncurkan program baru di Jakarta Selatan.",
    "Gubernur DKI Jakarta Anies Baswedan meresmikan MRT Jakarta."
]

for text in sample_texts:
    entities = predict_entities(text, model, tokenizer, config)
    
    print(f"\nText: {text}")
    print("Entities:")
    
    for entity in entities:
        print(f"  - {entity['text']} ({entity['label']})")



Example predictions:

Text: Presiden Joko Widodo mengunjungi Universitas Indonesia di Depok.
Entities:
  - Joko (LOC)
  - Universitas (I-ORG)

Text: Bank BRI meluncurkan program baru di Jakarta Selatan.
Entities:

Text: Gubernur DKI Jakarta Anies Baswedan meresmikan MRT Jakarta.
Entities:
  - Anies (LOC)


## Hyper Parameter Tuning


In [40]:
best_scores = {}

def hyperparameter_tuning(base_config, search_space):
    keys, values = zip(*search_space.items())
    experiments = [dict(zip(keys, v)) for v in product(*values)]

    for i, params in enumerate(experiments):
        print(f"\n======== Running Experiment {i+1}/{len(experiments)} ========")
        config = base_config.copy()
        config.update(params)

        for MODEL_NAME in config["models"]:
        # for MODEL_NAME in ["cahya/xlm-roberta-base-indonesian-NER"]:
            print(f"\n{'='*50}")
            print(f"Training model: {MODEL_NAME} with params: {params}")
            print(f"{'='*50}")

            tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

            if MODEL_NAME == 'cahya/xlm-roberta-base-indonesian-NER':
                # Load tokenizer and model
                custom_config = AutoConfig.from_pretrained(
                    MODEL_NAME,
                    num_labels=len(NER_TAG),
                    id2label=id2label,
                    label2id=label2id
                )
                
                # Load model
                model = AutoModelForTokenClassification.from_pretrained(
                    MODEL_NAME,
                    config=custom_config,
                    ignore_mismatched_sizes=True
                )


            else:
                model = AutoModelForTokenClassification.from_pretrained(
                    MODEL_NAME, 
                    num_labels=len(NER_TAG),
                    id2label=id2label,
                    label2id=label2id
                )
                
            model.to(config["device"])

            tokenized_datasets = prepare_datasets(dataset, tokenizer, config)
            dataloaders = create_dataloaders(tokenized_datasets, config)

            optimizer = AdamW(model.parameters(), lr=config["learning_rate"])
            total_steps = len(dataloaders["train"]) * config["epochs"]
            scheduler = get_linear_schedule_with_warmup(
                optimizer,
                num_warmup_steps=config["warmup_steps"],
                num_training_steps=total_steps
            )

            train_model(model, dataloaders, optimizer, scheduler, config, MODEL_NAME)

            model_type = model.config.model_type
            exp_name = f"{model_type}_lr{config['learning_rate']}_bs{config['batch_size']}_ep{config['epochs']}"
            model_path = f"models/{exp_name}.pt"
            os.makedirs("models", exist_ok=True)
            torch.save(model.state_dict(), model_path)

            model.load_state_dict(torch.load(model_path))
            print(f"\nEvaluating {exp_name} on test set...")
            test_metrics = evaluate_model(model, dataloaders["test"], config)
            print(f"Metrics: Precision={test_metrics['precision']:.4f}, Recall={test_metrics['recall']:.4f}, F1={test_metrics['f1']:.4f}, Exact Match={test_metrics['exact_match']:.4f}")

            current_best = best_scores.get(model_type, {"f1": 0})
            
            if test_metrics["f1"] > current_best["f1"]:
                best_model_path = f"models/best_{model_type}.pt"
                torch.save(model.state_dict(), best_model_path)

                save_dir = f"saved_models/{MODEL_NAME.replace('/', '_')}"
                os.makedirs(save_dir, exist_ok=True)
                model.save_pretrained(save_dir)
                tokenizer.save_pretrained(save_dir)
            
                with open(os.path.join(save_dir, "label2id.json"), "w") as f:
                    json.dump(label2id, f)
                with open(os.path.join(save_dir, "id2label.json"), "w") as f:
                    json.dump(id2label, f)

                best_scores[model_type] = {"f1": test_metrics["f1"], "path": best_model_path, "params": params}
                print(f"New best model saved for {model_type} at {best_model_path}")


In [41]:
base_config = get_config()
search_space = {
    "learning_rate": [10e-5, 5e-5, 3e-5, 2e-5],
    "batch_size": [16, 32],
    "epochs": [3, 5, 10, 20],
}

hyperparameter_tuning(base_config, search_space)




Training model: indobenchmark/indobert-base-p1 with params: {'learning_rate': 0.0001, 'batch_size': 16, 'epochs': 3}


Some weights of BertForTokenClassification were not initialized from the model checkpoint at indobenchmark/indobert-base-p1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Preparing train dataset...
Preparing test dataset...
Preparing dev dataset...
Training on cuda

Epoch 1/3


Training: 100%|██████████| 92/92 [00:18<00:00,  4.92it/s, loss=0.296] 


Average training loss: 0.7052


Evaluating: 100%|██████████| 23/23 [00:01<00:00, 18.31it/s]


New best F1: 0.5669 - Saving model
Validation Metrics: Precision=0.5663, Recall=0.5676, F1=0.5669, Exact Match= 0.5014

Epoch 2/3


Training: 100%|██████████| 92/92 [00:18<00:00,  4.93it/s, loss=0.131] 


Average training loss: 0.1364


Evaluating: 100%|██████████| 23/23 [00:01<00:00, 18.17it/s]


New best F1: 0.7503 - Saving model
Validation Metrics: Precision=0.7289, Recall=0.7729, F1=0.7503, Exact Match= 0.6594

Epoch 3/3


Training: 100%|██████████| 92/92 [00:18<00:00,  4.93it/s, loss=0.0458] 


Average training loss: 0.0608


Evaluating: 100%|██████████| 23/23 [00:01<00:00, 18.29it/s]


New best F1: 0.7857 - Saving model
Validation Metrics: Precision=0.7900, Recall=0.7814, F1=0.7857, Exact Match= 0.6757

Evaluating bert_lr0.0001_bs16_ep3 on test set...


Evaluating: 100%|██████████| 32/32 [00:01<00:00, 18.20it/s]


Metrics: Precision=0.8478, Recall=0.7822, F1=0.8137, Exact Match=0.6719
New best model saved for bert at models/best_bert.pt

Training model: xlm-roberta-base with params: {'learning_rate': 0.0001, 'batch_size': 16, 'epochs': 3}


Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Preparing train dataset...
Preparing test dataset...
Preparing dev dataset...
Training on cuda

Epoch 1/3


Training: 100%|██████████| 92/92 [00:21<00:00,  4.38it/s, loss=0.224]


Average training loss: 1.1040


Evaluating: 100%|██████████| 23/23 [00:01<00:00, 18.19it/s]


New best F1: 0.1256 - Saving model
Validation Metrics: Precision=0.1469, Recall=0.1098, F1=0.1256, Exact Match= 0.2861

Epoch 2/3


Training: 100%|██████████| 92/92 [00:21<00:00,  4.38it/s, loss=0.0923]


Average training loss: 0.2432


Evaluating: 100%|██████████| 23/23 [00:01<00:00, 18.19it/s]


New best F1: 0.7317 - Saving model
Validation Metrics: Precision=0.6934, Recall=0.7744, F1=0.7317, Exact Match= 0.6540

Epoch 3/3


Training: 100%|██████████| 92/92 [00:21<00:00,  4.37it/s, loss=0.0528]


Average training loss: 0.1078


Evaluating: 100%|██████████| 23/23 [00:01<00:00, 18.21it/s]


New best F1: 0.8331 - Saving model
Validation Metrics: Precision=0.8556, Recall=0.8117, F1=0.8331, Exact Match= 0.7057

Evaluating xlm-roberta_lr0.0001_bs16_ep3 on test set...


Evaluating: 100%|██████████| 32/32 [00:01<00:00, 18.23it/s]


Metrics: Precision=0.9005, Recall=0.8876, F1=0.8940, Exact Match=0.7642
New best model saved for xlm-roberta at models/best_xlm-roberta.pt

Training model: cahya/xlm-roberta-base-indonesian-NER with params: {'learning_rate': 0.0001, 'batch_size': 16, 'epochs': 3}


Some weights of the model checkpoint at cahya/xlm-roberta-base-indonesian-NER were not used when initializing XLMRobertaForTokenClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing XLMRobertaForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at cahya/xlm-roberta-base-indonesian-NER and are newly initialized because the shapes did not match:
- classifier.bias: found shape torch.Size([39]) in the checkpoint and torch.Size([7]) in the model 

Preparing train dataset...
Preparing test dataset...
Preparing dev dataset...
Training on cuda

Epoch 1/3


Training: 100%|██████████| 92/92 [00:21<00:00,  4.37it/s, loss=0.0882]


Average training loss: 0.7437


Evaluating: 100%|██████████| 23/23 [00:01<00:00, 18.18it/s]


New best F1: 0.6998 - Saving model
Validation Metrics: Precision=0.6604, Recall=0.7442, F1=0.6998, Exact Match= 0.5749

Epoch 2/3


Training: 100%|██████████| 92/92 [00:21<00:00,  4.38it/s, loss=0.129] 


Average training loss: 0.1185


Evaluating: 100%|██████████| 23/23 [00:01<00:00, 18.21it/s]


New best F1: 0.8695 - Saving model
Validation Metrics: Precision=0.8669, Recall=0.8721, F1=0.8695, Exact Match= 0.7711

Epoch 3/3


Training: 100%|██████████| 92/92 [00:21<00:00,  4.38it/s, loss=0.0555]


Average training loss: 0.0666


Evaluating: 100%|██████████| 23/23 [00:01<00:00, 18.17it/s]


New best F1: 0.8719 - Saving model
Validation Metrics: Precision=0.8768, Recall=0.8671, F1=0.8719, Exact Match= 0.7711

Evaluating xlm-roberta_lr0.0001_bs16_ep3 on test set...


Evaluating: 100%|██████████| 32/32 [00:01<00:00, 18.27it/s]


Metrics: Precision=0.8984, Recall=0.8882, F1=0.8933, Exact Match=0.7721


Training model: indobenchmark/indobert-base-p1 with params: {'learning_rate': 0.0001, 'batch_size': 16, 'epochs': 5}


Some weights of BertForTokenClassification were not initialized from the model checkpoint at indobenchmark/indobert-base-p1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Preparing train dataset...
Preparing test dataset...
Preparing dev dataset...
Training on cuda

Epoch 1/5


Training: 100%|██████████| 92/92 [00:18<00:00,  4.93it/s, loss=0.24] 


Average training loss: 0.7467


Evaluating: 100%|██████████| 23/23 [00:01<00:00, 18.25it/s]


New best F1: 0.5759 - Saving model
Validation Metrics: Precision=0.5964, Recall=0.5568, F1=0.5759, Exact Match= 0.4850

Epoch 2/5


Training: 100%|██████████| 92/92 [00:18<00:00,  4.93it/s, loss=0.0181]


Average training loss: 0.1402


Evaluating: 100%|██████████| 23/23 [00:01<00:00, 18.34it/s]


New best F1: 0.7939 - Saving model
Validation Metrics: Precision=0.7968, Recall=0.7911, F1=0.7939, Exact Match= 0.6948

Epoch 3/5


Training: 100%|██████████| 92/92 [00:18<00:00,  4.93it/s, loss=0.0499]


Average training loss: 0.0633


Evaluating: 100%|██████████| 23/23 [00:01<00:00, 18.31it/s]


New best F1: 0.8182 - Saving model
Validation Metrics: Precision=0.7725, Recall=0.8696, F1=0.8182, Exact Match= 0.7221

Epoch 4/5


Training: 100%|██████████| 92/92 [00:18<00:00,  4.93it/s, loss=0.00115]


Average training loss: 0.0349


Evaluating: 100%|██████████| 23/23 [00:01<00:00, 18.28it/s]


New best F1: 0.8390 - Saving model
Validation Metrics: Precision=0.8169, Recall=0.8623, F1=0.8390, Exact Match= 0.7193

Epoch 5/5


Training: 100%|██████████| 92/92 [00:18<00:00,  4.93it/s, loss=0.0373] 


Average training loss: 0.0348


Evaluating: 100%|██████████| 23/23 [00:01<00:00, 18.31it/s]


Validation Metrics: Precision=0.7908, Recall=0.8309, F1=0.8104, Exact Match= 0.7221

Evaluating bert_lr0.0001_bs16_ep5 on test set...


Evaluating: 100%|██████████| 32/32 [00:01<00:00, 18.19it/s]


Metrics: Precision=0.8818, Recall=0.8907, F1=0.8862, Exact Match=0.7485
New best model saved for bert at models/best_bert.pt

Training model: xlm-roberta-base with params: {'learning_rate': 0.0001, 'batch_size': 16, 'epochs': 5}


Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Preparing train dataset...
Preparing test dataset...
Preparing dev dataset...
Training on cuda

Epoch 1/5


Training: 100%|██████████| 92/92 [00:21<00:00,  4.37it/s, loss=0.274]


Average training loss: 1.1851


Evaluating: 100%|██████████| 23/23 [00:01<00:00, 18.20it/s]


New best F1: 0.0099 - Saving model
Validation Metrics: Precision=0.2941, Recall=0.0050, F1=0.0099, Exact Match= 0.3106

Epoch 2/5


Training: 100%|██████████| 92/92 [00:21<00:00,  4.38it/s, loss=0.0907]


Average training loss: 0.2466


Evaluating: 100%|██████████| 23/23 [00:01<00:00, 18.24it/s]


New best F1: 0.7293 - Saving model
Validation Metrics: Precision=0.7315, Recall=0.7271, F1=0.7293, Exact Match= 0.6240

Epoch 3/5


Training: 100%|██████████| 92/92 [00:21<00:00,  4.37it/s, loss=0.0891]


Average training loss: 0.0985


Evaluating: 100%|██████████| 23/23 [00:01<00:00, 18.13it/s]


New best F1: 0.8629 - Saving model
Validation Metrics: Precision=0.8285, Recall=0.9003, F1=0.8629, Exact Match= 0.7466

Epoch 4/5


Training: 100%|██████████| 92/92 [00:21<00:00,  4.38it/s, loss=0.0378]


Average training loss: 0.0644


Evaluating: 100%|██████████| 23/23 [00:01<00:00, 18.20it/s]


New best F1: 0.8800 - Saving model
Validation Metrics: Precision=0.8890, Recall=0.8711, F1=0.8800, Exact Match= 0.7766

Epoch 5/5


Training: 100%|██████████| 92/92 [00:21<00:00,  4.37it/s, loss=0.0138] 


Average training loss: 0.0564


Evaluating: 100%|██████████| 23/23 [00:01<00:00, 18.17it/s]


New best F1: 0.8868 - Saving model
Validation Metrics: Precision=0.8904, Recall=0.8832, F1=0.8868, Exact Match= 0.7657

Evaluating xlm-roberta_lr0.0001_bs16_ep5 on test set...


Evaluating: 100%|██████████| 32/32 [00:01<00:00, 18.19it/s]


Metrics: Precision=0.8884, Recall=0.8990, F1=0.8937, Exact Match=0.7623

Training model: cahya/xlm-roberta-base-indonesian-NER with params: {'learning_rate': 0.0001, 'batch_size': 16, 'epochs': 5}


Some weights of the model checkpoint at cahya/xlm-roberta-base-indonesian-NER were not used when initializing XLMRobertaForTokenClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing XLMRobertaForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at cahya/xlm-roberta-base-indonesian-NER and are newly initialized because the shapes did not match:
- classifier.bias: found shape torch.Size([39]) in the checkpoint and torch.Size([7]) in the model 

Preparing train dataset...
Preparing test dataset...
Preparing dev dataset...
Training on cuda

Epoch 1/5


Training: 100%|██████████| 92/92 [00:21<00:00,  4.37it/s, loss=0.307]


Average training loss: 0.7896


Evaluating: 100%|██████████| 23/23 [00:01<00:00, 18.18it/s]


New best F1: 0.7082 - Saving model
Validation Metrics: Precision=0.7016, Recall=0.7150, F1=0.7082, Exact Match= 0.6131

Epoch 2/5


Training: 100%|██████████| 92/92 [00:21<00:00,  4.37it/s, loss=0.109] 


Average training loss: 0.1295


Evaluating: 100%|██████████| 23/23 [00:01<00:00, 18.20it/s]


New best F1: 0.8547 - Saving model
Validation Metrics: Precision=0.8534, Recall=0.8560, F1=0.8547, Exact Match= 0.7139

Epoch 3/5


Training: 100%|██████████| 92/92 [00:21<00:00,  4.38it/s, loss=0.00344]


Average training loss: 0.0705


Evaluating: 100%|██████████| 23/23 [00:01<00:00, 18.22it/s]


New best F1: 0.8996 - Saving model
Validation Metrics: Precision=0.8881, Recall=0.9114, F1=0.8996, Exact Match= 0.8065

Epoch 4/5


Training: 100%|██████████| 92/92 [00:21<00:00,  4.37it/s, loss=0.0357] 


Average training loss: 0.0486


Evaluating: 100%|██████████| 23/23 [00:01<00:00, 18.21it/s]


Validation Metrics: Precision=0.8547, Recall=0.9003, F1=0.8769, Exact Match= 0.7711

Epoch 5/5


Training: 100%|██████████| 92/92 [00:21<00:00,  4.38it/s, loss=0.103]  


Average training loss: 0.0687


Evaluating: 100%|██████████| 23/23 [00:01<00:00, 18.15it/s]


Validation Metrics: Precision=0.8345, Recall=0.8479, F1=0.8412, Exact Match= 0.6621

Evaluating xlm-roberta_lr0.0001_bs16_ep5 on test set...


Evaluating: 100%|██████████| 32/32 [00:01<00:00, 18.13it/s]


Metrics: Precision=0.8746, Recall=0.8391, F1=0.8565, Exact Match=0.6817


Training model: indobenchmark/indobert-base-p1 with params: {'learning_rate': 0.0001, 'batch_size': 16, 'epochs': 10}


Some weights of BertForTokenClassification were not initialized from the model checkpoint at indobenchmark/indobert-base-p1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Preparing train dataset...
Preparing test dataset...
Preparing dev dataset...
Training on cuda

Epoch 1/10


Training: 100%|██████████| 92/92 [00:18<00:00,  4.92it/s, loss=0.134]


Average training loss: 0.6755


Evaluating: 100%|██████████| 23/23 [00:01<00:00, 18.27it/s]


New best F1: 0.5532 - Saving model
Validation Metrics: Precision=0.5673, Recall=0.5399, F1=0.5532, Exact Match= 0.4986

Epoch 2/10


Training: 100%|██████████| 92/92 [00:18<00:00,  4.92it/s, loss=0.0953]


Average training loss: 0.1301


Evaluating: 100%|██████████| 23/23 [00:01<00:00, 18.25it/s]


New best F1: 0.7783 - Saving model
Validation Metrics: Precision=0.7255, Recall=0.8394, F1=0.7783, Exact Match= 0.6894

Epoch 3/10


Training: 100%|██████████| 92/92 [00:18<00:00,  4.92it/s, loss=0.136] 


Average training loss: 0.0585


Evaluating: 100%|██████████| 23/23 [00:01<00:00, 18.28it/s]


New best F1: 0.8135 - Saving model
Validation Metrics: Precision=0.8106, Recall=0.8164, F1=0.8135, Exact Match= 0.6812

Epoch 4/10


Training: 100%|██████████| 92/92 [00:18<00:00,  4.92it/s, loss=0.024]  


Average training loss: 0.0447


Evaluating: 100%|██████████| 23/23 [00:01<00:00, 18.27it/s]


Validation Metrics: Precision=0.6685, Recall=0.7428, F1=0.7037, Exact Match= 0.5886

Epoch 5/10


Training: 100%|██████████| 92/92 [00:18<00:00,  4.94it/s, loss=0.113]  


Average training loss: 0.0410


Evaluating: 100%|██████████| 23/23 [00:01<00:00, 18.26it/s]


Validation Metrics: Precision=0.8017, Recall=0.8152, F1=0.8084, Exact Match= 0.6785

Epoch 6/10


Training: 100%|██████████| 92/92 [00:18<00:00,  4.94it/s, loss=0.025]  


Average training loss: 0.0415


Evaluating: 100%|██████████| 23/23 [00:01<00:00, 18.37it/s]


Validation Metrics: Precision=0.7805, Recall=0.8200, F1=0.7998, Exact Match= 0.6921

Epoch 7/10


Training: 100%|██████████| 92/92 [00:18<00:00,  4.94it/s, loss=0.025]  


Average training loss: 0.0277


Evaluating: 100%|██████████| 23/23 [00:01<00:00, 18.36it/s]


Validation Metrics: Precision=0.7965, Recall=0.8176, F1=0.8069, Exact Match= 0.7330

Epoch 8/10


Training: 100%|██████████| 92/92 [00:18<00:00,  4.94it/s, loss=0.0326]  


Average training loss: 0.0109


Evaluating: 100%|██████████| 23/23 [00:01<00:00, 18.33it/s]


New best F1: 0.8255 - Saving model
Validation Metrics: Precision=0.8065, Recall=0.8454, F1=0.8255, Exact Match= 0.7493

Epoch 9/10


Training: 100%|██████████| 92/92 [00:18<00:00,  4.94it/s, loss=0.000462]


Average training loss: 0.0027


Evaluating: 100%|██████████| 23/23 [00:01<00:00, 18.33it/s]


New best F1: 0.8364 - Saving model
Validation Metrics: Precision=0.8100, Recall=0.8647, F1=0.8364, Exact Match= 0.7684

Epoch 10/10


Training: 100%|██████████| 92/92 [00:18<00:00,  4.94it/s, loss=0.00207] 


Average training loss: 0.0013


Evaluating: 100%|██████████| 23/23 [00:01<00:00, 18.36it/s]


Validation Metrics: Precision=0.8137, Recall=0.8599, F1=0.8362, Exact Match= 0.7602

Evaluating bert_lr0.0001_bs16_ep10 on test set...


Evaluating: 100%|██████████| 32/32 [00:01<00:00, 18.38it/s]


Metrics: Precision=0.9008, Recall=0.9067, F1=0.9037, Exact Match=0.7937
New best model saved for bert at models/best_bert.pt

Training model: xlm-roberta-base with params: {'learning_rate': 0.0001, 'batch_size': 16, 'epochs': 10}


Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Preparing train dataset...
Preparing test dataset...
Preparing dev dataset...
Training on cuda

Epoch 1/10


Training: 100%|██████████| 92/92 [00:20<00:00,  4.38it/s, loss=0.309]


Average training loss: 0.9219


Evaluating: 100%|██████████| 23/23 [00:01<00:00, 18.19it/s]


New best F1: 0.3226 - Saving model
Validation Metrics: Precision=0.3945, Recall=0.2729, F1=0.3226, Exact Match= 0.3515

Epoch 2/10


Training: 100%|██████████| 92/92 [00:20<00:00,  4.39it/s, loss=0.0216]


Average training loss: 0.2140


Evaluating: 100%|██████████| 23/23 [00:01<00:00, 18.21it/s]


New best F1: 0.8141 - Saving model
Validation Metrics: Precision=0.7712, Recall=0.8620, F1=0.8141, Exact Match= 0.6785

Epoch 3/10


Training: 100%|██████████| 92/92 [00:21<00:00,  4.38it/s, loss=0.128] 


Average training loss: 0.0890


Evaluating: 100%|██████████| 23/23 [00:01<00:00, 18.29it/s]


New best F1: 0.8495 - Saving model
Validation Metrics: Precision=0.8201, Recall=0.8812, F1=0.8495, Exact Match= 0.7112

Epoch 4/10


Training: 100%|██████████| 92/92 [00:20<00:00,  4.38it/s, loss=0.0437] 


Average training loss: 0.0684


Evaluating: 100%|██████████| 23/23 [00:01<00:00, 18.26it/s]


New best F1: 0.8611 - Saving model
Validation Metrics: Precision=0.8400, Recall=0.8832, F1=0.8611, Exact Match= 0.7439

Epoch 5/10


Training: 100%|██████████| 92/92 [00:21<00:00,  4.38it/s, loss=0.0576]


Average training loss: 0.0613


Evaluating: 100%|██████████| 23/23 [00:01<00:00, 18.26it/s]


New best F1: 0.8783 - Saving model
Validation Metrics: Precision=0.8501, Recall=0.9084, F1=0.8783, Exact Match= 0.7575

Epoch 6/10


Training: 100%|██████████| 92/92 [00:20<00:00,  4.38it/s, loss=0.0267] 


Average training loss: 0.0538


Evaluating: 100%|██████████| 23/23 [00:01<00:00, 18.26it/s]


New best F1: 0.8943 - Saving model
Validation Metrics: Precision=0.8696, Recall=0.9204, F1=0.8943, Exact Match= 0.8120

Epoch 7/10


Training: 100%|██████████| 92/92 [00:20<00:00,  4.38it/s, loss=0.0265] 


Average training loss: 0.0363


Evaluating: 100%|██████████| 23/23 [00:01<00:00, 18.19it/s]


Validation Metrics: Precision=0.8436, Recall=0.8963, F1=0.8691, Exact Match= 0.7684

Epoch 8/10


Training: 100%|██████████| 92/92 [00:20<00:00,  4.38it/s, loss=0.00611]


Average training loss: 0.0176


Evaluating: 100%|██████████| 23/23 [00:01<00:00, 18.24it/s]


New best F1: 0.8987 - Saving model
Validation Metrics: Precision=0.8864, Recall=0.9114, F1=0.8987, Exact Match= 0.8147

Epoch 9/10


Training: 100%|██████████| 92/92 [00:21<00:00,  4.38it/s, loss=0.00111] 


Average training loss: 0.0097


Evaluating: 100%|██████████| 23/23 [00:01<00:00, 18.29it/s]


New best F1: 0.9028 - Saving model
Validation Metrics: Precision=0.8849, Recall=0.9215, F1=0.9028, Exact Match= 0.8147

Epoch 10/10


Training: 100%|██████████| 92/92 [00:20<00:00,  4.38it/s, loss=0.0108]  


Average training loss: 0.0055


Evaluating: 100%|██████████| 23/23 [00:01<00:00, 18.27it/s]


New best F1: 0.9076 - Saving model
Validation Metrics: Precision=0.8913, Recall=0.9245, F1=0.9076, Exact Match= 0.8229

Evaluating xlm-roberta_lr0.0001_bs16_ep10 on test set...


Evaluating: 100%|██████████| 32/32 [00:01<00:00, 18.00it/s]


Metrics: Precision=0.9252, Recall=0.9310, F1=0.9281, Exact Match=0.8193
New best model saved for xlm-roberta at models/best_xlm-roberta.pt

Training model: cahya/xlm-roberta-base-indonesian-NER with params: {'learning_rate': 0.0001, 'batch_size': 16, 'epochs': 10}


Some weights of the model checkpoint at cahya/xlm-roberta-base-indonesian-NER were not used when initializing XLMRobertaForTokenClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing XLMRobertaForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at cahya/xlm-roberta-base-indonesian-NER and are newly initialized because the shapes did not match:
- classifier.bias: found shape torch.Size([39]) in the checkpoint and torch.Size([7]) in the model 

Preparing train dataset...
Preparing test dataset...
Preparing dev dataset...
Training on cuda

Epoch 1/10


Training: 100%|██████████| 92/92 [00:20<00:00,  4.39it/s, loss=0.208]


Average training loss: 0.6067


Evaluating: 100%|██████████| 23/23 [00:01<00:00, 18.29it/s]


New best F1: 0.7182 - Saving model
Validation Metrics: Precision=0.6939, Recall=0.7442, F1=0.7182, Exact Match= 0.5804

Epoch 2/10


Training: 100%|██████████| 92/92 [00:20<00:00,  4.38it/s, loss=0.266] 


Average training loss: 0.1186


Evaluating: 100%|██████████| 23/23 [00:01<00:00, 18.24it/s]


New best F1: 0.8671 - Saving model
Validation Metrics: Precision=0.8363, Recall=0.9003, F1=0.8671, Exact Match= 0.7384

Epoch 3/10


Training: 100%|██████████| 92/92 [00:20<00:00,  4.39it/s, loss=0.103]  


Average training loss: 0.0747


Evaluating: 100%|██████████| 23/23 [00:01<00:00, 18.21it/s]


New best F1: 0.8807 - Saving model
Validation Metrics: Precision=0.8628, Recall=0.8993, F1=0.8807, Exact Match= 0.7875

Epoch 4/10


Training: 100%|██████████| 92/92 [00:20<00:00,  4.39it/s, loss=0.0397] 


Average training loss: 0.0552


Evaluating: 100%|██████████| 23/23 [00:01<00:00, 18.28it/s]


New best F1: 0.8856 - Saving model
Validation Metrics: Precision=0.8751, Recall=0.8963, F1=0.8856, Exact Match= 0.7902

Epoch 5/10


Training: 100%|██████████| 92/92 [00:20<00:00,  4.39it/s, loss=0.00513]


Average training loss: 0.0550


Evaluating: 100%|██████████| 23/23 [00:01<00:00, 18.16it/s]


New best F1: 0.8971 - Saving model
Validation Metrics: Precision=0.9031, Recall=0.8912, F1=0.8971, Exact Match= 0.7793

Epoch 6/10


Training: 100%|██████████| 92/92 [00:21<00:00,  4.37it/s, loss=0.0918] 


Average training loss: 0.0537


Evaluating: 100%|██████████| 23/23 [00:01<00:00, 18.24it/s]


Validation Metrics: Precision=0.7991, Recall=0.8892, F1=0.8418, Exact Match= 0.7112

Epoch 7/10


Training: 100%|██████████| 92/92 [00:20<00:00,  4.38it/s, loss=0.0559] 


Average training loss: 0.0306


Evaluating: 100%|██████████| 23/23 [00:01<00:00, 18.23it/s]


Validation Metrics: Precision=0.8475, Recall=0.9063, F1=0.8759, Exact Match= 0.7602

Epoch 8/10


Training: 100%|██████████| 92/92 [00:21<00:00,  4.38it/s, loss=0.00567]


Average training loss: 0.0159


Evaluating: 100%|██████████| 23/23 [00:01<00:00, 18.21it/s]


New best F1: 0.9005 - Saving model
Validation Metrics: Precision=0.8852, Recall=0.9164, F1=0.9005, Exact Match= 0.8011

Epoch 9/10


Training: 100%|██████████| 92/92 [00:21<00:00,  4.38it/s, loss=0.0995]  


Average training loss: 0.0107


Evaluating: 100%|██████████| 23/23 [00:01<00:00, 18.13it/s]


New best F1: 0.9037 - Saving model
Validation Metrics: Precision=0.8866, Recall=0.9215, F1=0.9037, Exact Match= 0.8120

Epoch 10/10


Training: 100%|██████████| 92/92 [00:21<00:00,  4.38it/s, loss=0.0013]  


Average training loss: 0.0064


Evaluating: 100%|██████████| 23/23 [00:01<00:00, 18.22it/s]


New best F1: 0.9056 - Saving model
Validation Metrics: Precision=0.8893, Recall=0.9225, F1=0.9056, Exact Match= 0.8120

Evaluating xlm-roberta_lr0.0001_bs16_ep10 on test set...


Evaluating: 100%|██████████| 32/32 [00:01<00:00, 17.88it/s]


Metrics: Precision=0.9167, Recall=0.9412, F1=0.9288, Exact Match=0.8389
New best model saved for xlm-roberta at models/best_xlm-roberta.pt


Training model: indobenchmark/indobert-base-p1 with params: {'learning_rate': 0.0001, 'batch_size': 16, 'epochs': 20}


Some weights of BertForTokenClassification were not initialized from the model checkpoint at indobenchmark/indobert-base-p1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Preparing train dataset...
Preparing test dataset...
Preparing dev dataset...
Training on cuda

Epoch 1/20


Training: 100%|██████████| 92/92 [00:18<00:00,  4.93it/s, loss=0.106]


Average training loss: 0.8077


Evaluating: 100%|██████████| 23/23 [00:01<00:00, 18.22it/s]


New best F1: 0.5809 - Saving model
Validation Metrics: Precision=0.5809, Recall=0.5809, F1=0.5809, Exact Match= 0.4959

Epoch 2/20


Training: 100%|██████████| 92/92 [00:18<00:00,  4.94it/s, loss=0.134] 


Average training loss: 0.1441


Evaluating: 100%|██████████| 23/23 [00:01<00:00, 18.30it/s]


New best F1: 0.7995 - Saving model
Validation Metrics: Precision=0.7901, Recall=0.8092, F1=0.7995, Exact Match= 0.7112

Epoch 3/20


Training: 100%|██████████| 92/92 [00:18<00:00,  4.94it/s, loss=0.0579]


Average training loss: 0.0598


Evaluating: 100%|██████████| 23/23 [00:01<00:00, 18.33it/s]


New best F1: 0.8208 - Saving model
Validation Metrics: Precision=0.8315, Recall=0.8104, F1=0.8208, Exact Match= 0.7221

Epoch 4/20


Training: 100%|██████████| 92/92 [00:18<00:00,  4.94it/s, loss=0.0805] 


Average training loss: 0.0587


Evaluating: 100%|██████████| 23/23 [00:01<00:00, 18.36it/s]


New best F1: 0.8273 - Saving model
Validation Metrics: Precision=0.8002, Recall=0.8563, F1=0.8273, Exact Match= 0.7112

Epoch 5/20


Training: 100%|██████████| 92/92 [00:18<00:00,  4.94it/s, loss=0.0277] 


Average training loss: 0.0351


Evaluating: 100%|██████████| 23/23 [00:01<00:00, 18.34it/s]


Validation Metrics: Precision=0.7626, Recall=0.8768, F1=0.8157, Exact Match= 0.6894

Epoch 6/20


Training: 100%|██████████| 92/92 [00:18<00:00,  4.94it/s, loss=0.0475]  


Average training loss: 0.0366


Evaluating: 100%|██████████| 23/23 [00:01<00:00, 18.34it/s]


Validation Metrics: Precision=0.7991, Recall=0.8164, F1=0.8076, Exact Match= 0.6894

Epoch 7/20


Training: 100%|██████████| 92/92 [00:18<00:00,  4.94it/s, loss=0.0611] 


Average training loss: 0.0322


Evaluating: 100%|██████████| 23/23 [00:01<00:00, 18.39it/s]


Validation Metrics: Precision=0.8216, Recall=0.8285, F1=0.8250, Exact Match= 0.7057

Epoch 8/20


Training: 100%|██████████| 92/92 [00:18<00:00,  4.94it/s, loss=0.00372]


Average training loss: 0.0166


Evaluating: 100%|██████████| 23/23 [00:01<00:00, 18.39it/s]


Validation Metrics: Precision=0.7865, Recall=0.8140, F1=0.8000, Exact Match= 0.7057

Epoch 9/20


Training: 100%|██████████| 92/92 [00:18<00:00,  4.94it/s, loss=0.00173] 


Average training loss: 0.0145


Evaluating: 100%|██████████| 23/23 [00:01<00:00, 18.36it/s]


Validation Metrics: Precision=0.7877, Recall=0.8696, F1=0.8266, Exact Match= 0.6894

Epoch 10/20


Training: 100%|██████████| 92/92 [00:18<00:00,  4.94it/s, loss=0.00503] 


Average training loss: 0.0167


Evaluating: 100%|██████████| 23/23 [00:01<00:00, 18.36it/s]


Validation Metrics: Precision=0.7832, Recall=0.8466, F1=0.8137, Exact Match= 0.7248

Epoch 11/20


Training: 100%|██████████| 92/92 [00:18<00:00,  4.94it/s, loss=0.0324]  


Average training loss: 0.0079


Evaluating: 100%|██████████| 23/23 [00:01<00:00, 18.38it/s]


Validation Metrics: Precision=0.7800, Recall=0.8563, F1=0.8164, Exact Match= 0.7248

Epoch 12/20


Training: 100%|██████████| 92/92 [00:18<00:00,  4.94it/s, loss=0.000427]


Average training loss: 0.0061


Evaluating: 100%|██████████| 23/23 [00:01<00:00, 18.34it/s]


New best F1: 0.8390 - Saving model
Validation Metrics: Precision=0.8410, Recall=0.8370, F1=0.8390, Exact Match= 0.7602

Epoch 13/20


Training: 100%|██████████| 92/92 [00:18<00:00,  4.94it/s, loss=5.28e-5] 


Average training loss: 0.0025


Evaluating: 100%|██████████| 23/23 [00:01<00:00, 18.35it/s]


Validation Metrics: Precision=0.8202, Recall=0.8430, F1=0.8314, Exact Match= 0.7548

Epoch 14/20


Training: 100%|██████████| 92/92 [00:18<00:00,  4.94it/s, loss=0.000149]


Average training loss: 0.0022


Evaluating: 100%|██████████| 23/23 [00:01<00:00, 18.33it/s]


New best F1: 0.8398 - Saving model
Validation Metrics: Precision=0.8195, Recall=0.8611, F1=0.8398, Exact Match= 0.7493

Epoch 15/20


Training: 100%|██████████| 92/92 [00:18<00:00,  4.94it/s, loss=5.53e-5] 


Average training loss: 0.0005


Evaluating: 100%|██████████| 23/23 [00:01<00:00, 18.35it/s]


New best F1: 0.8424 - Saving model
Validation Metrics: Precision=0.8211, Recall=0.8647, F1=0.8424, Exact Match= 0.7684

Epoch 16/20


Training: 100%|██████████| 92/92 [00:18<00:00,  4.94it/s, loss=0.000129]


Average training loss: 0.0005


Evaluating: 100%|██████████| 23/23 [00:01<00:00, 18.34it/s]


Validation Metrics: Precision=0.8202, Recall=0.8647, F1=0.8419, Exact Match= 0.7602

Epoch 17/20


Training: 100%|██████████| 92/92 [00:18<00:00,  4.94it/s, loss=0.000358]


Average training loss: 0.0006


Evaluating: 100%|██████████| 23/23 [00:01<00:00, 18.35it/s]


Validation Metrics: Precision=0.8202, Recall=0.8539, F1=0.8367, Exact Match= 0.7575

Epoch 18/20


Training: 100%|██████████| 92/92 [00:18<00:00,  4.94it/s, loss=9.94e-5] 


Average training loss: 0.0006


Evaluating: 100%|██████████| 23/23 [00:01<00:00, 18.36it/s]


Validation Metrics: Precision=0.8154, Recall=0.8587, F1=0.8365, Exact Match= 0.7602

Epoch 19/20


Training: 100%|██████████| 92/92 [00:18<00:00,  4.94it/s, loss=7.28e-5] 


Average training loss: 0.0005


Evaluating: 100%|██████████| 23/23 [00:01<00:00, 18.38it/s]


Validation Metrics: Precision=0.8152, Recall=0.8575, F1=0.8358, Exact Match= 0.7602

Epoch 20/20


Training: 100%|██████████| 92/92 [00:18<00:00,  4.94it/s, loss=4.33e-5] 


Average training loss: 0.0003


Evaluating: 100%|██████████| 23/23 [00:01<00:00, 18.38it/s]


Validation Metrics: Precision=0.8191, Recall=0.8587, F1=0.8384, Exact Match= 0.7602

Evaluating bert_lr0.0001_bs16_ep20 on test set...


Evaluating: 100%|██████████| 32/32 [00:01<00:00, 18.42it/s]


Metrics: Precision=0.8945, Recall=0.9023, F1=0.8984, Exact Match=0.8055

Training model: xlm-roberta-base with params: {'learning_rate': 0.0001, 'batch_size': 16, 'epochs': 20}


Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Preparing train dataset...
Preparing test dataset...
Preparing dev dataset...
Training on cuda

Epoch 1/20


Training: 100%|██████████| 92/92 [00:20<00:00,  4.39it/s, loss=0.0648]


Average training loss: 0.0592


Evaluating: 100%|██████████| 23/23 [00:01<00:00, 18.21it/s]


Validation Metrics: Precision=0.8045, Recall=0.9033, F1=0.8510, Exact Match= 0.7221

Epoch 6/20


Training: 100%|██████████| 92/92 [00:20<00:00,  4.38it/s, loss=0.0751] 


Average training loss: 0.0518


Evaluating: 100%|██████████| 23/23 [00:01<00:00, 18.17it/s]


Validation Metrics: Precision=0.8021, Recall=0.8288, F1=0.8153, Exact Match= 0.6921

Epoch 7/20


Training: 100%|██████████| 92/92 [00:21<00:00,  4.38it/s, loss=0.0458] 


Average training loss: 0.0459


Evaluating: 100%|██████████| 23/23 [00:01<00:00, 18.28it/s]


Validation Metrics: Precision=0.8068, Recall=0.9043, F1=0.8528, Exact Match= 0.7357

Epoch 8/20


Training: 100%|██████████| 92/92 [00:21<00:00,  4.38it/s, loss=0.0283] 


Average training loss: 0.0372


Evaluating: 100%|██████████| 23/23 [00:01<00:00, 18.19it/s]


New best F1: 0.8912 - Saving model
Validation Metrics: Precision=0.8646, Recall=0.9194, F1=0.8912, Exact Match= 0.7738

Epoch 9/20


Training: 100%|██████████| 92/92 [00:21<00:00,  4.37it/s, loss=0.0211] 


Average training loss: 0.0316


Evaluating: 100%|██████████| 23/23 [00:01<00:00, 18.20it/s]


Validation Metrics: Precision=0.8610, Recall=0.8983, F1=0.8793, Exact Match= 0.7657

Epoch 10/20


Training: 100%|██████████| 92/92 [00:21<00:00,  4.38it/s, loss=0.000921]


Average training loss: 0.0157


Evaluating: 100%|██████████| 23/23 [00:01<00:00, 18.12it/s]


New best F1: 0.8948 - Saving model
Validation Metrics: Precision=0.8742, Recall=0.9164, F1=0.8948, Exact Match= 0.7766

Epoch 11/20


Training: 100%|██████████| 92/92 [00:21<00:00,  4.37it/s, loss=0.0197]  


Average training loss: 0.0128


Evaluating: 100%|██████████| 23/23 [00:01<00:00, 18.09it/s]


New best F1: 0.8990 - Saving model
Validation Metrics: Precision=0.8888, Recall=0.9094, F1=0.8990, Exact Match= 0.8120

Epoch 12/20


Training: 100%|██████████| 92/92 [00:21<00:00,  4.37it/s, loss=0.00161] 


Average training loss: 0.0048


Evaluating: 100%|██████████| 23/23 [00:01<00:00, 18.16it/s]


New best F1: 0.8991 - Saving model
Validation Metrics: Precision=0.9009, Recall=0.8973, F1=0.8991, Exact Match= 0.8202

Epoch 13/20


Training: 100%|██████████| 92/92 [00:21<00:00,  4.37it/s, loss=0.00277] 


Average training loss: 0.0030


Evaluating: 100%|██████████| 23/23 [00:01<00:00, 18.15it/s]


New best F1: 0.9003 - Saving model
Validation Metrics: Precision=0.9003, Recall=0.9003, F1=0.9003, Exact Match= 0.8065

Epoch 14/20


Training: 100%|██████████| 92/92 [00:21<00:00,  4.37it/s, loss=0.00118] 


Average training loss: 0.0025


Evaluating: 100%|██████████| 23/23 [00:01<00:00, 18.13it/s]


New best F1: 0.9109 - Saving model
Validation Metrics: Precision=0.8958, Recall=0.9265, F1=0.9109, Exact Match= 0.8229

Epoch 15/20


Training: 100%|██████████| 92/92 [00:21<00:00,  4.37it/s, loss=0.000277]


Average training loss: 0.0016


Evaluating: 100%|██████████| 23/23 [00:01<00:00, 18.14it/s]


New best F1: 0.9135 - Saving model
Validation Metrics: Precision=0.9067, Recall=0.9204, F1=0.9135, Exact Match= 0.8174

Epoch 16/20


Training: 100%|██████████| 92/92 [00:21<00:00,  4.37it/s, loss=0.00021] 


Average training loss: 0.0015


Evaluating: 100%|██████████| 23/23 [00:01<00:00, 18.11it/s]


Validation Metrics: Precision=0.9014, Recall=0.9114, F1=0.9064, Exact Match= 0.8147

Epoch 17/20


Training: 100%|██████████| 92/92 [00:21<00:00,  4.37it/s, loss=0.000166]


Average training loss: 0.0014


Evaluating: 100%|██████████| 23/23 [00:01<00:00, 18.05it/s]


Validation Metrics: Precision=0.8936, Recall=0.9215, F1=0.9073, Exact Match= 0.8202

Epoch 18/20


Training: 100%|██████████| 92/92 [00:21<00:00,  4.37it/s, loss=0.000115]


Average training loss: 0.0008


Evaluating: 100%|██████████| 23/23 [00:01<00:00, 18.12it/s]


Validation Metrics: Precision=0.9023, Recall=0.9204, F1=0.9113, Exact Match= 0.8202

Epoch 19/20


Training: 100%|██████████| 92/92 [00:21<00:00,  4.37it/s, loss=0.000334]


Average training loss: 0.0006


Evaluating: 100%|██████████| 23/23 [00:01<00:00, 18.13it/s]


Validation Metrics: Precision=0.9001, Recall=0.9164, F1=0.9082, Exact Match= 0.8229

Epoch 20/20


Training: 100%|██████████| 92/92 [00:21<00:00,  4.37it/s, loss=0.000287]


Average training loss: 0.0010


Evaluating: 100%|██████████| 23/23 [00:01<00:00, 18.13it/s]


Validation Metrics: Precision=0.8949, Recall=0.9174, F1=0.9060, Exact Match= 0.8174

Evaluating xlm-roberta_lr0.0001_bs16_ep20 on test set...


Evaluating: 100%|██████████| 32/32 [00:01<00:00, 18.15it/s]


Metrics: Precision=0.9205, Recall=0.9253, F1=0.9229, Exact Match=0.8134

Training model: cahya/xlm-roberta-base-indonesian-NER with params: {'learning_rate': 0.0001, 'batch_size': 16, 'epochs': 20}


Some weights of the model checkpoint at cahya/xlm-roberta-base-indonesian-NER were not used when initializing XLMRobertaForTokenClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing XLMRobertaForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at cahya/xlm-roberta-base-indonesian-NER and are newly initialized because the shapes did not match:
- classifier.bias: found shape torch.Size([39]) in the checkpoint and torch.Size([7]) in the model 

Preparing train dataset...
Preparing test dataset...
Preparing dev dataset...
Training on cuda

Epoch 1/20


Training: 100%|██████████| 92/92 [00:21<00:00,  4.37it/s, loss=0.166]


Average training loss: 0.6785


Evaluating: 100%|██████████| 23/23 [00:01<00:00, 18.13it/s]


New best F1: 0.7187 - Saving model
Validation Metrics: Precision=0.6888, Recall=0.7513, F1=0.7187, Exact Match= 0.6104

Epoch 2/20


Training: 100%|██████████| 92/92 [00:21<00:00,  4.37it/s, loss=0.184] 


Average training loss: 0.1331


Evaluating: 100%|██████████| 23/23 [00:01<00:00, 18.13it/s]


New best F1: 0.8617 - Saving model
Validation Metrics: Precision=0.8368, Recall=0.8882, F1=0.8617, Exact Match= 0.7330

Epoch 3/20


Training: 100%|██████████| 92/92 [00:21<00:00,  4.37it/s, loss=0.0553] 


Average training loss: 0.0710


Evaluating: 100%|██████████| 23/23 [00:01<00:00, 18.13it/s]


New best F1: 0.8791 - Saving model
Validation Metrics: Precision=0.8490, Recall=0.9114, F1=0.8791, Exact Match= 0.7548

Epoch 4/20


Training: 100%|██████████| 92/92 [00:21<00:00,  4.37it/s, loss=0.0668] 


Average training loss: 0.0537


Evaluating: 100%|██████████| 23/23 [00:01<00:00, 18.10it/s]


Validation Metrics: Precision=0.8512, Recall=0.8872, F1=0.8688, Exact Match= 0.7711

Epoch 5/20


Training: 100%|██████████| 92/92 [00:21<00:00,  4.37it/s, loss=0.036] 


Average training loss: 0.0654


Evaluating: 100%|██████████| 23/23 [00:01<00:00, 18.12it/s]


Validation Metrics: Precision=0.8343, Recall=0.9074, F1=0.8693, Exact Match= 0.7302

Epoch 6/20


Training: 100%|██████████| 92/92 [00:21<00:00,  4.37it/s, loss=0.0662] 


Average training loss: 0.0768


Evaluating: 100%|██████████| 23/23 [00:01<00:00, 18.11it/s]


Validation Metrics: Precision=0.8552, Recall=0.8983, F1=0.8762, Exact Match= 0.7684

Epoch 7/20


Training: 100%|██████████| 92/92 [00:21<00:00,  4.37it/s, loss=0.0038] 


Average training loss: 0.0612


Evaluating: 100%|██████████| 23/23 [00:01<00:00, 18.07it/s]


New best F1: 0.8830 - Saving model
Validation Metrics: Precision=0.8663, Recall=0.9003, F1=0.8830, Exact Match= 0.7629

Epoch 8/20


Training: 100%|██████████| 92/92 [00:21<00:00,  4.37it/s, loss=0.00925]


Average training loss: 0.0354


Evaluating: 100%|██████████| 23/23 [00:01<00:00, 18.13it/s]


New best F1: 0.8957 - Saving model
Validation Metrics: Precision=0.8881, Recall=0.9033, F1=0.8957, Exact Match= 0.7902

Epoch 9/20


Training: 100%|██████████| 92/92 [00:21<00:00,  4.37it/s, loss=0.00964]


Average training loss: 0.0205


Evaluating: 100%|██████████| 23/23 [00:01<00:00, 18.10it/s]


Validation Metrics: Precision=0.8505, Recall=0.9164, F1=0.8822, Exact Match= 0.7711

Epoch 10/20


Training: 100%|██████████| 92/92 [00:21<00:00,  4.37it/s, loss=0.0256]  


Average training loss: 0.0132


Evaluating: 100%|██████████| 23/23 [00:01<00:00, 18.09it/s]


New best F1: 0.9050 - Saving model
Validation Metrics: Precision=0.8764, Recall=0.9355, F1=0.9050, Exact Match= 0.7984

Epoch 11/20


Training: 100%|██████████| 92/92 [00:21<00:00,  4.36it/s, loss=0.0322]  


Average training loss: 0.0119


Evaluating: 100%|██████████| 23/23 [00:01<00:00, 18.13it/s]


New best F1: 0.9083 - Saving model
Validation Metrics: Precision=0.8899, Recall=0.9275, F1=0.9083, Exact Match= 0.8147

Epoch 12/20


Training: 100%|██████████| 92/92 [00:21<00:00,  4.37it/s, loss=0.00296] 


Average training loss: 0.0057


Evaluating: 100%|██████████| 23/23 [00:01<00:00, 18.10it/s]


New best F1: 0.9141 - Saving model
Validation Metrics: Precision=0.9020, Recall=0.9265, F1=0.9141, Exact Match= 0.8174

Epoch 13/20


Training: 100%|██████████| 92/92 [00:21<00:00,  4.37it/s, loss=0.000237]


Average training loss: 0.0075


Evaluating: 100%|██████████| 23/23 [00:01<00:00, 18.11it/s]


Validation Metrics: Precision=0.8977, Recall=0.9275, F1=0.9123, Exact Match= 0.8256

Epoch 14/20


Training: 100%|██████████| 92/92 [00:21<00:00,  4.37it/s, loss=0.00439] 


Average training loss: 0.0063


Evaluating: 100%|██████████| 23/23 [00:01<00:00, 18.10it/s]


Validation Metrics: Precision=0.8808, Recall=0.9154, F1=0.8978, Exact Match= 0.8120

Epoch 15/20


Training: 100%|██████████| 92/92 [00:21<00:00,  4.37it/s, loss=0.00196] 


Average training loss: 0.0044


Evaluating: 100%|██████████| 23/23 [00:01<00:00, 18.08it/s]


New best F1: 0.9165 - Saving model
Validation Metrics: Precision=0.9048, Recall=0.9285, F1=0.9165, Exact Match= 0.8311

Epoch 16/20


Training: 100%|██████████| 92/92 [00:21<00:00,  4.37it/s, loss=0.00097] 


Average training loss: 0.0035


Evaluating: 100%|██████████| 23/23 [00:01<00:00, 18.12it/s]


Validation Metrics: Precision=0.8930, Recall=0.9325, F1=0.9123, Exact Match= 0.8174

Epoch 17/20


Training: 100%|██████████| 92/92 [00:21<00:00,  4.37it/s, loss=0.000228]


Average training loss: 0.0020


Evaluating: 100%|██████████| 23/23 [00:01<00:00, 18.15it/s]


Validation Metrics: Precision=0.8943, Recall=0.9285, F1=0.9111, Exact Match= 0.8229

Epoch 18/20


Training: 100%|██████████| 92/92 [00:21<00:00,  4.37it/s, loss=0.00182] 


Average training loss: 0.0016


Evaluating: 100%|██████████| 23/23 [00:01<00:00, 18.10it/s]


Validation Metrics: Precision=0.9010, Recall=0.9255, F1=0.9131, Exact Match= 0.8202

Epoch 19/20


Training: 100%|██████████| 92/92 [00:21<00:00,  4.37it/s, loss=0.0003]  


Average training loss: 0.0010


Evaluating: 100%|██████████| 23/23 [00:01<00:00, 17.98it/s]


New best F1: 0.9190 - Saving model
Validation Metrics: Precision=0.9126, Recall=0.9255, F1=0.9190, Exact Match= 0.8338

Epoch 20/20


Training: 100%|██████████| 92/92 [00:21<00:00,  4.37it/s, loss=0.000258]


Average training loss: 0.0014


Evaluating: 100%|██████████| 23/23 [00:01<00:00, 18.16it/s]


New best F1: 0.9221 - Saving model
Validation Metrics: Precision=0.9148, Recall=0.9295, F1=0.9221, Exact Match= 0.8338

Evaluating xlm-roberta_lr0.0001_bs16_ep20 on test set...


Evaluating: 100%|██████████| 32/32 [00:01<00:00, 18.16it/s]


Metrics: Precision=0.9146, Recall=0.9281, F1=0.9213, Exact Match=0.8193


Training model: indobenchmark/indobert-base-p1 with params: {'learning_rate': 0.0001, 'batch_size': 32, 'epochs': 3}


Some weights of BertForTokenClassification were not initialized from the model checkpoint at indobenchmark/indobert-base-p1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Preparing train dataset...
Preparing test dataset...
Preparing dev dataset...
Training on cuda

Epoch 1/3


Training: 100%|██████████| 46/46 [00:17<00:00,  2.60it/s, loss=0.375]


Average training loss: 1.0620


Evaluating: 100%|██████████| 12/12 [00:01<00:00,  9.65it/s]


New best F1: 0.1995 - Saving model
Validation Metrics: Precision=0.3391, Recall=0.1413, F1=0.1995, Exact Match= 0.2916

Epoch 2/3


Training: 100%|██████████| 46/46 [00:17<00:00,  2.62it/s, loss=0.118]


Average training loss: 0.2848


Evaluating: 100%|██████████| 12/12 [00:01<00:00,  9.66it/s]


New best F1: 0.6216 - Saving model
Validation Metrics: Precision=0.6053, Recall=0.6389, F1=0.6216, Exact Match= 0.5341

Epoch 3/3


Training: 100%|██████████| 46/46 [00:17<00:00,  2.62it/s, loss=0.133] 


Average training loss: 0.1242


Evaluating: 100%|██████████| 12/12 [00:01<00:00,  9.64it/s]


New best F1: 0.7916 - Saving model
Validation Metrics: Precision=0.7620, Recall=0.8237, F1=0.7916, Exact Match= 0.7084

Evaluating bert_lr0.0001_bs32_ep3 on test set...


Evaluating: 100%|██████████| 16/16 [00:01<00:00,  9.27it/s]


Metrics: Precision=0.8668, Recall=0.8712, F1=0.8690, Exact Match=0.7191

Training model: xlm-roberta-base with params: {'learning_rate': 0.0001, 'batch_size': 32, 'epochs': 3}


Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Preparing train dataset...
Preparing test dataset...
Preparing dev dataset...
Training on cuda

Epoch 1/3


Training: 100%|██████████| 46/46 [00:18<00:00,  2.45it/s, loss=0.772]


Average training loss: 1.6374


Evaluating: 100%|██████████| 12/12 [00:01<00:00,  9.63it/s]


Validation Metrics: Precision=0.0000, Recall=0.0000, F1=0.0000, Exact Match= 0.3106

Epoch 2/3


Training: 100%|██████████| 46/46 [00:18<00:00,  2.46it/s, loss=0.306]


Average training loss: 0.4787


Evaluating: 100%|██████████| 12/12 [00:01<00:00,  9.64it/s]


New best F1: 0.4459 - Saving model
Validation Metrics: Precision=0.4498, Recall=0.4421, F1=0.4459, Exact Match= 0.3869

Epoch 3/3


Training: 100%|██████████| 46/46 [00:18<00:00,  2.45it/s, loss=0.144] 


Average training loss: 0.1992


Evaluating: 100%|██████████| 12/12 [00:01<00:00,  9.61it/s]


New best F1: 0.7497 - Saving model
Validation Metrics: Precision=0.7453, Recall=0.7543, F1=0.7497, Exact Match= 0.6267

Evaluating xlm-roberta_lr0.0001_bs32_ep3 on test set...


Evaluating: 100%|██████████| 16/16 [00:01<00:00,  9.22it/s]


Metrics: Precision=0.8021, Recall=0.8323, F1=0.8169, Exact Match=0.6346

Training model: cahya/xlm-roberta-base-indonesian-NER with params: {'learning_rate': 0.0001, 'batch_size': 32, 'epochs': 3}


Some weights of the model checkpoint at cahya/xlm-roberta-base-indonesian-NER were not used when initializing XLMRobertaForTokenClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing XLMRobertaForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at cahya/xlm-roberta-base-indonesian-NER and are newly initialized because the shapes did not match:
- classifier.bias: found shape torch.Size([39]) in the checkpoint and torch.Size([7]) in the model 

Preparing train dataset...
Preparing test dataset...
Preparing dev dataset...
Training on cuda

Epoch 1/3


Training: 100%|██████████| 46/46 [00:18<00:00,  2.45it/s, loss=0.401]


Average training loss: 1.1183


Evaluating: 100%|██████████| 12/12 [00:01<00:00,  9.64it/s]


New best F1: 0.3426 - Saving model
Validation Metrics: Precision=0.7441, Recall=0.2226, F1=0.3426, Exact Match= 0.3488

Epoch 2/3


Training: 100%|██████████| 46/46 [00:18<00:00,  2.45it/s, loss=0.163]


Average training loss: 0.2606


Evaluating: 100%|██████████| 12/12 [00:01<00:00,  9.65it/s]


New best F1: 0.6919 - Saving model
Validation Metrics: Precision=0.6600, Recall=0.7271, F1=0.6919, Exact Match= 0.5940

Epoch 3/3


Training: 100%|██████████| 46/46 [00:18<00:00,  2.45it/s, loss=0.159] 


Average training loss: 0.1220


Evaluating: 100%|██████████| 12/12 [00:01<00:00,  9.65it/s]


New best F1: 0.8521 - Saving model
Validation Metrics: Precision=0.8573, Recall=0.8469, F1=0.8521, Exact Match= 0.7657

Evaluating xlm-roberta_lr0.0001_bs32_ep3 on test set...


Evaluating: 100%|██████████| 16/16 [00:01<00:00,  9.19it/s]


Metrics: Precision=0.8950, Recall=0.8853, F1=0.8902, Exact Match=0.7446


Training model: indobenchmark/indobert-base-p1 with params: {'learning_rate': 0.0001, 'batch_size': 32, 'epochs': 5}


Some weights of BertForTokenClassification were not initialized from the model checkpoint at indobenchmark/indobert-base-p1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Preparing train dataset...
Preparing test dataset...
Preparing dev dataset...
Training on cuda

Epoch 1/5


Training: 100%|██████████| 46/46 [00:17<00:00,  2.62it/s, loss=0.534]


Average training loss: 1.1982


Evaluating: 100%|██████████| 12/12 [00:01<00:00,  9.68it/s]


New best F1: 0.1427 - Saving model
Validation Metrics: Precision=0.3541, Recall=0.0894, F1=0.1427, Exact Match= 0.2997

Epoch 2/5


Training: 100%|██████████| 46/46 [00:17<00:00,  2.62it/s, loss=0.244]


Average training loss: 0.2993


Evaluating: 100%|██████████| 12/12 [00:01<00:00,  9.66it/s]


New best F1: 0.6067 - Saving model
Validation Metrics: Precision=0.5837, Recall=0.6316, F1=0.6067, Exact Match= 0.5422

Epoch 3/5


Training: 100%|██████████| 46/46 [00:17<00:00,  2.62it/s, loss=0.0666]


Average training loss: 0.1260


Evaluating: 100%|██████████| 12/12 [00:01<00:00,  9.64it/s]


New best F1: 0.7679 - Saving model
Validation Metrics: Precision=0.7854, Recall=0.7512, F1=0.7679, Exact Match= 0.6621

Epoch 4/5


Training: 100%|██████████| 46/46 [00:17<00:00,  2.62it/s, loss=0.0653]


Average training loss: 0.0590


Evaluating: 100%|██████████| 12/12 [00:01<00:00,  9.64it/s]


New best F1: 0.8379 - Saving model
Validation Metrics: Precision=0.8127, Recall=0.8647, F1=0.8379, Exact Match= 0.7629

Epoch 5/5


Training: 100%|██████████| 46/46 [00:17<00:00,  2.62it/s, loss=0.00474]


Average training loss: 0.0313


Evaluating: 100%|██████████| 12/12 [00:01<00:00,  9.67it/s]


Validation Metrics: Precision=0.7996, Recall=0.8671, F1=0.8320, Exact Match= 0.7275

Evaluating bert_lr0.0001_bs32_ep5 on test set...


Evaluating: 100%|██████████| 16/16 [00:01<00:00,  9.26it/s]


Metrics: Precision=0.9084, Recall=0.9327, F1=0.9204, Exact Match=0.8075
New best model saved for bert at models/best_bert.pt

Training model: xlm-roberta-base with params: {'learning_rate': 0.0001, 'batch_size': 32, 'epochs': 5}


Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Preparing train dataset...
Preparing test dataset...
Preparing dev dataset...
Training on cuda

Epoch 1/5


Training: 100%|██████████| 46/46 [00:18<00:00,  2.45it/s, loss=0.519]


Average training loss: 1.5320


Evaluating: 100%|██████████| 12/12 [00:01<00:00,  9.63it/s]


Validation Metrics: Precision=0.0000, Recall=0.0000, F1=0.0000, Exact Match= 0.3106

Epoch 2/5


Training: 100%|██████████| 46/46 [00:18<00:00,  2.46it/s, loss=0.243]


Average training loss: 0.4971


Evaluating: 100%|██████████| 12/12 [00:01<00:00,  9.65it/s]


New best F1: 0.3713 - Saving model
Validation Metrics: Precision=0.3839, Recall=0.3595, F1=0.3713, Exact Match= 0.3787

Epoch 3/5


Training: 100%|██████████| 46/46 [00:18<00:00,  2.45it/s, loss=0.119]


Average training loss: 0.2305


Evaluating: 100%|██████████| 12/12 [00:01<00:00,  9.64it/s]


New best F1: 0.6819 - Saving model
Validation Metrics: Precision=0.6343, Recall=0.7372, F1=0.6819, Exact Match= 0.5531

Epoch 4/5


Training: 100%|██████████| 46/46 [00:18<00:00,  2.46it/s, loss=0.102] 


Average training loss: 0.1120


Evaluating: 100%|██████████| 12/12 [00:01<00:00,  9.63it/s]


New best F1: 0.8262 - Saving model
Validation Metrics: Precision=0.7976, Recall=0.8570, F1=0.8262, Exact Match= 0.7003

Epoch 5/5


Training: 100%|██████████| 46/46 [00:18<00:00,  2.45it/s, loss=0.0764]


Average training loss: 0.0646


Evaluating: 100%|██████████| 12/12 [00:01<00:00,  9.64it/s]


New best F1: 0.8573 - Saving model
Validation Metrics: Precision=0.8439, Recall=0.8711, F1=0.8573, Exact Match= 0.7302

Evaluating xlm-roberta_lr0.0001_bs32_ep5 on test set...


Evaluating: 100%|██████████| 16/16 [00:01<00:00,  9.22it/s]


Metrics: Precision=0.9087, Recall=0.9087, F1=0.9087, Exact Match=0.7917

Training model: cahya/xlm-roberta-base-indonesian-NER with params: {'learning_rate': 0.0001, 'batch_size': 32, 'epochs': 5}


Some weights of the model checkpoint at cahya/xlm-roberta-base-indonesian-NER were not used when initializing XLMRobertaForTokenClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing XLMRobertaForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at cahya/xlm-roberta-base-indonesian-NER and are newly initialized because the shapes did not match:
- classifier.bias: found shape torch.Size([39]) in the checkpoint and torch.Size([7]) in the model 

Preparing train dataset...
Preparing test dataset...
Preparing dev dataset...
Training on cuda

Epoch 1/5


Training: 100%|██████████| 46/46 [00:18<00:00,  2.46it/s, loss=0.473]


Average training loss: 1.2220


Evaluating: 100%|██████████| 12/12 [00:01<00:00,  9.64it/s]


New best F1: 0.3590 - Saving model
Validation Metrics: Precision=0.4069, Recall=0.3212, F1=0.3590, Exact Match= 0.3869

Epoch 2/5


Training: 100%|██████████| 46/46 [00:18<00:00,  2.45it/s, loss=0.14] 


Average training loss: 0.2624


Evaluating: 100%|██████████| 12/12 [00:01<00:00,  9.65it/s]


New best F1: 0.6736 - Saving model
Validation Metrics: Precision=0.6351, Recall=0.7170, F1=0.6736, Exact Match= 0.5477

Epoch 3/5


Training: 100%|██████████| 46/46 [00:18<00:00,  2.46it/s, loss=0.0846]


Average training loss: 0.1241


Evaluating: 100%|██████████| 12/12 [00:01<00:00,  9.63it/s]


New best F1: 0.8412 - Saving model
Validation Metrics: Precision=0.8268, Recall=0.8560, F1=0.8412, Exact Match= 0.7357

Epoch 4/5


Training: 100%|██████████| 46/46 [00:18<00:00,  2.45it/s, loss=0.064] 


Average training loss: 0.0731


Evaluating: 100%|██████████| 12/12 [00:01<00:00,  9.65it/s]


New best F1: 0.8819 - Saving model
Validation Metrics: Precision=0.8805, Recall=0.8832, F1=0.8819, Exact Match= 0.7875

Epoch 5/5


Training: 100%|██████████| 46/46 [00:18<00:00,  2.46it/s, loss=0.0372]


Average training loss: 0.0498


Evaluating: 100%|██████████| 12/12 [00:01<00:00,  9.64it/s]


New best F1: 0.8966 - Saving model
Validation Metrics: Precision=0.8813, Recall=0.9124, F1=0.8966, Exact Match= 0.7847

Evaluating xlm-roberta_lr0.0001_bs32_ep5 on test set...


Evaluating: 100%|██████████| 16/16 [00:01<00:00,  9.16it/s]


Metrics: Precision=0.9223, Recall=0.9213, F1=0.9218, Exact Match=0.8212


Training model: indobenchmark/indobert-base-p1 with params: {'learning_rate': 0.0001, 'batch_size': 32, 'epochs': 10}


Some weights of BertForTokenClassification were not initialized from the model checkpoint at indobenchmark/indobert-base-p1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Preparing train dataset...
Preparing test dataset...
Preparing dev dataset...
Training on cuda

Epoch 1/10


Training: 100%|██████████| 46/46 [00:17<00:00,  2.62it/s, loss=0.45] 


Average training loss: 1.1472


Evaluating: 100%|██████████| 12/12 [00:01<00:00,  9.68it/s]


New best F1: 0.1908 - Saving model
Validation Metrics: Precision=0.3746, Recall=0.1280, F1=0.1908, Exact Match= 0.2970

Epoch 2/10


Training: 100%|██████████| 46/46 [00:17<00:00,  2.62it/s, loss=0.216]


Average training loss: 0.3020


Evaluating: 100%|██████████| 12/12 [00:01<00:00,  9.65it/s]


New best F1: 0.5740 - Saving model
Validation Metrics: Precision=0.5320, Recall=0.6232, F1=0.5740, Exact Match= 0.4959

Epoch 3/10


Training: 100%|██████████| 46/46 [00:17<00:00,  2.62it/s, loss=0.15]  


Average training loss: 0.1290


Evaluating: 100%|██████████| 12/12 [00:01<00:00,  9.66it/s]


New best F1: 0.7814 - Saving model
Validation Metrics: Precision=0.7462, Recall=0.8200, F1=0.7814, Exact Match= 0.6839

Epoch 4/10


Training: 100%|██████████| 46/46 [00:17<00:00,  2.62it/s, loss=0.0445]


Average training loss: 0.0642


Evaluating: 100%|██████████| 12/12 [00:01<00:00,  9.65it/s]


New best F1: 0.8230 - Saving model
Validation Metrics: Precision=0.7851, Recall=0.8647, F1=0.8230, Exact Match= 0.7193

Epoch 5/10


Training: 100%|██████████| 46/46 [00:17<00:00,  2.62it/s, loss=0.0136] 


Average training loss: 0.0353


Evaluating: 100%|██████████| 12/12 [00:01<00:00,  9.66it/s]


Validation Metrics: Precision=0.7822, Recall=0.8587, F1=0.8187, Exact Match= 0.6866

Epoch 6/10


Training: 100%|██████████| 46/46 [00:17<00:00,  2.62it/s, loss=0.0242] 


Average training loss: 0.0236


Evaluating: 100%|██████████| 12/12 [00:01<00:00,  9.67it/s]


New best F1: 0.8393 - Saving model
Validation Metrics: Precision=0.8131, Recall=0.8671, F1=0.8393, Exact Match= 0.7575

Epoch 7/10


Training: 100%|██████████| 46/46 [00:17<00:00,  2.62it/s, loss=0.0446] 


Average training loss: 0.0125


Evaluating: 100%|██████████| 12/12 [00:01<00:00,  9.70it/s]


Validation Metrics: Precision=0.8119, Recall=0.8599, F1=0.8352, Exact Match= 0.7629

Epoch 8/10


Training: 100%|██████████| 46/46 [00:17<00:00,  2.62it/s, loss=0.00546]


Average training loss: 0.0109


Evaluating: 100%|██████████| 12/12 [00:01<00:00,  9.69it/s]


Validation Metrics: Precision=0.8020, Recall=0.8756, F1=0.8372, Exact Match= 0.7548

Epoch 9/10


Training: 100%|██████████| 46/46 [00:17<00:00,  2.62it/s, loss=0.00965]


Average training loss: 0.0109


Evaluating: 100%|██████████| 12/12 [00:01<00:00,  9.68it/s]


New best F1: 0.8472 - Saving model
Validation Metrics: Precision=0.8407, Recall=0.8539, F1=0.8472, Exact Match= 0.7357

Epoch 10/10


Training: 100%|██████████| 46/46 [00:17<00:00,  2.62it/s, loss=0.0198] 


Average training loss: 0.0175


Evaluating: 100%|██████████| 12/12 [00:01<00:00,  9.69it/s]


Validation Metrics: Precision=0.7871, Recall=0.7814, F1=0.7842, Exact Match= 0.6975

Evaluating bert_lr0.0001_bs32_ep10 on test set...


Evaluating: 100%|██████████| 16/16 [00:01<00:00,  9.25it/s]


Metrics: Precision=0.8797, Recall=0.8784, F1=0.8791, Exact Match=0.7485

Training model: xlm-roberta-base with params: {'learning_rate': 0.0001, 'batch_size': 32, 'epochs': 10}


Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Preparing train dataset...
Preparing test dataset...
Preparing dev dataset...
Training on cuda

Epoch 1/10


Training: 100%|██████████| 46/46 [00:18<00:00,  2.45it/s, loss=0.815]


Average training loss: 1.8504


Evaluating: 100%|██████████| 12/12 [00:01<00:00,  9.63it/s]


Validation Metrics: Precision=0.0000, Recall=0.0000, F1=0.0000, Exact Match= 0.3106

Epoch 2/10


Training: 100%|██████████| 46/46 [00:18<00:00,  2.45it/s, loss=0.337]


Average training loss: 0.4690


Evaluating: 100%|██████████| 12/12 [00:01<00:00,  9.65it/s]


New best F1: 0.3012 - Saving model
Validation Metrics: Precision=0.3194, Recall=0.2850, F1=0.3012, Exact Match= 0.3351

Epoch 3/10


Training: 100%|██████████| 46/46 [00:18<00:00,  2.46it/s, loss=0.211]


Average training loss: 0.2453


Evaluating: 100%|██████████| 12/12 [00:01<00:00,  9.64it/s]


New best F1: 0.6705 - Saving model
Validation Metrics: Precision=0.6410, Recall=0.7029, F1=0.6705, Exact Match= 0.5422

Epoch 4/10


Training: 100%|██████████| 46/46 [00:18<00:00,  2.45it/s, loss=0.0937]


Average training loss: 0.1106


Evaluating: 100%|██████████| 12/12 [00:01<00:00,  9.64it/s]


New best F1: 0.8395 - Saving model
Validation Metrics: Precision=0.8362, Recall=0.8429, F1=0.8395, Exact Match= 0.7084

Epoch 5/10


Training: 100%|██████████| 46/46 [00:18<00:00,  2.46it/s, loss=0.0425]


Average training loss: 0.0660


Evaluating: 100%|██████████| 12/12 [00:01<00:00,  9.64it/s]


New best F1: 0.8741 - Saving model
Validation Metrics: Precision=0.8780, Recall=0.8701, F1=0.8741, Exact Match= 0.7902

Epoch 6/10


Training: 100%|██████████| 46/46 [00:18<00:00,  2.45it/s, loss=0.0691] 


Average training loss: 0.0488


Evaluating: 100%|██████████| 12/12 [00:01<00:00,  9.65it/s]


New best F1: 0.8811 - Saving model
Validation Metrics: Precision=0.8852, Recall=0.8771, F1=0.8811, Exact Match= 0.7793

Epoch 7/10


Training: 100%|██████████| 46/46 [00:18<00:00,  2.46it/s, loss=0.0357] 


Average training loss: 0.0321


Evaluating: 100%|██████████| 12/12 [00:01<00:00,  9.62it/s]


New best F1: 0.8898 - Saving model
Validation Metrics: Precision=0.8814, Recall=0.8983, F1=0.8898, Exact Match= 0.7820

Epoch 8/10


Training: 100%|██████████| 46/46 [00:18<00:00,  2.45it/s, loss=0.0354] 


Average training loss: 0.0269


Evaluating: 100%|██████████| 12/12 [00:01<00:00,  9.65it/s]


New best F1: 0.8911 - Saving model
Validation Metrics: Precision=0.9098, Recall=0.8731, F1=0.8911, Exact Match= 0.7847

Epoch 9/10


Training: 100%|██████████| 46/46 [00:18<00:00,  2.45it/s, loss=0.0494] 


Average training loss: 0.0288


Evaluating: 100%|██████████| 12/12 [00:01<00:00,  9.65it/s]


New best F1: 0.8951 - Saving model
Validation Metrics: Precision=0.8880, Recall=0.9023, F1=0.8951, Exact Match= 0.7820

Epoch 10/10


Training: 100%|██████████| 46/46 [00:18<00:00,  2.45it/s, loss=0.0558] 


Average training loss: 0.0220


Evaluating: 100%|██████████| 12/12 [00:01<00:00,  9.65it/s]


Validation Metrics: Precision=0.8951, Recall=0.8761, F1=0.8855, Exact Match= 0.7738

Evaluating xlm-roberta_lr0.0001_bs32_ep10 on test set...


Evaluating: 100%|██████████| 16/16 [00:01<00:00,  9.26it/s]


Metrics: Precision=0.9397, Recall=0.9161, F1=0.9278, Exact Match=0.8330

Training model: cahya/xlm-roberta-base-indonesian-NER with params: {'learning_rate': 0.0001, 'batch_size': 32, 'epochs': 10}


Some weights of the model checkpoint at cahya/xlm-roberta-base-indonesian-NER were not used when initializing XLMRobertaForTokenClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing XLMRobertaForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at cahya/xlm-roberta-base-indonesian-NER and are newly initialized because the shapes did not match:
- classifier.bias: found shape torch.Size([39]) in the checkpoint and torch.Size([7]) in the model 

Preparing train dataset...
Preparing test dataset...
Preparing dev dataset...
Training on cuda

Epoch 1/10


Training: 100%|██████████| 46/46 [00:18<00:00,  2.46it/s, loss=0.311]


Average training loss: 0.9437


Evaluating: 100%|██████████| 12/12 [00:01<00:00,  9.55it/s]


New best F1: 0.3711 - Saving model
Validation Metrics: Precision=0.6755, Recall=0.2558, F1=0.3711, Exact Match= 0.3869

Epoch 2/10


Training: 100%|██████████| 46/46 [00:18<00:00,  2.45it/s, loss=0.118]


Average training loss: 0.2378


Evaluating: 100%|██████████| 12/12 [00:01<00:00,  9.66it/s]


New best F1: 0.7202 - Saving model
Validation Metrics: Precision=0.6924, Recall=0.7503, F1=0.7202, Exact Match= 0.5940

Epoch 3/10


Training: 100%|██████████| 46/46 [00:18<00:00,  2.45it/s, loss=0.0662]


Average training loss: 0.1169


Evaluating: 100%|██████████| 12/12 [00:01<00:00,  9.63it/s]


New best F1: 0.8584 - Saving model
Validation Metrics: Precision=0.8442, Recall=0.8731, F1=0.8584, Exact Match= 0.7302

Epoch 4/10


Training: 100%|██████████| 46/46 [00:18<00:00,  2.45it/s, loss=0.0375]


Average training loss: 0.0696


Evaluating: 100%|██████████| 12/12 [00:01<00:00,  9.66it/s]


New best F1: 0.8807 - Saving model
Validation Metrics: Precision=0.8557, Recall=0.9074, F1=0.8807, Exact Match= 0.7766

Epoch 5/10


Training: 100%|██████████| 46/46 [00:18<00:00,  2.45it/s, loss=0.0876]


Average training loss: 0.0503


Evaluating: 100%|██████████| 12/12 [00:01<00:00,  9.65it/s]


New best F1: 0.8874 - Saving model
Validation Metrics: Precision=0.8673, Recall=0.9084, F1=0.8874, Exact Match= 0.7875

Epoch 6/10


Training: 100%|██████████| 46/46 [00:18<00:00,  2.45it/s, loss=0.0287]


Average training loss: 0.0382


Evaluating: 100%|██████████| 12/12 [00:01<00:00,  9.65it/s]


New best F1: 0.8895 - Saving model
Validation Metrics: Precision=0.8642, Recall=0.9164, F1=0.8895, Exact Match= 0.7929

Epoch 7/10


Training: 100%|██████████| 46/46 [00:18<00:00,  2.45it/s, loss=0.0512] 


Average training loss: 0.0337


Evaluating: 100%|██████████| 12/12 [00:01<00:00,  9.61it/s]


New best F1: 0.8990 - Saving model
Validation Metrics: Precision=0.8803, Recall=0.9184, F1=0.8990, Exact Match= 0.7984

Epoch 8/10


Training: 100%|██████████| 46/46 [00:18<00:00,  2.45it/s, loss=0.00599]


Average training loss: 0.0259


Evaluating: 100%|██████████| 12/12 [00:01<00:00,  9.63it/s]


Validation Metrics: Precision=0.8217, Recall=0.9144, F1=0.8656, Exact Match= 0.7711

Epoch 9/10


Training: 100%|██████████| 46/46 [00:18<00:00,  2.46it/s, loss=0.0312] 


Average training loss: 0.0321


Evaluating: 100%|██████████| 12/12 [00:01<00:00,  9.64it/s]


Validation Metrics: Precision=0.8647, Recall=0.9013, F1=0.8826, Exact Match= 0.7875

Epoch 10/10


Training: 100%|██████████| 46/46 [00:18<00:00,  2.46it/s, loss=0.0307]


Average training loss: 0.0467


Evaluating: 100%|██████████| 12/12 [00:01<00:00,  9.70it/s]


Validation Metrics: Precision=0.8671, Recall=0.8933, F1=0.8800, Exact Match= 0.7793

Evaluating xlm-roberta_lr0.0001_bs32_ep10 on test set...


Evaluating: 100%|██████████| 16/16 [00:01<00:00,  9.13it/s]


Metrics: Precision=0.9100, Recall=0.9116, F1=0.9108, Exact Match=0.8035


Training model: indobenchmark/indobert-base-p1 with params: {'learning_rate': 0.0001, 'batch_size': 32, 'epochs': 20}


Some weights of BertForTokenClassification were not initialized from the model checkpoint at indobenchmark/indobert-base-p1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Preparing train dataset...
Preparing test dataset...
Preparing dev dataset...
Training on cuda

Epoch 1/20


Training: 100%|██████████| 46/46 [00:17<00:00,  2.62it/s, loss=0.442]


Average training loss: 1.0945


Evaluating: 100%|██████████| 12/12 [00:01<00:00,  9.65it/s]


New best F1: 0.2058 - Saving model
Validation Metrics: Precision=0.3550, Recall=0.1449, F1=0.2058, Exact Match= 0.3161

Epoch 2/20


Training: 100%|██████████| 46/46 [00:17<00:00,  2.62it/s, loss=0.165]


Average training loss: 0.3033


Evaluating: 100%|██████████| 12/12 [00:01<00:00,  9.64it/s]


New best F1: 0.6053 - Saving model
Validation Metrics: Precision=0.5951, Recall=0.6159, F1=0.6053, Exact Match= 0.5150

Epoch 3/20


Training: 100%|██████████| 46/46 [00:17<00:00,  2.62it/s, loss=0.108] 


Average training loss: 0.1286


Evaluating: 100%|██████████| 12/12 [00:01<00:00,  9.64it/s]


New best F1: 0.7669 - Saving model
Validation Metrics: Precision=0.7730, Recall=0.7609, F1=0.7669, Exact Match= 0.6757

Epoch 4/20


Training: 100%|██████████| 46/46 [00:17<00:00,  2.62it/s, loss=0.0808]


Average training loss: 0.0594


Evaluating: 100%|██████████| 12/12 [00:01<00:00,  9.64it/s]


New best F1: 0.7753 - Saving model
Validation Metrics: Precision=0.7929, Recall=0.7585, F1=0.7753, Exact Match= 0.7003

Epoch 5/20


Training: 100%|██████████| 46/46 [00:17<00:00,  2.62it/s, loss=0.0309]


Average training loss: 0.0336


Evaluating: 100%|██████████| 12/12 [00:01<00:00,  9.66it/s]


New best F1: 0.8478 - Saving model
Validation Metrics: Precision=0.8349, Recall=0.8611, F1=0.8478, Exact Match= 0.7575

Epoch 6/20


Training: 100%|██████████| 46/46 [00:17<00:00,  2.62it/s, loss=0.0187] 


Average training loss: 0.0241


Evaluating: 100%|██████████| 12/12 [00:01<00:00,  9.66it/s]


New best F1: 0.8510 - Saving model
Validation Metrics: Precision=0.8245, Recall=0.8792, F1=0.8510, Exact Match= 0.7493

Epoch 7/20


Training: 100%|██████████| 46/46 [00:17<00:00,  2.62it/s, loss=0.0109] 


Average training loss: 0.0188


Evaluating: 100%|██████████| 12/12 [00:01<00:00,  9.65it/s]


Validation Metrics: Precision=0.8181, Recall=0.8744, F1=0.8453, Exact Match= 0.7520

Epoch 8/20


Training: 100%|██████████| 46/46 [00:17<00:00,  2.62it/s, loss=0.012]  


Average training loss: 0.0107


Evaluating: 100%|██████████| 12/12 [00:01<00:00,  9.65it/s]


Validation Metrics: Precision=0.8247, Recall=0.8696, F1=0.8466, Exact Match= 0.7738

Epoch 9/20


Training: 100%|██████████| 46/46 [00:17<00:00,  2.62it/s, loss=0.0416] 


Average training loss: 0.0183


Evaluating: 100%|██████████| 12/12 [00:01<00:00,  9.67it/s]


Validation Metrics: Precision=0.8242, Recall=0.8720, F1=0.8474, Exact Match= 0.7684

Epoch 10/20


Training: 100%|██████████| 46/46 [00:17<00:00,  2.62it/s, loss=0.00891]


Average training loss: 0.0198


Evaluating: 100%|██████████| 12/12 [00:01<00:00,  9.66it/s]


Validation Metrics: Precision=0.7951, Recall=0.8671, F1=0.8296, Exact Match= 0.7384

Epoch 11/20


Training: 100%|██████████| 46/46 [00:17<00:00,  2.62it/s, loss=0.0366] 


Average training loss: 0.0196


Evaluating: 100%|██████████| 12/12 [00:01<00:00,  9.64it/s]


Validation Metrics: Precision=0.8290, Recall=0.8551, F1=0.8419, Exact Match= 0.7248

Epoch 12/20


Training: 100%|██████████| 46/46 [00:17<00:00,  2.62it/s, loss=0.0649] 


Average training loss: 0.0207


Evaluating: 100%|██████████| 12/12 [00:01<00:00,  9.65it/s]


Validation Metrics: Precision=0.7862, Recall=0.8792, F1=0.8301, Exact Match= 0.7221

Epoch 13/20


Training: 100%|██████████| 46/46 [00:17<00:00,  2.62it/s, loss=0.0102]  


Average training loss: 0.0116


Evaluating: 100%|██████████| 12/12 [00:01<00:00,  9.67it/s]


Validation Metrics: Precision=0.8215, Recall=0.8671, F1=0.8437, Exact Match= 0.7275

Epoch 14/20


Training: 100%|██████████| 46/46 [00:17<00:00,  2.62it/s, loss=0.00696] 


Average training loss: 0.0052


Evaluating: 100%|██████████| 12/12 [00:01<00:00,  9.67it/s]


Validation Metrics: Precision=0.8268, Recall=0.8647, F1=0.8453, Exact Match= 0.7439

Epoch 15/20


Training: 100%|██████████| 46/46 [00:17<00:00,  2.62it/s, loss=0.000186]


Average training loss: 0.0027


Evaluating: 100%|██████████| 12/12 [00:01<00:00,  9.70it/s]


New best F1: 0.8539 - Saving model
Validation Metrics: Precision=0.8400, Recall=0.8684, F1=0.8539, Exact Match= 0.7493

Epoch 16/20


Training: 100%|██████████| 46/46 [00:17<00:00,  2.62it/s, loss=0.000188]


Average training loss: 0.0004


Evaluating: 100%|██████████| 12/12 [00:01<00:00,  9.63it/s]


New best F1: 0.8588 - Saving model
Validation Metrics: Precision=0.8307, Recall=0.8889, F1=0.8588, Exact Match= 0.7466

Epoch 17/20


Training: 100%|██████████| 46/46 [00:17<00:00,  2.62it/s, loss=0.00028] 


Average training loss: 0.0003


Evaluating: 100%|██████████| 12/12 [00:01<00:00,  9.66it/s]


New best F1: 0.8618 - Saving model
Validation Metrics: Precision=0.8331, Recall=0.8925, F1=0.8618, Exact Match= 0.7493

Epoch 18/20


Training: 100%|██████████| 46/46 [00:17<00:00,  2.62it/s, loss=0.000321]


Average training loss: 0.0005


Evaluating: 100%|██████████| 12/12 [00:01<00:00,  9.68it/s]


Validation Metrics: Precision=0.8314, Recall=0.8877, F1=0.8586, Exact Match= 0.7520

Epoch 19/20


Training: 100%|██████████| 46/46 [00:17<00:00,  2.62it/s, loss=9.28e-5] 


Average training loss: 0.0004


Evaluating: 100%|██████████| 12/12 [00:01<00:00,  9.65it/s]


Validation Metrics: Precision=0.8326, Recall=0.8889, F1=0.8598, Exact Match= 0.7548

Epoch 20/20


Training: 100%|██████████| 46/46 [00:17<00:00,  2.62it/s, loss=0.000118]


Average training loss: 0.0004


Evaluating: 100%|██████████| 12/12 [00:01<00:00,  9.64it/s]


Validation Metrics: Precision=0.8256, Recall=0.8865, F1=0.8550, Exact Match= 0.7548

Evaluating bert_lr0.0001_bs32_ep20 on test set...


Evaluating: 100%|██████████| 16/16 [00:01<00:00,  9.28it/s]


Metrics: Precision=0.9070, Recall=0.9175, F1=0.9122, Exact Match=0.8075

Training model: xlm-roberta-base with params: {'learning_rate': 0.0001, 'batch_size': 32, 'epochs': 20}


Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Preparing train dataset...
Preparing test dataset...
Preparing dev dataset...
Training on cuda

Epoch 1/20


Training: 100%|██████████| 46/46 [00:18<00:00,  2.45it/s, loss=0.634]


Average training loss: 1.3987


Evaluating: 100%|██████████| 12/12 [00:01<00:00,  9.62it/s]


Validation Metrics: Precision=0.0000, Recall=0.0000, F1=0.0000, Exact Match= 0.3106

Epoch 2/20


Training: 100%|██████████| 46/46 [00:18<00:00,  2.45it/s, loss=0.406]


Average training loss: 0.4216


Evaluating: 100%|██████████| 12/12 [00:01<00:00,  9.63it/s]


New best F1: 0.3737 - Saving model
Validation Metrics: Precision=0.4609, Recall=0.3142, F1=0.3737, Exact Match= 0.3978

Epoch 3/20


Training: 100%|██████████| 46/46 [00:18<00:00,  2.45it/s, loss=0.228]


Average training loss: 0.2511


Evaluating: 100%|██████████| 12/12 [00:01<00:00,  9.61it/s]


New best F1: 0.6498 - Saving model
Validation Metrics: Precision=0.6065, Recall=0.6999, F1=0.6498, Exact Match= 0.5777

Epoch 4/20


Training: 100%|██████████| 46/46 [00:18<00:00,  2.46it/s, loss=0.0933]


Average training loss: 0.1101


Evaluating: 100%|██████████| 12/12 [00:01<00:00,  9.68it/s]


New best F1: 0.8222 - Saving model
Validation Metrics: Precision=0.7769, Recall=0.8731, F1=0.8222, Exact Match= 0.7112

Epoch 5/20


Training: 100%|██████████| 46/46 [00:18<00:00,  2.45it/s, loss=0.0541]


Average training loss: 0.0651


Evaluating: 100%|██████████| 12/12 [00:01<00:00,  9.69it/s]


New best F1: 0.8744 - Saving model
Validation Metrics: Precision=0.8591, Recall=0.8902, F1=0.8744, Exact Match= 0.7684

Epoch 6/20


Training: 100%|██████████| 46/46 [00:18<00:00,  2.46it/s, loss=0.0418]


Average training loss: 0.0421


Evaluating: 100%|██████████| 12/12 [00:01<00:00,  9.69it/s]


New best F1: 0.8945 - Saving model
Validation Metrics: Precision=0.8682, Recall=0.9225, F1=0.8945, Exact Match= 0.7902

Epoch 7/20


Training: 100%|██████████| 46/46 [00:18<00:00,  2.45it/s, loss=0.0394] 


Average training loss: 0.0304


Evaluating: 100%|██████████| 12/12 [00:01<00:00,  9.70it/s]


Validation Metrics: Precision=0.8829, Recall=0.8963, F1=0.8896, Exact Match= 0.7902

Epoch 8/20


Training: 100%|██████████| 46/46 [00:18<00:00,  2.46it/s, loss=0.191]  


Average training loss: 0.0445


Evaluating: 100%|██████████| 12/12 [00:01<00:00,  9.72it/s]


Validation Metrics: Precision=0.8264, Recall=0.8963, F1=0.8599, Exact Match= 0.7384

Epoch 9/20


Training: 100%|██████████| 46/46 [00:18<00:00,  2.46it/s, loss=0.0555]


Average training loss: 0.0615


Evaluating: 100%|██████████| 12/12 [00:01<00:00,  9.73it/s]


Validation Metrics: Precision=0.8759, Recall=0.8812, F1=0.8785, Exact Match= 0.7711

Epoch 10/20


Training: 100%|██████████| 46/46 [00:18<00:00,  2.46it/s, loss=0.0388] 


Average training loss: 0.0371


Evaluating: 100%|██████████| 12/12 [00:01<00:00,  9.71it/s]


New best F1: 0.8984 - Saving model
Validation Metrics: Precision=0.8887, Recall=0.9084, F1=0.8984, Exact Match= 0.8093

Epoch 11/20


Training: 100%|██████████| 46/46 [00:18<00:00,  2.45it/s, loss=0.0322] 


Average training loss: 0.0315


Evaluating: 100%|██████████| 12/12 [00:01<00:00,  9.70it/s]


New best F1: 0.9010 - Saving model
Validation Metrics: Precision=0.8947, Recall=0.9074, F1=0.9010, Exact Match= 0.8147

Epoch 12/20


Training: 100%|██████████| 46/46 [00:18<00:00,  2.46it/s, loss=0.0265] 


Average training loss: 0.0190


Evaluating: 100%|██████████| 12/12 [00:01<00:00,  9.70it/s]


Validation Metrics: Precision=0.8505, Recall=0.9053, F1=0.8771, Exact Match= 0.7793

Epoch 13/20


Training: 100%|██████████| 46/46 [00:18<00:00,  2.46it/s, loss=0.0255] 


Average training loss: 0.0120


Evaluating: 100%|██████████| 12/12 [00:01<00:00,  9.71it/s]


Validation Metrics: Precision=0.8628, Recall=0.9053, F1=0.8835, Exact Match= 0.7902

Epoch 14/20


Training: 100%|██████████| 46/46 [00:18<00:00,  2.46it/s, loss=0.00636]


Average training loss: 0.0081


Evaluating: 100%|██████████| 12/12 [00:01<00:00,  9.71it/s]


Validation Metrics: Precision=0.8810, Recall=0.9023, F1=0.8915, Exact Match= 0.8202

Epoch 15/20


Training: 100%|██████████| 46/46 [00:18<00:00,  2.46it/s, loss=0.00264] 


Average training loss: 0.0050


Evaluating: 100%|██████████| 12/12 [00:01<00:00,  9.72it/s]


Validation Metrics: Precision=0.8836, Recall=0.9174, F1=0.9002, Exact Match= 0.8256

Epoch 16/20


Training: 100%|██████████| 46/46 [00:18<00:00,  2.46it/s, loss=0.0122]  


Average training loss: 0.0049


Evaluating: 100%|██████████| 12/12 [00:01<00:00,  9.71it/s]


New best F1: 0.9141 - Saving model
Validation Metrics: Precision=0.9012, Recall=0.9275, F1=0.9141, Exact Match= 0.8338

Epoch 17/20


Training: 100%|██████████| 46/46 [00:18<00:00,  2.46it/s, loss=0.00722] 


Average training loss: 0.0033


Evaluating: 100%|██████████| 12/12 [00:01<00:00,  9.69it/s]


New best F1: 0.9192 - Saving model
Validation Metrics: Precision=0.9110, Recall=0.9275, F1=0.9192, Exact Match= 0.8392

Epoch 18/20


Training: 100%|██████████| 46/46 [00:18<00:00,  2.46it/s, loss=0.000792]


Average training loss: 0.0026


Evaluating: 100%|██████████| 12/12 [00:01<00:00,  9.71it/s]


Validation Metrics: Precision=0.9068, Recall=0.9305, F1=0.9185, Exact Match= 0.8283

Epoch 19/20


Training: 100%|██████████| 46/46 [00:18<00:00,  2.46it/s, loss=0.00163] 


Average training loss: 0.0019


Evaluating: 100%|██████████| 12/12 [00:01<00:00,  9.69it/s]


New best F1: 0.9192 - Saving model
Validation Metrics: Precision=0.9053, Recall=0.9335, F1=0.9192, Exact Match= 0.8365

Epoch 20/20


Training: 100%|██████████| 46/46 [00:18<00:00,  2.46it/s, loss=0.000737]


Average training loss: 0.0014


Evaluating: 100%|██████████| 12/12 [00:01<00:00,  9.71it/s]


Validation Metrics: Precision=0.9025, Recall=0.9325, F1=0.9173, Exact Match= 0.8311

Evaluating xlm-roberta_lr0.0001_bs32_ep20 on test set...


Evaluating: 100%|██████████| 16/16 [00:01<00:00,  9.31it/s]


Metrics: Precision=0.9337, Recall=0.9321, F1=0.9329, Exact Match=0.8389
New best model saved for xlm-roberta at models/best_xlm-roberta.pt

Training model: cahya/xlm-roberta-base-indonesian-NER with params: {'learning_rate': 0.0001, 'batch_size': 32, 'epochs': 20}


Some weights of the model checkpoint at cahya/xlm-roberta-base-indonesian-NER were not used when initializing XLMRobertaForTokenClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing XLMRobertaForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at cahya/xlm-roberta-base-indonesian-NER and are newly initialized because the shapes did not match:
- classifier.bias: found shape torch.Size([39]) in the checkpoint and torch.Size([7]) in the model 

Preparing train dataset...
Preparing test dataset...
Preparing dev dataset...
Training on cuda

Epoch 1/20


Training: 100%|██████████| 46/46 [00:18<00:00,  2.45it/s, loss=0.336]


Average training loss: 1.2745


Evaluating: 100%|██████████| 12/12 [00:01<00:00,  9.71it/s]


New best F1: 0.2303 - Saving model
Validation Metrics: Precision=0.5451, Recall=0.1460, F1=0.2303, Exact Match= 0.3106

Epoch 2/20


Training: 100%|██████████| 46/46 [00:18<00:00,  2.46it/s, loss=0.22] 


Average training loss: 0.2608


Evaluating: 100%|██████████| 12/12 [00:01<00:00,  9.71it/s]


New best F1: 0.7299 - Saving model
Validation Metrics: Precision=0.7359, Recall=0.7241, F1=0.7299, Exact Match= 0.6512

Epoch 3/20


Training: 100%|██████████| 46/46 [00:18<00:00,  2.45it/s, loss=0.0758]


Average training loss: 0.1235


Evaluating: 100%|██████████| 12/12 [00:01<00:00,  9.70it/s]


New best F1: 0.8460 - Saving model
Validation Metrics: Precision=0.8373, Recall=0.8550, F1=0.8460, Exact Match= 0.7439

Epoch 4/20


Training: 100%|██████████| 46/46 [00:18<00:00,  2.46it/s, loss=0.0397]


Average training loss: 0.0713


Evaluating: 100%|██████████| 12/12 [00:01<00:00,  9.68it/s]


New best F1: 0.8830 - Saving model
Validation Metrics: Precision=0.8626, Recall=0.9043, F1=0.8830, Exact Match= 0.7902

Epoch 5/20


Training: 100%|██████████| 46/46 [00:18<00:00,  2.45it/s, loss=0.0236]


Average training loss: 0.0495


Evaluating: 100%|██████████| 12/12 [00:01<00:00,  9.70it/s]


New best F1: 0.9005 - Saving model
Validation Metrics: Precision=0.8946, Recall=0.9063, F1=0.9005, Exact Match= 0.8093

Epoch 6/20


Training: 100%|██████████| 46/46 [00:18<00:00,  2.46it/s, loss=0.0794] 


Average training loss: 0.0433


Evaluating: 100%|██████████| 12/12 [00:01<00:00,  9.69it/s]


New best F1: 0.9158 - Saving model
Validation Metrics: Precision=0.9015, Recall=0.9305, F1=0.9158, Exact Match= 0.8147

Epoch 7/20


Training: 100%|██████████| 46/46 [00:18<00:00,  2.46it/s, loss=0.027]  


Average training loss: 0.0297


Evaluating: 100%|██████████| 12/12 [00:01<00:00,  9.72it/s]


Validation Metrics: Precision=0.9054, Recall=0.9154, F1=0.9104, Exact Match= 0.8038

Epoch 8/20


Training: 100%|██████████| 46/46 [00:18<00:00,  2.46it/s, loss=0.0456] 


Average training loss: 0.0243


Evaluating: 100%|██████████| 12/12 [00:01<00:00,  9.71it/s]


Validation Metrics: Precision=0.8587, Recall=0.9245, F1=0.8904, Exact Match= 0.7684

Epoch 9/20


Training: 100%|██████████| 46/46 [00:18<00:00,  2.46it/s, loss=0.00603]


Average training loss: 0.0222


Evaluating: 100%|██████████| 12/12 [00:01<00:00,  9.73it/s]


Validation Metrics: Precision=0.9000, Recall=0.9245, F1=0.9121, Exact Match= 0.8174

Epoch 10/20


Training: 100%|██████████| 46/46 [00:18<00:00,  2.46it/s, loss=0.0439] 


Average training loss: 0.0265


Evaluating: 100%|██████████| 12/12 [00:01<00:00,  9.69it/s]


Validation Metrics: Precision=0.8873, Recall=0.9194, F1=0.9031, Exact Match= 0.8174

Epoch 11/20


Training: 100%|██████████| 46/46 [00:18<00:00,  2.46it/s, loss=0.0529]


Average training loss: 0.0369


Evaluating: 100%|██████████| 12/12 [00:01<00:00,  9.72it/s]


Validation Metrics: Precision=0.8534, Recall=0.9144, F1=0.8828, Exact Match= 0.7657

Epoch 12/20


Training: 100%|██████████| 46/46 [00:18<00:00,  2.46it/s, loss=0.0204]


Average training loss: 0.0373


Evaluating: 100%|██████████| 12/12 [00:01<00:00,  9.71it/s]


Validation Metrics: Precision=0.8481, Recall=0.8771, F1=0.8624, Exact Match= 0.7439

Epoch 13/20


Training: 100%|██████████| 46/46 [00:18<00:00,  2.46it/s, loss=0.0289]


Average training loss: 0.0320


Evaluating: 100%|██████████| 12/12 [00:01<00:00,  9.72it/s]


Validation Metrics: Precision=0.8883, Recall=0.9053, F1=0.8968, Exact Match= 0.7956

Epoch 14/20


Training: 100%|██████████| 46/46 [00:18<00:00,  2.46it/s, loss=0.00349]


Average training loss: 0.0187


Evaluating: 100%|██████████| 12/12 [00:01<00:00,  9.71it/s]


Validation Metrics: Precision=0.8538, Recall=0.9295, F1=0.8901, Exact Match= 0.7766

Epoch 15/20


Training: 100%|██████████| 46/46 [00:18<00:00,  2.46it/s, loss=0.0191] 


Average training loss: 0.0113


Evaluating: 100%|██████████| 12/12 [00:01<00:00,  9.72it/s]


Validation Metrics: Precision=0.8668, Recall=0.9174, F1=0.8914, Exact Match= 0.7875

Epoch 16/20


Training: 100%|██████████| 46/46 [00:18<00:00,  2.46it/s, loss=0.00307]


Average training loss: 0.0091


Evaluating: 100%|██████████| 12/12 [00:01<00:00,  9.72it/s]


Validation Metrics: Precision=0.9022, Recall=0.9194, F1=0.9107, Exact Match= 0.8283

Epoch 17/20


Training: 100%|██████████| 46/46 [00:18<00:00,  2.46it/s, loss=0.00142] 


Average training loss: 0.0058


Evaluating: 100%|██████████| 12/12 [00:01<00:00,  9.72it/s]


Validation Metrics: Precision=0.8689, Recall=0.9345, F1=0.9005, Exact Match= 0.7929

Epoch 18/20


Training: 100%|██████████| 46/46 [00:18<00:00,  2.46it/s, loss=0.00824] 


Average training loss: 0.0039


Evaluating: 100%|██████████| 12/12 [00:01<00:00,  9.71it/s]


Validation Metrics: Precision=0.8928, Recall=0.9305, F1=0.9112, Exact Match= 0.8120

Epoch 19/20


Training: 100%|██████████| 46/46 [00:18<00:00,  2.46it/s, loss=0.000726]


Average training loss: 0.0032


Evaluating: 100%|██████████| 12/12 [00:01<00:00,  9.69it/s]


Validation Metrics: Precision=0.8947, Recall=0.9245, F1=0.9094, Exact Match= 0.8174

Epoch 20/20


Training: 100%|██████████| 46/46 [00:18<00:00,  2.46it/s, loss=0.000759]


Average training loss: 0.0025


Evaluating: 100%|██████████| 12/12 [00:01<00:00,  9.72it/s]


Validation Metrics: Precision=0.8970, Recall=0.9295, F1=0.9130, Exact Match= 0.8202

Evaluating xlm-roberta_lr0.0001_bs32_ep20 on test set...


Evaluating: 100%|██████████| 16/16 [00:01<00:00,  9.24it/s]


Metrics: Precision=0.9396, Recall=0.9321, F1=0.9359, Exact Match=0.8527
New best model saved for xlm-roberta at models/best_xlm-roberta.pt


Training model: indobenchmark/indobert-base-p1 with params: {'learning_rate': 5e-05, 'batch_size': 16, 'epochs': 3}


Some weights of BertForTokenClassification were not initialized from the model checkpoint at indobenchmark/indobert-base-p1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Preparing train dataset...
Preparing test dataset...
Preparing dev dataset...
Training on cuda

Epoch 1/3


Training: 100%|██████████| 92/92 [00:18<00:00,  4.94it/s, loss=0.422]


Average training loss: 0.8240


Evaluating: 100%|██████████| 23/23 [00:01<00:00, 18.37it/s]


New best F1: 0.3586 - Saving model
Validation Metrics: Precision=0.4223, Recall=0.3116, F1=0.3586, Exact Match= 0.3406

Epoch 2/3


Training: 100%|██████████| 92/92 [00:18<00:00,  4.93it/s, loss=0.0762]


Average training loss: 0.1934


Evaluating: 100%|██████████| 23/23 [00:01<00:00, 18.34it/s]


New best F1: 0.7447 - Saving model
Validation Metrics: Precision=0.7064, Recall=0.7874, F1=0.7447, Exact Match= 0.6431

Epoch 3/3


Training: 100%|██████████| 92/92 [00:18<00:00,  4.93it/s, loss=0.105] 


Average training loss: 0.0858


Evaluating: 100%|██████████| 23/23 [00:01<00:00, 18.32it/s]


New best F1: 0.8176 - Saving model
Validation Metrics: Precision=0.8071, Recall=0.8285, F1=0.8176, Exact Match= 0.7112

Evaluating bert_lr5e-05_bs16_ep3 on test set...


Evaluating: 100%|██████████| 32/32 [00:01<00:00, 18.30it/s]


Metrics: Precision=0.8952, Recall=0.8777, F1=0.8864, Exact Match=0.7701

Training model: xlm-roberta-base with params: {'learning_rate': 5e-05, 'batch_size': 16, 'epochs': 3}


Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Preparing train dataset...
Preparing test dataset...
Preparing dev dataset...
Training on cuda

Epoch 1/3


Training: 100%|██████████| 92/92 [00:21<00:00,  4.38it/s, loss=0.699]


Average training loss: 1.2355


Evaluating: 100%|██████████| 23/23 [00:01<00:00, 18.22it/s]


Validation Metrics: Precision=0.0000, Recall=0.0000, F1=0.0000, Exact Match= 0.3106

Epoch 2/3


Training: 100%|██████████| 92/92 [00:20<00:00,  4.38it/s, loss=0.105]


Average training loss: 0.2993


Evaluating: 100%|██████████| 23/23 [00:01<00:00, 18.28it/s]


New best F1: 0.6609 - Saving model
Validation Metrics: Precision=0.5770, Recall=0.7734, F1=0.6609, Exact Match= 0.5613

Epoch 3/3


Training: 100%|██████████| 92/92 [00:21<00:00,  4.38it/s, loss=0.0729]


Average training loss: 0.1200


Evaluating: 100%|██████████| 23/23 [00:01<00:00, 18.26it/s]


New best F1: 0.8255 - Saving model
Validation Metrics: Precision=0.7937, Recall=0.8600, F1=0.8255, Exact Match= 0.7357


RuntimeError: [enforce fail at inline_container.cc:626] . unexpected pos 43968 vs 43862

In [None]:
# # Inference

# from transformers import AutoTokenizer, AutoModelForTokenClassification
# import json

# load_dir = "saved_model"

# tokenizer = AutoTokenizer.from_pretrained(load_dir)
# model = AutoModelForTokenClassification.from_pretrained(load_dir)

# with open(f"{load_dir}/label2id.json") as f:
#     label2id = json.load(f)
# with open(f"{load_dir}/id2label.json") as f:
#     id2label = json.load(f)
!ls models

In [None]:
# dataset used
"""
https://github.com/khairunnisaor/idner-news-2k/tree/main

@inproceedings{khairunnisa2020,
  title={Towards a Standardized Dataset on Indonesian Named Entity Recognition},
  author={Siti Oryza Khairunnisa and Aizhan Imankulova and Mamoru Komachi},
  booktitle={Proceedings of the 1st Conference of the Asia-Pacific Chapter of the Association for Computational Linguistics and the 10th International Joint Conference on Natural Language Processing: Student Research Workshop}
  year={2020}
}
"""