Dataset:
https://github.com/IndoNLP/indonlu/tree/master/dataset/nerp_ner-prosa

Entity:
*   PPL (Nama orang): joko widodo, budi gunawan, gus dur, dll.
*   PLC (Tempat): dki jakarta, chicago, monas, mata elang international stadium, dll.
*   EVT (Kejadian, peristiwa): bandung great sale 2017, kai serba 70, dll.
*   IND (nama produk): apple watch, call of duty (game), fifa 16, hainan airlines, dll.
*   FNB (makanan): es kopi, sponge cake, dll.







# Load Dataset

In [10]:
import requests

def load_conll_from_url(url: str):
    text = requests.get(url).text.strip().split("\n")

    data = []
    tokens = []
    tags = []

    for line in text:
        if line.strip() == "":
            if tokens:
                data.append((tokens, tags))
                tokens = []
                tags = []
        else:
            parts = line.split()
            token, tag = parts[0], parts[-1]
            tokens.append(token)
            tags.append(tag)

    if tokens:
        data.append((tokens, tags))

    return data

In [1]:
def load_conll_from_file(file_path: str):
    with open(file_path, 'r', encoding='utf-8') as f:
        text = f.read().strip().split("\n")
    data = []
    tokens = []
    tags = []
    for line in text:
        if line.strip() == "":
            if tokens:
                data.append((tokens, tags))
                tokens = []
                tags = []
        else:
            parts = line.split()
            if len(parts) >= 2:
                token, tag = parts[0], parts[-1]
                tokens.append(token)
                tags.append(tag)
    if tokens:
        data.append((tokens, tags))
    return data

In [2]:
# train_url = "https://raw.githubusercontent.com/IndoNLP/indonlu/master/dataset/nerp_ner-prosa/train_preprocess.txt"
# valid_url = "https://raw.githubusercontent.com/IndoNLP/indonlu/master/dataset/nerp_ner-prosa/valid_preprocess.txt"
# test_url  = "https://raw.githubusercontent.com/IndoNLP/indonlu/master/dataset/nerp_ner-prosa/test_preprocess.txt"

# train_data = load_conll_from_url(train_url)
# valid_data = load_conll_from_url(valid_url)
# test_data  = load_conll_from_url(test_url)

train_data = load_conll_from_file("train_fixed_p1.txt")
valid_data = load_conll_from_file("valid_fixed_p1.txt")
test_data  = load_conll_from_file("test_fixed_p1.txt")

In [8]:
!pip install fasttext --quiet

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/73.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m73.4/73.4 kB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
  Building wheel for fasttext (pyproject.toml) ... [?25l[?25hdone


In [9]:
!wget https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.id.300.bin.gz
!gunzip cc.id.300.bin.gz

--2025-12-30 02:10:16--  https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.id.300.bin.gz
Resolving dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)... 13.249.182.62, 13.249.182.39, 13.249.182.81, ...
Connecting to dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)|13.249.182.62|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 4507049071 (4.2G) [application/octet-stream]
Saving to: ‘cc.id.300.bin.gz’


2025-12-30 02:10:46 (142 MB/s) - ‘cc.id.300.bin.gz’ saved [4507049071/4507049071]



In [4]:
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import numpy as np
from sklearn.metrics import classification_report, f1_score
from collections import Counter
import fasttext
from tqdm import tqdm
import torch

fasttext_model = fasttext.load_model("../cc.id.300.bin")
EMBEDDING_DIM = 300

# Prepare vocabulary and labels
def build_vocab(data):
    word_counter = Counter()
    for tokens, _ in data:
        word_counter.update(tokens)

    vocab = {"<PAD>": 0, "<UNK>": 1}
    for word, _ in word_counter.most_common():
        vocab[word] = len(vocab)

    return vocab

def build_label_vocab(data):
    labels = set()
    for _, tags in data:
        labels.update(tags)

    label_vocab = {label: idx for idx, label in enumerate(sorted(labels))}
    return label_vocab

vocab = build_vocab(train_data)
label_vocab = build_label_vocab(train_data)
idx_to_label = {v: k for k, v in label_vocab.items()}

print(f"Vocabulary size: {len(vocab)}")
print(f"Number of labels: {len(label_vocab)}")
print(f"Labels: {list(label_vocab.keys())}")

# Calculate class weights
def calculate_class_weights(data, label_vocab):
    label_counts = Counter()
    for _, tags in data:
        label_counts.update(tags)

    total = sum(label_counts.values())
    weights = torch.zeros(len(label_vocab))

    for label, idx in label_vocab.items():
        count = label_counts[label]
        # Inverse frequency weighting
        weights[idx] = total / (len(label_vocab) * count)

    # Normalize weights
    weights = weights / weights.sum() * len(label_vocab)
    return weights

class_weights = calculate_class_weights(train_data, label_vocab)
print(f"Class weights: {class_weights}")

# Create embedding matrix
def create_embedding_matrix(vocab, fasttext_model, embedding_dim):
    embedding_matrix = np.zeros((len(vocab), embedding_dim))

    for word, idx in vocab.items():
        if word in ["<PAD>", "<UNK>"]:
            continue
        try:
            embedding_matrix[idx] = fasttext_model[word]
        except KeyError:
            # Random initialization for OOV words
            embedding_matrix[idx] = np.random.normal(0, 0.1, embedding_dim)

    return torch.FloatTensor(embedding_matrix)

embedding_matrix = create_embedding_matrix(vocab, fasttext_model, EMBEDDING_DIM)
print(f"Embedding matrix shape: {embedding_matrix.shape}")

# Dataset
class NERDataset(Dataset):
    def __init__(self, data, vocab, label_vocab):
        self.data = data
        self.vocab = vocab
        self.label_vocab = label_vocab

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        tokens, tags = self.data[idx]

        token_ids = [self.vocab.get(token, self.vocab["<UNK>"]) for token in tokens]
        tag_ids = [self.label_vocab[tag] for tag in tags]

        return torch.LongTensor(token_ids), torch.LongTensor(tag_ids)

def collate_fn(batch):
    tokens, tags = zip(*batch)
    lengths = torch.LongTensor([len(t) for t in tokens])

    max_len = lengths.max().item()
    padded_tokens = torch.zeros(len(tokens), max_len, dtype=torch.long)
    padded_tags = torch.zeros(len(tags), max_len, dtype=torch.long)

    for i, (token, tag) in enumerate(zip(tokens, tags)):
        padded_tokens[i, :len(token)] = token
        padded_tags[i, :len(tag)] = tag

    return padded_tokens, padded_tags, lengths




Vocabulary size: 18930
Number of labels: 11
Labels: ['B-EVT', 'B-FNB', 'B-IND', 'B-PLC', 'B-PPL', 'I-EVT', 'I-FNB', 'I-IND', 'I-PLC', 'I-PPL', 'O']
Class weights: tensor([1.4321, 2.7364, 0.4863, 0.3411, 0.3089, 0.8704, 3.3835, 0.5331, 0.5242,
        0.3726, 0.0115])
Embedding matrix shape: torch.Size([18930, 300])


### Menyimpan `vocab` dan `label_vocab`

In [5]:
import pickle

# Menyimpan vocab
with open('vocab.pkl', 'wb') as f:
    pickle.dump(vocab, f)
print("Vocab saved to vocab.pkl")

# Menyimpan label_vocab (dan idx_to_label jika diperlukan untuk UI)
with open('label_vocab.pkl', 'wb') as f:
    pickle.dump(label_vocab, f)
print("Label Vocab saved to label_vocab.pkl")

with open('idx_to_label.pkl', 'wb') as f:
    pickle.dump(idx_to_label, f)
print("idx_to_label saved to idx_to_label.pkl")

Vocab saved to vocab.pkl
Label Vocab saved to label_vocab.pkl
idx_to_label saved to idx_to_label.pkl


In [6]:
import torch
import torch.nn as nn

# CRF Layer
class CRF(nn.Module):
    def __init__(self, num_tags):
        super(CRF, self).__init__()
        self.num_tags = num_tags

        # Transition scores: transitions[i][j] = score of transitioning from tag i to tag j
        self.transitions = nn.Parameter(torch.randn(num_tags, num_tags))

        # Start and end transitions
        self.start_transitions = nn.Parameter(torch.randn(num_tags))
        self.end_transitions = nn.Parameter(torch.randn(num_tags))

    def forward(self, emissions, tags, mask):
        """Compute negative log likelihood (for training)"""
        return -self._log_likelihood(emissions, tags, mask)

    def _log_likelihood(self, emissions, tags, mask):
        batch_size, seq_length, num_tags = emissions.shape

        # Compute score of the given tag sequence
        score = self._compute_score(emissions, tags, mask)

        # Compute partition function (normalization)
        partition = self._compute_partition(emissions, mask)

        return (score - partition).sum()

    def _compute_score(self, emissions, tags, mask):
        batch_size, seq_length = tags.shape

        score = self.start_transitions[tags[:, 0]]
        score += emissions[:, 0].gather(1, tags[:, 0].unsqueeze(1)).squeeze(1)

        for i in range(1, seq_length):
            mask_i = mask[:, i]
            emit_score = emissions[:, i].gather(1, tags[:, i].unsqueeze(1)).squeeze(1)
            trans_score = self.transitions[tags[:, i - 1], tags[:, i]]

            score += (emit_score + trans_score) * mask_i

        last_tags = tags.gather(1, mask.sum(1).long().unsqueeze(1) - 1).squeeze(1)
        score += self.end_transitions[last_tags]

        return score

    def _compute_partition(self, emissions, mask):
        batch_size, seq_length, num_tags = emissions.shape

        alpha = self.start_transitions + emissions[:, 0]

        for i in range(1, seq_length):
            emit_score = emissions[:, i].unsqueeze(1)
            trans_score = self.transitions.unsqueeze(0)
            next_alpha = alpha.unsqueeze(2) + emit_score + trans_score
            next_alpha = torch.logsumexp(next_alpha, dim=1)

            mask_i = mask[:, i].unsqueeze(1)
            alpha = next_alpha * mask_i + alpha * (1 - mask_i)

        alpha += self.end_transitions
        return torch.logsumexp(alpha, dim=1)

    def decode(self, emissions, mask):
        """Viterbi decoding"""
        batch_size, seq_length, num_tags = emissions.shape

        viterbi_score = self.start_transitions + emissions[:, 0]
        viterbi_path = []

        for i in range(1, seq_length):
            broadcast_score = viterbi_score.unsqueeze(2)
            broadcast_emission = emissions[:, i].unsqueeze(1)
            next_score = broadcast_score + self.transitions + broadcast_emission

            next_score, indices = next_score.max(dim=1)
            viterbi_path.append(indices)

            mask_i = mask[:, i].unsqueeze(1)
            viterbi_score = next_score * mask_i + viterbi_score * (1 - mask_i)

        viterbi_score += self.end_transitions
        _, best_last_tag = viterbi_score.max(dim=1)

        # Backtrack
        best_paths = [best_last_tag]
        for indices in reversed(viterbi_path):
            best_last_tag = indices.gather(1, best_last_tag.unsqueeze(1)).squeeze(1)
            best_paths.append(best_last_tag)

        best_paths.reverse()
        return torch.stack(best_paths, dim=1)

# BiGRU-CRF Model
class BiGRUCRF(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_tags, embedding_matrix=None):
        super(BiGRUCRF, self).__init__()

        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        if embedding_matrix is not None:
            self.embedding.weight.data.copy_(embedding_matrix)

        self.bigru = nn.GRU(
            embedding_dim,
            hidden_dim // 2,  # Divide by 2 because bidirectional
            num_layers=2,
            batch_first=True,
            bidirectional=True,
            dropout=0.3
        )

        self.dropout = nn.Dropout(0.5)
        self.hidden2tag = nn.Linear(hidden_dim, num_tags)
        self.crf = CRF(num_tags)

    def forward(self, x, tags, lengths):
        mask = (x != 0).float()

        embeddings = self.embedding(x)
        embeddings = self.dropout(embeddings)

        # Pack padded sequence
        packed = nn.utils.rnn.pack_padded_sequence(
            embeddings, lengths.cpu(), batch_first=True, enforce_sorted=False
        )
        gru_out, _ = self.bigru(packed)
        gru_out, _ = nn.utils.rnn.pad_packed_sequence(gru_out, batch_first=True)

        gru_out = self.dropout(gru_out)
        emissions = self.hidden2tag(gru_out)

        loss = self.crf(emissions, tags, mask)
        return loss

    def predict(self, x, lengths):
        mask = (x != 0).float()

        embeddings = self.embedding(x)

        packed = nn.utils.rnn.pack_padded_sequence(
            embeddings, lengths.cpu(), batch_first=True, enforce_sorted=False
        )
        gru_out, _ = self.bigru(packed)
        gru_out, _ = nn.utils.rnn.pad_packed_sequence(gru_out, batch_first=True)

        emissions = self.hidden2tag(gru_out)
        predictions = self.crf.decode(emissions, mask)

        return predictions

In [7]:

# Create datasets and dataloaders
train_dataset = NERDataset(train_data, vocab, label_vocab)
valid_dataset = NERDataset(valid_data, vocab, label_vocab)
test_dataset = NERDataset(test_data, vocab, label_vocab)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, collate_fn=collate_fn)
valid_loader = DataLoader(valid_dataset, batch_size=32, shuffle=False, collate_fn=collate_fn)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False, collate_fn=collate_fn)

In [None]:
# Initialize model
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

model = BiGRUCRF(
    vocab_size=len(vocab),
    embedding_dim=EMBEDDING_DIM,
    hidden_dim=256,
    num_tags=len(label_vocab),
    embedding_matrix=embedding_matrix
).to(device)

optimizer = optim.Adam(model.parameters(), lr=0.001)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', patience=2, factor=0.5)

# Training function
def train_epoch(model, loader, optimizer, device):
    model.train()
    total_loss = 0

    for tokens, tags, lengths in tqdm(loader, desc="Training"):
        tokens, tags = tokens.to(device), tags.to(device)

        optimizer.zero_grad()
        loss = model(tokens, tags, lengths)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 5.0)
        optimizer.step()

        total_loss += loss.item()

    return total_loss / len(loader)

# Evaluation function
def evaluate(model, loader, device, idx_to_label):
    model.eval()
    all_preds = []
    all_labels = []

    with torch.no_grad():
        for tokens, tags, lengths in tqdm(loader, desc="Evaluating"):
            tokens = tokens.to(device)

            predictions = model.predict(tokens, lengths)

            for i, length in enumerate(lengths):
                pred = predictions[i, :length].cpu().numpy()
                true = tags[i, :length].numpy()

                all_preds.extend([idx_to_label[p] for p in pred])
                all_labels.extend([idx_to_label[t] for t in true])

    return all_preds, all_labels

# Training loop
num_epochs = 20
best_f1 = 0

print("\n" + "="*50)
print("Starting Training")
print("="*50)

for epoch in range(num_epochs):
    train_loss = train_epoch(model, train_loader, optimizer, device)

    # Evaluate on validation set
    val_preds, val_labels = evaluate(model, valid_loader, device, idx_to_label)
    val_f1 = f1_score(val_labels, val_preds, average='weighted')

    scheduler.step(train_loss)

    print(f"\nEpoch {epoch + 1}/{num_epochs}")
    print(f"Train Loss: {train_loss:.4f}")
    print(f"Validation F1: {val_f1:.4f}")

    if val_f1 > best_f1:
        best_f1 = val_f1
        torch.save(model.state_dict(), 'best_model.pt')
        print("✓ Model saved!")

# Load best model and evaluate on test set
print("\n" + "="*50)
print("Evaluating on Test Set")
print("="*50)

model.load_state_dict(torch.load('best_model.pt'))
test_preds, test_labels = evaluate(model, test_loader, device, idx_to_label)

print("\nTest Set Results:")
print(classification_report(test_labels, test_preds, digits=4))

# Calculate per-entity F1 scores
print("\nPer-Entity Metrics:")
unique_labels = sorted(set(test_labels))
for label in unique_labels:
    if label != 'O':
        label_preds = [1 if p == label else 0 for p in test_preds]
        label_true = [1 if t == label else 0 for t in test_labels]
        f1 = f1_score(label_true, label_preds)
        print(f"{label}: F1 = {f1:.4f}")

Using device: cuda

Starting Training


Training: 100%|██████████| 210/210 [00:14<00:00, 14.34it/s]
Evaluating: 100%|██████████| 27/27 [00:00<00:00, 71.97it/s]



Epoch 1/20
Train Loss: 416.9869
Validation F1: 0.9120
✓ Model saved!


Training: 100%|██████████| 210/210 [00:13<00:00, 15.98it/s]
Evaluating: 100%|██████████| 27/27 [00:00<00:00, 74.75it/s]



Epoch 2/20
Train Loss: 145.3883
Validation F1: 0.9347
✓ Model saved!


Training: 100%|██████████| 210/210 [00:12<00:00, 16.24it/s]
Evaluating: 100%|██████████| 27/27 [00:00<00:00, 73.36it/s]



Epoch 3/20
Train Loss: 91.3668
Validation F1: 0.9370
✓ Model saved!


Training: 100%|██████████| 210/210 [00:13<00:00, 16.15it/s]
Evaluating: 100%|██████████| 27/27 [00:00<00:00, 51.68it/s]



Epoch 4/20
Train Loss: 65.7185
Validation F1: 0.9391
✓ Model saved!


Training: 100%|██████████| 210/210 [00:13<00:00, 15.82it/s]
Evaluating: 100%|██████████| 27/27 [00:00<00:00, 75.18it/s]



Epoch 5/20
Train Loss: 52.5390
Validation F1: 0.9372


Training: 100%|██████████| 210/210 [00:12<00:00, 16.31it/s]
Evaluating: 100%|██████████| 27/27 [00:00<00:00, 74.88it/s]



Epoch 6/20
Train Loss: 43.7045
Validation F1: 0.9383


Training: 100%|██████████| 210/210 [00:13<00:00, 15.74it/s]
Evaluating: 100%|██████████| 27/27 [00:00<00:00, 76.02it/s]



Epoch 7/20
Train Loss: 36.6652
Validation F1: 0.9400
✓ Model saved!


Training: 100%|██████████| 210/210 [00:13<00:00, 15.02it/s]
Evaluating: 100%|██████████| 27/27 [00:00<00:00, 75.54it/s]



Epoch 8/20
Train Loss: 32.0967
Validation F1: 0.9387


Training: 100%|██████████| 210/210 [00:12<00:00, 16.26it/s]
Evaluating: 100%|██████████| 27/27 [00:00<00:00, 75.02it/s]



Epoch 9/20
Train Loss: 28.7061
Validation F1: 0.9387


Training: 100%|██████████| 210/210 [00:12<00:00, 16.59it/s]
Evaluating: 100%|██████████| 27/27 [00:00<00:00, 72.99it/s]



Epoch 10/20
Train Loss: 26.0248
Validation F1: 0.9399


Training: 100%|██████████| 210/210 [00:12<00:00, 16.61it/s]
Evaluating: 100%|██████████| 27/27 [00:00<00:00, 76.92it/s]



Epoch 11/20
Train Loss: 22.8964
Validation F1: 0.9392


Training: 100%|██████████| 210/210 [00:12<00:00, 16.60it/s]
Evaluating: 100%|██████████| 27/27 [00:00<00:00, 75.86it/s]



Epoch 12/20
Train Loss: 21.0659
Validation F1: 0.9380


Training: 100%|██████████| 210/210 [00:12<00:00, 16.57it/s]
Evaluating: 100%|██████████| 27/27 [00:00<00:00, 54.71it/s]



Epoch 13/20
Train Loss: 20.1860
Validation F1: 0.9385


Training: 100%|██████████| 210/210 [00:12<00:00, 16.18it/s]
Evaluating: 100%|██████████| 27/27 [00:00<00:00, 74.63it/s]



Epoch 14/20
Train Loss: 18.4187
Validation F1: 0.9385


Training: 100%|██████████| 210/210 [00:12<00:00, 16.59it/s]
Evaluating: 100%|██████████| 27/27 [00:00<00:00, 76.71it/s]



Epoch 15/20
Train Loss: 17.5709
Validation F1: 0.9399


Training: 100%|██████████| 210/210 [00:12<00:00, 16.42it/s]
Evaluating: 100%|██████████| 27/27 [00:00<00:00, 76.22it/s]



Epoch 16/20
Train Loss: 16.8665
Validation F1: 0.9384


Training: 100%|██████████| 210/210 [00:12<00:00, 16.57it/s]
Evaluating: 100%|██████████| 27/27 [00:00<00:00, 74.60it/s]



Epoch 17/20
Train Loss: 15.1937
Validation F1: 0.9371


Training: 100%|██████████| 210/210 [00:12<00:00, 16.60it/s]
Evaluating: 100%|██████████| 27/27 [00:00<00:00, 76.84it/s]



Epoch 18/20
Train Loss: 14.7355
Validation F1: 0.9396


Training: 100%|██████████| 210/210 [00:12<00:00, 16.50it/s]
Evaluating: 100%|██████████| 27/27 [00:00<00:00, 72.58it/s]



Epoch 19/20
Train Loss: 15.5839
Validation F1: 0.9378


Training: 100%|██████████| 210/210 [00:12<00:00, 16.48it/s]
Evaluating: 100%|██████████| 27/27 [00:00<00:00, 74.26it/s]



Epoch 20/20
Train Loss: 14.0684
Validation F1: 0.9386

Evaluating on Test Set


Evaluating: 100%|██████████| 27/27 [00:00<00:00, 74.88it/s]



Test Set Results:
              precision    recall  f1-score   support

       B-EVT     0.6200    0.4662    0.5322       133
       B-FNB     0.8302    0.5714    0.6769        77
       B-IND     0.7828    0.7664    0.7745       381
       B-PLC     0.8725    0.7935    0.8311       552
       B-PPL     0.8851    0.8309    0.8571       621
       I-EVT     0.5369    0.4020    0.4598       199
       I-FNB     0.8158    0.4429    0.5741        70
       I-IND     0.7291    0.6854    0.7066       267
       I-PLC     0.7513    0.6463    0.6948       229
       I-PPL     0.8632    0.7995    0.8301       379
           O     0.9662    0.9838    0.9749     17044

    accuracy                         0.9455     19952
   macro avg     0.7866    0.6717    0.7193     19952
weighted avg     0.9423    0.9455    0.9432     19952


Per-Entity Metrics:
B-EVT: F1 = 0.5322
B-FNB: F1 = 0.6769
B-IND: F1 = 0.7745
B-PLC: F1 = 0.8311
B-PPL: F1 = 0.8571
I-EVT: F1 = 0.4598
I-FNB: F1 = 0.5741
I-IND: F1 = 0.

In [8]:
# Initialize model
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

model = BiGRUCRF(
    vocab_size=len(vocab),
    embedding_dim=EMBEDDING_DIM,
    hidden_dim=256,
    num_tags=len(label_vocab),
    embedding_matrix=embedding_matrix
).to(device)

optimizer = optim.Adam(model.parameters(), lr=0.001)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', patience=2, factor=0.5)

# Training function
def train_epoch(model, loader, optimizer, device):
    model.train()
    total_loss = 0

    for tokens, tags, lengths in tqdm(loader, desc="Training"):
        tokens, tags = tokens.to(device), tags.to(device)

        optimizer.zero_grad()
        loss = model(tokens, tags, lengths)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 5.0)
        optimizer.step()

        total_loss += loss.item()

    return total_loss / len(loader)

# Evaluation function
def evaluate(model, loader, device, idx_to_label):
    model.eval()
    all_preds = []
    all_labels = []

    with torch.no_grad():
        for tokens, tags, lengths in tqdm(loader, desc="Evaluating"):
            tokens = tokens.to(device)

            predictions = model.predict(tokens, lengths)

            for i, length in enumerate(lengths):
                pred = predictions[i, :length].cpu().numpy()
                true = tags[i, :length].numpy()

                all_preds.extend([idx_to_label[p] for p in pred])
                all_labels.extend([idx_to_label[t] for t in true])

    return all_preds, all_labels

# Training loop with Early Stopping
num_epochs = 20
best_f1 = 0
patience = 3  # Early stopping patience
patience_counter = 0

print("\n" + "="*50)
print("Starting Training")
print("="*50)

for epoch in range(num_epochs):
    train_loss = train_epoch(model, train_loader, optimizer, device)

    # Evaluate on validation set
    val_preds, val_labels = evaluate(model, valid_loader, device, idx_to_label)
    val_f1 = f1_score(val_labels, val_preds, average='weighted')

    scheduler.step(train_loss)

    print(f"\nEpoch {epoch + 1}/{num_epochs}")
    print(f"Train Loss: {train_loss:.4f}")
    print(f"Validation F1: {val_f1:.4f}")

    # Early stopping logic
    if val_f1 > best_f1:
        best_f1 = val_f1
        patience_counter = 0
        torch.save(model.state_dict(), 'best_model.pt')
        print(f"✓ Model saved! (Best F1: {best_f1:.4f})")
    else:
        patience_counter += 1
        print(f"No improvement. Patience: {patience_counter}/{patience}")

        if patience_counter >= patience:
            print(f"\n⚠ Early stopping triggered after {epoch + 1} epochs")
            print(f"Best Validation F1: {best_f1:.4f}")
            break

# Load best model and evaluate on test set
print("\n" + "="*50)
print("Evaluating on Test Set")
print("="*50)

model.load_state_dict(torch.load('best_model.pt'))
test_preds, test_labels = evaluate(model, test_loader, device, idx_to_label)

print("\nTest Set Results:")
print(classification_report(test_labels, test_preds, digits=4))

# Calculate per-entity F1 scores
print("\nPer-Entity Metrics:")
unique_labels = sorted(set(test_labels))
for label in unique_labels:
    if label != 'O':
        label_preds = [1 if p == label else 0 for p in test_preds]
        label_true = [1 if t == label else 0 for t in test_labels]
        f1 = f1_score(label_true, label_preds)
        print(f"{label}: F1 = {f1:.4f}")

Using device: cpu

Starting Training


Training: 100%|██████████| 210/210 [00:42<00:00,  4.92it/s]
Evaluating: 100%|██████████| 27/27 [00:00<00:00, 31.61it/s]



Epoch 1/20
Train Loss: 447.6979
Validation F1: 0.9033
✓ Model saved! (Best F1: 0.9033)


Training: 100%|██████████| 210/210 [00:41<00:00,  5.09it/s]
Evaluating: 100%|██████████| 27/27 [00:00<00:00, 30.14it/s]



Epoch 2/20
Train Loss: 157.4314
Validation F1: 0.9304
✓ Model saved! (Best F1: 0.9304)


Training: 100%|██████████| 210/210 [00:42<00:00,  4.90it/s]
Evaluating: 100%|██████████| 27/27 [00:00<00:00, 29.92it/s]



Epoch 3/20
Train Loss: 98.4736
Validation F1: 0.9323
✓ Model saved! (Best F1: 0.9323)


Training: 100%|██████████| 210/210 [00:44<00:00,  4.77it/s]
Evaluating: 100%|██████████| 27/27 [00:00<00:00, 34.69it/s]



Epoch 4/20
Train Loss: 70.2894
Validation F1: 0.9339
✓ Model saved! (Best F1: 0.9339)


Training: 100%|██████████| 210/210 [00:42<00:00,  4.90it/s]
Evaluating: 100%|██████████| 27/27 [00:00<00:00, 35.49it/s]



Epoch 5/20
Train Loss: 55.4094
Validation F1: 0.9358
✓ Model saved! (Best F1: 0.9358)


Training: 100%|██████████| 210/210 [00:39<00:00,  5.27it/s]
Evaluating: 100%|██████████| 27/27 [00:00<00:00, 35.77it/s]



Epoch 6/20
Train Loss: 44.9656
Validation F1: 0.9324
No improvement. Patience: 1/3


Training: 100%|██████████| 210/210 [00:39<00:00,  5.26it/s]
Evaluating: 100%|██████████| 27/27 [00:00<00:00, 36.10it/s]



Epoch 7/20
Train Loss: 37.1480
Validation F1: 0.9360
✓ Model saved! (Best F1: 0.9360)


Training: 100%|██████████| 210/210 [00:40<00:00,  5.19it/s]
Evaluating: 100%|██████████| 27/27 [00:00<00:00, 35.81it/s]



Epoch 8/20
Train Loss: 34.2499
Validation F1: 0.9356
No improvement. Patience: 1/3


Training: 100%|██████████| 210/210 [00:40<00:00,  5.23it/s]
Evaluating: 100%|██████████| 27/27 [00:00<00:00, 38.43it/s]



Epoch 9/20
Train Loss: 29.3732
Validation F1: 0.9351
No improvement. Patience: 2/3


Training: 100%|██████████| 210/210 [00:37<00:00,  5.58it/s]
Evaluating: 100%|██████████| 27/27 [00:00<00:00, 33.50it/s]



Epoch 10/20
Train Loss: 25.5699
Validation F1: 0.9361
✓ Model saved! (Best F1: 0.9361)


Training: 100%|██████████| 210/210 [00:40<00:00,  5.16it/s]
Evaluating: 100%|██████████| 27/27 [00:00<00:00, 29.60it/s]



Epoch 11/20
Train Loss: 23.8352
Validation F1: 0.9344
No improvement. Patience: 1/3


Training: 100%|██████████| 210/210 [00:42<00:00,  4.98it/s]
Evaluating: 100%|██████████| 27/27 [00:00<00:00, 34.31it/s]



Epoch 12/20
Train Loss: 21.4949
Validation F1: 0.9348
No improvement. Patience: 2/3


Training: 100%|██████████| 210/210 [00:39<00:00,  5.25it/s]
Evaluating: 100%|██████████| 27/27 [00:00<00:00, 33.50it/s]



Epoch 13/20
Train Loss: 19.8763
Validation F1: 0.9348
No improvement. Patience: 3/3

⚠ Early stopping triggered after 13 epochs
Best Validation F1: 0.9361

Evaluating on Test Set


Evaluating: 100%|██████████| 27/27 [00:00<00:00, 32.14it/s]



Test Set Results:
              precision    recall  f1-score   support

       B-EVT     0.5259    0.5182    0.5221       137
       B-FNB     0.7576    0.6494    0.6993        77
       B-IND     0.7842    0.7493    0.7664       383
       B-PLC     0.8426    0.8213    0.8318       554
       B-PPL     0.8283    0.8429    0.8356       624
       I-EVT     0.5463    0.5385    0.5424       208
       I-FNB     0.8281    0.6092    0.7020        87
       I-IND     0.7193    0.7193    0.7193       342
       I-PLC     0.7709    0.7709    0.7709       358
       I-PPL     0.8276    0.8460    0.8367       539
           O     0.9713    0.9740    0.9727     16643

    accuracy                         0.9394     19952
   macro avg     0.7638    0.7308    0.7454     19952
weighted avg     0.9389    0.9394    0.9391     19952


Per-Entity Metrics:
B-EVT: F1 = 0.5221
B-FNB: F1 = 0.6993
B-IND: F1 = 0.7664
B-PLC: F1 = 0.8318
B-PPL: F1 = 0.8356
I-EVT: F1 = 0.5424
I-FNB: F1 = 0.7020
I-IND: F1 = 0.

In [14]:
model.eval()

BiGRUCRF(
  (embedding): Embedding(18930, 300, padding_idx=0)
  (bigru): GRU(300, 128, num_layers=2, batch_first=True, dropout=0.3, bidirectional=True)
  (dropout): Dropout(p=0.5, inplace=False)
  (hidden2tag): Linear(in_features=256, out_features=11, bias=True)
  (crf): CRF()
)

In [11]:
# Initialize model
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

model = BiGRUCRF(
    vocab_size=len(vocab),
    embedding_dim=EMBEDDING_DIM,
    hidden_dim=256,
    num_tags=len(label_vocab),
    embedding_matrix=embedding_matrix
).to(device)

optimizer = optim.Adam(model.parameters(), lr=0.001)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', patience=2, factor=0.5)

# Training function
def train_epoch(model, loader, optimizer, device):
    model.train()
    total_loss = 0

    for tokens, tags, lengths in tqdm(loader, desc="Training"):
        tokens, tags = tokens.to(device), tags.to(device)

        optimizer.zero_grad()
        loss = model(tokens, tags, lengths)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 5.0)
        optimizer.step()

        total_loss += loss.item()

    return total_loss / len(loader)

# Evaluation function - exclude 'O' tags
def evaluate(model, loader, device, idx_to_label):
    model.eval()
    all_preds = []
    all_labels = []

    with torch.no_grad():
        for tokens, tags, lengths in tqdm(loader, desc="Evaluating"):
            tokens = tokens.to(device)

            predictions = model.predict(tokens, lengths)

            for i, length in enumerate(lengths):
                pred = predictions[i, :length].cpu().numpy()
                true = tags[i, :length].numpy()

                # Filter out 'O' tags
                for p, t in zip(pred, true):
                    pred_label = idx_to_label[p]
                    true_label = idx_to_label[t]

                    # Only include non-O predictions
                    if true_label != 'O' or pred_label != 'O':
                        all_preds.append(pred_label)
                        all_labels.append(true_label)

    return all_preds, all_labels

# Load best model and evaluate on test set
print("\n" + "="*50)
print("Evaluating on Test Set (Excluding 'O' tags)")
print("="*50)

checkpoint = torch.load('best_model.pt', map_location=device)
model.load_state_dict(checkpoint)
test_preds, test_labels = evaluate(model, test_loader, device, idx_to_label)

# Filter labels for classification report (exclude 'O' from label list)
entity_labels = [label for label in sorted(set(test_labels)) if label != 'O']

print("\nTest Set Results (Entity-level metrics):")
print(classification_report(
    test_labels,
    test_preds,
    labels=entity_labels,  # Only evaluate on entity labels
    digits=4
))

# Alternative: Use seqeval library (recommended for NER)
# pip install seqeval
from seqeval.metrics import classification_report as seq_report
from seqeval.metrics import f1_score as seq_f1

# Group predictions by sentence for seqeval
def group_by_sentence(preds, labels, loader, idx_to_label):
    """Group token predictions back into sentences"""
    all_pred_tags = []
    all_true_tags = []

    model.eval()
    with torch.no_grad():
        for tokens, tags, lengths in loader:
            tokens = tokens.to(device)
            predictions = model.predict(tokens, lengths)

            for i, length in enumerate(lengths):
                pred_seq = [idx_to_label[p] for p in predictions[i, :length].cpu().numpy()]
                true_seq = [idx_to_label[t] for t in tags[i, :length].numpy()]

                all_pred_tags.append(pred_seq)
                all_true_tags.append(true_seq)

    return all_pred_tags, all_true_tags

print("\n" + "="*50)
print("Seqeval Metrics")
print("="*50)

pred_tags, true_tags = group_by_sentence(test_preds, test_labels, test_loader, idx_to_label)
print(seq_report(true_tags, pred_tags, digits=4))
print(f"\nOverall F1 (seqeval): {seq_f1(true_tags, pred_tags):.4f}")

Using device: cpu

Evaluating on Test Set (Excluding 'O' tags)


Evaluating: 100%|██████████| 27/27 [00:00<00:00, 29.32it/s]



Test Set Results (Entity-level metrics):
              precision    recall  f1-score   support

       B-EVT     0.5259    0.5182    0.5221       137
       B-FNB     0.7576    0.6494    0.6993        77
       B-IND     0.7842    0.7493    0.7664       383
       B-PLC     0.8426    0.8213    0.8318       554
       B-PPL     0.8283    0.8429    0.8356       624
       I-EVT     0.5463    0.5385    0.5424       208
       I-FNB     0.8281    0.6092    0.7020        87
       I-IND     0.7193    0.7193    0.7193       342
       I-PLC     0.7709    0.7709    0.7709       358
       I-PPL     0.8276    0.8460    0.8367       539

   micro avg     0.7762    0.7652    0.7707      3309
   macro avg     0.7431    0.7065    0.7226      3309
weighted avg     0.7761    0.7652    0.7700      3309


Seqeval Metrics
              precision    recall  f1-score   support

         EVT     0.4632    0.4599    0.4615       137
         FNB     0.6377    0.5714    0.6027        77
         IND     0.

In [15]:
print("\n" + "="*50)
print("Contoh Data Asli vs Prediksi")
print("="*50)

num_examples_to_show = 5 # Number of examples to display

# test_preds is a flattened list of all predicted labels.
# We need to re-segment it by sentence length.
current_preds_idx = 0

for i in range(num_examples_to_show):
    tokens, true_labels_raw = test_data[i] # true_labels_raw already contains string labels
    sentence_length = len(tokens)

    # Get predicted labels for the current sentence
    predicted_labels = test_preds[current_preds_idx : current_preds_idx + sentence_length]

    print(f"\n--- Contoh {i+1} ---")
    print(f"Tokens:      {' '.join(tokens)}")
    print(f"True Labels: {true_labels_raw}") # Use true_labels_raw directly
    print(f"Pred Labels: {predicted_labels}")

    current_preds_idx += sentence_length # Update index for the next sentence


Contoh Data Asli vs Prediksi

--- Contoh 1 ---
Tokens:      kuasa hukum teamster berasal dari edmonton , namun tinggal di sekitar vancouver sejak tahun 1991 .
True Labels: ['O', 'O', 'B-PPL', 'O', 'O', 'B-PLC', 'O', 'O', 'O', 'O', 'O', 'B-PLC', 'O', 'O', 'O', 'O']
Pred Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-PLC', 'O', 'O', 'O', 'O']

--- Contoh 2 ---
Tokens:      data diurutkan berdasarkan umur , jenis kelamin dan metode ; pertambahan yang ditandai terlihat di antara orang skotlandia berumur dari 25 sampai 54 tahun dengan gantung diri meningkatkan popularitas .
True Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-PLC', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Pred Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-PLC', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']

--- Contoh 3 ---
Tokens:      " urusan dengan atasannya masuk dala

In [1]:
import pickle

# Memuat vocab
with open('vocab.pkl', 'rb') as f:
    loaded_vocab = pickle.load(f)
print(f"Loaded vocab size: {len(loaded_vocab)}")

# Memuat label_vocab
with open('label_vocab.pkl', 'rb') as f:
    loaded_label_vocab = pickle.load(f)
print(f"Loaded label vocab size: {len(loaded_label_vocab)}")

# Memuat idx_to_label
with open('idx_to_label.pkl', 'rb') as f:
    loaded_idx_to_label = pickle.load(f)
print(f"Loaded idx_to_label keys: {list(loaded_idx_to_label.keys())[:5]}...")

Loaded vocab size: 18930
Loaded label vocab size: 11
Loaded idx_to_label keys: [0, 1, 2, 3, 4]...


In [None]:
import fasttext
import torch
import numpy as np
import pickle
import os

device = torch.device('cpu')
print("Using CPU")

fasttext_model = fasttext.load_model("../cc.id.300.bin")
EMBEDDING_DIM = 300

def create_embedding_matrix(vocab, fasttext_model, embedding_dim):
    """
    Membuat embedding matrix dari vocabulary dan fasttext model
    """
    embedding_matrix = np.zeros((len(vocab), embedding_dim))
    
    for word, idx in vocab.items():
        # Skip special tokens
        if word in ["<PAD>", "<UNK>"]:
            continue
        
        try:
            embedding_matrix[idx] = fasttext_model[word]
        except KeyError:
            # Random initialization untuk OOV words
            embedding_matrix[idx] = np.random.normal(0, 0.1, embedding_dim)
    
    # Convert ke torch tensor dengan tipe float32
    return torch.tensor(embedding_matrix, dtype=torch.float32)

print("Creating embedding matrix...")
loaded_embedding_matrix = create_embedding_matrix(
    loaded_vocab, 
    fasttext_model, 
    EMBEDDING_DIM
)
print(f"Embedding matrix shape: {loaded_embedding_matrix.shape}")

print("Initializing model...")
model = BiGRUCRF(
    vocab_size=len(loaded_vocab),
    embedding_dim=EMBEDDING_DIM,
    hidden_dim=256,
    num_tags=len(loaded_label_vocab),
    embedding_matrix=loaded_embedding_matrix
).to(device)

print("Loading model weights...")
checkpoint = torch.load('best_model.pt', map_location=device)
model.load_state_dict(checkpoint)

model.eval()

print(f"✓ Model loaded successfully on {device}")
print(f"  - Vocabulary size: {len(loaded_vocab)}")
print(f"  - Label vocabulary size: {len(loaded_label_vocab)}")
print(f"  - Embedding dimension: {EMBEDDING_DIM}")
print(f"  - Hidden dimension: 256")

Using CPU




Creating embedding matrix...
Embedding matrix shape: torch.Size([18930, 300])
Initializing model...
Loading model weights...
✓ Model loaded successfully on cpu
  - Vocabulary size: 18930
  - Label vocabulary size: 11
  - Embedding dimension: 300
  - Hidden dimension: 256


In [9]:
import re

def preprocess_sentence(sentence):
    # lowercase
    sentence = sentence.lower()

    # beri spasi di sekitar simbol
    sentence = re.sub(r"([.,!?():;])", r" \1 ", sentence)

    # hapus spasi berlebih
    sentence = re.sub(r"\s+", " ", sentence).strip()

    # tokenisasi
    tokens = sentence.split()
    return tokens


def predict_sentence(sentence, model, vocab, idx_to_label, device):
    model.eval()
    tokens = preprocess_sentence(sentence)

    # Convert tokens to IDs
    token_ids = [vocab.get(token, vocab["<UNK>"]) for token in tokens]
    token_tensor = torch.LongTensor(token_ids).unsqueeze(0).to(device)
    length_tensor = torch.LongTensor([len(tokens)])

    # Make prediction
    with torch.no_grad():
        predictions = model.predict(token_tensor, length_tensor)

    # Convert predicted IDs back to labels
    predicted_tag_ids = predictions.squeeze(0).cpu().numpy()
    predicted_labels = [idx_to_label[tag_id] for tag_id in predicted_tag_ids]

    return tokens, predicted_labels

# Example usage:
user_input_sentence = "Joko Widodo bertemu dengan Menteri Keuangan Sri Mulyani di Jakarta."
predicted_tokens, predicted_tags = predict_sentence(user_input_sentence, model, loaded_vocab, loaded_idx_to_label, device)

print(f"\nInput Sentence: {user_input_sentence}")
print(f"Tokens:         {predicted_tokens}")
print(f"Predicted Tags: {predicted_tags}")

user_input_sentence_2 = "Saya suka makan nasi goreng di restoran Padang."
predicted_tokens_2, predicted_tags_2 = predict_sentence(user_input_sentence_2, model, loaded_vocab, loaded_idx_to_label, device)

print(f"\nInput Sentence: {user_input_sentence_2}")
print(f"Tokens:         {predicted_tokens_2}")
print(f"Predicted Tags: {predicted_tags_2}")


Input Sentence: Joko Widodo bertemu dengan Menteri Keuangan Sri Mulyani di Jakarta.
Tokens:         ['joko', 'widodo', 'bertemu', 'dengan', 'menteri', 'keuangan', 'sri', 'mulyani', 'di', 'jakarta', '.']
Predicted Tags: ['B-PPL', 'I-PPL', 'O', 'O', 'O', 'O', 'B-PPL', 'I-PPL', 'O', 'B-PLC', 'O']

Input Sentence: Saya suka makan nasi goreng di restoran Padang.
Tokens:         ['saya', 'suka', 'makan', 'nasi', 'goreng', 'di', 'restoran', 'padang', '.']
Predicted Tags: ['O', 'O', 'O', 'B-FNB', 'I-FNB', 'O', 'B-PLC', 'I-PLC', 'O']
