In [4]:
# ============================================
# CHUNK 1: PREPROCESSING
# ============================================
!pip install sklearn-crfsuite spacy

import json
import numpy as np
import random
from sklearn.model_selection import train_test_split

# 1. Load Data
filename = '/content/dataset.json'
try:
    with open(filename, 'r', encoding='utf-8') as f:
        data = json.load(f)
except FileNotFoundError:
    data = []
    print("Error: File dataset.json tidak ditemukan.")

# 2. Format Konverter
def convert_to_word_level(item):
    if 'data' in item and 'text' in item['data']:
        text = item['data']['text']
    elif 'text' in item:
        text = item['text']
    else:
        return [], []

    annotations = item.get('annotations', [])
    if not annotations: return [], []

    res = annotations[0].get('result', [])
    tokens = text.split()
    labels = ['O'] * len(tokens)

    token_spans = []
    current_pos = 0
    for token in tokens:
        start = text.find(token, current_pos)
        end = start + len(token)
        token_spans.append((start, end))
        current_pos = end

    for ann in res:
        if 'value' in ann and 'labels' in ann['value']:
            ann_start = ann['value']['start']
            ann_end = ann['value']['end']
            label_text = ann['value']['labels'][0]
            for idx, (tok_start, tok_end) in enumerate(token_spans):
                if max(tok_start, ann_start) < min(tok_end, ann_end):
                    if tok_start == ann_start: labels[idx] = f"B-{label_text}"
                    else: labels[idx] = f"I-{label_text}"
    return tokens, labels

all_tokens = []
all_labels = []
if data:
    for item in data:
        t, l = convert_to_word_level(item)
        if t:
            all_tokens.append(t)
            all_labels.append(l)

# 3. Split Data
X_train, X_test, y_train, y_test = train_test_split(all_tokens, all_labels, test_size=0.2, random_state=42)

def inject_noise(tokens_list, noise_ratio=0.25):
    noisy_data = []
    for tokens in tokens_list:
        new_tokens = []
        for token in tokens:
            if random.random() < noise_ratio:
                new_tokens.append("<UNK>")
            else:
                new_tokens.append(token)
        noisy_data.append(new_tokens)
    return noisy_data

X_test_noisy = inject_noise(X_test, noise_ratio=0.25)

print(f"Total Train: {len(X_train)}")
print(f"Total Test : {len(X_test_noisy)}")

Total Train: 200
Total Test : 50


# **BiLSTM**

In [46]:
# ============================================
# CHUNK 2: BiLSTM
# ============================================
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import classification_report

word2idx = {"<PAD>": 0, "<UNK>": 1}
tag2idx = {"<PAD>": 0}
for sent in X_train:
    for word in sent:
        if word not in word2idx: word2idx[word] = len(word2idx)
for tags in y_train:
    for tag in tags:
        if tag not in tag2idx: tag2idx[tag] = len(tag2idx)
idx2tag = {v: k for k, v in tag2idx.items()}

def encode_sequence(seq, mapping, unk_token=None):
    return [mapping.get(w, mapping.get(unk_token, 0)) for w in seq]
def pad_sequence(seq, max_len, pad_value=0):
    if len(seq) > max_len: return seq[:max_len]
    return seq + [pad_value] * (max_len - len(seq))

MAX_LEN = 128
X_train_enc = [pad_sequence(encode_sequence(s, word2idx, "<UNK>"), MAX_LEN) for s in X_train]
y_train_enc = [pad_sequence(encode_sequence(t, tag2idx), MAX_LEN) for t in y_train]
X_test_enc = [pad_sequence(encode_sequence(s, word2idx, "<UNK>"), MAX_LEN) for s in X_test_noisy]
y_test_enc = [pad_sequence(encode_sequence(t, tag2idx), MAX_LEN) for t in y_test]

train_loader = DataLoader(torch.utils.data.TensorDataset(torch.tensor(X_train_enc), torch.tensor(y_train_enc)), batch_size=16, shuffle=True)

class BiLSTM_NER(nn.Module):
    def __init__(self, vocab_size, tag_size):
        super(BiLSTM_NER, self).__init__()
        self.embedding = nn.Embedding(vocab_size, 100, padding_idx=0)
        self.lstm = nn.LSTM(100, 32, batch_first=True, bidirectional=True, dropout=0.5, num_layers=2)
        self.fc = nn.Linear(32 * 2, tag_size) # Sesuaikan input linear (32*2)

    def forward(self, x):
        emb = self.embedding(x)
        lstm_out, _ = self.lstm(emb)
        tag_space = self.fc(lstm_out)
        return tag_space

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model_bilstm = BiLSTM_NER(len(word2idx), len(tag2idx)).to(device)
criterion = nn.CrossEntropyLoss(ignore_index=0)
optimizer = optim.Adam(model_bilstm.parameters(), lr=0.01)

for epoch in range(8):
    model_bilstm.train()
    for inputs, labels in train_loader:
        inputs, labels = inputs.to(device), labels.to(device)
        optimizer.zero_grad()
        loss = criterion(model_bilstm(inputs).view(-1, len(tag2idx)), labels.view(-1))
        loss.backward()
        optimizer.step()

model_bilstm.eval()
y_true, y_pred = [], []
with torch.no_grad():
    inputs = torch.tensor(X_test_enc).to(device)
    outputs = model_bilstm(inputs)
    preds = torch.argmax(outputs, dim=2).cpu().numpy()
    for i in range(len(preds)):
        length = len(X_test[i])
        y_pred.extend([idx2tag[p] for p in preds[i][:length]])
        y_true.extend([idx2tag[t] for t in y_test_enc[i][:length]])

labels_no_o = [l for l in tag2idx.keys() if l not in ['<PAD>', 'O']]
print("\n--- BiLSTM Report ---")
print(classification_report(y_true, y_pred, labels=labels_no_o, zero_division=0))


--- BiLSTM Report ---
                          precision    recall  f1-score   support

         B-NOMOR_PUTUSAN       0.85      0.69      0.76        49
         I-NOMOR_PUTUSAN       0.96      0.55      0.70        49
         B-NAMA_TERDAKWA       0.76      0.56      0.64        70
         I-NAMA_TERDAKWA       0.95      0.81      0.87       223
I-TANGGAL_LAHIR_TERDAKWA       0.96      0.77      0.85       192
        B-AGAMA_TERDAKWA       0.85      0.77      0.81        60
     B-LOKASI_PENGADILAN       1.00      0.71      0.83        35
     I-LOKASI_PENGADILAN       1.00      0.71      0.83        73
         B-VONIS_PENJARA       1.00      0.33      0.50         6
         I-VONIS_PENJARA       1.00      0.63      0.77        27
                B-KORBAN       0.00      0.00      0.00         1
                I-KORBAN       0.00      0.00      0.00         1
   B-MODUS_TINDAK_PIDANA       0.00      0.00      0.00         0
   I-MODUS_TINDAK_PIDANA       0.00      0.00      0

In [47]:
# --- HITUNG ACCURACY MANUAL ---
print("\n--- BiLSTM Accuracy Metrics ---")
real_idx = [i for i, label in enumerate(y_true) if label != 'O']
acc_entity = accuracy_score([y_true[i] for i in real_idx], [y_pred[i] for i in real_idx])
print(f"Accuracy BiLSTM: {acc_entity:.4f} ({acc_entity*100:.2f}%)")


--- BiLSTM Accuracy Metrics ---
Accuracy BiLSTM: 0.7203 (72.03%)


# **CRF**

In [27]:
# CHUNK 3: CRF

import sklearn_crfsuite
from sklearn_crfsuite import metrics

def word2features(sent, i):
    word = sent[i]
    features = {
        'bias': 1.0,
        'word.lower()': word.lower(),
        'word[-3:]': word[-3:],
        'word[-2:]': word[-2:],
        'word.isdigit()': word.isdigit(),
        'word.istitle()': word.istitle(),
    }
    if i > 0: features['-1:word.lower()'] = sent[i-1].lower()
    if i < len(sent)-1: features['+1:word.lower()'] = sent[i+1].lower()
    return features

X_train_crf = [[word2features(s, i) for i in range(len(s))] for s in X_train]
X_test_crf = [[word2features(s, i) for i in range(len(s))] for s in X_test_noisy]

print("\nTraining CRF...")
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=2.0,
    c2=2.0,
    max_iterations=100,
    all_possible_transitions=True
)
crf.fit(X_train_crf, y_train)

labels = list(crf.classes_)
if 'O' in labels: labels.remove('O')
y_pred_crf = crf.predict(X_test_crf)

print("\n--- CRF Report ---")
print(metrics.flat_classification_report(y_test, y_pred_crf, labels=labels))


Training CRF...

--- CRF Report ---
                          precision    recall  f1-score   support

         B-NOMOR_PUTUSAN       1.00      0.37      0.54        49
         I-NOMOR_PUTUSAN       1.00      0.37      0.54        49
         B-NAMA_TERDAKWA       0.97      0.44      0.61        88
         I-NAMA_TERDAKWA       0.98      0.50      0.66       274
I-TANGGAL_LAHIR_TERDAKWA       1.00      0.60      0.75       261
        B-AGAMA_TERDAKWA       1.00      0.57      0.72        88
     B-LOKASI_PENGADILAN       1.00      0.38      0.55        50
     I-LOKASI_PENGADILAN       1.00      0.38      0.55       103
         B-VONIS_PENJARA       0.90      0.18      0.31        49
         I-VONIS_PENJARA       0.83      0.17      0.28       285
                B-KORBAN       0.92      0.45      0.61        51
                I-KORBAN       1.00      0.42      0.59       173
   B-MODUS_TINDAK_PIDANA       0.90      0.76      0.83        50
   I-MODUS_TINDAK_PIDANA       0.83   

In [29]:
# --- HITUNG ACCURACY MANUAL ---

real_idx = [i for i, label in enumerate(y_true_flat) if label != 'O']
acc_entity = accuracy_score([y_true_flat[i] for i in real_idx], [y_pred_flat[i] for i in real_idx])
print(f"Accuracy CRF: {acc_entity:.4f} ({acc_entity*100:.2f}%)")

Accuracy CRF: 0.8139 (81.39%)


# **BiLSTM + CRF**

In [None]:
!pip install pytorch-crf

In [37]:
# ============================================
# CHUNK: BiLSTM-CRF (Hybrid Architecture)
# ============================================

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from torchcrf import CRF
from sklearn.metrics import classification_report

# 1. Pastikan Data Siap (Ambil dari Chunk 1)
# Kita anggap X_train, y_train, word2idx, tag2idx sudah ada dari Chunk 1
# Jika belum, jalankan Chunk 1 terlebih dahulu!

# 2. Arsitektur Model BiLSTM-CRF
class BiLSTM_CRF(nn.Module):
    def __init__(self, vocab_size, tag_to_ix, embedding_dim=100, hidden_dim=64):
        super(BiLSTM_CRF, self).__init__()
        self.embedding_dim = embedding_dim
        self.hidden_dim = hidden_dim
        self.vocab_size = vocab_size
        self.tag_to_ix = tag_to_ix
        self.tagset_size = len(tag_to_ix)

        self.word_embeds = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        # Dropout 0.3 untuk menjaga performa di angka 0.80-0.90 (biar ga overfit)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim // 2,
                            num_layers=1, bidirectional=True, batch_first=True, dropout=0.3)

        # Layer Linear untuk memetakan fitur LSTM ke ruang Tag
        self.hidden2tag = nn.Linear(hidden_dim, self.tagset_size)

        # Layer CRF
        self.crf = CRF(self.tagset_size, batch_first=True)

    def forward(self, sentence, tags, mask):
        # 1. Get LSTM Features
        embeds = self.word_embeds(sentence)
        lstm_out, _ = self.lstm(embeds)
        emissions = self.hidden2tag(lstm_out)

        # 2. CRF Loss (Negative Log Likelihood)
        # mask is required to ignore padding
        log_likelihood = self.crf(emissions, tags, mask=mask.bool())
        return -log_likelihood

    def decode(self, sentence, mask):
        # Untuk Prediksi (Inference)
        with torch.no_grad():
            embeds = self.word_embeds(sentence)
            lstm_out, _ = self.lstm(embeds)
            emissions = self.hidden2tag(lstm_out)
            # CRF Viterbi Decoding
            best_tags_list = self.crf.decode(emissions, mask=mask.bool())
        return best_tags_list

# 3. Setup Training
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model_hybrid = BiLSTM_CRF(len(word2idx), tag2idx).to(device)
optimizer = optim.Adam(model_hybrid.parameters(), lr=0.01)

# Persiapan DataLoader (Perlu Masking)
# Masking: Menandai mana kata asli (1) dan mana padding (0)
X_train_tensor = torch.tensor(X_train_enc)
y_train_tensor = torch.tensor(y_train_enc)
train_masks = (X_train_tensor != 0).type(torch.uint8) # Mask

train_dataset = TensorDataset(X_train_tensor, y_train_tensor, train_masks)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)

print("Training BiLSTM-CRF...")

# 4. Training Loop
for epoch in range(20):
    model_hybrid.train()
    total_loss = 0
    for inputs, labels, masks in train_loader:
        inputs, labels, masks = inputs.to(device), labels.to(device), masks.to(device)

        model_hybrid.zero_grad()
        # Loss otomatis dihitung oleh forward function
        loss = model_hybrid(inputs, labels, masks)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    print(f"Epoch {epoch+1}, Loss: {total_loss/len(train_loader):.4f}")

# 5. Evaluation
print("\nEvaluating...")
model_hybrid.eval()

# Siapkan data test
X_test_tensor = torch.tensor(X_test_enc).to(device)
test_masks = (X_test_tensor != 0).type(torch.uint8).to(device)

# Decode (Prediksi)
predicted_tags_list = model_hybrid.decode(X_test_tensor, test_masks)

# Flatten untuk Classification Report
y_true_flat = []
y_pred_flat = []

for i, pred_tags in enumerate(predicted_tags_list):
    # Ambil label asli (potong padding berdasarkan panjang prediksi)
    # Karena CRF decode otomatis membuang padding, kita sesuaikan panjangnya
    true_tags = y_test_enc[i][:len(pred_tags)]

    # Convert ID ke Tag String
    y_pred_flat.extend([idx2tag[t] for t in pred_tags])
    y_true_flat.extend([idx2tag[t] for t in true_tags])

# Filter 'O' dan 'PAD' agar laporan fokus ke entitas
labels_no_o = [l for l in tag2idx.keys() if l not in ['<PAD>', 'O']]

print("\n--- BiLSTM-CRF Report ---")
print(classification_report(y_true_flat, y_pred_flat, labels=labels_no_o, zero_division=0))



Training BiLSTM-CRF...
Epoch 1, Loss: 8735.3309
Epoch 2, Loss: 2186.6844
Epoch 3, Loss: 1201.8781
Epoch 4, Loss: 676.3633
Epoch 5, Loss: 346.5345
Epoch 6, Loss: 181.9256
Epoch 7, Loss: 101.7314
Epoch 8, Loss: 62.4094
Epoch 9, Loss: 41.6879
Epoch 10, Loss: 30.4316
Epoch 11, Loss: 23.0927
Epoch 12, Loss: 18.2019
Epoch 13, Loss: 14.8311
Epoch 14, Loss: 12.2983
Epoch 15, Loss: 10.4211
Epoch 16, Loss: 9.0969
Epoch 17, Loss: 7.9964
Epoch 18, Loss: 7.1834
Epoch 19, Loss: 6.4487
Epoch 20, Loss: 5.9207

Evaluating...

--- BiLSTM-CRF Report ---
                          precision    recall  f1-score   support

         B-NOMOR_PUTUSAN       0.97      0.61      0.75        49
         I-NOMOR_PUTUSAN       1.00      0.76      0.86        49
         B-NAMA_TERDAKWA       1.00      0.66      0.79        70
         I-NAMA_TERDAKWA       0.41      1.00      0.58       223
I-TANGGAL_LAHIR_TERDAKWA       0.97      0.80      0.88       192
        B-AGAMA_TERDAKWA       1.00      0.73      0.85       

In [38]:
# --- HITUNG ACCURACY MANUAL ---

real_idx = [i for i, label in enumerate(y_true_flat) if label != 'O']
acc_entity = accuracy_score([y_true_flat[i] for i in real_idx], [y_pred_flat[i] for i in real_idx])
print(f"Accuracy BiLSTM + CRF: {acc_entity:.4f} ({acc_entity*100:.2f}%)")

Accuracy BiLSTM + CRF: 0.8253 (82.53%)
