## Baseline KBLI Classification

## load data


In [14]:
import pandas as pd
df_path="../dataset/clean/subset_kbli_classify.csv"

df=pd.read_csv(df_path,quotechar='"',encoding="utf-8",dtype=str)

df.head()






Unnamed: 0,text_description,kbli_code,text_length
0,aktivitas: membantu menjemur cengke. produk: c...,1282,11
1,aktivitas: tenaga honorer guru bahasa indonesi...,85230,21
2,aktivitas: membersihkan rumput di kebun kopi. ...,1270,12
3,aktivitas: jual kueh putu mayang keliling. pro...,47991,13
4,"aktivitas: dosen unwina (dosen tidak tetap, ma...",85321,13


In [16]:
df_kbli=df.copy()
# Identifikasi kelas dengan minimal 2 sampel
valid_classes = df_kbli["kbli_code"].value_counts()
valid_classes = valid_classes[valid_classes >= 10].index

# Filter dataset hanya untuk kelas valid tersebut
df_kbli_filtered = df_kbli[df_kbli["kbli_code"].isin(valid_classes)]

# Ambil subset 10% stratified
from sklearn.model_selection import train_test_split

df_subset, _ = train_test_split(
    df_kbli_filtered,
    test_size=0.5,
    stratify=df_kbli_filtered['kbli_code'],
    random_state=42
)




len(df_subset)

103030

In [17]:
df=df_subset.copy()

In [18]:
len(df)

103030

In [19]:
## make label maps

# Buat label_map dari seluruh data
all_labels = df["kbli_code"].tolist()
label_map = {label: idx for idx, label in enumerate(sorted(set(all_labels)))}
idx2label = {v: k for k, v in label_map.items()}
print(label_map)

{'01111': 0, '01112': 1, '01113': 2, '01114': 3, '01115': 4, '01116': 5, '01117': 6, '01118': 7, '01119': 8, '01121': 9, '01122': 10, '01131': 11, '01132': 12, '01133': 13, '01134': 14, '01135': 15, '01136': 16, '01139': 17, '01140': 18, '01150': 19, '01160': 20, '01191': 21, '01193': 22, '01199': 23, '01220': 24, '01230': 25, '01240': 26, '01252': 27, '01253': 28, '01259': 29, '01261': 30, '01262': 31, '01269': 32, '01270': 33, '01281': 34, '01282': 35, '01283': 36, '01284': 37, '01285': 38, '01286': 39, '01289': 40, '01291': 41, '01299': 42, '01301': 43, '01302': 44, '01411': 45, '01412': 46, '01413': 47, '01420': 48, '01441': 49, '01442': 50, '01443': 51, '01450': 52, '01461': 53, '01462': 54, '01463': 55, '01464': 56, '01465': 57, '01466': 58, '01468': 59, '01469': 60, '01497': 61, '01499': 62, '01611': 63, '01612': 64, '01613': 65, '01619': 66, '01629': 67, '01630': 68, '01712': 69, '01714': 70, '01719': 71, '01724': 72, '01727': 73, '02111': 74, '02113': 75, '02119': 76, '02121':

### Import Model Library

In [20]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import torch
from torch.utils.data import Dataset,DataLoader
from sklearn.model_selection import train_test_split




### Tokenize and vectorize the data

In [21]:
tokenizer=AutoTokenizer.from_pretrained("indobenchmark/indobert-base-p1")

## make dataset custom
class KBLIDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, label_map, max_length=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.label_map = label_map

    def __len__(self):
        return len(self.texts)
        
    def __getitem__(self, idx):
        text = str(self.texts[idx])
        label = self.labels[idx]
        encoding = self.tokenizer(
            text,
            padding="max_length",
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt",
        )
        return {
            "input_ids": encoding["input_ids"].squeeze(0),
            "attention_mask": encoding["attention_mask"].squeeze(0),
            "labels": torch.tensor(self.label_map[label], dtype=torch.long)
        }




In [33]:
label_counts = df["kbli_code"].value_counts()
valid_labels = label_counts[label_counts >=100].index

df_filtered = df[df["kbli_code"].isin(valid_labels)].copy()

len(df_filtered)

unique_code,n_sample=df_filtered["kbli_code"].nunique(), len(df_kbli)

(unique_code,n_sample)

(170, 208564)

In [43]:
# Buat label_map dari seluruh data
all_labels = df_filtered["kbli_code"].tolist()
label_map = {label: idx for idx, label in enumerate(sorted(set(all_labels)))}
idx2label = {v: k for k, v in label_map.items()}
print(label_map)

{'01111': 0, '01114': 1, '01115': 2, '01116': 3, '01121': 4, '01122': 5, '01131': 6, '01132': 7, '01133': 8, '01134': 9, '01135': 10, '01139': 11, '01140': 12, '01150': 13, '01220': 14, '01252': 15, '01261': 16, '01262': 17, '01270': 18, '01281': 19, '01282': 20, '01283': 21, '01284': 22, '01286': 23, '01289': 24, '01291': 25, '01299': 26, '01411': 27, '01441': 28, '01442': 29, '01450': 30, '01461': 31, '01462': 32, '01464': 33, '01613': 34, '02202': 35, '02301': 36, '02309': 37, '03111': 38, '03112': 39, '03121': 40, '03217': 41, '05100': 42, '07291': 43, '07301': 44, '08104': 45, '10421': 46, '10431': 47, '10631': 48, '10710': 49, '10722': 50, '10750': 51, '10792': 52, '10794': 53, '10799': 54, '11052': 55, '13121': 56, '13122': 57, '14111': 58, '14120': 59, '15202': 60, '16101': 61, '16211': 62, '16221': 63, '16291': 64, '16292': 65, '18111': 66, '23921': 67, '25111': 68, '31001': 69, '32903': 70, '32909': 71, '38110': 72, '41011': 73, '41012': 74, '41019': 75, '42101': 76, '42919':

In [34]:
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df_filtered["text_description"],
    df_filtered["kbli_code"],
    test_size=0.2,
    stratify=df_filtered["kbli_code"],
    random_state=42
)

# Dataset dan DataLoader
train_dataset = KBLIDataset(train_texts.tolist(), train_labels.tolist(), tokenizer, label_map)
val_dataset = KBLIDataset(val_texts.tolist(), val_labels.tolist(), tokenizer, label_map)
train_loader = DataLoader(train_dataset, batch_size=2, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=2)

#### check sample tokenization

In [44]:
sample = train_dataset[0]
print("Input IDs:", sample['input_ids'])
print("Attention Mask:", sample['attention_mask'])
print("Label ID:", sample['labels'])


Input IDs: tensor([    2,  2310, 30472,  8881, 18336,    26,  1219,   448, 30470,   497,
        30472, 18336, 30470,  1062, 30472,  6540,  2318,   536, 30470,     3,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,    

#### Fine-Tuning IndoBert

In [36]:
import torch.nn as nn
from transformers import AutoModel

class IndoBertForKBLI(nn.Module):
    def __init__(self, num_labels):
        super(IndoBertForKBLI, self).__init__()
        self.indobert = AutoModel.from_pretrained("indobenchmark/indobert-base-p1")
        self.dropout = nn.Dropout(0.3)
        self.classifier = nn.Linear(self.indobert.config.hidden_size, num_labels)

    def forward(self, input_ids, attention_mask):
        outputs =self.indobert(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = outputs.pooler_output
        dropped_output = self.dropout(pooled_output)
        logits = self.classifier(dropped_output)
        return logits

#### Training Preparation

In [45]:
print("Jumlah kelas unik (class_weights):", len(class_weights))
train_label_ids = [label_map[label] for label in train_labels]

print("Min label ID:", min(train_label_ids))
print("Max label ID:", max(train_label_ids))
print("Jumlah kelas unik:", len(set(train_label_ids)))

print("Mapping label2id:", label_map)
print("Jumlah kelas (dari label_map):", len(label_map))


Jumlah kelas unik (class_weights): 170
Min label ID: 0
Max label ID: 169
Jumlah kelas unik: 170
Mapping label2id: {'01111': 0, '01114': 1, '01115': 2, '01116': 3, '01121': 4, '01122': 5, '01131': 6, '01132': 7, '01133': 8, '01134': 9, '01135': 10, '01139': 11, '01140': 12, '01150': 13, '01220': 14, '01252': 15, '01261': 16, '01262': 17, '01270': 18, '01281': 19, '01282': 20, '01283': 21, '01284': 22, '01286': 23, '01289': 24, '01291': 25, '01299': 26, '01411': 27, '01441': 28, '01442': 29, '01450': 30, '01461': 31, '01462': 32, '01464': 33, '01613': 34, '02202': 35, '02301': 36, '02309': 37, '03111': 38, '03112': 39, '03121': 40, '03217': 41, '05100': 42, '07291': 43, '07301': 44, '08104': 45, '10421': 46, '10431': 47, '10631': 48, '10710': 49, '10722': 50, '10750': 51, '10792': 52, '10794': 53, '10799': 54, '11052': 55, '13121': 56, '13122': 57, '14111': 58, '14120': 59, '15202': 60, '16101': 61, '16211': 62, '16221': 63, '16291': 64, '16292': 65, '18111': 66, '23921': 67, '25111': 68

In [55]:
train_label_ids = [label_map[label] for label in train_labels]
train_label_ids

[83,
 25,
 16,
 146,
 73,
 8,
 104,
 84,
 16,
 13,
 59,
 5,
 92,
 8,
 25,
 38,
 5,
 7,
 14,
 159,
 10,
 83,
 20,
 132,
 16,
 73,
 121,
 17,
 0,
 73,
 126,
 9,
 52,
 112,
 5,
 6,
 8,
 167,
 0,
 152,
 10,
 83,
 25,
 17,
 6,
 10,
 58,
 28,
 159,
 69,
 163,
 25,
 128,
 5,
 166,
 123,
 83,
 5,
 17,
 17,
 41,
 20,
 17,
 120,
 25,
 59,
 169,
 132,
 145,
 156,
 104,
 168,
 5,
 21,
 15,
 141,
 5,
 25,
 168,
 152,
 10,
 140,
 83,
 25,
 8,
 48,
 120,
 17,
 43,
 73,
 145,
 25,
 10,
 83,
 39,
 86,
 119,
 134,
 30,
 64,
 58,
 29,
 53,
 5,
 13,
 38,
 60,
 158,
 75,
 25,
 8,
 119,
 116,
 5,
 134,
 10,
 73,
 82,
 58,
 101,
 9,
 134,
 147,
 146,
 25,
 25,
 132,
 5,
 0,
 10,
 17,
 113,
 31,
 67,
 49,
 38,
 5,
 132,
 73,
 116,
 158,
 73,
 46,
 4,
 146,
 4,
 168,
 0,
 107,
 75,
 18,
 69,
 5,
 169,
 132,
 17,
 82,
 46,
 120,
 121,
 5,
 156,
 140,
 5,
 9,
 29,
 0,
 8,
 108,
 76,
 134,
 17,
 27,
 122,
 0,
 83,
 10,
 25,
 21,
 121,
 5,
 78,
 93,
 73,
 29,
 8,
 113,
 5,
 83,
 38,
 5,
 86,
 0,
 47,
 83,
 9,
 83,

In [57]:
label_map.shape(0)

AttributeError: 'dict' object has no attribute 'shape'

In [52]:
from torch.optim import AdamW
from sklearn.utils.class_weight import compute_class_weight
import numpy as np

device=torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Ubah label string ke ID berdasarkan label_map
train_label_ids = [label_map[label] for label in train_labels]
# Hitung class weight dari ID, bukan string
class_weights = compute_class_weight(
    class_weight="balanced",
    classes=np.arange(len(label_map)),  # semua ID label, misal 0–956
    y=train_label_ids
)

class_weights=torch.tensor(class_weights,dtype=torch.float).to(device)

## init model
num_labels=len(np.unique(train_labels))
model=IndoBertForKBLI(num_labels=num_labels).to(device)

## init optimizer
optimizer=AdamW(model.parameters(),lr=1e-5)

## init loss
loss_fn=torch.nn.CrossEntropyLoss(weight=class_weights)

RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


### Train with epochs

In [12]:
print(device)

cuda


In [13]:
from tqdm import tqdm

EPOCHS = 5
model.train()

for epoch in range(EPOCHS):
    print(f"Epoch {epoch+1}/{EPOCHS}")
    epoch_loss = 0

    for batch in tqdm(train_loader):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device).long()  # Ensure labels are of type LongTensor

        outputs = model(input_ids, attention_mask)
        loss = loss_fn(outputs, labels)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()

    print(f"Rata-rata Loss: {epoch_loss / len(train_loader):.4f}")

##save model
torch.save(model.state_dict(), "../outputs/checkpoints/indobert_kbli_baseline.pt")


Epoch 1/5


  0%|          | 3/719 [00:01<04:09,  2.87it/s]


RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


### Save model

In [14]:
torch.save(model.state_dict(), "../outputs/checkpoints/indobert_kbli_baseline.pt")


### Evaluation

In [36]:
from sklearn.metrics import f1_score

model.eval()
all_preds = []
all_labels = []

with torch.no_grad():
    for batch in val_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        outputs = model(input_ids, attention_mask)
        preds = torch.argmax(outputs, dim=1)

        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())

# Skor F1
f1 = f1_score(all_labels, all_preds, average='macro')
print("F1-Score (macro):", f1)



F1-Score (macro): 0.0003387927801395947


NameError: name 'accuracy_score' is not defined

In [37]:
from sklearn.metrics import accuracy_score
acc = accuracy_score(all_labels, all_preds)
print("Accuracy:", acc)

Accuracy: 0.0019193857965451055


In [38]:
print("Pred:", preds[:10])
print("Label:", labels[:10])
print("Jumlah kelas:", len(set(train_labels)))
print("Shape output logits:", outputs.shape)  # Harus (batch_size, num_labels)


Pred: tensor([700,  75, 218, 513, 413, 862,  53, 862], device='cuda:0')
Label: tensor([397, 608,   4, 286, 251, 487,   2, 487], device='cuda:0')
Jumlah kelas: 1101
Shape output logits: torch.Size([8, 1101])


In [40]:
from sklearn.metrics import confusion_matrix
print(confusion_matrix(all_labels, all_preds))


[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 3]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]
