## Baseline KBLI Classification

## Load data


In [3]:
import pandas as pd
# Location of the cleaned KBLI subset, relative to this notebook.
df_path = "../dataset/clean/subset_kbli_classify.csv"

# dtype=str keeps every column as text, so kbli_code is never parsed as a number.
df = pd.read_csv(
    df_path,
    dtype=str,
    encoding="utf-8",
    quotechar='"',
)

# Preview the first rows.
df.head()



Unnamed: 0,text_description,kbli_code,text_length
0,aktivitas: membantu menjemur cengke. produk: c...,1282,11
1,aktivitas: tenaga honorer guru bahasa indonesi...,85230,21
2,aktivitas: membersihkan rumput di kebun kopi. ...,1270,12
3,aktivitas: jual kueh putu mayang keliling. pro...,47991,13
4,"aktivitas: dosen unwina (dosen tidak tetap, ma...",85321,13


### Import Model Library

In [5]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import torch
from torch.utils.data import Dataset,DataLoader
from sklearn.model_selection import train_test_split




### Tokenize and vectorize the data

In [7]:
# Tokenizer that belongs to the pretrained IndoBERT checkpoint.
checkpoint_name = "indobenchmark/indobert-base-p1"
tokenizer = AutoTokenizer.from_pretrained(checkpoint_name)

## make dataset custom
class KBLIDataset(Dataset):
    """Torch dataset that tokenizes descriptions and encodes their labels as ids.

    Args:
        texts: Sequence of descriptions; each item is passed through ``str()``.
        labels: Sequence of class labels, aligned with ``texts``.
        tokenizer: Callable invoked as ``tokenizer(text, padding=...,
            truncation=..., max_length=..., return_tensors="pt")`` that returns
            a mapping with ``"input_ids"`` and ``"attention_mask"`` tensors.
        max_length: Length every sequence is padded / truncated to.
        label_map: Optional ``{label: id}`` mapping to reuse, e.g. the one of
            the training dataset, so that several datasets agree on the ids.
            When omitted the mapping is derived from ``labels``.

    Raises:
        ValueError: If ``label_map`` is given and ``labels`` contains a value
            that is missing from it.
    """

    def __init__(self, texts, labels, tokenizer, max_length=128, label_map=None):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

        if label_map is None:
            # Sort the distinct labels: iterating a bare set() gives an order
            # that can change between interpreter runs, which would silently
            # change which id each class receives.
            self.label_map = {label: idx for idx, label in enumerate(sorted(set(labels)))}
        else:
            # Copy so later changes to the caller's dict do not leak in.
            self.label_map = dict(label_map)
            unknown = set(labels) - set(self.label_map)
            if unknown:
                raise ValueError(
                    f"labels not present in label_map: {sorted(unknown, key=str)}"
                )

        # Reverse lookup: id -> label.
        self.idx2label = {idx: label for label, idx in self.label_map.items()}

    def __len__(self):
        """Number of samples."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize sample ``idx`` and return its tensors.

        Returns:
            dict with ``"input_ids"`` and ``"attention_mask"`` (the tokenizer
            output with the leading batch dimension removed) and ``"labels"``
            (a scalar ``torch.long`` tensor holding the class id).
        """
        text = str(self.texts[idx])
        label = self.labels[idx]
        encoding = self.tokenizer(
            text,
            padding="max_length",
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt",
        )
        return {
            # return_tensors="pt" adds a batch axis of size 1; drop it so the
            # DataLoader can stack samples itself.
            "input_ids": encoding["input_ids"].squeeze(0),
            "attention_mask": encoding["attention_mask"].squeeze(0),
            "labels": torch.tensor(self.label_map[label], dtype=torch.long),
        }

In [18]:

# Split into train and validation sets
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df["text_description"], df["kbli_code"], test_size=0.2, random_state=42
)

# One label -> id table for BOTH splits, built from the training labels and
# sorted so the ids are reproducible between runs.
label_map = {label: idx for idx, label in enumerate(sorted(set(train_labels)))}
idx2label = {idx: label for label, idx in label_map.items()}

# A code that only occurs in the validation split has no id in that table,
# so those rows cannot be scored; drop them and report how many.
val_seen = val_labels.isin(list(label_map))
print(f"Dropping {int((~val_seen).sum())} validation rows whose kbli_code is absent from the training split.")

# Datasets
train_dataset = KBLIDataset(train_texts.tolist(), train_labels.tolist(), tokenizer=tokenizer)
val_dataset = KBLIDataset(
    val_texts[val_seen].tolist(), val_labels[val_seen].tolist(), tokenizer=tokenizer
)

# Each KBLIDataset derives its own mapping from the labels it is given, which
# would give the same code different ids in train and validation. Install the
# shared table on both so targets mean the same thing everywhere.
for dataset in (train_dataset, val_dataset):
    dataset.label_map = label_map
    dataset.idx2label = idx2label

# DataLoaders
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16)


#### Check sample tokenization

In [12]:
# Show the tensors produced for the first training sample.
sample = train_dataset[0]
for caption, field in (
    ("Input IDs:", "input_ids"),
    ("Attention Mask:", "attention_mask"),
    ("Label ID:", "labels"),
):
    print(caption, sample[field])


Input IDs: tensor([    2,  2310, 30472,  3952,    26,  4896, 14767, 10027,  8546, 30354,
          984, 30354, 30470,   497, 30472,  1416,   701, 30470,  1062, 30472,
          701,  4896, 30470,     3,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,    

#### Fine-Tuning IndoBert

In [15]:
import torch.nn as nn
from transformers import AutoModel

class IndoBertForKBLI(nn.Module):
    """Pretrained IndoBERT encoder followed by dropout and a linear head.

    Args:
        num_labels: Width of the output layer (one logit per class).
    """

    def __init__(self, num_labels):
        super().__init__()
        self.indobert = AutoModel.from_pretrained("indobenchmark/indobert-base-p1")
        self.dropout = nn.Dropout(0.3)
        hidden_size = self.indobert.config.hidden_size
        self.classifier = nn.Linear(hidden_size, num_labels)

    def forward(self, input_ids, attention_mask):
        """Return the linear head's logits for the given batch."""
        encoded = self.indobert(input_ids=input_ids, attention_mask=attention_mask)
        # The encoder's pooled output goes through dropout, then the head.
        return self.classifier(self.dropout(encoded.pooler_output))

#### Training Preparation

In [None]:
from torch.optim import AdamW
from sklearn.utils.class_weight import compute_class_weight
import numpy as np

# Use the GPU when one is available.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# CrossEntropyLoss applies weight[i] to the class whose target id is i, so the
# weights must be listed in the id order of the training dataset. The order of
# np.unique(train_labels) is not guaranteed to match that order.
classes_by_id = np.array(
    [train_dataset.idx2label[i] for i in range(len(train_dataset.label_map))],
    dtype=object,
)

# "balanced" weights are inversely proportional to class frequency; they are
# returned in the same order as `classes`.
class_weights = compute_class_weight(
    class_weight="balanced",
    classes=classes_by_id,
    y=train_labels,
)

class_weights = torch.tensor(class_weights, dtype=torch.float).to(device)

## init model
# One output logit per id in the dataset's label mapping.
num_labels = len(classes_by_id)
model = IndoBertForKBLI(num_labels=num_labels).to(device)

## init optimizer
optimizer = AdamW(model.parameters(), lr=1e-5)

## init loss
loss_fn = torch.nn.CrossEntropyLoss(weight=class_weights)



### Train for a single epoch (no epoch loop)

In [26]:
from tqdm import tqdm

# One pass over the training data.
model.train()
for batch in tqdm(train_loader):
    input_ids = batch['input_ids'].to(device)
    attention_mask = batch['attention_mask'].to(device)
    labels = batch['labels'].to(device)

    outputs = model(input_ids, attention_mask)
    loss = loss_fn(outputs, labels)

    # Clear the previous step's gradients before back-propagating.
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

print("Training selesai untuk 1 epoch.")
# loss.item() is a float; float + str raises TypeError, so format it instead.
# This is the loss of the last batch only.
print(f"{loss.item()}: Training selesai untuk 1 epoch.")




  0%|          | 14/10429 [07:54<98:05:36, 33.91s/it] 


KeyboardInterrupt: 

### Train with epochs

In [None]:
from tqdm import tqdm

# Number of full passes over the training data.
EPOCHS = 5
model.train()

for epoch_number in range(1, EPOCHS + 1):
    print(f"Epoch {epoch_number}/{EPOCHS}")
    batch_losses = []

    for batch in tqdm(train_loader):
        logits = model(
            batch['input_ids'].to(device),
            batch['attention_mask'].to(device),
        )
        loss = loss_fn(logits, batch['labels'].to(device))

        # Standard step: reset gradients, back-propagate, update weights.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        batch_losses.append(loss.item())

    # Mean of the per-batch losses for this epoch.
    print(f"Rata-rata Loss: {sum(batch_losses) / len(train_loader):.4f}")


### Save model

In [None]:
# Store only the model's state_dict (its parameters and buffers).
trained_state = model.state_dict()
torch.save(trained_state, "indobert_kbli_baseline.pt")


### Evaluation

In [None]:
from sklearn.metrics import f1_score

# Score the validation split with macro-averaged F1.
model.eval()  # evaluation mode: the model's dropout layer becomes a no-op
all_preds = []
all_labels = []

with torch.no_grad():  # no gradients are needed for inference
    for batch in val_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        # KBLIDataset emits the target under the key "labels" (plural);
        # indexing with 'label' raised KeyError.
        labels = batch['labels'].to(device)

        outputs = model(input_ids, attention_mask)
        # Predicted class id = index of the largest logit in each row.
        preds = torch.argmax(outputs, dim=1)

        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())

# F1 score
f1 = f1_score(all_labels, all_preds, average='macro')
print("F1-Score (macro):", f1)
