In [6]:
import pandas as pd
df_path = "../dataset/clean/subset_kbli_classify.csv"

# Read every column as text so KBLI codes are treated as strings, not numbers.
df = pd.read_csv(df_path, quotechar='"', encoding="utf-8", dtype=str)

# Work on an independent copy of the first 1000 rows so that adding columns
# neither touches `df` nor triggers SettingWithCopyWarning.
df_subset = df.head(1000).copy()

# The file contains 4-character codes such as "1282" next to 5-character ones
# such as "85230". Slicing the raw string would turn "1282" into the prefixes
# "1", "12", ... instead of "0", "01", ...; left-pad with zeros to 5 characters
# first so every prefix refers to the same hierarchy level.
kbli_code_padded = df_subset["kbli_code"].str.zfill(5)

# level1 .. level5 hold the 1- to 5-character prefixes of the padded code
# (level5 is the full padded code). The original `kbli_code` column is kept as read.
for n_digits in range(1, 6):
    df_subset[f"level{n_digits}"] = kbli_code_padded.str[:n_digits]

df_subset.head()


Unnamed: 0,text_description,kbli_code,text_length,level1,level2,level3,level4,level5
0,aktivitas: membantu menjemur cengke. produk: c...,1282,11,0,1,12,128,1282
1,aktivitas: tenaga honorer guru bahasa indonesi...,85230,21,8,85,852,8523,85230
2,aktivitas: membersihkan rumput di kebun kopi. ...,1270,12,0,1,12,127,1270
3,aktivitas: jual kueh putu mayang keliling. pro...,47991,13,4,47,479,4799,47991
4,"aktivitas: dosen unwina (dosen tidak tetap, ma...",85321,13,8,85,853,8532,85321


In [8]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import torch
from torch.utils.data import Dataset,DataLoader
from sklearn.model_selection import train_test_split

In [9]:
class HierarchicalKBLITextDataset(Dataset):
    """Text dataset that yields one tokenized text plus one label id per hierarchy level.

    Args:
        texts: Sequence of texts (list, array, pandas Series, ...). It is
            materialised into a list so that items are always addressed by
            position, regardless of any pandas index the input carried.
        level_labels_dict: Mapping ``level name -> sequence of labels``, each
            sequence aligned position-by-position with ``texts``.
        tokenizer: Callable invoked as
            ``tokenizer(text, padding='max_length', truncation=True,
            max_length=..., return_tensors='pt')`` whose result supports
            ``['input_ids']`` and ``['attention_mask']``.
        max_length: Value forwarded to the tokenizer as ``max_length``.
        label2id: Optional mapping ``level name -> {label: id}``. When omitted,
            a mapping is built per level from the sorted distinct labels of
            this dataset. Pass the mapping of another dataset (e.g. the
            training set) to keep ids identical across datasets.

    Raises:
        ValueError: If a level's label sequence has a different length than ``texts``.
    """

    def __init__(self, texts, level_labels_dict, tokenizer, max_length=128, label2id=None):
        # Convert to plain lists: indexing a pandas Series with an integer is
        # label-based, so a shuffled / filtered index (e.g. after
        # train_test_split) would raise KeyError or return the wrong row.
        self.texts = list(texts)
        self.labels_dict = {
            level: list(labels) for level, labels in level_labels_dict.items()
        }
        self.tokenizer = tokenizer
        self.max_length = max_length

        # Fail early on misaligned inputs instead of at some later __getitem__.
        for level, labels in self.labels_dict.items():
            if len(labels) != len(self.texts):
                raise ValueError(
                    f"Level '{level}' has {len(labels)} labels but there are "
                    f"{len(self.texts)} texts"
                )

        # Build label2id per level unless a shared mapping was supplied.
        if label2id is None:
            label2id = {
                level: {label: i for i, label in enumerate(sorted(set(labels)))}
                for level, labels in self.labels_dict.items()
            }
        self.label2id = label2id

    def __len__(self):
        """Return the number of texts."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return a dict with ``input_ids``, ``attention_mask`` and one label id tensor per level.

        Raises:
            KeyError: If a label at ``idx`` is missing from ``label2id`` for its level.
        """
        text = str(self.texts[idx])
        encoding = self.tokenizer(text, padding='max_length', truncation=True,
                                  max_length=self.max_length, return_tensors='pt')
        # squeeze(0) drops the leading dimension of the tokenizer output
        # (presumably a batch dimension of size 1 for a single text).
        item = {
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0)
        }

        # Encode the label of every level as a scalar long tensor.
        for level, labels in self.labels_dict.items():
            item[level] = torch.tensor(self.label2id[level][labels[idx]], dtype=torch.long)

        return item
