In [2]:
from transformers import AutoTokenizer, AutoModel
import torch
from torch.nn.functional import cosine_similarity

In [3]:
tokenizer = AutoTokenizer.from_pretrained("indobenchmark/indobert-base-p2")
model = AutoModel.from_pretrained("indobenchmark/indobert-base-p2")


tokenizer_config.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


config.json:   0%|          | 0.00/1.53k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/229k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/498M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/498M [00:00<?, ?B/s]

In [5]:
import pandas as pd
df_path="../dataset"
df = pd.read_csv(df_path+"/df_output_kalteng_sample.csv")     # Dataset utama (input teks survei)
kbli_db = pd.read_csv(df_path+"/kbli2020.csv")                # Database KBLI
kbji_db = pd.read_csv(df_path+"/kbji2014.csv") 

In [9]:
# Ambil 5 sampel teks dari survei
sample_texts = df["text_description"].dropna().sample(5, random_state=42).tolist()

sample_texts

['Aktivitas: Menjaga warung sembako pribadi. Produk: Jasa konsume. Usaha: Warung sembako milik sendiri.',
 'Aktivitas: Menjaga konservasi alam di danau masoraian. Produk: Konservasi alam danau masoraian. Usaha: Danau masoraian.',
 'Aktivitas: Membantu menjaga warung angkringan milik keluarga. Produk: Makanan dan minuman siap saji. Usaha: Warung makan.',
 'Aktivitas: MENJUAL MAKANAN RINGAN. Produk: Makanan ringan. Usaha: Penyediaan makanan ringan.',
 'Aktivitas: OPERATOR ALAT BERAT DI PT BGA. Produk: OPERATOR ALAT BERAT. Usaha: OPERATOR ALAT BERAT DI PT BGA.']

In [10]:
# Filter KBLI level 5 (digit == 5)
kbli_labels = kbli_db[kbli_db["digit"] == 5][["kode", "judul", "deskripsi"]].dropna()

# Filter KBJI level 4 (digit == 4) – label terlengkap
kbji_labels = kbji_db[kbji_db["digit"] == 4][["kode", "judul", "deskripsi"]].dropna()


In [11]:
# Fungsi untuk dapatkan pooled embedding dari teks
def get_embedding(text):
    tokens = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    with torch.no_grad():
        outputs = model(**tokens)
    return outputs.pooler_output.squeeze(0)

In [14]:
# Representasi KBLI
kbli_label_texts = (kbli_labels["judul"] + ". " + kbli_labels["deskripsi"].str[:256]).tolist()
kbli_embeddings = torch.stack([get_embedding(text) for text in kbli_label_texts])

# Representasi KBJI
kbji_label_texts = (kbji_labels["judul"] + ". " + kbji_labels["deskripsi"].str[:256]).tolist()
kbji_embeddings = torch.stack([get_embedding(text) for text in kbji_label_texts])


In [18]:
results = []

for text in sample_texts:
    input_emb = get_embedding(text)

    # Cosine similarity ke semua label
    sim_kbli = cosine_similarity(input_emb.unsqueeze(0), kbli_embeddings).squeeze()
    sim_kbji = cosine_similarity(input_emb.unsqueeze(0), kbji_embeddings).squeeze()

    # Ambil prediksi terbaik
    kbli_idx = sim_kbli.argmax().item()
    kbji_idx = sim_kbji.argmax().item()

    pred_kbli = kbli_labels.iloc[kbli_idx]
    pred_kbji = kbji_labels.iloc[kbji_idx]

    results.append({
        "input_text": text,
        "kbli_pred_code": pred_kbli["kode"],
        "kbli_pred_label": pred_kbli["judul"],
        "kbji_pred_code": pred_kbji["kode"],
        "kbji_pred_label": pred_kbji["judul"],
    })


In [19]:
import pandas as pd
df_results = pd.DataFrame(results)
df_results

Unnamed: 0,input_text,kbli_pred_code,kbli_pred_label,kbji_pred_code,kbji_pred_label
0,Aktivitas: Menjaga warung sembako pribadi. Pro...,82110,Aktivitas Penyedia Gabungan Jasa Administrasi ...,5131.0,Pramusaji
1,Aktivitas: Menjaga konservasi alam di danau ma...,91038,Hutan Lindung,5164.0,Pelatih dan Pekerja Perawatan Hewan
2,Aktivitas: Membantu menjaga warung angkringan ...,87901,Aktivitas Panti Asuhan Pemerintah,5164.0,Pelatih dan Pekerja Perawatan Hewan
3,Aktivitas: MENJUAL MAKANAN RINGAN. Produk: Mak...,23111,Industri Kaca Lembaran,7126.0,Pekerja Perpipaan
4,Aktivitas: OPERATOR ALAT BERAT DI PT BGA. Prod...,78431,Pelatihan Kerja Teknik Perusahaan,3113.0,Teknisi Teknik Listrik


In [20]:
# Ambil ground truth berdasarkan text_description
df_truth = df[df["text_description"].isin([r["input_text"] for r in results])][[
    "text_description", "kbli_code", "kbli_label", "kbji_code", "kbji_label"
]]

# Konversi prediksi ke DataFrame
df_pred = pd.DataFrame(results)

# Gabungkan berdasarkan text_description
df_eval = df_pred.merge(df_truth, left_on="input_text", right_on="text_description")

# Tambahkan kolom evaluasi akurasi
df_eval["kbli_match"] = df_eval["kbli_pred_code"] == df_eval["kbli_code"]
df_eval["kbji_match"] = df_eval["kbji_pred_code"] == df_eval["kbji_code"]

df_eval[[
    "input_text",
    "kbli_pred_code", "kbli_code", "kbli_match",
    "kbji_pred_code", "kbji_code", "kbji_match"
]]


Unnamed: 0,input_text,kbli_pred_code,kbli_code,kbli_match,kbji_pred_code,kbji_code,kbji_match
0,Aktivitas: Menjaga warung sembako pribadi. Pro...,82110,47241,False,5131.0,5221,False
1,Aktivitas: Menjaga konservasi alam di danau ma...,91038,91035,False,5164.0,5414,False
2,Aktivitas: Membantu menjaga warung angkringan ...,87901,56102,False,5164.0,5169,False
3,Aktivitas: MENJUAL MAKANAN RINGAN. Produk: Mak...,23111,56103,False,7126.0,5211,False
4,Aktivitas: OPERATOR ALAT BERAT DI PT BGA. Prod...,78431,1262,False,3113.0,8341,False
