In [None]:
import pandas as pd
from sentence_transformers import SentenceTransformer, InputExample, losses, util
from torch.utils.data import DataLoader
import time
import re
import torch
import os
import random
import numpy as np

In [None]:
# --- 0. Hyperparameter configuration ---

# --- Bi-Encoder (LaBSE) parameters ---
BI_ENCODER_MODEL_NAME = 'sentence-transformers/LaBSE'
# Directory the fine-tuned model is saved to (and reloaded from when it already exists).
BI_ENCODER_MODEL_PATH = './fine_tuned_labse_vector_model'
BI_ENCODER_BATCH_SIZE = 16
BI_ENCODER_NUM_EPOCHS = 4
# Fraction of the total training steps spent on learning-rate warm-up.
BI_ENCODER_WARMUP_FRACTION = 0.1
# Learning rate used specifically for the Bi-Encoder.
BI_ENCODER_LEARNING_RATE = 2e-5

# --- Retrieval parameters ---
# Number of documents retrieved per query.
TOP_K_CANDIDATES = 1459

# --- General ---
# Prefer the GPU when one is visible to torch, otherwise run on the CPU.
if torch.cuda.is_available():
    DEVICE = 'cuda'
else:
    DEVICE = 'cpu'
print(f"Menggunakan perangkat: {DEVICE}")

# --- Random-seed helper ---
def set_seed(seed_value):
    """Seed Python's `random`, NumPy and torch, and pin the cuDNN flags.

    Args:
        seed_value: Seed passed unchanged to every generator.
    """
    # Same seed for each library, in the order random -> numpy -> torch.
    for seeder in (random.seed, np.random.seed, torch.manual_seed):
        seeder(seed_value)

    # CUDA generators are only seeded when a CUDA device is available.
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed_value)

    # Request deterministic cuDNN kernels and switch off the auto-tuner.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False


Menggunakan perangkat: cuda


In [None]:
### Part 1: Data Preprocessing

# --- Part 1: Data Preprocessing ---
# Progress banner printed before the preprocessing helpers are defined and the datasets are loaded.
print("Memulai proses Data Preprocessing...")

# Mars-text preprocessing that keeps the structurally important characters.
def preprocess_mars_text(text):
    """Replace each encoded token of a Mars sentence by its root text.

    The sentence is split on whitespace. For every token the last run of
    digits is read as a length N and the last 'x' is located; when at least
    N characters precede that 'x', the token is replaced by the N characters
    immediately before it. Tokens without digits, without an 'x', or with too
    few characters before the 'x' are kept unchanged (this covers punctuation
    and words that do not follow the pattern).

    Args:
        text: Raw sentence.

    Returns:
        The processed tokens joined by single spaces.
    """
    processed = []
    for token in text.split():
        root = None  # None means "pattern not matched, keep the token as is"
        digit_runs = re.findall(r'\d+', token)
        if digit_runs:
            try:
                root_length = int(digit_runs[-1])
                marker = token.rfind('x')  # search for 'x' from the right
                start = marker - root_length
                if marker != -1 and start >= 0:
                    # KEY: the root is taken verbatim, with no re.sub() clean-up,
                    # so characters such as '/', '-', '(' and '.' are preserved.
                    root = token[start:marker]
            except (ValueError, IndexError):
                # On error fall through to the unchanged-token fallback.
                root = None
        processed.append(token if root is None else root)

    return " ".join(processed)

# Query reader: applies the Mars preprocessing to every query text.
def read_queries(file_path):
    """Read a UTF-8 query file into a dict of preprocessed query texts.

    Each line is stripped and split on a double tab. Only lines that yield
    exactly two fields (id, text) are kept; any other line is skipped.

    Args:
        file_path: Path of the query file.

    Returns:
        dict mapping the first field to `preprocess_mars_text` of the second.
    """
    queries = {}
    with open(file_path, 'r', encoding='utf-8') as handle:
        for raw_line in handle:
            fields = raw_line.strip().split('\t\t')
            if len(fields) != 2:
                continue  # not an "<id>\t\t<text>" line
            query_id, query_text = fields
            queries[query_id] = preprocess_mars_text(query_text)
    return queries

# Collection reader: lower-casing is the only preprocessing applied to the English text.
def read_collection(file_path):
    """Load the document collection into a DataFrame.

    The file is parsed with the Python engine using a double tab as the
    separator and no header row; the two columns are named 'doc_id' and 'text'.

    Args:
        file_path: Path of the collection file.

    Returns:
        DataFrame with columns ['doc_id', 'text'], where 'text' is lower-cased.
    """
    collection = pd.read_csv(
        file_path,
        sep='\t\t',
        engine='python',
        header=None,
        names=['doc_id', 'text'],
    )
    collection['text'] = collection['text'].apply(lambda value: value.lower())
    return collection

# Parallel-corpus reader: Mars preprocessing for the Mars side, lower-casing otherwise.
def read_parallel_corpus(file_path, is_mars_lang=False):
    """Read one side of the parallel corpus, one sentence per line.

    The file is decoded as UTF-8; if that raises UnicodeDecodeError it is
    re-read as latin-1. Every line is stripped (blank lines are kept as '').

    Args:
        file_path: Path of the text file.
        is_mars_lang: When true each line goes through `preprocess_mars_text`;
            otherwise each line is lower-cased.

    Returns:
        list of processed lines, in file order.
    """
    def _read_lines(encoding):
        with open(file_path, 'r', encoding=encoding) as handle:
            return [row.strip() for row in handle]

    try:
        rows = _read_lines('utf-8')
    except UnicodeDecodeError:
        rows = _read_lines('latin-1')

    transform = preprocess_mars_text if is_mars_lang else str.lower
    return [transform(row) for row in rows]


# Load every dataset used by the later cells.
queries_mars_dict = read_queries('queries.txt')
eng_collection_df = read_collection('eng_collection.txt')
# Mars side first, then the English side (lower-cased only).
train_mars, train_eng = (
    read_parallel_corpus('unk500.txt', is_mars_lang=True),
    read_parallel_corpus('eng500.txt'),
)

# A single print with a newline separator emits the same four lines as four separate calls.
print(
    f"Jumlah Query Mars: {len(queries_mars_dict)}",
    f"Jumlah Dokumen Inggris: {len(eng_collection_df)}",
    f"Ukuran Data Training Paralel: {len(train_mars)} pasang kalimat",
    "-" * 30,
    sep="\n",
)

Memulai proses Data Preprocessing...
Jumlah Query Mars: 50
Jumlah Dokumen Inggris: 1459
Ukuran Data Training Paralel: 500 pasang kalimat
------------------------------


In [None]:
# --- Part 2: Fine-tuning the Bi-Encoder (LaBSE) ---
# Reuses the model saved under BI_ENCODER_MODEL_PATH when that directory exists;
# otherwise fine-tunes the base LaBSE model on the Mars/English sentence pairs.
print("Memulai proses Fine-tuning Bi-Encoder (LaBSE) untuk pembentukan vektor kata...")

# os.path.isdir() is already False for a missing path, so no separate exists() check is needed.
if os.path.isdir(BI_ENCODER_MODEL_PATH):
    bi_encoder_model = SentenceTransformer(BI_ENCODER_MODEL_PATH, device=DEVICE)
    print("Menggunakan model Bi-Encoder LaBSE yang sudah di-fine-tune dari lokal.")
else:
    print(f"Model Bi-Encoder LaBSE belum ditemukan di '{BI_ENCODER_MODEL_PATH}'. Melakukan fine-tuning sekarang...")

    # Fail fast (before the model is loaded) when the two sides of the parallel corpus do
    # not line up: pairing by position would otherwise raise a bare IndexError or silently
    # ignore the surplus lines of the longer file.
    if len(train_mars) != len(train_eng):
        raise ValueError(
            f"Korpus paralel tidak selaras: {len(train_mars)} kalimat Mars "
            f"vs {len(train_eng)} kalimat Inggris."
        )

    set_seed(42)  # fixed seed for reproducibility
    bi_encoder_model = SentenceTransformer(BI_ENCODER_MODEL_NAME, device=DEVICE)

    # Training pairs for LaBSE: the model learns to map a Mars sentence and its English
    # counterpart into the same vector space.
    train_examples_bi_encoder = [
        InputExample(texts=[mars_sentence, eng_sentence])
        for mars_sentence, eng_sentence in zip(train_mars, train_eng)
    ]

    train_dataloader_bi_encoder = DataLoader(train_examples_bi_encoder, shuffle=True, batch_size=BI_ENCODER_BATCH_SIZE)
    # MultipleNegativesRankingLoss suits siamese-style training such as a Bi-Encoder.
    train_loss_bi_encoder = losses.MultipleNegativesRankingLoss(model=bi_encoder_model)

    # Warm-up covers BI_ENCODER_WARMUP_FRACTION of all optimisation steps (batches x epochs).
    num_training_steps_bi_encoder = len(train_dataloader_bi_encoder) * BI_ENCODER_NUM_EPOCHS
    warmup_steps_bi_encoder = int(num_training_steps_bi_encoder * BI_ENCODER_WARMUP_FRACTION)

    # NOTE(review): the recorded output of this cell shows fit() stopping at an interactive
    # wandb API-key prompt; confirm whether W&B logging is wanted for unattended runs.
    start_time_bi_encoder = time.time()
    bi_encoder_model.fit(train_objectives=[(train_dataloader_bi_encoder, train_loss_bi_encoder)],
                        epochs=BI_ENCODER_NUM_EPOCHS,
                        warmup_steps=warmup_steps_bi_encoder,
                        output_path=BI_ENCODER_MODEL_PATH,
                        optimizer_params={'lr': BI_ENCODER_LEARNING_RATE},  # explicit learning rate
                        show_progress_bar=True)
    end_time_bi_encoder = time.time()
    print(f"Fine-tuning Bi-Encoder selesai dalam {end_time_bi_encoder - start_time_bi_encoder:.2f} detik.")

print("-" * 30)

Memulai proses Fine-tuning Bi-Encoder (LaBSE) untuk pembentukan vektor kata...
Model Bi-Encoder LaBSE belum ditemukan di './fine_tuned_labse_vector_model'. Melakukan fine-tuning sekarang...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/461 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/804 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.88G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/397 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/114 [00:00<?, ?B/s]

2_Dense/pytorch_model.bin:   0%|          | 0.00/2.36M [00:00<?, ?B/s]

2_Dense/model.safetensors:   0%|          | 0.00/2.36M [00:00<?, ?B/s]

Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]



<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mkimnzh[0m ([33mkimnzh-universitas-indonesia[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss


Fine-tuning Bi-Encoder selesai dalam 170.12 detik.
------------------------------


In [None]:
# --- Part 3: Embedding construction & semantic search ---
print("Membuat embedding untuk kueri dan dokumen, lalu melakukan pencarian semantik...")

# Encode the English documents; a larger batch size is used for the document side.
print("\nMembuat embedding untuk koleksi dokumen Inggris...")
corpus_embeddings = bi_encoder_model.encode(
    eng_collection_df['text'].tolist(),
    batch_size=256,
    device=DEVICE,
    convert_to_tensor=True,
    show_progress_bar=True,
)

# Queries are looked up by the generated keys Q0..Q{n-1}, so the embeddings follow that order;
# a key missing from queries_mars_dict raises KeyError here.
query_ids = [f'Q{position}' for position in range(len(queries_mars_dict))]
query_texts = [queries_mars_dict[query_id] for query_id in query_ids]

# Encode the Mars queries; a smaller batch size is used for the query side.
print("\nMembuat embedding untuk kueri Mars...")
query_embeddings = bi_encoder_model.encode(
    query_texts,
    batch_size=32,
    device=DEVICE,
    convert_to_tensor=True,
    show_progress_bar=True,
)

# Semantic search (cosine similarity) of every query against the whole corpus.
# NOTE(review): the stored output of this cell reports 100 candidates, which does not match
# TOP_K_CANDIDATES = 1459 in the config cell; the outputs look stale, re-run to refresh.
search_results = util.semantic_search(query_embeddings, corpus_embeddings, top_k=TOP_K_CANDIDATES)
print(f"Pencarian semantik selesai. Mengambil {TOP_K_CANDIDATES} kandidat teratas per kueri.")
print("-" * 30)

Membuat embedding untuk kueri dan dokumen, lalu melakukan pencarian semantik...

Membuat embedding untuk koleksi dokumen Inggris...


Batches:   0%|          | 0/6 [00:00<?, ?it/s]


Membuat embedding untuk kueri Mars...


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Pencarian semantik selesai. Mengambil 100 kandidat teratas per kueri.
------------------------------


In [None]:
# --- Part 4: Building the submission file ---
# Writes one row per (query, document) pair to submission.csv: retrieved documents keep
# their 1-based retrieval rank, and any document missing from the hits is ranked after
# them in ascending doc_id order.
print("Membangun file submisi...")

submission_data = []
num_queries = len(queries_mars_dict)
num_docs = len(eng_collection_df)

# Document IDs in collection (row) order. 'corpus_id' in each hit is used as a position
# into this order, because the corpus embeddings were encoded from the frame row by row.
corpus_doc_ids = eng_collection_df['doc_id'].tolist()

# Map positional corpus index -> "D<doc_id>" label used in the submission.
doc_id_map = {idx: f"D{doc_id}" for idx, doc_id in enumerate(corpus_doc_ids)}

# Every document label in ascending doc_id order. This is identical for all queries,
# so it is built once here instead of once per query.
all_doc_ids_str_sorted = [f"D{doc_id}" for doc_id in sorted(corpus_doc_ids)]

for query_index, hits in enumerate(search_results):
    query_id = f"Q{query_index}"  # query ID is derived from the position in search_results

    # Ranks 1..len(hits) for the retrieved documents, in the order they were returned.
    rank_map = {doc_id_map[hit['corpus_id']]: rank for rank, hit in enumerate(hits, 1)}

    # Documents that were not retrieved continue the numbering after the retrieved ones.
    next_rank = len(rank_map) + 1
    for doc_id_str in all_doc_ids_str_sorted:
        if doc_id_str not in rank_map:
            rank_map[doc_id_str] = next_rank
            next_rank += 1
        submission_data.append({'que_doc': f"{query_id}-{doc_id_str}", 'rank': rank_map[doc_id_str]})

submission_df = pd.DataFrame(submission_data)

# Guard against stale notebook state (e.g. queries reloaded without re-running the search):
# every query must contribute exactly one row per document.
assert len(submission_df) == num_queries * num_docs, (
    f"Jumlah baris submisi tidak sesuai: diharapkan {num_queries * num_docs}, "
    f"diperoleh {len(submission_df)}."
)

submission_df.to_csv('submission.csv', index=False)

print("\nFile submission.csv berhasil dibuat dengan format yang benar (menggunakan Bi-Encoder Transformer).")
print("Contoh isi file submission:")
print(submission_df.head())
print("...")
print(submission_df.tail())
print(f"\nJumlah baris dalam submission.csv: {len(submission_df)}")
print(f"Setiap query seharusnya memiliki {num_docs} baris.")

Membangun file submisi...

File submission.csv berhasil dibuat dengan format yang benar (menggunakan Bi-Encoder Transformer).
Contoh isi file submission:
  que_doc  rank
0   Q0-D0   101
1   Q0-D1   102
2   Q0-D2   103
3   Q0-D3   104
4   Q0-D4   105
...
         que_doc  rank
72945  Q49-D1454  1456
72946  Q49-D1455  1457
72947  Q49-D1456  1458
72948  Q49-D1457    67
72949  Q49-D1458  1459

Jumlah baris dalam submission.csv: 72950
Setiap query seharusnya memiliki 1459 baris.
