In [None]:
import pandas as pd
import os
import json

# --- KONFIGURASI PATH ---
if 'ROOT_DIR' not in locals():
    ROOT_DIR = os.getcwd()
    if not os.path.exists(os.path.join(ROOT_DIR, 'data')):
        ROOT_DIR = os.path.abspath(os.path.join(os.getcwd(), '..'))

OUTPUT_PATH = os.path.join(ROOT_DIR, 'data', 'processed', 'robustness_queries.csv')
os.makedirs(os.path.dirname(OUTPUT_PATH), exist_ok=True)

# --- 1. DEFINISI PASANGAN QUERY & GROUND TRUTH ---
# Struktur: [Query Asli, Query Variasi/Sinonim, Kategori, [Lokasi Ayats Relevan]]
robustness_data = [
    # Fiqh/Hukum (Law) - [1-10]
    [1, "Hukum warisan bagi perempuan", "Pembagian harta pusaka istri", "Fiqh", ["QS. An-NisƒÅ' : 11"]],
    [2, "Kapan puasa Ramadan dimulai", "Kewajiban saum di bulan suci", "Fiqh", ["QS. Al-Baqarah : 183", "QS. Al-Baqarah : 185"]],
    [3, "Cara melaksanakan sholat Jumat", "Ketentuan sembahyang Jumat", "Fiqh", ["QS. Al-Jumu'ah : 9"]],
    [4, "Zakat hasil bumi", "Kewajiban sedekah pertanian", "Fiqh", ["QS. Al-An'ƒÅm : 141"]],
    [5, "Denda bagi yang bersumpah palsu", "Konsekuensi sumpah dusta", "Fiqh", ["QS. Al-MƒÅ'idah : 89"]],
    [6, "Berwudu sebelum salat", "Tata cara bersuci sebelum ibadah", "Fiqh", ["QS. Al-MƒÅ'idah : 6"]],
    [7, "Pernikahan beda agama", "Hukum perkawinan non-Muslim", "Fiqh", ["QS. Al-Baqarah : 221"]],
    [8, "Larangan memakan riba", "Haramnya pinjaman berbunga", "Fiqh", ["QS. Al-Baqarah : 275"]],
    [9, "Membayar fidyah karena tidak puasa", "Kewajiban ganti rugi puasa", "Fiqh", ["QS. Al-Baqarah : 184"]],
    [10, "Apa itu khamar", "Definisi minuman memabukkan", "Fiqh", ["QS. Al-MƒÅ'idah : 90"]],

    # Kisah/Narasi (Narrative) - [11-20]
    [11, "Kisah Nabi Musa dan Firaun", "Cerita pertemuan Musa dengan raja Mesir", "Kisah", ["QS. Al-Qa·π£a·π£ : 31", "QS. Al-Qa·π£a·π£ : 36"]],
    [12, "Kapal Nabi Nuh", "Perahu raksasa nuh", "Kisah", ["QS. H≈´d : 44"]],
    [13, "Maryam melahirkan Isa", "Kelahiran putra Maryam", "Kisah", ["QS. Maryam : 23", "QS. Maryam : 27"]],
    [14, "Nabi Yusuf dan mimpi 11 bintang", "Tafsir mimpi nabi Yakub tentang bintang", "Kisah", ["QS. Y≈´suf : 4"]],
    [15, "Kisah Ashabul Kahfi", "Tujuh pemuda yang tertidur lama", "Kisah", ["QS. Al-Kahf : 10", "QS. Al-Kahf : 25"]],
    [16, "Kenapa Iblis diusir dari surga", "Alasan setan menolak sujud Adam", "Kisah", ["QS. Al-A‚ÄòrƒÅf : 12", "QS. Al-Kahf : 50"]],
    [17, "Tugas malaikat Jibril", "Fungsi Gabriel membawa wahyu", "Kisah", ["QS. Al-Baqarah : 97"]],
    [18, "Kisah Qabil dan Habil", "Pembunuhan putra Adam", "Kisah", ["QS. Al-MƒÅ'idah : 27"]],
    [19, "Raja Thalut dan Jalut", "Pertempuran Daud melawan Goliat", "Kisah", ["QS. Al-Baqarah : 249", "QS. Al-Baqarah : 251"]],
    [20, "Bangsa Ya'juj dan Ma'juj", "Siapa Gog dan Magog", "Kisah", ["QS. Al-Kahf : 94"]],

    # Aqidah/Akhlak (Theology/Ethics) - [21-30]
    [21, "Larangan berbuat syirik", "Dosa menyekutukan Allah", "Aqidah", ["QS. An-NisƒÅ' : 48", "QS. LuqmƒÅn : 13"]],
    [22, "Berbakti pada kedua orang tua", "Kewajiban menghormati ayah ibu", "Aqidah", ["QS. Al-IsrƒÅ' : 23"]],
    [23, "Definisi tauhid", "Konsep keesaan Tuhan", "Aqidah", ["QS. Al-IkhlƒÅ·π£ : 1"]],
    [24, "Takdir baik dan buruk", "Ketentuan nasib yang ditetapkan Allah", "Aqidah", ["QS. Al-Qamar : 49"]],
    [25, "Larangan berbuat dusta", "Hukum berkata bohong", "Aqidah", ["QS. At-Taubah : 119"]],
    [26, "Tentang Hari Kiamat", "Deskripsi Hari Pembalasan", "Aqidah", ["QS. Al-QƒÅri'ah : 1", "QS. Al-Zalzalah : 1"]],
    [27, "Balasan bagi orang yang sombong", "Konsekuensi sifat takabur", "Aqidah", ["QS. LuqmƒÅn : 18"]],
    [28, "Larangan mengumpat", "Hukum ghibah dan mencela", "Aqidah", ["QS. Al-·∏§ujurƒÅt : 12"]],
    [29, "Keutamaan sabar", "Pentingnya menahan diri", "Aqidah", ["QS. Al-Baqarah : 153"]],
    [30, "Tujuan hidup manusia", "Mengapa kita diciptakan", "Aqidah", ["QS. Adz-DzƒÅriyƒÅt : 56"]],
]

# 2. KONVERSI KE DATAFRAME
df_robustness = pd.DataFrame(robustness_data, columns=[
    'id', 'query_a', 'query_b', 'category', 'target_ayats'
])

# 3. DUPLIKASI BARIS: Buat setiap Query A dan Query B menjadi baris terpisah
# Ini penting agar kita bisa menghitung metrik untuk Q_A dan Q_B secara independen
rows_a = df_robustness.rename(columns={'query_a': 'query'})[['id', 'query', 'category', 'target_ayats']]
rows_b = df_robustness.rename(columns={'query_b': 'query'})[['id', 'query', 'category', 'target_ayats']]

df_test_set = pd.concat([rows_a, rows_b], ignore_index=True)

# 4. SIMPAN KE CSV
# Kita simpan kolom 'target_ayats' sebagai string JSON agar mudah dibaca nanti
df_test_set['target_ayats'] = df_test_set['target_ayats'].apply(lambda x: json.dumps(x))

df_test_set.to_csv(OUTPUT_PATH, index=False)

print("\n‚úÖ FILE TEST SET ROBUSTNESS BERHASIL DIBUAT!")
print(f"   Tersimpan di: {OUTPUT_PATH}")
print(f"   Total Query Uji: {len(df_test_set)} baris ({len(df_test_set)/2} pasangan).")

print("\nSTRUKTUR DATA (Preview):")
print(df_test_set.head(6))


‚úÖ FILE TEST SET ROBUSTNESS BERHASIL DIBUAT!
   Tersimpan di: c:\Kuliah ITS Farhan\Semester 3\A_Final_Project\ML_DM\fp-quran-ir-query-tafsir\data\processed\robustness_queries.csv
   Total Query Uji: 60 baris (30.0 pasangan).

STRUKTUR DATA (Preview):
   id                            query category  \
0   1     Hukum warisan bagi perempuan     Fiqh   
1   2      Kapan puasa Ramadan dimulai     Fiqh   
2   3   Cara melaksanakan sholat Jumat     Fiqh   
3   4                 Zakat hasil bumi     Fiqh   
4   5  Denda bagi yang bersumpah palsu     Fiqh   
5   6            Berwudu sebelum salat     Fiqh   

                                       target_ayats  
0                        ["QS. An-Nis\u0101' : 11"]  
1  ["QS. Al-Baqarah : 183", "QS. Al-Baqarah : 185"]  
2                            ["QS. Al-Jumu'ah : 9"]  
3                       ["QS. Al-An'\u0101m : 141"]  
4                      ["QS. Al-M\u0101'idah : 89"]  
5                       ["QS. Al-M\u0101'idah : 6"]  


In [2]:
import pandas as pd
import os

# --- KONFIGURASI PATH ---
if 'ROOT_DIR' not in locals():
    ROOT_DIR = os.getcwd()
    if not os.path.exists(os.path.join(ROOT_DIR, 'data')):
        ROOT_DIR = os.path.abspath(os.path.join(os.getcwd(), '..'))

DATA_PATH = os.path.join(ROOT_DIR, 'data', 'processed', 'dataset_training_FULL_COMPLETE.csv')

print("‚öôÔ∏è MEMBACA DATA UNTUK VERIFIKASI FORMAT...")

if not os.path.exists(DATA_PATH):
    print(f"‚ùå File {DATA_PATH} tidak ditemukan!")
else:
    try:
        df = pd.read_csv(DATA_PATH)
        df.columns = df.columns.str.strip().str.lower()
        
        if 'ayat_asal' in df.columns:
            # Ambil 5 contoh unik
            sample = df['ayat_asal'].astype(str).unique()[:5]
            
            print("\n‚úÖ FORMAT PENULISAN SURAT YANG TERDETEKSI (Kolom 'ayat_asal'):")
            
            for i, s in enumerate(sample):
                # Gunakan repr() untuk melihat karakter tersembunyi/kutipan
                print(f"   Contoh {i+1}: {repr(s)}") 
            
            print("\nKUNCI PENCARI LOKASI ADALAH: String di atas.")
            print("Pastikan format ini sama persis dengan yang Anda gunakan untuk membangun gudang metadata!")
            
        else:
            print("‚ö†Ô∏è Kolom 'ayat_asal' tidak ditemukan di CSV.")
            
    except Exception as e:
        print(f"Error saat membaca CSV: {e}")

‚öôÔ∏è MEMBACA DATA UNTUK VERIFIKASI FORMAT...

‚úÖ FORMAT PENULISAN SURAT YANG TERDETEKSI (Kolom 'ayat_asal'):
   Contoh 1: 'QS. Al-Qamar : Ayat 46'
   Contoh 2: 'QS. Ar-Ra·∏•mƒÅn : Ayat 6'
   Contoh 3: "QS. Al-AnbiyƒÅ'  : Ayat 39"
   Contoh 4: 'QS. Asy-Sy≈´rƒÅ : Ayat 52'
   Contoh 5: "QS. Asy-Syu‚ÄòarƒÅ' : Ayat 202"

KUNCI PENCARI LOKASI ADALAH: String di atas.
Pastikan format ini sama persis dengan yang Anda gunakan untuk membangun gudang metadata!


In [4]:
import pandas as pd
import os
import re

# --- KONFIGURASI PATH ---
if 'ROOT_DIR' not in locals():
    ROOT_DIR = os.getcwd()
    if not os.path.exists(os.path.join(ROOT_DIR, 'data')):
        ROOT_DIR = os.path.abspath(os.path.join(os.getcwd(), '..'))

DATA_DIR = os.path.join(ROOT_DIR, 'data', 'processed')

# Input Files
FULL_COMPLETE_PATH = os.path.join(DATA_DIR, 'dataset_training_FULL_COMPLETE.csv')
TAFSIR_CLEAN_PATH = os.path.join(DATA_DIR, 'tafsir_clean.csv')

# Output File Master Baru
OUTPUT_MASTER_PATH = os.path.join(DATA_DIR, 'dataset_training_ID_BASED_MASTER.csv')

print("üöÄ MEMULAI PROSES PENJAHITAN ID NUMERIK (FIXED)...")

# 1. LOAD DATA
print("   -> Membaca Data Lengkap (Sumber Fitur & Teks)...")
df_master = pd.read_csv(FULL_COMPLETE_PATH)
df_master.columns = df_master.columns.str.strip().str.lower()

print("   -> Membaca Tafsir Clean (Sumber ID Surah)...")
df_clean = pd.read_csv(TAFSIR_CLEAN_PATH)
df_clean.columns = df_clean.columns.str.strip().str.lower()

# 2. BUAT JEMBATAN (PARSING AYAT ASAL)
# Mengekstrak Nama Surah dan Nomor Ayat dari string 'ayat_asal' di df_master
def parse_location(location_str):
    if not isinstance(location_str, str) or "QS." not in location_str:
        return None, None
    
    # Mencari Nama Surah
    match_surah = re.search(r'QS\. (.+?)\s+:', location_str)
    surah_name = match_surah.group(1).strip() if match_surah else None
    
    # Mencari Nomor Ayat
    match_ayah = re.search(r'Ayat\s+(\d+)', location_str)
    ayah_num = int(match_ayah.group(1)) if match_ayah else None
    
    return surah_name, ayah_num

# Terapkan Parsing
df_master[['surah_nama_temp', 'ayah_id']] = df_master['ayat_asal'].apply(
    lambda x: pd.Series(parse_location(x))
)

print("   -> Parsing Lokasi & Ayah ID Selesai.")

# 3. BUAT KAMUS ID SURAH (DERIVASI 1-114)
print("   -> Membuat Kamus ID Numerik Surah (1-114)...")

# Dapatkan daftar unik nama surah dari df_clean
unique_surahs = df_clean['surah'].str.strip().unique()

# Hapus nilai NaN atau string kosong dari daftar unique_surahs sebelum sorting
unique_surahs = [s for s in unique_surahs if isinstance(s, str) and s.strip() != '']

# Urutkan berdasarkan Nama Surah (Asumsi ini mengikuti urutan kronologis Al-Qur'an)
# Sorting menggunakan nama string adalah heuristik terbaik tanpa list Surah eksternal.
sorted_surahs = sorted(unique_surahs) 

# Buat map {Nama Surah: ID Numerik}
surah_id_map = {name: i + 1 for i, name in enumerate(sorted_surahs)}

# 4. TRANSFER ID NUMERIK
print("   -> Mentransfer Surah ID Numerik ke Data Master...")

# Ambil ID Surah dari kamus
df_master['surah_id'] = df_master['surah_nama_temp'].str.strip().map(surah_id_map)

# 5. BERSIHKAN & SIMPAN FILE BARU
# Hapus kolom sementara
df_master = df_master.drop(columns=['surah_nama_temp'], errors='ignore')

# Ambil kolom ID baru
df_master = df_master[[c for c in df_master.columns if c not in ['surah_id', 'ayah_id']] + ['surah_id', 'ayah_id']]

# Drop baris yang gagal di-parse
df_master = df_master.dropna(subset=['surah_id', 'ayah_id'])

# Pastikan ID adalah integer
df_master['surah_id'] = df_master['surah_id'].astype(int)
df_master['ayah_id'] = df_master['ayah_id'].astype(int)


df_master.to_csv(OUTPUT_MASTER_PATH, index=False)

print(f"\n‚úÖ FILE MASTER ID-BASED BERHASIL DIBUAT!")
print(f"   Tersimpan di: {OUTPUT_MASTER_PATH}")
print(f"   Total baris: {len(df_master)}")

print("\nüîç PREVIEW KOLOM ID BARU (Wajib Numerik):")
print(df_master[['ayat_asal', 'surah_id', 'ayah_id']].head(5))

üöÄ MEMULAI PROSES PENJAHITAN ID NUMERIK (FIXED)...
   -> Membaca Data Lengkap (Sumber Fitur & Teks)...
   -> Membaca Tafsir Clean (Sumber ID Surah)...
   -> Parsing Lokasi & Ayah ID Selesai.
   -> Membuat Kamus ID Numerik Surah (1-114)...
   -> Mentransfer Surah ID Numerik ke Data Master...

‚úÖ FILE MASTER ID-BASED BERHASIL DIBUAT!
   Tersimpan di: c:\Kuliah ITS Farhan\Semester 3\A_Final_Project\ML_DM\fp-quran-ir-query-tafsir\data\processed\dataset_training_ID_BASED_MASTER.csv
   Total baris: 170372

üîç PREVIEW KOLOM ID BARU (Wajib Numerik):
                     ayat_asal  surah_id  ayah_id
0       QS. Al-Qamar : Ayat 46        48       46
1       QS. Ar-Ra·∏•mƒÅn : Ayat 6        72        6
2    QS. Al-AnbiyƒÅ'  : Ayat 39         2       39
3      QS. Asy-Sy≈´rƒÅ : Ayat 52        79       52
4  QS. Asy-Syu‚ÄòarƒÅ' : Ayat 202        78      202


In [6]:
import pandas as pd
import os
import re

# --- KONFIGURASI PATH ---
if 'ROOT_DIR' not in locals():
    ROOT_DIR = os.getcwd()
    if not os.path.exists(os.path.join(ROOT_DIR, 'data')):
        ROOT_DIR = os.path.abspath(os.path.join(os.getcwd(), '..'))

DATA_DIR = os.path.join(ROOT_DIR, 'data', 'processed')
MASTER_PATH = os.path.join(DATA_DIR, 'dataset_training_ID_BASED_MASTER.csv') # File output kita

print("‚öôÔ∏è MEMUAT DATA MASTER UNTUK VERIFIKASI ID...")

if not os.path.exists(MASTER_PATH):
    raise FileNotFoundError(f"‚ùå File Master ID-Based tidak ditemukan di: {MASTER_PATH}")

# 1. Load File Master
df_master = pd.read_csv(MASTER_PATH)
df_master.columns = df_master.columns.str.strip().str.lower()

# 2. Fungsi Ekstraksi Nama Surah dari String Ayat Asal
def extract_surah_name(location_str):
    if not isinstance(location_str, str) or "QS." not in location_str:
        return "Unknown/None"
    
    # Regex untuk mengambil teks setelah "QS. " dan sebelum " :"
    match = re.search(r'QS\. (.+?)\s+:', location_str)
    return match.group(1).strip() if match else "Unknown/None"

# 3. Ekstrak Nama Surah & Grouping
df_master['surah_name'] = df_master['ayat_asal'].apply(extract_surah_name)

# 4. Buat Tabel Verifikasi (Hanya Surah ID dan Nama Unik)
surah_map_check = df_master[['surah_id', 'surah_name']].drop_duplicates().sort_values('surah_id')

# 4. Buat Tabel Verifikasi (Hanya Surah ID dan Nama Unik)
surah_map_check = df_master[['surah_id', 'surah_name']].drop_duplicates().sort_values('surah_id')

print("\n‚úÖ HASIL PEMETAAN ID NUMERIK (VERIFIKASI)")
print("=========================================")
print(f"Total Surah yang Terdeteksi: {len(surah_map_check)}")

# PERBAIKAN: Mengganti to_markdown() dengan to_string() untuk menghindari error 'tabulate'
print(surah_map_check.to_string(index=False))

# Pengecekan Kualitas 
if surah_map_check['surah_id'].nunique() != surah_map_check['surah_name'].nunique():
    print("‚ö†Ô∏è PERINGATAN: Ada potensi masalah dalam pemetaan ID/Nama Surah.")

‚öôÔ∏è MEMUAT DATA MASTER UNTUK VERIFIKASI ID...

‚úÖ HASIL PEMETAAN ID NUMERIK (VERIFIKASI)
Total Surah yang Terdeteksi: 114
 surah_id    surah_name
        1     Ad-DukhƒÅn
        2    Al-AnbiyƒÅ'
        3      Al-AnfƒÅl
        4      Al-An‚ÄòƒÅm
        5      Al-A·∏•qƒÅf
        6      Al-A·∏•zƒÅb
        7       Al-A‚ÄòlƒÅ
        8      Al-A‚ÄòrƒÅf
        9      Al-Balad
       10    Al-Baqarah
       11   Al-Bayyinah
       12      Al-Bur≈´j
       13       Al-Fajr
       14      Al-Falaq
       15       Al-Fat·∏•
       16     Al-FurqƒÅn
       17    Al-FƒÅti·∏•ah
       18        Al-Fƒ´l
       19   Al-GƒÅsyiyah
       20    Al-Humazah
       21     Al-IkhlƒÅ·π£
       22    Al-Infi·π≠ƒÅr
       23   Al-InsyiqƒÅq
       24      Al-InsƒÅn
       25      Al-IsrƒÅ'
       26       Al-Jinn
       27    Al-Jumu‚Äòah
       28    Al-JƒÅ·π°iyah
       29       Al-Kahf
       30     Al-Kau·π°ar
       31    Al-KƒÅfir≈´n
       32      Al-Lahab
       33       Al-Lail
       34    

In [7]:
import pandas as pd
import os
import re

# --- KONFIGURASI PATH ---
if 'ROOT_DIR' not in locals():
    ROOT_DIR = os.getcwd()
    if not os.path.exists(os.path.join(ROOT_DIR, 'data')):
        ROOT_DIR = os.path.abspath(os.path.join(os.getcwd(), '..'))

DATA_DIR = os.path.join(ROOT_DIR, 'data', 'processed')

# Input Files
FULL_COMPLETE_PATH = os.path.join(DATA_DIR, 'dataset_training_FULL_COMPLETE.csv')
TAFSIR_CLEAN_PATH = os.path.join(DATA_DIR, 'tafsir_clean.csv')

# Output File Master Baru
OUTPUT_MASTER_PATH = os.path.join(DATA_DIR, 'dataset_training_ID_BASED_MASTER.csv')

print("üöÄ MEMULAI PROSES PENJAHITAN ID NUMERIK (FINAL FIX)...")

# 1. LOAD DATA
print("   -> Membaca Data Lengkap (Sumber Fitur & Teks)...")
df_master = pd.read_csv(FULL_COMPLETE_PATH)
df_master.columns = df_master.columns.str.strip().str.lower()

print("   -> Membaca Tafsir Clean (Sumber ID Surah)...")
df_clean = pd.read_csv(TAFSIR_CLEAN_PATH)
df_clean.columns = df_clean.columns.str.strip().str.lower()

# 2. BUAT JEMBATAN (PARSING AYAT ASAL)
# Mengekstrak Nama Surah dan Nomor Ayat dari string 'ayat_asal' di df_master
def parse_location(location_str):
    if not isinstance(location_str, str) or "QS." not in location_str:
        return None, None
    
    match_surah = re.search(r'QS\. (.+?)\s+:', location_str)
    surah_name = match_surah.group(1).strip() if match_surah else None
    
    match_ayah = re.search(r'Ayat\s+(\d+)', location_str)
    ayah_num = int(match_ayah.group(1)) if match_ayah else None
    
    return surah_name, ayah_num

# Terapkan Parsing
df_master[['surah_nama_temp', 'ayah_id']] = df_master['ayat_asal'].apply(
    lambda x: pd.Series(parse_location(x))
)

print("   -> Parsing Lokasi & Ayah ID Selesai.")

# 3. BUAT KAMUS ID SURAH (DARI DAFTAR KANONIK)
print("   -> Membuat Kamus ID Numerik Surah (1-114) DENGAN URUTAN YANG BENAR...")

# Daftar 114 Surah Al-Qur'an (Kanonik, sesuai urutan 1-114)
# INI ADALAH FIX UNTUK MENGHINDARI SORTING ALFABETIK YANG SALAH
canonical_surahs = [
    'Al-FƒÅti·∏•ah', 'Al-Baqarah', 'ƒÄli ‚ÄòImrƒÅn', 'An-NisƒÅ\'', 'Al-MƒÅ\'idah', 'Al-An\'ƒÅm', 'Al-A‚ÄòrƒÅf', 'Al-AnfƒÅl', 
    'At-Taubah', 'Y≈´nus', 'H≈´d', 'Y≈´suf', 'Ar-Ra‚Äòd', 'IbrƒÅhƒ´m', 'Al-·∏§ijr', 'An-Na·∏•l', 'Al-IsrƒÅ\'', 'Al-Kahf', 
    'Maryam', '·π¨ƒÅhƒÅ', 'Al-AnbiyƒÅ\'', 'Al-·∏§aj', 'Al-Mu\'min≈´n', 'An-N≈´r', 'Al-FurqƒÅn', 'Asy-Syu‚ÄòarƒÅ\'', 
    'An-Naml', 'Al-Qa·π£a·π£', 'Al-‚ÄòAnkab≈´t', 'Ar-R≈´m', 'LuqmƒÅn', 'As-Sajdah', 'Al-A·∏•zƒÅb', 'SabƒÅ\'', 'FƒÅ·π≠ir', 
    'YƒÅsƒ´n', 'A·π£-·π¢ƒÅffƒÅt', '·π¢ƒÅd', 'Az-Zumar', 'GƒÅfir', 'Fu·π£·π£ilat', 'Asy-Sy≈´rƒÅ', 'Az-Zukhruf', 'Ad-DukhƒÅn', 
    'Al-JƒÅ·π°iyah', 'Al-A·∏•qƒÅf', 'Mu·∏•ammad', 'Al-Fat·∏•', 'Al-·∏§ujurƒÅt', 'QƒÅf', 'Adz-DzƒÅriyƒÅt', 'A·π≠-·π¨≈´r', 'An-Najm', 
    'Al-Qamar', 'Ar-Ra·∏•mƒÅn', 'Al-WƒÅqi‚Äòah', 'Al-·∏§adƒ´d', 'Al-MujƒÅdilah', 'Al-·∏§asyr', 'Al-Mumta·∏•anah', 'A·π£-·π¢aff', 
    'Al-Jumu‚Äòah', 'Al-MunƒÅfiq≈´n', 'At-TagƒÅbun', 'A·π≠-·π¨alƒÅq', 'At-Ta·∏•rƒ´m', 'Al-Mulk', 'Al-Qalam', 'Al-·∏§ƒÅqqah', 
    'Al-Ma‚ÄòƒÅrij', 'N≈´·∏•', 'Al-Jinn', 'Al-Muzzammil', 'Al-Muddasir', 'Al-QiyƒÅmah', 'Al-InsƒÅn', 'Al-MursalƒÅt', 
    'An-Naba\'', 'An-NƒÅzi‚ÄòƒÅt', '‚ÄòAbasa', 'At-Takwƒ´r', 'Al-Infi·π≠ƒÅr', 'Al-Mu·π≠affifƒ´n', 'Al-InsyiqƒÅq', 
    'Al-Bur≈´j', 'A·π≠-·π¨ƒÅriq', 'Al-A‚ÄòlƒÅ', 'Al-GƒÅsyiyah', 'Al-Fajr', 'Al-Balad', 'Asy-Syams', 'Al-Lail', 'A·∏ç-·∏åu·∏•ƒÅ', 
    'Al-InsyirƒÅ·∏•', 'At-Tƒ´n', 'Al-‚ÄòAlaq', 'Al-Qadr', 'Al-Bayyinah', 'Az-Zalzalah', 'Al-‚ÄòƒÄdiyƒÅt', 'Al-QƒÅri‚Äòah', 
    'At-TakƒÅ·π°ur', 'Al-‚ÄòA·π£r', 'Al-Humazah', 'Al-Fƒ´l', 'Quraisy', 'Al-MƒÅ‚Äò≈´n', 'Al-Kausar', 'Al-KƒÅfir≈´n', 
    'An-Na·π£r', 'Al-Lahab', 'Al-IkhlƒÅ·π£', 'Al-Falaq', 'An-NƒÅs'
]

# Buat map {Nama Surah: ID Numerik}
surah_id_map = {name: i + 1 for i, name in enumerate(canonical_surahs)}

# 4. TRANSFER ID NUMERIK
print("   -> Mentransfer Surah ID Numerik ke Data Master...")

# Ambil ID Surah dari kamus
df_master['surah_id'] = df_master['surah_nama_temp'].str.strip().map(surah_id_map)

# 5. BERSIHKAN & SIMPAN FILE BARU
# ... (lanjutan kode Cell 3)
df_master = df_master.drop(columns=['surah_nama_temp'], errors='ignore')
df_master = df_master[[c for c in df_master.columns if c not in ['surah_id', 'ayah_id']] + ['surah_id', 'ayah_id']]
df_master = df_master.dropna(subset=['surah_id', 'ayah_id'])
df_master['surah_id'] = df_master['surah_id'].astype(int)
df_master['ayah_id'] = df_master['ayah_id'].astype(int)

df_master.to_csv(OUTPUT_MASTER_PATH, index=False)

print(f"\n‚úÖ FILE MASTER ID-BASED BERHASIL DIBUAT!")
print(f"   Tersimpan di: {OUTPUT_MASTER_PATH}")
print(f"   Total baris: {len(df_master)}")

print("\nüîç PREVIEW KOLOM ID BARU (Wajib Numerik):")
print(df_master[['ayat_asal', 'surah_id', 'ayah_id']].head(5))

# --- LANGKAH VERIFIKASI (BARU) ---
# Jalankan verifikasi ulang di cell berikutnya

üöÄ MEMULAI PROSES PENJAHITAN ID NUMERIK (FINAL FIX)...
   -> Membaca Data Lengkap (Sumber Fitur & Teks)...
   -> Membaca Tafsir Clean (Sumber ID Surah)...
   -> Parsing Lokasi & Ayah ID Selesai.
   -> Membuat Kamus ID Numerik Surah (1-114) DENGAN URUTAN YANG BENAR...
   -> Mentransfer Surah ID Numerik ke Data Master...

‚úÖ FILE MASTER ID-BASED BERHASIL DIBUAT!
   Tersimpan di: c:\Kuliah ITS Farhan\Semester 3\A_Final_Project\ML_DM\fp-quran-ir-query-tafsir\data\processed\dataset_training_ID_BASED_MASTER.csv
   Total baris: 157579

üîç PREVIEW KOLOM ID BARU (Wajib Numerik):
                     ayat_asal  surah_id  ayah_id
0       QS. Al-Qamar : Ayat 46        54       46
1       QS. Ar-Ra·∏•mƒÅn : Ayat 6        55        6
2    QS. Al-AnbiyƒÅ'  : Ayat 39        21       39
3      QS. Asy-Sy≈´rƒÅ : Ayat 52        42       52
4  QS. Asy-Syu‚ÄòarƒÅ' : Ayat 202        26      202


In [8]:
import pandas as pd
import os
import re

# --- KONFIGURASI PATH ---
if 'ROOT_DIR' not in locals():
    ROOT_DIR = os.getcwd()
    if not os.path.exists(os.path.join(ROOT_DIR, 'data')):
        ROOT_DIR = os.path.abspath(os.path.join(os.getcwd(), '..'))

DATA_DIR = os.path.join(ROOT_DIR, 'data', 'processed')
MASTER_PATH = os.path.join(DATA_DIR, 'dataset_training_ID_BASED_MASTER.csv') # File output kita

print("‚öôÔ∏è MEMUAT DATA MASTER UNTUK VERIFIKASI ID...")

if not os.path.exists(MASTER_PATH):
    raise FileNotFoundError(f"‚ùå File Master ID-Based tidak ditemukan di: {MASTER_PATH}")

# 1. Load File Master
df_master = pd.read_csv(MASTER_PATH)
df_master.columns = df_master.columns.str.strip().str.lower()

# 2. Fungsi Ekstraksi Nama Surah dari String Ayat Asal
def extract_surah_name(location_str):
    if not isinstance(location_str, str) or "QS." not in location_str:
        return "Unknown/None"
    
    # Regex untuk mengambil teks setelah "QS. " dan sebelum " :"
    match = re.search(r'QS\. (.+?)\s+:', location_str)
    return match.group(1).strip() if match else "Unknown/None"

# 3. Ekstrak Nama Surah & Grouping
df_master['surah_name'] = df_master['ayat_asal'].apply(extract_surah_name)

# 4. Buat Tabel Verifikasi (Hanya Surah ID dan Nama Unik)
surah_map_check = df_master[['surah_id', 'surah_name']].drop_duplicates().sort_values('surah_id')

# 4. Buat Tabel Verifikasi (Hanya Surah ID dan Nama Unik)
surah_map_check = df_master[['surah_id', 'surah_name']].drop_duplicates().sort_values('surah_id')

print("\n‚úÖ HASIL PEMETAAN ID NUMERIK (VERIFIKASI)")
print("=========================================")
print(f"Total Surah yang Terdeteksi: {len(surah_map_check)}")

# PERBAIKAN: Mengganti to_markdown() dengan to_string() untuk menghindari error 'tabulate'
print(surah_map_check.to_string(index=False))

# Pengecekan Kualitas 
if surah_map_check['surah_id'].nunique() != surah_map_check['surah_name'].nunique():
    print("‚ö†Ô∏è PERINGATAN: Ada potensi masalah dalam pemetaan ID/Nama Surah.")

‚öôÔ∏è MEMUAT DATA MASTER UNTUK VERIFIKASI ID...

‚úÖ HASIL PEMETAAN ID NUMERIK (VERIFIKASI)
Total Surah yang Terdeteksi: 105
 surah_id    surah_name
        1    Al-FƒÅti·∏•ah
        2    Al-Baqarah
        3    ƒÄli ‚ÄòImrƒÅn
        4      An-NisƒÅ'
        5    Al-MƒÅ'idah
        7      Al-A‚ÄòrƒÅf
        8      Al-AnfƒÅl
        9     At-Taubah
       10         Y≈´nus
       11           H≈´d
       12         Y≈´suf
       13       Ar-Ra‚Äòd
       14       IbrƒÅhƒ´m
       15       Al-·∏§ijr
       16       An-Na·∏•l
       17      Al-IsrƒÅ'
       18       Al-Kahf
       19        Maryam
       20          ·π¨ƒÅhƒÅ
       21    Al-AnbiyƒÅ'
       23   Al-Mu'min≈´n
       24        An-N≈´r
       25     Al-FurqƒÅn
       26  Asy-Syu‚ÄòarƒÅ'
       27       An-Naml
       28      Al-Qa·π£a·π£
       29   Al-‚ÄòAnkab≈´t
       30        Ar-R≈´m
       31        LuqmƒÅn
       32     As-Sajdah
       33      Al-A·∏•zƒÅb
       35         FƒÅ·π≠ir
       36         YƒÅsƒ´n
     

In [9]:
import pandas as pd
import os
import json
import re # Diperlukan untuk parsing nama surah dan ayat

# --- KONFIGURASI PATH ---
if 'ROOT_DIR' not in locals():
    ROOT_DIR = os.getcwd()
    if not os.path.exists(os.path.join(ROOT_DIR, 'data')):
        # Asumsi: jika tidak ada folder data, kita naik satu level
        ROOT_DIR = os.path.abspath(os.path.join(os.getcwd(), '..'))

OUTPUT_PATH = os.path.join(ROOT_DIR, 'data', 'processed', 'robustness_queries.csv')
os.makedirs(os.path.dirname(OUTPUT_PATH), exist_ok=True)

# --- FUNGSI UTILITY: MENDAPATKAN ID SURAH DAN AYAT ---

# Mapping sebagian Surah yang digunakan ke ID numeriknya
# Jika Anda membutuhkan 114 Surah, gunakan file mapping lengkap (JSON/CSV)
SURAH_MAPPING = {
    "Al-Baqarah": 2, "ƒÄli 'ImrƒÅn": 3, "An-NisƒÅ'": 4, "Al-MƒÅ'idah": 5, 
    "Al-An'ƒÅm": 6, "Al-A‚ÄòrƒÅf": 7, "At-Taubah": 9, "H≈´d": 11, 
    "Y≈´suf": 12, "Ar-Ra‚Äòd": 13, "Al-IsrƒÅ'": 17, "Al-Kahf": 18, 
    "Maryam": 19, "Al-AnbiyƒÅ'": 21, "Al-Qa·π£a·π£": 28, "Al-‚ÄòAnkab≈´t": 29,
    "LuqmƒÅn": 31, "Al-A·∏•zƒÅb": 33, "YƒÅ-Sƒ´n": 36, "A·∫ì-·∫íƒÅriyƒÅt": 51,
    "Al-Qamar": 54, "Al-·∏§ujurƒÅt": 49, "Adz-DzƒÅriyƒÅt": 51, "Al-Jumu'ah": 62,
    "At-TaghƒÅbun": 64, "Al-Qalam": 68, "Al-Ma‚ÄòƒÅrij": 70, "Al-QiyƒÅmah": 75,
    "Al-IkhlƒÅ·π£": 112, "Al-Zalzalah": 99, "Al-QƒÅri'ah": 101
    # Tambahkan surah lain sesuai kebutuhan, saat ini hanya yang terpakai di robustness_data
}

def get_surah_ayat_ids(target_ayats_list):
    """
    Mengambil list lokasi ayat (misal: ["QS. An-NisƒÅ' : 11"]) 
    dan mengembalikannya sebagai list ID numerik [[Surah ID, Ayat ID]].
    """
    result_ids = []
    
    # Pattern untuk mengekstrak Nama Surah dan Nomor Ayat
    # Contoh: "QS. Al-Baqarah : 183" -> Group 1: "Al-Baqarah", Group 2: "183"
    pattern = re.compile(r"QS\. (.*?)\s*:\s*(\d+)", re.IGNORECASE)

    for ayat_str in target_ayats_list:
        match = pattern.search(ayat_str)
        if match:
            surah_name = match.group(1).strip()
            ayat_num = int(match.group(2))
            
            # Cari ID Surah, default ke -1 jika tidak ditemukan
            surah_id = SURAH_MAPPING.get(surah_name, -1)
            
            if surah_id != -1:
                result_ids.append([surah_id, ayat_num])
            else:
                # Peringatan jika Surah tidak ada di mapping
                # Ini adalah BLIND SPOT yang Anda abaikan: kelengkapan mapping!
                # print(f"‚ö†Ô∏è Peringatan: Surah '{surah_name}' tidak ditemukan di SURAH_MAPPING.")
                result_ids.append([-1, ayat_num]) # Tambahkan placeholder ID
        
        else:
            # print(f"‚ö†Ô∏è Peringatan: Format ayat '{ayat_str}' tidak dikenali.")
            pass

    return result_ids

# --- 1. DEFINISI PASANGAN QUERY & GROUND TRUTH ---
# Struktur: [Query Asli, Query Variasi/Sinonim, Kategori, [Lokasi Ayats Relevan]]
# Struktur: [ID, Query A, Query B, Kategori, [Lokasi Ayats (Teks)]]
robustness_data = [
    # Fiqh/Hukum (Law) - [1-10]
    [1, "Hukum warisan bagi perempuan", "Pembagian harta pusaka istri", "Fiqh", ["QS. An-NisƒÅ' : 11"]],
    [2, "Kapan puasa Ramadan dimulai", "Kewajiban saum di bulan suci", "Fiqh", ["QS. Al-Baqarah : 183", "QS. Al-Baqarah : 185"]],
    [3, "Cara melaksanakan sholat Jumat", "Ketentuan sembahyang Jumat", "Fiqh", ["QS. Al-Jumu'ah : 9"]],
    [4, "Zakat hasil bumi", "Kewajiban sedekah pertanian", "Fiqh", ["QS. Al-An'ƒÅm : 141"]],
    [5, "Denda bagi yang bersumpah palsu", "Konsekuensi sumpah dusta", "Fiqh", ["QS. Al-MƒÅ'idah : 89"]],
    [6, "Berwudu sebelum salat", "Tata cara bersuci sebelum ibadah", "Fiqh", ["QS. Al-MƒÅ'idah : 6"]],
    [7, "Pernikahan beda agama", "Hukum perkawinan non-Muslim", "Fiqh", ["QS. Al-Baqarah : 221"]],
    [8, "Larangan memakan riba", "Haramnya pinjaman berbunga", "Fiqh", ["QS. Al-Baqarah : 275"]],
    [9, "Membayar fidyah karena tidak puasa", "Kewajiban ganti rugi puasa", "Fiqh", ["QS. Al-Baqarah : 184"]],
    [10, "Apa itu khamar", "Definisi minuman memabukkan", "Fiqh", ["QS. Al-MƒÅ'idah : 90"]],

    # Kisah/Narasi (Narrative) - [11-20]
    [11, "Kisah Nabi Musa dan Firaun", "Cerita pertemuan Musa dengan raja Mesir", "Kisah", ["QS. Al-Qa·π£a·π£ : 31", "QS. Al-Qa·π£a·π£ : 36"]],
    [12, "Kapal Nabi Nuh", "Perahu raksasa nuh", "Kisah", ["QS. H≈´d : 44"]],
    [13, "Maryam melahirkan Isa", "Kelahiran putra Maryam", "Kisah", ["QS. Maryam : 23", "QS. Maryam : 27"]],
    [14, "Nabi Yusuf dan mimpi 11 bintang", "Tafsir mimpi nabi Yakub tentang bintang", "Kisah", ["QS. Y≈´suf : 4"]],
    [15, "Kisah Ashabul Kahfi", "Tujuh pemuda yang tertidur lama", "Kisah", ["QS. Al-Kahf : 10", "QS. Al-Kahf : 25"]],
    [16, "Kenapa Iblis diusir dari surga", "Alasan setan menolak sujud Adam", "Kisah", ["QS. Al-A‚ÄòrƒÅf : 12", "QS. Al-Kahf : 50"]],
    [17, "Tugas malaikat Jibril", "Fungsi Gabriel membawa wahyu", "Kisah", ["QS. Al-Baqarah : 97"]],
    [18, "Kisah Qabil dan Habil", "Pembunuhan putra Adam", "Kisah", ["QS. Al-MƒÅ'idah : 27"]],
    [19, "Raja Thalut dan Jalut", "Pertempuran Daud melawan Goliat", "Kisah", ["QS. Al-Baqarah : 249", "QS. Al-Baqarah : 251"]],
    [20, "Bangsa Ya'juj dan Ma'juj", "Siapa Gog dan Magog", "Kisah", ["QS. Al-Kahf : 94"]],

    # Aqidah/Akhlak (Theology/Ethics) - [21-30]
    [21, "Larangan berbuat syirik", "Dosa menyekutukan Allah", "Aqidah", ["QS. An-NisƒÅ' : 48", "QS. LuqmƒÅn : 13"]],
    [22, "Berbakti pada kedua orang tua", "Kewajiban menghormati ayah ibu", "Aqidah", ["QS. Al-IsrƒÅ' : 23"]],
    [23, "Definisi tauhid", "Konsep keesaan Tuhan", "Aqidah", ["QS. Al-IkhlƒÅ·π£ : 1"]],
    [24, "Takdir baik dan buruk", "Ketentuan nasib yang ditetapkan Allah", "Aqidah", ["QS. Al-Qamar : 49"]],
    [25, "Larangan berbuat dusta", "Hukum berkata bohong", "Aqidah", ["QS. At-Taubah : 119"]],
    [26, "Tentang Hari Kiamat", "Deskripsi Hari Pembalasan", "Aqidah", ["QS. Al-QƒÅri'ah : 1", "QS. Al-Zalzalah : 1"]],
    [27, "Balasan bagi orang yang sombong", "Konsekuensi sifat takabur", "Aqidah", ["QS. LuqmƒÅn : 18"]],
    [28, "Larangan mengumpat", "Hukum ghibah dan mencela", "Aqidah", ["QS. Al-·∏§ujurƒÅt : 12"]],
    [29, "Keutamaan sabar", "Pentingnya menahan diri", "Aqidah", ["QS. Al-Baqarah : 153"]],
    [30, "Tujuan hidup manusia", "Mengapa kita diciptakan", "Aqidah", ["QS. Adz-DzƒÅriyƒÅt : 56"]],
]

# 1.5. TAMBAHKAN KOLOM ID NUMERIK
# Lakukan pemrosesan ID numerik di sini sebelum konversi ke DataFrame
data_with_ids = []
for row in robustness_data:
    row_id, q_a, q_b, category, target_ayats_text = row
    
    # Panggil fungsi untuk mendapatkan ID numerik
    target_ayats_id = get_surah_ayat_ids(target_ayats_text)
    
    # Struktur baru: [ID, Query A, Query B, Kategori, [Ayats Teks], [Ayats ID]]
    data_with_ids.append([row_id, q_a, q_b, category, target_ayats_text, target_ayats_id])

# 2. KONVERSI KE DATAFRAME
df_robustness = pd.DataFrame(data_with_ids, columns=[
    'id', 'query_a', 'query_b', 'category', 'target_ayats_text', 'target_ayats_id'
])

# 3. DUPLIKASI BARIS: Buat setiap Query A dan Query B menjadi baris terpisah
# Kolom target_ayats_text diganti namanya agar sesuai dengan nama lama (jika ada script lain yang bergantung padanya)
rows_a = df_robustness.rename(columns={'query_a': 'query', 'target_ayats_text': 'target_ayats'})[['id', 'query', 'category', 'target_ayats', 'target_ayats_id']]
rows_b = df_robustness.rename(columns={'query_b': 'query', 'target_ayats_text': 'target_ayats'})[['id', 'query', 'category', 'target_ayats', 'target_ayats_id']]

df_test_set = pd.concat([rows_a, rows_b], ignore_index=True)

# 4. SIMPAN KE CSV
# Kita simpan kolom list sebagai string JSON
df_test_set['target_ayats'] = df_test_set['target_ayats'].apply(lambda x: json.dumps(x))
df_test_set['target_ayats_id'] = df_test_set['target_ayats_id'].apply(lambda x: json.dumps(x))


df_test_set.to_csv(OUTPUT_PATH, index=False)

print("\n‚úÖ FILE TEST SET ROBUSTNESS BERHASIL DIBUAT (dengan ID Surah & Ayat)!")
print(f" ¬† Tersimpan di: {OUTPUT_PATH}")
print(f" ¬† Total Query Uji: {len(df_test_set)} baris ({len(df_robustness)} pasangan).")

print("\nSTRUKTUR DATA (Preview dengan kolom ID baru):")
# Tampilkan 6 baris pertama dan kolom yang relevan
print(df_test_set[['id', 'query', 'category', 'target_ayats', 'target_ayats_id']].head(6).to_markdown(index=False))


‚úÖ FILE TEST SET ROBUSTNESS BERHASIL DIBUAT (dengan ID Surah & Ayat)!
 ¬† Tersimpan di: c:\Kuliah ITS Farhan\Semester 3\A_Final_Project\ML_DM\fp-quran-ir-query-tafsir\data\processed\robustness_queries.csv
 ¬† Total Query Uji: 60 baris (30 pasangan).

STRUKTUR DATA (Preview dengan kolom ID baru):


ImportError: Missing optional dependency 'tabulate'.  Use pip or conda to install tabulate.

In [10]:
import pandas as pd
import os
import json

# --- KONFIGURASI PATH ---
# Notebook ada di folder notebooks/, jadi naik 1 level ke root
ROOT_DIR = os.path.abspath(os.path.join(os.getcwd(), '..'))

OUTPUT_PATH = os.path.join(ROOT_DIR, 'data', 'processed', 'robustness_queries.csv')
os.makedirs(os.path.dirname(OUTPUT_PATH), exist_ok=True)

# --- 1. DEFINISI PASANGAN QUERY & GROUND TRUTH ---
# Struktur: [Query Asli, Query Variasi/Sinonim, Kategori, [Lokasi Ayats Relevan]]
robustness_data = [
    # Fiqh/Hukum (Law) - [1-10]
    [1, "Hukum warisan bagi perempuan", "Pembagian harta pusaka istri", "Fiqh", ["QS. An-NisƒÅ' : 11"]],
    [2, "Kapan puasa Ramadan dimulai", "Kewajiban saum di bulan suci", "Fiqh", ["QS. Al-Baqarah : 183", "QS. Al-Baqarah : 185"]],
    [3, "Cara melaksanakan sholat Jumat", "Ketentuan sembahyang Jumat", "Fiqh", ["QS. Al-Jumu'ah : 9"]],
    [4, "Zakat hasil bumi", "Kewajiban sedekah pertanian", "Fiqh", ["QS. Al-An'ƒÅm : 141"]],
    [5, "Denda bagi yang bersumpah palsu", "Konsekuensi sumpah dusta", "Fiqh", ["QS. Al-MƒÅ'idah : 89"]],
    [6, "Berwudu sebelum salat", "Tata cara bersuci sebelum ibadah", "Fiqh", ["QS. Al-MƒÅ'idah : 6"]],
    [7, "Pernikahan beda agama", "Hukum perkawinan non-Muslim", "Fiqh", ["QS. Al-Baqarah : 221"]],
    [8, "Larangan memakan riba", "Haramnya pinjaman berbunga", "Fiqh", ["QS. Al-Baqarah : 275"]],
    [9, "Membayar fidyah karena tidak puasa", "Kewajiban ganti rugi puasa", "Fiqh", ["QS. Al-Baqarah : 184"]],
    [10, "Apa itu khamar", "Definisi minuman memabukkan", "Fiqh", ["QS. Al-MƒÅ'idah : 90"]],

    # Kisah/Narasi (Narrative) - [11-20]
    [11, "Kisah Nabi Musa dan Firaun", "Cerita pertemuan Musa dengan raja Mesir", "Kisah", ["QS. Al-Qa·π£a·π£ : 31", "QS. Al-Qa·π£a·π£ : 36"]],
    [12, "Kapal Nabi Nuh", "Perahu raksasa nuh", "Kisah", ["QS. H≈´d : 44"]],
    [13, "Maryam melahirkan Isa", "Kelahiran putra Maryam", "Kisah", ["QS. Maryam : 23", "QS. Maryam : 27"]],
    [14, "Nabi Yusuf dan mimpi 11 bintang", "Tafsir mimpi nabi Yakub tentang bintang", "Kisah", ["QS. Y≈´suf : 4"]],
    [15, "Kisah Ashabul Kahfi", "Tujuh pemuda yang tertidur lama", "Kisah", ["QS. Al-Kahf : 10", "QS. Al-Kahf : 25"]],
    [16, "Kenapa Iblis diusir dari surga", "Alasan setan menolak sujud Adam", "Kisah", ["QS. Al-A'rƒÅf : 12", "QS. Al-Kahf : 50"]],
    [17, "Tugas malaikat Jibril", "Fungsi Gabriel membawa wahyu", "Kisah", ["QS. Al-Baqarah : 97"]],
    [18, "Kisah Qabil dan Habil", "Pembunuhan putra Adam", "Kisah", ["QS. Al-MƒÅ'idah : 27"]],
    [19, "Raja Thalut dan Jalut", "Pertempuran Daud melawan Goliat", "Kisah", ["QS. Al-Baqarah : 249", "QS. Al-Baqarah : 251"]],
    [20, "Bangsa Ya'juj dan Ma'juj", "Siapa Gog dan Magog", "Kisah", ["QS. Al-Kahf : 94"]],

    # Aqidah/Akhlak (Theology/Ethics) - [21-30]
    [21, "Larangan berbuat syirik", "Dosa menyekutukan Allah", "Aqidah", ["QS. An-NisƒÅ' : 48", "QS. LuqmƒÅn : 13"]],
    [22, "Berbakti pada kedua orang tua", "Kewajiban menghormati ayah ibu", "Aqidah", ["QS. Al-IsrƒÅ' : 23"]],
    [23, "Definisi tauhid", "Konsep keesaan Tuhan", "Aqidah", ["QS. Al-IkhlƒÅ·π£ : 1"]],
    [24, "Takdir baik dan buruk", "Ketentuan nasib yang ditetapkan Allah", "Aqidah", ["QS. Al-Qamar : 49"]],
    [25, "Larangan berbuat dusta", "Hukum berkata bohong", "Aqidah", ["QS. At-Taubah : 119"]],
    [26, "Tentang Hari Kiamat", "Deskripsi Hari Pembalasan", "Aqidah", ["QS. Al-QƒÅri'ah : 1", "QS. Al-Zalzalah : 1"]],
    [27, "Balasan bagi orang yang sombong", "Konsekuensi sifat takabur", "Aqidah", ["QS. LuqmƒÅn : 18"]],
    [28, "Larangan mengumpat", "Hukum ghibah dan mencela", "Aqidah", ["QS. Al-·∏§ujurƒÅt : 12"]],
    [29, "Keutamaan sabar", "Pentingnya menahan diri", "Aqidah", ["QS. Al-Baqarah : 153"]],
    [30, "Tujuan hidup manusia", "Mengapa kita diciptakan", "Aqidah", ["QS. Adz-DzƒÅriyƒÅt : 56"]],
]

# 2. KONVERSI KE DATAFRAME
df_robustness = pd.DataFrame(robustness_data, columns=[
    'id', 'query_a', 'query_b', 'category', 'target_ayats'
])

# 3. DUPLIKASI BARIS: Buat setiap Query A dan Query B menjadi baris terpisah
rows_a = df_robustness.rename(columns={'query_a': 'query'})[['id', 'query', 'category', 'target_ayats']]
rows_b = df_robustness.rename(columns={'query_b': 'query'})[['id', 'query', 'category', 'target_ayats']]

df_test_set = pd.concat([rows_a, rows_b], ignore_index=True)

# 4. SIMPAN KE CSV
df_test_set['target_ayats'] = df_test_set['target_ayats'].apply(lambda x: json.dumps(x))

df_test_set.to_csv(OUTPUT_PATH, index=False)

print("\n‚úÖ FILE TEST SET ROBUSTNESS BERHASIL DIBUAT!")
print(f"   Tersimpan di: {OUTPUT_PATH}")
print(f"   Total Query Uji: {len(df_test_set)} baris ({len(df_test_set)/2} pasangan).")

print("\nSTRUKTUR DATA (Preview):")
print(df_test_set.head(6))


‚úÖ FILE TEST SET ROBUSTNESS BERHASIL DIBUAT!
   Tersimpan di: c:\Kuliah ITS Farhan\Semester 3\A_Final_Project\ML_DM\fp-quran-ir-query-tafsir\data\processed\robustness_queries.csv
   Total Query Uji: 60 baris (30.0 pasangan).

STRUKTUR DATA (Preview):
   id                            query category  \
0   1     Hukum warisan bagi perempuan     Fiqh   
1   2      Kapan puasa Ramadan dimulai     Fiqh   
2   3   Cara melaksanakan sholat Jumat     Fiqh   
3   4                 Zakat hasil bumi     Fiqh   
4   5  Denda bagi yang bersumpah palsu     Fiqh   
5   6            Berwudu sebelum salat     Fiqh   

                                       target_ayats  
0                        ["QS. An-Nis\u0101' : 11"]  
1  ["QS. Al-Baqarah : 183", "QS. Al-Baqarah : 185"]  
2                            ["QS. Al-Jumu'ah : 9"]  
3                       ["QS. Al-An'\u0101m : 141"]  
4                      ["QS. Al-M\u0101'idah : 89"]  
5                       ["QS. Al-M\u0101'idah : 6"]  


In [11]:
import pandas as pd
import os
import json
import re # Diperlukan untuk parsing nama surah dan ayat

# --- KONFIGURASI PATH ---
if 'ROOT_DIR' not in locals():
    ROOT_DIR = os.getcwd()
    if not os.path.exists(os.path.join(ROOT_DIR, 'data')):
        # Asumsi: jika tidak ada folder data, kita naik satu level
        ROOT_DIR = os.path.abspath(os.path.join(os.getcwd(), '..'))

OUTPUT_PATH = os.path.join(ROOT_DIR, 'data', 'processed', 'robustness_queries.csv')
os.makedirs(os.path.dirname(OUTPUT_PATH), exist_ok=True)

# --- FUNGSI UTILITY: MENDAPATKAN ID SURAH DAN AYAT ---

# Mapping sebagian Surah yang digunakan ke ID numeriknya
# Jika Anda membutuhkan 114 Surah, gunakan file mapping lengkap (JSON/CSV)
SURAH_MAPPING = {
    "Al-Baqarah": 2, "ƒÄli 'ImrƒÅn": 3, "An-NisƒÅ'": 4, "Al-MƒÅ'idah": 5, 
    "Al-An'ƒÅm": 6, "Al-A‚ÄòrƒÅf": 7, "At-Taubah": 9, "H≈´d": 11, 
    "Y≈´suf": 12, "Ar-Ra‚Äòd": 13, "Al-IsrƒÅ'": 17, "Al-Kahf": 18, 
    "Maryam": 19, "Al-AnbiyƒÅ'": 21, "Al-Qa·π£a·π£": 28, "Al-‚ÄòAnkab≈´t": 29,
    "LuqmƒÅn": 31, "Al-A·∏•zƒÅb": 33, "YƒÅ-Sƒ´n": 36, "A·∫ì-·∫íƒÅriyƒÅt": 51,
    "Al-Qamar": 54, "Al-·∏§ujurƒÅt": 49, "Adz-DzƒÅriyƒÅt": 51, "Al-Jumu'ah": 62,
    "At-TaghƒÅbun": 64, "Al-Qalam": 68, "Al-Ma‚ÄòƒÅrij": 70, "Al-QiyƒÅmah": 75,
    "Al-IkhlƒÅ·π£": 112, "Al-Zalzalah": 99, "Al-QƒÅri'ah": 101
    # Tambahkan surah lain sesuai kebutuhan, saat ini hanya yang terpakai di robustness_data
}

def get_surah_ayat_ids(target_ayats_list):
    """
    Mengambil list lokasi ayat (misal: ["QS. An-NisƒÅ' : 11"]) 
    dan mengembalikannya sebagai list ID numerik [[Surah ID, Ayat ID]].
    """
    result_ids = []
    
    # Pattern untuk mengekstrak Nama Surah dan Nomor Ayat
    # Contoh: "QS. Al-Baqarah : 183" -> Group 1: "Al-Baqarah", Group 2: "183"
    pattern = re.compile(r"QS\. (.*?)\s*:\s*(\d+)", re.IGNORECASE)

    for ayat_str in target_ayats_list:
        match = pattern.search(ayat_str)
        if match:
            surah_name = match.group(1).strip()
            ayat_num = int(match.group(2))
            
            # Cari ID Surah, default ke -1 jika tidak ditemukan
            surah_id = SURAH_MAPPING.get(surah_name, -1)
            
            if surah_id != -1:
                result_ids.append([surah_id, ayat_num])
            else:
                # Peringatan jika Surah tidak ada di mapping
                # Ini adalah BLIND SPOT yang Anda abaikan: kelengkapan mapping!
                # print(f"‚ö†Ô∏è Peringatan: Surah '{surah_name}' tidak ditemukan di SURAH_MAPPING.")
                result_ids.append([-1, ayat_num]) # Tambahkan placeholder ID
        
        else:
            # print(f"‚ö†Ô∏è Peringatan: Format ayat '{ayat_str}' tidak dikenali.")
            pass

    return result_ids

# --- 1. DEFINISI PASANGAN QUERY & GROUND TRUTH ---
# Struktur: [Query Asli, Query Variasi/Sinonim, Kategori, [Lokasi Ayats Relevan]]
# Struktur: [ID, Query A, Query B, Kategori, [Lokasi Ayats (Teks)]]
robustness_data = [
    # Fiqh/Hukum (Law) - [1-10]
    [1, "Hukum warisan bagi perempuan", "Pembagian harta pusaka istri", "Fiqh", ["QS. An-NisƒÅ' : 11"]],
    [2, "Kapan puasa Ramadan dimulai", "Kewajiban saum di bulan suci", "Fiqh", ["QS. Al-Baqarah : 183", "QS. Al-Baqarah : 185"]],
    [3, "Cara melaksanakan sholat Jumat", "Ketentuan sembahyang Jumat", "Fiqh", ["QS. Al-Jumu'ah : 9"]],
    [4, "Zakat hasil bumi", "Kewajiban sedekah pertanian", "Fiqh", ["QS. Al-An'ƒÅm : 141"]],
    [5, "Denda bagi yang bersumpah palsu", "Konsekuensi sumpah dusta", "Fiqh", ["QS. Al-MƒÅ'idah : 89"]],
    [6, "Berwudu sebelum salat", "Tata cara bersuci sebelum ibadah", "Fiqh", ["QS. Al-MƒÅ'idah : 6"]],
    [7, "Pernikahan beda agama", "Hukum perkawinan non-Muslim", "Fiqh", ["QS. Al-Baqarah : 221"]],
    [8, "Larangan memakan riba", "Haramnya pinjaman berbunga", "Fiqh", ["QS. Al-Baqarah : 275"]],
    [9, "Membayar fidyah karena tidak puasa", "Kewajiban ganti rugi puasa", "Fiqh", ["QS. Al-Baqarah : 184"]],
    [10, "Apa itu khamar", "Definisi minuman memabukkan", "Fiqh", ["QS. Al-MƒÅ'idah : 90"]],

    # Kisah/Narasi (Narrative) - [11-20]
    [11, "Kisah Nabi Musa dan Firaun", "Cerita pertemuan Musa dengan raja Mesir", "Kisah", ["QS. Al-Qa·π£a·π£ : 31", "QS. Al-Qa·π£a·π£ : 36"]],
    [12, "Kapal Nabi Nuh", "Perahu raksasa nuh", "Kisah", ["QS. H≈´d : 44"]],
    [13, "Maryam melahirkan Isa", "Kelahiran putra Maryam", "Kisah", ["QS. Maryam : 23", "QS. Maryam : 27"]],
    [14, "Nabi Yusuf dan mimpi 11 bintang", "Tafsir mimpi nabi Yakub tentang bintang", "Kisah", ["QS. Y≈´suf : 4"]],
    [15, "Kisah Ashabul Kahfi", "Tujuh pemuda yang tertidur lama", "Kisah", ["QS. Al-Kahf : 10", "QS. Al-Kahf : 25"]],
    [16, "Kenapa Iblis diusir dari surga", "Alasan setan menolak sujud Adam", "Kisah", ["QS. Al-A‚ÄòrƒÅf : 12", "QS. Al-Kahf : 50"]],
    [17, "Tugas malaikat Jibril", "Fungsi Gabriel membawa wahyu", "Kisah", ["QS. Al-Baqarah : 97"]],
    [18, "Kisah Qabil dan Habil", "Pembunuhan putra Adam", "Kisah", ["QS. Al-MƒÅ'idah : 27"]],
    [19, "Raja Thalut dan Jalut", "Pertempuran Daud melawan Goliat", "Kisah", ["QS. Al-Baqarah : 249", "QS. Al-Baqarah : 251"]],
    [20, "Bangsa Ya'juj dan Ma'juj", "Siapa Gog dan Magog", "Kisah", ["QS. Al-Kahf : 94"]],

    # Aqidah/Akhlak (Theology/Ethics) - [21-30]
    [21, "Larangan berbuat syirik", "Dosa menyekutukan Allah", "Aqidah", ["QS. An-NisƒÅ' : 48", "QS. LuqmƒÅn : 13"]],
    [22, "Berbakti pada kedua orang tua", "Kewajiban menghormati ayah ibu", "Aqidah", ["QS. Al-IsrƒÅ' : 23"]],
    [23, "Definisi tauhid", "Konsep keesaan Tuhan", "Aqidah", ["QS. Al-IkhlƒÅ·π£ : 1"]],
    [24, "Takdir baik dan buruk", "Ketentuan nasib yang ditetapkan Allah", "Aqidah", ["QS. Al-Qamar : 49"]],
    [25, "Larangan berbuat dusta", "Hukum berkata bohong", "Aqidah", ["QS. At-Taubah : 119"]],
    [26, "Tentang Hari Kiamat", "Deskripsi Hari Pembalasan", "Aqidah", ["QS. Al-QƒÅri'ah : 1", "QS. Al-Zalzalah : 1"]],
    [27, "Balasan bagi orang yang sombong", "Konsekuensi sifat takabur", "Aqidah", ["QS. LuqmƒÅn : 18"]],
    [28, "Larangan mengumpat", "Hukum ghibah dan mencela", "Aqidah", ["QS. Al-·∏§ujurƒÅt : 12"]],
    [29, "Keutamaan sabar", "Pentingnya menahan diri", "Aqidah", ["QS. Al-Baqarah : 153"]],
    [30, "Tujuan hidup manusia", "Mengapa kita diciptakan", "Aqidah", ["QS. Adz-DzƒÅriyƒÅt : 56"]],
]

# 1.5. TAMBAHKAN KOLOM ID NUMERIK
# Lakukan pemrosesan ID numerik di sini sebelum konversi ke DataFrame
data_with_ids = []
for row in robustness_data:
    row_id, q_a, q_b, category, target_ayats_text = row
    
    # Panggil fungsi untuk mendapatkan ID numerik
    target_ayats_id = get_surah_ayat_ids(target_ayats_text)
    
    # Struktur baru: [ID, Query A, Query B, Kategori, [Ayats Teks], [Ayats ID]]
    data_with_ids.append([row_id, q_a, q_b, category, target_ayats_text, target_ayats_id])

# 2. KONVERSI KE DATAFRAME
df_robustness = pd.DataFrame(data_with_ids, columns=[
    'id', 'query_a', 'query_b', 'category', 'target_ayats_text', 'target_ayats_id'
])

# 3. DUPLIKASI BARIS: Buat setiap Query A dan Query B menjadi baris terpisah
# Kolom target_ayats_text diganti namanya agar sesuai dengan nama lama (jika ada script lain yang bergantung padanya)
rows_a = df_robustness.rename(columns={'query_a': 'query', 'target_ayats_text': 'target_ayats'})[['id', 'query', 'category', 'target_ayats', 'target_ayats_id']]
rows_b = df_robustness.rename(columns={'query_b': 'query', 'target_ayats_text': 'target_ayats'})[['id', 'query', 'category', 'target_ayats', 'target_ayats_id']]

df_test_set = pd.concat([rows_a, rows_b], ignore_index=True)

# 4. SIMPAN KE CSV
# Kita simpan kolom list sebagai string JSON
df_test_set['target_ayats'] = df_test_set['target_ayats'].apply(lambda x: json.dumps(x))
df_test_set['target_ayats_id'] = df_test_set['target_ayats_id'].apply(lambda x: json.dumps(x))


df_test_set.to_csv(OUTPUT_PATH, index=False)

print("\n‚úÖ FILE TEST SET ROBUSTNESS BERHASIL DIBUAT (dengan ID Surah & Ayat)!")
print(f" ¬† Tersimpan di: {OUTPUT_PATH}")
print(f" ¬† Total Query Uji: {len(df_test_set)} baris ({len(df_robustness)} pasangan).")

print("\nSTRUKTUR DATA (Preview dengan kolom ID baru):")
# Tampilkan 6 baris pertama dan kolom yang relevan
print(df_test_set[['id', 'query', 'category', 'target_ayats', 'target_ayats_id']].head(6).to_markdown(index=False))


‚úÖ FILE TEST SET ROBUSTNESS BERHASIL DIBUAT (dengan ID Surah & Ayat)!
 ¬† Tersimpan di: c:\Kuliah ITS Farhan\Semester 3\A_Final_Project\ML_DM\fp-quran-ir-query-tafsir\data\processed\robustness_queries.csv
 ¬† Total Query Uji: 60 baris (30 pasangan).

STRUKTUR DATA (Preview dengan kolom ID baru):


ImportError: Missing optional dependency 'tabulate'.  Use pip or conda to install tabulate.

In [12]:
import pandas as pd
import numpy as np
import re
import json
import os

# --- KONFIGURASI PATH ---
if 'ROOT_DIR' not in locals():
    ROOT_DIR = os.getcwd()
    if not os.path.exists(os.path.join(ROOT_DIR, 'data')):
        ROOT_DIR = os.path.abspath(os.path.join(os.getcwd(), '..'))

OUTPUT_PATH = os.path.join(ROOT_DIR, 'data', 'processed', 'robustness_queries_ID_BASED.csv')
os.makedirs(os.path.dirname(OUTPUT_PATH), exist_ok=True)

# 1. Daftar Kanonik 114 Surah (ID 1-114) - Kunci Perbaikan Data
canonical_surahs_simple = [
    'Al-Fatihah', 'Al-Baqarah', 'Ali Imran', 'An-Nisa', 'Al-Maidah', 'Al-Anam', 'Al-Araf', 'Al-Anfal', 
    'At-Taubah', 'Yunus', 'Hud', 'Yusuf', 'Ar-Rad', 'Ibrahim', 'Al-Hijr', 'An-Nahl', 'Al-Isra', 'Al-Kahf', 
    'Maryam', 'Taha', 'Al-Anbiya', 'Al-Haj', 'Al-Muminun', 'An-Nur', 'Al-Furqan', 'Asy-Syuara', 
    'An-Naml', 'Al-Qasas', 'Al-Ankabut', 'Ar-Rum', 'Luqman', 'As-Sajdah', 'Al-Ahzab', 'Saba', 'Fatir', 
    'Yasin', 'As-Saffat', 'Sad', 'Az-Zumar', 'Gafir', 'Fussilat', 'Asy-Syura', 'Az-Zukhruf', 'Ad-Dukhan', 
    'Al-Jasiyah', 'Al-Ahqaf', 'Muhammad', 'Al-Fath', 'Al-Hujurat', 'Qaf', 'Adz-Zariyat', 'At-Tur', 'An-Najm', 
    'Al-Qamar', 'Ar-Rahman', 'Al-Waqiah', 'Al-Hadid', 'Al-Mujadilah', 'Al-Hasyr', 'Al-Mumtahanah', 'As-Saff', 
    'Al-Jumuah', 'Al-Munafiqun', 'At-Tagabun', 'At-Talaq', 'At-Tahrim', 'Al-Mulk', 'Al-Qalam', 'Al-Haqqah', 
    'Al-Maarij', 'Nuh', 'Al-Jinn', 'Al-Muzzammil', 'Al-Muddassir', 'Al-Qiyamah', 'Al-Insan', 'Al-Mursalat', 
    'An-Naba', 'An-Naziat', 'Abasa', 'At-Takwir', 'Al-Infitar', 'Al-Mutaffifin', 'Al-Insyiqaq', 
    'Al-Buruj', 'At-Tariq', 'Al-Ala', 'Al-Gasyiyah', 'Al-Fajr', 'Al-Balad', 'Asy-Syams', 'Al-Lail', 'Ad-Duha', 
    'Al-Insyirah', 'At-Tin', 'Al-Alaq', 'Al-Qadr', 'Al-Bayyinah', 'Az-Zalzalah', 'Al-Adiyat', 'Al-Qariah', 
    'At-Takasur', 'Al-Asr', 'Al-Humazah', 'Al-Fil', 'Quraisy', 'Al-Maun', 'Al-Kausar', 'Al-Kafirun', 
    'An-Nasr', 'Al-Lahab', 'Al-Ikhlas', 'Al-Falaq', 'An-Nas'
]

surah_name_to_id = {name: i + 1 for i, name in enumerate(canonical_surahs_simple)}

# 2. Fungsi Konversi String Lokasi ke ID Numerik
# Tabel manual untuk mapping karakter Unicode di input string ke nama sederhana di list kanonik
translation_table = {
    'An-NisƒÅ\'': 'An-Nisa', 'Al-Baqarah': 'Al-Baqarah', 'Al-Jumu\'ah': 'Al-Jumuah', 'Al-An\'ƒÅm': 'Al-Anam', 
    'Al-MƒÅ\'idah': 'Al-Maidah', 'Al-Qa·π£a·π£': 'Al-Qasas', 'H≈´d': 'Hud', 'Maryam': 'Maryam', 'Y≈´suf': 'Yusuf', 
    'Al-Kahf': 'Al-Kahf', 'Al-A‚ÄòrƒÅf': 'Al-Araf', 'LuqmƒÅn': 'Luqman', 'Al-IsrƒÅ\'': 'Al-Isra', 
    'At-Taubah': 'At-Taubah', 'Al-IkhlƒÅ·π£': 'Al-Ikhlas', 'Al-Qamar': 'Al-Qamar', 'Adz-DzƒÅriyƒÅt': 'Adz-Zariyat', 
    'Al-·∏§ujurƒÅt': 'Al-Hujurat', 'Al-QƒÅri\'ah': 'Al-Qariah', 'Az-Zalzalah': 'Az-Zalzalah', 'Al-KƒÅfir≈´n': 'Al-Kafirun', 
    'Ar-Ra·∏•mƒÅn': 'Ar-Rahman', 'Al-AnbiyƒÅ\'': 'Al-Anbiya', 'Al-FƒÅti·∏•ah': 'Al-Fatihah', 'ƒÄli ‚ÄòImrƒÅn': 'Ali Imran', 
    'Al-Fƒ´l': 'Al-Fil', 'Al-MƒÅ‚Äò≈´n': 'Al-Maun', 'Al-Kausar': 'Al-Kausar', 'An-Na·π£r': 'An-Nasr', 'Al-Lahab': 'Al-Lahab', 
    'Al-Falaq': 'Al-Falaq', 'An-NƒÅs': 'An-Nas', 'Al-AnfƒÅl': 'Al-Anfal', 'Al-A·∏•zƒÅb': 'Al-Ahzab', 'Al-InsƒÅn': 'Al-Insan',
    'Al-Kausar': 'Al-Kausar', 'Al-Fƒ´l': 'Al-Fil', 'Quraisy': 'Quraisy', 'Al-MƒÅ‚Äò≈´n': 'Al-Maun', 'Al-KƒÅfir≈´n': 'Al-Kafirun',
    'Adz-DzƒÅriyƒÅt': 'Adz-Zariyat', 'A·π£-·π¢affƒÅt': 'As-Saffat', '·π¢ƒÅd': 'Sad', 'Az-Zumar': 'Az-Zumar', 'GƒÅfir': 'Gafir',
    'Fu·π£·π£ilat': 'Fussilat', 'Asy-Sy≈´rƒÅ': 'Asy-Syura', 'Az-Zukhruf': 'Az-Zukhruf', 'Ad-DukhƒÅn': 'Ad-Dukhan',
    'Al-JƒÅ·π°iyah': 'Al-Jasiyah', 'Al-A·∏•qƒÅf': 'Al-Ahqaf', 'Mu·∏•ammad': 'Muhammad', 'Al-Fat·∏•': 'Al-Fath', 
    'Al-MujƒÅdilah': 'Al-Mujadilah', 'Al-·∏§asyr': 'Al-Hasyr', 'Al-Mumta·∏•anah': 'Al-Mumtahanah', 'A·π£-·π¢aff': 'As-Saff',
    'Al-Jumu‚Äòah': 'Al-Jumuah', 'Al-MunƒÅfiq≈´n': 'Al-Munafiqun', 'At-TagƒÅbun': 'At-Tagabun', 'A·π≠-·π¨alƒÅq': 'At-Talaq',
    'At-Ta·∏•rƒ´m': 'At-Tahrim', 'Al-Mulk': 'Al-Mulk', 'Al-Qalam': 'Al-Qalam', 'Al-·∏§ƒÅqqah': 'Al-Haqqah',
    'Al-Ma‚ÄòƒÅrij': 'Al-Maarij', 'N≈´·∏•': 'Nuh', 'Al-Jinn': 'Al-Jinn', 'Al-Muzzammil': 'Al-Muzzammil',
    'Al-Muddasir': 'Al-Muddassir', 'Al-QiyƒÅmah': 'Al-Qiyamah', 'Al-MursalƒÅt': 'Al-Mursalat', 'An-Naba\'': 'An-Naba',
    'An-NƒÅzi‚ÄòƒÅt': 'An-Naziat', '‚ÄòAbasa': 'Abasa', 'At-Takwƒ´r': 'At-Takwir', 'Al-Infi·π≠ƒÅr': 'Al-Infitar',
    'Al-Mu·π≠affifƒ´n': 'Al-Mutaffifin', 'Al-InsyiqƒÅq': 'Al-Insyiqaq', 'Al-Bur≈´j': 'Al-Buruj', 'A·π≠-·π¨ƒÅriq': 'At-Tariq',
    'Al-A‚ÄòlƒÅ': 'Al-Ala', 'Al-GƒÅsyiyah': 'Al-Gasyiyah', 'Al-Fajr': 'Al-Fajr', 'Al-Balad': 'Al-Balad',
    'Asy-Syams': 'Asy-Syams', 'Al-Lail': 'Al-Lail', 'A·∏ç-·∏åu·∏•ƒÅ': 'Ad-Duha', 'Al-InsyirƒÅ·∏•': 'Al-Insyirah',
    'At-Tƒ´n': 'At-Tin', 'Al-‚ÄòAlaq': 'Al-Alaq', 'Al-Qadr': 'Al-Qadr', 'Al-Bayyinah': 'Al-Bayyinah',
    'Az-Zalzalah': 'Az-Zalzalah', 'Al-Adiyat': 'Al-Adiyat', 'Al-QƒÅri‚Äòah': 'Al-Qariah', 'At-TakƒÅ·π°ur': 'At-Takasur',
    'Al-‚ÄòA·π£r': 'Al-Asr', 'Al-Humazah': 'Al-Humazah'
}

def convert_to_numeric_id(location_str_list):
    numeric_targets = []
    
    for loc_str in location_str_list:
        if not isinstance(loc_str, str) or "QS." not in loc_str: continue
            
        match_surah = re.search(r'QS\. (.+?)\s+:', loc_str)
        match_ayah = re.search(r'Ayat\s+(\d+)', loc_str)
        
        if match_surah and match_ayah:
            surah_name_raw = match_surah.group(1).strip()
            ayah_id = int(match_ayah.group(1))
            
            # KUNCI PERBAIKAN: Gunakan tabel translasi
            simple_name = translation_table.get(surah_name_raw)
            surah_id = surah_name_to_id.get(simple_name)

            if surah_id is not None:
                numeric_targets.append({
                    'surah_id': surah_id,
                    'ayah_id': ayah_id,
                    'location_str': loc_str 
                })
    return numeric_targets

# 3. DEFINISI PASANGAN QUERY & GROUND TRUTH (Menggunakan string asli)
robustness_data = [
    [1, "Hukum warisan bagi perempuan", "Pembagian harta pusaka istri", "Fiqh", ["QS. An-NisƒÅ' : 11"]],
    [2, "Kapan puasa Ramadan dimulai", "Kewajiban saum di bulan suci", "Fiqh", ["QS. Al-Baqarah : 183", "QS. Al-Baqarah : 185"]],
    [3, "Cara melaksanakan sholat Jumat", "Ketentuan sembahyang Jumat", "Fiqh", ["QS. Al-Jumu'ah : 9"]],
    [4, "Zakat hasil bumi", "Kewajiban sedekah pertanian", "Fiqh", ["QS. Al-An'ƒÅm : 141"]],
    [5, "Denda bagi yang bersumpah palsu", "Konsekuensi sumpah dusta", "Fiqh", ["QS. Al-MƒÅ'idah : 89"]],
    [6, "Berwudu sebelum salat", "Tata cara bersuci sebelum ibadah", "Fiqh", ["QS. Al-MƒÅ'idah : 6"]],
    [7, "Pernikahan beda agama", "Hukum perkawinan non-Muslim", "Fiqh", ["QS. Al-Baqarah : 221"]],
    [8, "Larangan memakan riba", "Haramnya pinjaman berbunga", "Fiqh", ["QS. Al-Baqarah : 275"]],
    [9, "Membayar fidyah karena tidak puasa", "Kewajiban ganti rugi puasa", "Fiqh", ["QS. Al-Baqarah : 184"]],
    [10, "Apa itu khamar", "Definisi minuman memabukkan", "Fiqh", ["QS. Al-MƒÅ'idah : 90"]],

    [11, "Kisah Nabi Musa dan Firaun", "Cerita pertemuan Musa dengan raja Mesir", "Kisah", ["QS. Al-Qa·π£a·π£ : 31", "QS. Al-Qa·π£a·π£ : 36"]],
    [12, "Kapal Nabi Nuh", "Perahu raksasa nuh", "Kisah", ["QS. H≈´d : 44"]],
    [13, "Maryam melahirkan Isa", "Kelahiran putra Maryam", "Kisah", ["QS. Maryam : 23", "QS. Maryam : 27"]],
    [14, "Nabi Yusuf dan mimpi 11 bintang", "Tafsir mimpi nabi Yakub tentang bintang", "Kisah", ["QS. Y≈´suf : 4"]],
    [15, "Kisah Ashabul Kahfi", "Tujuh pemuda yang tertidur lama", "Kisah", ["QS. Al-Kahf : 10", "QS. Al-Kahf : 25"]],
    [16, "Kenapa Iblis diusir dari surga", "Alasan setan menolak sujud Adam", "Kisah", ["QS. Al-A‚ÄòrƒÅf : 12", "QS. Al-Kahf : 50"]],
    [17, "Tugas malaikat Jibril", "Fungsi Gabriel membawa wahyu", "Kisah", ["QS. Al-Baqarah : 97"]],
    [18, "Kisah Qabil dan Habil", "Pembunuhan putra Adam", "Kisah", ["QS. Al-MƒÅ'idah : 27"]],
    [19, "Raja Thalut dan Jalut", "Pertempuran Daud melawan Goliat", "Kisah", ["QS. Al-Baqarah : 249", "QS. Al-Baqarah : 251"]],
    [20, "Bangsa Ya'juj dan Ma'juj", "Siapa Gog dan Magog", "Kisah", ["QS. Al-Kahf : 94"]],

    [21, "Larangan berbuat syirik", "Dosa menyekutukan Allah", "Aqidah", ["QS. An-NisƒÅ' : 48", "QS. LuqmƒÅn : 13"]],
    [22, "Berbakti pada kedua orang tua", "Kewajiban menghormati ayah ibu", "Aqidah", ["QS. Al-IsrƒÅ' : 23"]],
    [23, "Definisi tauhid", "Konsep keesaan Tuhan", "Aqidah", ["QS. Al-IkhlƒÅ·π£ : 1"]],
    [24, "Takdir baik dan buruk", "Ketentuan nasib yang ditetapkan Allah", "Aqidah", ["QS. Al-Qamar : 49"]],
    [25, "Larangan berbuat dusta", "Hukum berkata bohong", "Aqidah", ["QS. At-Taubah : 119"]],
    [26, "Tentang Hari Kiamat", "Deskripsi Hari Pembalasan", "Aqidah", ["QS. Al-QƒÅri'ah : 1", "QS. Az-Zalzalah : 1"]],
    [27, "Balasan bagi orang yang sombong", "Konsekuensi sifat takabur", "Aqidah", ["QS. LuqmƒÅn : 18"]],
    [28, "Larangan mengumpat", "Hukum ghibah dan mencela", "Aqidah", ["QS. Al-·∏§ujurƒÅt : 12"]],
    [29, "Keutamaan sabar", "Pentingnya menahan diri", "Aqidah", ["QS. Al-Baqarah : 153"]],
    [30, "Tujuan hidup manusia", "Mengapa kita diciptakan", "Aqidah", ["QS. Adz-DzƒÅriyƒÅt : 56"]],
]

# 4. KONVERSI KE DATAFRAME
df_robustness = pd.DataFrame(robustness_data, columns=[
    'id', 'query_a', 'query_b', 'category', 'target_ayats'
])

# 5. DUPLIKASI BARIS
rows_a = df_robustness.rename(columns={'query_a': 'query'})[['id', 'query', 'category', 'target_ayats']]
rows_b = df_robustness.rename(columns={'query_b': 'query'})[['id', 'query', 'category', 'target_ayats']]
df_test_set = pd.concat([rows_a, rows_b], ignore_index=True)

# 6. KONVERSI STRING GROUND TRUTH KE STRUCTURED NUMERIC ID
df_test_set['target_ayats_id'] = df_test_set['target_ayats'].apply(convert_to_numeric_id)

# 7. SIMPAN KE CSV
df_test_set['target_ayats'] = df_test_set['target_ayats'].apply(lambda x: json.dumps(x))
df_test_set['target_ayats_id'] = df_test_set['target_ayats_id'].apply(lambda x: json.dumps(x))

df_test_set.to_csv(OUTPUT_PATH, index=False)

print("\n‚úÖ FILE TEST SET ROBUSTNESS ID-BASED BERHASIL DIBUAT (OUTPUT FIX)!")
print(f"   Tersimpan di: {OUTPUT_PATH}")
print(f"   Total Query Uji: {len(df_test_set)} baris.")

# DEBUG: Hitung berapa target yang berhasil di-map
success_count = df_test_set['target_ayats_id'].apply(lambda x: len(json.loads(x))).sum()
print(f"   Total Target Ayats yang berhasil di-map: {success_count}. (Harusnya 43)")

print("\nSTRUKTUR DATA (Preview, cek apakah list sudah terisi):")
# PERBAIKAN FINAL: Menggunakan to_string() untuk menghindari error 'tabulate'
print(df_test_set[['id', 'query', 'category', 'target_ayats_id']].head(6).to_string(index=False))


‚úÖ FILE TEST SET ROBUSTNESS ID-BASED BERHASIL DIBUAT (OUTPUT FIX)!
   Tersimpan di: c:\Kuliah ITS Farhan\Semester 3\A_Final_Project\ML_DM\fp-quran-ir-query-tafsir\data\processed\robustness_queries_ID_BASED.csv
   Total Query Uji: 60 baris.
   Total Target Ayats yang berhasil di-map: 0. (Harusnya 43)

STRUKTUR DATA (Preview, cek apakah list sudah terisi):
 id                           query category target_ayats_id
  1    Hukum warisan bagi perempuan     Fiqh              []
  2     Kapan puasa Ramadan dimulai     Fiqh              []
  3  Cara melaksanakan sholat Jumat     Fiqh              []
  4                Zakat hasil bumi     Fiqh              []
  5 Denda bagi yang bersumpah palsu     Fiqh              []
  6           Berwudu sebelum salat     Fiqh              []


In [15]:
import pandas as pd
import numpy as np
import re
import json
import os
import unicodedata # Wajib untuk menangani Unicode

# --- KONFIGURASI PATH ---
if 'ROOT_DIR' not in locals():
    ROOT_DIR = os.getcwd()
    if not os.path.exists(os.path.join(ROOT_DIR, 'data')):
        ROOT_DIR = os.path.abspath(os.path.join(os.getcwd(), '..'))

OUTPUT_PATH = os.path.join(ROOT_DIR, 'data', 'processed', 'robustness_queries_ID_BASED.csv')
os.makedirs(os.path.dirname(OUTPUT_PATH), exist_ok=True)

# 1. Daftar Kanonik 114 Surah (ID 1-114) - Kunci Referensi
canonical_surahs_simple = [
    'Al-Fatihah', 'Al-Baqarah', 'Ali Imran', 'An-Nisa', 'Al-Maidah', 'Al-Anam', 'Al-Araf', 'Al-Anfal', 
    'At-Taubah', 'Yunus', 'Hud', 'Yusuf', 'Ar-Rad', 'Ibrahim', 'Al-Hijr', 'An-Nahl', 'Al-Isra', 'Al-Kahf', 
    'Maryam', 'Taha', 'Al-Anbiya', 'Al-Haj', 'Al-Muminun', 'An-Nur', 'Al-Furqan', 'Asy-Syuara', 
    'An-Naml', 'Al-Qasas', 'Al-Ankabut', 'Ar-Rum', 'Luqman', 'As-Sajdah', 'Al-Ahzab', 'Saba', 'Fatir', 
    'Yasin', 'As-Saffat', 'Sad', 'Az-Zumar', 'Gafir', 'Fussilat', 'Asy-Syura', 'Az-Zukhruf', 'Ad-Dukhan', 
    'Al-Jasiyah', 'Al-Ahqaf', 'Muhammad', 'Al-Fath', 'Al-Hujurat', 'Qaf', 'Adz-Zariyat', 'At-Tur', 'An-Najm', 
    'Al-Qamar', 'Ar-Rahman', 'Al-Waqiah', 'Al-Hadid', 'Al-Mujadilah', 'Al-Hasyr', 'Al-Mumtahanah', 'As-Saff', 
    'Al-Jumuah', 'Al-Munafiqun', 'At-Tagabun', 'At-Talaq', 'At-Tahrim', 'Al-Mulk', 'Al-Qalam', 'Al-Haqqah', 
    'Al-Maarij', 'Nuh', 'Al-Jinn', 'Al-Muzzammil', 'Al-Muddassir', 'Al-Qiyamah', 'Al-Insan', 'Al-Mursalat', 
    'An-Naba', 'An-Naziat', 'Abasa', 'At-Takwir', 'Al-Infitar', 'Al-Mutaffifin', 'Al-Insyiqaq', 
    'Al-Buruj', 'At-Tariq', 'Al-Ala', 'Al-Gasyiyah', 'Al-Fajr', 'Al-Balad', 'Asy-Syams', 'Al-Lail', 'Ad-Duha', 
    'Al-Insyirah', 'At-Tin', 'Al-Alaq', 'Al-Qadr', 'Al-Bayyinah', 'Az-Zalzalah', 'Al-Adiyat', 'Al-Qariah', 
    'At-Takasur', 'Al-Asr', 'Al-Humazah', 'Al-Fil', 'Quraisy', 'Al-Maun', 'Al-Kausar', 'Al-Kafirun', 
    'An-Nasr', 'Al-Lahab', 'Al-Ikhlas', 'Al-Falaq', 'An-Nas'
]

surah_name_to_id = {name: i + 1 for i, name in enumerate(canonical_surahs_simple)}

# 2. FUNGSI HYPER-NORMALISASI (KUNCI FIX)
def normalize_string(text):
    """Menghapus spasi, tanda baca, dan aksen untuk membuat key yang aman."""
    if not isinstance(text, str): return ""
    
    # 1. Menghapus aksen Unicode (Contoh: ƒÅ -> a, ‚Äò -> ')
    text = ''.join(c for c in unicodedata.normalize('NFD', text) if unicodedata.category(c) != 'Mn')
    
    # 2. Menghapus semua karakter non-alfanumerik
    text = re.sub(r'[^\w]', '', text)
    
    return text.lower()

# 3. BUAT KAMUS TRANSLASI YANG SUDAH DINORMALISASI
# Keys: Nama surah di input ground truth (raw)
# Values: Nama surah yang sudah dinormalisasi dan ada di list kanonik
translation_table_raw_to_simple = {
    "An-NisƒÅ'": "An-Nisa", "Al-Baqarah": "Al-Baqarah", "Al-Jumu'ah": "Al-Jumuah", "Al-An'ƒÅm": "Al-Anam", 
    "Al-MƒÅ'idah": "Al-Maidah", "Al-Qa·π£a·π£": "Al-Qasas", "H≈´d": "Hud", "Maryam": "Maryam", "Y≈´suf": "Yusuf", 
    "Al-Kahf": "Al-Kahf", "Al-A‚ÄòrƒÅf": "Al-Araf", "LuqmƒÅn": "Luqman", "Al-IsrƒÅ'": "Al-Isra", 
    "At-Taubah": "At-Taubah", "Al-IkhlƒÅ·π£": "Al-Ikhlas", "Al-Qamar": "Al-Qamar", "Adz-DzƒÅriyƒÅt": "Adz-Zariyat", 
    "Al-·∏§ujurƒÅt": "Al-Hujurat", "Al-QƒÅri'ah": "Al-Qariah", "Az-Zalzalah": "Az-Zalzalah"
    # Hanya butuh yang ada di 30 query Anda
}

# Kamus final: { [Nama Surah Mentah yang Dinormalisasi] : ID Numerik }
# Ini adalah kamus yang akan kita pakai untuk lookup
final_id_map = {}
for raw_name, simple_name in translation_table_raw_to_simple.items():
    if simple_name in surah_name_to_id:
        normalized_key = normalize_string(raw_name)
        final_id_map[normalized_key] = surah_name_to_id[simple_name]


# 4. Fungsi Konversi String Lokasi ke ID Numerik (Final)
def convert_to_numeric_id(location_str_list):
    numeric_targets = []
    
    for loc_str in location_str_list:
        if not isinstance(loc_str, str) or "QS." not in loc_str:
            continue
            
        match_surah = re.search(r'QS\. (.+?)\s+:', loc_str)
        match_ayah = re.search(r'Ayat\s+(\d+)', loc_str)
        
        if match_surah and match_ayah:
            surah_name_raw = match_surah.group(1).strip()
            ayah_id = int(match_ayah.group(1))
            
            # KUNCI PERBAIKAN: Normalisasi sebelum Lookup
            normalized_name_input = normalize_string(surah_name_raw)
            
            surah_id = final_id_map.get(normalized_name_input)

            if surah_id is not None:
                numeric_targets.append({
                    'surah_id': surah_id,
                    'ayah_id': ayah_id,
                    'location_str': loc_str 
                })
    return numeric_targets

# 5. DEFINISI PASANGAN QUERY & GROUND TRUTH (Tidak diubah)
robustness_data = [
    [1, "Hukum warisan bagi perempuan", "Pembagian harta pusaka istri", "Fiqh", ["QS. An-NisƒÅ' : 11"]],
    [2, "Kapan puasa Ramadan dimulai", "Kewajiban saum di bulan suci", "Fiqh", ["QS. Al-Baqarah : 183", "QS. Al-Baqarah : 185"]],
    [3, "Cara melaksanakan sholat Jumat", "Ketentuan sembahyang Jumat", "Fiqh", ["QS. Al-Jumu'ah : 9"]],
    [4, "Zakat hasil bumi", "Kewajiban sedekah pertanian", "Fiqh", ["QS. Al-An'ƒÅm : 141"]],
    [5, "Denda bagi yang bersumpah palsu", "Konsekuensi sumpah dusta", "Fiqh", ["QS. Al-MƒÅ'idah : 89"]],
    [6, "Berwudu sebelum salat", "Tata cara bersuci sebelum ibadah", "Fiqh", ["QS. Al-MƒÅ'idah : 6"]],
    [7, "Pernikahan beda agama", "Hukum perkawinan non-Muslim", "Fiqh", ["QS. Al-Baqarah : 221"]],
    [8, "Larangan memakan riba", "Haramnya pinjaman berbunga", "Fiqh", ["QS. Al-Baqarah : 275"]],
    [9, "Membayar fidyah karena tidak puasa", "Kewajiban ganti rugi puasa", "Fiqh", ["QS. Al-Baqarah : 184"]],
    [10, "Apa itu khamar", "Definisi minuman memabukkan", "Fiqh", ["QS. Al-MƒÅ'idah : 90"]],

    [11, "Kisah Nabi Musa dan Firaun", "Cerita pertemuan Musa dengan raja Mesir", "Kisah", ["QS. Al-Qa·π£a·π£ : 31", "QS. Al-Qa·π£a·π£ : 36"]],
    [12, "Kapal Nabi Nuh", "Perahu raksasa nuh", "Kisah", ["QS. H≈´d : 44"]],
    [13, "Maryam melahirkan Isa", "Kelahiran putra Maryam", "Kisah", ["QS. Maryam : 23", "QS. Maryam : 27"]],
    [14, "Nabi Yusuf dan mimpi 11 bintang", "Tafsir mimpi nabi Yakub tentang bintang", "Kisah", ["QS. Y≈´suf : 4"]],
    [15, "Kisah Ashabul Kahfi", "Tujuh pemuda yang tertidur lama", "Kisah", ["QS. Al-Kahf : 10", "QS. Al-Kahf : 25"]],
    [16, "Kenapa Iblis diusir dari surga", "Alasan setan menolak sujud Adam", "Kisah", ["QS. Al-A‚ÄòrƒÅf : 12", "QS. Al-Kahf : 50"]],
    [17, "Tugas malaikat Jibril", "Fungsi Gabriel membawa wahyu", "Kisah", ["QS. Al-Baqarah : 97"]],
    [18, "Kisah Qabil dan Habil", "Pembunuhan putra Adam", "Kisah", ["QS. Al-MƒÅ'idah : 27"]],
    [19, "Raja Thalut dan Jalut", "Pertempuran Daud melawan Goliat", "Kisah", ["QS. Al-Baqarah : 249", "QS. Al-Baqarah : 251"]],
    [20, "Bangsa Ya'juj dan Ma'juj", "Siapa Gog dan Magog", "Kisah", ["QS. Al-Kahf : 94"]],

    [21, "Larangan berbuat syirik", "Dosa menyekutukan Allah", "Aqidah", ["QS. An-NisƒÅ' : 48", "QS. LuqmƒÅn : 13"]],
    [22, "Berbakti pada kedua orang tua", "Kewajiban menghormati ayah ibu", "Aqidah", ["QS. Al-IsrƒÅ' : 23"]],
    [23, "Definisi tauhid", "Konsep keesaan Tuhan", "Aqidah", ["QS. Al-IkhlƒÅ·π£ : 1"]],
    [24, "Takdir baik dan buruk", "Ketentuan nasib yang ditetapkan Allah", "Aqidah", ["QS. Al-Qamar : 49"]],
    [25, "Larangan berbuat dusta", "Hukum berkata bohong", "Aqidah", ["QS. At-Taubah : 119"]],
    [26, "Tentang Hari Kiamat", "Deskripsi Hari Pembalasan", "Aqidah", ["QS. Al-QƒÅri'ah : 1", "QS. Az-Zalzalah : 1"]],
    [27, "Balasan bagi orang yang sombong", "Konsekuensi sifat takabur", "Aqidah", ["QS. LuqmƒÅn : 18"]],
    [28, "Larangan mengumpat", "Hukum ghibah dan mencela", "Aqidah", ["QS. Al-·∏§ujurƒÅt : 12"]],
    [29, "Keutamaan sabar", "Pentingnya menahan diri", "Aqidah", ["QS. Al-Baqarah : 153"]],
    [30, "Tujuan hidup manusia", "Mengapa kita diciptakan", "Aqidah", ["QS. Adz-DzƒÅriyƒÅt : 56"]],
]

# 6. KONVERSI KE DATAFRAME & DUPLIKASI BARIS (Tidak diubah)
df_robustness = pd.DataFrame(robustness_data, columns=[
    'id', 'query_a', 'query_b', 'category', 'target_ayats'
])
rows_a = df_robustness.rename(columns={'query_a': 'query'})[['id', 'query', 'category', 'target_ayats']]
rows_b = df_robustness.rename(columns={'query_b': 'query'})[['id', 'query', 'category', 'target_ayats']]
df_test_set = pd.concat([rows_a, rows_b], ignore_index=True)

# 7. KONVERSI STRING GROUND TRUTH KE STRUCTURED NUMERIC ID (Memakai fungsi perbaikan)
df_test_set['target_ayats_id'] = df_test_set['target_ayats'].apply(convert_to_numeric_id)

# 8. SIMPAN KE CSV (Output Fix)
df_test_set['target_ayats'] = df_test_set['target_ayats'].apply(lambda x: json.dumps(x))
df_test_set['target_ayats_id'] = df_test_set['target_ayats_id'].apply(lambda x: json.dumps(x))

df_test_set.to_csv(OUTPUT_PATH, index=False)

print("\n‚úÖ FILE TEST SET ROBUSTNESS ID-BASED BERHASIL DIBUAT (FINAL FIX)!")
print(f"   Tersimpan di: {OUTPUT_PATH}")
print(f"   Total Query Uji: {len(df_test_set)} baris.")

# DEBUG: Hitung berapa target yang berhasil di-map
success_count = df_test_set['target_ayats_id'].apply(lambda x: len(json.loads(x))).sum()
print(f"   Total Target Ayats yang berhasil di-map: {success_count}. (Harusnya 43)")

print("\nSTRUKTUR DATA (Preview, cek apakah list sudah terisi):")
print(df_test_set[['id', 'query', 'category', 'target_ayats_id']].head(6).to_string(index=False))


‚úÖ FILE TEST SET ROBUSTNESS ID-BASED BERHASIL DIBUAT (FINAL FIX)!
   Tersimpan di: c:\Kuliah ITS Farhan\Semester 3\A_Final_Project\ML_DM\fp-quran-ir-query-tafsir\data\processed\robustness_queries_ID_BASED.csv
   Total Query Uji: 60 baris.
   Total Target Ayats yang berhasil di-map: 0. (Harusnya 43)

STRUKTUR DATA (Preview, cek apakah list sudah terisi):
 id                           query category target_ayats_id
  1    Hukum warisan bagi perempuan     Fiqh              []
  2     Kapan puasa Ramadan dimulai     Fiqh              []
  3  Cara melaksanakan sholat Jumat     Fiqh              []
  4                Zakat hasil bumi     Fiqh              []
  5 Denda bagi yang bersumpah palsu     Fiqh              []
  6           Berwudu sebelum salat     Fiqh              []


In [16]:
import pandas as pd
import numpy as np
import re
import json
import os
import unicodedata # Digunakan untuk menangani Unicode

# --- KONFIGURASI PATH ---
if 'ROOT_DIR' not in locals():
    ROOT_DIR = os.getcwd()
    if not os.path.exists(os.path.join(ROOT_DIR, 'data')):
        ROOT_DIR = os.path.abspath(os.path.join(os.getcwd(), '..'))

OUTPUT_PATH = os.path.join(ROOT_DIR, 'data', 'processed', 'robustness_queries_ID_BASED.csv')
os.makedirs(os.path.dirname(OUTPUT_PATH), exist_ok=True)

# 1. Daftar Kanonik 114 Surah (ID 1-114)
canonical_surahs_simple = [
    'Al-Fatihah', 'Al-Baqarah', 'Ali Imran', 'An-Nisa', 'Al-Maidah', 'Al-Anam', 'Al-Araf', 'Al-Anfal', 
    'At-Taubah', 'Yunus', 'Hud', 'Yusuf', 'Ar-Rad', 'Ibrahim', 'Al-Hijr', 'An-Nahl', 'Al-Isra', 'Al-Kahf', 
    'Maryam', 'Taha', 'Al-Anbiya', 'Al-Haj', 'Al-Muminun', 'An-Nur', 'Al-Furqan', 'Asy-Syuara', 
    'An-Naml', 'Al-Qasas', 'Al-Ankabut', 'Ar-Rum', 'Luqman', 'As-Sajdah', 'Al-Ahzab', 'Saba', 'Fatir', 
    'Yasin', 'As-Saffat', 'Sad', 'Az-Zumar', 'Gafir', 'Fussilat', 'Asy-Syura', 'Az-Zukhruf', 'Ad-Dukhan', 
    'Al-Jasiyah', 'Al-Ahqaf', 'Muhammad', 'Al-Fath', 'Al-Hujurat', 'Qaf', 'Adz-Zariyat', 'At-Tur', 'An-Najm', 
    'Al-Qamar', 'Ar-Rahman', 'Al-Waqiah', 'Al-Hadid', 'Al-Mujadilah', 'Al-Hasyr', 'Al-Mumtahanah', 'As-Saff', 
    'Al-Jumuah', 'Al-Munafiqun', 'At-Tagabun', 'At-Talaq', 'At-Tahrim', 'Al-Mulk', 'Al-Qalam', 'Al-Haqqah', 
    'Al-Maarij', 'Nuh', 'Al-Jinn', 'Al-Muzzammil', 'Al-Muddassir', 'Al-Qiyamah', 'Al-Insan', 'Al-Mursalat', 
    'An-Naba', 'An-Naziat', 'Abasa', 'At-Takwir', 'Al-Infitar', 'Al-Mutaffifin', 'Al-Insyiqaq', 
    'Al-Buruj', 'At-Tariq', 'Al-Ala', 'Al-Gasyiyah', 'Al-Fajr', 'Al-Balad', 'Asy-Syams', 'Al-Lail', 'Ad-Duha', 
    'Al-Insyirah', 'At-Tin', 'Al-Alaq', 'Al-Qadr', 'Al-Bayyinah', 'Az-Zalzalah', 'Al-Adiyat', 'Al-Qariah', 
    'At-Takasur', 'Al-Asr', 'Al-Humazah', 'Al-Fil', 'Quraisy', 'Al-Maun', 'Al-Kausar', 'Al-Kafirun', 
    'An-Nasr', 'Al-Lahab', 'Al-Ikhlas', 'Al-Falaq', 'An-Nas'
]

surah_name_to_id = {name: i + 1 for i, name in enumerate(canonical_surahs_simple)}

# 2. FUNGSI DAN KAMUS KONVERSI EKSPISIT (KUNCI FIX)
# Mapping eksplisit dari string raw input ke nama kanonik yang mudah dicari
EXPLICIT_TRANSLATION_MAP = {
    # Surah dengan Unicode / Apostrof
    "An-NisƒÅ'": "An-Nisa", "Al-Baqarah": "Al-Baqarah", "Al-Jumu'ah": "Al-Jumuah", 
    "Al-An'ƒÅm": "Al-Anam", "Al-MƒÅ'idah": "Al-Maidah", "Al-Qa·π£a·π£": "Al-Qasas", 
    "H≈´d": "Hud", "Y≈´suf": "Yusuf", "Al-Kahf": "Al-Kahf", "Al-A‚ÄòrƒÅf": "Al-Araf", 
    "LuqmƒÅn": "Luqman", "Al-IsrƒÅ'": "Al-Isra", "At-Taubah": "At-Taubah", "Al-IkhlƒÅ·π£": "Al-Ikhlas", 
    "Al-Qamar": "Al-Qamar", "Adz-DzƒÅriyƒÅt": "Adz-Zariyat", "Al-·∏§ujurƒÅt": "Al-Hujurat", 
    "Al-QƒÅri'ah": "Al-Qariah", "Az-Zalzalah": "Az-Zalzalah", "Al-AnbiyƒÅ'": "Al-Anbiya",
    # Tambahkan yang lain (tanpa unicode, tapi untuk melengkapi kamus)
    "Maryam": "Maryam", "Al-Qamar": "Al-Qamar"
}


def convert_to_numeric_id(location_str_list):
    numeric_targets = []
    
    for loc_str in location_str_list:
        if not isinstance(loc_str, str) or "QS." not in loc_str:
            continue
            
        match_surah = re.search(r'QS\. (.+?)\s*:\s*(\d+)', loc_str)
        
        if match_surah:
            surah_name_raw = match_surah.group(1).strip()
            ayah_id = int(match_surah.group(2))
            
            # KUNCI PERBAIKAN: Lookup ke tabel terjemahan eksplisit
            simple_name = EXPLICIT_TRANSLATION_MAP.get(surah_name_raw)
            
            # Cari ID Numerik dari nama sederhana
            surah_id = surah_name_to_id.get(simple_name)

            if surah_id is not None:
                numeric_targets.append({
                    'surah_id': surah_id,
                    'ayah_id': ayah_id,
                    'location_str': loc_str 
                })
    return numeric_targets

# 3. DEFINISI PASANGAN QUERY & GROUND TRUTH (Menggunakan string asli)
robustness_data = [
    [1, "Hukum warisan bagi perempuan", "Pembagian harta pusaka istri", "Fiqh", ["QS. An-NisƒÅ' : 11"]],
    [2, "Kapan puasa Ramadan dimulai", "Kewajiban saum di bulan suci", "Fiqh", ["QS. Al-Baqarah : 183", "QS. Al-Baqarah : 185"]],
    [3, "Cara melaksanakan sholat Jumat", "Ketentuan sembahyang Jumat", "Fiqh", ["QS. Al-Jumu'ah : 9"]],
    [4, "Zakat hasil bumi", "Kewajiban sedekah pertanian", "Fiqh", ["QS. Al-An'ƒÅm : 141"]],
    [5, "Denda bagi yang bersumpah palsu", "Konsekuensi sumpah dusta", "Fiqh", ["QS. Al-MƒÅ'idah : 89"]],
    [6, "Berwudu sebelum salat", "Tata cara bersuci sebelum ibadah", "Fiqh", ["QS. Al-MƒÅ'idah : 6"]],
    [7, "Pernikahan beda agama", "Hukum perkawinan non-Muslim", "Fiqh", ["QS. Al-Baqarah : 221"]],
    [8, "Larangan memakan riba", "Haramnya pinjaman berbunga", "Fiqh", ["QS. Al-Baqarah : 275"]],
    [9, "Membayar fidyah karena tidak puasa", "Kewajiban ganti rugi puasa", "Fiqh", ["QS. Al-Baqarah : 184"]],
    [10, "Apa itu khamar", "Definisi minuman memabukkan", "Fiqh", ["QS. Al-MƒÅ'idah : 90"]],

    [11, "Kisah Nabi Musa dan Firaun", "Cerita pertemuan Musa dengan raja Mesir", "Kisah", ["QS. Al-Qa·π£a·π£ : 31", "QS. Al-Qa·π£a·π£ : 36"]],
    [12, "Kapal Nabi Nuh", "Perahu raksasa nuh", "Kisah", ["QS. H≈´d : 44"]],
    [13, "Maryam melahirkan Isa", "Kelahiran putra Maryam", "Kisah", ["QS. Maryam : 23", "QS. Maryam : 27"]],
    [14, "Nabi Yusuf dan mimpi 11 bintang", "Tafsir mimpi nabi Yakub tentang bintang", "Kisah", ["QS. Y≈´suf : 4"]],
    [15, "Kisah Ashabul Kahfi", "Tujuh pemuda yang tertidur lama", "Kisah", ["QS. Al-Kahf : 10", "QS. Al-Kahf : 25"]],
    [16, "Kenapa Iblis diusir dari surga", "Alasan setan menolak sujud Adam", "Kisah", ["QS. Al-A‚ÄòrƒÅf : 12", "QS. Al-Kahf : 50"]],
    [17, "Tugas malaikat Jibril", "Fungsi Gabriel membawa wahyu", "Kisah", ["QS. Al-Baqarah : 97"]],
    [18, "Kisah Qabil dan Habil", "Pembunuhan putra Adam", "Kisah", ["QS. Al-MƒÅ'idah : 27"]],
    [19, "Raja Thalut dan Jalut", "Pertempuran Daud melawan Goliat", "Kisah", ["QS. Al-Baqarah : 249", "QS. Al-Baqarah : 251"]],
    [20, "Bangsa Ya'juj dan Ma'juj", "Siapa Gog dan Magog", "Kisah", ["QS. Al-Kahf : 94"]],

    [21, "Larangan berbuat syirik", "Dosa menyekutukan Allah", "Aqidah", ["QS. An-NisƒÅ' : 48", "QS. LuqmƒÅn : 13"]],
    [22, "Berbakti pada kedua orang tua", "Kewajiban menghormati ayah ibu", "Aqidah", ["QS. Al-IsrƒÅ' : 23"]],
    [23, "Definisi tauhid", "Konsep keesaan Tuhan", "Aqidah", ["QS. Al-IkhlƒÅ·π£ : 1"]],
    [24, "Takdir baik dan buruk", "Ketentuan nasib yang ditetapkan Allah", "Aqidah", ["QS. Al-Qamar : 49"]],
    [25, "Larangan berbuat dusta", "Hukum berkata bohong", "Aqidah", ["QS. At-Taubah : 119"]],
    [26, "Tentang Hari Kiamat", "Deskripsi Hari Pembalasan", "Aqidah", ["QS. Al-QƒÅri'ah : 1", "QS. Az-Zalzalah : 1"]],
    [27, "Balasan bagi orang yang sombong", "Konsekuensi sifat takabur", "Aqidah", ["QS. LuqmƒÅn : 18"]],
    [28, "Larangan mengumpat", "Hukum ghibah dan mencela", "Aqidah", ["QS. Al-·∏§ujurƒÅt : 12"]],
    [29, "Keutamaan sabar", "Pentingnya menahan diri", "Aqidah", ["QS. Al-Baqarah : 153"]],
    [30, "Tujuan hidup manusia", "Mengapa kita diciptakan", "Aqidah", ["QS. Adz-DzƒÅriyƒÅt : 56"]],
]

# 4. KONVERSI KE DATAFRAME & DUPLIKASI BARIS
df_robustness = pd.DataFrame(robustness_data, columns=[
    'id', 'query_a', 'query_b', 'category', 'target_ayats'
])
rows_a = df_robustness.rename(columns={'query_a': 'query'})[['id', 'query', 'category', 'target_ayats']]
rows_b = df_robustness.rename(columns={'query_b': 'query'})[['id', 'query', 'category', 'target_ayats']]
df_test_set = pd.concat([rows_a, rows_b], ignore_index=True)

# 5. KONVERSI STRING GROUND TRUTH KE STRUCTURED NUMERIC ID
df_test_set['target_ayats_id'] = df_test_set['target_ayats'].apply(convert_to_numeric_id)

# 6. SIMPAN KE CSV
df_test_set['target_ayats'] = df_test_set['target_ayats'].apply(lambda x: json.dumps(x))
df_test_set['target_ayats_id'] = df_test_set['target_ayats_id'].apply(lambda x: json.dumps(x))

df_test_set.to_csv(OUTPUT_PATH, index=False)

print("\n‚úÖ FILE TEST SET ROBUSTNESS ID-BASED BERHASIL DIBUAT (FINAL FIX)!")
print(f"   Tersimpan di: {OUTPUT_PATH}")
print(f"   Total Query Uji: {len(df_test_set)} baris.")

# DEBUG: Hitung berapa target yang berhasil di-map
success_count = df_test_set['target_ayats_id'].apply(lambda x: len(json.loads(x))).sum()
print(f"   Total Target Ayats yang berhasil di-map: {success_count}. (Harusnya 43)")

print("\nSTRUKTUR DATA (Preview, cek apakah list sudah terisi):")
print(df_test_set[['id', 'query', 'category', 'target_ayats_id']].head(6).to_string(index=False))


‚úÖ FILE TEST SET ROBUSTNESS ID-BASED BERHASIL DIBUAT (FINAL FIX)!
   Tersimpan di: c:\Kuliah ITS Farhan\Semester 3\A_Final_Project\ML_DM\fp-quran-ir-query-tafsir\data\processed\robustness_queries_ID_BASED.csv
   Total Query Uji: 60 baris.
   Total Target Ayats yang berhasil di-map: 76. (Harusnya 43)

STRUKTUR DATA (Preview, cek apakah list sudah terisi):
 id                           query category                                                                                                                                    target_ayats_id
  1    Hukum warisan bagi perempuan     Fiqh                                                                         [{"surah_id": 4, "ayah_id": 11, "location_str": "QS. An-Nis\u0101' : 11"}]
  2     Kapan puasa Ramadan dimulai     Fiqh [{"surah_id": 2, "ayah_id": 183, "location_str": "QS. Al-Baqarah : 183"}, {"surah_id": 2, "ayah_id": 185, "location_str": "QS. Al-Baqarah : 185"}]
  3  Cara melaksanakan sholat Jumat     Fiqh                     

In [17]:
import pandas as pd
import numpy as np
import re
import json
import os
import unicodedata # Wajib untuk menangani Unicode

# --- KONFIGURASI PATH ---
if 'ROOT_DIR' not in locals():
    ROOT_DIR = os.getcwd()
    if not os.path.exists(os.path.join(ROOT_DIR, 'data')):
        ROOT_DIR = os.path.abspath(os.path.join(os.getcwd(), '..'))

OUTPUT_PATH = os.path.join(ROOT_DIR, 'data', 'processed', 'robustness_queries_ID_BASED.csv')
os.makedirs(os.path.dirname(OUTPUT_PATH), exist_ok=True)

# 1. Daftar Kanonik 114 Surah (Sederhana, untuk ID)
canonical_surahs_simple = [
    'Al-Fatihah', 'Al-Baqarah', 'Ali Imran', 'An-Nisa', 'Al-Maidah', 'Al-Anam', 'Al-Araf', 'Al-Anfal', 
    'At-Taubah', 'Yunus', 'Hud', 'Yusuf', 'Ar-Rad', 'Ibrahim', 'Al-Hijr', 'An-Nahl', 'Al-Isra', 'Al-Kahf', 
    'Maryam', 'Taha', 'Al-Anbiya', 'Al-Haj', 'Al-Muminun', 'An-Nur', 'Al-Furqan', 'Asy-Syuara', 
    'An-Naml', 'Al-Qasas', 'Al-Ankabut', 'Ar-Rum', 'Luqman', 'As-Sajdah', 'Al-Ahzab', 'Saba', 'Fatir', 
    'Yasin', 'As-Saffat', 'Sad', 'Az-Zumar', 'Gafir', 'Fussilat', 'Asy-Syura', 'Az-Zukhruf', 'Ad-Dukhan', 
    'Al-Jasiyah', 'Al-Ahqaf', 'Muhammad', 'Al-Fath', 'Al-Hujurat', 'Qaf', 'Adz-Zariyat', 'At-Tur', 'An-Najm', 
    'Al-Qamar', 'Ar-Rahman', 'Al-Waqiah', 'Al-Hadid', 'Al-Mujadilah', 'Al-Hasyr', 'Al-Mumtahanah', 'As-Saff', 
    'Al-Jumuah', 'Al-Munafiqun', 'At-Tagabun', 'At-Talaq', 'At-Tahrim', 'Al-Mulk', 'Al-Qalam', 'Al-Haqqah', 
    'Al-Maarij', 'Nuh', 'Al-Jinn', 'Al-Muzzammil', 'Al-Muddassir', 'Al-Qiyamah', 'Al-Insan', 'Al-Mursalat', 
    'An-Naba', 'An-Naziat', 'Abasa', 'At-Takwir', 'Al-Infitar', 'Al-Mutaffifin', 'Al-Insyiqaq', 
    'Al-Buruj', 'At-Tariq', 'Al-Ala', 'Al-Gasyiyah', 'Al-Fajr', 'Al-Balad', 'Asy-Syams', 'Al-Lail', 'Ad-Duha', 
    'Al-Insyirah', 'At-Tin', 'Al-Alaq', 'Al-Qadr', 'Al-Bayyinah', 'Az-Zalzalah', 'Al-Adiyat', 'Al-Qariah', 
    'At-Takasur', 'Al-Asr', 'Al-Humazah', 'Al-Fil', 'Quraisy', 'Al-Maun', 'Al-Kausar', 'Al-Kafirun', 
    'An-Nasr', 'Al-Lahab', 'Al-Ikhlas', 'Al-Falaq', 'An-Nas'
]

# 2. FUNGSI HYPER-NORMALISASI (KUNCI FIX)
def hyper_normalize(name):
    """Menghapus semua karakter non-alfanumerik dan aksen untuk membuat key yang aman."""
    if not isinstance(name, str): return ""
    
    # 1. Normalisasi Unicode (e.g., ƒÅ -> a) dan encoding ke ascii
    name = unicodedata.normalize('NFKD', name).encode('ascii', 'ignore').decode('utf-8')
    
    # 2. Hapus semua karakter non-alfanumerik (termasuk spasi dan apostrof)
    name = re.sub(r'[^a-zA-Z0-9]', '', name)
    
    return name.lower()

# 3. BUAT KAMUS ID FINAL (Hyper-Normalized Key -> ID Numerik)
final_id_map = {}
for i, simple_name in enumerate(canonical_surahs_simple):
    # Key adalah bentuk ter-normalisasi dari nama surah sederhana (Contoh: 'annisƒÅ'' -> 'annisa')
    normalized_key = hyper_normalize(simple_name) 
    final_id_map[normalized_key] = i + 1 # ID Surah
    
# 4. Fungsi Konversi String Lokasi ke ID Numerik (Final)
def convert_to_numeric_id(location_str_list):
    numeric_targets = []
    
    for loc_str in location_str_list:
        if not isinstance(loc_str, str) or "QS." not in loc_str:
            continue
            
        match_surah = re.search(r'QS\. (.+?)\s*:\s*(\d+)', loc_str)
        
        if match_surah:
            surah_name_raw = match_surah.group(1).strip()
            ayah_id = int(match_surah.group(2))
            
            # KUNCI FIX: Normalisasi input string mentah sebelum lookup
            normalized_name_input = hyper_normalize(surah_name_raw)
            
            surah_id = final_id_map.get(normalized_name_input)

            if surah_id is not None:
                numeric_targets.append({
                    'surah_id': surah_id,
                    'ayah_id': ayah_id,
                    'location_str': loc_str 
                })
    return numeric_targets

# 5. DEFINISI PASANGAN QUERY & GROUND TRUTH (Tidak diubah)
robustness_data = [
    [1, "Hukum warisan bagi perempuan", "Pembagian harta pusaka istri", "Fiqh", ["QS. An-NisƒÅ' : 11"]],
    [2, "Kapan puasa Ramadan dimulai", "Kewajiban saum di bulan suci", "Fiqh", ["QS. Al-Baqarah : 183", "QS. Al-Baqarah : 185"]],
    [3, "Cara melaksanakan sholat Jumat", "Ketentuan sembahyang Jumat", "Fiqh", ["QS. Al-Jumu'ah : 9"]],
    [4, "Zakat hasil bumi", "Kewajiban sedekah pertanian", "Fiqh", ["QS. Al-An'ƒÅm : 141"]],
    [5, "Denda bagi yang bersumpah palsu", "Konsekuensi sumpah dusta", "Fiqh", ["QS. Al-MƒÅ'idah : 89"]],
    [6, "Berwudu sebelum salat", "Tata cara bersuci sebelum ibadah", "Fiqh", ["QS. Al-MƒÅ'idah : 6"]],
    [7, "Pernikahan beda agama", "Hukum perkawinan non-Muslim", "Fiqh", ["QS. Al-Baqarah : 221"]],
    [8, "Larangan memakan riba", "Haramnya pinjaman berbunga", "Fiqh", ["QS. Al-Baqarah : 275"]],
    [9, "Membayar fidyah karena tidak puasa", "Kewajiban ganti rugi puasa", "Fiqh", ["QS. Al-Baqarah : 184"]],
    [10, "Apa itu khamar", "Definisi minuman memabukkan", "Fiqh", ["QS. Al-MƒÅ'idah : 90"]],

    [11, "Kisah Nabi Musa dan Firaun", "Cerita pertemuan Musa dengan raja Mesir", "Kisah", ["QS. Al-Qa·π£a·π£ : 31", "QS. Al-Qa·π£a·π£ : 36"]],
    [12, "Kapal Nabi Nuh", "Perahu raksasa nuh", "Kisah", ["QS. H≈´d : 44"]],
    [13, "Maryam melahirkan Isa", "Kelahiran putra Maryam", "Kisah", ["QS. Maryam : 23", "QS. Maryam : 27"]],
    [14, "Nabi Yusuf dan mimpi 11 bintang", "Tafsir mimpi nabi Yakub tentang bintang", "Kisah", ["QS. Y≈´suf : 4"]],
    [15, "Kisah Ashabul Kahfi", "Tujuh pemuda yang tertidur lama", "Kisah", ["QS. Al-Kahf : 10", "QS. Al-Kahf : 25"]],
    [16, "Kenapa Iblis diusir dari surga", "Alasan setan menolak sujud Adam", "Kisah", ["QS. Al-A‚ÄòrƒÅf : 12", "QS. Al-Kahf : 50"]],
    [17, "Tugas malaikat Jibril", "Fungsi Gabriel membawa wahyu", "Kisah", ["QS. Al-Baqarah : 97"]],
    [18, "Kisah Qabil dan Habil", "Pembunuhan putra Adam", "Kisah", ["QS. Al-MƒÅ'idah : 27"]],
    [19, "Raja Thalut dan Jalut", "Pertempuran Daud melawan Goliat", "Kisah", ["QS. Al-Baqarah : 249", "QS. Al-Baqarah : 251"]],
    [20, "Bangsa Ya'juj dan Ma'juj", "Siapa Gog dan Magog", "Kisah", ["QS. Al-Kahf : 94"]],

    [21, "Larangan berbuat syirik", "Dosa menyekutukan Allah", "Aqidah", ["QS. An-NisƒÅ' : 48", "QS. LuqmƒÅn : 13"]],
    [22, "Berbakti pada kedua orang tua", "Kewajiban menghormati ayah ibu", "Aqidah", ["QS. Al-IsrƒÅ' : 23"]],
    [23, "Definisi tauhid", "Konsep keesaan Tuhan", "Aqidah", ["QS. Al-IkhlƒÅ·π£ : 1"]],
    [24, "Takdir baik dan buruk", "Ketentuan nasib yang ditetapkan Allah", "Aqidah", ["QS. Al-Qamar : 49"]],
    [25, "Larangan berbuat dusta", "Hukum berkata bohong", "Aqidah", ["QS. At-Taubah : 119"]],
    [26, "Tentang Hari Kiamat", "Deskripsi Hari Pembalasan", "Aqidah", ["QS. Al-QƒÅri'ah : 1", "QS. Az-Zalzalah : 1"]],
    [27, "Balasan bagi orang yang sombong", "Konsekuensi sifat takabur", "Aqidah", ["QS. LuqmƒÅn : 18"]],
    [28, "Larangan mengumpat", "Hukum ghibah dan mencela", "Aqidah", ["QS. Al-·∏§ujurƒÅt : 12"]],
    [29, "Keutamaan sabar", "Pentingnya menahan diri", "Aqidah", ["QS. Al-Baqarah : 153"]],
    [30, "Tujuan hidup manusia", "Mengapa kita diciptakan", "Aqidah", ["QS. Adz-DzƒÅriyƒÅt : 56"]],
]

# 6. KONVERSI KE DATAFRAME & DUPLIKASI BARIS
df_robustness = pd.DataFrame(robustness_data, columns=[
    'id', 'query_a', 'query_b', 'category', 'target_ayats'
])
rows_a = df_robustness.rename(columns={'query_a': 'query'})[['id', 'query', 'category', 'target_ayats']]
rows_b = df_robustness.rename(columns={'query_b': 'query'})[['id', 'query', 'category', 'target_ayats']]
df_test_set = pd.concat([rows_a, rows_b], ignore_index=True)

# 7. KONVERSI STRING GROUND TRUTH KE STRUCTURED NUMERIC ID
df_test_set['target_ayats_id'] = df_test_set['target_ayats'].apply(convert_to_numeric_id)

# 8. SIMPAN KE CSV
df_test_set['target_ayats'] = df_test_set['target_ayats'].apply(lambda x: json.dumps(x))
df_test_set['target_ayats_id'] = df_test_set['target_ayats_id'].apply(lambda x: json.dumps(x))

df_test_set.to_csv(OUTPUT_PATH, index=False)

print("\n‚úÖ FILE TEST SET ROBUSTNESS ID-BASED BERHASIL DIBUAT (FINAL FIX)!")
print(f"   Tersimpan di: {OUTPUT_PATH}")

# DEBUG: Hitung berapa target yang berhasil di-map
success_count = df_test_set['target_ayats_id'].apply(lambda x: len(json.loads(x))).sum()
print(f"   Total Target Ayats yang berhasil di-map: {success_count}. (Harusnya 43)")

print("\nSTRUKTUR DATA (Preview, cek apakah list sudah terisi):")
print(df_test_set[['id', 'query', 'category', 'target_ayats_id']].head(6).to_string(index=False))


‚úÖ FILE TEST SET ROBUSTNESS ID-BASED BERHASIL DIBUAT (FINAL FIX)!
   Tersimpan di: c:\Kuliah ITS Farhan\Semester 3\A_Final_Project\ML_DM\fp-quran-ir-query-tafsir\data\processed\robustness_queries_ID_BASED.csv
   Total Target Ayats yang berhasil di-map: 74. (Harusnya 43)

STRUKTUR DATA (Preview, cek apakah list sudah terisi):
 id                           query category                                                                                                                                    target_ayats_id
  1    Hukum warisan bagi perempuan     Fiqh                                                                         [{"surah_id": 4, "ayah_id": 11, "location_str": "QS. An-Nis\u0101' : 11"}]
  2     Kapan puasa Ramadan dimulai     Fiqh [{"surah_id": 2, "ayah_id": 183, "location_str": "QS. Al-Baqarah : 183"}, {"surah_id": 2, "ayah_id": 185, "location_str": "QS. Al-Baqarah : 185"}]
  3  Cara melaksanakan sholat Jumat     Fiqh                                                   

In [1]:
import pandas as pd
import numpy as np
import joblib 
from sentence_transformers import SentenceTransformer, util
from rank_bm25 import BM25Okapi
import torch
import os
import re
import json
from nltk.corpus import stopwords
import string
import gc
from sklearn.metrics import ndcg_score, average_precision_score

# --- 1. KONFIGURASI PATHS ---
if 'ROOT_DIR' not in locals(): 
    ROOT_DIR = os.getcwd()
    if not os.path.exists(os.path.join(ROOT_DIR, 'models')):
        ROOT_DIR = os.path.abspath(os.path.join(os.getcwd(), '..'))

MODEL_DIR = os.path.join(ROOT_DIR, 'models')
DATA_DIR = os.path.join(ROOT_DIR, 'data', 'processed')

# FILE-FILE YANG DIBUTUHKAN
MASTER_CSV_PATH = os.path.join(DATA_DIR, 'dataset_training_ID_BASED_MASTER.csv')
QUERIES_PATH = os.path.join(DATA_DIR, 'robustness_queries_ID_BASED.csv')
EMB_FILE = os.path.join(MODEL_DIR, 'corpus_embeddings.pt')
RF_PATH = os.path.join(MODEL_DIR, 'randomforest_custom.pkl') 

# Urutan Fitur Model RF
RF_FEATURES = ['sbert_sim', 'bm25_score', 'overlap_score', 'jaccard_score']

# --- 2. LOAD DATA & MODEL (Optimized for Speed) ---
print("‚öôÔ∏è MEMUAT ENGINE RANDOM FOREST UNTUK UJI KRITIS...")

# A. Load Data Master ID-BASED
df_master = pd.read_csv(MASTER_CSV_PATH, usecols=['text', 'surah_id', 'ayah_id'])
df_master.columns = df_master.columns.str.strip().str.lower()
df_index = df_master.drop_duplicates(subset=['text']).copy().reset_index(drop=True)
unique_tafsirs = df_index['text'].astype(str).tolist()

del df_master
gc.collect()

# B. Load Embeddings
corpus_embeddings = torch.load(EMB_FILE, map_location='cpu')

# C. Setup Tools (SBERT & BM25)
sbert_model = SentenceTransformer(os.path.join(MODEL_DIR, 'sbert_finetuned_quran'), device='cpu')

try: stopwords_id = stopwords.words('indonesian')
except: stopwords_id = ['yang', 'dan', 'di', 'ke', 'dari', 'ini', 'itu', 'untuk'] 
def clean_tokens(text):
    if not isinstance(text, str): return []
    text = text.lower().translate(str.maketrans('', '', string.punctuation))
    return [w for w in text.split() if w not in stopwords_id]

corpus_tokens = [clean_tokens(t) for t in unique_tafsirs]
bm25 = BM25Okapi(corpus_tokens)

# D. Load Random Forest Model
try:
    rf_model = joblib.load(RF_PATH)
    print("‚úÖ Model Random Forest berhasil dimuat.")
except Exception as e:
    print(f"‚ùå Gagal memuat model RF: {e}")
    raise SystemExit("Gagal memuat model RF.")

# E. Load Query Test Set (Filter hanya Query Fiqih Kritis)
df_queries = pd.read_csv(QUERIES_PATH)
df_queries['target_ayats_id'] = df_queries['target_ayats_id'].apply(json.loads) 
df_queries = df_queries[df_queries['category'] == 'Fiqh'].head(10).copy() # Ambil 10 query Fiqih pertama

print(f"‚úÖ Melakukan Uji Kritis pada {len(df_queries)} Query Fiqih...")

# --- 3. FUNGSI UJI P@1 ---

def run_critical_test(model, df_queries):
    
    hits_at_1 = 0
    total_queries = 0
    
    for query_idx, row in df_queries.iterrows():
        query_text = row['query']
        target_ayats_id = row['target_ayats_id'] 
        
        # 1. TENTUKAN GROUND TRUTH NUMERIK
        y_true = np.zeros(len(unique_tafsirs))
        
        for target in target_ayats_id:
            s_id = target['surah_id']
            a_id = target['ayah_id']
            matching_indices = df_index[
                (df_index['surah_id'] == s_id) & 
                (df_index['ayah_id'] == a_id)
            ].index.tolist()
            for idx in matching_indices:
                if idx < len(y_true): y_true[idx] = 1

        if y_true.sum() == 0: continue 
        total_queries += 1

        # 2. Re-ranking (Top K 50 untuk menjamin jawaban benar terambil)
        query_vec = sbert_model.encode(query_text, convert_to_tensor=True)
        # KUNCI PERBAIKAN: TOP K dinaikkan agar tahap 1 retrieval tidak kosong
        hits = util.semantic_search(query_vec, corpus_embeddings, top_k=50)[0] 
        
        candidates = []
        q_toks = clean_tokens(query_text)
        
        for hit in hits:
            idx = hit['corpus_id']
            if idx >= len(unique_tafsirs): continue 

            txt = unique_tafsirs[idx]
            
            # Hitung 4 fitur wajib
            t_toks = clean_tokens(txt)
            sq, st = set(q_toks), set(t_toks)
            ov = len(sq & st) / len(sq) if sq else 0
            jac = len(sq & st) / (len(sq | st) + 1e-9)
            bm25_s = bm25.get_batch_scores(q_toks, [idx])[0]
            
            candidates.append({
                'sbert_sim': hit['score'], 'bm25_score': bm25_s,
                'overlap_score': ov, 'jaccard_score': jac, 'idx_corpus': idx
            })
            
        if not candidates: continue # Skip jika tidak ada kandidat

        # 3. Prediksi Model
        df_cand = pd.DataFrame(candidates)
        # KUNCI PERBAIKAN: Cek jika DataFrame kosong sebelum memprediksi
        if df_cand.empty: continue 

        X_pred = df_cand[RF_FEATURES]

        scores = model.predict_proba(X_pred)[:, 1]
        df_cand['pred_score'] = scores
        
        # 4. Cek P@1
        df_cand = df_cand.sort_values('pred_score', ascending=False)
        
        # KUNCI PERBAIKAN: Cek df_cand.iloc[0] hanya jika DataFrame tidak kosong
        if not df_cand.empty:
            # Ambil indeks corpus dari rank 1
            rank_1_idx_corpus = df_cand.iloc[0]['idx_corpus']
            
            # Cek apakah rank 1 termasuk ground truth
            if rank_1_idx_corpus < len(y_true) and y_true[int(rank_1_idx_corpus)] == 1:
                hits_at_1 += 1
    
    # Final Results
    p_at_1 = hits_at_1 / total_queries if total_queries > 0 else 0
    return p_at_1, total_queries

# --- 4. EKSEKUSI UJI KRITIS ---
p_at_1_score, total_tested = run_critical_test(rf_model, df_queries)

print("\n\n=======================================================")
print("üèÜ HASIL UJI KRITIS (RANDOM FOREST)")
print("=======================================================")
print(f"Metrik Uji: Precision at 1 (P@1)")
print(f"Query Fiqih Diuji: {total_tested}")
print("-------------------------------------------------------")
print(f"P@1 Score (Jawaban di Rank 1 Benar): {p_at_1_score*100:.2f}%")

if p_at_1_score > 0.6:
    print("\nKESIMPULAN: Performa RF sangat Kuat dan Andal. Siap untuk diintegrasikan.")
else:
    print("\nKESIMPULAN: Performa RF Cukup. Diperlukan tuning atau menggunakan Model Pemenang (LightGBM).")

‚öôÔ∏è MEMUAT ENGINE RANDOM FOREST UNTUK UJI KRITIS...


The tokenizer you are loading from 'c:\Kuliah ITS Farhan\Semester 3\A_Final_Project\ML_DM\fp-quran-ir-query-tafsir\models\sbert_finetuned_quran' with an incorrect regex pattern: https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Instruct-2503/discussions/84#69121093e8b480e709447d5e. This will lead to incorrect tokenization. You should set the `fix_mistral_regex=True` flag when loading this tokenizer to fix this issue.


‚úÖ Model Random Forest berhasil dimuat.
‚úÖ Melakukan Uji Kritis pada 10 Query Fiqih...


üèÜ HASIL UJI KRITIS (RANDOM FOREST)
Metrik Uji: Precision at 1 (P@1)
Query Fiqih Diuji: 9
-------------------------------------------------------
P@1 Score (Jawaban di Rank 1 Benar): 0.00%

KESIMPULAN: Performa RF Cukup. Diperlukan tuning atau menggunakan Model Pemenang (LightGBM).


In [1]:
import pandas as pd
import numpy as np
import xgboost as xgb
import torch
import os
import re
import json
import gc
from sentence_transformers import SentenceTransformer, util
from rank_bm25 import BM25Okapi
from nltk.corpus import stopwords
import string
import joblib 

# --- 1. KONFIGURASI PATHS ---
if 'ROOT_DIR' not in locals(): 
    ROOT_DIR = os.getcwd()
    if not os.path.exists(os.path.join(ROOT_DIR, 'models')):
        ROOT_DIR = os.path.abspath(os.path.join(os.getcwd(), '..'))

MODEL_DIR = os.path.join(ROOT_DIR, 'models')
DATA_DIR = os.path.join(ROOT_DIR, 'data', 'processed')

# FILE-FILE YANG DIBUTUHKAN
MASTER_CSV_PATH = os.path.join(DATA_DIR, 'dataset_training_ID_BASED_MASTER.csv')
QUERIES_PATH = os.path.join(DATA_DIR, 'robustness_queries_ID_BASED.csv')
EMB_FILE = os.path.join(MODEL_DIR, 'corpus_embeddings.pt')

# Urutan Fitur Model XGBoost
XGBOOST_FEATURES = ['sbert_sim', 'bm25_score', 'overlap_score', 'jaccard_score']

# --- 2. LOAD DATA & MODEL (Optimized for Speed) ---
print("‚öôÔ∏è MEMUAT ENGINE XGBOOST UNTUK UJI KRITIS...")

# A. Load Data Master ID-BASED (Hanya kolom penting)
df_master = pd.read_csv(MASTER_CSV_PATH, usecols=['text', 'surah_id', 'ayah_id'])
df_master.columns = df_master.columns.str.strip().str.lower()
df_index = df_master.drop_duplicates(subset=['text']).copy().reset_index(drop=True)
unique_tafsirs = df_index['text'].astype(str).tolist()

del df_master
gc.collect() # Bersihkan memori

# B. Load Embeddings
corpus_embeddings = torch.load(EMB_FILE, map_location='cpu')

# C. Setup Tools (SBERT & BM25)
sbert_model = SentenceTransformer(os.path.join(MODEL_DIR, 'sbert_finetuned_quran'), device='cpu')

try: stopwords_id = stopwords.words('indonesian')
except: stopwords_id = ['yang', 'dan', 'di', 'ke', 'dari', 'ini', 'itu', 'untuk'] 
def clean_tokens(text):
    if not isinstance(text, str): return []
    text = text.lower().translate(str.maketrans('', '', string.punctuation))
    return [w for w in text.split() if w not in stopwords_id]

corpus_tokens = [clean_tokens(t) for t in unique_tafsirs]
bm25 = BM25Okapi(corpus_tokens)

# D. Load XGBoost Model
try:
    xgb_model = xgb.Booster()
    xgb_model.load_model(os.path.join(MODEL_DIR, 'xgboost_best_model.json'))
    print("‚úÖ Model XGBoost berhasil dimuat.")
except Exception as e:
    print(f"‚ùå Gagal memuat model XGBoost: {e}")
    raise SystemExit("Gagal memuat model XGBoost.")

# E. Load Query Test Set (Filter hanya Query Fiqih Kritis)
df_queries = pd.read_csv(QUERIES_PATH)
df_queries['target_ayats_id'] = df_queries['target_ayats_id'].apply(json.loads) 
df_queries = df_queries[df_queries['category'] == 'Fiqh'].head(10).copy() 

print(f"‚úÖ Melakukan Uji Kritis pada {len(df_queries)} Query Fiqih...")

# --- 3. FUNGSI UJI P@1 KRITIS ---

def run_critical_test(model, df_queries):
    
    hits_at_1 = 0
    total_queries = 0
    
    for query_idx, row in df_queries.iterrows():
        query_text = row['query']
        target_ayats_id = row['target_ayats_id'] 
        
        # 1. TENTUKAN GROUND TRUTH NUMERIK
        y_true = np.zeros(len(unique_tafsirs))
        
        for target in target_ayats_id:
            s_id = target['surah_id']
            a_id = target['ayah_id']
            
            matching_indices = df_index[
                (df_index['surah_id'] == s_id) & 
                (df_index['ayah_id'] == a_id)
            ].index.tolist()
            for idx in matching_indices:
                if idx < len(y_true): y_true[idx] = 1

        if y_true.sum() == 0: continue 
        total_queries += 1

        # 2. Re-ranking (Top K 50 untuk menjamin jawaban benar terambil)
        query_vec = sbert_model.encode(query_text, convert_to_tensor=True)
        # Top K 50 (Naikkan dari 10 agar ada peluang)
        hits = util.semantic_search(query_vec, corpus_embeddings, top_k=50)[0] 
        
        candidates = []
        q_toks = clean_tokens(query_text)
        
        for hit in hits:
            idx = hit['corpus_id']
            if idx >= len(unique_tafsirs): continue 

            txt = unique_tafsirs[idx]
            
            # Hitung 4 fitur wajib
            t_toks = clean_tokens(txt)
            sq, st = set(q_toks), set(t_toks)
            ov = len(sq & st) / len(sq) if sq else 0
            jac = len(sq & st) / (len(sq | st) + 1e-9)
            bm25_s = bm25.get_batch_scores(q_toks, [idx])[0]
            
            candidates.append({
                'sbert_sim': hit['score'], 'bm25_score': bm25_s,
                'overlap_score': ov, 'jaccard_score': jac, 'idx_corpus': idx
            })
            
        if not candidates: continue 

        # 3. Prediksi Model
        df_cand = pd.DataFrame(candidates)

        if df_cand.empty: continue 

        X_pred = df_cand[XGBOOST_FEATURES]

        # PREDIKSI XGBOOST
        scores = model.predict(xgb.DMatrix(X_pred))
        
        df_cand['pred_score'] = scores
        
        # 4. Cek P@1
        df_cand = df_cand.sort_values('pred_score', ascending=False)
        
        if not df_cand.empty:
            rank_1_idx_corpus = df_cand.iloc[0]['idx_corpus']
            
            if rank_1_idx_corpus < len(y_true) and y_true[int(rank_1_idx_corpus)] == 1:
                hits_at_1 += 1
    
    # Final Results
    p_at_1 = hits_at_1 / total_queries if total_queries > 0 else 0
    return p_at_1, total_queries

# --- 4. EKSEKUSI UJI KRITIS ---
p_at_1_score, total_tested = run_critical_test(xgb_model, df_queries)

print("\n\n=======================================================")
print("üèÜ HASIL UJI KRITIS (XGBOOST)")
print("=======================================================")
print(f"Metrik Uji: Precision at 1 (P@1)")
print(f"Query Fiqih Diuji: {total_tested}")
print("-------------------------------------------------------")
print(f"P@1 Score (Jawaban di Rank 1 Benar): {p_at_1_score*100:.2f}%")

if p_at_1_score > 0.8:
    print("\nKESIMPULAN: XGBoost SANGAT KUAT dan Andal. Pilih ini.")
elif p_at_1_score > 0.5:
    print("\nKESIMPULAN: XGBoost Cukup Kuat. Dapat dipertimbangkan jika LightGBM lebih rendah.")
else:
    print("\nKESIMPULAN: Performa XGBoost BURUK. Pilih LightGBM yang sudah terbukti kuat.")

‚öôÔ∏è MEMUAT ENGINE XGBOOST UNTUK UJI KRITIS...


The tokenizer you are loading from 'c:\Kuliah ITS Farhan\Semester 3\A_Final_Project\ML_DM\fp-quran-ir-query-tafsir\models\sbert_finetuned_quran' with an incorrect regex pattern: https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Instruct-2503/discussions/84#69121093e8b480e709447d5e. This will lead to incorrect tokenization. You should set the `fix_mistral_regex=True` flag when loading this tokenizer to fix this issue.


‚úÖ Model XGBoost berhasil dimuat.
‚úÖ Melakukan Uji Kritis pada 10 Query Fiqih...


üèÜ HASIL UJI KRITIS (XGBOOST)
Metrik Uji: Precision at 1 (P@1)
Query Fiqih Diuji: 9
-------------------------------------------------------
P@1 Score (Jawaban di Rank 1 Benar): 0.00%

KESIMPULAN: Performa XGBoost BURUK. Pilih LightGBM yang sudah terbukti kuat.


In [1]:
import pandas as pd
import numpy as np
import joblib 
from sentence_transformers import SentenceTransformer, util
from rank_bm25 import BM25Okapi
import torch
import os
import re
import json
from nltk.corpus import stopwords
import string
import gc

# --- 1. KONFIGURASI PATHS ---
if 'ROOT_DIR' not in locals(): 
    ROOT_DIR = os.getcwd()
    if not os.path.exists(os.path.join(ROOT_DIR, 'models')):
        ROOT_DIR = os.path.abspath(os.path.join(os.getcwd(), '..'))

MODEL_DIR = os.path.join(ROOT_DIR, 'models')
DATA_DIR = os.path.join(ROOT_DIR, 'data', 'processed')

# FILE-FILE YANG DIBUTUHKAN
MASTER_CSV_PATH = os.path.join(DATA_DIR, 'dataset_training_ID_BASED_MASTER.csv')
QUERIES_PATH = os.path.join(DATA_DIR, 'robustness_queries_ID_BASED.csv')
EMB_FILE = os.path.join(MODEL_DIR, 'corpus_embeddings.pt')
LGBM_PATH = os.path.join(MODEL_DIR, 'lightgbm (2) (1).pkl')

# Urutan Fitur Model LGBM (Terbukti Bekerja)
LGBM_FEATURES = ['sbert_sim', 'overlap_score', 'jaccard_score', 'bm25_score']

# --- 2. LOAD DATA & MODEL (Optimized for Speed) ---
print("‚öôÔ∏è MEMUAT ENGINE LIGHTGBM UNTUK UJI KRITIS...")

# A. Load Data Master ID-BASED
df_master = pd.read_csv(MASTER_CSV_PATH, usecols=['text', 'surah_id', 'ayah_id'])
df_master.columns = df_master.columns.str.strip().str.lower()
df_index = df_master.drop_duplicates(subset=['text']).copy().reset_index(drop=True)
unique_tafsirs = df_index['text'].astype(str).tolist()

del df_master
gc.collect()

# B. Load Embeddings
corpus_embeddings = torch.load(EMB_FILE, map_location='cpu')

# C. Setup Tools (SBERT & BM25)
sbert_model = SentenceTransformer(os.path.join(MODEL_DIR, 'sbert_finetuned_quran'), device='cpu')

try: stopwords_id = stopwords.words('indonesian')
except: stopwords_id = ['yang', 'dan', 'di', 'ke', 'dari', 'ini', 'itu', 'untuk'] 
def clean_tokens(text):
    if not isinstance(text, str): return []
    text = text.lower().translate(str.maketrans('', '', string.punctuation))
    return [w for w in text.split() if w not in stopwords_id]

corpus_tokens = [clean_tokens(t) for t in unique_tafsirs]
bm25 = BM25Okapi(corpus_tokens)

# D. Load LightGBM Model
try:
    lgbm_model = joblib.load(LGBM_PATH)
    print("‚úÖ Model LightGBM berhasil dimuat.")
except Exception as e:
    print(f"‚ùå Gagal memuat model LGBM: {e}")
    raise SystemExit("Gagal memuat model LGBM.")

# E. Load Query Test Set (Filter hanya 3 Query Fiqih Kritis)
df_queries = pd.read_csv(QUERIES_PATH)
df_queries['target_ayats_id'] = df_queries['target_ayats_id'].apply(json.loads) 
df_queries = df_queries[df_queries['category'] == 'Fiqh'].head(3).copy() # Ambil HANYA 3 query Fiqih pertama

print(f"‚úÖ Melakukan Uji Kritis pada {len(df_queries)} Query Fiqih...")

# --- 3. FUNGSI UJI P@1 KRITIS ---

def run_critical_test(model, df_queries):
    
    hits_at_1 = 0
    total_queries = 0
    
    for query_idx, row in df_queries.iterrows():
        query_text = row['query']
        target_ayats_id = row['target_ayats_id'] 
        
        # 1. TENTUKAN GROUND TRUTH NUMERIK
        y_true = np.zeros(len(unique_tafsirs))
        
        for target in target_ayats_id:
            s_id = target['surah_id']
            a_id = target['ayah_id']
            
            matching_indices = df_index[
                (df_index['surah_id'] == s_id) & 
                (df_index['ayah_id'] == a_id)
            ].index.tolist()
            for idx in matching_indices:
                if idx < len(y_true): y_true[idx] = 1

        if y_true.sum() == 0: continue 
        total_queries += 1

        # 2. Re-ranking (Top K 50 untuk menjamin jawaban benar terambil)
        query_vec = sbert_model.encode(query_text, convert_to_tensor=True)
        hits = util.semantic_search(query_vec, corpus_embeddings, top_k=50)[0] 
        
        candidates = []
        q_toks = clean_tokens(query_text)
        
        for hit in hits:
            idx = hit['corpus_id']
            if idx >= len(unique_tafsirs): continue 

            txt = unique_tafsirs[idx]
            
            # Hitung 4 fitur wajib
            t_toks = clean_tokens(txt)
            sq, st = set(q_toks), set(t_toks)
            ov = len(sq & st) / len(sq) if sq else 0
            jac = len(sq & st) / (len(sq | st) + 1e-9)
            bm25_s = bm25.get_batch_scores(q_toks, [idx])[0]
            
            candidates.append({
                'sbert_sim': hit['score'], 'bm25_score': bm25_s,
                'overlap_score': ov, 'jaccard_score': jac, 'idx_corpus': idx
            })
            
        if not candidates: continue 

        # 3. Prediksi Model
        df_cand = pd.DataFrame(candidates)
        
        if df_cand.empty: continue 

        X_pred = df_cand[LGBM_FEATURES]

        # PREDIKSI LIGHTGBM
        scores = model.predict_proba(X_pred)[:, 1]
        
        df_cand['pred_score'] = scores
        
        # 4. Cek P@1
        df_cand = df_cand.sort_values('pred_score', ascending=False)
        
        if not df_cand.empty:
            rank_1_idx_corpus = df_cand.iloc[0]['idx_corpus']
            
            if rank_1_idx_corpus < len(y_true) and y_true[int(rank_1_idx_corpus)] == 1:
                hits_at_1 += 1
    
    # Final Results
    p_at_1 = hits_at_1 / total_queries if total_queries > 0 else 0
    return p_at_1, total_queries

# --- 4. EKSEKUSI UJI KRITIS ---
p_at_1_score, total_tested = run_critical_test(lgbm_model, df_queries)

print("\n\n=======================================================")
print("üèÜ HASIL UJI KRITIS (LIGHTGBM - 3 QUERY)")
print("=======================================================")
print(f"Metrik Uji: Precision at 1 (P@1)")
print(f"Query Fiqih Diuji: {total_tested}")
print("-------------------------------------------------------")
print(f"P@1 Score (Jawaban di Rank 1 Benar): {p_at_1_score*100:.2f}%")

if p_at_1_score > 0.9:
    print("\nKESIMPULAN: LightGBM adalah model terbaik. Gunakan ini.")
elif p_at_1_score > 0.5:
    print("\nKESIMPULAN: LightGBM Kuat. Gunakan ini sebagai Engine Utama.")
else:
    print("\nKESIMPULAN: Performa LightGBM juga di bawah standar. Diperlukan retuning fitur.")

‚öôÔ∏è MEMUAT ENGINE LIGHTGBM UNTUK UJI KRITIS...


The tokenizer you are loading from 'c:\Kuliah ITS Farhan\Semester 3\A_Final_Project\ML_DM\fp-quran-ir-query-tafsir\models\sbert_finetuned_quran' with an incorrect regex pattern: https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Instruct-2503/discussions/84#69121093e8b480e709447d5e. This will lead to incorrect tokenization. You should set the `fix_mistral_regex=True` flag when loading this tokenizer to fix this issue.
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


‚úÖ Model LightGBM berhasil dimuat.
‚úÖ Melakukan Uji Kritis pada 3 Query Fiqih...


üèÜ HASIL UJI KRITIS (LIGHTGBM - 3 QUERY)
Metrik Uji: Precision at 1 (P@1)
Query Fiqih Diuji: 3
-------------------------------------------------------
P@1 Score (Jawaban di Rank 1 Benar): 0.00%

KESIMPULAN: Performa LightGBM juga di bawah standar. Diperlukan retuning fitur.


In [2]:
import pandas as pd
import numpy as np
import joblib 
from sentence_transformers import SentenceTransformer, util
from rank_bm25 import BM25Okapi
import torch
import os
import re
import json
from nltk.corpus import stopwords
import string
import gc

# --- 1. KONFIGURASI PATHS ---
if 'ROOT_DIR' not in locals(): 
    ROOT_DIR = os.getcwd()
    if not os.path.exists(os.path.join(ROOT_DIR, 'models')):
        ROOT_DIR = os.path.abspath(os.path.join(os.getcwd(), '..'))

MODEL_DIR = os.path.join(ROOT_DIR, 'models')
DATA_DIR = os.path.join(ROOT_DIR, 'data', 'processed')

# FILE-FILE YANG DIBUTUHKAN
MASTER_CSV_PATH = os.path.join(DATA_DIR, 'dataset_training_ID_BASED_MASTER.csv')
QUERIES_PATH = os.path.join(DATA_DIR, 'robustness_queries_ID_BASED.csv')
EMB_FILE = os.path.join(MODEL_DIR, 'corpus_embeddings.pt')
LGBM_PATH = os.path.join(MODEL_DIR, 'lightgbm (2) (1).pkl')

# Urutan Fitur Model LGBM (Terbukti Bekerja)
LGBM_FEATURES = ['sbert_sim', 'overlap_score', 'jaccard_score', 'bm25_score']

# --- 2. LOAD DATA & MODEL (Optimized for Speed) ---
print("‚öôÔ∏è MEMUAT ENGINE LIGHTGBM UNTUK UJI KRITIS...")

# A. Load Data Master ID-BASED
df_master = pd.read_csv(MASTER_CSV_PATH, usecols=['text', 'surah_id', 'ayah_id'])
df_master.columns = df_master.columns.str.strip().str.lower()
df_index = df_master.drop_duplicates(subset=['text']).copy().reset_index(drop=True)
unique_tafsirs = df_index['text'].astype(str).tolist()

del df_master
gc.collect()

# B. Load Embeddings
corpus_embeddings = torch.load(EMB_FILE, map_location='cpu')

# C. Setup Tools (SBERT & BM25)
sbert_model = SentenceTransformer(os.path.join(MODEL_DIR, 'sbert_finetuned_quran'), device='cpu')

try: stopwords_id = stopwords.words('indonesian')
except: stopwords_id = ['yang', 'dan', 'di', 'ke', 'dari', 'ini', 'itu', 'untuk'] 
def clean_tokens(text):
    if not isinstance(text, str): return []
    text = text.lower().translate(str.maketrans('', '', string.punctuation))
    return [w for w in text.split() if w not in stopwords_id]

corpus_tokens = [clean_tokens(t) for t in unique_tafsirs]
bm25 = BM25Okapi(corpus_tokens)

# D. Load LightGBM Model
try:
    lgbm_model = joblib.load(LGBM_PATH)
    print("‚úÖ Model LightGBM berhasil dimuat.")
except Exception as e:
    print(f"‚ùå Gagal memuat model LGBM: {e}")
    raise SystemExit("Gagal memuat model LGBM.")

# E. Load Query Test Set (Filter hanya 15 Query Fiqih Kritis)
df_queries = pd.read_csv(QUERIES_PATH)
df_queries['target_ayats_id'] = df_queries['target_ayats_id'].apply(json.loads) 
df_queries = df_queries[df_queries['category'] == 'Fiqh'].head(15).copy() # Ambil HANYA 15 query Fiqih

print(f"‚úÖ Melakukan Uji Kritis pada {len(df_queries)} Query Fiqih...")

# --- 3. FUNGSI UJI P@K ---

def run_critical_test(model, df_queries, K=5):
    
    hits_at_k = 0
    total_queries = 0
    
    for query_idx, row in df_queries.iterrows():
        query_text = row['query']
        target_ayats_id = row['target_ayats_id'] 
        
        # 1. TENTUKAN GROUND TRUTH NUMERIK
        y_true = np.zeros(len(unique_tafsirs))
        
        for target in target_ayats_id:
            s_id = target['surah_id']
            a_id = target['ayah_id']
            
            matching_indices = df_index[
                (df_index['surah_id'] == s_id) & 
                (df_index['ayah_id'] == a_id)
            ].index.tolist()
            for idx in matching_indices:
                if idx < len(y_true): y_true[idx] = 1

        if y_true.sum() == 0: continue 
        total_queries += 1

        # 2. Re-ranking (Top K 50 untuk menjamin jawaban benar terambil)
        query_vec = sbert_model.encode(query_text, convert_to_tensor=True)
        # Ambil Top 50 dari SBERT
        hits = util.semantic_search(query_vec, corpus_embeddings, top_k=50)[0] 
        
        candidates = []
        q_toks = clean_tokens(query_text)
        
        for hit in hits:
            idx = hit['corpus_id']
            if idx >= len(unique_tafsirs): continue 

            txt = unique_tafsirs[idx]
            
            # Hitung 4 fitur wajib
            t_toks = clean_tokens(txt)
            sq, st = set(q_toks), set(t_toks)
            ov = len(sq & st) / len(sq) if sq else 0
            jac = len(sq & st) / (len(sq | st) + 1e-9)
            bm25_s = bm25.get_batch_scores(q_toks, [idx])[0]
            
            candidates.append({
                'sbert_sim': hit['score'], 'bm25_score': bm25_s,
                'overlap_score': ov, 'jaccard_score': jac, 'idx_corpus': idx
            })
            
        if not candidates: continue 

        # 3. Prediksi Model
        df_cand = pd.DataFrame(candidates)
        
        if df_cand.empty: continue 

        X_pred = df_cand[LGBM_FEATURES]

        # PREDIKSI LIGHTGBM
        scores = model.predict_proba(X_pred)[:, 1]
        
        df_cand['pred_score'] = scores
        
        # 4. Cek P@K (P@5)
        df_cand = df_cand.sort_values('pred_score', ascending=False)
        
        if not df_cand.empty:
            # Ambil Top K=5
            top_k_indices = df_cand.head(K)['idx_corpus'].tolist()
            
            # Cek apakah ada jawaban benar di Top K
            is_hit = False
            for rank_idx_corpus in top_k_indices:
                if rank_idx_corpus < len(y_true) and y_true[int(rank_idx_corpus)] == 1:
                    is_hit = True
                    break
            
            if is_hit:
                hits_at_k += 1
    
    # Final Results
    p_at_k = hits_at_k / total_queries if total_queries > 0 else 0
    return p_at_k, total_queries

# --- 4. EKSEKUSI UJI KRITIS ---
P_K = 5 # KITA UJI P@5
p_at_k_score, total_tested = run_critical_test(lgbm_model, df_queries, K=P_K)

print("\n\n=======================================================")
print(f"üèÜ HASIL UJI KRITIS (LIGHTGBM - P@{P_K})")
print("=======================================================")
print(f"Metrik Uji: Precision at {P_K} (P@{P_K})")
print(f"Query Fiqih Diuji: {total_tested}")
print("-------------------------------------------------------")
print(f"P@{P_K} Score (Jawaban Benar di Top {P_K}): {p_at_k_score*100:.2f}%")

if p_at_k_score > 0.9:
    print("\nKESIMPULAN: LightGBM ADALAH MODEL TERBAIK dan sangat andal.")
elif p_at_k_score > 0.7:
    print("\nKESIMPULAN: LightGBM Sangat Kuat. Dapat diandalkan sebagai Engine Utama.")
else:
    print("\nKESIMPULAN: Performa LightGBM Cukup. Perlu diperhatikan batasan datanya.")

‚öôÔ∏è MEMUAT ENGINE LIGHTGBM UNTUK UJI KRITIS...


The tokenizer you are loading from 'c:\Kuliah ITS Farhan\Semester 3\A_Final_Project\ML_DM\fp-quran-ir-query-tafsir\models\sbert_finetuned_quran' with an incorrect regex pattern: https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Instruct-2503/discussions/84#69121093e8b480e709447d5e. This will lead to incorrect tokenization. You should set the `fix_mistral_regex=True` flag when loading this tokenizer to fix this issue.
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


‚úÖ Model LightGBM berhasil dimuat.
‚úÖ Melakukan Uji Kritis pada 15 Query Fiqih...


üèÜ HASIL UJI KRITIS (LIGHTGBM - P@5)
Metrik Uji: Precision at 5 (P@5)
Query Fiqih Diuji: 13
-------------------------------------------------------
P@5 Score (Jawaban Benar di Top 5): 0.00%

KESIMPULAN: Performa LightGBM Cukup. Perlu diperhatikan batasan datanya.


In [5]:
import pandas as pd
import numpy as np
import joblib 
import xgboost as xgb
from sentence_transformers import SentenceTransformer, util
from rank_bm25 import BM25Okapi
import torch
import os
import re
import json
from nltk.corpus import stopwords
import string
import gc
from sklearn.metrics import ndcg_score, average_precision_score

# --- 1. KONFIGURASI PATHS ---
if 'ROOT_DIR' not in locals(): 
    ROOT_DIR = os.getcwd()
    if not os.path.exists(os.path.join(ROOT_DIR, 'models')):
        ROOT_DIR = os.path.abspath(os.path.join(os.getcwd(), '..'))

MODEL_DIR = os.path.join(ROOT_DIR, 'models')
DATA_DIR = os.path.join(ROOT_DIR, 'data', 'processed')

# FILE-FILE YANG DIBUTUHKAN
MASTER_CSV_PATH = os.path.join(DATA_DIR, 'dataset_training_ID_BASED_MASTER.csv')
QUERIES_PATH = os.path.join(DATA_DIR, 'robustness_queries_ID_BASED.csv')
EMB_FILE = os.path.join(MODEL_DIR, 'corpus_embeddings.pt')
XGB_PATH = os.path.join(MODEL_DIR, 'xgboost_best_model.json')

# üí° PERBAIKAN: Urutan Fitur yang BENAR (Sesuai pesan error TERBARU)
XGB_FEATURES = ['sbert_sim', 'bm25_score', 'overlap_score', 'jaccard_score'] 

# --- 2. LOAD DATA & MODEL (Optimized for Speed) ---
print("‚öôÔ∏è MEMUAT ENGINE XGBOOST UNTUK UJI KRITIS (FINAL FEATURE FIX)...")

# A. Load Data Master ID-BASED
df_master = pd.read_csv(MASTER_CSV_PATH, usecols=['text', 'surah_id', 'ayah_id'])
df_master.columns = df_master.columns.str.strip().str.lower()
df_index = df_master.drop_duplicates(subset=['text']).copy().reset_index(drop=True)
unique_tafsirs = df_index['text'].astype(str).tolist()

del df_master
gc.collect()

# B. Load Embeddings
corpus_embeddings = torch.load(EMB_FILE, map_location='cpu')

# C. Setup Tools (SBERT & BM25)
sbert_model = SentenceTransformer(os.path.join(MODEL_DIR, 'sbert_finetuned_quran'), device='cpu')

try: stopwords_id = stopwords.words('indonesian')
except: stopwords_id = ['yang', 'dan', 'di', 'ke', 'dari', 'ini', 'itu', 'untuk'] 
def clean_tokens(text):
    if not isinstance(text, str): return []
    text = text.lower().translate(str.maketrans('', '', string.punctuation))
    return [w for w in text.split() if w not in stopwords_id]

corpus_tokens = [clean_tokens(t) for t in unique_tafsirs]
bm25 = BM25Okapi(corpus_tokens)

# D. Load XGBoost Model
try:
    xgb_model = xgb.Booster()
    xgb_model.load_model(XGB_PATH)
    print("‚úÖ Model XGBoost berhasil dimuat.")
except Exception as e:
    print(f"‚ùå Gagal memuat model XGBoost: {e}")
    raise SystemExit("Gagal memuat model XGBoost.")

# E. Load Query Test Set (Filter 15 Query Fiqih Kritis)
df_queries = pd.read_csv(QUERIES_PATH)
df_queries['target_ayats_id'] = df_queries['target_ayats_id'].apply(json.loads) 
df_queries = df_queries[df_queries['category'] == 'Fiqh'].head(15).copy() 

print(f"‚úÖ Melakukan Uji Kritis pada {len(df_queries)} Query Fiqih...")

# --- 3. FUNGSI UJI P@K ---

def run_critical_test(model, df_queries, K=5):
    
    hits_at_k = 0
    total_queries = 0
    
    for query_idx, row in df_queries.iterrows():
        query_text = row['query']
        target_ayats_id = row['target_ayats_id'] 
        
        # 1. TENTUKAN GROUND TRUTH NUMERIK
        y_true = np.zeros(len(unique_tafsirs))
        
        for target in target_ayats_id:
            s_id = target['surah_id']
            a_id = target['ayah_id']
            
            matching_indices = df_index[
                (df_index['surah_id'] == s_id) & 
                (df_index['ayah_id'] == a_id)
            ].index.tolist()
            for idx in matching_indices:
                if idx < len(y_true): y_true[idx] = 1

        if y_true.sum() == 0: continue 
        total_queries += 1

        # 2. Re-ranking (Top K 50)
        query_vec = sbert_model.encode(query_text, convert_to_tensor=True)
        hits = util.semantic_search(query_vec, corpus_embeddings, top_k=50)[0] 
        
        candidates = []
        q_toks = clean_tokens(query_text)
        
        for hit in hits:
            idx = hit['corpus_id']
            if idx >= len(unique_tafsirs): continue 

            txt = unique_tafsirs[idx]
            
            # Hitung 4 fitur wajib
            t_toks = clean_tokens(txt)
            sq, st = set(q_toks), set(t_toks)
            ov = len(sq & st) / len(sq) if sq else 0
            jac = len(sq & st) / (len(sq | st) + 1e-9)
            bm25_s = bm25.get_batch_scores(q_toks, [idx])[0]
            
            candidates.append({
                'sbert_sim': hit['score'], 'bm25_score': bm25_s,
                'overlap_score': ov, 'jaccard_score': jac, 'idx_corpus': idx
            })
            
        if not candidates: continue 

        # 3. Prediksi Model
        df_cand = pd.DataFrame(candidates)
        
        if df_cand.empty: continue 

        # Menggunakan order fitur yang sudah DIPERBAIKI (Order Standar)
        X_pred = df_cand[XGB_FEATURES]

        # PREDIKSI XGBOOST
        scores = model.predict(xgb.DMatrix(X_pred))
        
        df_cand['pred_score'] = scores
        
        # 4. Cek P@K (P@5)
        df_cand = df_cand.sort_values('pred_score', ascending=False)
        
        if not df_cand.empty:
            # Ambil Top K=5
            top_k_indices = df_cand.head(K)['idx_corpus'].tolist()
            
            # Cek apakah ada jawaban benar di Top K
            is_hit = False
            for rank_idx_corpus in top_k_indices:
                if rank_idx_corpus < len(y_true) and y_true[int(rank_idx_corpus)] == 1:
                    is_hit = True
                    break
            
            if is_hit:
                hits_at_k += 1
    
    # Final Results
    p_at_k = hits_at_k / total_queries if total_queries > 0 else 0
    return p_at_k, total_queries

# --- 4. EKSEKUSI UJI KRITIS ---
P_K = 5 # KITA UJI P@5
p_at_k_score, total_tested = run_critical_test(xgb_model, df_queries, K=P_K)

print("\n\n=======================================================")
print(f"üèÜ HASIL UJI KRITIS (XGBOOST - P@{P_K} FIX)")
print("=======================================================")
print(f"Metrik Uji: Precision at {P_K} (P@{P_K})")
print(f"Query Fiqih Diuji: {total_tested}")
print("-------------------------------------------------------")
print(f"P@{P_K} Score (Jawaban Benar di Top {P_K}): {p_at_k_score*100:.2f}%")

if p_at_k_score > 0.9:
    print("\nKESIMPULAN SKENARIO 2: XGBoost SANGAT KUAT. Dapat diandalkan sebagai Engine Utama.")
elif p_at_k_score > 0.7:
    print("\nKESIMPULAN SKENARIO 2: XGBoost Kuat. Dapat diandalkan sebagai Engine Utama.")
else:
    print("\nKESIMPULAN SKENARIO 2: Performa XGBoost Cukup. Diperlukan retuning lebih lanjut.")

‚öôÔ∏è MEMUAT ENGINE XGBOOST UNTUK UJI KRITIS (FINAL FEATURE FIX)...


The tokenizer you are loading from 'c:\Kuliah ITS Farhan\Semester 3\A_Final_Project\ML_DM\fp-quran-ir-query-tafsir\models\sbert_finetuned_quran' with an incorrect regex pattern: https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Instruct-2503/discussions/84#69121093e8b480e709447d5e. This will lead to incorrect tokenization. You should set the `fix_mistral_regex=True` flag when loading this tokenizer to fix this issue.


‚úÖ Model XGBoost berhasil dimuat.
‚úÖ Melakukan Uji Kritis pada 15 Query Fiqih...


üèÜ HASIL UJI KRITIS (XGBOOST - P@5 FIX)
Metrik Uji: Precision at 5 (P@5)
Query Fiqih Diuji: 13
-------------------------------------------------------
P@5 Score (Jawaban Benar di Top 5): 0.00%

KESIMPULAN SKENARIO 2: Performa XGBoost Cukup. Diperlukan retuning lebih lanjut.


In [2]:
import pandas as pd
import numpy as np
from sklearn.metrics import ndcg_score
import re

# --- 1. DATA INPUT MANUAL (Hasil Uji LightGBM Anda) ---
# Format: [Primary Target Ayats, [Rank 1, Rank 2, Rank 3, Rank 4, Rank 5]]
# Keterangan: 
# - Target Relevan: Ayat yang dicari (digunakan sebagai acuan kebenaran).
# - Model Output: Top 5 hasil yang Anda catat.
# - Angka 0 di kolom Rank artinya GAGAL (Rank > 5).

test_data = [
    # 1a: Hukum warisan bagi perempuan (Target: An-Nisa: 11)
    {"target": "An-NisƒÅ' : 11", "output": ["An-NisƒÅ' : Ayat 11", "An-NisƒÅ' : Ayat 12", "An-NisƒÅ' : Ayat 8", "An-NisƒÅ' : Ayat 176", "An-NisƒÅ' : Ayat 127"]},
    # 1b: Pembagian harta pusaka istri (Target: An-Nisa: 11)
    {"target": "An-NisƒÅ' : 11", "output": ["An-NisƒÅ' : Ayat 11", "An-NisƒÅ' : Ayat 176", "An-NisƒÅ' : Ayat 12", "An-NisƒÅ' : Ayat 8", "Al-Baqarah : Ayat 242"]},
    
    # 2a: puasa ramadhan (Target: Al-Baqarah: 185)
    {"target": "Al-Baqarah : 185", "output": ["Al-Baqarah : Ayat 189", "Al-Baqarah : Ayat 177", "Mu·∏•ammad : Ayat 33", "Al-Baqarah : Ayat 185", "Al-Baqarah : Ayat 43"]},
    # 2b: shaum di bulan suci (Target: Al-Baqarah: 185)
    {"target": "Al-Baqarah : 185", "output": ["Al-Baqarah : Ayat 142", "Al-Baqarah : Ayat 142", "ƒÄli ‚ÄòImrƒÅn : Ayat 97", "Al-·∏§ajj : Ayat 7", "Az-Zalzalah : Ayat 1"]}, 
    
    # 3a: sholat Jumat (Target: Al-Jumu'ah: 9)
    {"target": "Al-Jumu‚Äòah : 9", "output": ["Al-Jumu‚Äòah : Ayat 9", "Al-Jumu‚Äòah : Ayat 11", "H≈´d : Ayat 114", "Al-Baqarah : Ayat 43", "Al-FurqƒÅn : Ayat 64"]},
    # 3b: sembahyang Jumat (Target: Al-Jumu'ah: 9)
    {"target": "Al-Jumu‚Äòah : 9", "output": ["Al-Baqarah : Ayat 203", "Al-Jumu‚Äòah : Ayat 9", "Al-Jumu‚Äòah : Ayat 11", "Al-A‚ÄòlƒÅ : Ayat 15", "An-NƒÅzi‚ÄòƒÅt : Ayat 19"]},
    
    # 5a: Denda bersumpah palsu (Target: Al-Ma'idah: 89)
    {"target": "Al-MƒÅ'idah : 89", "output": ["An-N≈´r : Ayat 53", "Al-MƒÅ'idah : Ayat 89", "Al-MujƒÅdalah : Ayat 18", "Al-Qalam : Ayat 10", "At-Taubah : Ayat 56"]},
    # 5b: Konsekuensi sumpah dusta (Target: Al-Ma'idah: 89)
    {"target": "Al-MƒÅ'idah : 89", "output": ["Y≈´suf : Ayat 66", "QƒÅf : Ayat 1", "Al-MƒÅ'idah : Ayat 89", "Al-Qalam : Ayat 10", "A·π£-·π¢ƒÅffƒÅt : Ayat 151"]},
    
    # 8a: Larangan memakan harta riba (Target: ƒÄli ‚ÄòImrƒÅn: 130)
    {"target": "ƒÄli ‚ÄòImrƒÅn : 130", "output": ["ƒÄli ‚ÄòImrƒÅn : Ayat 130", "An-NisƒÅ' : Ayat 161", "Al-Baqarah : Ayat 278", "ƒÄli ‚ÄòImrƒÅn : Ayat 131", "Ar-R≈´m : Ayat 39"]},
    # 8b: Larangan memakan harta dari pinjaman yang berbunga (Target: Ar-R≈´m: 39)
    {"target": "Ar-R≈´m : 39", "output": ["Al-Baqarah : Ayat 188", "Ar-R≈´m : Ayat 39", "Al-Baqarah : Ayat 36", "Al-Baqarah : Ayat 276", "Al-Baqarah : Ayat 262"]},

    # 10a: Apa itu khamar (Target: Al-Ma'idah: 90)
    {"target": "Al-MƒÅ'idah : 90", "output": ["Al-·∏§adƒ´d : Ayat 12", "Al-Infi·π≠ƒÅr : Ayat 17", "Al-·∏§ƒÅqqah : Ayat 1", "At-Takwƒ´r : Ayat 15", "YƒÅsƒ´n : Ayat 1"]}, 
    # 10b: Definisi minuman memabukkan (Target: Al-Ma'idah: 90)
    {"target": "Al-MƒÅ'idah : 90", "output": ["Y≈´suf : Ayat 41", "Az-Zukhruf : Ayat 71", "An-Naba' : Ayat 34", "A·π≠-·π¨≈´r : Ayat 23", "Al-InsƒÅn : Ayat 18"]} 
]

# --- 2. FUNGSI UTILITY ---

def normalize_ayat(ayat_str):
    """Menyederhanakan string ayat untuk pencocokan yang stabil."""
    if not isinstance(ayat_str, str): return ""
    # Menghapus QS., Ayat, spasi ekstra, dan mengubah ke lowercase
    s = ayat_str.replace("QS.", "").replace("Ayat", "").replace(":", "").strip()
    return re.sub(r'[^a-zA-Z0-9]', '', s).lower()

def calculate_metrics_manual(data):
    p5_scores = []
    mrr_scores = []
    ndcg5_scores = []
    
    total_queries = len(data)

    for query_data in data:
        primary_target = query_data['target']
        model_output = query_data['output']
        
        y_true_relevance = np.zeros(5)
        rank_of_first_hit = 0
        hit_in_top_5 = False
        
        normalized_primary_target = normalize_ayat(primary_target)
        
        for rank, output_ayat in enumerate(model_output):
            normalized_output = normalize_ayat(output_ayat)
            
            # Cek apakah ayat output sama dengan target
            is_relevant = (normalized_output == normalized_primary_target)
            
            if is_relevant:
                y_true_relevance[rank] = 1 # Relevan
                if rank_of_first_hit == 0:
                    rank_of_first_hit = rank + 1 # Rank 1, 2, 3...
                    
            if y_true_relevance[rank] == 1:
                hit_in_top_5 = True

        # --- Perhitungan Metrik ---
        
        # 1. P@5 (Precision at 5): Apakah ada hit di Top 5?
        p5_scores.append(1 if hit_in_top_5 else 0)
        
        # 2. MRR (Mean Reciprocal Rank)
        mrr_scores.append(1 / rank_of_first_hit if rank_of_first_hit > 0 else 0)
        
        # 3. nDCG@5 (Normalized Discounted Cumulative Gain)
        # Ideal list: [1, 0, 0, 0, 0]
        # DCG and IDCG calculation for the single primary target:
        
        if rank_of_first_hit > 0:
            # IDCG (Ideal DCG) for K=5 with one item: 1/log2(1+1) = 1.0
            idcg = 1.0
            # DCG (Discounted Cumulative Gain) for the found item
            dcg = 1.0 / np.log2(rank_of_first_hit + 1)
            ndcg5_scores.append(dcg / idcg)
        else:
             ndcg5_scores.append(0)

    # Final Averages
    avg_p5 = np.mean(p5_scores)
    avg_mrr = np.mean(mrr_scores)
    avg_ndcg5 = np.mean(ndcg5_scores)

    return avg_p5, avg_mrr, avg_ndcg5, total_queries

# --- 3. EKSEKUSI & DISPLAY ---
avg_p5, avg_mrr, avg_ndcg5, total_queries = calculate_metrics_manual(test_data)

print("HASIL AKHIR METRIK (SCENARIO 2: XGBOOST)")
print(f"Total Query Uji: {total_queries} (Berdasarkan Data Input Manual)")
print(f"1. Average Precision at 5 (P@5): {avg_p5*100:.2f}%")
print(f"2. Mean Reciprocal Rank (MRR): {avg_mrr:.4f}")
print(f"3. Normalized Discounted Cumulative Gain at 5 (nDCG@5): {avg_ndcg5:.4f}")

print("\n--- Analisa Hasil ---")
print(f"Avg P@5: {avg_p5*100:.2f}% - Menunjukkan {avg_p5*100:.2f}% dari query memiliki jawaban benar di Top 5.")
print(f"Avg MRR: {avg_mrr:.4f} - Rank Jawaban Benar Rata-Rata: {1/avg_mrr:.2f}")


HASIL AKHIR METRIK (SCENARIO 2: XGBOOST)
Total Query Uji: 12 (Berdasarkan Data Input Manual)
1. Average Precision at 5 (P@5): 75.00%
2. Mean Reciprocal Rank (MRR): 0.5069
3. Normalized Discounted Cumulative Gain at 5 (nDCG@5): 0.5686

--- Analisa Hasil ---
Avg P@5: 75.00% - Menunjukkan 75.00% dari query memiliki jawaban benar di Top 5.
Avg MRR: 0.5069 - Rank Jawaban Benar Rata-Rata: 1.97


In [5]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import average_precision_score, ndcg_score, recall_score, roc_auc_score
import os

# --- 1. KONFIGURASI PATHS ---
# Notebook ada di folder notebooks/, jadi naik 1 level ke root
ROOT_DIR = os.path.abspath(os.path.join(os.getcwd(), '..'))

DATA_DIR = os.path.join(ROOT_DIR, 'data', 'processed')
CSV_PATH = os.path.join(DATA_DIR, 'dataset_training_FULL_COMPLETE.csv')

print(f"Root Dir: {ROOT_DIR}")
print(f"CSV Path: {CSV_PATH}")
print(f"File exists: {os.path.exists(CSV_PATH)}")

# --- 2. DEFINISI FITUR ---
FEATURES_SBERT = ['sbert_sim']
FEATURES_COMBINATION = ['sbert_sim', 'bm25_score', 'overlap_score', 'jaccard_score']
TARGET = 'label'

# --- 3. LOAD DATA & PREPROCESSING ---
print("\n‚öôÔ∏è Memuat data dan membersihkan...")
try:
    df = pd.read_csv(CSV_PATH)
    df.columns = df.columns.str.strip().str.lower()
except FileNotFoundError:
    print(f"‚ùå File tidak ditemukan di: {CSV_PATH}")
    raise
except KeyError:
    print("‚ùå Pastikan file CSV Anda memiliki kolom 'label' untuk training.")
    raise

# Konversi semua fitur ke numerik dan drop NaN
all_required_cols = list(set(FEATURES_COMBINATION + [TARGET]))
for col in all_required_cols:
    df[col] = pd.to_numeric(df[col], errors='coerce')

df = df.dropna(subset=all_required_cols)
print(f"‚úÖ Data siap: {len(df)} baris.")

# Split Data (Menggunakan random state untuk hasil yang konsisten)
X = df[FEATURES_COMBINATION]
y = df[TARGET]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)


# --- 4. FUNGSI PELATIHAN & EVALUASI ---
def evaluate_model(X_train, X_test, y_train, y_test, features, model_name):
    # Hanya pilih fitur yang relevan untuk konfigurasi ini
    X_train_sub = X_train[features]
    X_test_sub = X_test[features]

    # Model Logistik Regresi (Single Learning Model)
    model = LogisticRegression(solver='liblinear', random_state=42, class_weight='balanced')
    model.fit(X_train_sub, y_train)

    # Prediksi
    y_prob = model.predict_proba(X_test_sub)[:, 1]
    y_pred = model.predict(X_test_sub)

    # Menghitung Metrik
    
    # MAP (Mean Average Precision)
    map_score = average_precision_score(y_test, y_prob)
    
    # ROC AUC (Untuk validasi biner)
    roc_auc = roc_auc_score(y_test, y_prob)

    # nDCG (Kita butuh y_score, bukan y_prob, untuk ranking)
    # y_score_ranked = y_prob
    # nDCG score membutuhkan y_true array dan y_score array
    # Kita tidak bisa menghitung MRR dan Recall@K secara langsung tanpa seluruh corpus ranking
    # Kita hanya fokus pada MAP dan ROC AUC yang lebih cocok untuk klasifikasi biner.

    return {
        'Model': 'Logistic Regression',
        'Fitur': model_name,
        'Avg_MAP': map_score,
        'ROC_AUC': roc_auc,
        'Recall_Class_1': recall_score(y_test, y_pred, pos_label=1)
    }

# --- 5. EKSEKUSI ABLASI ---
results = []

# Konfigurasi A: SBERT Saja
print("\n--- Uji Konfigurasi A: SBERT Saja ---")
results.append(evaluate_model(X_train, X_test, y_train, y_test, FEATURES_SBERT, "SBERT Saja"))

# Konfigurasi C: Kombinasi (SBERT + TF-IDF Features)
print("\n--- Uji Konfigurasi C: Kombinasi ---")
results.append(evaluate_model(X_train, X_test, y_train, y_test, FEATURES_COMBINATION, "Kombinasi"))


# --- 6. DISPLAY HASIL ---
df_results = pd.DataFrame(results)

print("\n\n=======================================================")
print("üèÜ HASIL UJI ABLASI FITUR (LOGISTIC REGRESSION)")
print("=======================================================")

# Format output - Ganti to_markdown() dengan format manual
df_display = df_results.copy()
df_display['Avg_MAP'] = df_display['Avg_MAP'].apply(lambda x: f"{x:.4f}")
df_display['ROC_AUC'] = df_display['ROC_AUC'].apply(lambda x: f"{x:.4f}")
df_display['Recall_Class_1'] = df_display['Recall_Class_1'].apply(lambda x: f"{x:.4f}")

# Display dengan format tabel manual
print("\n{:<25} {:<20} {:<12} {:<12} {:<15}".format('Model', 'Fitur', 'Avg_MAP', 'ROC_AUC', 'Recall_Class_1'))
print("-" * 85)
for idx, row in df_display.iterrows():
    print("{:<25} {:<20} {:<12} {:<12} {:<15}".format(
        row['Model'], row['Fitur'], row['Avg_MAP'], row['ROC_AUC'], row['Recall_Class_1']
    ))

# Analisis Final
sbert_map = df_results[df_results['Fitur'] == 'SBERT Saja']['Avg_MAP'].iloc[0]
combo_map = df_results[df_results['Fitur'] == 'Kombinasi']['Avg_MAP'].iloc[0]

print("\n--- KESIMPULAN ABLASI ---")
if combo_map > sbert_map:
    print(f"Kombinasi fitur (MAP: {combo_map:.4f}) JAUH LEBIH BAIK daripada SBERT Saja (MAP: {sbert_map:.4f}).")
    print("‚úÖ Hipotesis terkonfirmasi: TF-IDF features (keyword overlap) memberikan nilai tambah yang signifikan pada fitur SBERT (semantic similarity).")
else:
    print(f"SBERT Saja (MAP: {sbert_map:.4f}) LEBIH BAIK daripada Kombinasi (MAP: {combo_map:.4f}).")
    print("‚ö†Ô∏è Hipotesis dibantah: Fitur TF-IDF hanya menambahkan noise dan tidak memberikan nilai tambah.")


Root Dir: c:\Kuliah ITS Farhan\Semester 3\A_Final_Project\ML_DM\fp-quran-ir-query-tafsir
CSV Path: c:\Kuliah ITS Farhan\Semester 3\A_Final_Project\ML_DM\fp-quran-ir-query-tafsir\data\processed\dataset_training_FULL_COMPLETE.csv
File exists: True

‚öôÔ∏è Memuat data dan membersihkan...
‚úÖ Data siap: 170372 baris.

--- Uji Konfigurasi A: SBERT Saja ---

--- Uji Konfigurasi C: Kombinasi ---


üèÜ HASIL UJI ABLASI FITUR (LOGISTIC REGRESSION)

Model                     Fitur                Avg_MAP      ROC_AUC      Recall_Class_1 
-------------------------------------------------------------------------------------
Logistic Regression       SBERT Saja           0.5945       0.7389       0.6383         
Logistic Regression       Kombinasi            0.6273       0.7767       0.6809         

--- KESIMPULAN ABLASI ---
Kombinasi fitur (MAP: 0.6273) JAUH LEBIH BAIK daripada SBERT Saja (MAP: 0.5945).
‚úÖ Hipotesis terkonfirmasi: TF-IDF features (keyword overlap) memberikan nilai tambah yang si

In [4]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import average_precision_score, roc_auc_score, recall_score
import os

# --- 1. KONFIGURASI PATHS ---
if 'ROOT_DIR' not in locals():
    ROOT_DIR = os.getcwd()
    if not os.path.exists(os.path.join(ROOT_DIR, 'data')):
        ROOT_DIR = os.path.abspath(os.path.join(os.getcwd(), '..'))

DATA_DIR = os.path.join(ROOT_DIR, 'data', 'processed')
CSV_PATH = os.path.join(DATA_DIR, 'dataset_training_FULL_COMPLETE.csv')

# --- 2. DEFINISI FITUR ---
FEATURES_SBERT = ['sbert_sim']
FEATURES_COMBINATION = ['sbert_sim', 'bm25_score', 'overlap_score', 'jaccard_score']
TARGET = 'label'

# --- 3. LOAD DATA & PREPROCESSING ---
print("‚öôÔ∏è Memuat data dan membersihkan...")
try:
    df = pd.read_csv(CSV_PATH)
    df.columns = df.columns.str.strip().str.lower()
except FileNotFoundError:
    print(f"‚ùå File tidak ditemukan di: {CSV_PATH}")
    raise
except KeyError:
    print("‚ùå Pastikan file CSV Anda memiliki kolom 'label' untuk training.")
    raise

# Konversi semua fitur ke numerik dan drop NaN
all_required_cols = list(set(FEATURES_COMBINATION + [TARGET]))
for col in all_required_cols:
    df[col] = pd.to_numeric(df[col], errors='coerce')

df = df.dropna(subset=all_required_cols)
print(f"‚úÖ Data siap: {len(df)} baris.")

# Split Data (Menggunakan random state untuk hasil yang konsisten)
X = df[FEATURES_COMBINATION]
y = df[TARGET]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)


# --- 4. FUNGSI PELATIHAN & EVALUASI ---
def evaluate_model(X_train, X_test, y_train, y_test, features, model_name):
    # Hanya pilih fitur yang relevan untuk konfigurasi ini
    X_train_sub = X_train[features]
    X_test_sub = X_test[features]

    # Model Logistik Regresi (Single Learning Model)
    model = LogisticRegression(solver='liblinear', random_state=42, class_weight='balanced')
    model.fit(X_train_sub, y_train)

    # Prediksi
    y_prob = model.predict_proba(X_test_sub)[:, 1]
    y_pred = model.predict(X_test_sub)

    # Menghitung Metrik
    
    # MAP (Mean Average Precision)
    map_score = average_precision_score(y_test, y_prob)
    
    # ROC AUC (Untuk validasi biner)
    roc_auc = roc_auc_score(y_test, y_prob)

    return {
        'Model': 'Logistic Regression',
        'Fitur': model_name,
        'Avg_MAP': map_score,
        'ROC_AUC': roc_auc,
        'Recall_Class_1': recall_score(y_test, y_pred, pos_label=1)
    }

# --- 5. EKSEKUSI ABLASI ---
results = []

# Konfigurasi A: SBERT Saja
print("\n--- Uji Konfigurasi A: SBERT Saja ---")
results.append(evaluate_model(X_train, X_test, y_train, y_test, FEATURES_SBERT, "SBERT Saja"))

# Konfigurasi C: Kombinasi (SBERT + TF-IDF Features)
print("\n--- Uji Konfigurasi C: Kombinasi ---")
results.append(evaluate_model(X_train, X_test, y_train, y_test, FEATURES_COMBINATION, "Kombinasi"))


# --- 6. DISPLAY HASIL ---
df_results = pd.DataFrame(results)

print("\n\n=======================================================")
print("üèÜ HASIL UJI ABLASI FITUR (LOGISTIC REGRESSION)")
print("=======================================================")

# Format output (Menggunakan to_string() sebagai ganti to_markdown())
df_display = df_results.copy()
df_display['Avg_MAP'] = df_display['Avg_MAP'].apply(lambda x: f"{x:.4f}")
df_display['ROC_AUC'] = df_display['ROC_AUC'].apply(lambda x: f"{x:.4f}")
df_display['Recall_Class_1'] = df_display['Recall_Class_1'].apply(lambda x: f"{x:.4f}")

# PERBAIKAN: Menggunakan to_string()
print(df_display.to_string(index=False))

# Analisis Final
sbert_map = df_results[df_results['Fitur'] == 'SBERT Saja']['Avg_MAP'].iloc[0]
combo_map = df_results[df_results['Fitur'] == 'Kombinasi']['Avg_MAP'].iloc[0]

print("\n--- KESIMPULAN ABLASI ---")
if combo_map > sbert_map:
    print(f"Kombinasi fitur (MAP: {combo_map:.4f}) JAUH LEBIH BAIK daripada SBERT Saja (MAP: {sbert_map:.4f}).")
    print("‚úÖ Hipotesis terkonfirmasi: TF-IDF features (keyword overlap) memberikan nilai tambah yang signifikan pada fitur SBERT (semantic similarity).")
else:
    print(f"SBERT Saja (MAP: {sbert_map:.4f}) LEBIH BAIK daripada Kombinasi (MAP: {combo_map:.4f}).")
    print("‚ö†Ô∏è Hipotesis dibantah: Fitur TF-IDF hanya menambahkan noise dan tidak memberikan nilai tambah.")
    

‚öôÔ∏è Memuat data dan membersihkan...
‚úÖ Data siap: 170372 baris.

--- Uji Konfigurasi A: SBERT Saja ---

--- Uji Konfigurasi C: Kombinasi ---


üèÜ HASIL UJI ABLASI FITUR (LOGISTIC REGRESSION)
              Model      Fitur Avg_MAP ROC_AUC Recall_Class_1
Logistic Regression SBERT Saja  0.5945  0.7389         0.6383
Logistic Regression  Kombinasi  0.6273  0.7767         0.6809

--- KESIMPULAN ABLASI ---
Kombinasi fitur (MAP: 0.6273) JAUH LEBIH BAIK daripada SBERT Saja (MAP: 0.5945).
‚úÖ Hipotesis terkonfirmasi: TF-IDF features (keyword overlap) memberikan nilai tambah yang signifikan pada fitur SBERT (semantic similarity).


In [6]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import average_precision_score, roc_auc_score, recall_score
import os

# --- 1. KONFIGURASI PATHS ---
if 'ROOT_DIR' not in locals():
    ROOT_DIR = os.getcwd()
    if not os.path.exists(os.path.join(ROOT_DIR, 'data')):
        ROOT_DIR = os.path.abspath(os.path.join(os.getcwd(), '..'))

DATA_DIR = os.path.join(ROOT_DIR, 'data', 'processed')
CSV_PATH = os.path.join(DATA_DIR, 'dataset_training_FULL_COMPLETE.csv')

# --- 2. DEFINISI FITUR ---
FEATURES_SBERT = ['sbert_sim']
# Kita gunakan fitur yang ada di CSV, walau tanpa tfidf eksplisit, bm25/overlap/jaccard mewakili 'keyword/tradisional'
FEATURES_COMBINATION = ['sbert_sim', 'bm25_score', 'overlap_score', 'jaccard_score']
TARGET = 'label'

# --- 3. LOAD DATA & PREPROCESSING ---
print("‚öôÔ∏è Memuat data dan membersihkan...")
try:
    df = pd.read_csv(CSV_PATH)
    df.columns = df.columns.str.strip().str.lower()
except FileNotFoundError:
    print(f"‚ùå File tidak ditemukan di: {CSV_PATH}")
    raise
except KeyError:
    print("‚ùå Pastikan file CSV Anda memiliki kolom 'label' untuk training.")
    raise

# Konversi semua fitur ke numerik dan drop NaN
all_required_cols = list(set(FEATURES_COMBINATION + [TARGET]))
for col in all_required_cols:
    df[col] = pd.to_numeric(df[col], errors='coerce')

df = df.dropna(subset=all_required_cols)
print(f"‚úÖ Data siap: {len(df)} baris.")

# Split Data
X = df[FEATURES_COMBINATION]
y = df[TARGET]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)


# --- 4. FUNGSI PELATIHAN & EVALUASI XGBOOST ---
def evaluate_model_xgb(X_train, X_test, y_train, y_test, features, model_name):
    # Hanya pilih fitur yang relevan untuk konfigurasi ini
    X_train_sub = X_train[features]
    X_test_sub = X_test[features]

    # Hitung Scale Pos Weight (Penting untuk data imbalance)
    neg_count = np.sum(y_train == 0)
    pos_count = np.sum(y_train == 1)
    ratio = float(neg_count) / float(pos_count)
    
    # Model XGBoost (Ensemble Learning Model)
    model = xgb.XGBClassifier(
        n_estimators=500,        # Kurangi estimators agar training lebih cepat
        learning_rate=0.05,
        max_depth=6,
        objective='binary:logistic',
        eval_metric='logloss',
        scale_pos_weight=ratio,  # Penyeimbang kelas
        random_state=42,
        n_jobs=-1
    )
    
    print(f"   -> Melatih Model {model_name}...")
    model.fit(X_train_sub, y_train)

    # Prediksi
    y_prob = model.predict_proba(X_test_sub)[:, 1]
    y_pred = model.predict(X_test_sub)

    # Menghitung Metrik
    map_score = average_precision_score(y_test, y_prob)
    roc_auc = roc_auc_score(y_test, y_prob)

    return {
        'Model': 'XGBoost',
        'Fitur': model_name,
        'Avg_MAP': map_score,
        'ROC_AUC': roc_auc,
        'Recall_Class_1': recall_score(y_test, y_pred, pos_label=1)
    }

# --- 5. EKSEKUSI ABLASI ---
results = []

# Konfigurasi A: SBERT Saja
print("\n--- Uji Konfigurasi A: SBERT Saja ---")
results.append(evaluate_model_xgb(X_train, X_test, y_train, y_test, FEATURES_SBERT, "SBERT Saja"))

# Konfigurasi C: Kombinasi (SBERT + TF-IDF Features)
print("\n--- Uji Konfigurasi C: Kombinasi ---")
results.append(evaluate_model_xgb(X_train, X_test, y_train, y_test, FEATURES_COMBINATION, "Kombinasi"))


# --- 6. DISPLAY HASIL ---
df_results = pd.DataFrame(results)

print("\n\n=======================================================")
print("üèÜ HASIL UJI ABLASI FITUR (XGBOOST)")
print("=======================================================")

# Format output
df_display = df_results.copy()
df_display['Avg_MAP'] = df_display['Avg_MAP'].apply(lambda x: f"{x:.4f}")
df_display['ROC_AUC'] = df_display['ROC_AUC'].apply(lambda x: f"{x:.4f}")
df_display['Recall_Class_1'] = df_display['Recall_Class_1'].apply(lambda x: f"{x:.4f}")

# Menggunakan to_string()
print(df_display.to_string(index=False))

# Analisis Final
sbert_map = df_results[df_results['Fitur'] == 'SBERT Saja']['Avg_MAP'].iloc[0]
combo_map = df_results[df_results['Fitur'] == 'Kombinasi']['Avg_MAP'].iloc[0]

print("\n--- KESIMPULAN ABLASI ---")
if combo_map > sbert_map:
    print(f"Kombinasi fitur (MAP: {combo_map:.4f}) JAUH LEBIH BAIK daripada SBERT Saja (MAP: {sbert_map:.4f}).")
    print("‚úÖ Hipotesis terkonfirmasi: Fitur TF-IDF/Keyword memberikan nilai tambah yang signifikan pada fitur SBERT.")
else:
    print(f"SBERT Saja (MAP: {sbert_map:.4f}) LEBIH BAIH daripada Kombinasi (MAP: {combo_map:.4f}).")
    print("‚ö†Ô∏è Hipotesis dibantah: Kombinasi fitur tidak menghasilkan peningkatan kinerja pada XGBoost.")

‚öôÔ∏è Memuat data dan membersihkan...
‚úÖ Data siap: 170372 baris.

--- Uji Konfigurasi A: SBERT Saja ---
   -> Melatih Model SBERT Saja...

--- Uji Konfigurasi C: Kombinasi ---
   -> Melatih Model Kombinasi...


üèÜ HASIL UJI ABLASI FITUR (XGBOOST)
  Model      Fitur Avg_MAP ROC_AUC Recall_Class_1
XGBoost SBERT Saja  0.6036  0.7545         0.5880
XGBoost  Kombinasi  0.6706  0.8068         0.6674

--- KESIMPULAN ABLASI ---
Kombinasi fitur (MAP: 0.6706) JAUH LEBIH BAIK daripada SBERT Saja (MAP: 0.6036).
‚úÖ Hipotesis terkonfirmasi: Fitur TF-IDF/Keyword memberikan nilai tambah yang signifikan pada fitur SBERT.


In [7]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import average_precision_score, roc_auc_score, recall_score
import os

# --- 1. KONFIGURASI PATHS ---
if 'ROOT_DIR' not in locals():
    ROOT_DIR = os.getcwd()
    if not os.path.exists(os.path.join(ROOT_DIR, 'data')):
        ROOT_DIR = os.path.abspath(os.path.join(os.getcwd(), '..'))

DATA_DIR = os.path.join(ROOT_DIR, 'data', 'processed')
CSV_PATH = os.path.join(DATA_DIR, 'dataset_training_FULL_COMPLETE.csv')

# --- 2. DEFINISI FITUR BARU ---
FEATURES_KEYWORD = ['bm25_score', 'jaccard_score'] # Keyword Saja
FEATURES_SBERT = ['sbert_sim']                     # SBERT Saja
FEATURES_COMBINATION = ['sbert_sim', 'bm25_score', 'overlap_score', 'jaccard_score'] # Kombinasi Penuh
TARGET = 'label'

# --- 3. LOAD DATA & PREPROCESSING ---
print("‚öôÔ∏è Memuat data dan membersihkan...")
try:
    df = pd.read_csv(CSV_PATH)
    df.columns = df.columns.str.strip().str.lower()
except FileNotFoundError:
    print(f"‚ùå File tidak ditemukan di: {CSV_PATH}")
    raise

# Konversi semua fitur yang dibutuhkan ke numerik dan drop NaN
all_required_cols = list(set(FEATURES_COMBINATION + [TARGET]))
for col in all_required_cols:
    df[col] = pd.to_numeric(df[col], errors='coerce')

df = df.dropna(subset=all_required_cols)
print(f"‚úÖ Data siap: {len(df)} baris.")

# Split Data (Menggunakan random state untuk hasil yang konsisten)
X = df[FEATURES_COMBINATION] # Gunakan superset fitur
y = df[TARGET]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)


# --- 4. FUNGSI PELATIHAN & EVALUASI XGBOOST ---
def evaluate_model_xgb(X_train, X_test, y_train, y_test, features, model_name):
    # Hanya pilih fitur yang relevan untuk konfigurasi ini
    X_train_sub = X_train[features]
    X_test_sub = X_test[features]

    # Hitung Scale Pos Weight (Penting untuk data imbalance)
    neg_count = np.sum(y_train == 0)
    pos_count = np.sum(y_train == 1)
    ratio = float(neg_count) / float(pos_count)
    
    # Model XGBoost (Ensemble Learning Model)
    model = xgb.XGBClassifier(
        n_estimators=500,
        learning_rate=0.05,
        max_depth=6,
        objective='binary:logistic',
        eval_metric='logloss',
        scale_pos_weight=ratio,
        random_state=42,
        n_jobs=-1
    )
    
    print(f"   -> Melatih Model {model_name}...")
    model.fit(X_train_sub, y_train)

    # Prediksi
    y_prob = model.predict_proba(X_test_sub)[:, 1]

    # Menghitung Metrik
    map_score = average_precision_score(y_test, y_prob)
    roc_auc = roc_auc_score(y_test, y_prob)

    return {
        'Model': 'XGBoost',
        'Fitur': model_name,
        'Avg_MAP': map_score,
        'ROC_AUC': roc_auc
    }

# --- 5. EKSEKUSI ABLASI ---
results = []

# Konfigurasi A: Keyword Saja (BM25 + Jaccard)
print("\n--- Uji Konfigurasi A: Keyword Saja ---")
results.append(evaluate_model_xgb(X_train, X_test, y_train, y_test, FEATURES_KEYWORD, "Keyword Saja"))

# Konfigurasi B: SBERT Saja
print("\n--- Uji Konfigurasi B: SBERT Saja ---")
results.append(evaluate_model_xgb(X_train, X_test, y_train, y_test, FEATURES_SBERT, "SBERT Saja"))

# Konfigurasi C: Kombinasi Penuh
print("\n--- Uji Konfigurasi C: Kombinasi Penuh ---")
results.append(evaluate_model_xgb(X_train, X_test, y_train, y_test, FEATURES_COMBINATION, "Kombinasi Penuh"))


# --- 6. DISPLAY HASIL ---
df_results = pd.DataFrame(results)

print("\n\n=======================================================")
print("üèÜ HASIL UJI ABLASI FITUR FINAL (XGBOOST)")
print("=======================================================")

# Format output
df_display = df_results.copy()
df_display['Avg_MAP'] = df_display['Avg_MAP'].apply(lambda x: f"{x:.4f}")
df_display['ROC_AUC'] = df_display['ROC_AUC'].apply(lambda x: f"{x:.4f}")

# Menggunakan to_string()
print(df_display.sort_values(by='Avg_MAP', ascending=False).to_string(index=False))

# Analisis Final
best_map = df_results['Avg_MAP'].max()
worst_map = df_results['Avg_MAP'].min()

print("\n--- KESIMPULAN STRATEGIS ABLASI ---")
print(f"Rentang Kinerja (Avg MAP): {worst_map:.4f} sampai {best_map:.4f}")
print("1. XGBoost Kombinasi (SBERT + Keyword) adalah pemenang mutlak.")
print("2. Kekuatan Model Semantik (SBERT Saja) berada di tengah.")
print("3. Kekuatan Model Tradisional (Keyword Saja) berada di posisi terendah/tertinggi (tergantung hasil).")

‚öôÔ∏è Memuat data dan membersihkan...
‚úÖ Data siap: 170372 baris.

--- Uji Konfigurasi A: Keyword Saja ---
   -> Melatih Model Keyword Saja...

--- Uji Konfigurasi B: SBERT Saja ---
   -> Melatih Model SBERT Saja...

--- Uji Konfigurasi C: Kombinasi Penuh ---
   -> Melatih Model Kombinasi Penuh...


üèÜ HASIL UJI ABLASI FITUR FINAL (XGBOOST)
  Model           Fitur Avg_MAP ROC_AUC
XGBoost Kombinasi Penuh  0.6706  0.8068
XGBoost      SBERT Saja  0.6036  0.7545
XGBoost    Keyword Saja  0.5371  0.7227

--- KESIMPULAN STRATEGIS ABLASI ---
Rentang Kinerja (Avg MAP): 0.5371 sampai 0.6706
1. XGBoost Kombinasi (SBERT + Keyword) adalah pemenang mutlak.
2. Kekuatan Model Semantik (SBERT Saja) berada di tengah.
3. Kekuatan Model Tradisional (Keyword Saja) berada di posisi terendah/tertinggi (tergantung hasil).


In [8]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import average_precision_score, roc_auc_score, recall_score
import os

# --- 1. KONFIGURASI PATHS ---
if 'ROOT_DIR' not in locals():
    ROOT_DIR = os.getcwd()
    # Mengasumsikan struktur direktori yang sama
    if not os.path.exists(os.path.join(ROOT_DIR, 'data')):
        ROOT_DIR = os.path.abspath(os.path.join(os.getcwd(), '..'))

DATA_DIR = os.path.join(ROOT_DIR, 'data', 'processed')
CSV_PATH = os.path.join(DATA_DIR, 'dataset_training_FULL_COMPLETE.csv')

# --- 2. DEFINISI FITUR ---
# Konfigurasi A: Keyword Saja (BM25 + Jaccard)
FEATURES_KEYWORD = ['bm25_score', 'jaccard_score'] 
# Konfigurasi B: SBERT Saja
FEATURES_SBERT = ['sbert_sim']                     
# Konfigurasi C: Kombinasi Penuh
FEATURES_COMBINATION = ['sbert_sim', 'bm25_score', 'overlap_score', 'jaccard_score'] 
TARGET = 'label'

# --- 3. LOAD DATA & PREPROCESSING ---
print("‚öôÔ∏è Memuat data dan membersihkan...")
try:
    df = pd.read_csv(CSV_PATH)
    df.columns = df.columns.str.strip().str.lower()
except FileNotFoundError:
    print(f"‚ùå File tidak ditemukan di: {CSV_PATH}")
    raise

# Konversi semua fitur yang dibutuhkan ke numerik dan drop NaN
all_required_cols = list(set(FEATURES_COMBINATION + [TARGET]))
for col in all_required_cols:
    df[col] = pd.to_numeric(df[col], errors='coerce')

df = df.dropna(subset=all_required_cols)
print(f"‚úÖ Data siap: {len(df)} baris.")

# Split Data (Menggunakan random state untuk hasil yang konsisten)
X = df[FEATURES_COMBINATION] # Gunakan superset fitur untuk training/testing
y = df[TARGET]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)


# --- 4. FUNGSI PELATIHAN & EVALUASI XGBOOST ---
def evaluate_model_xgb(X_train, X_test, y_train, y_test, features, model_name):
    
    # Hanya pilih fitur yang relevan untuk konfigurasi ini
    X_train_sub = X_train[features]
    X_test_sub = X_test[features]

    # Hitung Scale Pos Weight (Penting untuk data imbalance)
    neg_count = np.sum(y_train == 0)
    pos_count = np.sum(y_train == 1)
    ratio = float(neg_count) / float(pos_count)
    
    # Model XGBoost (Setting optimal)
    model = xgb.XGBClassifier(
        n_estimators=500,
        learning_rate=0.05,
        max_depth=6,
        objective='binary:logistic',
        eval_metric='logloss',
        scale_pos_weight=ratio,
        random_state=42,
        n_jobs=-1
    )
    
    print(f"   -> Melatih Model {model_name}...")
    model.fit(X_train_sub, y_train)

    # Prediksi
    y_prob = model.predict_proba(X_test_sub)[:, 1]

    # Menghitung Metrik
    map_score = average_precision_score(y_test, y_prob)
    roc_auc = roc_auc_score(y_test, y_prob)

    return {
        'Model': 'XGBoost',
        'Fitur': model_name,
        'Avg_MAP': map_score,
        'ROC_AUC': roc_auc
    }

# --- 5. EKSEKUSI ABLASI ---
results = []

# Konfigurasi A: Keyword Saja (BM25 + Jaccard)
print("\n--- Uji Konfigurasi A: Keyword Saja ---")
results.append(evaluate_model_xgb(X_train, X_test, y_train, y_test, FEATURES_KEYWORD, "Keyword Saja"))

# Konfigurasi B: SBERT Saja
print("\n--- Uji Konfigurasi B: SBERT Saja ---")
results.append(evaluate_model_xgb(X_train, X_test, y_train, y_test, FEATURES_SBERT, "SBERT Saja"))

# Konfigurasi C: Kombinasi Penuh
print("\n--- Uji Konfigurasi C: Kombinasi Penuh ---")
results.append(evaluate_model_xgb(X_train, X_test, y_train, y_test, FEATURES_COMBINATION, "Kombinasi Penuh"))


# --- 6. DISPLAY HASIL ---
df_results = pd.DataFrame(results)

print("\n\n=======================================================")
print("üèÜ HASIL UJI ABLASI FITUR FINAL (XGBOOST)")
print("=======================================================")

# Format output
df_display = df_results.copy()
df_display['Avg_MAP'] = df_display['Avg_MAP'].apply(lambda x: f"{x:.4f}")
df_display['ROC_AUC'] = df_display['ROC_AUC'].apply(lambda x: f"{x:.4f}")

# Sortir berdasarkan MAP (Paling penting untuk ranking)
df_display = df_display.sort_values(by='Avg_MAP', ascending=False)

# Menggunakan to_string()
print(df_display.to_string(index=False))

# Analisis Final
best_map = df_results['Avg_MAP'].max()
worst_map = df_results['Avg_MAP'].min()

print("\n--- KESIMPULAN STRATEGIS ABLASI ---")
print("1. Kombinasi Penuh: Pemenang Mutlak.")
print(f"2. Kontribusi SBERT Murni (MAP SBERT - MAP Keyword): {df_results[df_results['Fitur'] == 'SBERT Saja']['Avg_MAP'].iloc[0] - df_results[df_results['Fitur'] == 'Keyword Saja']['Avg_MAP'].iloc[0]:.4f}")
print(f"3. Kontribusi Tambahan Kombinasi (MAP Kombinasi - MAP SBERT): {df_results[df_results['Fitur'] == 'Kombinasi Penuh']['Avg_MAP'].iloc[0] - df_results[df_results['Fitur'] == 'SBERT Saja']['Avg_MAP'].iloc[0]:.4f}")

print("\n‚úÖ Hipotesis terkonfirmasi: Kombinasi SBERT dan Keyword/BM25 memberikan sinergi terbaik untuk kinerja ranking.")

‚öôÔ∏è Memuat data dan membersihkan...
‚úÖ Data siap: 170372 baris.

--- Uji Konfigurasi A: Keyword Saja ---
   -> Melatih Model Keyword Saja...

--- Uji Konfigurasi B: SBERT Saja ---
   -> Melatih Model SBERT Saja...

--- Uji Konfigurasi C: Kombinasi Penuh ---
   -> Melatih Model Kombinasi Penuh...


üèÜ HASIL UJI ABLASI FITUR FINAL (XGBOOST)
  Model           Fitur Avg_MAP ROC_AUC
XGBoost Kombinasi Penuh  0.6706  0.8068
XGBoost      SBERT Saja  0.6036  0.7545
XGBoost    Keyword Saja  0.5371  0.7227

--- KESIMPULAN STRATEGIS ABLASI ---
1. Kombinasi Penuh: Pemenang Mutlak.
2. Kontribusi SBERT Murni (MAP SBERT - MAP Keyword): 0.0665
3. Kontribusi Tambahan Kombinasi (MAP Kombinasi - MAP SBERT): 0.0670

‚úÖ Hipotesis terkonfirmasi: Kombinasi SBERT dan Keyword/BM25 memberikan sinergi terbaik untuk kinerja ranking.


In [9]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import average_precision_score
# Import nDCG dari sklearn, yang paling stabil untuk environment ini
from sklearn.metrics import ndcg_score
import os

# --- 1. KONFIGURASI PATHS ---
if 'ROOT_DIR' not in locals():
    ROOT_DIR = os.getcwd()
    if not os.path.exists(os.path.join(ROOT_DIR, 'data')):
        ROOT_DIR = os.path.abspath(os.path.join(os.getcwd(), '..'))

DATA_DIR = os.path.join(ROOT_DIR, 'data', 'processed')
# Menggunakan file dataset yang sudah Anda miliki untuk training
CSV_PATH = os.path.join(DATA_DIR, 'dataset_training_FULL_COMPLETE.csv')

# --- 2. DEFINISI FITUR ---
# Konfigurasi A: Keyword Saja (BM25 + Jaccard)
FEATURES_KEYWORD = ['bm25_score', 'jaccard_score', 'overlap_score'] # Kita pakai overlap juga
# Konfigurasi B: SBERT Saja
FEATURES_SBERT = ['sbert_sim']                     
# Konfigurasi C: Kombinasi Penuh
FEATURES_COMBINATION = ['sbert_sim', 'bm25_score', 'overlap_score', 'jaccard_score'] 
TARGET = 'label'
N_DCG_K = 5 # KITA TETAPKAN K=5

# --- 3. LOAD DATA & PREPROCESSING ---
print("‚öôÔ∏è Memuat data dan membersihkan...")
try:
    df = pd.read_csv(CSV_PATH)
    df.columns = df.columns.str.strip().str.lower()
except FileNotFoundError:
    print(f"‚ùå File tidak ditemukan di: {CSV_PATH}")
    raise

# Konversi semua fitur yang dibutuhkan ke numerik dan drop NaN
all_required_cols = list(set(FEATURES_COMBINATION + [TARGET]))
for col in all_required_cols:
    df[col] = pd.to_numeric(df[col], errors='coerce')

df = df.dropna(subset=all_required_cols)
print(f"‚úÖ Data siap: {len(df)} baris.")

# Split Data (Menggunakan random state untuk hasil yang konsisten)
X = df[FEATURES_COMBINATION] # Gunakan superset fitur
y = df[TARGET]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)


# --- 4. FUNGSI PELATIHAN & EVALUASI XGBOOST ---
def evaluate_model_xgb(X_train, X_test, y_train, y_test, features, model_name):
    
    # Hanya pilih fitur yang relevan untuk konfigurasi ini
    X_train_sub = X_train[features]
    X_test_sub = X_test[features]

    # Hitung Scale Pos Weight
    neg_count = np.sum(y_train == 0)
    pos_count = np.sum(y_train == 1)
    ratio = float(neg_count) / float(pos_count)
    
    # Model XGBoost (Setting optimal)
    model = xgb.XGBClassifier(
        n_estimators=1000,
        learning_rate=0.03,
        max_depth=6,
        objective='binary:logistic',
        eval_metric='logloss',
        scale_pos_weight=ratio,
        random_state=42,
        n_jobs=-1
    )
    
    print(f"   -> Melatih Model {model_name}...")
    model.fit(X_train_sub, y_train)

    # Prediksi
    y_prob = model.predict_proba(X_test_sub)[:, 1]

    # Menghitung Metrik
    
    # 1. MAP (Mean Average Precision)
    map_score = average_precision_score(y_test, y_prob)
    
    # 2. nDCG@K (Membutuhkan y_true dan y_score/y_prob)
    # nDCG harus dihitung pada array 2D
    try:
        ndcg_score_val = ndcg_score(np.asarray([y_test]), np.asarray([y_prob]), k=N_DCG_K)
    except ValueError:
        # Jika hanya ada satu kelas (0 atau 1) di y_test, nDCG tidak bisa dihitung.
        ndcg_score_val = 0.0

    return {
        'Model': 'XGBoost',
        'Fitur': model_name,
        'Avg_MAP': map_score,
        'Avg_nDCG_K': ndcg_score_val
    }

# --- 5. EKSEKUSI ABLASI ---
results = []

# Konfigurasi A: Keyword Saja (BM25 + Jaccard + Overlap)
print("\n--- Uji Konfigurasi A: Keyword Saja ---")
results.append(evaluate_model_xgb(X_train, X_test, y_train, y_test, FEATURES_KEYWORD, "Keyword Saja"))

# Konfigurasi B: SBERT Saja
print("\n--- Uji Konfigurasi B: SBERT Saja ---")
results.append(evaluate_model_xgb(X_train, X_test, y_train, y_test, FEATURES_SBERT, "SBERT Saja"))

# Konfigurasi C: Kombinasi Penuh
print("\n--- Uji Konfigurasi C: Kombinasi Penuh ---")
results.append(evaluate_model_xgb(X_train, X_test, y_train, y_test, FEATURES_COMBINATION, "Kombinasi Penuh"))


# --- 6. DISPLAY HASIL ---
df_results = pd.DataFrame(results)

print("\n\n=======================================================")
print("üèÜ HASIL UJI ABLASI FITUR (XGBOOST)")
print("=======================================================")
print("Metrik: Avg MAP dan Avg nDCG@5")

# Format output
df_display = df_results.copy()
df_display['Avg_MAP'] = df_display['Avg_MAP'].apply(lambda x: f"{x:.4f}")
df_display['Avg_nDCG_K'] = df_display['Avg_nDCG_K'].apply(lambda x: f"{x:.4f}")

# Sortir berdasarkan MAP (Paling penting untuk ranking)
df_display = df_display.sort_values(by='Avg_MAP', ascending=False)

# Menggunakan to_string()
print(df_display.to_string(index=False))

# Analisis Final
sbert_map = df_results[df_results['Fitur'] == 'SBERT Saja']['Avg_MAP'].iloc[0]
keyword_map = df_results[df_results['Fitur'] == 'Keyword Saja']['Avg_MAP'].iloc[0]
combo_map = df_results[df_results['Fitur'] == 'Kombinasi Penuh']['Avg_MAP'].iloc[0]

print("\n--- KESIMPULAN STRATEGIS ABLASI ---")
print(f"Kontribusi Tambahan Kombinasi (MAP Kombinasi - MAP SBERT): {combo_map - sbert_map:.4f}")
print("‚úÖ Kombinasi fitur terkonfirmasi memberikan sinergi terbaik untuk kinerja ranking.")

‚öôÔ∏è Memuat data dan membersihkan...
‚úÖ Data siap: 170372 baris.

--- Uji Konfigurasi A: Keyword Saja ---
   -> Melatih Model Keyword Saja...

--- Uji Konfigurasi B: SBERT Saja ---
   -> Melatih Model SBERT Saja...

--- Uji Konfigurasi C: Kombinasi Penuh ---
   -> Melatih Model Kombinasi Penuh...


üèÜ HASIL UJI ABLASI FITUR (XGBOOST)
Metrik: Avg MAP dan Avg nDCG@5
  Model           Fitur Avg_MAP Avg_nDCG_K
XGBoost Kombinasi Penuh  0.6703     1.0000
XGBoost      SBERT Saja  0.6036     0.9922
XGBoost    Keyword Saja  0.5504     0.8304

--- KESIMPULAN STRATEGIS ABLASI ---
Kontribusi Tambahan Kombinasi (MAP Kombinasi - MAP SBERT): 0.0667
‚úÖ Kombinasi fitur terkonfirmasi memberikan sinergi terbaik untuk kinerja ranking.


In [10]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import average_precision_score, roc_auc_score, recall_score
import os

# --- 1. KONFIGURASI PATHS ---
if 'ROOT_DIR' not in locals():
    ROOT_DIR = os.getcwd()
    if not os.path.exists(os.path.join(ROOT_DIR, 'data')):
        ROOT_DIR = os.path.abspath(os.path.join(os.getcwd(), '..'))

DATA_DIR = os.path.join(ROOT_DIR, 'data', 'processed')
# Menggunakan file dataset yang sudah Anda miliki untuk training
CSV_PATH = os.path.join(DATA_DIR, 'dataset_training_FULL_COMPLETE.csv')

# --- 2. DEFINISI FITUR ---
# Konfigurasi A: Keyword Saja (BM25 + Jaccard + Overlap)
FEATURES_KEYWORD = ['bm25_score', 'jaccard_score', 'overlap_score']
# Konfigurasi B: SBERT Saja
FEATURES_SBERT = ['sbert_sim']                     
# Konfigurasi C: Kombinasi Penuh
FEATURES_COMBINATION = ['sbert_sim', 'bm25_score', 'overlap_score', 'jaccard_score'] 
TARGET = 'label'

# --- 3. LOAD DATA & PREPROCESSING ---
print("‚öôÔ∏è Memuat data dan membersihkan...")
try:
    df = pd.read_csv(CSV_PATH)
    df.columns = df.columns.str.strip().str.lower()
except FileNotFoundError:
    print(f"‚ùå File tidak ditemukan di: {CSV_PATH}")
    raise

# Konversi semua fitur yang dibutuhkan ke numerik dan drop NaN
all_required_cols = list(set(FEATURES_COMBINATION + [TARGET]))
for col in all_required_cols:
    df[col] = pd.to_numeric(df[col], errors='coerce')

df = df.dropna(subset=all_required_cols)
print(f"‚úÖ Data siap: {len(df)} baris.")

# Split Data (Menggunakan random state untuk hasil yang konsisten)
X = df[FEATURES_COMBINATION] # Gunakan superset fitur
y = df[TARGET]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)


# --- 4. FUNGSI PELATIHAN & EVALUASI XGBOOST ---
def evaluate_model_xgb(X_train, X_test, y_train, y_test, features, model_name):
    
    # Hanya pilih fitur yang relevan untuk konfigurasi ini
    X_train_sub = X_train[features]
    X_test_sub = X_test[features]

    # Hitung Scale Pos Weight
    neg_count = np.sum(y_train == 0)
    pos_count = np.sum(y_train == 1)
    ratio = float(neg_count) / float(pos_count)
    
    # Model XGBoost (Setting optimal)
    model = xgb.XGBClassifier(
        n_estimators=500,
        learning_rate=0.05,
        max_depth=6,
        objective='binary:logistic',
        eval_metric='logloss',
        scale_pos_weight=ratio,
        random_state=42,
        n_jobs=-1
    )
    
    print(f"   -> Melatih Model {model_name}...")
    model.fit(X_train_sub, y_train)

    # Prediksi
    y_prob = model.predict_proba(X_test_sub)[:, 1]

    # Menghitung Metrik
    
    # 1. Avg MAP (Mean Average Precision)
    map_score = average_precision_score(y_test, y_prob)
    
    # 2. ROC AUC
    roc_auc = roc_auc_score(y_test, y_prob)

    return {
        'Model': 'XGBoost',
        'Fitur': model_name,
        'Avg_MAP': map_score,
        'ROC_AUC': roc_auc
    }

# --- 5. EKSEKUSI ABLASI ---
results = []

# Konfigurasi A: Keyword Saja (BM25 + Jaccard + Overlap)
print("\n--- Uji Konfigurasi A: Keyword Saja ---")
results.append(evaluate_model_xgb(X_train, X_test, y_train, y_test, FEATURES_KEYWORD, "Keyword Saja"))

# Konfigurasi B: SBERT Saja
print("\n--- Uji Konfigurasi B: SBERT Saja ---")
results.append(evaluate_model_xgb(X_train, X_test, y_train, y_test, FEATURES_SBERT, "SBERT Saja"))

# Konfigurasi C: Kombinasi Penuh
print("\n--- Uji Konfigurasi C: Kombinasi Penuh ---")
results.append(evaluate_model_xgb(X_train, X_test, y_train, y_test, FEATURES_COMBINATION, "Kombinasi Penuh"))


# --- 6. DISPLAY HASIL ---
df_results = pd.DataFrame(results)

print("\n\n=======================================================")
print("üèÜ HASIL UJI ABLASI FITUR (XGBOOST)")
print("=======================================================")
print("Metrik: Avg MAP dan ROC AUC")

# Format output
df_display = df_results.copy()
df_display['Avg_MAP'] = df_display['Avg_MAP'].apply(lambda x: f"{x:.4f}")
df_display['ROC_AUC'] = df_display['ROC_AUC'].apply(lambda x: f"{x:.4f}")

# Sortir berdasarkan MAP (Paling penting untuk ranking)
df_display = df_display.sort_values(by='Avg_MAP', ascending=False)

# Menggunakan to_string()
print(df_display.to_string(index=False))

# Analisis Final
sbert_map = df_results[df_results['Fitur'] == 'SBERT Saja']['Avg_MAP'].iloc[0]
keyword_map = df_results[df_results['Fitur'] == 'Keyword Saja']['Avg_MAP'].iloc[0]
combo_map = df_results[df_results['Fitur'] == 'Kombinasi Penuh']['Avg_MAP'].iloc[0]

print("\n--- KESIMPULAN STRATEGIS ABLASI ---")
print(f"1. Peningkatan Kinerja Ranking (MAP): {combo_map - sbert_map:.4f} poin (dari SBERT ke Kombinasi)")
print("2. Hipotesis terkonfirmasi: Kombinasi SBERT dan Keyword/BM25 memberikan sinergi terbaik.")

‚öôÔ∏è Memuat data dan membersihkan...
‚úÖ Data siap: 170372 baris.

--- Uji Konfigurasi A: Keyword Saja ---
   -> Melatih Model Keyword Saja...

--- Uji Konfigurasi B: SBERT Saja ---
   -> Melatih Model SBERT Saja...

--- Uji Konfigurasi C: Kombinasi Penuh ---
   -> Melatih Model Kombinasi Penuh...


üèÜ HASIL UJI ABLASI FITUR (XGBOOST)
Metrik: Avg MAP dan ROC AUC
  Model           Fitur Avg_MAP ROC_AUC
XGBoost Kombinasi Penuh  0.6706  0.8068
XGBoost      SBERT Saja  0.6036  0.7545
XGBoost    Keyword Saja  0.5509  0.7287

--- KESIMPULAN STRATEGIS ABLASI ---
1. Peningkatan Kinerja Ranking (MAP): 0.0670 poin (dari SBERT ke Kombinasi)
2. Hipotesis terkonfirmasi: Kombinasi SBERT dan Keyword/BM25 memberikan sinergi terbaik.
