# **QUERY PROCESSING FOR INFORMATION RETRIEVAL**

---
---
# **IMPORT LIBRARIES**
---
---

In [1]:
# Libraries akan diimport pada setial cell

---
---
# **IMPORT DATASET**
---
---

## **1. Dataset Final**

**cluster_df = RESULT CLUSTERING OF AL-QUR'AN DATAFRAME**

In [2]:
import pandas as pd
cluster_df = pd.read_csv("Result\quran_finalclustered.csv")

  cluster_df = pd.read_csv("Result\quran_finalclustered.csv")


## **2. Data Ground Truth**

**groundtruthraw_df = GROUND TRUTH DATAFRAME FROM WEB SCRAPPING**

In [3]:
import pandas as pd
groundtruthraw_df = pd.read_csv("Dataset\quran_groundtruth.csv")

  groundtruthraw_df = pd.read_csv("Dataset\quran_groundtruth.csv")


---
---
# **IMPORT EMBEDDING & MODEL**
---
---

## **1. Embedding**

In [4]:
import numpy as np
from gensim.models import Word2Vec
import joblib

# Word2Vec
w2v_embeddings = np.load("Result/Embedding/w2v_embeddings.npy")
w2v_model = Word2Vec.load("Result/Embedding/w2v_model.model")

# BERT
bert_embeddings = np.load("Result/Embedding/bert_embeddings.npy")

# TF-IDF
tfidf_vectorizer = joblib.load("Result/Embedding/tfidf_vectorizer.pkl")
tfidf_embeddings = joblib.load("Result/Embedding/tfidf_embeddings.pkl")

# TF-IDF LDA PCA
tfidfldapca_vectorizer = joblib.load("Result/Embedding/tfidfldapca_vectorizer.pkl")
tfidfldapca_embeddings = joblib.load("Result/Embedding/tfidfldapca_embeddings.pkl")
lda_model = joblib.load("Result/Embedding/lda_model.pkl")
pca_model = joblib.load("Result/Embedding/pca_model.pkl")

## **2. Model**

In [5]:
import joblib

# K-means Models
kmeans_w2v = joblib.load("Result/Model/kmeans_w2v.pkl")
kmeans_bert = joblib.load("Result/Model/kmeans_bert.pkl")
kmeans_tfidf = joblib.load("Result/Model/kmeans_tfidf.pkl")
kmeans_tfidfldapca = joblib.load("Result/Model/kmeans_tfidfldapca.pkl")

# AHC Models
ahc_w2v = joblib.load("Result/Model/ahc_w2v.pkl")
ahc_bert = joblib.load("Result/Model/ahc_bert.pkl")
ahc_tfidf = joblib.load("Result/Model/ahc_tfidf.pkl")
ahc_tfidfldapca = joblib.load("Result/Model/ahc_tfidfldapca.pkl")

# DBSCAN Models
dbscan_w2v = joblib.load("Result/Model/dbscan_w2v.pkl")
dbscan_bert = joblib.load("Result/Model/dbscan_bert.pkl")
dbscan_tfidf = joblib.load("Result/Model/dbscan_tfidf.pkl")
dbscan_tfidfldapca = joblib.load("Result/Model/dbscan_tfidfldapca.pkl")

---
---
# **PREPROCESSING GROUND TRUTH**
---
---

**groundtruth_df = PREPROCESSED groundtruthraw_df**

In [6]:
# Prapemrosesan Data Groundtruth
import pandas as pd
import re
from unidecode import unidecode

groundtruth_df = groundtruthraw_df.copy()

groundtruth_df["Surah"] = groundtruth_df["Surah"].apply(
    lambda x: unidecode(x).replace("'", "").replace("’", "").strip()
)

surah_names = [ 
    "Al-Fatiha", "Al-Baqarah", "Al-Imran", "An-Nisa", "Al-Ma'idah", "Al-Anam", "Al-Araf", "Al-Anfal",
    "At-Taubah", "Yunus", "Hud", "Yusuf", "Ar-Ra'd", "Ibrahim", "Al-Hijr", "An-Nahl", "Al-Isra", "Al-Kahf",
    "Maryam", "Ta-Ha", "Al-Anbiya", "Al-Hajj", "Al-Muminun", "An-Nur", "Al-Furqan", "Ash-Shuara",
    "An-Naml", "Al-Qasas", "Al-Ankabut", "Ar-Rum", "Luqman", "As-Sajdah", "Al-Ahzab", "Saba", "Fatir",
    "Ya-Sin", "As-Saffat", "Sad", "Az-Zumar", "Ghafir", "Fussilat", "Ash-Shura", "Az-Zukhruf", "Ad-Dukhan",
    "Al-Jathiyah", "Al-Ahqaf", "Muhammad", "Al-Fath", "Al-Hujurat", "Qaf", "Adh-Dhariyat", "At-Tur",
    "An-Najm", "Al-Qamar", "Ar-Rahman", "Al-Waqiah", "Al-Hadid", "Al-Mujadila", "Al-Hashr", "Al-Mumtahanah",
    "As-Saff", "Al-Jumu'ah", "Al-Munafiqun", "At-Taghabun", "At-Talaq", "At-Tahrim", "Al-Mulk", "Al-Qalam",
    "Al-Haqqah", "Al-Maarij", "Nuh", "Al-Jinn", "Al-Muzzammil", "Al-Muddaththir", "Al-Qiyamah", "Al-Insan",
    "Al-Mursalat", "An-Naba", "An-Naziat", "Abasa", "At-Takwir", "Al-Infitar", "Al-Mutaffifin", "Al-Inshiqaq",
    "Al-Buruj", "At-Tariq", "Al-Ala", "Al-Ghashiyah", "Al-Fajr", "Al-Balad", "Ash-Shams", "Al-Lail",
    "Ad-Duhaa", "Ash-Sharh", "At-Tin", "Al-Alaq", "Al-Qadr", "Al-Bayyina", "Az-Zalzalah", "Al-Adiyat",
    "Al-Qariah", "At-Takathur", "Al-Asr", "Al-Humazah", "Al-Fil", "Quraysh", "Al-Maun", "Al-Kawthar",
    "Al-Kafirun", "An-Nasr", "Al-Masad", "Al-Ikhlas", "Al-Falaq", "An-Nas"
]

normalized_surah_names = [unidecode(s).replace("'", "").replace("’", "").strip() for s in surah_names]
surah_mapping = {name: idx + 1 for idx, name in enumerate(normalized_surah_names)}

groundtruth_df["SurahNo"] = groundtruth_df["Surah"].map(surah_mapping)

list_to_query = {
    "guidance": "petunjuk",
    "haram-and-forbidden": "haram dan larangan",
    "zina": "zina",
    "qiyamah": "hari kiamat",
    "shaitan": "setan",
    "prophet-ibrahim": "nabi ibrahim",
    "prophet-musa": "nabi musa",
    "jinn": "jin",
    "jahannam": "neraka",
    "jannah": "surga",
    "jesus-isa": "nabi isa",
    "zakat": "zakat",
    "hypocrites": "orang munafik",
    "rizq": "rezeki dari Allah",
    "tawakkul": "tawakal kepada allah"
}

groundtruth_df["UserQuery"] = groundtruth_df["List"].map(list_to_query)
groundtruth_df.rename(columns={"Ayah": "AyahNo"}, inplace=True)
groundtruth_df

Unnamed: 0,List,Surah,AyahNo,SurahNo,UserQuery
0,guidance,Al-Fatiha,6,1,petunjuk
1,guidance,Al-Fatiha,7,1,petunjuk
2,guidance,Al-Baqarah,2,2,petunjuk
3,guidance,Al-Baqarah,5,2,petunjuk
4,guidance,Al-Baqarah,16,2,petunjuk
...,...,...,...,...,...
1296,tawakkul,Al-Mumtahanah,4,60,tawakal kepada allah
1297,tawakkul,At-Taghabun,13,64,tawakal kepada allah
1298,tawakkul,At-Talaq,3,65,tawakal kepada allah
1299,tawakkul,Al-Mulk,29,67,tawakal kepada allah


In [7]:
# Expand Rentang AyahNo (Dari Data "2-5" Ke [2, 3, 4, 5])
import pandas as pd

def expand_ayah_ranges(row):
    surah = row['SurahNo']
    query = row['UserQuery']
    ayah_entry = str(row['AyahNo']).strip()
    
    if ayah_entry == '':
        return []

    if '-' in ayah_entry:
        try:
            start, end = map(int, ayah_entry.split('-'))
            return [{'UserQuery': query, 'SurahNo': surah, 'AyahNo': i} for i in range(start, end + 1)]
        except:
            return []
    else:
        try:
            return [{'UserQuery': query, 'SurahNo': surah, 'AyahNo': int(ayah_entry)}]
        except:
            return []

expanded_rows = []
for _, row in groundtruth_df.iterrows():
    expanded_rows.extend(expand_ayah_ranges(row))

groundtruth_df = pd.DataFrame(expanded_rows)

**readytoevaluate_df = KOMBINASI ANTARA groundtruth_df DAN cluster_df DENGAN NILAI YANG SAMA DARI KOLOM 'AyahNo' DAN 'SurahNO'**

In [8]:
import pandas as pd

readytoevaluate_df = cluster_df.copy()

for query in groundtruth_df['UserQuery'].unique():
    subset = groundtruth_df[groundtruth_df['UserQuery'] == query]
    valid_pairs = set(zip(subset['SurahNo'], subset['AyahNo']))
    readytoevaluate_df[query] = readytoevaluate_df.apply(lambda row: 1 if (row['SurahNo'], row['AyahNo']) in valid_pairs else 0, axis=1)

---
---
# **PREPROCESSING USER QUERY**
---
---

## **1. Preprocessing Query**

In [9]:
# Contoh Kueri
query = "rezeki dari Allah"

In [10]:
# Prapemrosesan Kueri
import re
from bs4 import BeautifulSoup
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

stop_words = set(stopwords.words('indonesian'))
factory = StemmerFactory()
stemmer = factory.create_stemmer()

def preprocess_query(text):
    if not isinstance(text, str):
        return ''
    text = text.lower()
    text = BeautifulSoup(text, "html.parser").get_text()
    text = re.sub(r'\d+', '', text)
    text = re.sub(r"[-]", ' ', text)
    text = re.sub(r"[^\w\s']", '', text)
    tokens = word_tokenize(text)
    tokens = [word for word in tokens if word not in stop_words]
    tokens_clean = [stemmer.stem(word) for word in tokens]
    return ' '.join(tokens_clean).strip()

processed_query = preprocess_query(query)
tokens = processed_query.split()
print("Processed Query:", processed_query)

Processed Query: rezeki allah


## **2. Embedding Query**

In [11]:
# Ekstraksi Embedding untuk Kueri
from gensim.models import Word2Vec
import numpy as np
import joblib
from transformers import AutoTokenizer, AutoModel
import torch

# Word2Vec Embedding
def get_w2v_query_embedding(tokens, model, vector_size=200):
    valid_vectors = [model.wv[word] for word in tokens if word in model.wv]
    if not valid_vectors:
        return np.zeros(vector_size)
    return np.mean(valid_vectors, axis=0)

w2v_query_vector = get_w2v_query_embedding(tokens, w2v_model)

# BERT Embedding
tokenizer = AutoTokenizer.from_pretrained("indobenchmark/indobert-base-p1")
bert_model = AutoModel.from_pretrained("indobenchmark/indobert-base-p1")

def get_bert_embedding(text, tokenizer, model):
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=128)
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state[0][0].numpy()

bert_query_vector = get_bert_embedding(processed_query, tokenizer, bert_model).astype(np.float64)

# TF-IDF Group Embedding
tfidf_query_vectors = tfidf_vectorizer.transform([processed_query]).toarray()[0]
tfidf_query_vector = tfidfldapca_vectorizer.transform([processed_query]).toarray()[0]
lda_query_vector = lda_model.transform(tfidf_query_vector.reshape(1, -1))
combined_query_vector = np.hstack([tfidf_query_vector.reshape(1, -1), lda_query_vector])
query_vector_for_cluster = pca_model.transform(combined_query_vector)[0]



---
---
# **COSINE SIMILARITY**
---
---

**final_df = DATAFRAME FINAL YANG AKAN DIHASILKAN DI SITUS WEB**

In [12]:
# Cosine Similarity untuk Pengurutan Ayat
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity

def search_clustered_quran(query_vector, method_name, embedding_name, cluster_column, model, embeddings, raw_embeddings, query_vector_for_similarity=None, top_n=10):
    if hasattr(model, "predict"):
        query_cluster = model.predict([query_vector])[0]
    else:
        all_similarities = cosine_similarity([query_vector], embeddings)[0]
        most_similar_idx = np.argmax(all_similarities)
        query_cluster = readytoevaluate_df.loc[most_similar_idx, cluster_column]

    filtered_df = readytoevaluate_df[readytoevaluate_df[cluster_column] == query_cluster]
    filtered_embeddings = raw_embeddings[filtered_df.index]

    similarity_query_vector = query_vector_for_similarity if query_vector_for_similarity is not None else query_vector
    similarities = cosine_similarity([similarity_query_vector], filtered_embeddings)[0]

    filtered_df = filtered_df.copy()
    filtered_df['Similarity'] = similarities

    top_results = filtered_df.sort_values(by="Similarity", ascending=False).head(25)

    top_results['Method'] = method_name
    top_results['Embedding'] = embedding_name
    top_results['Cluster'] = query_cluster

    columns = ['Method', 'Embedding', 'Cluster', 'SurahNo', 'AyahNo', 'IndonesianText', 'Similarity']
    if query in top_results.columns:
        columns.append(query)

    return top_results[columns]

# Combinations (method_name, 'embedding", cluster_column, model, embeddings (for predict cluster), query_vector (for predict cluster), raw_embeddings (for similarity), query_vector_for_similarity (for similarity))
combinations = [
    ("KMeans",  "W2V",                  "Cluster_W2V_KMEANS",           kmeans_w2v,         w2v_embeddings,         w2v_query_vector,            w2v_embeddings,            None),
    ("KMeans",  "BERT",                 "Cluster_BERT_KMEANS",          kmeans_bert,        bert_embeddings,        bert_query_vector,           bert_embeddings,           None),
    ("KMeans",  "TF-IDF",               "Cluster_TFIDF_KMEANS",         kmeans_tfidf,       tfidf_embeddings,       tfidf_query_vectors,         tfidf_embeddings,          None),
    ("KMeans",  "TF-IDF + LDA + PCA",   "Cluster_TFIDFLDAPCA_KMEANS",   kmeans_tfidfldapca, tfidfldapca_embeddings, query_vector_for_cluster,    tfidfldapca_embeddings,    None),
    ("KMeans",  "Hybrid TF-IDF",        "Cluster_TFIDFLDAPCA_KMEANS",   kmeans_tfidfldapca, tfidfldapca_embeddings, query_vector_for_cluster,    tfidf_embeddings,          tfidf_query_vectors),

    ("AHC",     "W2V",                  "Cluster_W2V_AHC",              ahc_w2v,            w2v_embeddings,         w2v_query_vector,            w2v_embeddings,            None),
    ("AHC",     "BERT",                 "Cluster_BERT_AHC",             ahc_bert,           bert_embeddings,        bert_query_vector,           bert_embeddings,           None),
    ("AHC",     "TF-IDF",               "Cluster_TFIDF_AHC",            ahc_tfidf,          tfidf_embeddings,       tfidf_query_vectors,         tfidf_embeddings,          None),
    ("AHC",     "TF-IDF + LDA + PCA",   "Cluster_TFIDFLDAPCA_AHC",      ahc_tfidfldapca,    tfidfldapca_embeddings, query_vector_for_cluster,    tfidfldapca_embeddings,    None),
    ("AHC",     "Hybrid TF-IDF",        "Cluster_TFIDFLDAPCA_AHC",      ahc_tfidfldapca,    tfidfldapca_embeddings, query_vector_for_cluster,    tfidf_embeddings,          tfidf_query_vectors),

    ("DBSCAN",  "W2V",                  "Cluster_W2V_DBSCAN",           dbscan_w2v,         w2v_embeddings,         w2v_query_vector,            w2v_embeddings,            None),
    ("DBSCAN",  "BERT",                 "Cluster_BERT_DBSCAN",          dbscan_bert,        bert_embeddings,        bert_query_vector,           bert_embeddings,           None),
    ("DBSCAN",  "TF-IDF",                "Cluster_TFIDF_DBSCAN",         dbscan_tfidf,       tfidf_embeddings,       tfidf_query_vectors,         tfidf_embeddings,          None),
    ("DBSCAN",  "TF-IDF + LDA + PCA",   "Cluster_TFIDFLDAPCA_DBSCAN",   dbscan_tfidfldapca, tfidfldapca_embeddings, query_vector_for_cluster,    tfidfldapca_embeddings,    None),
    ("DBSCAN",  "Hybrid TFIDF",         "Cluster_TFIDFLDAPCA_DBSCAN",   dbscan_tfidfldapca, tfidfldapca_embeddings, query_vector_for_cluster,    tfidf_embeddings,          tfidf_query_vectors),
]

all_results = []

for method_name, embedding_name, cluster_column, model, embeddings, query_vector, raw_embeddings, query_vector_for_similarity in combinations:
    try:
        result = search_clustered_quran(
            query_vector=query_vector,
            method_name=method_name,
            embedding_name=embedding_name,
            cluster_column=cluster_column,
            model=model,
            embeddings=embeddings,
            raw_embeddings=raw_embeddings,
            query_vector_for_similarity=query_vector_for_similarity
        )
        all_results.append(result)
    except Exception as e:
        print(f"❌ Error in {method_name}: {e}")

final_df = pd.concat(all_results, ignore_index=True)
final_df

Unnamed: 0,Method,Embedding,Cluster,SurahNo,AyahNo,IndonesianText,Similarity,rezeki dari Allah
0,KMeans,W2V,3,65,3,dan menganugerahkan kepadanya rezeki dari arah...,0.999771,1
1,KMeans,W2V,3,2,245,Siapakah yang mau memberi pinjaman yang baik k...,0.999675,0
2,KMeans,W2V,3,28,82,Orang-orang yang kemarin mengangan-angankan ke...,0.999648,1
3,KMeans,W2V,3,4,172,Almasih tidak akan pernah enggan menjadi hamba...,0.999642,0
4,KMeans,W2V,3,29,65,"Apabila naik ke dalam bahtera, mereka berdoa k...",0.999634,0
...,...,...,...,...,...,...,...,...
370,DBSCAN,Hybrid TFIDF,1,16,67,"Dari buah kurma dan anggur, kamu membuat minum...",0.274029,1
371,DBSCAN,Hybrid TFIDF,1,2,57,Kami menaungi kamu dengan awan dan Kami menuru...,0.270720,1
372,DBSCAN,Hybrid TFIDF,1,19,62,Di dalamnya mereka tidak mendengar perkataan y...,0.260259,1
373,DBSCAN,Hybrid TFIDF,1,17,31,Janganlah kamu membunuh anak-anakmu karena tak...,0.258150,1


---
---
# **EVALUATE PRECISION & RECALL**
---
---

## **1. Building Function**

In [13]:
# Prapemrosesan Kueri
def preprocess_query(text):
    if not isinstance(text, str):
        return ''
    text = text.lower()
    text = BeautifulSoup(text, "html.parser").get_text()
    text = re.sub(r'\d+', '', text)
    text = re.sub(r"[-]", ' ', text)
    text = re.sub(r"[^\w\s']", '', text)
    tokens = word_tokenize(text)
    tokens = [word for word in tokens if word not in stop_words]
    tokens_clean = [stemmer.stem(word) for word in tokens]
    return ' '.join(tokens_clean).strip()

# Fungsi Word2Vec Embedding
def get_w2v_query_embedding(tokens, model, vector_size=200):
    valid_vectors = [model.wv[word] for word in tokens if word in model.wv]
    if not valid_vectors:
        return np.zeros(vector_size)
    return np.mean(valid_vectors, axis=0)

# Fungsi BERT Embedding
def get_bert_embedding(text, tokenizer, model):
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=128)
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state[0][0].numpy().astype(np.float64)

# Fungsi TF-IDF Group Embedding
def get_tfidf_query_embedding(text, tfidf_vectorizer, tfidfldapca_vectorizer, lda_model, pca_model):
    tfidf_query_vectors = tfidf_vectorizer.transform([text]).toarray()[0]
    tfidf_query_vector = tfidfldapca_vectorizer.transform([text]).toarray()[0]
    lda_query_vector = lda_model.transform(tfidf_query_vector.reshape(1, -1))
    combined_query_vector = np.hstack([tfidf_query_vector.reshape(1, -1), lda_query_vector])
    return pca_model.transform(combined_query_vector)[0], tfidf_query_vectors

# Fungsi untuk Mencari Ayat Quran Berdasarkan Kueri
def search_clustered_quran(query_vector, method_name, embedding_name, cluster_column, model, embeddings, raw_embeddings, query_vector_for_similarity=None, top_n=20):
    if hasattr(model, "predict"):
        query_cluster = model.predict([query_vector])[0]
    else:
        all_similarities = cosine_similarity([query_vector], embeddings)[0]
        most_similar_idx = np.argmax(all_similarities)
        query_cluster = cluster_df.loc[most_similar_idx, cluster_column]

    filtered_df = cluster_df[cluster_df[cluster_column] == query_cluster]
    filtered_embeddings = raw_embeddings[filtered_df.index]

    similarity_query_vector = query_vector_for_similarity if query_vector_for_similarity is not None else query_vector
    similarities = cosine_similarity([similarity_query_vector], filtered_embeddings)[0]

    filtered_df = filtered_df.copy()
    filtered_df['Similarity'] = similarities

    top_results = filtered_df.sort_values(by="Similarity", ascending=False).head(top_n)

    top_results['Method'] = method_name
    top_results['Embedding'] = embedding_name
    top_results['Cluster'] = query_cluster

    return top_results[['Method', 'Cluster', 'SurahNo', 'AyahNo', 'IndonesianText', 'Similarity']]

# Fungsi untuk Evaluasi
def evaluate_metrics(top_results, readytoevaluate_df, query, method_name, embedding_name, cluster_column, top_n_values=[5, 10, 20]):
    metrics = {'Method': method_name, 'Embedding': embedding_name, 'Query': query}
    
    ground_truth = readytoevaluate_df[readytoevaluate_df[query] == 1][['SurahNo', 'AyahNo', cluster_column]]
    relevant_pairs = set(zip(ground_truth['SurahNo'], ground_truth['AyahNo']))
    total_relevant = len(relevant_pairs)

    for top_n in top_n_values:

        top_n_results = top_results.head(top_n)
        retrieved_pairs = set(zip(top_n_results['SurahNo'], top_n_results['AyahNo']))
        
        # Precision
        relevant_retrieved = len(relevant_pairs.intersection(retrieved_pairs))
        precision = round(relevant_retrieved / top_n, 4) if top_n > 0 else 0
        metrics[f'Precision@{top_n}'] = precision

    # Recall
    predicted_cluster = top_results['Cluster'].iloc[0]
    cluster_items = readytoevaluate_df[readytoevaluate_df[cluster_column] == predicted_cluster]
    cluster_pairs = set(zip(cluster_items['SurahNo'], cluster_items['AyahNo']))
    correct_pairs = cluster_pairs.intersection(relevant_pairs)
    recall_all = round(len(correct_pairs) / total_relevant, 4) if total_relevant > 0 else 0
    metrics['Recall@all'] = recall_all

    return metrics

## **2. Recall and Precision Model Based**

In [14]:
combinations = [
    ("KMeans", "W2V", "Cluster_W2V_KMEANS", kmeans_w2v, w2v_embeddings, w2v_embeddings, None),
    ("KMeans", "BERT", "Cluster_BERT_KMEANS", kmeans_bert, bert_embeddings, bert_embeddings, None),
    ("KMeans", "TF-IDF", "Cluster_TFIDF_KMEANS", kmeans_tfidf, tfidf_embeddings, tfidf_embeddings, None),
    ("KMeans", "TF-IDF + LDA + PCA", "Cluster_TFIDFLDAPCA_KMEANS", kmeans_tfidfldapca, tfidfldapca_embeddings, tfidfldapca_embeddings, None),
    ("KMeans", "Hybrid TF-IDF", "Cluster_TFIDFLDAPCA_KMEANS", kmeans_tfidfldapca, tfidfldapca_embeddings, tfidf_embeddings, None),

    ("AHC", "W2V", "Cluster_W2V_AHC", ahc_w2v, w2v_embeddings, w2v_embeddings, None),
    ("AHC", "BERT", "Cluster_BERT_AHC", ahc_bert, bert_embeddings, bert_embeddings, None),
    ("AHC", "TF-IDF", "Cluster_TFIDF_AHC", ahc_tfidf, tfidf_embeddings, tfidf_embeddings, None),
    ("AHC", "TF-IDF + LDA + PCA", "Cluster_TFIDFLDAPCA_AHC", ahc_tfidfldapca, tfidfldapca_embeddings, tfidfldapca_embeddings, None),
    ("AHC", "Hybrid TF-IDF", "Cluster_TFIDFLDAPCA_AHC", ahc_tfidfldapca, tfidfldapca_embeddings, tfidf_embeddings, None),
    
    ("DBSCAN", "W2V", "Cluster_W2V_DBSCAN", dbscan_w2v, w2v_embeddings, w2v_embeddings, None),
    ("DBSCAN", "BERT", "Cluster_BERT_DBSCAN", dbscan_bert, bert_embeddings, bert_embeddings, None),
    ("DBSCAN", "TF-IDF", "Cluster_TFIDF_DBSCAN", dbscan_tfidf, tfidf_embeddings, tfidf_embeddings, None),
    ("DBSCAN", "TF-IDF + LDA + PCA", "Cluster_TFIDFLDAPCA_DBSCAN", dbscan_tfidfldapca, tfidfldapca_embeddings, tfidfldapca_embeddings, None),
    ("DBSCAN", "Hybrid TF-IDF", "Cluster_TFIDFLDAPCA_DBSCAN", dbscan_tfidfldapca, tfidfldapca_embeddings, tfidf_embeddings, None),
]

unique_queries = groundtruth_df['UserQuery'].unique()
all_metrics = []

for query in unique_queries:
    # Prapemrosesan Kueri
    processed_query = preprocess_query(query)
    tokens = processed_query.split()

    # Ekstraksi Embedding untuk Kueri
    w2v_query_vector = get_w2v_query_embedding(tokens, w2v_model)
    bert_query_vector = get_bert_embedding(processed_query, tokenizer, bert_model)
    tfidf_query_vector, tfidf_query_vectors = get_tfidf_query_embedding(
        processed_query, tfidf_vectorizer, tfidfldapca_vectorizer, lda_model, pca_model
    )

    # Siapkan Vektor Kueri untuk Pencarian
    query_vectors = {
        "W2V": w2v_query_vector,
        "BERT": bert_query_vector,
        "TF-IDF": tfidf_query_vectors,
        "TF-IDF + LDA + PCA": tfidf_query_vector,
        "Hybrid TF-IDF": tfidf_query_vector
    }
    similarity_vectors = {
        "W2V": w2v_query_vector,
        "BERT": bert_query_vector,
        "TF-IDF": tfidf_query_vectors,
        "TF-IDF + LDA + PCA": tfidf_query_vector,
        "Hybrid TF-IDF": tfidf_query_vectors
    }

    for method_name, embedding_name, cluster_column, model, embeddings, raw_embeddings, _ in combinations:
        try:
            embedding_type = embedding_name
            query_vector = query_vectors[embedding_type]
            query_vector_for_similarity = similarity_vectors[embedding_type]

            top_results = search_clustered_quran(
                query_vector=query_vector,
                method_name=method_name,
                embedding_name=embedding_name,
                cluster_column=cluster_column,
                model=model,
                embeddings=embeddings,
                raw_embeddings=raw_embeddings,
                query_vector_for_similarity=query_vector_for_similarity,
                top_n=20
            )

            metrics = evaluate_metrics(top_results, readytoevaluate_df, query, method_name, embedding_name, cluster_column)
            all_metrics.append(metrics)

        except Exception as e:
            print(f"Error in {method_name} for query '{query}': {e}")

evaluationir_df = pd.DataFrame(all_metrics)
evaluationir_df

Unnamed: 0,Method,Embedding,Query,Precision@5,Precision@10,Precision@20,Recall@all
0,KMeans,W2V,petunjuk,0.6,0.4,0.50,0.7024
1,KMeans,BERT,petunjuk,0.2,0.1,0.05,0.0952
2,KMeans,TF-IDF,petunjuk,0.6,0.6,0.35,0.6548
3,KMeans,TF-IDF + LDA + PCA,petunjuk,0.2,0.2,0.10,0.3214
4,KMeans,Hybrid TF-IDF,petunjuk,0.6,0.6,0.45,0.3214
...,...,...,...,...,...,...,...
220,DBSCAN,W2V,tawakal kepada allah,1.0,0.9,0.75,0.9767
221,DBSCAN,BERT,tawakal kepada allah,0.0,0.1,0.15,0.8837
222,DBSCAN,TF-IDF,tawakal kepada allah,1.0,1.0,1.00,0.9767
223,DBSCAN,TF-IDF + LDA + PCA,tawakal kepada allah,0.0,0.0,0.05,0.5581


In [15]:
average_evaluation = evaluationir_df.groupby(['Method', 'Embedding'])[['Precision@5', 'Precision@10', 'Precision@20', 'Recall@all']].mean().round(4).reset_index()
average_evaluation

Unnamed: 0,Method,Embedding,Precision@5,Precision@10,Precision@20,Recall@all
0,AHC,BERT,0.0933,0.1267,0.07,0.0987
1,AHC,Hybrid TF-IDF,0.6533,0.62,0.5467,0.3421
2,AHC,TF-IDF,0.6933,0.64,0.6233,0.8525
3,AHC,TF-IDF + LDA + PCA,0.16,0.1467,0.0967,0.3421
4,AHC,W2V,0.6267,0.5867,0.4967,0.6077
5,DBSCAN,BERT,0.1067,0.1267,0.07,0.7609
6,DBSCAN,Hybrid TF-IDF,0.72,0.6467,0.5433,0.3713
7,DBSCAN,TF-IDF,0.7067,0.66,0.62,0.9956
8,DBSCAN,TF-IDF + LDA + PCA,0.1467,0.1533,0.0967,0.3713
9,DBSCAN,W2V,0.6933,0.6133,0.5333,0.9956


## **3. Recall and Precision Keywords Based**

In [16]:
keyword_queries = [
    "zakat", "haram dan larangan", "zina", "setan", "jin", "orang munafik", "rezeki dari Allah", 
    "tawakal kepada allah", "petunjuk", "nabi musa", "nabi isa", "nabi ibrahim", "hari kiamat", "surga", "neraka"
    ]

all_keyword_metrics = []

for query in keyword_queries:
    try:
        
        keyword = query.lower()

        filtered_df = cluster_df[cluster_df['IndonesianText'].str.lower().str.contains(keyword)].copy()
        filtered_df['Method'] = 'Keyword-Based'

        metrics = {'Method': 'Keyword-Based', 'Query': query}

        ground_truth = readytoevaluate_df[readytoevaluate_df[query] == 1][['SurahNo', 'AyahNo']]
        relevant_pairs = set(zip(ground_truth['SurahNo'], ground_truth['AyahNo']))
        total_relevant = len(relevant_pairs)

        retrieved_pairs_all = list(zip(filtered_df['SurahNo'], filtered_df['AyahNo']))
        retrieved_pairs_set = set(retrieved_pairs_all)

        relevant_retrieved_all = len(relevant_pairs.intersection(retrieved_pairs_set))
        precision_all = round(relevant_retrieved_all / len(retrieved_pairs_all), 4) if len(retrieved_pairs_all) > 0 else 0
        recall_all = round(relevant_retrieved_all / total_relevant, 4) if total_relevant > 0 else 0

        metrics['Banyak Ditemukan'] = len(retrieved_pairs_all)

        for n in [5, 10, 20]:
            top_n = retrieved_pairs_all[:n]
            relevant_top_n = len(set(top_n).intersection(relevant_pairs))
            precision_at_n = round(relevant_top_n / n, 4)
            metrics[f'Precision@{n}'] = precision_at_n

        metrics['Precision@all'] = precision_all
        metrics['Recall@all'] = recall_all

        all_keyword_metrics.append(metrics)

    except Exception as e:
        print(f"Error in KeywordBased for query '{query}': {e}")

keyword_evaluation_df = pd.DataFrame(all_keyword_metrics)
keyword_evaluation_df

Unnamed: 0,Method,Query,Banyak Ditemukan,Precision@5,Precision@10,Precision@20,Precision@all,Recall@all
0,Keyword-Based,zakat,35,1.0,1.0,0.9,0.9143,0.5079
1,Keyword-Based,haram dan larangan,0,0.0,0.0,0.0,0.0,0.0
2,Keyword-Based,zina,12,0.6,0.5,0.35,0.5833,0.5
3,Keyword-Based,setan,107,0.8,0.9,0.9,0.757,0.81
4,Keyword-Based,jin,86,0.6,0.6,0.55,0.3605,1.0
5,Keyword-Based,orang munafik,48,0.2,0.5,0.5,0.5417,0.8667
6,Keyword-Based,rezeki dari Allah,0,0.0,0.0,0.0,0.0,0.0
7,Keyword-Based,tawakal kepada allah,4,0.8,0.4,0.2,1.0,0.093
8,Keyword-Based,petunjuk,234,0.8,0.6,0.45,0.3333,0.9286
9,Keyword-Based,nabi musa,1,0.0,0.0,0.0,0.0,0.0


In [17]:
average_keywordEvaluation = keyword_evaluation_df.groupby(['Method'])[['Precision@5', 'Precision@10', 'Precision@20', 'Precision@all', 'Recall@all']].mean().round(4).reset_index()
average_keywordEvaluation

Unnamed: 0,Method,Precision@5,Precision@10,Precision@20,Precision@all,Recall@all
0,Keyword-Based,0.4133,0.4133,0.3533,0.3964,0.4383


---
---
# **EVALUATE EXPERT**
---
---

In [18]:
# Load Hasil Evaluasi Klastering
quran_evaluation = pd.read_csv('Result\quran_evaluationclustering.csv')

  quran_evaluation = pd.read_csv('Result\quran_evaluationclustering.csv')


In [19]:
# Mapping Kode Metode dan Embedding ke Kode Unik
method_code_map = {
    ("KMeans", "Hybrid TF-IDF"): "a",
    ("AHC", "Hybrid TF-IDF"): "b",
    ("DBSCAN", "Hybrid TF-IDF"): "c"
}

final_results = {}

for query in unique_queries:
    processed_query = preprocess_query(query)
    tokens = processed_query.split()

    w2v_query_vector = get_w2v_query_embedding(tokens, w2v_model)
    bert_query_vector = get_bert_embedding(processed_query, tokenizer, bert_model)
    tfidf_query_vector, tfidf_query_vectors = get_tfidf_query_embedding(
        processed_query, tfidf_vectorizer, tfidfldapca_vectorizer, lda_model, pca_model
    )

    query_vectors = {
        "W2V": w2v_query_vector,
        "BERT": bert_query_vector,
        "TFIDF": tfidf_query_vectors,
        "TF-IDF + LDA + PCA": tfidf_query_vector,
        "Hybrid TF-IDF": tfidf_query_vector
    }
    similarity_vectors = {
        "W2V": w2v_query_vector,
        "BERT": bert_query_vector,
        "TFIDF": tfidf_query_vectors,
        "TF-IDF + LDA + PCA": tfidf_query_vector,
        "Hybrid TF-IDF": tfidf_query_vectors
    }

    for method_name, embedding_name, cluster_column, model, embeddings, raw_embeddings, _ in combinations:
        try:
            embedding_type = embedding_name
            query_vector = query_vectors[embedding_type]
            query_vector_for_similarity = similarity_vectors[embedding_type]

            top5_results = search_clustered_quran(
                query_vector=query_vector,
                method_name=method_name,
                embedding_name=embedding_name,
                cluster_column=cluster_column,
                model=model,
                embeddings=embeddings,
                raw_embeddings=raw_embeddings,
                query_vector_for_similarity=query_vector_for_similarity,
                top_n=5
            )

            for _, row in top5_results.iterrows():
                key = (query, row['SurahNo'], row['AyahNo'])
                kode = method_code_map.get((method_name, embedding_name))
                if not kode:
                    continue

                if key in final_results:
                    final_results[key]["Kode"] += kode
                else:
                    arabic = readytoevaluate_df[
                        (readytoevaluate_df['SurahNo'] == row['SurahNo']) &
                        (readytoevaluate_df['AyahNo'] == row['AyahNo'])
                    ]['ArabicText'].values
                    arabic_text = arabic[0] if len(arabic) > 0 else ""

                    final_results[key] = {
                        "Query": query,
                        "Kode": kode,
                        "SurahNo": row['SurahNo'],
                        "AyahNo": row['AyahNo'],
                        "ArabicText": arabic_text,
                        "IndonesianText": row['IndonesianText']
                    }
                    
        except Exception as e:
            # Tidak Apa-apa Jika Ada Error, Hal Ini Hanya Karena Tidak Diambil di 'method_code_map'
            print(f"Error in top5 for {method_name} on '{query}': {e}")

finish_df = pd.DataFrame(list(final_results.values()))
finish_df = finish_df.sort_values(by=["Query", "SurahNo", "AyahNo"]).reset_index(drop=True)
finish_df

Error in top5 for KMeans on 'petunjuk': 'TF-IDF'
Error in top5 for AHC on 'petunjuk': 'TF-IDF'
Error in top5 for DBSCAN on 'petunjuk': 'TF-IDF'
Error in top5 for KMeans on 'haram dan larangan': 'TF-IDF'
Error in top5 for AHC on 'haram dan larangan': 'TF-IDF'
Error in top5 for DBSCAN on 'haram dan larangan': 'TF-IDF'
Error in top5 for KMeans on 'zina': 'TF-IDF'
Error in top5 for AHC on 'zina': 'TF-IDF'
Error in top5 for DBSCAN on 'zina': 'TF-IDF'
Error in top5 for KMeans on 'hari kiamat': 'TF-IDF'
Error in top5 for AHC on 'hari kiamat': 'TF-IDF'
Error in top5 for DBSCAN on 'hari kiamat': 'TF-IDF'
Error in top5 for KMeans on 'setan': 'TF-IDF'
Error in top5 for AHC on 'setan': 'TF-IDF'
Error in top5 for DBSCAN on 'setan': 'TF-IDF'
Error in top5 for KMeans on 'nabi ibrahim': 'TF-IDF'
Error in top5 for AHC on 'nabi ibrahim': 'TF-IDF'
Error in top5 for DBSCAN on 'nabi ibrahim': 'TF-IDF'
Error in top5 for KMeans on 'nabi musa': 'TF-IDF'
Error in top5 for AHC on 'nabi musa': 'TF-IDF'
Error in 

Unnamed: 0,Query,Kode,SurahNo,AyahNo,ArabicText,IndonesianText
0,haram dan larangan,b,6,26,وَهُمْ يَنْهَوْنَ عَنْهُ وَيَنْـَٔوْنَ عَنْهُ ...,Mereka melarang (orang lain) mendengarkannya (...
1,haram dan larangan,ac,6,146,وَعَلَى الَّذِيْنَ هَادُوْا حَرَّمْنَا كُلَّ ذ...,Atas orang-orang Yahudi Kami mengharamkan semu...
2,haram dan larangan,a,9,29,قَاتِلُوا الَّذِيْنَ لَا يُؤْمِنُوْنَ بِاللّٰه...,Perangilah orang-orang yang tidak beriman kepa...
3,haram dan larangan,c,9,37,اِنَّمَا النَّسِيْۤءُ زِيَادَةٌ فِى الْكُفْرِ ...,Sesungguhnya pengunduran (bulan haram) itu han...
4,haram dan larangan,b,17,38,كُلُّ ذٰلِكَ كَانَ سَيِّئُهٗ عِنْدَ رَبِّكَ مَ...,Kejahatan dari semua (larangan) itu429) dibenc...
...,...,...,...,...,...,...
90,zina,abc,4,25,وَمَنْ لَّمْ يَسْتَطِعْ مِنْكُمْ طَوْلًا اَنْ ...,Siapa di antara kamu yang tidak mempunyai biay...
91,zina,abc,17,32,وَلَا تَقْرَبُوا الزِّنٰىٓ اِنَّهٗ كَانَ فَاحِ...,Janganlah kamu mendekati zina. Sesungguhnya (z...
92,zina,abc,19,28,يٰٓاُخْتَ هٰرُوْنَ مَا كَانَ اَبُوْكِ امْرَاَ ...,"Wahai saudara perempuan Harun (Maryam), ayahmu..."
93,zina,abc,24,2,اَلزَّانِيَةُ وَالزَّانِيْ فَاجْلِدُوْا كُلَّ ...,"Pezina perempuan dan pezina laki-laki, deralah..."


In [20]:
finish_df.to_csv("Result\quran_expertmapping.csv", index=False, encoding='utf-8-sig')

  finish_df.to_csv("Result\quran_expertmapping.csv", index=False, encoding='utf-8-sig')
