# TF-IDF Search (Lexical Baseline)

In [1]:
import pandas as pd
from pathlib import Path

# Location of the cleaned tafsir CSV, relative to the notebooks/ folder;
# adjust it if the notebook is run from somewhere else.
DATA_PATH = Path("../data/processed/tafsir_clean.csv")

# Load the corpus in one chain: read the CSV, discard rows whose tafsir is
# missing, then renumber the index so it is contiguous again.
df = (
    pd.read_csv(DATA_PATH)
    .dropna(subset=["tafsir"])
    .reset_index(drop=True)
)

df.head()


Unnamed: 0,surah,ayah,arabic_text,indonesian_translation,tafsir,tafsir_id
0,Al-Fātiḥah,1,بِسْمِ اللّٰهِ الرَّحْمٰنِ الرَّحِيْمِ,Dengan nama Allah Yang Maha Pengasih lagi Maha...,Aku memulai bacaan Al-Qur'an dengan menyebut n...,Al-Fātiḥah :1
1,Al-Fātiḥah,2,اَلْحَمْدُ لِلّٰهِ رَبِّ الْعٰلَمِيْنَۙ,"Segala puji bagi Allah, Tuhan1) semesta alam",Segala puji kita persembahkan hanya untuk Alla...,Al-Fātiḥah :2
2,Al-Fātiḥah,3,الرَّحْمٰنِ الرَّحِيْمِۙ,"Yang Maha Pengasih lagi Maha Penyayang,","Dialah Yang Maha Pengasih, Pemilik dan sumber ...",Al-Fātiḥah :3
3,Al-Fātiḥah,4,مٰلِكِ يَوْمِ الدِّيْنِۗ,Pemilik hari Pembalasan.2),Dialah satu-satunya Pemilik hari Pembalasan da...,Al-Fātiḥah :4
4,Al-Fātiḥah,5,اِيَّاكَ نَعْبُدُ وَاِيَّاكَ نَسْتَعِيْنُۗ,Hanya kepada Engkaulah kami menyembah dan hany...,"Atas dasar itu semua, hanya kepada Engkaulah k...",Al-Fātiḥah :5


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# The tafsir (exegesis) text of each row is the document collection used for retrieval.
tafsir_texts = df["tafsir"].tolist()

tfidf = TfidfVectorizer(
    max_df=0.9,        # drop terms that occur in more than 90% of the documents
    min_df=2,          # drop very rare terms (occurring in fewer than 2 documents)
    ngram_range=(1, 2) # unigrams + bigrams
)

# Fit the vectorizer on the corpus and build the document-term matrix in one step.
tfidf_matrix = tfidf.fit_transform(tafsir_texts)
tfidf_matrix.shape


(6236, 48811)

In [3]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

def search_tafsir_tfidf(query: str, top_k: int = 5) -> pd.DataFrame:
    """Rank tafsir entries against a query by TF-IDF cosine similarity.

    Relies on the notebook-level ``tfidf`` vectorizer, ``tfidf_matrix`` and
    ``df``, so the cells that define them must have been run first.

    Args:
        query: Free-text search query.
        top_k: Maximum number of rows to return; must be >= 0.

    Returns:
        A copy of the best-matching rows of ``df`` (at most ``top_k``),
        ordered from highest to lowest similarity, with the similarity stored
        in a new ``score_tfidf`` column.

    Raises:
        ValueError: If ``top_k`` is negative.
    """
    # A negative top_k would make the [:top_k] slice below drop rows from the
    # END of the ranking and silently return almost the whole corpus.
    if top_k < 0:
        raise ValueError(f"top_k must be non-negative, got {top_k}")

    # Project the query into the fitted TF-IDF space.
    q_vec = tfidf.transform([query])

    # Cosine similarity against every document; row 0 belongs to the single query.
    sims = cosine_similarity(q_vec, tfidf_matrix)[0]  # shape: (n_docs,)

    # Positions of the top_k highest scores, best first.
    top_idx = np.argsort(sims)[::-1][:top_k]

    results = df.iloc[top_idx].copy()
    results["score_tfidf"] = sims[top_idx]
    return results

# Example query to sanity-check the TF-IDF search.
query_demo = "hukum riba dalam Al Quran"
search_tafsir_tfidf(query_demo, top_k=5)


Unnamed: 0,surah,ayah,arabic_text,indonesian_translation,tafsir,tafsir_id,score_tfidf
4325,Az-Zukhruf,1,حٰمۤ ۚ,Ḥā Mīm.,Ha Mim. Kedua huruf ini termasuk huruf-huruf y...,Az-Zukhruf:1,0.343835
4326,Az-Zukhruf,2,وَالْكِتٰبِ الْمُبِيْنِ ۙ,"Demi Kitab (Al-Qur’an) yang jelas,","Demi al-kitab, demikian Allah bersumpah yang d...",Az-Zukhruf:2,0.249132
6131,Al-Bayyinah,2,رَسُوْلٌ مِّنَ اللّٰهِ يَتْلُوْا صُحُفًا مُّطَ...,(yaitu) seorang Rasul dari Allah (Nabi Muhamma...,"Bukti yang nyata itu adalah Nabi Muhammad, seo...",Al-Bayyinah:2,0.207765
281,Al-Baqarah,275,اَلَّذِيْنَ يَأْكُلُوْنَ الرِّبٰوا لَا يَقُوْم...,Orang-orang yang memakan (bertransaksi dengan)...,Orang-orang yang memakan riba yakni melakukan ...,Al-Baqarah :275,0.201989
284,Al-Baqarah,278,يٰٓاَيُّهَا الَّذِيْنَ اٰمَنُوا اتَّقُوا اللّٰ...,"Wahai orang-orang yang beriman, bertakwalah ke...",Wahai orang-orang yang beriman! Bertakwalah ke...,Al-Baqarah :278,0.192283


# BERT / SBERT Semantic Search

In [4]:
# Install the sentence-transformers package into the kernel's environment.
# NOTE(review): the version is not pinned — consider pinning it for reproducibility.
%pip install -q sentence-transformers

Note: you may need to restart the kernel to use updated packages.


In [5]:
from sentence_transformers import SentenceTransformer
import torch
import numpy as np

In [None]:
# A reasonably lightweight multilingual model (can be swapped out later).
MODEL_NAME = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"

# Use the GPU when PyTorch reports CUDA as available, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
embedder = SentenceTransformer(MODEL_NAME, device=device)

device


modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/645 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/471M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/480 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


tokenizer.json:   0%|          | 0.00/9.08M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

'cpu'

: 

In [None]:
# Rebuild the list of tafsir texts from df, so this cell only needs `df` and `embedder`.
tafsir_texts = df["tafsir"].tolist()

# Embed every tafsir text in batches of 32 and get the result back as a NumPy array.
tafsir_embs = embedder.encode(
    tafsir_texts,
    batch_size=32,
    convert_to_numpy=True,
    show_progress_bar=True
)
tafsir_embs.shape  # (n_docs, dim)


In [None]:
from sklearn.metrics.pairwise import cosine_similarity

def search_tafsir_bert(query: str, top_k: int = 5) -> pd.DataFrame:
    """Rank tafsir entries against a query by sentence-embedding cosine similarity.

    Relies on the notebook-level ``embedder`` model, the precomputed
    ``tafsir_embs`` matrix and ``df``, so the cells that define them must have
    been run first.

    Args:
        query: Free-text search query.
        top_k: Maximum number of rows to return; must be >= 0.

    Returns:
        A copy of the best-matching rows of ``df`` (at most ``top_k``),
        ordered from highest to lowest similarity, with the similarity stored
        in a new ``score_bert`` column.

    Raises:
        ValueError: If ``top_k`` is negative.
    """
    # A negative top_k would make the [:top_k] slice below drop rows from the
    # END of the ranking and silently return almost the whole corpus.
    if top_k < 0:
        raise ValueError(f"top_k must be non-negative, got {top_k}")

    # Encode the query as a one-row batch; keeping it 2-D lets it go straight
    # into cosine_similarity without unwrapping and re-wrapping.
    q_emb = embedder.encode([query], convert_to_numpy=True)  # shape: (1, dim)

    # Cosine similarity against every document; row 0 belongs to the single query.
    sims = cosine_similarity(q_emb, tafsir_embs)[0]  # shape: (n_docs,)

    # Positions of the top_k highest scores, best first.
    top_idx = np.argsort(sims)[::-1][:top_k]

    results = df.iloc[top_idx].copy()
    results["score_bert"] = sims[top_idx]
    return results

# Example query (same text as the TF-IDF demo) to sanity-check the embedding search.
query_demo = "hukum riba dalam Al Quran"
search_tafsir_bert(query_demo, top_k=5)
