In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive; the dataset and the labelled CSV used by
# the later cells are read from paths under /content/drive/MyDrive.
drive.mount('/content/drive')

In [None]:

# CELL 1: FULL PREPROCESSING & GENERATE FILE LABELING

!pip install Sastrawi tqdm

import pandas as pd
import re
import math
import numpy as np
from collections import Counter
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
from tqdm.notebook import tqdm
from google.colab import files

# Register tqdm's pandas integration; this is what provides the
# Series.progress_apply method used for the stemming pass further down.
tqdm.pandas()

# 1. LOAD DATA & SAMPLING
print("üìÇ Loading dataset...")
try:
    df = pd.read_csv("/content/drive/MyDrive/UAS_TKI/turnbackhoax_2020_2025.csv", encoding='utf-8')
except FileNotFoundError:
    # Show the hint, then re-raise: continuing without `df` would only fail
    # later with a confusing NameError (or silently reuse a stale `df` left in
    # the kernel by an earlier run). Any other error (parse, encoding, ...)
    # propagates untouched so its real traceback is visible.
    print("‚ùå ERROR: Upload dulu file 'turnbackhoax_2020_2025.csv' di menu kiri!")
    raise

# Only the first 3000 rows are processed to keep the stemming pass short;
# .copy() detaches the slice so the later column assignments do not trigger
# SettingWithCopyWarning.
df = df.head(3000).copy()
print(f"‚úÖ Data dimuat! Kita proses {len(df)} baris data agar lancar.")

# 2. PREPROCESSING (SASTRAWI STEMMING)
print("\n‚öôÔ∏è Menyiapkan Sastrawi...")
# Build a single stemmer instance up front; preprocess_full() reads this
# module-level `stemmer` for every text it cleans.
factory = StemmerFactory()
stemmer = factory.create_stemmer()

# Indonesian stopwords consulted by preprocess_full() when filtering tokens.
# Written as one whitespace-separated block and split into a set.
IND_STOPWORDS = set(
    """
    yang dan di ke dari ini itu pada untuk dengan sebuah adalah oleh atau
    juga dalam tidak karena sebagai saat sangat lebih tersebut namun supaya agar
    """.split()
)

def preprocess_full(text):
    """Normalise raw text into a space-joined string of stemmed tokens.

    Pipeline: lowercase, replace URLs with a space, replace every character
    that is not a lowercase ASCII letter, digit or whitespace with a space,
    stem with the module-level Sastrawi ``stemmer``, then drop tokens that are
    in ``IND_STOPWORDS`` or only one character long.

    Args:
        text: Raw text. Non-string values are converted with ``str()``.
            ``None`` and float NaN (what pandas uses for missing cells) are
            treated as empty text.

    Returns:
        str: The cleaned tokens joined by single spaces; ``""`` for missing
        input.
    """
    # Without this guard str(None) / str(nan) would be indexed as the literal
    # tokens "none" / "nan", polluting the vocabulary and document lengths.
    if text is None or (isinstance(text, float) and math.isnan(text)):
        return ""

    text = str(text).lower()
    text = re.sub(r'http\S+', ' ', text)
    text = re.sub(r'[^a-z0-9\s]', ' ', text)
    text = stemmer.stem(text)
    tokens = [t for t in text.split() if t not in IND_STOPWORDS and len(t) > 1]
    return " ".join(tokens)

print("üöÄ Sedang melakukan Stemming (Tunggu bar hijau selesai)...")

# Clean every value of the 'isi_berita' column with preprocess_full and store
# the result in a new 'clean_text' column. progress_apply is the tqdm-wrapped
# variant of Series.apply, so a progress bar is shown while it runs.
df['clean_text'] = df['isi_berita'].progress_apply(preprocess_full)

# 3. BUILD MODEL (TF-IDF & BM25)
print("\nüèóÔ∏è Membangun Model Information Retrieval...")

# A. TF-IDF
# The vocabulary is capped at 10000 features; the fitted vectorizer's feature
# names are reused below as the BM25 vocabulary.
tfidf = TfidfVectorizer(max_features=10000)
# One row per document, in the same order as `df`.
tfidf_matrix = tfidf.fit_transform(df['clean_text'])

# B. BM25 setup
# Token list per document, split on whitespace from the cleaned text.
df['tokens'] = df['clean_text'].map(str.split)

N = len(df)                            # number of documents
avgdl = df['tokens'].map(len).mean()   # mean document length, in tokens
k1, b = 1.5, 0.75                      # BM25 saturation / length-normalisation parameters

# Document frequency: each term counted once per document it appears in.
df_counts = Counter()
for tokens in df['tokens']:
    df_counts.update(set(tokens))

# BM25 works on the same vocabulary the TF-IDF vectorizer kept.
vocab = set(tfidf.get_feature_names_out())


def _bm25_idf(doc_freq):
    """IDF weight for a term that occurs in `doc_freq` of the N documents."""
    return math.log((N - doc_freq + 0.5) / (doc_freq + 0.5) + 1)


idf = {term: _bm25_idf(df_counts.get(term, 0)) for term in vocab}

# Per-document term frequencies, restricted to in-vocabulary terms.
tf_per_doc = [
    Counter(tok for tok in doc_tokens if tok in vocab)
    for doc_tokens in df['tokens']
]

# Hybrid search function
def _merge_rankings(first, second, limit):
    """Interleave two ranked lists (first[0], second[0], first[1], ...),
    skipping duplicates, and keep at most `limit` items in that order."""
    merged = []
    seen = set()
    for rank in range(max(len(first), len(second))):
        for ranking in (first, second):
            if rank < len(ranking) and ranking[rank] not in seen:
                seen.add(ranking[rank])
                merged.append(ranking[rank])
    return merged[:max(limit, 0)]


def search_engine(q, top_k=20, n_candidates=100):
    """Return up to ``top_k`` row positions of ``df`` for query ``q``, best first.

    The query is cleaned with ``preprocess_full`` and scored against every
    document through the dot product of TF-IDF vectors. The ``n_candidates``
    highest-scoring documents are then re-scored with BM25, and the final
    list interleaves the BM25 ranking with the TF-IDF ranking.

    Args:
        q: Raw query string.
        top_k: Maximum number of results to return (default 20).
        n_candidates: How many top TF-IDF documents are re-scored with BM25
            (default 100).

    Returns:
        list[int]: Positional indices usable with ``df.iloc``, in ranked
        order. May hold fewer than ``top_k`` items, and is empty when no
        document shares a vocabulary term with the query.
    """
    q_clean = preprocess_full(q)

    # 1. TF-IDF search
    q_vec = tfidf.transform([q_clean])
    sims = linear_kernel(q_vec, tfidf_matrix).flatten()

    # Best-first candidate positions. Documents with zero similarity share no
    # term with the query, so they are dropped instead of padding the results.
    order = sims.argsort()[::-1][:n_candidates]
    candidates_idx = [int(i) for i in order if sims[i] > 0]

    # 2. BM25 search (only on the top TF-IDF candidates)
    q_tokens = [t for t in q_clean.split() if t in vocab]
    doc_tokens = df['tokens']  # looked up once instead of building a row per candidate
    bm25_scores = []

    for idx in candidates_idx:
        dl = len(doc_tokens.iloc[idx])
        tf = tf_per_doc[idx]
        # Length normalisation depends only on the document, not on the term.
        length_norm = k1 * (1 - b + b * (dl / avgdl))
        score = 0.0
        for term in q_tokens:
            f = tf.get(term, 0)
            denom = f + length_norm
            if denom > 0:
                score += idf.get(term, 0.0) * (f * (k1 + 1)) / denom
        if score > 0:
            bm25_scores.append((idx, score))

    # Take the top_k of each ranking.
    ranked_bm25 = sorted(bm25_scores, key=lambda x: x[1], reverse=True)[:top_k]
    top_bm25 = [idx for idx, _ in ranked_bm25]
    top_tfidf = candidates_idx[:top_k]

    # Merge while preserving rank. Going through set() here would scramble the
    # order and truncate an arbitrary subset, which corrupts every
    # rank-dependent metric (P@k, AP) computed from the exported rows.
    return _merge_rankings(top_bm25, top_tfidf, top_k)


queries = ['salah', 'video', 'foto', 'penipuan', 'jokowi', 'akun', 'indonesia', 'anies', 'covid', '19']
rows = []

print("\nüìù Membuat file kandidat pencarian...")
for q in queries:
    # Rows are appended in the order search_engine returns them, so the row
    # order inside each query group is the ranking.
    for i_loc in search_engine(q):
        doc = df.iloc[i_loc]  # fetch the row once instead of once per column
        body = doc['isi_berita']
        rows.append({
            "query": q,
            "doc_id": i_loc,
            "judul": doc['judul'],
            # A missing body is NaN (a float) and cannot be sliced; export an
            # empty snippet for it instead of raising TypeError.
            "snippet": "" if pd.isna(body) else str(body)[:200],
            # Left blank on purpose: filled in by hand with 1 or 0.
            "Relevant": ""
        })

filename = "siap_labeling.csv"
# Semicolon-separated, matching the separator the evaluation cell reads with.
pd.DataFrame(rows).to_csv(filename, index=False, sep=';')
print(f"‚úÖ SELESAI! File '{filename}' akan terdownload otomatis.")
print("üëâ TUGASMU: Buka file itu, isi kolom 'Relevant' dengan 1 (Relevan) atau 0 (Tidak), Save, lalu Upload balik kesini.")
files.download(filename)

Collecting Sastrawi
  Downloading Sastrawi-1.0.1-py2.py3-none-any.whl.metadata (909 bytes)
Downloading Sastrawi-1.0.1-py2.py3-none-any.whl (209 kB)
[?25l   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m0.0/209.7 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m209.7/209.7 kB[0m [31m12.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: Sastrawi
Successfully installed Sastrawi-1.0.1
üìÇ Loading dataset...
‚úÖ Data dimuat! Kita proses 3000 baris data agar lancar.

‚öôÔ∏è Menyiapkan Sastrawi...
üöÄ Sedang melakukan Stemming (Tunggu bar hijau selesai)...


  0%|          | 0/3000 [00:00<?, ?it/s]


üèóÔ∏è Membangun Model Information Retrieval...

üìù Membuat file kandidat pencarian...
‚úÖ SELESAI! File 'siap_labeling.csv' akan terdownload otomatis.
üëâ TUGASMU: Buka file itu, isi kolom 'Relevant' dengan 1 (Relevan) atau 0 (Tidak), Save, lalu Upload balik kesini.


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
import pandas as pd
import numpy as np

# Manually labelled relevance file
nama_file = "/content/drive/MyDrive/UAS_TKI/tki__.csv"


def average_precision(relevance):
    """Average precision of one ranked sequence of relevance labels.

    Precision is accumulated at every rank whose label equals 1 and the sum
    is divided by the total of the labels. A sequence with no relevant item
    (or an empty one) scores 0.
    """
    total_rel = np.sum(relevance)
    if total_rel == 0:
        total_rel = 1  # avoid 0/0; the numerator is 0 in this case anyway
    rel_cnt = 0
    prec_sum = 0
    for i, r in enumerate(relevance):
        if r == 1:
            rel_cnt += 1
            prec_sum += rel_cnt / (i + 1)
    return prec_sum / total_rel


def metrics_at_k(relevance, k, total_rel):
    """Return (precision, recall, F-measure) over the first `k` labels.

    `total_rel` is the recall denominator supplied by the caller.
    """
    hits = np.sum(relevance[:k])
    prec = hits / k
    rec = hits / total_rel
    f1 = 2 * (prec * rec) / (prec + rec) if (prec + rec) > 0 else 0
    return prec, rec, f1


def evaluate_labels(labels, ks=(5, 10, 20)):
    """Print the per-query metric table and return the mean average precision.

    Args:
        labels: DataFrame with a 'query' column and a numeric 'Relevant'
            column; the row order inside each query group is taken as the
            ranking.
        ks: Cut-offs at which precision / recall / F-measure are reported.

    Returns:
        The mean of the per-query average precisions, or 0.0 when `labels`
        contains no query group.
    """
    print("\n" + "="*85)
    print(f"{'Query':<10} | {'k':<3} | {'Precision':<9} | {'Recall':<9} | {'F-Measure':<9} | {'AP':<6}")
    print("-" * 85)

    ap_values = []
    for query, group in labels.groupby('query'):
        relevance = group['Relevant'].values
        # The recall denominator counts only the labelled rows of this query,
        # i.e. the relevant documents among those listed in the file.
        total_rel = np.sum(relevance)
        if total_rel == 0:
            total_rel = 1

        ap = average_precision(relevance)
        ap_values.append(ap)

        for k in ks:
            prec, rec, f1 = metrics_at_k(relevance, k, total_rel)
            # Query label and AP are shown on the first row of each group only.
            first_row = (k == ks[0])
            # str(): a group key is not a string when the column parses as numeric.
            q_lbl = str(query)[:10] if first_row else ""
            ap_lbl = f"{ap:.3f} (AP)" if first_row else ""
            print(f"{q_lbl:<10} | {k:<3} | {prec:.3f}     | {rec:.3f}     | {f1:.3f}     | {ap_lbl}")

    # Guard the mean so an empty label file does not raise ZeroDivisionError.
    mean_ap = sum(ap_values) / len(ap_values) if ap_values else 0.0
    print("-" * 85)
    print(f"Mean Average Precision (MAP): {mean_ap:.4f}")
    print("="*85)
    return mean_ap


print(f"üìÇ Membaca file '{nama_file}'...")

try:
    # NOTE: this rebinds the notebook-global `df`, which search_engine() in the
    # indexing cell also reads; re-run that cell before searching again.
    df = pd.read_csv(nama_file, sep=';')
except FileNotFoundError as e:
    # Only the expected "file is not there" case is reported as a message.
    # Anything else (bad columns, parse errors) propagates with its traceback
    # instead of being reduced to a one-line print.
    print(f"‚ùå Error: {e}")
else:
    # Blank or non-numeric labels are counted as not relevant (0).
    df['Relevant'] = pd.to_numeric(df['Relevant'], errors='coerce').fillna(0)
    evaluate_labels(df)

üìÇ Membaca file '/content/drive/MyDrive/UAS_TKI/tki__.csv'...

Query      | k   | Precision | Recall    | F-Measure | AP    
-------------------------------------------------------------------------------------
19         | 5   | 1.000     | 0.263     | 0.417     | 0.985 (AP)
           | 10  | 1.000     | 0.526     | 0.690     | 
           | 20  | 0.950     | 1.000     | 0.974     | 
akun       | 5   | 1.000     | 0.250     | 0.400     | 1.000 (AP)
           | 10  | 1.000     | 0.500     | 0.667     | 
           | 20  | 1.000     | 1.000     | 1.000     | 
anies      | 5   | 1.000     | 0.250     | 0.400     | 1.000 (AP)
           | 10  | 1.000     | 0.500     | 0.667     | 
           | 20  | 1.000     | 1.000     | 1.000     | 
covid      | 5   | 1.000     | 0.250     | 0.400     | 1.000 (AP)
           | 10  | 1.000     | 0.500     | 0.667     | 
           | 20  | 1.000     | 1.000     | 1.000     | 
foto       | 5   | 1.000     | 0.250     | 0.400     | 1.000 (AP)
         