In [None]:
import pandas as pd
import json

from google.colab import drive
drive.mount('/content/drive')

dataset_path = "/content/drive/MyDrive/folder_dataset/news.json"  # adjust to your Drive folder

# The file is read as JSON Lines (one article object per line).
# - encoding is pinned so decoding does not depend on the runtime's locale;
# - blank lines (e.g. a trailing newline) are skipped instead of crashing json.loads.
with open(dataset_path, "r", encoding="utf-8") as f:
    articles = [json.loads(line) for line in f if line.strip()]

df = pd.DataFrame(articles)
# Text used for retrieval: headline followed by the short description.
df["text"] = df["headline"] + ". " + df["short_description"]
# Rows where either part is missing yield NaN text and are dropped; the index is
# rebuilt so row labels match the positional indices used by later cells.
df = df.dropna(subset=["text"]).reset_index(drop=True)

print("Jumlah artikel:", len(df))
df.head()

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Jumlah artikel: 209527


Unnamed: 0,link,headline,category,short_description,authors,date,text
0,https://www.huffpost.com/entry/covid-boosters-...,Over 4 Million Americans Roll Up Sleeves For O...,U.S. NEWS,Health experts said it is too early to predict...,"Carla K. Johnson, AP",2022-09-23,Over 4 Million Americans Roll Up Sleeves For O...
1,https://www.huffpost.com/entry/american-airlin...,"American Airlines Flyer Charged, Banned For Li...",U.S. NEWS,He was subdued by passengers and crew when he ...,Mary Papenfuss,2022-09-23,"American Airlines Flyer Charged, Banned For Li..."
2,https://www.huffpost.com/entry/funniest-tweets...,23 Of The Funniest Tweets About Cats And Dogs ...,COMEDY,"""Until you have a dog you don't understand wha...",Elyse Wanshel,2022-09-23,23 Of The Funniest Tweets About Cats And Dogs ...
3,https://www.huffpost.com/entry/funniest-parent...,The Funniest Tweets From Parents This Week (Se...,PARENTING,"""Accidentally put grown-up toothpaste on my to...",Caroline Bologna,2022-09-23,The Funniest Tweets From Parents This Week (Se...
4,https://www.huffpost.com/entry/amy-cooper-lose...,Woman Who Called Cops On Black Bird-Watcher Lo...,U.S. NEWS,Amy Cooper accused investment firm Franklin Te...,Nina Golgowski,2022-09-22,Woman Who Called Cops On Black Bird-Watcher Lo...


In [None]:
import re
import nltk
# Resources needed by word_tokenize and the stopword filter.
nltk.download("punkt")
# NOTE(review): recent NLTK releases (3.9+) load the tokenizer from "punkt_tab"
# instead of "punkt"; requesting both keeps a fresh kernel working on either
# version (an unknown package id only prints a message, it does not raise).
nltk.download("punkt_tab")
nltk.download("stopwords")
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Set membership makes the per-token stopword check O(1).
STOPWORDS = set(stopwords.words("english"))

def preprocess(text):
    """Normalize a piece of text for indexing and querying.

    The text is lowercased, every character that is not an ASCII letter,
    digit or whitespace is replaced by a space, the result is tokenized with
    ``word_tokenize`` and tokens found in ``STOPWORDS`` are removed.

    Returns the surviving tokens joined by single spaces.
    """
    cleaned = re.sub(r"[^a-zA-Z0-9\s]", " ", text.lower())
    kept = filter(lambda token: token not in STOPWORDS, word_tokenize(cleaned))
    return " ".join(kept)

# Preprocess every article once; `docs` holds the cleaned strings in row order.
df["processed"] = df["text"].map(preprocess)
docs = list(df["processed"])

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
!pip install rank-bm25
from rank_bm25 import BM25Okapi

# BM25 index over the whitespace-tokenized, preprocessed articles.
bm25 = BM25Okapi(list(map(str.split, docs)))

def bm25_retrieve(q, k=5):
    """Return the positions of the `k` highest-scoring articles for query `q` under BM25.

    The query goes through the same `preprocess` step as the corpus before
    scoring. Positions are returned best-first as a list of ints; ties keep
    their original corpus order.
    """
    query_tokens = preprocess(q).split()
    doc_scores = bm25.get_scores(query_tokens)
    # Stable descending sort on the score, carrying each document position along.
    ranked = sorted(enumerate(doc_scores), key=lambda pair: pair[1], reverse=True)
    return [position for position, _ in ranked[:k]]



In [None]:
!pip install sentence_transformers faiss-cpu
from sentence_transformers import SentenceTransformer
import numpy as np
import faiss

# Sentence-embedding model; the same model encodes the corpus here and the queries later.
model = SentenceTransformer("all-MiniLM-L6-v2")
# Encode every preprocessed article. Normalized embeddings are requested so that the
# inner-product index below ranks by cosine similarity.
emb = model.encode(docs, normalize_embeddings=True)
d = emb.shape[1]  # embedding dimensionality

# Flat inner-product index holding all article vectors.
index = faiss.IndexFlatIP(d)
index.add(emb)

def faiss_retrieve(q, k=5):
    """Return the positions of the `k` nearest articles to query `q` in the FAISS index.

    The query is preprocessed and embedded the same way as the corpus.

    Args:
        q: Raw query string.
        k: Maximum number of neighbours to return.

    Returns:
        1-D integer array of article positions, best match first. It may hold
        fewer than `k` entries when the index contains fewer vectors.
    """
    # Never ask for more neighbours than the index holds.
    k = min(k, index.ntotal)
    if k <= 0:
        return np.empty(0, dtype=np.int64)

    q_vec = model.encode([preprocess(q)], normalize_embeddings=True)
    _, idx = index.search(q_vec, k)
    hits = idx[0]
    # FAISS marks missing neighbours with -1; passing that to df.iloc would
    # silently select the last article, so drop such entries.
    return hits[hits >= 0]



In [None]:
from transformers import pipeline
# Small seq2seq model used for the quick end-to-end demo.
generator = pipeline("text2text-generation", model="t5-small")

def generate_answer(question, retrieved_indices, max_new_tokens=200):
    """Generate an answer to `question` from the retrieved articles.

    Args:
        question: The user's question.
        retrieved_indices: Positions (into `df`) of the articles used as context.
        max_new_tokens: Upper bound on the number of generated tokens.

    Returns:
        The generated text of the first candidate returned by the pipeline.
    """
    context = "\n".join(df.iloc[i]["text"] for i in retrieved_indices)
    prompt = f"Answer based on the context below only.\n\nContext:\n{context}\n\nQuestion: {question}\nAnswer:"
    # Use max_new_tokens rather than max_length: the pipeline already carries a
    # default max_new_tokens, which takes precedence over max_length and made the
    # previous limit of 200 ineffective (plus a warning on every call).
    result = generator(prompt, max_new_tokens=max_new_tokens)[0]["generated_text"]
    return result

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

Device set to use cuda:0


In [None]:
query = "healthy news covid booster"
k = 5

# Run the same query through both retrievers.
bm_idx, faiss_idx = bm25_retrieve(query, k), faiss_retrieve(query, k)

# Show the retrieved headlines side by side for a quick visual comparison.
for banner, hits in (
    ("===== BM25 Result =====", bm_idx),
    ("\n===== FAISS Result =====", faiss_idx),
):
    print(banner)
    for i in hits:
        print("•", df.iloc[i]["headline"])

print("\n===== Final Answer (use FAISS retrieval) =====\n")
answer = generate_answer(query, faiss_idx)
print(answer)

Both `max_new_tokens` (=256) and `max_length`(=200) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


===== BM25 Result =====
• The Most Common COVID-19 Booster Shot Questions Answered
• If You're Eligible For A COVID Booster Shot Right Now, Should You Get It?
• Twitter Users Outraged After Trump Calls COVID Booster Shots 'Crazy'
• As Virus Surges, Colorado Opens Up COVID Boosters To All Adults
• NASA Just Tested The Most Powerful Rocket Booster In History

===== FAISS Result =====
• Trump Coronavirus Update: The Latest On The President's Health And The GOP COVID-19 Outbreak
• FDA Working On Booster Vaccine Strategy, Could Release Next Month: Report
• COVID-Sick Zoo Lions Fed 'Tempting' Baby Food, Chicken Broth To Amp Up Appetites
• Overwhelmed By COVID News? Here's What To Monitor And What To Skip.
• Unvaccinated Glenn Beck Gets COVID Again, Says It's 'Getting Into My Lungs'

===== Final Answer (use FAISS retrieval) =====

Coronavirus Update: The Latest On The President's Health And The GOP COVID-19 Outbreak. HuffPost reporters are tracking Trump’s progress and the outbreaks stemming 

In [None]:
from sklearn.metrics import ndcg_score

def evaluate(query, retrieved_idx, true_category="U.S. NEWS"):
    """Score a ranked result list using the article category as a relevance proxy.

    An article counts as relevant when its `category` equals `true_category`.

    Args:
        query: The query that produced the results (kept for interface
            compatibility; the relevance proxy does not depend on it).
        retrieved_idx: Article positions (into `df`), best match first.
        true_category: Category treated as relevant.

    Returns:
        (precision, recall, ndcg) where
        - precision = relevant retrieved / retrieved,
        - recall    = relevant retrieved / all articles in `true_category`,
        - ndcg      = nDCG of the given order, normalized against the best
          possible ordering of the retrieved items themselves.
        All three are 0.0 when nothing was retrieved.
    """
    relevances = [int(df.iloc[i]["category"] == true_category) for i in retrieved_idx]
    n_retrieved = len(relevances)
    if n_retrieved == 0:
        return 0.0, 0.0, 0.0

    hits = sum(relevances)
    n_relevant = int((df["category"] == true_category).sum())

    precision = hits / n_retrieved
    recall = hits / n_relevant if n_relevant else 0.0

    if n_retrieved == 1:
        # ndcg_score rejects single-document lists in recent scikit-learn;
        # with one item nDCG is simply whether that item is relevant.
        ndcg = float(hits)
    else:
        # Pseudo-scores must DEcrease with rank so the first retrieved item is
        # treated as the top-ranked one (range(n) would invert the ranking).
        rank_scores = list(range(n_retrieved, 0, -1))
        ndcg = ndcg_score([relevances], [rank_scores])
    return precision, recall, ndcg

p1, r1, n1 = evaluate(query, bm_idx)
p2, r2, n2 = evaluate(query, faiss_idx)


def _show_metrics(header, precision, recall, ndcg):
    """Print one retriever's metrics, rounded to three decimals."""
    print(header)
    print("Precision:", round(precision, 3))
    print("Recall:", round(recall, 3))
    print("nDCG:", round(ndcg, 3))


_show_metrics("\n📌 Evaluasi BM25", p1, r1, n1)
_show_metrics("\n📌 Evaluasi FAISS", p2, r2, n2)

# FAISS wins only on a strictly higher nDCG; BM25 is reported otherwise.
print("\n🔥 Retrieval terbaik:", "BM25" if not n2 > n1 else "FAISS")


📌 Evaluasi BM25
Precision: 0.0
Recall: 0.0
nDCG: 0.0

📌 Evaluasi FAISS
Precision: 0.2
Recall: 0.001
nDCG: 1.0

🔥 Retrieval terbaik: FAISS


In [None]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, pipeline
import torch

print("⏳ Loading LLM model...")

# If Colab RAM is limited, switch to: flan-t5-base / flan-t5-small
LLM_NAME = "google/flan-t5-large"

tokenizer = AutoTokenizer.from_pretrained(LLM_NAME)
model_llm = AutoModelForSeq2SeqLM.from_pretrained(LLM_NAME)

llm_pipeline = pipeline(
    "text2text-generation",
    model=model_llm,
    tokenizer=tokenizer,
    # First GPU when available, otherwise CPU.
    device=0 if torch.cuda.is_available() else -1,
    # Bound the output with max_new_tokens: the pipeline's default
    # max_new_tokens takes precedence over max_length, so a max_length
    # setting here would be ignored and trigger a warning on every call.
    max_new_tokens=512,
)

def llm(prompt: str):
    """
    Run the LLM pipeline on an already-assembled prompt and return the generated text.
    """
    outputs = llm_pipeline(prompt)
    return outputs[0]["generated_text"]

print("🔥 LLM siap digunakan!")

⏳ Loading LLM model...


tokenizer_config.json: 0.00B [00:00, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

config.json:   0%|          | 0.00/662 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/3.13G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Device set to use cuda:0


🔥 LLM siap digunakan!


In [None]:
def rag_ask(question, retrieval="faiss", k=5):
    """Answer `question` with retrieval-augmented generation.

    NOTE(review): no definition of `rag_ask` is visible in the notebook cells
    above, so the loop below failed with NameError on a fresh kernel. This
    implementation wires together the existing retrievers and `llm`.

    Args:
        question: The user's question.
        retrieval: "faiss" or "bm25" (case-insensitive).
        k: Number of articles to use as context.

    Returns:
        The text generated by `llm` for the assembled prompt.

    Raises:
        ValueError: If `retrieval` names an unknown retriever.
    """
    retrievers = {"faiss": faiss_retrieve, "bm25": bm25_retrieve}
    try:
        retrieve = retrievers[retrieval.lower()]
    except KeyError:
        raise ValueError(
            f"Unknown retrieval method {retrieval!r}; expected one of {sorted(retrievers)}"
        ) from None

    context = "\n".join(df.iloc[i]["text"] for i in retrieve(question, k))
    prompt = (
        "Answer the question using only the context below.\n\n"
        f"Context:\n{context}\n\nQuestion: {question}\nAnswer:"
    )
    return llm(prompt)


while True:
    try:
        user_q = input("\n🟢 Masukkan pertanyaan (atau ketik 'exit' untuk berhenti): ").strip()
    except (EOFError, KeyboardInterrupt):
        # Interrupting the cell or closing stdin ends the session cleanly.
        break
    if user_q.lower() == "exit":
        break
    if not user_q:
        # Nothing to retrieve for an empty question; ask again.
        continue

    result = rag_ask(user_q, retrieval="faiss")   # can be switched to "bm25"
    print("\n🧠 Jawaban Sistem:\n", result)
