In [None]:
import os
from typing import List, Dict
from urllib.parse import urlparse

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from dotenv import load_dotenv
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm

from modules.embedder import HuggingFaceEmbedder
from modules.help import load_credible_domains
from modules.llm_base import LLM, LOCAL_LLM, build_classification_prompt
from modules.prefilter import prefilter_results
from modules.search_engines import GoogleSearchEngine, DuckDuckGoSearchEngine


In [None]:
# Read key/value pairs from a local .env file (if one exists) into the process
# environment, so later cells can pick up configuration without hard-coding it.
load_dotenv()

## Import Dataset

In [None]:
# Load the two source CSVs and tag every row with its ground-truth label.
# The label column is called `class_`; later cells read it as `row.class_`.
fake_df = pd.read_csv('./data/Fake.csv').assign(class_='fake')
true_df = pd.read_csv('./data/True.csv').assign(class_='true')

# ignore_index=True gives the combined frame a fresh 0..n-1 index. Without it
# both files contribute rows labelled 0, 1, 2, ... and label-based lookups such
# as `df.loc[0]` would match one row from each file.
df = pd.concat([fake_df, true_df], axis=0, ignore_index=True)

# Quick sanity check: combined shape followed by the shape of each source frame.
df.shape, fake_df.shape, true_df.shape

In [None]:
# Show the combined frame as the cell's rich output for a quick visual check.
df

## Full run: Llama 3.1 8B; DuckDuckGo; Embedding Reduction

In [None]:
# Prompt template that asks the LLM to condense an article into one search query.
# The two `{}` placeholders are filled, in order, with the headline and the body.
pre_prompt = """
You are an assistant specialized in generating optimized search queries from news articles.
Your task is to produce one single, continuous search query that can be used in search engines (e.g., Google, DuckDuckGo) to retrieve accurate and relevant information about the reported event.
Extract all key elements explicitly mentioned in the news — who, what, when, where, why, and how — including all dates.
Use these elements to form one single query, not quoted strings, not multiple fragments.
The output must be a single line containing exactly one query.
Do not output lists, bullet points, explanations, or multiple queries.
Do not invent information not present in the news.
News headline:
{}
News:
{}
"""

# Run parameters, grouped here so they are easy to find and tune.
N_SAMPLES = 10    # number of articles processed in this run
SEED = 42         # fixed seed so the same articles are sampled on every run
NUM_RESULTS = 5   # search hits requested per article

# NOTE: despite its name, `queries` is a DataFrame of sampled articles
# (it keeps `title`, `text` and `class_`), not a list of query strings.
queries = df.sample(N_SAMPLES, random_state=SEED)

search_engine = DuckDuckGoSearchEngine()
# The embedder is not used inside this cell's loop (the prefilter step is
# currently bypassed); it is still created here so the name stays available.
embedder = HuggingFaceEmbedder(model_name="./models/distilbert-base-uncased")

# NOTE(review): the variable is named `groq_llm` but it wraps LOCAL_LLM with the
# "gemma3:4b" model; the name is kept in case other cells refer to it.
groq_llm = LOCAL_LLM(
    model="gemma3:4b",
)

# One record per processed article: title, raw LLM output, number of hits used.
results_list = []

for row in tqdm(queries.itertuples(index=False), total=len(queries), desc="Processing titles"):
    print('title:', row.title)
    print('text:', row.text)
    print('class:', row.class_)

    # Values recorded when any stage below fails for this article.
    response = None
    filtered_count = 0

    # The whole per-article pipeline is guarded: a failure while generating the
    # query or searching is logged and recorded like a failed classification
    # call, instead of aborting the remaining articles.
    try:
        # PRE: ask the LLM for a single search query built from headline + body.
        search_query = groq_llm.generate(prompt=pre_prompt.format(row.title, row.text), temperature=0.0)

        # 1. Search
        results = search_engine.search(search_query, num_results=NUM_RESULTS)
        for r in results:
            print("--", r)

        # 2. Prefilter - currently bypassed: every search hit is passed through.
        results_filtered = results

        # Count how many results were kept
        filtered_count = len(results_filtered)

        # 3. Build the classification prompt from the headline and the kept hits
        prompt = build_classification_prompt(row.title, results_filtered)

        # 4. Call LLM
        response = groq_llm.generate(prompt=prompt, temperature=0.0)
    except Exception as e:
        print(f"Error for title: {row.title} -> {e}")

    results_list.append({
        "title": row.title,
        "llm_output": response,
        "filtered_count": filtered_count
    })

In [None]:
# NOTE(review): `BREAK` is not defined anywhere in the visible notebook, so
# evaluating it presumably raises NameError - apparently a deliberate stop so
# that "Run All" halts before the save/evaluation cells below. TODO confirm;
# cells after this point would then have to be run manually.
BREAK

In [None]:
# Persist the collected per-article records as a pipe-delimited CSV without the
# DataFrame index.
# NOTE(review): the file name mentions llama3_70B while the run cell configures
# model "gemma3:4b" - confirm the name before overwriting another run's results.
output_path = './data/llama3_70B_duck_baseline.csv'
results_df = pd.DataFrame(data=results_list)
results_df.to_csv(output_path, index=False, sep='|')

In [None]:
# Evaluate the run: compare every LLM verdict with the article's true label.
results_df = pd.DataFrame(results_list)
if results_df.empty:
    raise ValueError("results_list is empty - run the full-run cell before evaluating.")

# Ground truth is taken from the sampled rows themselves (`queries`), using the
# `class_` label column. Looking titles up in the full dataset instead would
# duplicate rows whenever the same headline appears more than once.
truth_df = queries[['title', 'class_']].drop_duplicates(subset='title')
merged_df = results_df[['title', 'llm_output']].merge(truth_df, on='title')

# Make sure LLM outputs and true classes are aligned (same row order after merge)
y_true = merged_df['class_']
# Failed calls are stored as None; map them to an explicit 'unknown' label and
# normalise case/whitespace so that e.g. "Fake\n" matches the 'fake' label.
# NOTE(review): assumes the classification prompt asks for a bare fake/true
# verdict - TODO confirm against build_classification_prompt.
y_pred = merged_df['llm_output'].fillna('unknown').astype(str).str.strip().str.lower()

# Use the union of true and predicted labels so unexpected predictions show up
# as their own row/column instead of being silently dropped from the matrix.
labels = sorted(set(y_true) | set(y_pred))

# Compute confusion matrix
cm = confusion_matrix(y_true, y_pred, labels=labels)
cm_df = pd.DataFrame(cm, index=labels, columns=labels)

# Plot confusion matrix
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(cm_df, annot=True, fmt="d", cmap="Blues", ax=ax)
ax.set_ylabel('True Class')
ax.set_xlabel('LLM Output')
# NOTE(review): the title says "LLama 8b" while the run cell configures
# "gemma3:4b" - confirm which model produced these results.
ax.set_title('LLama 8b - Duck Duck Go - Confusion Matrix')
plt.show()

# Optional: print classification report (zero_division=0 avoids warnings for
# labels that were never predicted or never occur in the ground truth)
print(classification_report(y_true, y_pred, labels=labels, zero_division=0))