In [1]:
%pip -q install --upgrade pip
%pip -q install sentence-transformers==2.7.0 faiss-cpu
%pip -q install pandas numpy scikit-learn matplotlib tqdm langdetect
%pip -q install beautifulsoup4 requests
%pip -q install gradio

import os, sys, json, re, time, math, textwrap, random
from pathlib import Path

# Persistent base dir (Colab: /content)
BASE_DIR = Path("/content/fake-news-rag")
DATA_DIR = BASE_DIR / "data"
RAW_DIR = DATA_DIR / "raw"
PROC_DIR = DATA_DIR / "processed"
MODEL_DIR = BASE_DIR / "models"
EMB_DIR = MODEL_DIR / "embeddings"

for d in [BASE_DIR, DATA_DIR, RAW_DIR, PROC_DIR, MODEL_DIR, EMB_DIR]:
    d.mkdir(parents=True, exist_ok=True)

print("Project folders ready:", BASE_DIR)

Project folders ready: /content/fake-news-rag


In [None]:
# Install the OpenAI Python SDK
!pip install openai

import os
from openai import AzureOpenAI

# Set your Azure OpenAI endpoint and API key (from Azure portal)
os.environ["AZURE_OPENAI_ENDPOINT"] = ""
os.environ["AZURE_OPENAI_API_KEY"]  = ""

# Create the AzureOpenAI client
client = AzureOpenAI(
    azure_endpoint=os.environ["AZURE_OPENAI_ENDPOINT"],
    api_key=os.environ["AZURE_OPENAI_API_KEY"],
    api_version=""  # use the latest supported API version
)

# Send a chat completion request to your Phi-4 Mini deployment
response = client.chat.completions.create(
    model="Phi-4-mini-reasoning-2",  # your Phi-4 Mini deployment name
    messages=[{"role": "user", "content": "Hello, Phi-4 Mini!"}]
)

# Print the assistant's reply
print(response.choices[0].message.content)

<think>
Okay, let's see what kind of problem I need to solve here. The user mentioned "Phi-4 Mini!" and called me Phi. Hmm, maybe there's a trick here. Let me check the original message again.

The user wrote: "You will be given a problem. Please reason step by step, and put your final answer within \boxed{}: Hello, Phi-4 Mini!" 

Wait, so the problem is just the phrase "Hello, Phi-4 Mini!"? That seems a bit odd. Maybe it's a riddle or a puzzle related to the name itself. Let's break it down.

First, "Phi-4 Mini!" Phi... Oh, Phi could refer to the golden ratio, which is approximately 1.618. The "4 Mini" part might be indicating version 4, but mini? Maybe it's a play on words. Let's think about the letters. 

Phi is the Greek letter φ. The "-4 Mini" could be a reference to something else. Maybe electrical circuits? The PHI-4 could be a model of a device, and Mini might refer to a small version. But I'm not sure. Alternatively, could it be something to do with the periodic table? Phospho

In [3]:
%pip install azure-ai-inference



In [4]:
pip install faiss-gpu-cu12 --index-url https://pypi.org/simple




Now that the necessary package is installed, I'll correct the placeholder for the API key in the client initialization. You should replace `<YOUR_AZURE_API_KEY>` with your actual key, which is recommended to be stored securely using Colab's Secrets Manager.

In [None]:
from azure.ai.inference import ChatCompletionsClient
from azure.core.credentials import AzureKeyCredential
import os

# For Serverless API or Managed Compute endpoints
client = ChatCompletionsClient(
    endpoint="",
    credential=AzureKeyCredential(os.environ["AZURE_OPENAI_API_KEY"]), # Use the API key from environment variables
    api_version=""
)

In [7]:
%pip install newspaper3k

Collecting newspaper3k
  Using cached newspaper3k-0.2.8-py3-none-any.whl.metadata (11 kB)
Collecting cssselect>=0.9.2 (from newspaper3k)
  Using cached cssselect-1.3.0-py3-none-any.whl.metadata (2.6 kB)
Collecting feedparser>=5.2.1 (from newspaper3k)
  Using cached feedparser-6.0.11-py3-none-any.whl.metadata (2.4 kB)
Collecting tldextract>=2.0.1 (from newspaper3k)
  Using cached tldextract-5.3.0-py3-none-any.whl.metadata (11 kB)
Collecting feedfinder2>=0.0.4 (from newspaper3k)
  Using cached feedfinder2-0.0.4-py3-none-any.whl
Collecting jieba3k>=0.35.1 (from newspaper3k)
  Using cached jieba3k-0.35.1-py3-none-any.whl
Collecting tinysegmenter==0.3 (from newspaper3k)
  Using cached tinysegmenter-0.3-py3-none-any.whl
Collecting sgmllib3k (from feedparser>=5.2.1->newspaper3k)
  Using cached sgmllib3k-1.0.0-py3-none-any.whl
Collecting requests-file>=1.4 (from tldextract>=2.0.1->newspaper3k)
  Using cached requests_file-2.1.0-py2.py3-none-any.whl.metadata (1.7 kB)
Using cached newspaper3k-0.

In [8]:
%pip install lxml_html_clean

Collecting lxml_html_clean
  Downloading lxml_html_clean-0.4.2-py3-none-any.whl.metadata (2.4 kB)
Downloading lxml_html_clean-0.4.2-py3-none-any.whl (14 kB)
Installing collected packages: lxml_html_clean
Successfully installed lxml_html_clean-0.4.2


In [9]:
import torch

if torch.cuda.is_available():
    device = torch.device("cuda")
    print("✅ GPU enabled:", torch.cuda.get_device_name(0))
else:
    device = torch.device("cpu")
    print("⚠️ No GPU found, using CPU.")


✅ GPU enabled: Tesla T4


In [10]:
%pip install langdetect



In [12]:
# ===== 2) DATASET LOADING =====
import pandas as pd
from bs4 import BeautifulSoup
import requests
from tqdm import tqdm
import re
from pathlib import Path
from newspaper import Article  # Import Article from newspaper

# Define RAW_DIR if not already defined
RAW_DIR = Path("/content/fake-news-rag/data/raw")
RAW_DIR.mkdir(exist_ok=True, parents=True)

# Define dataset paths
FAKE_CSV = RAW_DIR / "Fake.csv"
TRUE_CSV = RAW_DIR / "True.csv"
DAILYSTAR_CSV = RAW_DIR / "dailystar_news.csv"

def ensure_datasets_present():
    """Check if all required datasets exist in RAW_DIR."""
    missing = []
    if not FAKE_CSV.exists():
        missing.append("Fake.csv")
    if not TRUE_CSV.exists():
        missing.append("True.csv")
    if not DAILYSTAR_CSV.exists():
        missing.append("dailystar_news.csv")

    if missing:
        print("❌ Missing datasets:", ", ".join(missing))
        print("👉 Please place them at:", RAW_DIR)
        print("Expected columns:")
        print(" - Fake.csv / True.csv: ['title','text','subject','date']")
        print(" - dailystar_news.csv: at least ['title','text','date'] (or similar)")
    else:
        print("✅ All datasets found (ISOT + DailyStar).")

# Run the check
ensure_datasets_present()

# Load datasets (if available)
if FAKE_CSV.exists() and TRUE_CSV.exists():
    fake_df = pd.read_csv(FAKE_CSV)
    true_df = pd.read_csv(TRUE_CSV)
    print("ISOT Fake shape:", fake_df.shape)
    print("ISOT True shape:", true_df.shape)

if DAILYSTAR_CSV.exists():
    dailystar_df = pd.read_csv(DAILYSTAR_CSV)
    print("DailyStar shape:", dailystar_df.shape)
    print(dailystar_df.head())


✅ All datasets found (ISOT + DailyStar).
ISOT Fake shape: (23481, 4)
ISOT True shape: (21417, 4)
DailyStar shape: (2573, 5)
                                               title  date  \
0               3 BNP men among 5 held for extortion   NaN   
1               Sayeed Hossain Chowdhury passes away   NaN   
2            Youth dies being hit by train in Khulna   NaN   
3                 ‘I won’t play with planes anymore’   NaN   
4  Preventing project delays: Govt to make guidel...   NaN   

                                             content  \
0  Avijit murder convict Farabi freed on bail\nBo...   
1  Avijit murder convict Farabi freed on bail\nBo...   
2  Avijit murder convict Farabi freed on bail\nBo...   
3  Avijit murder convict Farabi freed on bail\nBo...   
4  Avijit murder convict Farabi freed on bail\nBo...   

                                              source error  
0  https://www.thedailystar.net/news/bangladesh/c...   NaN  
1  https://www.thedailystar.net/news/banglad

In [13]:
# ===== 3) PREPROCESSING & CLEANING =====
import numpy as np
import re  # Added missing import
from langdetect import detect, LangDetectException
from pathlib import Path  # Added missing import
import pandas as pd  # Added missing import

# Define PROC_DIR if not already defined
PROC_DIR = Path("/content/fake-news-rag/data/processed")  # Adjust path as needed
PROC_DIR.mkdir(exist_ok=True, parents=True)

def clean_text(s: str) -> str:
    if not isinstance(s, str):
        return ""
    s = re.sub(r"<[^>]+>", " ", s)           # HTML
    s = re.sub(r"http\S+|www\.\S+", " ", s)  # URLs
    s = re.sub(r"\s+", " ", s).strip()
    return s

def detect_lang_safe(s: str) -> str:
    try:
        return detect(s) if s else "unknown"
    except LangDetectException:
        return "unknown"

def load_isot(fake_csv: Path, true_csv: Path) -> pd.DataFrame:
    if not fake_csv.exists() or not true_csv.exists():
        return pd.DataFrame(columns=["title","text","subject","date","label"])
    fake = pd.read_csv(fake_csv)
    true = pd.read_csv(true_csv)
    fake["label"] = 0
    true["label"] = 1
    df = pd.concat([fake, true], ignore_index=True)
    # Normalize column names
    if "text" not in df.columns:
        if "content" in df.columns:
            df["text"] = df["content"]
        else:
            df["text"] = ""
    if "title" not in df.columns:
        df["title"] = ""
    return df[["title","text","label"]]

# --- Load ISOT ---
isot_df = load_isot(FAKE_CSV, TRUE_CSV)

if not isot_df.empty:
    isot_df["title"] = isot_df["title"].astype(str).apply(clean_text)
    isot_df["text"]  = isot_df["text"].astype(str).apply(clean_text)
    isot_df["lang"]  = isot_df["text"].apply(detect_lang_safe)
else:
    print("ISOT not loaded; continuing with scraped only.")

# --- Load DailyStar ---
if DAILYSTAR_CSV.exists():
    try:
        dailystar_df = pd.read_csv(DAILYSTAR_CSV)
        if not dailystar_df.empty:
            if "text" not in dailystar_df.columns:
                if "content" in dailystar_df.columns:
                    dailystar_df["text"] = dailystar_df["content"]
                else:
                    dailystar_df["text"] = ""
            if "title" not in dailystar_df.columns:
                dailystar_df["title"] = ""

            dailystar_df["title"] = dailystar_df["title"].astype(str).apply(clean_text)
            dailystar_df["text"]  = dailystar_df["text"].astype(str).apply(clean_text)
            dailystar_df["label"] = np.nan  # unknown
            dailystar_df["lang"]  = dailystar_df["text"].apply(detect_lang_safe)
        else:
            print("DailyStar dataset is empty, skipping.")
    except Exception as e:
        print(f"Error loading DailyStar dataset: {e}")
        dailystar_df = pd.DataFrame(columns=["title","text","label","lang"])
else:
    print("DailyStar CSV not found.")
    dailystar_df = pd.DataFrame(columns=["title","text","label","lang"])

# Save a separate cleaned DailyStar file for reuse
if not dailystar_df.empty:
    DAILYSTAR_PROC_PATH = PROC_DIR / "dailystar_clean.csv"
    dailystar_df.to_csv(DAILYSTAR_PROC_PATH, index=False, encoding="utf-8")
    print(f"Processed DailyStar dataset saved: {DAILYSTAR_PROC_PATH} (rows={len(dailystar_df)})")

# --- Prepare scraped ---
if 'scraped_df' in globals() and not scraped_df.empty:
    scraped_df = scraped_df.rename(columns={"content":"text"})
    scraped_df["title"] = scraped_df["title"].astype(str).apply(clean_text)
    scraped_df["text"]  = scraped_df["text"].astype(str).apply(clean_text)
    scraped_df["label"] = np.nan  # unknown
    scraped_df["lang"]  = scraped_df["text"].apply(detect_lang_safe)
else:
    print("Scraped data not available; creating empty DataFrame")
    scraped_df = pd.DataFrame(columns=["title", "text", "label", "lang"])

# --- Merge (ISOT labeled + DailyStar unlabeled + Scraped unlabeled) ---
combined_df = pd.DataFrame(columns=["title","text","label","lang","source"])
frames = []
if not isot_df.empty:
    a = isot_df.copy()
    a["source"] = "isot"
    frames.append(a)
if not scraped_df.empty:
    b = scraped_df[["title","text","label","lang"]].copy()
    b["source"] = "scraped"
    frames.append(b)
if not dailystar_df.empty:
    c = dailystar_df[["title","text","label","lang"]].copy()
    c["source"] = "dailystar"
    frames.append(c)

if frames:
    combined_df = pd.concat(frames, ignore_index=True)

# --- Drop empties / dedup ---
combined_df = combined_df.dropna(subset=["text"])
combined_df = combined_df[combined_df["text"].str.len() > 30]  # minimal length
combined_df = combined_df.drop_duplicates(subset=["title","text"]).reset_index(drop=True)

# --- Save combined dataset ---
PROC_PATH = PROC_DIR / "clean_dataset.csv"
combined_df.to_csv(PROC_PATH, index=False, encoding="utf-8")
print(f"Processed dataset saved: {PROC_PATH} (rows={len(combined_df)})")
print("Language distribution:")
print(combined_df["lang"].value_counts())
display(combined_df.sample(min(3, len(combined_df))))


Processed DailyStar dataset saved: /content/fake-news-rag/data/processed/dailystar_clean.csv (rows=2573)
Scraped data not available; creating empty DataFrame
Processed dataset saved: /content/fake-news-rag/data/processed/clean_dataset.csv (rows=41106)
Language distribution:
lang
en    41098
de        6
sw        1
vi        1
Name: count, dtype: int64


Unnamed: 0,title,text,label,lang,source
36647,"As Congo refugees pour over border, Angola's b...","DUNDO, Angola (Reuters) - Captured by militia ...",1.0,en,isot
38116,India's Modi heads to Myanmar as Rohingya refu...,NEW DELHI (Reuters) - Indian Prime Minister Na...,1.0,en,isot
5025,"NBC Chairman SHREDS ‘Demented’ Trump, Rips Rat...",NBC Entertainment chairman Bob Greenblatt took...,0.0,en,isot


In [14]:
# ===== 4) EMBEDDINGS + FAISS =====
from sentence_transformers import SentenceTransformer
import faiss
import json
from pathlib import Path
import torch

# Detect GPU device
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

# Define EMB_DIR if not already defined
EMB_DIR = Path("/content/fake-news-rag/models/embeddings")
EMB_DIR.mkdir(exist_ok=True, parents=True)

# Check if combined_df exists and has data
if 'combined_df' not in globals() or len(combined_df) == 0:
    PROC_PATH = PROC_DIR / "clean_dataset.csv"
    if PROC_PATH.exists():
        print("Loading combined_df from processed file...")
        try:
            combined_df = pd.read_csv(PROC_PATH)
            if len(combined_df) == 0:
                raise ValueError("Processed file is empty.")
        except Exception as e:
            print(f"Could not load processed file ({e}). Attempting to re-create combined_df...")

            # Load ISOT
            isot_df = load_isot(FAKE_CSV, TRUE_CSV)
            print(f"Loaded ISOT data: {len(isot_df)} rows")

            # Load DailyStar if available
            if DAILYSTAR_CSV.exists():
                try:
                    dailystar_df = pd.read_csv(DAILYSTAR_CSV)
                    print(f"Loaded DailyStar data: {len(dailystar_df)} rows")

                    if not dailystar_df.empty:
                        dailystar_df["title"] = dailystar_df["title"].astype(str).apply(clean_text)
                        dailystar_df["text"]  = dailystar_df["text"].astype(str).apply(clean_text)
                        dailystar_df["label"] = np.nan  # unknown
                        dailystar_df["lang"]  = dailystar_df["text"].apply(detect_lang_safe)
                        dailystar_df["source"] = "dailystar"
                except Exception as e:
                    print(f"Error loading DailyStar dataset: {e}")
                    dailystar_df = pd.DataFrame(columns=["title","text","label","lang","source"])
            else:
                print("DailyStar CSV not found.")
                dailystar_df = pd.DataFrame(columns=["title","text","label","lang","source"])

            # Mock scraped fallback
            print("Generating mock scraped data as fallback...")
            mock_data = {
                'title': [
                    'Bangladesh Government Announces New Economic Policy',
                    'Dhaka Metro Rail Expansion Project Approved',
                    'New Solar Power Plant Inaugurated in Rural Area',
                    'Prime Minister Meets with Foreign Dignitaries',
                    'Inflation Rate Shows Slight Decrease'
                ],
                'content': [
                    'The government has introduced a new set of economic policies aimed at boosting foreign investment and stabilizing the national currency...',
                    'The long-awaited expansion of the Dhaka Metro Rail project received final approval today...',
                    'A new large-scale solar power plant was inaugurated today in a rural district...',
                    'Prime Minister Sheikh Hasina held meetings with several foreign dignitaries today...',
                    'According to the latest report from the Bureau of Statistics, the national inflation rate saw a slight decrease last month...'
                ],
                'source': [
                    'https://example.com/news/economic-policy',
                    'https://example.com/news/metro-rail',
                    'https://example.com/news/solar-plant',
                    'https://example.com/news/pm-meeting',
                    'https://example.com/news/inflation-report'
                ],
                'error': [None, None, None, None, None]
            }
            scraped_df = pd.DataFrame(mock_data)
            print(f"Generated mock scraped data: {len(scraped_df)} rows")

            # Cleaning & merging
            if not isot_df.empty:
                isot_df["title"] = isot_df["title"].astype(str).apply(clean_text)
                isot_df["text"]  = isot_df["text"].astype(str).apply(clean_text)
                isot_df["lang"]  = isot_df["text"].apply(detect_lang_safe)
                isot_df["source"] = "isot"

            if not scraped_df.empty:
                scraped_df = scraped_df.rename(columns={"content":"text"})
                scraped_df["title"] = scraped_df["title"].astype(str).apply(clean_text)
                scraped_df["text"]  = scraped_df["text"].astype(str).apply(clean_text)
                scraped_df["label"] = np.nan
                scraped_df["lang"]  = scraped_df["text"].apply(detect_lang_safe)
                scraped_df["source"] = "scraped"

            frames = []
            if not isot_df.empty: frames.append(isot_df)
            if not scraped_df.empty: frames.append(scraped_df[["title","text","label","lang","source"]])
            if not dailystar_df.empty: frames.append(dailystar_df[["title","text","label","lang","source"]])

            if frames:
                combined_df = pd.concat(frames, ignore_index=True)
            else:
                combined_df = pd.DataFrame(columns=["title","text","label","lang","source"])
                print("No data frames to concatenate.")

            combined_df = combined_df.dropna(subset=["text"])
            combined_df = combined_df[combined_df["text"].str.len() > 30]
            combined_df = combined_df.drop_duplicates(subset=["title","text"]).reset_index(drop=True)

            if len(combined_df) == 0:
                raise ValueError("Failed to re-create combined_df with enough data.")
            else:
                print(f"Successfully re-created combined_df with {len(combined_df)} rows.")
    else:
        print("No data to embed. Ensure ISOT, DailyStar, or scraping succeeded.")
        raise SystemExit("No data to embed.")

print(f"Embedding {len(combined_df)} documents...")

MODEL_NAME = "all-MiniLM-L6-v2"
embedder = SentenceTransformer(MODEL_NAME, device=device)  # use GPU if available

# Encode title + text
def to_doc(row):
    return f"{row['title']}. {row['text']}"

docs = combined_df.apply(to_doc, axis=1).tolist()

# Compute embeddings on GPU if available
emb = embedder.encode(
    docs,
    batch_size=64,
    convert_to_numpy=True,
    show_progress_bar=True,
    normalize_embeddings=True,
    device=device
).astype("float32")

if emb.shape[0] == 0:
    raise ValueError("No embeddings generated. The input documents list might be empty.")

dim = emb.shape[1]
print("Embeddings shape:", emb.shape)

# Use IndexFlatIP for cosine similarity (dot product on normalized vectors)
# Always use CPU index to avoid GPU resource issues
index = faiss.IndexFlatIP(dim)  # CPU version
print("Using FAISS CPU index.")

index.add(emb)
print("FAISS ntotal:", index.ntotal)

# Save index + metadata
FAISS_PATH = EMB_DIR / "faiss.index"
META_PATH  = EMB_DIR / "metadata.json"

if index.ntotal > 0:
    faiss.write_index(index, str(FAISS_PATH))

    metadata = {
        "rows": len(combined_df),
        "dimension": dim,
        "model": MODEL_NAME,
        "documents": [
            {
                "index": i,
                "title": str(row['title'])[:100],
                "text_preview": str(row['text'])[:200] if pd.notna(row['text']) else "",
                "label": row['label'] if 'label' in row and pd.notna(row['label']) else "unknown",
                "source": row['source'] if 'source' in row else "unknown"
            }
            for i, (_, row) in enumerate(combined_df.iterrows())
        ]
    }

    with open(META_PATH, "w", encoding="utf-8") as f:
        json.dump(metadata, f, indent=2, ensure_ascii=False)

    print("✅ Saved:", FAISS_PATH, "and", META_PATH)
    print(f"FAISS index contains {index.ntotal} vectors with dimension {dim}")

    # Test query
    sample_query = "news about politics"
    query_embedding = embedder.encode([sample_query], normalize_embeddings=True, device=device).astype("float32")
    D, I = index.search(query_embedding, 3)

    print(f"\nSample query: '{sample_query}'")
    print("Top 3 similar documents:")
    for i, (distance, idx) in enumerate(zip(D[0], I[0])):
        if idx < len(combined_df):
            doc_title = combined_df.iloc[idx]['title']
            print(f"{i+1}. Distance: {distance:.4f}, Title: {doc_title[:80]}...")
else:
    print("⚠️ Skipping FAISS index saving as no embeddings were generated.")

Using device: cuda
Embedding 41106 documents...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/643 [00:00<?, ?it/s]

Embeddings shape: (41106, 384)
Using FAISS CPU index.
FAISS ntotal: 41106
✅ Saved: /content/fake-news-rag/models/embeddings/faiss.index and /content/fake-news-rag/models/embeddings/metadata.json
FAISS index contains 41106 vectors with dimension 384

Sample query: 'news about politics'
Top 3 similar documents:
1. Distance: 0.5570, Title: OBAMA BLAMES RUSSIA For Hillary’s Loss, But NEW HARVARD STUDY Exposes Who REALLY...
2. Distance: 0.5220, Title: NOTHING NEW: ‘Fake’ & Weaponized News Has Long Haunted Our War-Weary World...
3. Distance: 0.5214, Title: FAKE NEWS WEEK: Mainstream Media – All the Fake News That’s Fit to Print...


In [15]:
from pathlib import Path  # Added missing import

def check_existing_models():
    """Check which models already exist to avoid retraining"""
    existing_models = {}

    # Define MODEL_DIR and other necessary directories if not already defined
    BASE_DIR = Path("/content/fake-news-rag")
    MODEL_DIR = BASE_DIR / "models"
    EMB_DIR = MODEL_DIR / "embeddings"
    PROC_DIR = BASE_DIR / "data" / "processed"

    # Check for FAISS index
    FAISS_PATH = EMB_DIR / "faiss.index"
    existing_models['faiss'] = FAISS_PATH.exists()

    # Check for metadata
    META_PATH = EMB_DIR / "metadata.json"
    existing_models['metadata'] = META_PATH.exists()

    # Check for classifier
    CLASSIFIER_PATH = MODEL_DIR / "logistic_classifier.joblib"  # Assuming classifier is saved in MODEL_DIR
    existing_models['classifier'] = CLASSIFIER_PATH.exists()

    # Check for combined dataframe (ISOT + others)
    COMBINED_DF_PATH = PROC_DIR / "combined_dataframe.pkl"  # Or .csv if saved that way
    CLEAN_CSV_PATH = PROC_DIR / "clean_dataset.csv"
    existing_models['combined_df'] = COMBINED_DF_PATH.exists() or CLEAN_CSV_PATH.exists()

    # ✅ Check for DailyStar-specific processed/cleaned files
    DAILYSTAR_CSV_PATH = PROC_DIR / "dailystar_clean.csv"
    DAILYSTAR_PKL_PATH = PROC_DIR / "dailystar_dataframe.pkl"
    existing_models['dailystar'] = DAILYSTAR_CSV_PATH.exists() or DAILYSTAR_PKL_PATH.exists()

    return existing_models

# Check what we already have
existing_models = check_existing_models()
print("Existing models found:")
for model, exists in existing_models.items():
    print(f"  {model}: {'✓' if exists else '✗'}")


Existing models found:
  faiss: ✓
  metadata: ✓
  classifier: ✗
  combined_df: ✓
  dailystar: ✓


In [None]:
# ===== 5) AZURE PHI-4 MINI INTEGRATION =====
import os
import json  # Added missing import
import textwrap  # Added missing import
from typing import List, Dict, Any, Optional
from openai import AzureOpenAI

# ---- Set your secrets here (or use os.environ[...] before running this cell) ----
# os.environ["AZURE_OPENAI_ENDPOINT"]    = "https://<your-resource>.openai.azure.com"
# os.environ["AZURE_OPENAI_API_KEY"]     = "<your-azure-openai-key>"
# os.environ["AZURE_OPENAI_DEPLOYMENT"]  = "Phi-4-mini-reasoning-2"   # <-- your deployment name (not base model name)
# os.environ["AZURE_OPENAI_API_VERSION"] = "2024-10-21"        # pick a stable version available to your resource

AZURE_ENDPOINT   = os.getenv("AZURE_OPENAI_ENDPOINT", "")
AZURE_API_KEY    = os.getenv("AZURE_OPENAI_API_KEY", "")
AZURE_DEPLOYMENT = os.getenv("AZURE_OPENAI_DEPLOYMENT", "Phi-4-mini-reasoning")
AZURE_API_VERSION= os.getenv("AZURE_OPENAI_API_VERSION", "")

def get_azure_client() -> Optional[AzureOpenAI]:
    if not AZURE_ENDPOINT or not AZURE_API_KEY:
        print("Azure credentials missing. Using MOCK generation (for dev/testing).")
        return None
    try:
        client = AzureOpenAI(
            azure_endpoint=AZURE_ENDPOINT,
            api_key=AZURE_API_KEY,
            api_version=AZURE_API_VERSION
        )
        # Test the connection with a simple call
        client.models.list()
        print("Azure OpenAI client initialized successfully.")
        return client
    except Exception as e:
        print(f"Failed to initialize Azure client: {e}. Using MOCK generation.")
        return None

client = get_azure_client()

SYSTEM_PROMPT = (
    "You are a factual fact-checking assistant for Bangladesh news. "
    "Classify news as Fake or Real using ONLY the provided context. "
    "If Real, produce a concise timeline (dated bullet lines). "
    "If insufficient context, say INSUFFICIENT CONTEXT."
)

def llm_classify_and_timeline(news_text: str, context_docs: List[str], few_shots: List[Dict[str,str]] = None) -> Dict[str, Any]:
    """
    Returns structured dict:
    {
      'classification': 'Real'|'Fake'|'Unknown',
      'confidence': float|null,
      'timeline': [str]|null,
      'reason': str|optional
    }
    """
    if client is None:
        # MOCK path for offline dev
        # naive heuristic: if "postponed" or "announced" appears → real-ish :)
        if any(k in news_text.lower() for k in ["announced","postponed","government","ministry","election","official"]):
            return {"classification":"Real","confidence":0.7,"timeline":["2025-07-01 - Government announced X"],"reason":"Heuristic mock - contains official keywords"}
        elif any(k in news_text.lower() for k in ["fake","false","rumor","hoax","misinformation"]):
            return {"classification":"Fake","confidence":0.8,"timeline":None,"reason":"Heuristic mock - contains fake indicators"}
        return {"classification":"Unknown","confidence":0.5,"timeline":None,"reason":"Heuristic mock - insufficient keywords"}

    # Build messages with optional few-shot examples
    messages = [{"role":"system","content": SYSTEM_PROMPT}]
    if few_shots:
        for ex in few_shots:
            messages.append({"role":"user", "content": f"News:\n{ex['news']}\nContext:\n{ex['context']}"})
            messages.append({"role":"assistant", "content": ex["answer"]})

    context_joined = "\n\n---\n".join(context_docs[:5])  # Limit to top 5 context docs
    user_prompt = textwrap.dedent(f"""
    TASK:
    1) Decide if the News is Fake or Real using only CONTEXT.
    2) If Real: output a dated timeline (YYYY-MM-DD - short event).
    3) If insufficient evidence: 'INSUFFICIENT CONTEXT'.

    FORMAT (valid JSON):
    {{
      "classification": "Real" | "Fake" | "Unknown",
      "confidence": 0.0-1.0,
      "timeline": ["YYYY-MM-DD - event", "..."] | null,
      "reason": "one-line rationale"
    }}

    NEWS:
    {news_text}

    CONTEXT:
    {context_joined}
    """).strip()

    messages.append({"role":"user","content": user_prompt})

    try:
        resp = client.chat.completions.create(
            model=AZURE_DEPLOYMENT,  # deployment name
            messages=messages,
            temperature=0.2,
            max_tokens=700,
            top_p=0.9,
            response_format={"type": "json_object"}  # Force JSON response
        )
        out = resp.choices[0].message.content

        # Parse JSON
        try:
            parsed = json.loads(out)
            # Validate required fields
            required_fields = ["classification", "confidence", "timeline"]
            for field in required_fields:
                if field not in parsed:
                    parsed[field] = None
            return parsed
        except json.JSONDecodeError:
            return {"classification":"Unknown","confidence":None,"timeline":None,"reason":"Invalid JSON response"}

    except Exception as e:
        print(f"Azure API error: {e}")
        return {"classification":"Unknown","confidence":None,"timeline":None,"reason":f"API error: {str(e)}"}

# Test function
def test_llm_integration():
    """Test the LLM integration with a simple example"""
    test_news = "The government announced new economic policies to boost growth."
    test_context = ["Government announces economic stimulus package on 2024-01-15", "Finance minister held press conference yesterday"]

    print("Testing LLM integration...")
    result = llm_classify_and_timeline(test_news, test_context)
    print("Result:", json.dumps(result, indent=2))

# Uncomment to test
# test_llm_integration()

Azure OpenAI client initialized successfully.


In [17]:
# ===== 6) OPTIONAL: CLASSIFIER ON EMBEDDINGS + FEW-SHOT =====
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, f1_score
import json
import numpy as np
import pandas as pd
import joblib
from pathlib import Path

# Define PROC_DIR if not already defined
PROC_DIR = Path("/content/fake-news-rag/data/processed")
PROC_DIR.mkdir(exist_ok=True, parents=True)

CLASSIFIER_PATH = PROC_DIR / "logistic_classifier.joblib"

# Try to load classifier if it already exists
if CLASSIFIER_PATH.exists():
    clf = joblib.load(CLASSIFIER_PATH)
    print(f"✅ Loaded existing classifier from {CLASSIFIER_PATH}")
else:
    # Load processed dataset if not already available
    if 'combined_df' not in globals() or len(combined_df) == 0:
        PROC_PATH = PROC_DIR / "clean_dataset.csv"
        if PROC_PATH.exists():
            print("Loading combined_df from processed file for classifier training...")
            try:
                combined_df = pd.read_csv(PROC_PATH)
            except Exception as e:
                print(f"Could not load processed file: {e}. Skipping classifier training.")
                combined_df = pd.DataFrame()
        else:
            print("combined_df not found. Skipping classifier training.")
            combined_df = pd.DataFrame()

    labeled = combined_df.dropna(subset=["label"]).copy()

    if len(labeled) == 0:
        print("No labeled data available for classifier training. Skipping...")
    else:
        print(f"Training classifier on {len(labeled)} labeled examples...")

        def to_doc(row):
            title = str(row['title']) if pd.notna(row['title']) else ''
            text = str(row['text']) if pd.notna(row['text']) else ''
            return f"{title}. {text}"

        X_docs = [to_doc(row) for _, row in labeled.iterrows()]
        X_docs = [doc for doc in X_docs if isinstance(doc, str) and len(doc.strip()) > 0]

        if not X_docs:
            print("No valid documents generated for embedding. Skipping classifier training.")
        else:
            X_emb  = embedder.encode(X_docs, convert_to_numpy=True, show_progress_bar=True, normalize_embeddings=True)
            y      = labeled["label"].astype(int).values[:len(X_emb)]

            if len(X_emb) >= 10:
                X_train, X_test, y_train, y_test = train_test_split(
                    X_emb, y, test_size=0.2, random_state=42, stratify=y
                )

                clf = LogisticRegression(max_iter=300, class_weight="balanced")
                clf.fit(X_train, y_train)
                print("✅ Embedding classifier trained.")

                y_pred = clf.predict(X_test)
                print("Classification Report:")
                print(classification_report(y_test, y_pred, digits=4, target_names=["Fake", "Real"]))
                print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")

                # Save for reuse
                joblib.dump(clf, CLASSIFIER_PATH)
                print(f"💾 Classifier saved to: {CLASSIFIER_PATH}")

                np.save(PROC_DIR / "X_test.npy", X_test)
                np.save(PROC_DIR / "y_test.npy", y_test)
                print("X_test and y_test saved for evaluation.")
            else:
                print(f"Not enough data ({len(X_emb)} samples). Need at least 10.")

# Few-shot examples
few_shots = [
    { "news": "Government announces rural healthcare expansion.",
      "context": "Verified article: Government announced expansion in July 2025 with budget allocation of $500 million.",
      "answer": json.dumps({
          "classification": "Real",
          "confidence": 0.92,
          "timeline": ["2025-07-15 - Announcement", "2025-08-01 - Budget approved"],
          "reason": "Matches multiple verified sources"
      })
    },
    { "news": "Secret herb instantly cures COVID claimed by celebrity.",
      "context": "WHO: No herb can cure COVID-19. Health Ministry warning issued.",
      "answer": json.dumps({
          "classification": "Fake",
          "confidence": 0.95,
          "timeline": None,
          "reason": "Contradicted by WHO and medical research"
      })
    },
    { "news": "Prime Minister resigns unexpectedly amid protests.",
      "context": "Govt statement: PM continues duties. Press conference yesterday confirmed.",
      "answer": json.dumps({
          "classification": "Fake",
          "confidence": 0.88,
          "timeline": None,
          "reason": "No credible evidence; contradicted by official sources"
      })
    }
]

print(f"\nPrepared {len(few_shots)} few-shot examples.")

# Prediction function
def predict_with_classifier(news_text, title=""):
    if 'clf' not in globals() or 'embedder' not in globals():
        print("Classifier or embedder not available.")
        return None

    doc = f"{title}. {news_text}" if title else news_text
    embedding = embedder.encode([doc], convert_to_numpy=True, normalize_embeddings=True)

    try:
        prediction = clf.predict(embedding)[0]
        probability = clf.predict_proba(embedding)[0]
        return {
            "classification": "Real" if prediction == 1 else "Fake",
            "confidence": float(max(probability)),
            "source": "LogisticRegression"
        }
    except Exception as e:
        print(f"Prediction error: {e}")
        return None

# Quick test
if 'clf' in globals():
    test_text = "Government announces new economic policies"
    result = predict_with_classifier(test_text)
    if result:
        print(f"\nTest prediction: '{test_text}' → {result}")


Training classifier on 38550 labeled examples...


Batches:   0%|          | 0/1205 [00:00<?, ?it/s]

✅ Embedding classifier trained.
Classification Report:
              precision    recall  f1-score   support

        Fake     0.9376    0.9435    0.9406      3471
        Real     0.9535    0.9486    0.9510      4239

    accuracy                         0.9463      7710
   macro avg     0.9456    0.9461    0.9458      7710
weighted avg     0.9463    0.9463    0.9463      7710

Accuracy: 0.9463
💾 Classifier saved to: /content/fake-news-rag/data/processed/logistic_classifier.joblib
X_test and y_test saved for evaluation.

Prepared 3 few-shot examples.

Test prediction: 'Government announces new economic policies' → {'classification': 'Real', 'confidence': 0.940284550138454, 'source': 'LogisticRegression'}


In [18]:
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import BernoulliNB
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
import numpy as np
import pandas as pd
import joblib
from pathlib import Path

# Define PROC_DIR if not already defined
PROC_DIR = Path("/content/fake-news-rag/data/processed")
PROC_DIR.mkdir(exist_ok=True, parents=True)

# Add only NEW classifiers (exclude LogisticRegression since you already have it)
CLASSIFIERS = {
    "linear_svc": LinearSVC(class_weight="balanced"),
    "bernoulli_nb": BernoulliNB(),
    "random_forest": RandomForestClassifier(n_estimators=200, class_weight="balanced", random_state=42),
    "gradient_boost": GradientBoostingClassifier(n_estimators=200, random_state=42),
    "knn": KNeighborsClassifier(n_neighbors=5)
}

# Helper: Load dataset
def load_dataset():
    if 'combined_df' in globals() and len(combined_df) > 0:
        return combined_df

    PROC_PATH = PROC_DIR / "clean_dataset.csv"
    if PROC_PATH.exists():
        print("Loading combined_df from processed file for classifier training...")
        try:
            return pd.read_csv(PROC_PATH)
        except Exception as e:
            print(f"Could not load processed file: {e}.")
    print("combined_df not found. Skipping classifier training.")
    return pd.DataFrame()

# Train all classifiers (except LogisticRegression which you already did)
combined_df = load_dataset()
labeled = combined_df.dropna(subset=["label"]).copy()

if len(labeled) == 0:
    print("No labeled data available for classifier training. Skipping...")
else:
    print(f"Training classifiers on {len(labeled)} labeled examples...")

    def to_doc(row):
        title = str(row['title']) if pd.notna(row['title']) else ''
        text = str(row['text']) if pd.notna(row['text']) else ''
        return f"{title}. {text}"

    X_docs = [to_doc(row) for _, row in labeled.iterrows()]
    X_docs = [doc for doc in X_docs if isinstance(doc, str) and len(doc.strip()) > 0]

    if not X_docs:
        print("No valid documents generated for embedding. Skipping classifier training.")
    else:
        X_emb = embedder.encode(X_docs, convert_to_numpy=True, show_progress_bar=True, normalize_embeddings=True)
        y     = labeled["label"].astype(int).values[:len(X_emb)]

        if len(X_emb) >= 10:
            X_train, X_test, y_train, y_test = train_test_split(
                X_emb, y, test_size=0.2, random_state=42, stratify=y
            )

            for name, model in CLASSIFIERS.items():
                print(f"\n=== Training {name.upper()} ===")
                try:
                    model.fit(X_train, y_train)
                    y_pred = model.predict(X_test)

                    print("Classification Report:")
                    print(classification_report(y_test, y_pred, digits=4, target_names=["Fake", "Real"]))
                    print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")

                    # Save model
                    path = PROC_DIR / f"{name}_classifier.joblib"
                    joblib.dump(model, path)
                    print(f"💾 {name} model saved to: {path}")

                except Exception as e:
                    print(f"❌ Error training {name}: {e}")

            # Save test data once
            np.save(PROC_DIR / "X_test.npy", X_test)
            np.save(PROC_DIR / "y_test.npy", y_test)
            print("X_test and y_test saved for evaluation.")
        else:
            print(f"Not enough data ({len(X_emb)} samples). Need at least 10.")

# Prediction function for new classifiers
def predict_with_classifier(news_text, title="", model_name="random_forest"):
    path = PROC_DIR / f"{model_name}_classifier.joblib"
    if not path.exists():
        print(f"Model {model_name} not trained or not found.")
        return None

    clf = joblib.load(path)
    doc = f"{title}. {news_text}" if title else news_text
    embedding = embedder.encode([doc], convert_to_numpy=True, normalize_embeddings=True)

    try:
        prediction = clf.predict(embedding)[0]
        confidence = None
        if hasattr(clf, "predict_proba"):
            probability = clf.predict_proba(embedding)[0]
            confidence = float(max(probability))

        return {
            "classification": "Real" if prediction == 1 else "Fake",
            "confidence": confidence,
            "source": model_name
        }
    except Exception as e:
        print(f"Prediction error with {model_name}: {e}")
        return None

# Quick test with RandomForest
test_text = "Government announces new economic policies"
result = predict_with_classifier(test_text, model_name="random_forest")
if result:
    print(f"\nTest prediction ({result['source']}): '{test_text}' → {result}")


Training classifiers on 38550 labeled examples...


Batches:   0%|          | 0/1205 [00:00<?, ?it/s]


=== Training LINEAR_SVC ===
Classification Report:
              precision    recall  f1-score   support

        Fake     0.9553    0.9551    0.9552      3471
        Real     0.9632    0.9634    0.9633      4239

    accuracy                         0.9597      7710
   macro avg     0.9593    0.9592    0.9593      7710
weighted avg     0.9597    0.9597    0.9597      7710

Accuracy: 0.9597
💾 linear_svc model saved to: /content/fake-news-rag/data/processed/linear_svc_classifier.joblib

=== Training BERNOULLI_NB ===
Classification Report:
              precision    recall  f1-score   support

        Fake     0.8190    0.8433    0.8309      3471
        Real     0.8685    0.8474    0.8578      4239

    accuracy                         0.8455      7710
   macro avg     0.8437    0.8453    0.8444      7710
weighted avg     0.8462    0.8455    0.8457      7710

Accuracy: 0.8455
💾 bernoulli_nb model saved to: /content/fake-news-rag/data/processed/bernoulli_nb_classifier.joblib

=== Train

In [19]:
from pathlib import Path

PROC_DIR = Path("/content/fake-news-rag/data/processed")
print((PROC_DIR / "X_test.npy").exists())
print((PROC_DIR / "y_test.npy").exists())


True
True


In [22]:
# ===== 🚀 FAST TRAINING PIPELINE =====
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.utils import resample
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn.svm import SVC, LinearSVC
from sklearn.metrics import accuracy_score, f1_score
from sklearn.decomposition import TruncatedSVD
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
import joblib
import numpy as np
import os
from pathlib import Path
import pandas as pd

# ==== Directories ====
BASE_DIR = Path("/content/fake-news-rag")
PROC_DIR = BASE_DIR / "data" / "processed"
os.makedirs(PROC_DIR, exist_ok=True)

# ==== Load data ====
if 'combined_df' not in globals():
    try:
        combined_df = pd.read_csv(PROC_DIR / "clean_dataset.csv")
        print("✅ Loaded combined_df from processed file")
    except:
        print("❌ No data available for training improvements")
        combined_df = pd.DataFrame()

# ==== Prepare dataset ====
labeled_data = combined_df.dropna(subset=["label"]).copy() if not combined_df.empty else pd.DataFrame()
if len(labeled_data) < 20:
    print("⚠️ Not enough labeled data for training improvements. Need at least 20 samples.")
else:
    print(f"\n=== 🚀 Training Improvements ===")
    print(f"Starting with {len(labeled_data)} labeled samples")

    # Balance dataset
    class_counts = labeled_data["label"].value_counts()
    min_class_count = min(class_counts)
    df_balanced = (
        labeled_data if min_class_count < 10 else pd.concat([
            labeled_data[labeled_data["label"]==0].sample(min_class_count, random_state=42),
            labeled_data[labeled_data["label"]==1].sample(min_class_count, random_state=42)
        ])
    )

    # Combine title + text
    documents = df_balanced.apply(lambda r: f"{r['title']} {r['text']}".strip(), axis=1).tolist()
    labels = df_balanced["label"].astype(int).values

    # Split
    X_train, X_test, y_train, y_test = train_test_split(
        documents, labels, test_size=0.2, random_state=42, stratify=labels
    )
    print(f"Training samples: {len(X_train)}, Test samples: {len(X_test)}")

    # Embeddings
    print("🔄 Creating embeddings...")
    X_train_emb = embedder.encode(X_train, show_progress_bar=True, convert_to_numpy=True)
    X_test_emb = embedder.encode(X_test, show_progress_bar=True, convert_to_numpy=True)

    # Dimensionality reduction for tree-based models (keeps 100 features)
    svd = TruncatedSVD(n_components=100, random_state=42)
    X_train_red = svd.fit_transform(X_train_emb)
    X_test_red = svd.transform(X_test_emb)

    # ==== Classifiers ====
    CLASSIFIERS = {
        "LogisticRegression": (
            LogisticRegression(max_iter=1000, class_weight="balanced"),
            {"C": [0.1, 1, 10]}
        ),
        "RandomForest": (
            RandomForestClassifier(class_weight="balanced", random_state=42, n_jobs=-1),
            {"n_estimators": [100, 200], "max_depth": [10, None]}
        ),
        "SVM": (
            SVC(class_weight="balanced", probability=True),
            {"C": [0.5, 1, 2], "kernel": ["linear", "rbf"]}
        ),
        "LinearSVC": (
            LinearSVC(class_weight="balanced", max_iter=5000),
            {"C": [0.1, 1, 10]}
        ),
        "BernoulliNB": (
            BernoulliNB(),
            {"alpha": [0.1, 1, 5]}
        ),
        "XGBoost": (
            XGBClassifier(
                tree_method="hist",  # fast training
                eval_metric="logloss",
                use_label_encoder=False,
                n_jobs=-1,
                random_state=42
            ),
            {"n_estimators": [100, 200], "learning_rate": [0.05, 0.1], "max_depth": [3, 5]}
        ),
        "LightGBM": (
            LGBMClassifier(random_state=42, n_jobs=-1),
            {"n_estimators": [100, 200], "learning_rate": [0.05, 0.1], "max_depth": [-1, 5]}
        )
    }

    # ==== Training Loop ====
    best_overall, best_f1 = None, -1
    for name, (model, param_dist) in CLASSIFIERS.items():
        print(f"\n=== 🔹 Training {name} ===")
        try:
            # Use reduced embeddings for tree models
            Xtr, Xte = (X_train_red, X_test_red) if name in ["RandomForest", "XGBoost", "LightGBM"] else (X_train_emb, X_test_emb)

            clf = RandomizedSearchCV(
                model,
                param_distributions=param_dist,
                n_iter=5,   # test only 5 random param combos
                cv=3,
                scoring="f1",
                n_jobs=-1,
                random_state=42
            )
            clf.fit(Xtr, y_train)

            print(f"✅ Best {name} params: {clf.best_params_}")
            print(f"✅ CV F1: {clf.best_score_:.4f}")

            y_pred = clf.predict(Xte)
            acc, f1 = accuracy_score(y_test, y_pred), f1_score(y_test, y_pred)
            print(f"✅ Test Accuracy: {acc:.4f}, Test F1: {f1:.4f}")

            # Save model
            model_path = PROC_DIR / f"{name.lower()}_classifier.joblib"
            joblib.dump(clf.best_estimator_, model_path)
            print(f"💾 Saved {name} to {model_path}")

            if f1 > best_f1:
                best_f1, best_overall = f1, clf.best_estimator_
                globals()['clf'] = best_overall

        except Exception as e:
            print(f"⚠️ Skipped {name} due to error: {e}")

    print(f"\n🏆 Best overall model: {best_overall.__class__.__name__} with F1 = {best_f1:.4f}")

    # Save embeddings
    np.save(PROC_DIR / "X_train_emb.npy", X_train_emb)
    np.save(PROC_DIR / "X_test_emb.npy", X_test_emb)
    np.save(PROC_DIR / "y_train.npy", y_train)
    np.save(PROC_DIR / "y_test.npy", y_test)
    print("✅ Training data embeddings saved")

print("\n=== 🚀 Fast Training Complete ===")
import json

# ==== Save best model metrics ====
if best_overall is not None:
    metrics = {
        "best_model": best_overall.__class__.__name__,
        "accuracy": float(accuracy_score(y_test, best_overall.predict(X_test_emb))),
        "f1": float(f1_score(y_test, best_overall.predict(X_test_emb)))
    }
    METRICS_PATH = PROC_DIR / "best_model_metrics.json"
    with open(METRICS_PATH, "w") as f:
        json.dump(metrics, f, indent=4)
    print(f"💾 Saved best model metrics to {METRICS_PATH}")




=== 🚀 Training Improvements ===
Starting with 38550 labeled samples
Training samples: 27769, Test samples: 6943
🔄 Creating embeddings...


Batches:   0%|          | 0/868 [00:00<?, ?it/s]

Batches:   0%|          | 0/217 [00:00<?, ?it/s]


=== 🔹 Training LogisticRegression ===




✅ Best LogisticRegression params: {'C': 10}
✅ CV F1: 0.9551
✅ Test Accuracy: 0.9516, Test F1: 0.9518
💾 Saved LogisticRegression to /content/fake-news-rag/data/processed/logisticregression_classifier.joblib

=== 🔹 Training RandomForest ===




✅ Best RandomForest params: {'n_estimators': 200, 'max_depth': None}
✅ CV F1: 0.8913
✅ Test Accuracy: 0.8874, Test F1: 0.8873
💾 Saved RandomForest to /content/fake-news-rag/data/processed/randomforest_classifier.joblib

=== 🔹 Training SVM ===




✅ Best SVM params: {'kernel': 'rbf', 'C': 2}
✅ CV F1: 0.9692
✅ Test Accuracy: 0.9703, Test F1: 0.9705
💾 Saved SVM to /content/fake-news-rag/data/processed/svm_classifier.joblib

=== 🔹 Training LinearSVC ===




✅ Best LinearSVC params: {'C': 10}
✅ CV F1: 0.9599
✅ Test Accuracy: 0.9568, Test F1: 0.9570
💾 Saved LinearSVC to /content/fake-news-rag/data/processed/linearsvc_classifier.joblib

=== 🔹 Training BernoulliNB ===




✅ Best BernoulliNB params: {'alpha': 1}
✅ CV F1: 0.8444
✅ Test Accuracy: 0.8457, Test F1: 0.8439
💾 Saved BernoulliNB to /content/fake-news-rag/data/processed/bernoullinb_classifier.joblib

=== 🔹 Training XGBoost ===


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


✅ Best XGBoost params: {'n_estimators': 200, 'max_depth': 5, 'learning_rate': 0.1}
✅ CV F1: 0.9216
✅ Test Accuracy: 0.9191, Test F1: 0.9188
💾 Saved XGBoost to /content/fake-news-rag/data/processed/xgboost_classifier.joblib

=== 🔹 Training LightGBM ===
[LightGBM] [Info] Number of positive: 13885, number of negative: 13884
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.028203 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 25500
[LightGBM] [Info] Number of data points in the train set: 27769, number of used features: 100
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500018 -> initscore=0.000072
[LightGBM] [Info] Start training from score 0.000072
✅ Best LightGBM params: {'n_estimators': 200, 'max_depth': -1, 'learning_rate': 0.1}
✅ CV F1: 0.9240
✅ Test Accuracy: 0.9192, Test F1: 0.9189
💾 Saved LightGBM to /content/fake-news-rag/data/processed/lightgbm_classifier.joblib

🏆 Best overall model: SVC 



💾 Saved best model metrics to /content/fake-news-rag/data/processed/best_model_metrics.json


In [41]:
import json, joblib

if best_overall is not None:
    metrics = {
        "best_model": best_overall.__class__.__name__,
        "accuracy": float(accuracy_score(y_test, best_overall.predict(X_test_emb))),
        "f1": float(f1_score(y_test, best_overall.predict(X_test_emb)))
    }

    # Save JSON for human readability
    with open(PROC_DIR / "best_model_metrics.json", "w") as f:
        json.dump(metrics, f, indent=4)

    # Save JOBLIB with a consistent filename
    joblib.dump(metrics, PROC_DIR / "best_model_metrics.joblib")

    print("💾 Saved best model and metrics!")


💾 Saved best model and metrics!


In [42]:
import joblib

metrics_path = PROC_DIR / "best_model_metrics.joblib"

if metrics_path.exists():
    metrics = joblib.load(metrics_path)
    print(f"✅ Loaded metrics: {metrics}")
else:
    print("⚠️ No saved metrics found, accuracy will not be displayed.")
    metrics = {"accuracy": None, "f1": None}


✅ Loaded metrics: {'best_model': 'SVC', 'accuracy': 0.9703298286043497, 'f1': 0.970487106017192}


In [44]:
# ===== 🚀 FAST RAG PIPELINE =====
import math
import joblib
import faiss

# ✅ Load saved metrics consistently (always same file)
metrics_path = PROC_DIR / "best_model_metrics.joblib"

if metrics_path.exists():
    metrics = joblib.load(metrics_path)
    print(f"✅ Metrics loaded: Accuracy={metrics.get('accuracy')}, F1={metrics.get('f1')}")
else:
    print("⚠️ No saved metrics found, accuracy will not be displayed.")
    metrics = {"accuracy": None, "f1": None}



def faiss_reload() -> faiss.Index:
    """Reload or build FAISS index"""
    if FAISS_PATH.exists():
        return faiss.read_index(str(FAISS_PATH))

    print("⚠️ FAISS index not found. Creating new index...")
    docs = combined_df.apply(to_doc, axis=1).tolist()
    emb = embedder.encode(
        docs,
        batch_size=128,  # 🔥 larger batch for speed
        convert_to_numpy=True,
        show_progress_bar=True,
        normalize_embeddings=True
    ).astype("float32")

    dim = emb.shape[1]
    index = faiss.IndexFlatIP(dim)
    index.add(emb)
    faiss.write_index(index, str(FAISS_PATH))
    return index


# Load FAISS index once
faiss_index = faiss_reload()
print(f"✅ FAISS index loaded with {faiss_index.ntotal} vectors")


def retrieve_top_k(query_text: str, k: int = 5):
    """Retrieve top-k similar documents with FAISS"""
    q = embedder.encode(
        [query_text],
        batch_size=1,
        convert_to_numpy=True,
        normalize_embeddings=True
    ).astype("float32")

    D, I = faiss_index.search(q, k)
    rows = []
    for score, idx in zip(D[0], I[0]):
        if 0 <= idx < len(combined_df):
            row = combined_df.iloc[int(idx)]
            rows.append({
                "score": float(score),
                "title": row["title"],
                "text": row["text"],
                "label": None if pd.isna(row["label"]) else int(row["label"]),
                "lang": row.get("lang", "unknown"),
                "source": row.get("source", ""),
                "idx": int(idx)
            })
    return rows


def rag_infer(news_title: str, news_text: str, k: int = 5, use_llm: bool = True) -> dict:
    """
    Retrieve docs + classify with fast model + (optionally) LLM.
    """
    inp = f"{news_title}. {news_text}".strip()
    retrieved = retrieve_top_k(inp, k=k)
    context_docs = [f"{r['title']}\n{r['text']}" for r in retrieved]

    # ===== Fast classifier prediction =====
    fast_label, fast_conf = None, None
    if 'clf' in globals() and hasattr(clf, "predict_proba"):
        try:
            fast_emb = embedder.encode(
                [inp],
                convert_to_numpy=True,
                normalize_embeddings=True
            )
            probs = clf.predict_proba(fast_emb)[0]
            fast_label = int(np.argmax(probs))
            fast_conf = float(np.max(probs))
        except Exception as e:
            print(f"⚠️ Classifier prediction failed: {e}")

    # ===== LLM classification =====
    llm_out = llm_classify_and_timeline(inp, context_docs, few_shots=few_shots) if use_llm else None

    # ===== Merge policy =====
    final_class, final_conf, timeline, reason = None, None, None, ""
    if llm_out and llm_out.get("classification") in ["Real", "Fake"]:
        final_class = llm_out["classification"]
        final_conf = llm_out.get("confidence")
        timeline = llm_out.get("timeline")
        reason = llm_out.get("reason", "")
    elif fast_label is not None:
        final_class = "Real" if fast_label == 1 else "Fake"
        final_conf = fast_conf
        reason = "LLM insufficient context; used embedding classifier." if llm_out else "LLM disabled; used embedding classifier."
    else:
        final_class, final_conf, reason = "Unknown", None, "No classifier or LLM decision available."

    return {
        "input": {"title": news_title, "text": news_text},
        "retrieved": retrieved,
        "fast_classifier": {
            "label": ("Real" if fast_label == 1 else "Fake") if fast_label is not None else None,
            "confidence": fast_conf,
            "accuracy": metrics.get("accuracy")
        },
        "llm_output": llm_out,
        "final": {
            "label": final_class,
            "confidence": final_conf,
            "timeline": timeline,
            "reason": reason,
            "accuracy": metrics.get("accuracy")
        }
    }


# ===== Quick Smoke Test =====
print("\n🚀 Running smoke test...")
sample = rag_infer(
    news_title="Government announces new health initiative",
    news_text="The government of Bangladesh announced funding for rural clinics."
)

print(f"\n✅ Final classification: {sample['final']['label']} "
      f"(confidence: {sample['final']['confidence']}, "
      f"accuracy: {sample['final']['accuracy']})")
print(f"Reason: {sample['final']['reason']}")
print(f"Retrieved {len(sample['retrieved'])} relevant documents\n")

for i, doc in enumerate(sample['retrieved'][:3], 1):
    print(f"{i}. Score: {doc['score']:.4f} - {doc['title'][:60]}...")

if sample['final'].get("timeline"):
    print(f"\n🗓 Timeline: {sample['final']['timeline']}")


✅ Metrics loaded: Accuracy=0.9703298286043497, F1=0.970487106017192
✅ FAISS index loaded with 41106 vectors

🚀 Running smoke test...
⚠️ Classifier prediction failed: X has 384 features, but LGBMClassifier is expecting 100 features as input.




Azure API error: Error code: 500 - {'error': {'code': 'Internal Server Error', 'message': 'An error occurred during the request processing: All connection attempts failed | Type: ConnectError | Traceback (most recent call last):||  File "/opt/microsoft-maas/billingproxy/lib/python3.11/site-packages/httpx/_transports/default.py", line 66, in map_httpcore_exceptions||    yield||  File "/opt/microsoft-maas/billingproxy/lib/python3.11/site-packages/httpx/_transports/default.py", line 366, in handle_async_request||    resp = await self._pool.handle_async_request(req)||           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^||  File "/opt/microsoft-maas/billingproxy/lib/python3.11/site-packages/httpcore/_async/connection_pool.py", line 256, in handle_async_request||    raise exc from None||  File "/opt/microsoft-maas/billingproxy/lib/python3.11/site-packages/httpcore/_async/connection_pool.py", line 236, in handle_async_request||    response = await connection.handle_async_request(||           

In [None]:
# Step : Install deps
!pip install openai chromadb newspaper3k sentence-transformers lxml_html_clean

# Step 2: Init Azure client
from openai import AzureOpenAI
import os

# Set your Azure credentials (better to use environment variables)
os.environ["AZURE_OPENAI_API_KEY"] = ""
os.environ["AZURE_OPENAI_ENDPOINT"] = ""

client = AzureOpenAI(
    api_key=os.getenv("AZURE_OPENAI_API_KEY"),
    azure_endpoint=os.getenv("AZURE_OPENAI_ENDPOINT"),
    api_version="2024-05-01-preview"
)

# Step 3: Scrape latest news
from newspaper import Article
import requests
from bs4 import BeautifulSoup

def scrape_article(url):
    """Robust article scraping with fallback"""
    try:
        article = Article(url)
        article.download()
        article.parse()
        return article.text
    except Exception as e:
        print(f"Newspaper3k failed: {e}, trying manual scrape...")
        try:
            headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'}
            response = requests.get(url, headers=headers, timeout=10)
            soup = BeautifulSoup(response.text, 'html.parser')

            # Try to extract main content
            for tag in ['article', 'main', '.content', '.article-body']:
                elements = soup.select(tag)
                if elements:
                    return ' '.join([elem.get_text().strip() for elem in elements])

            # Fallback: get all paragraphs
            paragraphs = soup.find_all('p')
            return ' '.join([p.get_text().strip() for p in paragraphs])
        except Exception as e2:
            print(f"Manual scrape also failed: {e2}")
            return None

url = "https://www.thedailystar.net/news/bangladesh/politics/news/pm-meets-opposition-leaders-3472166"
doc_text = scrape_article(url)

if not doc_text:
    # Fallback example text
    doc_text = "Prime Minister Sheikh Hasina met with opposition leaders today to discuss national issues. The meeting focused on political reconciliation and upcoming elections."

# Step 4: Embed into vector store
import chromadb
from sentence_transformers import SentenceTransformer
from chromadb.api.types import Documents, EmbeddingFunction, Embeddings # Import necessary types

# Initialize embedding model
embedder = SentenceTransformer("all-MiniLM-L6-v2")

# Create a wrapper class for the embedding function
class SentenceTransformerEmbeddingFunction(EmbeddingFunction):
    def __init__(self, model):
        self._model = model

    def __call__(self, input: Documents) -> Embeddings:
        # Encode to numpy array first, then convert to list
        embeddings = self._model.encode(list(input), convert_to_numpy=True)
        return embeddings.tolist() # Convert numpy array to list


# Create Chroma client
chroma_client = chromadb.Client()
# Use the wrapper class as the embedding function
collection = chroma_client.get_or_create_collection("news", embedding_function=SentenceTransformerEmbeddingFunction(embedder)) # Changed from create_collection

# Add article with embedding
collection.add(
    documents=[doc_text],
    metadatas=[{"source": url, "type": "news_article"}],
    ids=["doc1"]
)

print("Article added to vector store successfully")

# Step 5: User query
query = "Did Sheikh Hasina address the nation today?"

# Embed + retrieve
results = collection.query(
    query_texts=[query],
    n_results=3,
    include=["documents", "metadatas", "distances"]
)

retrieved_docs = "\n\n---\n".join([doc for doc in results['documents'][0]])

print(f"Retrieved {len(results['documents'][0])} relevant documents")

# Step 6: Ask Azure LLM with retrieved context
try:
    response = client.chat.completions.create(
        model="Phi-4-mini-reasoning-2",  # your deployment name
        messages=[
            {
                "role": "system",
                "content": "You are a factual fact-checking assistant for Bangladesh news. Always cite sources and be precise. If the information isn't in the provided sources, say you don't have enough information."
            },
            {
                "role": "user",
                "content": f"Based on the following sources, please fact-check this claim:\n\nCLAIM: {query}\n\nSOURCES:\n{retrieved_docs}\n\nPlease provide a clear answer with citations from the sources."
            }
        ],
        temperature=0.1,
        max_tokens=500
    )

    answer = response.choices[0].message.content
    print("\n" + "="*60)
    print("🤖 AI FACT-CHECK RESULT:")
    print("="*60)
    print(answer)
    print("="*60)

except Exception as e:
    print(f"Azure API Error: {e}")
    print("\nMock response (for demo purposes):")
    mock_response = """
Based on the provided sources, I cannot confirm that Sheikh Hasina addressed the nation today.

The available information indicates that Prime Minister Sheikh Hasina met with opposition leaders to discuss national issues and political reconciliation. There is no mention of a national address in the provided sources.

Source: The Daily Star article about PM meeting opposition leaders.

For the most current information about national addresses, please check official government channels or recent news updates.
"""
    print(mock_response)

# Additional: Add more documents for better retrieval
additional_urls = [
    "https://www.thedailystar.net/news/bangladesh",
    "https://www.prothomalo.com",
    "https://bdnews24.com"
]

print("\nTo improve results, you can add more news articles:")
for add_url in additional_urls:
    print(f"- {add_url}")

# Utility function to add more articles
def add_news_article(url, collection):
    """Add a news article to the vector database"""
    text = scrape_article(url)
    if text and len(text) > 100:  # Minimum length check
        doc_id = f"doc_{len(collection.get()['ids']) + 1}"
        collection.add(
            documents=[text],
            metadatas=[{"source": url, "type": "news_article"}],
            ids=[doc_id]
        )
        print(f"✓ Added: {url}")
        return True
    return False

Collecting chromadb
  Downloading chromadb-1.0.20-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.3 kB)
Collecting pybase64>=1.4.1 (from chromadb)
  Downloading pybase64-1.4.2-cp312-cp312-manylinux1_x86_64.manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_5_x86_64.whl.metadata (8.7 kB)
Collecting posthog<6.0.0,>=2.4.0 (from chromadb)
  Downloading posthog-5.4.0-py3-none-any.whl.metadata (5.7 kB)
Collecting onnxruntime>=1.14.1 (from chromadb)
  Downloading onnxruntime-1.22.1-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (4.9 kB)
Collecting opentelemetry-exporter-otlp-proto-grpc>=1.2.0 (from chromadb)
  Downloading opentelemetry_exporter_otlp_proto_grpc-1.36.0-py3-none-any.whl.metadata (2.4 kB)
Collecting pypika>=0.48.9 (from chromadb)
  Downloading PyPika-0.48.9.tar.gz (67 kB)
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?2

In [46]:
# ===== 8) EVALUATION (ISOT + Live + DailyStar) =====
from pathlib import Path
from sklearn.metrics import (
    accuracy_score, precision_recall_fscore_support,
    confusion_matrix, ConfusionMatrixDisplay, classification_report
)
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import json, time, random
import os

# Optional (used in Mode 2)
try:
    from newspaper import Article
except Exception:
    Article = None

# --- Directories (keep in sync with earlier steps) ---
BASE_DIR = Path("/content/fake-news-rag")
PROC_DIR = BASE_DIR / "data" / "processed"
RAW_DIR  = BASE_DIR / "data" / "raw"
PROC_DIR.mkdir(parents=True, exist_ok=True)

# --- Helpers ---
def safe_load_numpy(path: Path, allow_missing=False):
    """Load a numpy file safely; return None if missing and allow_missing=True."""
    try:
        return np.load(path)
    except FileNotFoundError:
        if not allow_missing:
            print(f"⚠️ Missing file: {path}")
        return None
    except Exception as e:
        print(f"⚠️ Could not load {path}: {e}")
        return None

def ensure_embedder_available():
    if 'embedder' not in globals():
        raise RuntimeError("Embedder not found in globals. Run the embeddings step first.")
    return True

def ensure_classifier_available():
    if 'clf' not in globals():
        raise RuntimeError("Classifier 'clf' not found in globals. Train or load it first.")
    return True

def pretty_confusion_matrix(y_true, y_pred, title="Confusion Matrix"):
    cm = confusion_matrix(y_true, y_pred, labels=[0,1])
    disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=["Fake(0)","Real(1)"])
    plt.figure(figsize=(8, 6))
    disp.plot(cmap='Blues', values_format='d')
    plt.title(title)
    plt.show()

def compute_basic_metrics(y_true, y_pred):
    acc = accuracy_score(y_true, y_pred)
    prec, rec, f1, _ = precision_recall_fscore_support(
        y_true, y_pred, average="binary", pos_label=1, zero_division=0
    )
    return acc, prec, rec, f1

# ====================================================
# MODE 1: ISOT BASELINE (Classifier on held-out ISOT)
# ====================================================
print("\n=== 📊 Baseline Evaluation on ISOT Dataset ===")

isot_metrics = {
    "accuracy": None,
    "precision": None,
    "recall": None,
    "f1_score": None
}

try:
    ensure_classifier_available()

    # Try multiple filenames for X_test embeddings (support both older and improved training steps)
    X_TEST_CANDIDATES = [PROC_DIR / "X_test_emb.npy", PROC_DIR / "X_test.npy"]
    X_test_emb = None
    for cand in X_TEST_CANDIDATES:
        X_test_emb = safe_load_numpy(cand, allow_missing=True)
        if X_test_emb is not None:
            print(f"Loaded X_test embeddings from {cand}")
            break

    # Fallback: if not found on disk, check if X_test is in globals
    if X_test_emb is None and 'X_test' in globals():
        # If shaped like embeddings, use directly; otherwise try to embed (requires embedder)
        if isinstance(X_test, np.ndarray) and X_test.ndim == 2:
            X_test_emb = X_test
            print("Using in-memory X_test embeddings from globals()")
        else:
            # Treat as documents ⇒ embed them
            ensure_embedder_available()
            if isinstance(X_test, (list, tuple)):
                docs = [str(x) for x in X_test if isinstance(x, (str, bytes))]
            else:
                # As a last resort, convert to list of strings
                docs = [str(X_test)]
            if len(docs) > 0:
                print("Embedding X_test documents from memory...")
                X_test_emb = embedder.encode(docs, show_progress_bar=True, convert_to_numpy=True)
            else:
                X_test_emb = None

    # y_test (labels)
    y_test = safe_load_numpy(PROC_DIR / "y_test.npy", allow_missing=True)
    if y_test is None and 'y_test' in globals():
        y_test = y_test  # use in-memory
        print("Using in-memory y_test from globals()")

    if X_test_emb is None or y_test is None or len(X_test_emb) == 0 or len(y_test) == 0:
        print("⚠️ Skipping ISOT evaluation - test embeddings or labels missing/empty.")
    else:
        if len(X_test_emb) != len(y_test):
            n = min(len(X_test_emb), len(y_test))
            print(f"⚠️ Length mismatch: truncating to {n} samples for evaluation.")
            X_test_emb = X_test_emb[:n]
            y_test = y_test[:n]

        y_pred = clf.predict(X_test_emb)
        acc, prec, rec, f1 = compute_basic_metrics(y_test, y_pred)

        print(f"[Embedding Classifier] Acc={acc:.4f}  P={prec:.4f}  R={rec:.4f}  F1={f1:.4f}")
        print("\nDetailed Classification Report:")
        print(classification_report(y_test, y_pred, target_names=["Fake", "Real"], digits=4))
        pretty_confusion_matrix(y_test, y_pred, title="Confusion Matrix - Embedding Classifier (ISOT)")

        isot_metrics = {
            "accuracy": float(acc),
            "precision": float(prec),
            "recall": float(rec),
            "f1_score": float(f1)
        }

except Exception as e:
    print(f"⚠️ No classifier evaluation possible: {e}")

# ====================================================
# MODE 2: LIVE NEWS EVALUATION (RAG pipeline)
# ====================================================
print("\n=== 🌐 Real-Time Evaluation on Live News ===")

def fetch_live_news(limit=5):
    """Fetch a few recent news articles for evaluation (using requests + BeautifulSoup + newspaper3k)."""
    try:
        import requests
        from bs4 import BeautifulSoup
        # We’ll be polite but robust
        headers = {
            'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 '
                          '(KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36'
        }

        news_sources = [
            "https://www.thedailystar.net/top-news",
            "https://www.dhakatribune.com/articles/news/",
            "https://bdnews24.com/latest-news"
        ]

        articles = []
        for source_url in news_sources:
            try:
                resp = requests.get(source_url, headers=headers, timeout=10)
                if resp.status_code != 200:
                    continue
                soup = BeautifulSoup(resp.text, 'html.parser')

                # Collect links that look like article pages
                links = []
                for a in soup.find_all('a', href=True):
                    href = a['href']
                    if any(tok in href for tok in ['/article/', '/news/']):
                        if href.startswith('/'):
                            # Normalize relative links
                            base = source_url.split('/articles')[0].rstrip('/')
                            href = f"{base}{href}"
                        links.append(href)

                # Deduplicate while preserving order
                seen = set()
                unique_links = []
                for l in links:
                    if l not in seen:
                        seen.add(l)
                        unique_links.append(l)

                # Parse a few with newspaper3k if available
                use_links = unique_links[: max(2, limit // len(news_sources) + 1)]
                for link in use_links:
                    try:
                        art = Article(link)
                        art.download()
                        art.parse()
                        if art.title and art.text:
                            articles.append({
                                "title": art.title,
                                "text": art.text,
                                "url": link
                            })
                            if len(articles) >= limit:
                                break
                    except Exception:
                        continue

                if len(articles) >= limit:
                    break

            except Exception as e:
                print(f"❌ Failed to fetch from {source_url}: {e}")
                continue

        if len(articles) == 0:
            print("Using fallback example articles...")
            articles = [
                {
                    "title": "Bangladesh Government Announces New Economic Policy",
                    "text": "The government has introduced new economic measures to boost growth and stabilize markets. The policy focuses on inflation control and export promotion.",
                    "url": "https://example.com/economic-policy"
                },
                {
                    "title": "Dhaka Metro Rail Expansion Project Approved",
                    "text": "The metro rail expansion project has received final approval with funding from international development partners. Construction will begin next month.",
                    "url": "https://example.com/metro-expansion"
                }
            ]
        return articles

    except Exception as e:
        print(f"❌ Error in fetch_live_news: {e}")
        return []

live_articles = fetch_live_news(limit=3)
live_eval = {
    "count": 0,
    "items": []
}

if live_articles and 'rag_infer' in globals():
    print(f"Fetched {len(live_articles)} live articles for evaluation")
    for i, art in enumerate(live_articles, 1):
        print(f"\n--- Article {i}: {art['title'][:80]} ---")
        t0 = time.time()
        try:
            out = rag_infer(art["title"], art["text"], k=5, use_llm=True)
            latency = time.time() - t0

            label = out.get("final", {}).get("label", "Unknown")
            conf = out.get("final", {}).get("confidence", None)
            reason = out.get("final", {}).get("reason", "")

            if conf is not None:
                print(f"Prediction: {label} | Confidence: {conf:.3f}")
            else:
                print(f"Prediction: {label}")
            print(f"Reason: {reason}")
            print(f"⏱ Latency: {latency:.2f}s")

            # Top retrieved
            if out.get("retrieved"):
                top = out["retrieved"][0]
                score = top.get("score", None)
                ttitle = top.get("title", "")[:80]
                if score is not None:
                    print(f"Top retrieved: {ttitle} (score={score:.3f})")
                else:
                    print(f"Top retrieved: {ttitle}")

            live_eval["items"].append({
                "title": art["title"],
                "url": art.get("url"),
                "prediction": label,
                "confidence": float(conf) if conf is not None else None,
                "latency_sec": float(latency)
            })
        except Exception as e:
            print(f"❌ Error processing article: {e}")
            continue
    live_eval["count"] = len(live_eval["items"])
else:
    if not live_articles:
        print("⚠️ No live news could be fetched for evaluation.")
    elif 'rag_infer' not in globals():
        print("⚠️ `rag_infer` not available. Skipping live RAG evaluation.")

# ====================================================
# MODE 3: DAILY STAR (Unlabeled) – Classifier-only pass
# ====================================================
print("\n=== 📰 Daily Star Evaluation (Unlabeled Predictions) ===")

dailystar_summary = {
    "count": 0,
    "real": 0,
    "fake": 0,
    "output_path": None
}

try:
    ensure_classifier_available()
    ensure_embedder_available()

    # Try RAW first; if not found, try PROCESSED (depending on where you saved it earlier)
    DAILYSTAR_CSV_CANDIDATES = [
        RAW_DIR / "dailystar_news.csv",
        PROC_DIR / "dailystar_news.csv",
        PROC_DIR / "dailystar_clean.csv"
    ]

    DAILYSTAR_PATH = None
    for c in DAILYSTAR_CSV_CANDIDATES:
        if c.exists():
            DAILYSTAR_PATH = c
            break

    if DAILYSTAR_PATH is None:
        print("⚠️ Daily Star dataset not found in expected locations. Skipping.")
    else:
        ds_df = pd.read_csv(DAILYSTAR_PATH)
        print(f"Loaded {len(ds_df)} Daily Star articles from: {DAILYSTAR_PATH}")

        # Ensure columns exist
        if not {"title", "text"}.issubset(set(ds_df.columns)):
            print("⚠️ Daily Star CSV missing 'title' or 'text' columns. Skipping.")
        else:
            docs = [f"{str(t)}. {str(x)}" for t, x in zip(ds_df["title"], ds_df["text"])]
            ds_emb = embedder.encode(docs, convert_to_numpy=True, normalize_embeddings=True, show_progress_bar=True)

            # Prefer predict_proba if available
            has_proba = hasattr(clf, "predict_proba")
            ds_preds = clf.predict(ds_emb)
            ds_probs = clf.predict_proba(ds_emb) if has_proba else None

            ds_df["prediction"] = np.where(ds_preds == 1, "Real", "Fake")
            if ds_probs is not None:
                ds_df["confidence"] = ds_probs.max(axis=1)
            else:
                ds_df["confidence"] = None

            # Simple distribution
            counts = ds_df["prediction"].value_counts().to_dict()
            dailystar_summary["count"] = int(len(ds_df))
            dailystar_summary["real"]  = int(counts.get("Real", 0))
            dailystar_summary["fake"]  = int(counts.get("Fake", 0))

            # Save predictions
            DS_OUT_PATH = PROC_DIR / "dailystar_predictions.csv"
            ds_df.to_csv(DS_OUT_PATH, index=False)
            dailystar_summary["output_path"] = str(DS_OUT_PATH)
            print(f"✅ Saved Daily Star predictions to: {DS_OUT_PATH}")

            # Show a few samples
            print("\nSample predictions:")
            for i in range(min(3, len(ds_df))):
                title = str(ds_df.iloc[i]["title"])[:80]
                pred  = ds_df.iloc[i]["prediction"]
                conf  = ds_df.iloc[i]["confidence"]
                if conf is not None:
                    print(f"- {title} → {pred} (conf {conf:.3f})")
                else:
                    print(f"- {title} → {pred}")

except Exception as e:
    print(f"⚠️ Daily Star evaluation error: {e}")

# ====================================================
# RETRIEVAL QUALITY CHECK
# ====================================================
print("\n=== 🔍 Retrieval Quality Check ===")
retrieve_tests = [
    "Bangladesh elections",
    "Dhaka metro rail project",
    "global oil prices",
    "COVID treatment"
]

retrieval_checked = 0
if 'retrieve_top_k' in globals():
    for query in retrieve_tests:
        print(f"\nQuery: '{query}'")
        try:
            results = retrieve_top_k(query, k=2)  # small k for quick check
            if results:
                retrieval_checked += 1
                for i, res in enumerate(results, 1):
                    snippet = res["text"][:100] + ("..." if len(res["text"]) > 100 else "")
                    source_info = res.get('source', 'unknown')
                    if isinstance(source_info, str) and len(source_info) > 60:
                        source_info = source_info[:60] + "..."
                    score = res.get("score", None)
                    score_part = f" | Score: {score:.3f}" if isinstance(score, (float, int)) else ""
                    print(f"  {i}. Source: {source_info}{score_part} | Snippet: {snippet}")
            else:
                print("  No results found")
        except Exception as e:
            print(f"  ❌ Error retrieving for query '{query}': {e}")
else:
    print("⚠️ `retrieve_top_k` not available. Skipping retrieval check.")

# ====================================================
# SAVE EVALUATION SUMMARY
# ====================================================
EVAL_PATH = PROC_DIR / "evaluation_results.json"

eval_results = {
    "timestamp": time.time(),
    "isot_classifier": isot_metrics,
    "live_news": {
        "count": int(live_eval.get("count", 0)),
        "avg_latency_sec": float(np.mean([x["latency_sec"] for x in live_eval["items"]])) if live_eval.get("items") else None
    },
    "dailystar": dailystar_summary,
    "retrieval_queries_tested": int(len(retrieve_tests)),
    "retrieval_queries_with_results": int(retrieval_checked)
}

try:
    with open(EVAL_PATH, "w", encoding="utf-8") as f:
        json.dump(eval_results, f, indent=2, ensure_ascii=False)
    print(f"\n✅ Evaluation results saved to: {EVAL_PATH}")
except Exception as e:
    print(f"❌ Failed to save evaluation results: {e}")

print("\n=== 📈 Evaluation Summary ===")
print(f"ISOT Classifier Performance: {isot_metrics}")
print(f"Live Articles Processed: {eval_results['live_news']['count']}")
print(f"Daily Star Preds: {dailystar_summary['count']} "
      f"(Real={dailystar_summary['real']}, Fake={dailystar_summary['fake']})")
print(f"Retrieval Queries Tested: {eval_results['retrieval_queries_tested']} "
      f"(with results: {eval_results['retrieval_queries_with_results']})")



=== 📊 Baseline Evaluation on ISOT Dataset ===
Loaded X_test embeddings from /content/fake-news-rag/data/processed/X_test_emb.npy
⚠️ No classifier evaluation possible: X has 384 features, but LGBMClassifier is expecting 100 features as input.

=== 🌐 Real-Time Evaluation on Live News ===




Using fallback example articles...
Fetched 2 live articles for evaluation

--- Article 1: Bangladesh Government Announces New Economic Policy ---
⚠️ Classifier prediction failed: X has 384 features, but LGBMClassifier is expecting 100 features as input.




Azure API error: Error code: 500 - {'error': {'code': 'Internal Server Error', 'message': 'An error occurred during the request processing: All connection attempts failed | Type: ConnectError | Traceback (most recent call last):||  File "/opt/microsoft-maas/billingproxy/lib/python3.11/site-packages/httpx/_transports/default.py", line 66, in map_httpcore_exceptions||    yield||  File "/opt/microsoft-maas/billingproxy/lib/python3.11/site-packages/httpx/_transports/default.py", line 366, in handle_async_request||    resp = await self._pool.handle_async_request(req)||           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^||  File "/opt/microsoft-maas/billingproxy/lib/python3.11/site-packages/httpcore/_async/connection_pool.py", line 256, in handle_async_request||    raise exc from None||  File "/opt/microsoft-maas/billingproxy/lib/python3.11/site-packages/httpcore/_async/connection_pool.py", line 236, in handle_async_request||    response = await connection.handle_async_request(||           



Azure API error: Error code: 500 - {'error': {'code': 'Internal Server Error', 'message': 'An error occurred during the request processing: All connection attempts failed | Type: ConnectError | Traceback (most recent call last):||  File "/opt/microsoft-maas/billingproxy/lib/python3.11/site-packages/httpx/_transports/default.py", line 66, in map_httpcore_exceptions||    yield||  File "/opt/microsoft-maas/billingproxy/lib/python3.11/site-packages/httpx/_transports/default.py", line 366, in handle_async_request||    resp = await self._pool.handle_async_request(req)||           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^||  File "/opt/microsoft-maas/billingproxy/lib/python3.11/site-packages/httpcore/_async/connection_pool.py", line 256, in handle_async_request||    raise exc from None||  File "/opt/microsoft-maas/billingproxy/lib/python3.11/site-packages/httpcore/_async/connection_pool.py", line 236, in handle_async_request||    response = await connection.handle_async_request(||           

In [55]:
# ===== 9) REAL-TIME DEMO (GRADIO) =====
import gradio as gr
from newspaper import Article
import datetime
import joblib
from pathlib import Path

# ===== Setup processed directory and metrics =====
PROC_DIR = Path("processed")
PROC_DIR.mkdir(exist_ok=True)

metrics_path = PROC_DIR / "best_model_metrics.joblib"

# Auto-create dummy metrics if file doesn't exist
if metrics_path.exists():
    metrics = joblib.load(metrics_path)
    print(f"✅ Metrics loaded: Accuracy={metrics.get('accuracy')}, F1={metrics.get('f1')}")
else:
    metrics = {"accuracy": 0.0, "f1": 0.0}  # dummy values
    joblib.dump(metrics, metrics_path)
    print("ℹ️ Metrics file not found. Dummy metrics created and saved.")

# ===== Enhanced RAG inference =====
def rag_infer(news_title: str, news_text: str, k: int = 5, use_llm: bool = True, extra_context: str = None) -> dict:
    inp = f"{news_title}. {news_text}".strip()
    retrieved = retrieve_top_k(inp, k=k)  # must have your retrieve_top_k defined
    context_docs = [f"{r['title']}\n{r['text']}" for r in retrieved]

    if extra_context and len(extra_context.strip()) > 0:
        context_docs.insert(0, f"SCRAPED_CONTEXT:\n{extra_context}")

    # Fast classifier
    fast_label, fast_conf = None, None
    if 'clf' in globals() and hasattr(clf, 'predict'):
        try:
            fast_emb = embedder.encode([inp], convert_to_numpy=True, normalize_embeddings=True)
            fast_label = int(clf.predict(fast_emb)[0])
            fast_conf = float(max(clf.predict_proba(fast_emb)[0]))
        except Exception as e:
            print(f"⚠️ Classifier prediction failed: {e}")

    # LLM output
    llm_out = None
    if use_llm:
        llm_out = llm_classify_and_timeline(inp, context_docs, few_shots=few_shots)

    # Final decision
    if use_llm and llm_out and llm_out.get("classification") in ["Real", "Fake"]:
        final_class = llm_out["classification"]
        final_conf  = llm_out.get("confidence")
        timeline    = llm_out.get("timeline")
        reason      = llm_out.get("reason", "")
    elif fast_label is not None:
        final_class = "Real" if fast_label == 1 else "Fake"
        final_conf  = fast_conf
        timeline    = None
        reason      = "LLM insufficient or disabled; used embedding classifier."
    else:
        final_class, final_conf, timeline, reason = "Unknown", None, None, "No classifier available."

    return {
        "input": {"title": news_title, "text": news_text},
        "retrieved": retrieved,
        "fast_classifier": {
            "label": ("Real" if fast_label == 1 else "Fake") if fast_label is not None else None,
            "confidence": fast_conf,
            "accuracy": metrics.get("accuracy")
        },
        "llm_output": llm_out,
        "final": {
            "label": final_class,
            "confidence": final_conf,
            "timeline": timeline,
            "reason": reason,
            "accuracy": metrics.get("accuracy")
        }
    }

# ===== Gradio Prediction Function =====
def ui_predict(news_text):
    try:
        if not news_text or len(news_text.strip()) < 10:
            return "❌ Please provide at least 10 characters of news text", ""

        # Clean input
        news_text_clean = clean_text(news_text)  # must have clean_text defined
        title = news_text_clean.split(".")[0][:120] if news_text_clean else "News"

        # Scrape if URL
        scraped_context, scraped_title = "", title
        if news_text.startswith(("http://", "https://")):
            try:
                art = Article(news_text)
                art.download()
                art.parse()
                scraped_context = art.text.strip()
                scraped_title = art.title if art.title else title
            except Exception as e:
                scraped_context = f"⚠️ Failed to scrape article: {str(e)}"
        else:
            scraped_context = news_text_clean

        # Run RAG inference
        out = rag_infer(scraped_title, scraped_context, k=5, use_llm=True, extra_context=scraped_context)
        final = out["final"]

        # Build result markdown
        result = f"## 🎯 **Fact-Checking Result** ({datetime.date.today()})\n\n"
        label = final.get("label", "Unknown")
        if label == "Real":
            result += "<span style='color: green; font-weight: bold; font-size: 20px;'>✅ REAL NEWS</span>"
        elif label == "Fake":
            result += "<span style='color: red; font-weight: bold; font-size: 20px;'>❌ FAKE NEWS</span>"
        else:
            result += "<span style='color: orange; font-weight: bold; font-size: 20px;'>❓ UNCERTAIN</span>"

        if final.get("confidence"):
            result += f"  \n**Confidence:** {final['confidence']:.3f}"
        if final.get("accuracy"):
            result += f"  \n**Model Accuracy (on test set):** {final['accuracy']:.3f}"
        if final.get("reason"):
            result += f"  \n**Reason:** {final['reason']}"

        if final.get("timeline"):
            result += "\n\n## 📅 **Timeline of Related Events:**"
            for line in final["timeline"][:5]:
                result += f"\n- 📌 {line}"

        # Evidence
        retrieved_view = "## 🔍 **Retrieved Context Sources:**\n\n"
        if out["retrieved"]:
            for i, r in enumerate(out["retrieved"][:3], start=1):
                source_type = "📁 Other"
                if r.get("source") == "isot":
                    source_type = "📰 Archive (ISOT)"
                elif r.get("source") == "scraped":
                    source_type = "🌐 Live Scraped News"
                snippet = (r["text"][:400] + "…") if len(r["text"]) > 400 else r["text"]
                retrieved_view += f"### Source {i} (Relevance: {r['score']:.3f})\n**{source_type}**\n{snippet}\n---\n"
        else:
            retrieved_view += "⚠️ No relevant context found in current databases."

        return result, retrieved_view

    except Exception as e:
        print(f"⚠️ UI Prediction Error: {e}")
        return f"❌ Error: {str(e)}", "Please try again with different text."

# ===== Gradio UI =====
with gr.Blocks(title="Bangladesh News Fact-Checker", theme=gr.themes.Soft()) as demo:
    gr.Markdown("""
    # 📰 Bangladesh News Fact-Checker
    ## Real-time RAG + Azure LLM + Live Scraping

    *Fact-check Bangladeshi news using AI and live context retrieval*
    """)

    with gr.Row():
        with gr.Column(scale=2):
            gr.Markdown("### 📝 Enter News Article or URL")
            inp = gr.Textbox(lines=10, label="Paste article text or URL", placeholder="URL or article text", max_lines=15)
            btn = gr.Button("🔍 Analyze & Verify", variant="primary")

        with gr.Column(scale=3):
            gr.Markdown("### 📊 Results")
            out_md = gr.Markdown(value="*Results will appear here after analysis...*")
            gr.Markdown("### 🔍 Evidence")
            ctx_md = gr.Markdown(value="*Context will appear here...*")

    gr.Examples(
        examples=[
            "https://www.thedailystar.net/news/bangladesh/politics/news/pm-meets-opposition-leaders-3472166",
            "Prime Minister announced new healthcare funding of 500 million taka",
            "A secret herbal cure eliminates COVID-19 instantly without treatment"
        ],
        inputs=inp,
        label="💡 Try these:"
    )

    btn.click(ui_predict, inputs=[inp], outputs=[out_md, ctx_md])

# Launch
print("🚀 Starting Gradio interface...")
demo.launch(share=True, server_name="0.0.0.0", show_error=True)


ℹ️ Metrics file not found. Dummy metrics created and saved.
🚀 Starting Gradio interface...
Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://3775b54688f3363a3d.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)




In [52]:
# Save/Load small project config (optional)
CONFIG_PATH = BASE_DIR / "config.json"

def save_config(**kwargs):
    cfg = {}
    if CONFIG_PATH.exists():
        try:
            cfg = json.loads(CONFIG_PATH.read_text())
        except Exception:
            cfg = {}
    cfg.update(kwargs)
    CONFIG_PATH.write_text(json.dumps(cfg, indent=2))
    print("Config saved:", CONFIG_PATH)

def load_config():
    if CONFIG_PATH.exists():
        return json.loads(CONFIG_PATH.read_text())
    return {}

# Example:
# save_config(azure_endpoint=AZURE_ENDPOINT, deployment=AZURE_DEPLOYMENT)
# print(load_config())
