In [1]:
import requests, pandas as pd, re, time, hashlib, math, json
from bs4 import BeautifulSoup
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm
from datetime import datetime, timezone

# -----------------------------
# 1. RSS FEEDS
# -----------------------------
# Every URL below is fetched once by crawl_feed() in the crawl step; the feed
# URL itself is stored in the "source" field of each collected item.
RSS_FEEDS = [
    "https://vnexpress.net/rss/tin-moi-nhat.rss",
    "https://vnexpress.net/rss/thoi-su.rss",
    "https://vnexpress.net/rss/the-gioi.rss",
    "https://vnexpress.net/rss/kinh-doanh.rss",
    "https://vnexpress.net/rss/giai-tri.rss",
    "https://vnexpress.net/rss/the-thao.rss",
    "https://vnexpress.net/rss/phap-luat.rss",
    "https://vnexpress.net/rss/giao-duc.rss",
    "https://vnexpress.net/rss/suc-khoe.rss",
    "https://vnexpress.net/rss/doi-song.rss",
    "https://vnexpress.net/rss/khoa-hoc.rss",
    "https://vnexpress.net/rss/so-hoa.rss",
    "https://vnexpress.net/rss/oto-xe-may.rss",
    "https://vnexpress.net/rss/y-kien.rss",
    "https://tuoitre.vn/rss/tin-moi.rss",
    "https://tuoitre.vn/rss/thoi-su.rss",
    "https://tuoitre.vn/rss/the-gioi.rss",
    "https://tuoitre.vn/rss/kinh-doanh.rss",
    "https://tuoitre.vn/rss/van-hoa.rss",
    "https://tuoitre.vn/rss/the-thao.rss",
    "https://tuoitre.vn/rss/phap-luat.rss",
    "https://tuoitre.vn/rss/giao-duc.rss",
    "https://tuoitre.vn/rss/suc-khoe.rss",
    "https://tuoitre.vn/rss/nhip-song.rss",
    "https://tuoitre.vn/rss/cong-nghe.rss",
    "https://tuoitre.vn/rss/xe.rss",
    "https://thanhnien.vn/rss/home.rss",
    "https://thanhnien.vn/rss/thoi-su.rss",
    "https://thanhnien.vn/rss/the-gioi.rss",
    "https://thanhnien.vn/rss/tai-chinh-kinh-doanh.rss",
    "https://thanhnien.vn/rss/doi-song.rss",
    "https://thanhnien.vn/rss/van-hoa.rss",
    "https://thanhnien.vn/rss/the-thao.rss",
    "https://thanhnien.vn/rss/giao-duc.rss",
    "https://thanhnien.vn/rss/cong-nghe.rss",
    "https://thanhnien.vn/rss/xe.rss",
    "https://dantri.com.vn/rss/home.rss",
    "https://dantri.com.vn/rss/xa-hoi.rss",
    "https://dantri.com.vn/rss/the-gioi.rss",
    "https://dantri.com.vn/rss/kinh-doanh.rss",
    "https://dantri.com.vn/rss/the-thao.rss",
    "https://dantri.com.vn/rss/giao-duc-khuyen-hoc.rss",
    "https://dantri.com.vn/rss/van-hoa.rss",
    "https://dantri.com.vn/rss/phap-luat.rss",
    "https://dantri.com.vn/rss/suc-khoe.rss",
    "https://dantri.com.vn/rss/oto-xe-may.rss",
    "https://vietnamnet.vn/rss/tin-moi-nong.rss",
    "https://vietnamnet.vn/rss/thoi-su.rss",
    "https://vietnamnet.vn/rss/the-gioi.rss",
    "https://vietnamnet.vn/rss/kinh-doanh.rss",
    "https://vietnamnet.vn/rss/giai-tri.rss",
    "https://vietnamnet.vn/rss/the-thao.rss",
    "https://vietnamnet.vn/rss/giao-duc.rss",
    "https://vietnamnet.vn/rss/suc-khoe.rss",
    "https://vietnamnet.vn/rss/cong-nghe.rss",
    "https://zingnews.vn/rss/tin-moi.rss",
    "https://zingnews.vn/rss/the-gioi.rss",
    "https://zingnews.vn/rss/thoi-su.rss",
    "https://zingnews.vn/rss/kinh-doanh-tai-chinh.rss",
    "https://zingnews.vn/rss/the-thao.rss",
    "https://zingnews.vn/rss/giai-tri.rss",
    "https://zingnews.vn/rss/giao-duc.rss",
    "https://zingnews.vn/rss/cong-nghe.rss",
]

# Request headers sent with every feed download (see requests.get in crawl_feed).
HEADERS = {"User-Agent": "Mozilla/5.0 (compatible; FakeNewsVN/1.0)"}

# -----------------------------
# 2. Helper functions
# -----------------------------
def clean_html(s: str) -> str:
    """Return the plain-text content of an HTML/RSS fragment.

    Tags are removed, character references are decoded, and every run of
    whitespace is collapsed to a single space.

    Args:
        s: Raw fragment, e.g. an RSS ``<title>`` or ``<description>`` value.

    Returns:
        The cleaned text, or ``""`` when ``s`` is not a string.
    """
    if not isinstance(s, str):
        return ""

    # Imported locally because the file's import header does not provide ``html``.
    import html

    # Remove tags first so that an escaped "&lt;b&gt;" survives as literal text
    # instead of being decoded into a tag and then stripped.
    s = re.sub(r"<[^>]+>", " ", s)
    # Decode entities (named, numeric, any case) rather than blanking them out:
    # "R&amp;D" must become "R&D", not "R D", and "&#39;" must become "'".
    s = html.unescape(s)
    # ``\s`` also matches the no-break space produced by "&nbsp;".
    return re.sub(r"\s+", " ", s).strip()

def crawl_feed(url, limit=300):
    """Download one RSS feed and return its items as a list of dicts.

    Args:
        url: Address of the RSS feed.
        limit: Maximum number of ``<item>`` elements to keep.

    Returns:
        A list of dicts with the keys ``title``, ``desc``, ``link``,
        ``pubDate`` and ``source`` (the feed URL).  Any failure is reported on
        stdout and yields an empty list, so one broken feed cannot abort the
        whole crawl.
    """

    def child_text(item, tag):
        # Text of the first <tag> below ``item``; "" when the tag is absent.
        node = item.find(tag)
        return node.text if node is not None else ""

    try:
        r = requests.get(url, headers=HEADERS, timeout=20)
        # Without this check an HTTP error page (404, 503, ...) would be parsed
        # as a feed with zero items and reported as a success.
        r.raise_for_status()
        r.encoding = "utf-8"
        soup = BeautifulSoup(r.text, "xml")
        out = [
            {
                "title": clean_html(child_text(it, "title")),
                "desc": clean_html(child_text(it, "description")),
                "link": child_text(it, "link").strip(),
                "pubDate": child_text(it, "pubDate").strip(),
                "source": url,
            }
            for it in soup.find_all("item")[:limit]
        ]
        print(f"✅ {url} -> {len(out)}")
        return out
    except Exception as e:
        # Deliberately broad: this is the per-feed boundary of a best-effort crawl.
        print(f"❌ {url} error: {e}")
        return []

# -----------------------------
# 3. Crawl multi-thread
# -----------------------------
# Columns produced by crawl_feed(); naming them explicitly keeps the frame
# well-formed (and the code below working) even when every feed failed and
# ``rows`` is empty.
FEED_COLUMNS = ["title", "desc", "link", "pubDate", "source"]

rows = []
with ThreadPoolExecutor(max_workers=12) as ex:
    futs = [ex.submit(crawl_feed, u, 300) for u in RSS_FEEDS]
    for f in as_completed(futs):
        rows.extend(f.result())

df = pd.DataFrame(rows, columns=FEED_COLUMNS)

# merge text
df["text"] = (df["title"].fillna("") + " " + df["desc"].fillna("")).str.strip()

# deduplicate: the same article (same title and link) is usually listed in
# several feeds of one site
df = df.drop_duplicates(subset=["title", "link"])

# convert pubDate -> datetime; each value is parsed on its own so that sites
# using different date layouts do not invalidate one another
parsed = df["pubDate"].apply(lambda s: pd.to_datetime(s, errors="coerce", utc=True))
df["pubDate"] = pd.to_datetime(parsed, errors="coerce", utc=True)

print("Total after dedup:", len(df))
df["label"] = "REAL"
df.to_csv("real.csv", index=False, encoding="utf-8-sig")
print("📁 Saved -> real.csv (", len(df), "rows )")

✅ https://vnexpress.net/rss/thoi-su.rss -> 60
✅ https://vnexpress.net/rss/phap-luat.rss -> 60
✅ https://vnexpress.net/rss/the-gioi.rss -> 60
✅ https://vnexpress.net/rss/tin-moi-nhat.rss -> 53
✅ https://vnexpress.net/rss/suc-khoe.rss -> 60
✅ https://vnexpress.net/rss/the-thao.rss -> 60
✅ https://vnexpress.net/rss/giao-duc.rss -> 60
✅ https://vnexpress.net/rss/doi-song.rss -> 60
✅ https://vnexpress.net/rss/kinh-doanh.rss -> 60
✅ https://vnexpress.net/rss/so-hoa.rss -> 60
✅ https://vnexpress.net/rss/giai-tri.rss -> 60
✅ https://vnexpress.net/rss/khoa-hoc.rss -> 60
✅ https://vnexpress.net/rss/oto-xe-may.rss -> 60
✅ https://vnexpress.net/rss/y-kien.rss -> 60
✅ https://tuoitre.vn/rss/thoi-su.rss -> 50
✅ https://tuoitre.vn/rss/tin-moi.rss -> 50
✅ https://tuoitre.vn/rss/the-gioi.rss -> 50
✅ https://tuoitre.vn/rss/kinh-doanh.rss -> 50
✅ https://tuoitre.vn/rss/van-hoa.rss -> 50
✅ https://tuoitre.vn/rss/giao-duc.rss -> 50
✅ https://tuoitre.vn/rss/suc-khoe.rss -> 50
✅ https://tuoitre.vn/rss/the-th

In [None]:
import pandas as pd, numpy as np, re, math, time, tldextract, json
from sentence_transformers import SentenceTransformer, util, CrossEncoder
from rank_bm25 import BM25Okapi
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch, gradio as gr
from datetime import datetime, timezone
from typing import List, Dict, Any

# ----------------------------
# 1) Load & chuẩn hoá dữ liệu
# ----------------------------
DF_PATH = "real.csv"

def load_dataframe(path: str) -> pd.DataFrame:
    """Read the corpus CSV and add the derived columns the pipeline needs.

    Args:
        path: Anything ``pandas.read_csv`` accepts (normally a file path).

    Returns:
        The loaded frame with these columns guaranteed to exist:

        * ``text``    - string; built from ``title`` + ``desc`` when the CSV
          has no ``text`` column.
        * ``pubDate`` - the raw publication date as a string (``""`` if absent).
        * ``dt``      - ``pubDate`` parsed as a UTC timestamp; unparseable or
          missing dates become 1970-01-01 UTC.
        * ``domain``  - registered domain (e.g. ``vnexpress.net``) of ``link``,
          falling back to ``source``; ``""`` when neither holds a URL.

    Optional input columns (``title``, ``desc``, ``pubDate``, ``link``,
    ``source``) may be missing or contain empty cells without raising.
    """
    df = pd.read_csv(path)

    def str_column(name: str) -> pd.Series:
        # Column as strings with NaN -> ""; an absent column yields all "".
        # (``df.get(name, "")`` would return a plain str and break ``.fillna``.)
        if name in df.columns:
            return df[name].fillna("").astype(str)
        return pd.Series("", index=df.index, dtype="object")

    if "text" not in df.columns:
        df["title"] = str_column("title")
        df["desc"] = str_column("desc")
        df["text"] = (df["title"] + " " + df["desc"]).str.strip()

    # Parse dates one value at a time so that rows with different date layouts
    # do not affect each other.
    def parse_date(x):
        try:
            return pd.to_datetime(x, utc=True, errors="coerce")
        except (ValueError, TypeError, OverflowError):
            return pd.NaT

    df["pubDate"] = str_column("pubDate")
    # The outer to_datetime pins the dtype to tz-aware UTC even when every
    # value is NaT, so the fill value below is always compatible.
    parsed = pd.to_datetime(df["pubDate"].apply(parse_date), utc=True, errors="coerce")
    df["dt"] = parsed.fillna(pd.Timestamp("1970-01-01", tz="UTC"))

    # Domain: prefer the article link, fall back to the feed URL.  Working on
    # NaN-free strings avoids treating a missing link as the URL "nan".
    link = str_column("link").str.strip()
    source = str_column("source").str.strip()
    url = link.where(link != "", source)

    def domain_of(u: str) -> str:
        if not u:
            return ""
        ext = tldextract.extract(u)
        # domain.suffix (e.g. vnexpress.net)
        return ".".join(p for p in (ext.domain, ext.suffix) if p)

    df["domain"] = url.map(domain_of)

    # Normalise text
    df["text"] = df["text"].fillna("").astype(str)
    return df

# ---------------------------------------
# 2) Scorer: per-domain trust + recency
# ---------------------------------------
# Multiplier applied to evidence from each known outlet, keyed by registered
# domain.  Domains not listed here are weighted 1.0 by trust_weight().
TRUST = {
    "vnexpress.net": 1.2,
    "tuoitre.vn": 1.2,
    "thanhnien.vn": 1.15,
    "dantri.com.vn": 1.15,
    "vietnamnet.vn": 1.1,
    "zingnews.vn": 1.1,
}


def trust_weight(domain: str) -> float:
    """Return the trust multiplier for ``domain`` (1.0 when it is not in TRUST)."""
    try:
        return TRUST[domain]
    except KeyError:
        return 1.0

def recency_weight(dt: pd.Timestamp, half_life_days=21.0) -> float:
    """Return an exponential freshness weight in the range 0..1.

    The weight is 1.0 for a document published today and halves every
    ``half_life_days`` days (0.5 at one half-life, 0.25 at two, ...).  The
    previous formula ``exp(-age / half_life_days)`` dropped to about 0.37 at
    that age, which contradicted the parameter's name.

    Args:
        dt: Publication time.  Tz-naive values are interpreted as UTC; a
            missing value (``NaT``/``None``) is treated as 3650 days old.
        half_life_days: Age in days at which the weight equals 0.5.

    Returns:
        The weight as a float.  Age is counted in whole days and dates in the
        future are clamped to age 0.
    """
    if pd.isna(dt):
        age_days = 3650
    else:
        ts = pd.Timestamp(dt)
        if ts.tzinfo is None:
            # Aware and naive datetimes cannot be subtracted from each other.
            ts = ts.tz_localize("UTC")
        age_days = max(0, (datetime.now(timezone.utc) - ts).days)
    return 0.5 ** (age_days / half_life_days)

# ------------------------------------------------
# 3) Retriever + Models (BM25, Dense, Cross, NLI)
# ------------------------------------------------
class Pipeline:
    """Evidence retrieval plus NLI-based verdict over a fixed document corpus.

    Construction indexes ``df["text"]`` with BM25 and dense sentence
    embeddings and loads an optional cross-encoder re-ranker and an NLI
    classifier.  :meth:`verify_claim` retrieves candidate documents for a
    claim, ranks them, runs NLI on the best ones and derives a
    REAL / FAKE / UNSURE label.

    The frame is expected to provide the columns ``text``, ``dt`` and
    ``domain`` (``source``, ``pubDate`` and ``link`` are copied into the
    evidence when present).
    """

    def __init__(self, df: pd.DataFrame):
        """Build all indexes and load all models.

        Raises:
            ValueError: if ``df`` has no rows (nothing could ever be retrieved).
        """
        self.df = df.reset_index(drop=True)
        self.corpus: List[str] = self.df["text"].tolist()
        if not self.corpus:
            raise ValueError("Pipeline needs at least one document; got an empty corpus.")

        # BM25 - documents and claims go through the same tokeniser
        self.bm25 = BM25Okapi([self._tokenize(doc) for doc in self.corpus])

        # Dense embedder
        self.embedder = SentenceTransformer("sentence-transformers/paraphrase-multilingual-mpnet-base-v2")
        self.corpus_emb = self.embedder.encode(self.corpus, convert_to_tensor=True, show_progress_bar=True)

        # Cross-encoder (optional): the pipeline still works without it, but
        # say so instead of failing silently.
        try:
            self.reranker = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2")
        except Exception as exc:
            print(f"Cross-encoder unavailable, re-ranking disabled: {exc}")
            self.reranker = None

        # NLI model
        self.NLI_MODEL = "MoritzLaurer/mDeBERTa-v3-base-mnli-xnli"
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.tok = AutoTokenizer.from_pretrained(self.NLI_MODEL)
        self.nli_model = AutoModelForSequenceClassification.from_pretrained(self.NLI_MODEL).to(self.device)
        # NOTE(review): hard-coded logit order; assumed to match the
        # checkpoint's own config.id2label - confirm if NLI_MODEL is changed.
        self.id2label = {0: "entailment", 1: "neutral", 2: "contradiction"}

    @staticmethod
    def _tokenize(text: str) -> List[str]:
        """Lower-case, whitespace-split tokens used for BM25 on both sides."""
        # Case folding lets "Hà Nội" in a claim match "hà nội" in a document.
        return str(text).lower().split()

    @staticmethod
    def _minmax(values) -> np.ndarray:
        """Scale ``values`` to 0..1; all-equal (or empty) input maps to zeros."""
        arr = np.asarray(values, dtype="float32")
        if arr.size == 0:
            return arr
        mn, mx = float(arr.min()), float(arr.max())
        if mx - mn < 1e-9:
            return np.zeros_like(arr)
        return (arr - mn) / (mx - mn)

    @staticmethod
    def _decide(evidences, entail_sum, contra_sum, need_agree_sources, need_disagree_sources):
        """Turn per-evidence NLI results into ``(label, confidence)``.

        Rules, in priority order:

        1. at least ``need_agree_sources`` distinct trusted domains
           (trust >= 1.1) with entailment >= 0.55 -> REAL, 0.9
        2. at least ``need_disagree_sources`` distinct domains with
           contradiction >= 0.7 -> FAKE, 0.9
        3. otherwise the larger of ``entail_sum`` / ``contra_sum`` wins with
           its share of the total as confidence; a tie gives UNSURE, 0.5.

        Domains are counted, not evidence rows: several articles from one
        outlet are a single source and must not satisfy a multi-source
        requirement on their own.
        """
        agreeing = {
            ev.get("domain", "")
            for ev in evidences
            if ev.get("entail", 0) >= 0.55 and ev.get("trust_w", 1.0) >= 1.1
        }
        disagreeing = {
            ev.get("domain", "")
            for ev in evidences
            if ev.get("contra", 0) >= 0.7
        }

        if len(agreeing) >= need_agree_sources:
            return "REAL", 0.9
        if len(disagreeing) >= need_disagree_sources:
            return "FAKE", 0.9
        total = entail_sum + contra_sum + 1e-6
        if entail_sum > contra_sum:
            return "REAL", entail_sum / total
        if contra_sum > entail_sum:
            return "FAKE", contra_sum / total
        return "UNSURE", 0.5

    @torch.no_grad()
    def nli_scores(self, premise: str, hypothesis: str) -> Dict[str, float]:
        """Return the NLI probabilities of ``hypothesis`` given ``premise``.

        The result has the keys ``entailment``, ``neutral`` and
        ``contradiction``.  Inputs are truncated to 512 tokens.
        """
        inputs = self.tok(premise, hypothesis, return_tensors="pt", truncation=True, max_length=512).to(self.device)
        logits = self.nli_model(**inputs).logits
        probs = torch.softmax(logits, dim=-1).detach().cpu().numpy()[0]
        return {label: float(probs[idx]) for idx, label in self.id2label.items()}

    def verify_claim(
        self,
        claim: str,
        k_bm25=25,
        k_dense=25,
        top_m=12,
        need_agree_sources=2,
        need_disagree_sources=2,
    ) -> Dict[str, Any]:
        """Retrieve evidence for ``claim`` and label it REAL, FAKE or UNSURE.

        Args:
            claim: Statement to check.
            k_bm25: Number of top BM25 hits to keep as candidates.
            k_dense: Number of top dense-similarity hits to keep as candidates.
            top_m: Number of best-ranked candidates passed to the NLI model.
            need_agree_sources: Distinct trusted domains that must entail the
                claim for a high-confidence REAL verdict.
            need_disagree_sources: Distinct domains that must contradict the
                claim for a high-confidence FAKE verdict.

        Returns:
            A dict with ``claim``, ``label``, ``confidence`` (rounded to three
            decimals), ``evidences`` (one dict per NLI-scored document) and
            ``stats`` (``agree_sources``, ``disagree_sources``, ``entail_sum``,
            ``contra_sum``).
        """
        df = self.df
        corpus = self.corpus

        # BM25
        bm25_scores = self.bm25.get_scores(self._tokenize(claim))
        top_bm = np.argsort(bm25_scores)[::-1][:k_bm25]

        # Dense
        q_emb = self.embedder.encode(claim, convert_to_tensor=True)
        dense_scores = util.cos_sim(q_emb, self.corpus_emb)[0].detach().cpu().numpy()
        top_de = np.argsort(dense_scores)[::-1][:k_dense]

        # Union of both candidate lists (sorted for a reproducible order)
        cand_idxs = sorted(set(top_bm.tolist()) | set(top_de.tolist()))

        # Cross-encoder rerank (optional); scores stay 0.0 when unavailable
        ce_map = {i: 0.0 for i in cand_idxs}
        if self.reranker is not None and cand_idxs:
            pairs = [(claim, corpus[i]) for i in cand_idxs]
            try:
                ce_scores = self.reranker.predict(pairs)  # higher is better
            except Exception as exc:
                # Best effort: rank without the cross-encoder, but report it.
                print(f"Cross-encoder scoring failed, continuing without it: {exc}")
            else:
                ce_map = {i: float(s) for i, s in zip(cand_idxs, ce_scores)}

        bm_norm = self._minmax([bm25_scores[i] for i in cand_idxs])
        de_norm = self._minmax([dense_scores[i] for i in cand_idxs])
        ce_norm = self._minmax([ce_map[i] for i in cand_idxs])

        # Recency + trust, computed once per candidate and reused for the
        # evidence records below
        rec_w = {i: float(recency_weight(df.iloc[i]["dt"])) for i in cand_idxs}
        trust_w = {i: float(trust_weight(df.iloc[i]["domain"])) for i in cand_idxs}
        recs = np.array([rec_w[i] for i in cand_idxs], dtype="float32")
        trusts = np.array([trust_w[i] for i in cand_idxs], dtype="float32")

        final_scores = 0.35*bm_norm + 0.35*de_norm + 0.20*ce_norm + 0.10*(recs*trusts)
        order = np.argsort(final_scores)[::-1][:top_m]
        ranked = [cand_idxs[idx] for idx in order]

        # NLI + decision
        evidences = []
        entail_by_domain, contra_by_domain = {}, {}
        entail_sum, contra_sum = 0.0, 0.0

        for i in ranked:
            row = df.iloc[i]
            ev_text = str(row["text"])
            nli = self.nli_scores(ev_text, claim)
            d = row["domain"]
            # Each document votes for whichever of entailment/contradiction is
            # more probable; per domain only the strongest vote is remembered.
            if nli["entailment"] > nli["contradiction"]:
                entail_by_domain[d] = max(entail_by_domain.get(d, 0.0), nli["entailment"])
                entail_sum += nli["entailment"]
            else:
                contra_by_domain[d] = max(contra_by_domain.get(d, 0.0), nli["contradiction"])
                contra_sum += nli["contradiction"]

            evidences.append({
                "id": f"doc_{i}",
                "evidence": ev_text[:400] + ("..." if len(ev_text) > 400 else ""),
                "bm25": float(bm25_scores[i]),
                "dense": float(dense_scores[i]),
                "ce": float(ce_map.get(i, 0.0)),
                "recency_w": rec_w[i],
                "trust_w": trust_w[i],
                "entail": round(nli["entailment"], 4),
                "neutral": round(nli["neutral"], 4),
                "contra": round(nli["contradiction"], 4),
                "domain": row.get("domain",""),
                "source": row.get("source",""),
                "pubDate": str(row.get("pubDate","")),
                "link": row.get("link",""),
            })

        # Reported statistics: domains whose strongest vote reaches 0.6
        agree_sources = sum(1 for _v in entail_by_domain.values() if _v >= 0.6)
        disagree_sources = sum(1 for _v in contra_by_domain.values() if _v >= 0.6)

        label, conf = self._decide(
            evidences, entail_sum, contra_sum, need_agree_sources, need_disagree_sources
        )

        return {
            "claim": claim,
            "label": label,
            "confidence": round(float(conf), 3),
            "evidences": evidences,
            "stats": {
                "agree_sources": int(agree_sources),
                "disagree_sources": int(disagree_sources),
                "entail_sum": round(float(entail_sum),3),
                "contra_sum": round(float(contra_sum),3),
            }
        }

# --------------------------
# 4) Khởi tạo pipeline lazy
# --------------------------
# Module-wide pipeline instance; stays None until init_pipeline() has run.
PIPE = None


def init_pipeline(csv_path: str):
    """Load ``csv_path``, build a Pipeline from it and publish it as ``PIPE``.

    Returns a human-readable status line naming the number of loaded
    documents and the file they came from.
    """
    global PIPE
    frame = load_dataframe(csv_path)
    PIPE = Pipeline(frame)
    return f"Loaded {len(frame)} documents from {csv_path}. Corpus ready."

# --------------------------
# 5) Gradio UI callbacks
# --------------------------
def do_verify(claim, k_bm25, k_dense, top_m, need_agree, need_disagree):
    """Gradio callback: verify ``claim`` and shape the result for the UI.

    Returns a 3-tuple ``(markdown headline, stats dict, evidence DataFrame)``.
    When the claim is blank or the pipeline has not been initialised, the
    headline carries the message and the other two values are empty.
    """
    text = claim.strip() if claim else ""
    if not text:
        return "Nhập nội dung cần kiểm chứng.", {}, pd.DataFrame()
    if PIPE is None:
        return "Pipeline chưa sẵn sàng.", {}, pd.DataFrame()

    # Slider values arrive as floats; the pipeline expects integers.
    out = PIPE.verify_claim(
        claim=text,
        k_bm25=int(k_bm25),
        k_dense=int(k_dense),
        top_m=int(top_m),
        need_agree_sources=int(need_agree),
        need_disagree_sources=int(need_disagree),
    )

    # Verdict line
    label, confidence = out["label"], out["confidence"]
    headline = f"**Kết luận:** {label} — **độ tin cậy:** {confidence}"

    # Statistics for the JSON widget
    stat_keys = ("agree_sources", "disagree_sources", "entail_sum", "contra_sum")
    stats = {key: out["stats"][key] for key in stat_keys}

    # Evidence table, restricted to (and ordered by) the displayed columns
    shown_columns = [
        "id", "domain", "pubDate", "entail", "contra", "recency_w", "trust_w",
        "bm25", "dense", "ce", "source", "link", "evidence",
    ]
    ev = pd.DataFrame(out["evidences"], columns=shown_columns)
    return headline, stats, ev

def reload_csv(file):
    """Gradio callback: rebuild the pipeline from an uploaded CSV.

    Args:
        file: Value of the file-upload component, or ``None`` when nothing was
            uploaded (the default ``DF_PATH`` is loaded in that case).

    Returns:
        A ``gr.update`` carrying the status message for the status textbox.
    """
    if file is None:
        path = DF_PATH
    else:
        # Depending on the Gradio version/config the upload arrives either as
        # a tempfile-like object exposing ``.name`` or directly as a path
        # string; accept both instead of assuming ``.name`` exists.
        path = getattr(file, "name", file)
    msg = init_pipeline(path)
    return gr.update(value=msg)

# --------------------------
# 6) Build Gradio app
# --------------------------
def _init_on_load():
    """Build the pipeline on the first page load only; return a status line.

    The ``load`` event fires each time the page is opened in a browser, so an
    unconditional ``init_pipeline`` would reload every model and re-embed the
    whole corpus on every refresh.
    """
    if PIPE is None:
        return init_pipeline(DF_PATH)
    return f"Corpus ready ({len(PIPE.corpus)} documents)."


with gr.Blocks(title="Fake News Verifier") as demo:
    gr.Markdown("## 🕵️‍♂️ AI Agent kiểm chứng tin – Verifier Demo")

    with gr.Row():
        claim = gr.Textbox(label="Nhập phát biểu / tin cần kiểm chứng", lines=3, placeholder="Ví dụ: 'Bộ GD-ĐT công bố lịch thi mới...'")
    # Advanced retrieval / decision settings
    with gr.Accordion("Thiết lập nâng cao", open=False):
        with gr.Row():
            k_bm25 = gr.Slider(5, 100, value=25, step=1, label="k_bm25")
            k_dense = gr.Slider(5, 100, value=25, step=1, label="k_dense")
            top_m = gr.Slider(5, 30, value=12, step=1, label="top_m (evidence hiển thị)")
        with gr.Row():
            need_agree = gr.Slider(1, 5, value=2, step=1, label="Nguồn đồng thuận tối thiểu")
            need_disagree = gr.Slider(1, 5, value=2, step=1, label="Nguồn phản bác tối thiểu")

    with gr.Row():
        run_btn = gr.Button("🔎 Verify", variant="primary")

    # Outputs of do_verify: headline, statistics, evidence table
    result_md = gr.Markdown()
    stats_json = gr.JSON(label="Thống kê")
    ev_df = gr.Dataframe(label="Bằng chứng", wrap=True)

    gr.Markdown("---")
    gr.Markdown("### Dữ liệu cơ sở")
    with gr.Row():
        csv_file = gr.File(label="Tải CSV mới (tuỳ chọn). Cột cần có: title/desc hoặc text, link/source, pubDate", file_types=[".csv"])
        load_btn = gr.Button("📥 Nạp dữ liệu")

    status = gr.Textbox(label="Trạng thái", interactive=False)

    # events
    run_btn.click(do_verify, [claim, k_bm25, k_dense, top_m, need_agree, need_disagree], [result_md, stats_json, ev_df])
    load_btn.click(reload_csv, [csv_file], [status])

    # init on first page load (later loads reuse the existing pipeline)
    demo.load(_init_on_load, inputs=None, outputs=status)

if __name__ == "__main__":
    demo.launch()


  from .autonotebook import tqdm as notebook_tqdm


* Running on local URL:  http://127.0.0.1:7860
* To create a public link, set `share=True` in `launch()`.


Batches: 100%|██████████| 158/158 [00:11<00:00, 14.04it/s]
