In [1]:
import requests, pandas as pd, re, time, hashlib, math, json
from bs4 import BeautifulSoup
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm
from datetime import datetime, timezone

# -----------------------------
# 1. RSS FEEDS
# -----------------------------
RSS_FEEDS = [
    "https://vnexpress.net/rss/tin-moi-nhat.rss",
    "https://vnexpress.net/rss/thoi-su.rss",
    "https://vnexpress.net/rss/the-gioi.rss",
    "https://vnexpress.net/rss/kinh-doanh.rss",
    "https://vnexpress.net/rss/giai-tri.rss",
    "https://vnexpress.net/rss/the-thao.rss",
    "https://vnexpress.net/rss/phap-luat.rss",
    "https://vnexpress.net/rss/giao-duc.rss",
    "https://vnexpress.net/rss/suc-khoe.rss",
    "https://vnexpress.net/rss/doi-song.rss",
    "https://vnexpress.net/rss/khoa-hoc.rss",
    "https://vnexpress.net/rss/so-hoa.rss",
    "https://vnexpress.net/rss/oto-xe-may.rss",
    "https://vnexpress.net/rss/y-kien.rss",
    "https://tuoitre.vn/rss/tin-moi.rss",
    "https://tuoitre.vn/rss/thoi-su.rss",
    "https://tuoitre.vn/rss/the-gioi.rss",
    "https://tuoitre.vn/rss/kinh-doanh.rss",
    "https://tuoitre.vn/rss/van-hoa.rss",
    "https://tuoitre.vn/rss/the-thao.rss",
    "https://tuoitre.vn/rss/phap-luat.rss",
    "https://tuoitre.vn/rss/giao-duc.rss",
    "https://tuoitre.vn/rss/suc-khoe.rss",
    "https://tuoitre.vn/rss/nhip-song.rss",
    "https://tuoitre.vn/rss/cong-nghe.rss",
    "https://tuoitre.vn/rss/xe.rss",
    "https://thanhnien.vn/rss/home.rss",
    "https://thanhnien.vn/rss/thoi-su.rss",
    "https://thanhnien.vn/rss/the-gioi.rss",
    "https://thanhnien.vn/rss/tai-chinh-kinh-doanh.rss",
    "https://thanhnien.vn/rss/doi-song.rss",
    "https://thanhnien.vn/rss/van-hoa.rss",
    "https://thanhnien.vn/rss/the-thao.rss",
    "https://thanhnien.vn/rss/giao-duc.rss",
    "https://thanhnien.vn/rss/cong-nghe.rss",
    "https://thanhnien.vn/rss/xe.rss",
    "https://dantri.com.vn/rss/home.rss",
    "https://dantri.com.vn/rss/xa-hoi.rss",
    "https://dantri.com.vn/rss/the-gioi.rss",
    "https://dantri.com.vn/rss/kinh-doanh.rss",
    "https://dantri.com.vn/rss/the-thao.rss",
    "https://dantri.com.vn/rss/giao-duc-khuyen-hoc.rss",
    "https://dantri.com.vn/rss/van-hoa.rss",
    "https://dantri.com.vn/rss/phap-luat.rss",
    "https://dantri.com.vn/rss/suc-khoe.rss",
    "https://dantri.com.vn/rss/oto-xe-may.rss",
    "https://vietnamnet.vn/rss/tin-moi-nong.rss",
    "https://vietnamnet.vn/rss/thoi-su.rss",
    "https://vietnamnet.vn/rss/the-gioi.rss",
    "https://vietnamnet.vn/rss/kinh-doanh.rss",
    "https://vietnamnet.vn/rss/giai-tri.rss",
    "https://vietnamnet.vn/rss/the-thao.rss",
    "https://vietnamnet.vn/rss/giao-duc.rss",
    "https://vietnamnet.vn/rss/suc-khoe.rss",
    "https://vietnamnet.vn/rss/cong-nghe.rss",
    "https://zingnews.vn/rss/tin-moi.rss",
    "https://zingnews.vn/rss/the-gioi.rss",
    "https://zingnews.vn/rss/thoi-su.rss",
    "https://zingnews.vn/rss/kinh-doanh-tai-chinh.rss",
    "https://zingnews.vn/rss/the-thao.rss",
    "https://zingnews.vn/rss/giai-tri.rss",
    "https://zingnews.vn/rss/giao-duc.rss",
    "https://zingnews.vn/rss/cong-nghe.rss",
]

HEADERS = {"User-Agent": "Mozilla/5.0 (compatible; FakeNewsVN/1.0)"}

# -----------------------------
# 2. Helper functions
# -----------------------------
def clean_html(s: str) -> str:
    if not isinstance(s, str): return ""
    s = re.sub(r"<[^>]+>", " ", s)
    s = re.sub(r"&[a-z]+;", " ", s)
    return re.sub(r"\s+", " ", s).strip()

def crawl_feed(url, limit=300):
    try:
        r = requests.get(url, headers=HEADERS, timeout=20)
        r.encoding = "utf-8"
        soup = BeautifulSoup(r.text, "xml")
        items = soup.find_all("item")[:limit]
        out = []
        for it in items:
            title = clean_html(it.title.text if it.title else "")
            desc  = clean_html(it.description.text if it.description else "")
            link  = (it.link.text if it.link else "").strip()
            pub   = (it.pubDate.text if it.pubDate else "").strip()
            out.append({"title": title, "desc": desc, "link": link, "pubDate": pub, "source": url})
        print(f"✅ {url} -> {len(out)}")
        return out
    except Exception as e:
        print(f"❌ {url} error: {e}")
        return []

# -----------------------------
# 3. Crawl multi-thread
# -----------------------------
rows = []
with ThreadPoolExecutor(max_workers=12) as ex:
    futs = [ex.submit(crawl_feed, u, 300) for u in RSS_FEEDS]
    for f in as_completed(futs):
        rows.extend(f.result())

df = pd.DataFrame(rows)

# merge text
df["text"] = (df["title"].fillna("") + " " + df["desc"].fillna("")).str.strip()

# deduplicate
key = (df["title"].fillna("") + "|" + df["link"].fillna("")).apply(lambda s: hashlib.md5(s.encode("utf-8")).hexdigest())
df["key"] = key
df = df.drop_duplicates(subset=["key"]).drop(columns=["key"])

# convert pubDate -> datetime
df["pubDate"] = pd.to_datetime(df["pubDate"], errors="coerce", utc=True)

print("Total after dedup:", len(df))
df["label"] = "REAL"
df.to_csv("real.csv", index=False, encoding="utf-8-sig")
print("📁 Saved -> real.csv (", len(df), "rows )")

✅ https://vnexpress.net/rss/thoi-su.rss -> 60
✅ https://vnexpress.net/rss/phap-luat.rss -> 60
✅ https://vnexpress.net/rss/the-gioi.rss -> 60
✅ https://vnexpress.net/rss/tin-moi-nhat.rss -> 53
✅ https://vnexpress.net/rss/suc-khoe.rss -> 60
✅ https://vnexpress.net/rss/the-thao.rss -> 60
✅ https://vnexpress.net/rss/giao-duc.rss -> 60
✅ https://vnexpress.net/rss/doi-song.rss -> 60
✅ https://vnexpress.net/rss/kinh-doanh.rss -> 60
✅ https://vnexpress.net/rss/so-hoa.rss -> 60
✅ https://vnexpress.net/rss/giai-tri.rss -> 60
✅ https://vnexpress.net/rss/khoa-hoc.rss -> 60
✅ https://vnexpress.net/rss/oto-xe-may.rss -> 60
✅ https://vnexpress.net/rss/y-kien.rss -> 60
✅ https://tuoitre.vn/rss/thoi-su.rss -> 50
✅ https://tuoitre.vn/rss/tin-moi.rss -> 50
✅ https://tuoitre.vn/rss/the-gioi.rss -> 50
✅ https://tuoitre.vn/rss/kinh-doanh.rss -> 50
✅ https://tuoitre.vn/rss/van-hoa.rss -> 50
✅ https://tuoitre.vn/rss/giao-duc.rss -> 50
✅ https://tuoitre.vn/rss/suc-khoe.rss -> 50
✅ https://tuoitre.vn/rss/the-th

In [1]:
import pandas as pd, numpy as np, re, math, time, tldextract
from sentence_transformers import SentenceTransformer, util, CrossEncoder
from rank_bm25 import BM25Okapi
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch, json
from datetime import datetime, timezone

# ===== 1) Load & chuẩn hoá dữ liệu =====
df = pd.read_csv("real.csv")
if "text" not in df.columns:
    df["title"] = df.get("title","").fillna("")
    df["desc"]  = df.get("desc","").fillna("")
    df["text"]  = (df["title"] + " " + df["desc"]).str.strip()

# parse thời gian
def parse_date(x):
    try:
        return pd.to_datetime(x, utc=True, errors="coerce")
    except:
        return pd.NaT

df["pubDate"] = df.get("pubDate","").astype(str)
df["dt"] = df["pubDate"].apply(parse_date)
df["dt"] = df["dt"].fillna(pd.Timestamp(1970,1,1, tz="UTC"))

# trích domain gốc từ cột 'source' hoặc 'link'
def get_domain(row):
    link = str(row.get("link","") or "")
    src  = str(row.get("source","") or "")
    url  = link if link else src
    if not url:
        return ""
    ext = tldextract.extract(url)
    return ".".join([p for p in [ext.domain, ext.suffix] if p])

df["domain"] = df.apply(get_domain, axis=1)

# ===== 2) Scorer: tin cậy theo domain + trọng số thời gian =====
# tuỳ bạn chỉnh, domain phổ biến được +w
TRUST = {
    "vnexpress.net": 1.2,
    "tuoitre.vn": 1.2,
    "thanhnien.vn": 1.15,
    "dantri.com.vn": 1.15,
    "vietnamnet.vn": 1.1,
    "zingnews.vn": 1.1,
}
def trust_weight(domain: str) -> float:
    return TRUST.get(domain, 1.0)

def recency_weight(dt: pd.Timestamp, half_life_days=21.0) -> float:
    now = datetime.now(timezone.utc)
    age_days = max(0, (now - dt).days if pd.notna(dt) else 3650)
    return math.exp(-age_days / half_life_days)  # 0..1

# ===== 3) Retriever: BM25 + Dense + Cross-Encoder rerank =====
corpus = df["text"].astype(str).tolist()

# BM25
tokenized = [doc.split() for doc in corpus]
bm25 = BM25Okapi(tokenized)

# Dense embedder (mạnh hơn MiniLM)
embedder = SentenceTransformer("sentence-transformers/paraphrase-multilingual-mpnet-base-v2")
corpus_emb = embedder.encode(corpus, convert_to_tensor=True, show_progress_bar=True)

# Cross-encoder reranker (multilingual tương đối ổn)
reranker = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2")  # nếu lỗi mạng, bạn có thể bỏ reranker

# ===== 4) NLI model (không cần tiktoken/sentencepiece) =====
NLI_MODEL = "MoritzLaurer/mDeBERTa-v3-base-mnli-xnli"
device = 0 if torch.cuda.is_available() else -1
tok = AutoTokenizer.from_pretrained(NLI_MODEL)
nli_model = AutoModelForSequenceClassification.from_pretrained(NLI_MODEL)
if device >= 0:
    nli_model = nli_model.to("cuda")
id2label = {0:"entailment", 1:"neutral", 2:"contradiction"}

@torch.no_grad()
def nli_scores(premise: str, hypothesis: str):
    inputs = tok(premise, hypothesis, return_tensors="pt", truncation=True, max_length=512)
    if device >= 0:
        inputs = {k:v.to("cuda") for k,v in inputs.items()}
    logits = nli_model(**inputs).logits
    probs = torch.softmax(logits, dim=-1).detach().cpu().numpy()[0]
    return {id2label[i]: float(probs[i]) for i in range(3)}

# ===== 5) Verify function (nâng cấp) =====
def verify_claim(
    claim: str,
    k_bm25=25,
    k_dense=25,
    top_m=12,
    need_agree_sources=2,
    need_disagree_sources=2
):
    # BM25
    bm25_scores = bm25.get_scores(claim.split())
    top_bm = np.argsort(bm25_scores)[::-1][:k_bm25]

    # Dense
    q_emb = embedder.encode(claim, convert_to_tensor=True)
    dense_scores = util.cos_sim(q_emb, corpus_emb)[0].detach().cpu().numpy()
    top_de = np.argsort(dense_scores)[::-1][:k_dense]

    # hợp nhất ứng viên
    cand_idxs = list(set(top_bm.tolist() + top_de.tolist()))

    # Cross-encoder rerank (optional)
    pairs = [(claim, corpus[i]) for i in cand_idxs]
    try:
        ce_scores = reranker.predict(pairs)  # cao là tốt
        ce_map = {i: float(s) for i, s in zip(cand_idxs, ce_scores)}
    except Exception:
        ce_map = {i: 0.0 for i in cand_idxs}

    # chuẩn hoá điểm BM25/Dense trên cand set
    def minmax(arr):
        arr = np.array(arr, dtype="float32")
        mn, mx = float(arr.min()), float(arr.max())
        if mx - mn < 1e-9:
            return np.zeros_like(arr)
        return (arr - mn) / (mx - mn)

    bm_norm = minmax([bm25_scores[i] for i in cand_idxs])
    de_norm = minmax([dense_scores[i] for i in cand_idxs])
    ce_norm = minmax([ce_map[i] for i in cand_idxs])

    # thêm recency + trust
    recs, trusts = [], []
    for i in cand_idxs:
        recs.append(recency_weight(df.iloc[i]["dt"]))
        trusts.append(trust_weight(df.iloc[i]["domain"]))
    recs = np.array(recs, dtype="float32")
    trusts = np.array(trusts, dtype="float32")

    # tổng hợp điểm (bạn có thể chỉnh trọng số)
    final_scores = 0.35*bm_norm + 0.35*de_norm + 0.2*ce_norm + 0.10*(recs*trusts)
    order = np.argsort(final_scores)[::-1][:top_m]
    ranked = [cand_idxs[idx] for idx in order]

    # NLI + quyết định
    evidences = []
    entail_by_domain = {}
    contra_by_domain = {}
    entail_sum, contra_sum = 0.0, 0.0

    for i in ranked:
        row = df.iloc[i]
        ev_text = str(row["text"])
        nli = nli_scores(ev_text, claim)
        d = row["domain"]
        if nli["entailment"] > nli["contradiction"]:
            entail_by_domain[d] = max(entail_by_domain.get(d, 0.0), nli["entailment"])
            entail_sum += nli["entailment"]
        else:
            contra_by_domain[d] = max(contra_by_domain.get(d, 0.0), nli["contradiction"])
            contra_sum += nli["contradiction"]

        evidences.append({
            "id": f"doc_{i}",
            "evidence": ev_text[:400] + ("..." if len(ev_text) > 400 else ""),
            "bm25_dense_ce": {
                "bm25": float(bm25_scores[i]),
                "dense": float(dense_scores[i]),
                "ce": float(ce_map.get(i, 0.0))
            },
            "recency_w": float(recency_weight(row["dt"])),
            "trust_w": float(trust_weight(row["domain"])),
            "nli": {k: round(v, 4) for k, v in nli.items()},
            "source": row.get("source",""),
            "pubDate": str(row.get("pubDate","")),
            "link": row.get("link","")
        })

    # yêu cầu bằng chứng từ >= N nguồn khác nhau
    agree_sources = sum(1 for _d,_v in entail_by_domain.items() if _v >= 0.6)
    disagree_sources = sum(1 for _d,_v in contra_by_domain.items() if _v >= 0.6)

    label, conf = "UNSURE", 0.5
    if disagree_sources >= need_disagree_sources and (contra_sum > 1.2*max(entail_sum,1e-6)):
        label = "FAKE"
        conf = min(0.95, contra_sum / (contra_sum + entail_sum + 1e-6))
    elif agree_sources >= need_agree_sources and (entail_sum > 1.2*max(contra_sum,1e-6)):
        label = "REAL"
        conf = min(0.95, entail_sum / (contra_sum + entail_sum + 1e-6))

    return {
        "claim": claim,
        "label": label,
        "confidence": round(float(conf), 3),
        "evidences": evidences[:top_m],
        "stats": {
            "agree_sources": int(agree_sources),
            "disagree_sources": int(disagree_sources),
            "entail_sum": round(float(entail_sum),3),
            "contra_sum": round(float(contra_sum),3)
        }
    }


  from .autonotebook import tqdm as notebook_tqdm
Batches: 100%|██████████| 158/158 [00:11<00:00, 13.71it/s]


In [5]:
tests = [
    "Tổng bí thư là Tô Lâm"
]
for t in tests:
    out = verify_claim(t)
    print(json.dumps(out, ensure_ascii=False, indent=2))


{
  "claim": "Tổng bí thư là Tô Lâm",
  "label": "REAL",
  "confidence": 0.916,
  "evidences": [
    {
      "id": "doc_123",
      "evidence": "Tổng Bí thư Tô Lâm tiếp lãnh đạo quốc hội Trung Quốc Tổng Bí thư Tô Lâm tiếp Ủy viên Thường vụ Bộ Chính trị đảng Cộng sản Trung Quốc Triệu Lạc Tế đang thăm chính thức Việt Nam.",
      "bm25_dense_ce": {
        "bm25": 28.05139692877716,
        "dense": 0.749365508556366,
        "ce": 7.371626377105713
      },
      "recency_w": 1.0,
      "trust_w": 1.2,
      "nli": {
        "entailment": 0.9965,
        "neutral": 0.0031,
        "contradiction": 0.0004
      },
      "source": "https://vnexpress.net/rss/the-gioi.rss",
      "pubDate": "2025-08-31 12:05:39+00:00",
      "link": "https://vnexpress.net/tong-bi-thu-to-lam-tiep-lanh-dao-quoc-hoi-trung-quoc-4933791.html"
    },
    {
      "id": "doc_4815",
      "evidence": "Tổng Bí thư tiếp Ủy viên trưởng Nhân đại toàn quốc Trung Quốc Chiều 31/8/2025, tại Trụ sở Trung ương Đảng, Tổng Bí t