In [7]:
import json, random, torch
from bs4 import BeautifulSoup
import requests
from transformers import DistilBertForSequenceClassification, DistilBertTokenizerFast
from sklearn.metrics import accuracy_score, f1_score

# -------------------- Load model + tokenizer --------------------
# The tokenizer and the classifier weights are both restored from the local
# "saved_model" directory; inference in this notebook runs on the CPU.
device = torch.device("cpu")
tokenizer = DistilBertTokenizerFast.from_pretrained("saved_model")
model = DistilBertForSequenceClassification.from_pretrained("saved_model")
model.eval()  # switch modules (e.g. dropout) to evaluation mode
model.to(device)

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): DistilBertSdpaAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)


In [15]:
# -------------------- Load dataset URLs (only used to draw a random sample of URLs) --------------------
# Every record read below must provide a "url" key and a "label" key.
SAMPLE_SIZE = 100  # upper bound on how many records are evaluated
SAMPLE_SEED = 42   # fixed seed so a fresh Restart & Run All scores the same URLs

with open("dataset.json", "r", encoding="utf-8") as f:
    data = json.load(f)

# A dedicated RNG instance makes the draw reproducible without reseeding the
# global `random` state that other cells may rely on.
sample_data = random.Random(SAMPLE_SEED).sample(data, min(SAMPLE_SIZE, len(data)))
urls = [item["url"] for item in sample_data]
true_labels = [item["label"] for item in sample_data]  # ground truth for accuracy / F1

In [16]:
# -------------------- Crawl plain text --------------------
def fetch_plain_text(url, max_len=5000):
    """Download a page and return its visible text, prefixed with the URL.

    Args:
        url: Address to fetch.
        max_len: Maximum number of characters kept from ``url + " " + text``.

    Returns:
        ``url + " " + text`` cut to ``max_len`` characters, where ``text`` is the
        page text with runs of whitespace collapsed to single spaces and the
        script/style/footer/nav/iframe/form/button elements removed.
        If fetching or parsing fails, ``url`` is returned unchanged so the
        caller always has something to classify.
    """
    try:
        resp = requests.get(url, timeout=5, headers={"User-Agent": "Mozilla/5.0"})
        soup = BeautifulSoup(resp.content, "html.parser")
        # Drop elements that carry no readable page content.
        for tag in soup(["script", "style", "footer", "nav", "iframe", "form", "button"]):
            tag.extract()
        text = " ".join(soup.get_text().split())
        return (url + " " + text)[:max_len]
    except Exception:
        # Best-effort fallback when the crawl fails. `Exception` (not a bare
        # `except`) lets KeyboardInterrupt / SystemExit propagate, so a long
        # crawl can still be interrupted from the notebook.
        return url

# -------------------- Prepare texts --------------------
# One crawl per sampled URL, in order, so texts[i] corresponds to urls[i].
texts = list(map(fetch_plain_text, urls))

In [17]:
# -------------------- Tokenize --------------------
# The whole sample is encoded as a single padded batch of PyTorch tensors.
encodings = tokenizer(
    texts,
    truncation=True,
    padding=True,
    max_length=512,
    return_tensors="pt",
)
input_ids = encodings["input_ids"].to(device)
attention_mask = encodings["attention_mask"].to(device)

# -------------------- Predict --------------------
# Gradients are not needed for inference.
with torch.no_grad():
    outputs = model(input_ids=input_ids, attention_mask=attention_mask)
    # Index of the largest logit along dim=1, brought back as a NumPy array.
    preds_ids = outputs.logits.argmax(dim=1).cpu().numpy()

# -------------------- Map labels --------------------
# NOTE(review): this id order is hard-coded here; confirm it matches the
# mapping the checkpoint was trained with.
label2id = {"chính trị":0, "cờ bạc":1, "18+":2, "nội dung khác":3}
id2label = {class_id: name for name, class_id in label2id.items()}
pred_labels = [id2label[class_id] for class_id in preds_ids.tolist()]

# -------------------- Accuracy & F1 --------------------
# F1 uses the "weighted" average over the label classes.
acc = accuracy_score(true_labels, pred_labels)
f1 = f1_score(true_labels, pred_labels, average="weighted")

print(f"Accuracy: {acc:.4f} | F1-score: {f1:.4f}\n")

# -------------------- Show predictions --------------------
# One entry per evaluated URL: ground truth next to the model's prediction.
for u, t, p in zip(urls, true_labels, pred_labels):
    print(f"URL: {u}\nTrue: {t} | Predicted: {p}\n")

Accuracy: 0.8900 | F1-score: 0.8946

URL: https://vtv90.vip
True: cờ bạc | Predicted: cờ bạc

URL: https://fi88vn.store
True: cờ bạc | Predicted: cờ bạc

URL: https://chandungquyenluc.blogspot.se
True: nội dung khác | Predicted: nội dung khác

URL: https://hit20.club
True: chính trị | Predicted: chính trị

URL: https://link-vao-fun88.online
True: cờ bạc | Predicted: cờ bạc

URL: https://world-dive.com
True: cờ bạc | Predicted: nội dung khác

URL: https://789win-03.com
True: cờ bạc | Predicted: cờ bạc

URL: https://truyenff.net
True: cờ bạc | Predicted: cờ bạc

URL: https://qq88.cafe
True: cờ bạc | Predicted: cờ bạc

URL: https://bet88cc.xyz
True: cờ bạc | Predicted: cờ bạc

URL: https://phimmoichillv.net
True: chính trị | Predicted: chính trị

URL: https://8877999.com
True: cờ bạc | Predicted: cờ bạc

URL: https://b52club.wiki
True: cờ bạc | Predicted: cờ bạc

URL: https://98wincom.club
True: cờ bạc | Predicted: cờ bạc

URL: https://bolapquechoa.blogspot.ru
True: nội dung khác | Predic