# Dự án NLP: Phân loại Văn bản Tiếng Việt

Notebook này triển khai quy trình hoàn chỉnh để phân loại tin tức tiếng Việt thành 20 chủ đề khác nhau.

**Quy trình thực hiện:**
1.  **Tải & Tiền xử lý dữ liệu:** Chuẩn hóa Unicode, tách từ (tokenization), loại bỏ từ dừng (stopwords).
2.  **Mô hình Machine Learning cơ bản:** Naive Bayes, Logistic Regression, SVM (LinearSVC).
3.  **Mô hình Deep Learning:** LSTM (Long Short-Term Memory).
4.  **Mô hình Deep Learning 2:** TextCNN (Convolutional Neural Networks).
5.  **Đánh giá & Dự đoán:** Xuất báo cáo hiệu năng và hệ thống dự đoán thời gian thực.

In [None]:
# 1. THIẾT LẬP MÔI TRƯỜNG & THƯ VIỆN
import os
import gc
import joblib
import shutil
import unicodedata
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
from collections import Counter
from tqdm import tqdm
import copy

# PyTorch & Transformers
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW
import torch.nn.functional as F

# Scikit-Learn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, classification_report
from sklearn.calibration import CalibratedClassifierCV
import scipy.sparse

# Matplotlib & Seaborn
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, precision_recall_fscore_support

# Thư viện xử lý tiếng Việt
from pyvi import ViTokenizer

# Cấu hình đường dẫn
CURRENT_DIR = Path.cwd()
PROJECT_ROOT = CURRENT_DIR if (CURRENT_DIR / "data").exists() else CURRENT_DIR.parent

DATA_DIR = PROJECT_ROOT / "data" / "final"
MODEL_DIR = PROJECT_ROOT / "models"
REPORT_DIR = PROJECT_ROOT / "reports"
JSONL_PATH = DATA_DIR / "nlp_dataset.jsonl"

# Tạo thư mục nếu chưa tồn tại
for d in [MODEL_DIR, REPORT_DIR]:
    d.mkdir(parents=True, exist_ok=True)

DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Thiết bị đang sử dụng: {DEVICE}")

Thiết bị đang sử dụng: cuda


In [2]:
# 2. TẢI VÀ TIỀN XỬ LÝ DỮ LIỆU

STOPWORD_PATH = PROJECT_ROOT / "data" / "final" / "vietnamese-stopwords-dash.txt"

def load_stopwords(filepath):
    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            return set([line.strip() for line in f.readlines()])
    except FileNotFoundError:
        print(f"Lỗi: Không tìm thấy file stopwords tại {filepath}")
        return {"thì", "là", "mà"}

STOPWORDS = load_stopwords(STOPWORD_PATH)
print(f"Đã tải {len(STOPWORDS)} từ dừng.")

def normalize_text(text):
    return unicodedata.normalize('NFC', text)

def preprocess_text(text):
    text = normalize_text(text)
    tokenized = ViTokenizer.tokenize(text)
    words = tokenized.split()
    clean_words = [w for w in words if w.lower() not in STOPWORDS]
    return " ".join(clean_words)

if JSONL_PATH.exists():
    print("Đang tải dữ liệu từ file JSONL...")
    df = pd.read_json(JSONL_PATH, lines=True)
    
    if 'raw_text' not in df.columns:
        df['raw_text'] = df['text'].apply(normalize_text)
    
    print("Đang xử lý tách từ và lọc stopwords...")
    tqdm.pandas(desc="Xử lý văn bản")
    df['text'] = df['raw_text'].progress_apply(preprocess_text)
    
    df.to_json(JSONL_PATH, orient="records", lines=True)
else:
    print("Lỗi: Không tìm thấy file dữ liệu.")

target_col = 'label_name'
if target_col not in df.columns and 'label' in df.columns:
    df[target_col] = df['label']

le = LabelEncoder()
df['label_id'] = le.fit_transform(df[target_col])
classes = le.classes_
num_classes = len(classes)

train_df, test_df = train_test_split(df, test_size=0.2, random_state=42, stratify=df[target_col])

print(f"Số lượng Train: {len(train_df)} | Số lượng Test: {len(test_df)}")
print(f"Danh sách {num_classes} nhãn: {classes}")

del df
gc.collect()

Đã tải 1942 từ dừng.
Đang tải dữ liệu từ file JSONL...
Đang xử lý tách từ và lọc stopwords...


Xử lý văn bản: 100%|██████████| 115188/115188 [08:16<00:00, 231.78it/s]


Số lượng Train: 92150 | Số lượng Test: 23038
Danh sách 20 nhãn: ['Bất động sản' 'Chứng khoán' 'Công nghệ' 'Du lịch' 'Gia đình'
 'Giao thông' 'Giáo dục' 'Giải trí' 'Khoa học' 'Khởi nghiệp' 'Kinh doanh'
 'Nông nghiệp' 'Pháp luật' 'Sức khỏe' 'Thế giới' 'Thể thao'
 'Thời sự – Chính trị' 'Văn hóa' 'Đời sống' 'Ẩm thực']


20

# 3. MÔ HÌNH MACHINE LEARNING CƠ BẢN

In [3]:
# Tạo đặc trưng TF-IDF
print("Đang tạo vector TF-IDF...")
tfidf = TfidfVectorizer(max_features=20000, ngram_range=(1, 2))
X_train = tfidf.fit_transform(train_df['text'])
X_test = tfidf.transform(test_df['text'])

# 1. Naive Bayes
print("Đang huấn luyện Naive Bayes...")
nb = MultinomialNB()
nb.fit(X_train, train_df['label_id'])
acc_nb = accuracy_score(test_df['label_id'], nb.predict(X_test))
print(f"Naive Bayes Accuracy: {acc_nb:.4f}")

# 2. Logistic Regression
print("Đang huấn luyện Logistic Regression...")
lr = LogisticRegression(max_iter=1000, random_state=42, n_jobs=-1)
lr.fit(X_train, train_df['label_id'])
acc_lr = accuracy_score(test_df['label_id'], lr.predict(X_test))
print(f"Logistic Regression Accuracy: {acc_lr:.4f}")

# 3. SVM (LinearSVC) 
print("Đang huấn luyện SVM (LinearSVC) với chế độ chuẩn hóa xác suất...")
linear_svc = LinearSVC(dual=False, random_state=42, max_iter=1000)
svm = CalibratedClassifierCV(linear_svc, method='sigmoid', cv=5) 
svm.fit(X_train, train_df['label_id'])
acc_svm = accuracy_score(test_df['label_id'], svm.predict(X_test))
print(f"SVM (Calibrated) Accuracy: {acc_svm:.4f}")

Đang tạo vector TF-IDF...
Đang huấn luyện Naive Bayes...
Naive Bayes Accuracy: 0.8264
Đang huấn luyện Logistic Regression...
Logistic Regression Accuracy: 0.8769
Đang huấn luyện SVM (LinearSVC) với chế độ chuẩn hóa xác suất...
SVM (Calibrated) Accuracy: 0.8831


# 4. DEEP LEARNING (LSTM)

In [10]:
INPUT_DIM = X_train.shape[1]
NUM_CLASSES = len(le.classes_)
print(f"Input Dimension: {INPUT_DIM}")
print(f"Number of Classes: {NUM_CLASSES}")

DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Device: {DEVICE}")

class SparseDataset(Dataset):
    def __init__(self, X, y):
        self.X = X
        self.y = torch.tensor(y, dtype=torch.long)

    def __len__(self):
        return self.X.shape[0]

    def __getitem__(self, idx):
        if scipy.sparse.issparse(self.X):
            row = self.X[idx].toarray().squeeze()
        else:
            row = self.X[idx]
        return torch.tensor(row, dtype=torch.float32), self.y[idx]

y_train_vals = train_df['label_id'].values
y_test_vals = test_df['label_id'].values

train_dataset = SparseDataset(X_train, y_train_vals)
test_dataset = SparseDataset(X_test, y_test_vals)

train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)

# --- LSTM MODEL ---
class LSTM(nn.Module):
    def __init__(self, input_size, hidden_dim, num_classes):
        super(LSTM, self).__init__()
        self.hidden_dim = hidden_dim
        self.lstm = nn.LSTM(input_size, hidden_dim, num_layers=2, batch_first=True, dropout=0.3)
        self.fc = nn.Linear(hidden_dim, num_classes)

    def forward(self, x):
        x = x.unsqueeze(1)
        lstm_out, _ = self.lstm(x)
        out = lstm_out[:, -1, :]
        out = self.fc(out)
        return out

model_lstm = LSTM(input_size=INPUT_DIM, hidden_dim=256, num_classes=NUM_CLASSES).to(DEVICE)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model_lstm.parameters(), lr=0.001)

# Early Stopping Config for LSTM
best_acc_lstm = 0.0
patience_lstm = 5
counter_lstm = 0
best_lstm_wts = copy.deepcopy(model_lstm.state_dict())

print("Training LSTM")
for epoch in range(15): 
    model_lstm.train()
    total_loss = 0
    pbar = tqdm(train_loader, desc=f"LSTM Epoch {epoch+1}")
    
    for x_batch, y_batch in pbar:
        x_batch, y_batch = x_batch.to(DEVICE), y_batch.to(DEVICE)
        
        optimizer.zero_grad()
        output = model_lstm(x_batch)
        loss = criterion(output, y_batch)
        loss.backward()
        optimizer.step()
        
        total_loss += loss.item()
        pbar.set_postfix({'loss': f'{loss.item():.4f}'})

    model_lstm.eval()
    preds_temp = []
    labels_temp = []
    with torch.no_grad():
        for x_val, y_val in test_loader:
            x_val = x_val.to(DEVICE)
            out = model_lstm(x_val)
            preds_temp.extend(torch.argmax(out, dim=1).cpu().numpy())
            labels_temp.extend(y_val.numpy())
    
    val_acc = accuracy_score(labels_temp, preds_temp)
    print(f" >> Val Accuracy: {val_acc:.4f}")

    if val_acc > best_acc_lstm:
        best_acc_lstm = val_acc
        best_lstm_wts = copy.deepcopy(model_lstm.state_dict())
        counter_lstm = 0
    else:
        counter_lstm += 1
        if counter_lstm >= patience_lstm:
            print("Early Stopping triggered for LSTM.")
            break

model_lstm.load_state_dict(best_lstm_wts)
print(f"Best LSTM Accuracy: {best_acc_lstm:.4f}")

model_lstm.eval()
preds_lstm = []
with torch.no_grad():
    for x_batch, _ in test_loader:
        x_batch = x_batch.to(DEVICE)
        out = model_lstm(x_batch)
        preds_lstm.extend(torch.argmax(out, dim=1).cpu().numpy())

Input Dimension: 20000
Number of Classes: 20
Device: cuda
Training LSTM


LSTM Epoch 1: 100%|██████████| 1440/1440 [00:15<00:00, 90.21it/s, loss=0.4073]


 >> Val Accuracy: 0.8746


LSTM Epoch 2: 100%|██████████| 1440/1440 [00:15<00:00, 95.13it/s, loss=0.1557]


 >> Val Accuracy: 0.8792


LSTM Epoch 3: 100%|██████████| 1440/1440 [00:15<00:00, 95.34it/s, loss=0.1526]


 >> Val Accuracy: 0.8692


LSTM Epoch 4: 100%|██████████| 1440/1440 [00:14<00:00, 96.35it/s, loss=0.1094]


 >> Val Accuracy: 0.8687


LSTM Epoch 5: 100%|██████████| 1440/1440 [00:15<00:00, 95.49it/s, loss=0.1414]


 >> Val Accuracy: 0.8655


LSTM Epoch 6: 100%|██████████| 1440/1440 [00:14<00:00, 96.01it/s, loss=0.0723]


 >> Val Accuracy: 0.8640


LSTM Epoch 7: 100%|██████████| 1440/1440 [00:14<00:00, 96.25it/s, loss=0.0382]


 >> Val Accuracy: 0.8620
Early Stopping triggered for LSTM.
Best LSTM Accuracy: 0.8792


# 5. DEEP LEARNING MODEL 2: TextMLP

In [5]:
class TextMLP(nn.Module):
    def __init__(self, input_size, num_classes):
        super(TextMLP, self).__init__()
        
        self.fc1 = nn.Linear(input_size, 512)
        self.bn1 = nn.BatchNorm1d(512)
        
        self.fc2 = nn.Linear(512, 128)
        self.bn2 = nn.BatchNorm1d(128)
        
        self.fc3 = nn.Linear(128, 64)
        self.bn3 = nn.BatchNorm1d(64)
        
        self.out = nn.Linear(64, num_classes)
        
        self.dropout = nn.Dropout(0.5)

    def forward(self, x):
        x = self.dropout(F.relu(self.bn1(self.fc1(x))))
        x = self.dropout(F.relu(self.bn2(self.fc2(x))))
        x = self.dropout(F.relu(self.bn3(self.fc3(x))))
        return self.out(x)

# --- 3. HUẤN LUYỆN ---
model_mlp = TextMLP(input_size=INPUT_DIM, num_classes=NUM_CLASSES).to(DEVICE)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model_mlp.parameters(), lr=0.001, weight_decay=1e-5)

best_acc = 0.0
patience = 5
counter = 0
best_weights = copy.deepcopy(model_mlp.state_dict())

print(f"Training MLP (Input Dim: {INPUT_DIM})...")

for epoch in range(20):
    model_mlp.train()
    total_loss = 0
    pbar = tqdm(train_loader, desc=f"Epoch {epoch+1}")
    
    for x, y in pbar:
        x, y = x.to(DEVICE), y.to(DEVICE)
        
        optimizer.zero_grad()
        output = model_mlp(x)
        loss = criterion(output, y)
        loss.backward()
        optimizer.step()
        
        total_loss += loss.item()
        pbar.set_postfix({'loss': f'{loss.item():.4f}'})
    
    model_mlp.eval()
    preds, labels = [], []
    with torch.no_grad():
        for x, y in test_loader:
            x = x.to(DEVICE)
            output = model_mlp(x)
            preds.extend(torch.argmax(output, dim=1).cpu().numpy())
            labels.extend(y.numpy())
            
    val_acc = accuracy_score(labels, preds)
    print(f" >> Val Accuracy: {val_acc:.4f}")
    
    if val_acc > best_acc:
        best_acc = val_acc
        best_weights = copy.deepcopy(model_mlp.state_dict())
        counter = 0
    else:
        counter += 1
        if counter >= patience:
            print("Early Stopping.")
            break

model_mlp.load_state_dict(best_weights)
print(f"Best MLP Accuracy: {best_acc:.4f}")

# Lưu biến dự đoán để dùng cho báo cáo sau này
preds_mlp = []
model_mlp.eval()
with torch.no_grad():
    for x, _ in test_loader:
        x = x.to(DEVICE)
        output = model_mlp(x)
        preds_mlp.extend(torch.argmax(output, dim=1).cpu().numpy())

Training MLP (Input Dim: 20000)...


Epoch 1: 100%|██████████| 1440/1440 [00:13<00:00, 103.01it/s, loss=0.6654]


 >> Val Accuracy: 0.8706


Epoch 2: 100%|██████████| 1440/1440 [00:14<00:00, 102.36it/s, loss=0.5114]


 >> Val Accuracy: 0.8733


Epoch 3: 100%|██████████| 1440/1440 [00:13<00:00, 104.49it/s, loss=0.4761]


 >> Val Accuracy: 0.8764


Epoch 4: 100%|██████████| 1440/1440 [00:14<00:00, 102.14it/s, loss=0.2524]


 >> Val Accuracy: 0.8741


Epoch 5: 100%|██████████| 1440/1440 [00:14<00:00, 99.14it/s, loss=0.5309] 


 >> Val Accuracy: 0.8760


Epoch 6: 100%|██████████| 1440/1440 [00:14<00:00, 99.69it/s, loss=0.3587] 


 >> Val Accuracy: 0.8752


Epoch 7: 100%|██████████| 1440/1440 [00:14<00:00, 100.46it/s, loss=0.2540]


 >> Val Accuracy: 0.8749


Epoch 8: 100%|██████████| 1440/1440 [00:14<00:00, 98.45it/s, loss=0.2201] 


 >> Val Accuracy: 0.8740
Early Stopping.
Best MLP Accuracy: 0.8764


# 6. ĐÁNH GIÁ & BÁO CÁO

In [13]:
print("Dang thu thap ket qua du doan...")
model_preds = {}

def get_ml_pred(model_var_name, file_name, x_input):
    model = None
    if model_var_name in globals() and globals()[model_var_name] is not None:
        model = globals()[model_var_name]
    elif (MODEL_DIR / file_name).exists():
        try:
            print(f"Dang load lai {model_var_name} tu file...")
            model = joblib.load(MODEL_DIR / file_name)
        except: pass
    
    if model is not None:
        try:
            return model.predict(x_input)
        except Exception as e:
            print(f"Loi du doan {model_var_name}: {e}")
    return None

pred_nb = get_ml_pred('nb', 'naive_bayes.pkl', X_test)
if pred_nb is not None: model_preds['Naive Bayes'] = pred_nb

pred_lr = get_ml_pred('lr', 'logistic_regression.pkl', X_test)
if pred_lr is not None: model_preds['Logistic Regression'] = pred_lr

pred_svm = get_ml_pred('svm', 'svm_linear.pkl', X_test)
if pred_svm is not None: model_preds['SVM'] = pred_svm

if 'preds_lstm' in globals():
    model_preds['LSTM'] = np.array(preds_lstm)

if 'preds_mlp' in globals():
    model_preds['TextMLP'] = np.array(preds_mlp)

summary_data = []
report_path = REPORT_DIR / "detailed_classification_report.xlsx"
print(f"Dang xuat Excel report: {report_path.name}")

with pd.ExcelWriter(report_path) as writer:
    for model_name, y_pred in model_preds.items():
        if len(y_pred) != len(test_df):
            print(f"Warning: {model_name} length mismatch. Skipping.")
            continue
            
        # 1. Tính toán các chỉ số
        acc = accuracy_score(test_df['label_id'], y_pred)
        precision, recall, f1, _ = precision_recall_fscore_support(test_df['label_id'], y_pred, average='weighted')
        
        summary_data.append({
            "Model": model_name,
            "Accuracy": acc,
            "Precision": precision,
            "Recall": recall,
            "F1-Score": f1
        })
        
        # 2. Lưu report chi tiết từng class vào Excel
        clf_report = classification_report(test_df['label_id'], y_pred, target_names=classes, output_dict=True)
        df_report = pd.DataFrame(clf_report).transpose()
        sheet_name = model_name[:31]
        df_report.to_excel(writer, sheet_name=sheet_name)

if summary_data:
    # 3. HIỂN THỊ BẢNG TỔNG HỢP (Yêu cầu của bạn)
    results_df = pd.DataFrame(summary_data).sort_values(by="Accuracy", ascending=False).reset_index(drop=True)
    
    print("\n" + "="*60)
    print("BANG TONG HOP KET QUA (WEIGHTED AVERAGE)")
    print("="*60)
    
    # Hiển thị bảng với định dạng %
    display(results_df.style.format({
        "Accuracy": "{:.2%}",
        "Precision": "{:.2%}",
        "Recall": "{:.2%}",
        "F1-Score": "{:.2%}"
    }).background_gradient(cmap='Blues'))
    
    # 4. Vẽ biểu đồ Accuracy
    plt.figure(figsize=(10, 6))
    sns.barplot(data=results_df, x="Accuracy", y="Model", palette="viridis", hue="Model", legend=False)
    plt.title("So sanh Accuracy giua cac mo hinh")
    plt.xlim(0, 1.15)
    plt.grid(axis='x', linestyle='--', alpha=0.7)

    for i, row in results_df.iterrows():
        plt.text(row.Accuracy + 0.01, i, f"{row.Accuracy:.2%}", va='center', fontweight='bold', color='black')

    plt.savefig(REPORT_DIR / "model_comparison.png", bbox_inches='tight')
    plt.show()

print(f"Dang ve Confusion Matrix cho {len(model_preds)} mo hinh...")

for model_name, y_pred in model_preds.items():
    if len(y_pred) != len(test_df): continue

    cm = confusion_matrix(test_df['label_id'], y_pred)
    
    plt.figure(figsize=(12, 10))
    
    color_map = 'Blues'
    if 'SVM' in model_name: color_map = 'Greens'
    if 'TextMLP' in model_name: color_map = 'Purples'
    if 'LSTM' in model_name: color_map = 'Oranges'
    
    sns.heatmap(cm, annot=True, fmt='d', cmap=color_map, 
                xticklabels=classes, yticklabels=classes)
    
    plt.title(f"Confusion Matrix - {model_name}", fontsize=15, fontweight='bold', pad=20)
    plt.ylabel('Nhan thuc te')
    plt.xlabel('Nhan du doan')
    plt.xticks(rotation=45, ha='right')
    plt.tight_layout()
    
    safe_name = model_name.replace(" ", "_")
    plt.savefig(REPORT_DIR / f"confusion_matrix_{safe_name}.png")
    plt.close()

print("Dang luu cac mo hinh...")

if 'le' in globals(): joblib.dump(le, MODEL_DIR / "label_encoder.pkl")
if 'vectorizer' in globals(): joblib.dump(vectorizer, MODEL_DIR / "tfidf_vectorizer_dl.pkl") 
elif 'tfidf' in globals(): joblib.dump(tfidf, MODEL_DIR / "tfidf_vectorizer.pkl")

if 'nb' in globals() and nb is not None: joblib.dump(nb, MODEL_DIR / "naive_bayes.pkl")
if 'lr' in globals() and lr is not None: joblib.dump(lr, MODEL_DIR / "logistic_regression.pkl")
if 'svm' in globals() and svm is not None: joblib.dump(svm, MODEL_DIR / "svm_linear.pkl")

if 'model_lstm' in globals():
    torch.save(model_lstm.state_dict(), MODEL_DIR / "lstm_model.pth")
    print(" - Da luu LSTM Model (lstm_model.pth).")

if 'model_mlp' in globals():
    torch.save(model_mlp.state_dict(), MODEL_DIR / "mlp_model.pth")
    print(" - Da luu TextMLP Model (mlp_model.pth).")

print("Hoan tat toan bo quy trinh!")

Dang thu thap ket qua du doan...
Dang xuat Excel report: detailed_classification_report.xlsx


IndexError: At least one sheet must be visible

# 7. HỆ THỐNG DỰ ĐOÁN THỰC TẾ (INFERENCE)

In [12]:
import torch
import torch.nn as nn
import joblib
import torch.nn.functional as F
import requests
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
from pathlib import Path
from pyvi import ViTokenizer

CURRENT_DIR = Path.cwd()
PROJECT_ROOT = CURRENT_DIR if (CURRENT_DIR / "data").exists() else CURRENT_DIR.parent
MODEL_DIR = PROJECT_ROOT / "models"
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

print(f"Device: {DEVICE}")

# --- 1. ĐỊNH NGHĨA MODEL ---

class LSTM(nn.Module):
    def __init__(self, input_size, hidden_dim, num_classes):
        super(LSTM, self).__init__()
        self.hidden_dim = hidden_dim
        self.lstm = nn.LSTM(input_size, hidden_dim, num_layers=2, batch_first=True, dropout=0.3)
        self.fc = nn.Linear(hidden_dim, num_classes)

    def forward(self, x):
        x = x.unsqueeze(1)
        lstm_out, _ = self.lstm(x)
        out = lstm_out[:, -1, :]
        out = self.fc(out)
        return out

class TextMLP(nn.Module):
    def __init__(self, input_size, num_classes):
        super(TextMLP, self).__init__()
        self.fc1 = nn.Linear(input_size, 512)
        self.bn1 = nn.BatchNorm1d(512)
        self.fc2 = nn.Linear(512, 128)
        self.bn2 = nn.BatchNorm1d(128)
        self.fc3 = nn.Linear(128, 64)
        self.bn3 = nn.BatchNorm1d(64)
        self.out = nn.Linear(64, num_classes)
        self.dropout = nn.Dropout(0.5)

    def forward(self, x):
        x = self.dropout(F.relu(self.bn1(self.fc1(x))))
        x = self.dropout(F.relu(self.bn2(self.fc2(x))))
        x = self.dropout(F.relu(self.bn3(self.fc3(x))))
        return self.out(x)

# --- 2. LOAD MODELS & VECTORIZER ---
le, tfidf = None, None
nb, lr, svm = None, None, None
lstm_model, mlp_model = None, None

try:
    # Load Label Encoder & Vectorizer
    if (MODEL_DIR / "label_encoder.pkl").exists(): 
        le = joblib.load(MODEL_DIR / "label_encoder.pkl")
    elif (MODEL_DIR / "le.pkl").exists():
        le = joblib.load(MODEL_DIR / "le.pkl")
    
    if (MODEL_DIR / "tfidf_vectorizer_dl.pkl").exists():
        tfidf = joblib.load(MODEL_DIR / "tfidf_vectorizer_dl.pkl")
    elif (MODEL_DIR / "tfidf_vectorizer.pkl").exists():
        tfidf = joblib.load(MODEL_DIR / "tfidf_vectorizer.pkl")
    
    # Load ML Models
    if (MODEL_DIR / "naive_bayes.pkl").exists(): nb = joblib.load(MODEL_DIR / "naive_bayes.pkl")
    if (MODEL_DIR / "logistic_regression.pkl").exists(): lr = joblib.load(MODEL_DIR / "logistic_regression.pkl")
    if (MODEL_DIR / "svm_linear.pkl").exists(): svm = joblib.load(MODEL_DIR / "svm_linear.pkl")

    if tfidf and le:
        input_dim = len(tfidf.vocabulary_)
        num_classes = len(le.classes_)

        # Load LSTM (TF-IDF Version)
        if (MODEL_DIR / "lstm_model.pth").exists():
            lstm_model = LSTM(input_size=input_dim, hidden_dim=256, num_classes=num_classes)
            lstm_model.load_state_dict(torch.load(MODEL_DIR / "lstm_model.pth", map_location=DEVICE))
            lstm_model.to(DEVICE).eval()

        # Load TextMLP (Thay thế TextCNN)
        if (MODEL_DIR / "mlp_model.pth").exists():
            mlp_model = TextMLP(input_size=input_dim, num_classes=num_classes)
            mlp_model.load_state_dict(torch.load(MODEL_DIR / "mlp_model.pth", map_location=DEVICE))
            mlp_model.to(DEVICE).eval()

except Exception as e:
    print(f"Load Error: {e}")

# --- 3. HÀM DỰ ĐOÁN ---
def predict_all_models(url_or_text):
    print(f"\nInput: {url_or_text[:60]}...")
    
    content = url_or_text
    if url_or_text.startswith("http"):
        try:
            headers = {'User-Agent': 'Mozilla/5.0'}
            resp = requests.get(url_or_text, headers=headers, timeout=10)
            soup = BeautifulSoup(resp.content, 'html.parser')
            content = ' '.join([p.get_text() for p in soup.find_all('p')]) 
            if len(content) < 50: return print("Short content.")
        except Exception as e: return print(f"URL Error: {e}")

    text_seg = ViTokenizer.tokenize(content)
    
    if not tfidf:
        return print("TF-IDF Vectorizer missing.")

    vec_sparse = tfidf.transform([text_seg]) 
    
    print("-" * 75)
    print(f"{'MODEL':<20} | {'LABEL':<35} | {'CONF'}")
    print("-" * 75)

    if le:
        # ML Predictions
        if nb:
            print(f"{'Naive Bayes':<20} | {le.inverse_transform(nb.predict(vec_sparse))[0].upper():<35} | {nb.predict_proba(vec_sparse).max():.2%}")
        if lr:
            print(f"{'Logistic Reg':<20} | {le.inverse_transform(lr.predict(vec_sparse))[0].upper():<35} | {lr.predict_proba(vec_sparse).max():.2%}")
        if svm:
            try: p_str = f"{svm.predict_proba(vec_sparse).max():.2%}"
            except: p_str = "N/A"
            print(f"{'SVM':<20} | {le.inverse_transform(svm.predict(vec_sparse))[0].upper():<35} | {p_str}")

        # Deep Learning Predictions (Input là Dense Tensor)
        vec_dense = torch.tensor(vec_sparse.toarray(), dtype=torch.float32).to(DEVICE)

        if lstm_model:
            with torch.no_grad():
                out = lstm_model(vec_dense)
                prob, idx = torch.max(torch.softmax(out, dim=1), dim=1)
                print(f"{'LSTM':<20} | {le.inverse_transform([idx.item()])[0].upper():<35} | {prob.item():.2%}")
        
        if mlp_model:
            with torch.no_grad():
                out = mlp_model(vec_dense)
                prob, idx = torch.max(torch.softmax(out, dim=1), dim=1)
                print(f"{'TextMLP':<20} | {le.inverse_transform([idx.item()])[0].upper():<35} | {prob.item():.2%}")
    
    print("-" * 75)

# --- 4. CHẠY THỬ ---
link_test = "https://vnexpress.net/nu-sinh-gianh-hc-vang-sea-games-duoc-truong-thuong-hon-100-trieu-dong-4996452.html"
predict_all_models(link_test)

Device: cuda

Input: https://vnexpress.net/nu-sinh-gianh-hc-vang-sea-games-duoc-t...
---------------------------------------------------------------------------
MODEL                | LABEL                               | CONF
---------------------------------------------------------------------------
Naive Bayes          | GIÁO DỤC                            | 48.52%
Logistic Reg         | GIÁO DỤC                            | 48.32%
SVM                  | GIÁO DỤC                            | 52.42%
LSTM                 | GIÁO DỤC                            | 82.24%
TextMLP              | GIÁO DỤC                            | 91.67%
---------------------------------------------------------------------------
