# üß† NLP Project: Final Optimized Version (V5)

Phi√™n b·∫£n t·ªëi ∆∞u h√≥a to√†n di·ªán:
1.  **Machine Learning**: NB, LR, SVM (Baseline).
2.  **Deep Learning**: LSTM (Fixed architecture).
3.  **PhoBERT**: Optimized (Max Len 512, Save Best Model).
4.  **Analysis**: Confusion Matrix & Error Inspection.
5.  **Deployment**: Auto-save models.

In [1]:
# --- 1. SETUP & IMPORTS ---
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import torch
import gc
import shutil
import joblib
from pathlib import Path
from tqdm import tqdm
from collections import Counter
import unicodedata

# SKLEARN
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.preprocessing import LabelEncoder

# DEEP LEARNING
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from torch.optim import AdamW
from pyvi import ViTokenizer

# --- C·∫§U H√åNH ---
CURRENT_DIR = Path.cwd()
if (CURRENT_DIR / "data").exists():
    PROJECT_ROOT = CURRENT_DIR
elif (CURRENT_DIR.parent / "data").exists():
    PROJECT_ROOT = CURRENT_DIR.parent
else:
    PROJECT_ROOT = CURRENT_DIR

DATA_DIR = PROJECT_ROOT / "data" / "final"
JSONL_PATH = DATA_DIR / "nlp_dataset.jsonl"
MODEL_DIR = PROJECT_ROOT / "models"
MODEL_DIR.mkdir(parents=True, exist_ok=True)
REPORT_DIR = PROJECT_ROOT / "reports"
REPORT_DIR.mkdir(parents=True, exist_ok=True)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"‚úÖ Device: {device}")

‚úÖ Device: cuda


In [2]:
# --- 2. LOAD DATA (AUTO FIX & CLEAN) ---
STOPWORDS = set([
    "th√¨", "l√†", "m√†", "c·ªßa", "nh·ªØng", "c√°c", "ƒë·ªÉ", "v√†", "v·ªõi", "c√≥", 
    "trong", "ƒë√£", "ƒëang", "s·∫Ω", "ƒë∆∞·ª£c", "b·ªã", "t·∫°i", "v√¨", "nh∆∞", "n√†y",
    "cho", "v·ªÅ", "m·ªôt", "ng∆∞·ªùi", "khi", "ra", "v√†o", "l√™n", "xu·ªëng",
    "t√¥i", "ch√∫ng_t√¥i", "b·∫°n", "h·ªç", "ch√∫ng_ta", "theo", "√¥ng", "b√†",
    "nhi·ªÅu", "√≠t", "r·∫•t", "qu√°", "l·∫Øm", "nh∆∞ng", "tuy_nhi√™n", "n·∫øu", "d√π",
    "b√†i", "vi·∫øt", "·∫£nh", "video", "clip", "ngu·ªìn"
])

def normalize_text(text):
    return unicodedata.normalize('NFC', text)

def remove_stopwords(text):
    words = text.split()
    return " ".join([w for w in words if w.lower() not in STOPWORDS])

print("‚è≥ ƒêang t·∫£i d·ªØ li·ªáu...")
rebuild = False
required_cols = {'text', 'raw_text', 'label_name'}

if JSONL_PATH.exists():
    try:
        df = pd.read_json(JSONL_PATH, lines=True)
        if not required_cols.issubset(df.columns):
            print("‚ö†Ô∏è File c≈© thi·∫øu c·ªôt -> T√°i t·∫°o...")
            rebuild = True
    except: rebuild = True
else:
    rebuild = True

if rebuild and DATA_DIR.exists():
    print("‚ôªÔ∏è ƒêang qu√©t d·ªØ li·ªáu g·ªëc...")
    data = []
    files = list(DATA_DIR.glob("**/*.txt"))
    # files = files[:2000] # Uncomment ƒë·ªÉ test nhanh
    for file_path in tqdm(files, desc="Processing"):
        try:
            try: 
                with open(file_path, "r", encoding="utf-16") as f: content = f.read().strip()
            except: 
                with open(file_path, "r", encoding="utf-8") as f: content = f.read().strip()
            
            if content:
                content = normalize_text(content)
                tokenized = ViTokenizer.tokenize(content)
                clean = remove_stopwords(tokenized)
                data.append({
                    "text": clean, "raw_text": content,
                    "label_name": file_path.parent.name, "filename": file_path.name
                })
        except: continue
    df = pd.DataFrame(data)
    df.to_json(JSONL_PATH, orient="records", lines=True)

# Encode Nh√£n
le = LabelEncoder()
df['label_id'] = le.fit_transform(df['label_name'])
classes = le.classes_
num_classes = len(classes)

train_df, test_df = train_test_split(df, test_size=0.2, random_state=42, stratify=df['label_name'])
print(f"üìä Train: {len(train_df)} | Test: {len(test_df)} | Classes: {num_classes}")
del df; gc.collect()

‚è≥ ƒêang t·∫£i d·ªØ li·ªáu...
üìä Train: 92150 | Test: 23038 | Classes: 20


35

## üõ†Ô∏è 1. Machine Learning Baselines

In [3]:
print("‚è≥ T·∫°o TF-IDF...")
tfidf = TfidfVectorizer(max_features=20000, ngram_range=(1, 2))
X_train = tfidf.fit_transform(train_df['text'])
X_test = tfidf.transform(test_df['text'])

# 1. NB
print("‚öîÔ∏è Training Naive Bayes...")
nb = MultinomialNB()
nb.fit(X_train, train_df['label_id'])
acc_nb = accuracy_score(test_df['label_id'], nb.predict(X_test))
print(f"‚úÖ NB Accuracy: {acc_nb:.4f}")

# 2. LR
print("‚öîÔ∏è Training Logistic Regression...")
lr = LogisticRegression(max_iter=1000, random_state=42)
lr.fit(X_train, train_df['label_id'])
acc_lr = accuracy_score(test_df['label_id'], lr.predict(X_test))
print(f"‚úÖ LR Accuracy: {acc_lr:.4f}")

# 3. SVM
print("‚öîÔ∏è Training SVM...")
svm = LinearSVC(dual=False, random_state=42)
svm.fit(X_train, train_df['label_id'])
y_pred_svm = svm.predict(X_test)
acc_svm = accuracy_score(test_df['label_id'], y_pred_svm)
print(f"‚úÖ SVM Accuracy: {acc_svm:.4f}")
print(classification_report(test_df['label_id'], y_pred_svm, target_names=classes))

‚è≥ T·∫°o TF-IDF...
‚öîÔ∏è Training Naive Bayes...
‚úÖ NB Accuracy: 0.8235
‚öîÔ∏è Training Logistic Regression...
‚úÖ LR Accuracy: 0.8801
‚öîÔ∏è Training SVM...
‚úÖ SVM Accuracy: 0.8863
                     precision    recall  f1-score   support

       B·∫•t ƒë·ªông s·∫£n       0.89      0.91      0.90      1114
        Ch·ª©ng kho√°n       0.93      0.92      0.92       734
          C√¥ng ngh·ªá       0.94      0.97      0.96      1369
            Du l·ªãch       0.87      0.88      0.88      1056
           Gia ƒë√¨nh       0.88      0.83      0.86       604
         Giao th√¥ng       0.88      0.86      0.87       580
           Gi√°o d·ª•c       0.83      0.85      0.84       634
           Gi·∫£i tr√≠       0.81      0.77      0.79       809
           Khoa h·ªçc       0.85      0.86      0.86       932
        Kh·ªüi nghi·ªáp       0.81      0.65      0.72       760
         Kinh doanh       0.87      0.87      0.87      1646
        N√¥ng nghi·ªáp       0.82      0.76      0.

## üß† 2. LSTM (Fixed Architecture)

In [4]:
print("‚è≥ Training LSTM...")
counter = Counter()
for t in train_df['text']: counter.update(t.split())
vocab = {w: i+2 for i, (w, _) in enumerate(counter.most_common(20000))}
vocab['<PAD>'] = 0; vocab['<UNK>'] = 1
MAX_LEN_LSTM = 500

def text_to_seq(text, vocab, max_len):
    seq = [vocab.get(w, 1) for w in text.split()]
    if len(seq) < max_len: seq += [0]*(max_len-len(seq))
    return seq[:max_len]

class LSTMDataset(Dataset):
    def __init__(self, df): 
        self.x = [text_to_seq(t, vocab, MAX_LEN_LSTM) for t in df['text']]
        self.y = df['label_id'].values
    def __len__(self): return len(self.y)
    def __getitem__(self, idx): return torch.tensor(self.x[idx]), torch.tensor(self.y[idx])

train_loader = DataLoader(LSTMDataset(train_df), batch_size=64, shuffle=True)
test_loader = DataLoader(LSTMDataset(test_df), batch_size=64)

# --- CUSTOM CLASS ƒê·ªÇ FIX L·ªñI TUPLE ---
class LSTMClassifier(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_dim, num_classes):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.lstm = nn.LSTM(embed_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, num_classes)
        
    def forward(self, x):
        embedded = self.embedding(x)
        output, (h_n, c_n) = self.lstm(embedded)
        last_hidden = h_n[-1] # L·∫•y hidden state cu·ªëi c√πng
        out = self.fc(last_hidden)
        return out

model_lstm = LSTMClassifier(len(vocab)+2, 100, 100, num_classes).to(device)
opt = optim.Adam(model_lstm.parameters(), lr=0.001)
crit = nn.CrossEntropyLoss()

for epoch in range(10):
    model_lstm.train()
    for x, y in tqdm(train_loader, desc=f"Epoch {epoch+1}"):
        x, y = x.to(device), y.to(device)
        opt.zero_grad(); loss = crit(model_lstm(x), y); loss.backward(); opt.step()

model_lstm.eval()
preds_lstm = []
with torch.no_grad():
    for x, _ in test_loader:
        preds_lstm.extend(torch.argmax(model_lstm(x.to(device)), dim=1).cpu().numpy())

acc_lstm = accuracy_score(test_df['label_id'], preds_lstm)
print(f"‚úÖ LSTM Accuracy: {acc_lstm:.4f}")

‚è≥ Training LSTM...


  return t.to(
Epoch 1: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1440/1440 [00:08<00:00, 179.02it/s]
Epoch 2: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1440/1440 [00:07<00:00, 185.75it/s]
Epoch 3: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1440/1440 [00:07<00:00, 185.27it/s]
Epoch 4: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1440/1440 [00:07<00:00, 185.51it/s]
Epoch 5: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1440/1440 [00:07<00:00, 185.16it/s]
Epoch 6: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1440/1440 [00:07<00:00, 185.44it/s]
Epoch 7: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1440/1440 [00:07<00:00, 185.14it/s]
Epoch 8: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1440/1440 [00:07<00:00, 185.63it/s]
Epoch 9: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1440/1440 [00:07<00:00, 185.21it/s]
Epoch 10: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1440/1440 [00:07<00:00, 185.20it/s]


‚úÖ LSTM Accuracy: 0.8441


## üî• 3. PhoBERT (Optimized: Max Len 512 & Save Best Model)

In [None]:
# --- 3. PHOBERT (FIXED & OPTIMIZED FOR 256 TOKENS) ---
import torch
import shutil
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from torch.optim import AdamW
from sklearn.metrics import accuracy_score

print("‚è≥ Training PhoBERT (Phi√™n b·∫£n V2 - MaxLen 256)...")

# --- C·∫§U H√åNH CHU·∫®N CHO PHOBERT ---
# PhoBERT ch·ªâ h·ªó tr·ª£ t·ªëi ƒëa 256 token. Kh√¥ng ƒë∆∞·ª£c set cao h∆°n.
MAX_LEN_BERT = 256  
BATCH_SIZE = 32      # TƒÉng l√™n 32 v√¨ 256 token t·ªën √≠t VRAM h∆°n
LEARNING_RATE = 2e-5 # LR chu·∫©n cho PhoBERT
EPOCHS = 5           

# Folder l∆∞u model t·ªët nh·∫•t
PHOBERT_DIR = MODEL_DIR / "phobert_best"
PHOBERT_DIR.mkdir(parents=True, exist_ok=True)

# Dataset Class
class PhoBERTDataset(Dataset):
    def __init__(self, df):
        self.texts = df['raw_text'].tolist()
        self.labels = df['label_id'].tolist()
        self.tokenizer = tokenizer
        self.max_len = 256 # Gi·ªõi h·∫°n c·ªßa model

    def __len__(self): return len(self.labels)

    def __getitem__(self, idx):
        text = str(self.texts[idx])
        
        # 1. Tokenize to√†n b·ªô b√†i (kh√¥ng c·∫Øt v·ªôi)
        tokens = self.tokenizer.encode(text, add_special_tokens=True)
        
        # 2. X·ª≠ l√Ω HEAD + TAIL (Chi·∫øn thu·∫≠t l·∫≠t k√®o)
        if len(tokens) > self.max_len:
            # L·∫•y 128 token ƒë·∫ßu v√† 128 token cu·ªëi (Tr·ª´ 2 token ƒë·∫∑c bi·ªát CLS v√† SEP)
            head_len = 128
            tail_len = self.max_len - head_len
            
            # Gh√©p ƒë·∫ßu + ƒëu√¥i
            input_ids = tokens[:head_len] + tokens[-tail_len:]
        else:
            # N·∫øu ng·∫Øn th√¨ pad th√™m s·ªë 0 cho ƒë·ªß
            padding_len = self.max_len - len(tokens)
            input_ids = tokens + [self.tokenizer.pad_token_id] * padding_len
            
        # Chuy·ªÉn th√†nh Tensor
        input_ids = torch.tensor(input_ids)
        attention_mask = (input_ids != self.tokenizer.pad_token_id).long()
        
        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'labels': torch.tensor(self.labels[idx])
        }

train_loader_bert = DataLoader(PhoBERTDataset(train_df), batch_size=BATCH_SIZE, shuffle=True)
test_loader_bert = DataLoader(PhoBERTDataset(test_df), batch_size=BATCH_SIZE)

# Kh·ªüi t·∫°o Model (D√πng b·∫£n V2 x·ªãn h∆°n)
MODEL_NAME = "vinai/phobert-base-v2" 
print(f"   ‚û§ ƒêang t·∫£i model: {MODEL_NAME}")
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model_bert = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=num_classes).to(device)
opt = AdamW(model_bert.parameters(), lr=LEARNING_RATE)

# --- V√íNG L·∫∂P TRAIN SAVE BEST ---
best_acc = 0.0

for epoch in range(EPOCHS):
    print(f"\nüåÄ Epoch {epoch+1}/{EPOCHS}:")
    
    # 1. Training
    model_bert.train()
    progress_bar = tqdm(train_loader_bert, desc="Training")
    
    for batch in progress_bar:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        
        opt.zero_grad()
        outputs = model_bert(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        opt.step()
        
        progress_bar.set_postfix({'loss': f"{loss.item():.4f}"})
    
    # 2. Evaluation
    model_bert.eval()
    preds_temp, targets_temp = [], []
    with torch.no_grad():
        for batch in tqdm(test_loader_bert, desc="Evaluating"):
            outputs = model_bert(batch['input_ids'].to(device), attention_mask=batch['attention_mask'].to(device))
            preds_temp.extend(torch.argmax(outputs.logits, dim=1).cpu().numpy())
            targets_temp.extend(batch['labels'].numpy())
    
    current_acc = accuracy_score(targets_temp, preds_temp)
    print(f"   üìà Accuracy: {current_acc:.4f}")
    
    # 3. Save Best
    if current_acc > best_acc:
        print(f"   üî• K·ª∑ l·ª•c m·ªõi! (Old: {best_acc:.4f} -> New: {current_acc:.4f})")
        model_bert.save_pretrained(PHOBERT_DIR)
        tokenizer.save_pretrained(PHOBERT_DIR)
        best_acc = current_acc
    else:
        print(f"   ‚ö†Ô∏è Kh√¥ng v∆∞·ª£t qua k·ª∑ l·ª•c ({best_acc:.4f})")

print(f"\nüèÜ Model t·ªët nh·∫•t ƒë·∫°t Accuracy: {best_acc:.4f}")

‚è≥ Training PhoBERT (Optimized)...


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at vinai/phobert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



üåÄ Epoch 1/5:


Training:   0%|          | 0/11519 [00:00<?, ?it/s]


RuntimeError: The expanded size of the tensor (512) must match the existing size (258) at non-singleton dimension 1.  Target sizes: [8, 512].  Tensor sizes: [1, 258]

## üîç 4. Analysis & Report (Load Best Model)

In [None]:
# Load l·∫°i Model t·ªët nh·∫•t ƒë·ªÉ ph√¢n t√≠ch
print("‚è≥ Loading Best PhoBERT for Analysis...")
best_model = AutoModelForSequenceClassification.from_pretrained(PHOBERT_DIR).to(device)
best_model.eval()

final_preds, final_targets = [], []
with torch.no_grad():
    for batch in tqdm(test_loader_bert, desc="Final Inference"):
        outputs = best_model(batch['input_ids'].to(device), attention_mask=batch['attention_mask'].to(device))
        final_preds.extend(torch.argmax(outputs.logits, dim=1).cpu().numpy())
        final_targets.extend(batch['labels'].numpy())

# 1. Confusion Matrix
plt.figure(figsize=(12, 10))
cm = confusion_matrix(final_targets, final_preds)
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=classes, yticklabels=classes)
plt.title(f'Confusion Matrix (PhoBERT Best - Acc: {best_acc:.4f})')
plt.xticks(rotation=45, ha='right'); plt.show()

# 2. Classification Report
print("\nüìä Detailed Report:")
print(classification_report(final_targets, final_preds, target_names=classes))

# 3. Soi m·∫´u sai
print("\nüßê C√ÅC M·∫™U SAI ƒêI·ªÇN H√åNH:")
wrong_idx = [i for i, (p, t) in enumerate(zip(final_preds, final_targets)) if p != t]
import random
if wrong_idx:
    for idx in random.sample(wrong_idx, min(5, len(wrong_idx))):
        print("-"*80)
        print(f"üî¥ Th·ª±c t·∫ø: {classes[final_targets[idx]]} | üîµ D·ª± ƒëo√°n: {classes[final_preds[idx]]}")
        # Hack ƒë·ªÉ l·∫•y text g·ªëc t·ª´ dataset th√¥ng qua index (l∆∞u √Ω: test_loader kh√¥ng shuffle n√™n index kh·ªõp v·ªõi test_df)
        print(f"üìñ {test_df.iloc[idx]['raw_text'][:200]}...")

# 4. Save Final Report
results = pd.DataFrame([
    {"Model": "SVM", "Accuracy": acc_svm},
    {"Model": "Logistic Regression", "Accuracy": acc_lr},
    {"Model": "Naive Bayes", "Accuracy": acc_nb},
    {"Model": "LSTM", "Accuracy": acc_lstm},
    {"Model": "PhoBERT (Best)", "Accuracy": best_acc}
]).sort_values(by="Accuracy", ascending=False)
display(results)
results.to_excel(REPORT_DIR / "final_leaderboard.xlsx", index=False)

In [None]:
# --- 5. SAVE ALL MODELS ---
print("üíæ ƒêang l∆∞u c√°c model c√≤n l·∫°i...")
# Sklearn
joblib.dump(le, MODEL_DIR / "label_encoder.pkl")
joblib.dump(tfidf, MODEL_DIR / "tfidf_vectorizer.pkl")
joblib.dump(svm, MODEL_DIR / "svm_linear.pkl")
joblib.dump(lr, MODEL_DIR / "logistic_regression.pkl")

# LSTM
lstm_checkpoint = {
    'vocab': vocab, 'model_state': model_lstm.state_dict(),
    'config': {'vocab_size': len(vocab)+2, 'embed_dim': 100, 'hidden_dim': 100, 'num_classes': num_classes, 'max_len': MAX_LEN_LSTM}
}
torch.save(lstm_checkpoint, MODEL_DIR / "lstm_model.pth")
print("‚úÖ All Done!")