In [1]:
# ===============================
# 📦 CELL 1: INSTALL PACKAGES (FIXED VERSION)
# ===============================
# Install core packages first
!pip install transformers datasets torch scikit-learn matplotlib seaborn tqdm gensim nltk -q

# Install Vietnamese NLP packages with error handling
import subprocess
import sys

def install_with_fallback(package_name, fallback_msg=""):
    try:
        subprocess.check_call([sys.executable, "-m", "pip", "install", package_name, "-q"])
        print(f"✅ {package_name} installed successfully")
        return True
    except subprocess.CalledProcessError as e:
        print(f"⚠️ Failed to install {package_name}: {e}")
        if fallback_msg:
            print(f"   {fallback_msg}")
        return False

# Try to install Vietnamese packages
print("📦 Installing Vietnamese NLP packages...")
pyvi_available = install_with_fallback("pyvi", "Will use basic tokenization instead")
underthesea_available = install_with_fallback("underthesea", "Will use alternative tokenization")

# Optional packages
install_with_fallback("wordcloud", "Word cloud generation will be skipped")

print("\n✅ Core packages installed successfully!")
print(f"📊 PyVi available: {pyvi_available}")
print(f"📊 Underthesea available: {underthesea_available}")

# Alternative minimal installation if Vietnamese packages fail
if not pyvi_available and not underthesea_available:
    print("\n⚠️ Vietnamese tokenizers not available - using basic preprocessing")
    print("This will still work but with reduced Vietnamese text processing quality")


📦 Installing Vietnamese NLP packages...
✅ pyvi installed successfully
⚠️ Failed to install underthesea: Command '['/usr/bin/python3', '-m', 'pip', 'install', 'underthesea', '-q']' returned non-zero exit status 2.
   Will use alternative tokenization
✅ wordcloud installed successfully

✅ Core packages installed successfully!
📊 PyVi available: True
📊 Underthesea available: False


In [2]:
# ===============================
# 📂 CELL 2: MOUNT DRIVE & IMPORTS
# ===============================
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
import numpy as np
import torch
import warnings
import re
import html
import os
import pickle
import joblib
from collections import Counter
warnings.filterwarnings('ignore')

# Check GPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"🔥 Using device: {device}")
if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print(f"Memory: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.1f} GB")

from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    classification_report, confusion_matrix, roc_auc_score, roc_curve
)
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler

from transformers import (
    AutoTokenizer, AutoModelForSequenceClassification,
    Trainer, TrainingArguments, DataCollatorWithPadding,
    EarlyStoppingCallback, AutoConfig
)
from datasets import Dataset

from gensim.models import Word2Vec
from gensim.utils import simple_preprocess

from tqdm.auto import tqdm
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
nltk.download('punkt', quiet=True)

# Vietnamese text processing
try:
    from pyvi import ViTokenizer
    PYVI_AVAILABLE = True
except ImportError:
    print("⚠️ PyVi not available, will use basic tokenization")
    PYVI_AVAILABLE = False

try:
    from underthesea import word_tokenize
    UNDERTHESEA_AVAILABLE = True
except ImportError:
    print("⚠️ Underthesea not available, will use alternative tokenization")
    UNDERTHESEA_AVAILABLE = False

print("✅ All imports successful!")


🔥 Using device: cuda
GPU: Tesla T4
Memory: 14.7 GB
⚠️ Underthesea not available, will use alternative tokenization
✅ All imports successful!


In [4]:
# ===============================
# 🔧 CELL 3: ADVANCED VIETNAMESE TEXT PREPROCESSOR
# ===============================
class AdvancedVietnameseTextPreprocessor:
    def __init__(self):
        self.html_tags = re.compile('<.*?>')

        # Vietnamese stopwords (expanded list)
        self.vietnamese_stopwords = {
            'và', 'là', 'có', 'được', 'này', 'đó', 'các', 'một', 'không', 'để', 'trong',
            'của', 'với', 'về', 'từ', 'theo', 'như', 'trên', 'dưới', 'sau', 'trước',
            'đã', 'sẽ', 'đang', 'bị', 'cho', 'tại', 'do', 'vì', 'nên', 'mà', 'hay',
            'hoặc', 'nhưng', 'tuy', 'dù', 'nếu', 'khi', 'lúc', 'bây_giờ', 'hiện_tại',
            'ngày', 'tháng', 'năm', 'giờ', 'phút', 'giây', 'rồi', 'đây', 'kia'
        }

        # Common replacements for text normalization
        self.replacements = {
            # Number normalization
            r'\d{1,2}[/.-]\d{1,2}[/.-]\d{2,4}': ' <DATE> ',  # dates
            r'\d+[.,]\d+': ' <NUMBER> ',  # decimal numbers
            r'\d+': ' <NUMBER> ',  # integers

            # Special characters normalization
            r'[!]{2,}': ' <EXCLAMATION> ',
            r'[?]{2,}': ' <QUESTION> ',
            r'[.]{3,}': ' <DOTS> ',

            # Repeated characters
            r'(.)\1{2,}': r'\1\1',  # reduce repeated chars to max 2
        }

        # Common fake news indicators in Vietnamese
        self.fake_indicators = {
            'nóng', 'hot', 'shock', 'khẩn_cấp', 'cảnh_báo', 'nguy_hiểm',
            'bí_mật', 'tiết_lộ', 'phát_hiện', 'đột_phá', '100%', 'chắc_chắn',
            'tuyệt_đối', 'không_bao_giờ', 'luôn_luôn', 'mãi_mãi'
        }

    def clean_html(self, text):
        """Advanced HTML cleaning"""
        if pd.isna(text):
            return ""

        # Decode HTML entities
        text = html.unescape(str(text))

        # Remove HTML tags but keep some structure
        text = self.html_tags.sub(' ', text)

        # Remove script and style content
        text = re.sub(r'<script[^>]*>.*?</script>', '', text, flags=re.DOTALL | re.IGNORECASE)
        text = re.sub(r'<style[^>]*>.*?</style>', '', text, flags=re.DOTALL | re.IGNORECASE)

        return text

    def normalize_text(self, text):
        """Advanced text normalization"""
        if pd.isna(text):
            return ""

        text = str(text).lower()

        # Apply replacements
        for pattern, replacement in self.replacements.items():
            text = re.sub(pattern, replacement, text)

        # Remove URLs and emails
        text = re.sub(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', ' <URL> ', text)
        text = re.sub(r'\S+@\S+', ' <EMAIL> ', text)

        # Remove phone numbers
        text = re.sub(r'(\+84|0)[0-9]{9,10}', ' <PHONE> ', text)

        # Normalize whitespace
        text = ' '.join(text.split())

        return text

    def tokenize_vietnamese(self, text):
        """Vietnamese-aware tokenization"""
        if pd.isna(text) or not text.strip():
            return []

        # Use Vietnamese tokenizer if available
        if UNDERTHESEA_AVAILABLE:
            try:
                tokens = word_tokenize(text)
                return [token for token in tokens if len(token) > 1]
            except:
                pass

        if PYVI_AVAILABLE:
            try:
                tokenized = ViTokenizer.tokenize(text)
                return [token for token in tokenized.split() if len(token) > 1]
            except:
                pass

        # Fallback to simple tokenization
        return simple_preprocess(text, min_len=2, max_len=50)

    def remove_stopwords(self, tokens):
        """Remove Vietnamese stopwords"""
        return [token for token in tokens if token not in self.vietnamese_stopwords]

    def extract_features(self, text):
        """Extract linguistic features"""
        features = {
            'length': len(str(text)),
            'word_count': len(str(text).split()),
            'avg_word_length': np.mean([len(word) for word in str(text).split()]) if str(text).split() else 0,
            'exclamation_count': str(text).count('!'),
            'question_count': str(text).count('?'),
            'uppercase_ratio': sum(1 for c in str(text) if c.isupper()) / len(str(text)) if str(text) else 0,
            'digit_count': sum(1 for c in str(text) if c.isdigit()),
            'fake_indicator_count': sum(1 for word in str(text).lower().split() if word in self.fake_indicators)
        }
        return features

    def clean_and_process(self, text, remove_stopwords=False, extract_features=False):
        """Complete text processing pipeline"""
        # Clean HTML
        clean_text = self.clean_html(text)

        # Normalize
        normalized = self.normalize_text(clean_text)

        # Extract features if requested
        features = self.extract_features(normalized) if extract_features else None

        # Tokenize
        tokens = self.tokenize_vietnamese(normalized)

        # Remove stopwords if requested
        if remove_stopwords:
            tokens = self.remove_stopwords(tokens)

        processed_text = ' '.join(tokens)

        return processed_text, features

print("✅ Advanced Vietnamese Text Preprocessor defined!")


✅ Advanced Vietnamese Text Preprocessor defined!


In [5]:
# ===============================
# 📋 CELL 4: ENHANCED DATASET PREPROCESSOR
# ===============================
class EnhancedNewsDatasetPreprocessor:
    def __init__(self):
        self.text_preprocessor = AdvancedVietnameseTextPreprocessor()
        self.feature_scaler = StandardScaler()

    def combine_text_features(self, df, use_weights=True):
        """Enhanced text combination with weights"""
        combined_texts = []
        linguistic_features = []

        # Weights for different text components
        weights = {
            'title': 3.0,      # Title is most important
            'summary': 2.0,    # Summary is very important
            'content': 1.0     # Content is baseline
        }

        for _, row in tqdm(df.iterrows(), total=len(df), desc="Processing texts"):
            text_parts = []
            row_features = {}

            # Process title
            if pd.notna(row.get('title')) and str(row.get('title')).strip():
                title_clean, title_feat = self.text_preprocessor.clean_and_process(
                    row['title'], remove_stopwords=False, extract_features=True
                )
                if title_clean:
                    if use_weights:
                        text_parts.extend([title_clean] * int(weights['title']))
                    else:
                        text_parts.append(title_clean)
                    row_features.update({f'title_{k}': v for k, v in title_feat.items()})

            # Process summary
            if pd.notna(row.get('summary')) and str(row.get('summary')).strip():
                summary_clean, summary_feat = self.text_preprocessor.clean_and_process(
                    row['summary'], remove_stopwords=False, extract_features=True
                )
                if summary_clean:
                    if use_weights:
                        text_parts.extend([summary_clean] * int(weights['summary']))
                    else:
                        text_parts.append(summary_clean)
                    row_features.update({f'summary_{k}': v for k, v in summary_feat.items()})

            # Process content
            if pd.notna(row.get('content_html')) and str(row.get('content_html')).strip():
                content_clean, content_feat = self.text_preprocessor.clean_and_process(
                    row['content_html'], remove_stopwords=False, extract_features=True
                )
                if content_clean:
                    text_parts.append(content_clean)
                    row_features.update({f'content_{k}': v for k, v in content_feat.items()})

            # Combine texts
            combined_text = ' [SEP] '.join(text_parts) if text_parts else ""
            combined_texts.append(combined_text)
            linguistic_features.append(row_features)

        return combined_texts, linguistic_features

    def prepare_data(self, df, min_length=50, max_length=8000, balance_data=False):
        """Enhanced data preparation with optional balancing"""
        print("📋 Enhanced preprocessing...")

        # Combine texts and extract features
        combined_texts, linguistic_features = self.combine_text_features(df)
        df['combined_text'] = combined_texts

        # Add linguistic features to dataframe
        if linguistic_features and linguistic_features[0]:  # Check if features exist
            feature_df = pd.DataFrame(linguistic_features)
            df = pd.concat([df, feature_df], axis=1)

        # Filter by text length
        df['text_length'] = df['combined_text'].str.len()

        print(f"📊 Original dataset: {len(df)} samples")

        # Remove texts that are too short or too long
        df = df[(df['text_length'] >= min_length) & (df['text_length'] <= max_length)]

        # Remove empty texts
        df = df[df['combined_text'].str.strip() != '']

        print(f"📊 After length filtering: {len(df)} samples")

        # Balance dataset if requested
        if balance_data:
            df = self.balance_dataset(df)

        # Ensure labels are binary
        df['label'] = df['label'].astype(int)

        print(f"📊 Final dataset: {len(df)} samples")
        print(f"📊 Label distribution:")
        print(df['label'].value_counts())
        print(f"📊 Text length statistics:")
        print(df['text_length'].describe())

        return df

    def balance_dataset(self, df):
        """Balance dataset using undersampling"""
        print("⚖️ Balancing dataset...")

        # Get minority class size
        label_counts = df['label'].value_counts()
        min_size = label_counts.min()

        # Sample equal amounts from each class
        balanced_dfs = []
        for label in df['label'].unique():
            class_df = df[df['label'] == label].sample(n=min_size, random_state=42)
            balanced_dfs.append(class_df)

        balanced_df = pd.concat(balanced_dfs, ignore_index=True)

        print(f"📊 Balanced dataset: {len(balanced_df)} samples")
        print(f"📊 New label distribution:")
        print(balanced_df['label'].value_counts())

        return balanced_df

print("✅ Enhanced Dataset Preprocessor defined!")

✅ Enhanced Dataset Preprocessor defined!


In [9]:
# ===============================
# CELL 5: 🤖 UNIVERSAL TRANSFORMER CLASSIFIER (fixed imports + metrics logging)
# ===============================
from transformers import AutoTokenizer, AutoConfig, AutoModelForSequenceClassification
from torch.optim import AdamW
from torch.utils.data import DataLoader, TensorDataset
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix
import torch.nn as nn
import torch
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    acc = accuracy_score(labels, preds)
    prec = precision_score(labels, preds, zero_division=0)
    rec = recall_score(labels, preds, zero_division=0)
    f1 = f1_score(labels, preds, zero_division=0)
    try:
        auc = roc_auc_score(labels, logits[:, 1])
    except:
        auc = 0.0
    return {"accuracy": acc, "precision": prec, "recall": rec, "f1": f1, "auc": auc}

class UniversalTransformerClassifier:
    def __init__(self, model_name, max_length=512, num_labels=2, from_scratch=False):
        self.model_name = model_name
        self.max_length = max_length
        self.num_labels = num_labels
        self.from_scratch = from_scratch

        print(f"\n🤖 Initializing {model_name} (from_scratch={from_scratch}) ...")

        self.tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)

        if from_scratch:
            config = AutoConfig.from_pretrained(model_name, num_labels=num_labels)
            self.model = AutoModelForSequenceClassification.from_config(config)
            print(f"⚠️ Model {model_name} initialized with RANDOM weights.")
        else:
            self.model = AutoModelForSequenceClassification.from_pretrained(
                model_name, num_labels=num_labels
            )
            print(f"✅ Model {model_name} loaded pretrained weights.")

        self.model.to(device)

    def create_dataset(self, texts, labels=None):
        data = {"text": texts}
        if labels is not None:
            data["labels"] = labels
        dataset = Dataset.from_dict(data)
        return dataset.map(
            lambda batch: self.tokenizer(
                batch["text"], truncation=True, padding="max_length", max_length=self.max_length
            ),
            batched=True,
        )

    def train(self, train_texts, train_labels, val_texts, val_labels,
              num_epochs=20, batch_size=16, learning_rate=2e-5):
        print(f"\n🚀 Training {self.model_name} for {num_epochs} epochs (from_scratch={self.from_scratch})")

        train_dataset = self.encode(train_texts, train_labels)
        val_dataset = self.encode(val_texts, val_labels)

        train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
        val_loader = DataLoader(val_dataset, batch_size=batch_size)

        optimizer = AdamW(self.model.parameters(), lr=learning_rate)
        loss_fn = nn.CrossEntropyLoss()

        best_val_f1 = -1.0
        best_state, best_preds, best_labels = None, None, None

        for epoch in range(1, num_epochs + 1):
            # Training
            self.model.train()
            total_loss = 0.0
            for batch in tqdm(train_loader, desc=f"Epoch {epoch}/{num_epochs} - Training", leave=False):
                input_ids, attention_mask, labels = [x.to(device) for x in batch]
                optimizer.zero_grad()
                outputs = self.model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
                loss = outputs.loss
                loss.backward()
                optimizer.step()
                total_loss += loss.item()

            avg_train_loss = total_loss / max(1, len(train_loader))

            # Validation
            self.model.eval()
            val_preds, val_labels_all, val_probs = [], [], []
            with torch.no_grad():
                for batch in tqdm(val_loader, desc=f"Epoch {epoch}/{num_epochs} - Validation", leave=False):
                    input_ids, attention_mask, labels = [x.to(device) for x in batch]
                    outputs = self.model(input_ids=input_ids, attention_mask=attention_mask)
                    probs = torch.softmax(outputs.logits, dim=1).cpu().numpy()
                    preds = np.argmax(probs, axis=1)

                    val_preds.extend(preds.tolist())
                    val_labels_all.extend(labels.cpu().numpy().tolist())
                    val_probs.extend(probs.tolist())

            acc = accuracy_score(val_labels_all, val_preds)
            prec = precision_score(val_labels_all, val_preds, zero_division=0)
            rec = recall_score(val_labels_all, val_preds, zero_division=0)
            f1 = f1_score(val_labels_all, val_preds, zero_division=0)
            try:
                auc = roc_auc_score(val_labels_all, np.array(val_probs)[:, 1])
            except:
                auc = 0.0

            print(f"📊 Epoch {epoch}/{num_epochs} | Loss={avg_train_loss:.4f} "
                  f"| Acc={acc:.4f} | Prec={prec:.4f} | Rec={rec:.4f} | F1={f1:.4f} | AUC={auc:.4f}")

            # Save best
            if f1 > best_val_f1:
                best_val_f1 = f1
                best_state = {k: v.cpu().clone() for k, v in self.model.state_dict().items()}
                best_preds, best_labels = val_preds.copy(), val_labels_all.copy()

        # Restore best
        if best_state is not None:
            self.model.load_state_dict(best_state)
            self.model.to(device)

        # Confusion matrix of best epoch
        cm = confusion_matrix(best_labels, best_preds)
        plt.figure(figsize=(5,4))
        sns.heatmap(cm, annot=True, fmt="d", cmap="Blues",
                    xticklabels=["Real","Fake"], yticklabels=["Real","Fake"])
        plt.title(f"Best Confusion Matrix ({self.model_name})")
        plt.xlabel("Predicted")
        plt.ylabel("True")
        plt.show()

        return {"best_f1": best_val_f1}

    def save_model(self, save_dir="./saved_model"):
        os.makedirs(save_dir, exist_ok=True)
        self.model.save_pretrained(save_dir)
        self.tokenizer.save_pretrained(save_dir)
        print(f"✅ {self.model_name} saved to {save_dir}")

    def predict(self, texts, batch_size=32):
        dataset = self.create_dataset(texts)
        preds = self.trainer.predict(dataset)
        pred_labels = np.argmax(preds.predictions, axis=1)
        probs = torch.nn.functional.softmax(torch.tensor(preds.predictions), dim=-1).numpy()
        return pred_labels, probs



Using device: cuda


In [10]:
# ===============================
# 🔧 CELL 6 (MODIFIED): PHOBERT FEATURE EXTRACTOR (support from_scratch)
# ===============================
from transformers import AutoConfig, AutoModel

class PhoBERTFeatureExtractor:
    def __init__(self, model_name='vinai/phobert-base', layer=-2, from_scratch=False):
        self.model_name = model_name
        self.layer = layer
        self.from_scratch = from_scratch

        print(f"🔧 Initializing PhoBERTFeatureExtractor (from_scratch={from_scratch}) ...")
        # Keep tokenizer from pretrained vocab
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)

        if from_scratch:
            config = AutoConfig.from_pretrained(model_name)
            # Use AutoModel (encoder only) from config -> random init
            self.model = AutoModel.from_config(config)
            print("⚠️ PhoBERT model initialized with RANDOM weights for feature extraction.")
        else:
            # Use pretrained weights (original behavior)
            self.model = AutoModel.from_pretrained(model_name)
            print("✅ PhoBERT pretrained model loaded for feature extraction.")

        self.model.to(device)
        self.model.eval()
        print(f"✅ PhoBERT feature extractor ready on {device}")

    def extract_features(self, texts, batch_size=16):
        all_features = []
        for i in tqdm(range(0, len(texts), batch_size), desc="Extracting PhoBERT features"):
            batch_texts = texts[i:i+batch_size]
            inputs = self.tokenizer(
                batch_texts,
                return_tensors="pt",
                truncation=True,
                padding=True,
                max_length=256
            )
            inputs = {k: v.to(device) for k, v in inputs.items()}
            with torch.no_grad():
                outputs = self.model(**inputs, output_hidden_states=True)
                hidden_states = outputs.hidden_states[self.layer]
                pooled_features = torch.mean(hidden_states, dim=1)
                all_features.extend(pooled_features.cpu().numpy())
        return np.array(all_features)

print("✅ PhoBERT Feature Extractor (with from_scratch option) defined!")


✅ PhoBERT Feature Extractor (with from_scratch option) defined!


In [11]:
# ===============================
# 🎯 CELL 7 (MODIFIED): HYBRID MODELS (support passing from_scratch to PhoBERT extractor)
# ===============================
class HybridPhoBERTClassifier:
    def __init__(self, method='tfidf', tfidf_features=5000, w2v_size=200, phobert_from_scratch=False):
        self.method = method
        # Pass the from_scratch flag to feature extractor
        self.phobert_extractor = PhoBERTFeatureExtractor(from_scratch=phobert_from_scratch)

        if method == 'tfidf':
            self.tfidf_vectorizer = TfidfVectorizer(
                max_features=tfidf_features,
                ngram_range=(1, 3),
                lowercase=True,
                stop_words=None
            )
        elif method == 'word2vec':
            self.w2v_size = w2v_size
            self.word2vec_model = None

        self.classifier = LogisticRegression(
            random_state=42,
            max_iter=1000,
            class_weight='balanced',
            C=1.0
        )
        self.feature_scaler = StandardScaler()

        print(f"🎯 Initialized Hybrid PhoBERT + {method.upper()} classifier (phobert_from_scratch={phobert_from_scratch})")

    # prepare_word2vec_features, train, predict - keep the same as your original implementation
    def prepare_word2vec_features(self, texts):
        print("🔤 Preparing Word2Vec features...")
        tokenized_texts = []
        preprocessor = AdvancedVietnameseTextPreprocessor()
        for text in tqdm(texts, desc="Tokenizing for Word2Vec"):
            tokens = preprocessor.tokenize_vietnamese(text)
            if tokens:
                tokenized_texts.append(tokens)
        if self.word2vec_model is None:
            print("🔤 Training Word2Vec model...")
            self.word2vec_model = Word2Vec(
                sentences=tokenized_texts,
                vector_size=self.w2v_size,
                window=5,
                min_count=2,
                workers=4,
                epochs=10,
                sg=1
            )
        doc_vectors = []
        for tokens in tokenized_texts:
            vectors = []
            for token in tokens:
                if token in self.word2vec_model.wv:
                    vectors.append(self.word2vec_model.wv[token])
            if vectors:
                doc_vector = np.mean(vectors, axis=0)
            else:
                doc_vector = np.zeros(self.w2v_size)
            doc_vectors.append(doc_vector)
        return np.array(doc_vectors)

    def train(self, train_texts, train_labels):
        print(f"🔧 Training PhoBERT + {self.method.upper()}...")
        phobert_features = self.phobert_extractor.extract_features(train_texts)
        if self.method == 'tfidf':
            text_features = self.tfidf_vectorizer.fit_transform(train_texts).toarray()
        elif self.method == 'word2vec':
            text_features = self.prepare_word2vec_features(train_texts)
        combined_features = np.hstack([phobert_features, text_features])
        combined_features_scaled = self.feature_scaler.fit_transform(combined_features)
        self.classifier.fit(combined_features_scaled, train_labels)
        print(f"✅ PhoBERT + {self.method.upper()} training completed!")
        return self

    def predict(self, texts):
        phobert_features = self.phobert_extractor.extract_features(texts)
        if self.method == 'tfidf':
            text_features = self.tfidf_vectorizer.transform(texts).toarray()
        elif self.method == 'word2vec':
            text_features = self.prepare_word2vec_features(texts)
        combined_features = np.hstack([phobert_features, text_features])
        combined_features_scaled = self.feature_scaler.transform(combined_features)
        predictions = self.classifier.predict(combined_features_scaled)
        probabilities = self.classifier.predict_proba(combined_features_scaled)
        return predictions, probabilities

print("✅ Hybrid PhoBERT Classifiers (with phobert_from_scratch option) defined!")


✅ Hybrid PhoBERT Classifiers (with phobert_from_scratch option) defined!


In [12]:
# ===============================
# 📊 CELL 8: COMPREHENSIVE MODEL EVALUATOR
# ===============================
class ComprehensiveModelEvaluator:
    def __init__(self):
        self.results = {}
        self.predictions = {}
        self.probabilities = {}

    def evaluate_model(self, y_true, y_pred, y_prob=None, model_name="Model"):
        """Comprehensive model evaluation"""

        # Basic metrics
        results = {
            'accuracy': accuracy_score(y_true, y_pred),
            'precision': precision_score(y_true, y_pred, average='weighted', zero_division=0),
            'recall': recall_score(y_true, y_pred, average='weighted', zero_division=0),
            'f1_weighted': f1_score(y_true, y_pred, average='weighted', zero_division=0),
            'f1_macro': f1_score(y_true, y_pred, average='macro', zero_division=0),
            'f1_micro': f1_score(y_true, y_pred, average='micro', zero_division=0)
        }

        # AUC if probabilities are provided
        if y_prob is not None and y_prob.shape[1] == 2:
            try:
                results['auc'] = roc_auc_score(y_true, y_prob[:, 1])
            except:
                results['auc'] = 0.0

        self.results[model_name] = results
        self.predictions[model_name] = y_pred
        if y_prob is not None:
            self.probabilities[model_name] = y_prob

        print(f"\n📊 {model_name} Results:")
        print("-" * 50)
        for metric, value in results.items():
            print(f"{metric.upper():15}: {value:.4f}")

        # Detailed classification report
        print(f"\n📋 Classification Report - {model_name}:")
        print(classification_report(y_true, y_pred, target_names=['Real', 'Fake'], digits=4))

        return results

    def plot_confusion_matrices(self, y_true, models_to_plot=None):
        """Plot confusion matrices for all models"""
        if models_to_plot is None:
            models_to_plot = list(self.predictions.keys())

        n_models = len(models_to_plot)
        if n_models == 0:
            return

        # Calculate grid dimensions
        n_cols = min(3, n_models)
        n_rows = (n_models + n_cols - 1) // n_cols

        fig, axes = plt.subplots(n_rows, n_cols, figsize=(5*n_cols, 4*n_rows))
        if n_models == 1:
            axes = [axes]
        elif n_rows == 1:
            axes = axes.reshape(1, -1)

        for idx, model_name in enumerate(models_to_plot):
            row = idx // n_cols
            col = idx % n_cols
            ax = axes[row, col] if n_rows > 1 else axes[col]

            cm = confusion_matrix(y_true, self.predictions[model_name])

            sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax,
                       xticklabels=['Real', 'Fake'],
                       yticklabels=['Real', 'Fake'])
            ax.set_title(f'{model_name}')
            ax.set_ylabel('True Label')
            ax.set_xlabel('Predicted Label')

        # Hide empty subplots
        for idx in range(n_models, n_rows * n_cols):
            if n_rows > 1:
                row = idx // n_cols
                col = idx % n_cols
                fig.delaxes(axes[row, col])
            elif n_cols > 1:
                fig.delaxes(axes[idx])

        plt.tight_layout()
        plt.show()

    def plot_roc_curves(self, y_true):
        """Plot ROC curves for models with probabilities"""
        plt.figure(figsize=(10, 8))

        for model_name, y_prob in self.probabilities.items():
            if y_prob.shape[1] == 2:  # Binary classification
                fpr, tpr, _ = roc_curve(y_true, y_prob[:, 1])
                auc_score = roc_auc_score(y_true, y_prob[:, 1])
                plt.plot(fpr, tpr, label=f'{model_name} (AUC = {auc_score:.3f})', linewidth=2)

        plt.plot([0, 1], [0, 1], 'k--', alpha=0.6)
        plt.xlim([0.0, 1.0])
        plt.ylim([0.0, 1.05])
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.title('ROC Curves Comparison')
        plt.legend(loc="lower right")
        plt.grid(True, alpha=0.3)
        plt.show()

    def compare_models(self, plot_confusion=True, plot_roc=True):
        """Comprehensive model comparison"""
        if not self.results:
            print("No results to compare!")
            return None

        df_results = pd.DataFrame(self.results).T

        print("\n" + "="*80)
        print("🏆 COMPREHENSIVE MODEL COMPARISON")
        print("="*80)
        print(df_results.round(4))

        # Plot metrics comparison
        fig, axes = plt.subplots(2, 2, figsize=(15, 10))
        fig.suptitle('Model Performance Comparison', fontsize=16)

        # Accuracy, Precision, Recall, F1
        metrics_to_plot = ['accuracy', 'precision', 'recall', 'f1_weighted']
        colors = ['skyblue', 'lightgreen', 'salmon', 'gold']

        for i, metric in enumerate(metrics_to_plot):
            ax = axes[i//2, i%2]
            bars = ax.bar(df_results.index, df_results[metric], color=colors[i])
            ax.set_title(f'{metric.replace("_", " ").title()} Comparison')
            ax.set_ylabel('Score')
            ax.set_ylim(0, 1)

            # Add value labels on bars
            for bar in bars:
                height = bar.get_height()
                ax.text(bar.get_x() + bar.get_width()/2., height + 0.01,
                       f'{height:.3f}', ha='center', va='bottom')

            plt.setp(ax.get_xticklabels(), rotation=45, ha='right')

        plt.tight_layout()
        plt.show()

        # Plot confusion matrices if requested
        if plot_confusion and self.predictions:
            y_true_sample = None
            for pred in self.predictions.values():
                y_true_sample = pred  # We'll need the actual y_true from outside
                break
            # Note: This will be called with actual y_true from the main pipeline

        return df_results

    def get_best_model(self, metric='f1_weighted'):
        """Get the best performing model"""
        if not self.results:
            return None

        best_score = -1
        best_model = None

        for model_name, results in self.results.items():
            if results.get(metric, 0) > best_score:
                best_score = results[metric]
                best_model = model_name

        print(f"🏅 Best model: {best_model} with {metric} = {best_score:.4f}")
        return best_model, best_score

print("✅ Comprehensive Model Evaluator defined!")

✅ Comprehensive Model Evaluator defined!


In [13]:
# ===============================
# 🔄 CELL 9: MAIN PIPELINE CLASS (updated for from_scratch training)
# ===============================
class ComprehensiveFakeNewsPipeline:
    def __init__(self):
        self.preprocessor = EnhancedNewsDatasetPreprocessor()
        self.evaluator = ComprehensiveModelEvaluator()
        self.models = {}
        self.full_data = None   # để lưu dataset đầy đủ

    def load_data(self, fake_path, real_path):
        """Load and combine datasets"""
        print("📂 Loading datasets...")
        cols = ['domain','title','publish_date','summary','content_html','label']
        try:
            df_fake = pd.read_csv(fake_path, usecols=cols)
            df_real = pd.read_csv(real_path, usecols=cols)
        except Exception as e:
            print(f"❌ Error loading data: {e}")
            return None

        print(f"📊 Fake news articles: {len(df_fake)}")
        print(f"📊 Real news articles: {len(df_real)}")

        df = pd.concat([df_fake, df_real], axis=0).reset_index(drop=True)

        print(f"📊 Combined dataset: {len(df)} articles")
        print(f"📊 Label distribution:")
        print(df['label'].value_counts())

        return df

    def run_comprehensive_pipeline(self, df, test_size=0.2,
                                   balance_data=False, models_to_run=None, keep_full_data=True):
        """Run the complete pipeline"""

        # Giữ dataset gốc
        full_df = df.copy()

        # Tiền xử lý
        df_clean = self.preprocessor.prepare_data(df, balance_data=balance_data)
        if len(df_clean) == 0:
            print("❌ No data after preprocessing!")
            return None

        # Nếu muốn giữ dataset đầy đủ
        if keep_full_data:
            self.full_data = self.preprocessor.prepare_data(full_df, balance_data=False)
        else:
            self.full_data = df_clean

        # Train-test split
        X_train, X_test, y_train, y_test = train_test_split(
            df_clean['combined_text'].tolist(),
            df_clean['label'].tolist(),
            test_size=test_size,
            random_state=42,
            stratify=df_clean['label']
        )

        print(f"\n📊 Dataset splits:")
        print(f"Training samples: {len(X_train)}")
        print(f"Test samples: {len(X_test)}")
        print(f"Train label distribution: {pd.Series(y_train).value_counts().to_dict()}")
        print(f"Test label distribution: {pd.Series(y_test).value_counts().to_dict()}")

        # Run selected models
        results_summary = {}
        num_epochs = 20
        batch_size = 16

        # 1. PhoBERT
        if 'phobert' in models_to_run:
            print("\n" + "="*60)
            print("🤖 TRAINING PHOBERT (from scratch)")
            print("="*60)

            torch.cuda.empty_cache() if torch.cuda.is_available() else None
            phobert = UniversalTransformerClassifier('vinai/phobert-base', max_length=256, from_scratch=True)

            phobert.train(X_train, y_train, X_test, y_test,
                          num_epochs=num_epochs, batch_size=batch_size)

            phobert_pred, phobert_prob = phobert.predict(X_test, batch_size=batch_size)

            results_summary['PhoBERT'] = self.evaluator.evaluate_model(
                y_test, phobert_pred, phobert_prob, "PhoBERT"
            )
            self.models['PhoBERT'] = phobert

        # 2. BERT
        if 'bert' in models_to_run:
            print("\n" + "="*60)
            print("🤖 TRAINING BERT (from scratch)")
            print("="*60)

            torch.cuda.empty_cache() if torch.cuda.is_available() else None
            bert = UniversalTransformerClassifier('bert-base-multilingual-cased', max_length=512, from_scratch=True)

            bert.train(X_train, y_train, X_test, y_test,
                       num_epochs=num_epochs, batch_size=batch_size)

            bert_pred, bert_prob = bert.predict(X_test, batch_size=batch_size)

            results_summary['BERT'] = self.evaluator.evaluate_model(
                y_test, bert_pred, bert_prob, "BERT"
            )
            self.models['BERT'] = bert

        # 3. RoBERTa
        if 'roberta' in models_to_run:
            print("\n" + "="*60)
            print("🤖 TRAINING ROBERTA (from scratch)")
            print("="*60)

            torch.cuda.empty_cache() if torch.cuda.is_available() else None
            roberta = UniversalTransformerClassifier('roberta-base', max_length=512, from_scratch=True)

            roberta.train(X_train, y_train, X_test, y_test,
                          num_epochs=num_epochs, batch_size=batch_size)

            roberta_pred, roberta_prob = roberta.predict(X_test, batch_size=batch_size)

            results_summary['RoBERTa'] = self.evaluator.evaluate_model(
                y_test, roberta_pred, roberta_prob, "RoBERTa"
            )
            self.models['RoBERTa'] = roberta

        # 4. PhoBERT + TF-IDF
        if 'phobert_tfidf' in models_to_run:
            print("\n" + "="*60)
            print("🎯 TRAINING PHOBERT + TF-IDF (from scratch)")
            print("="*60)

            torch.cuda.empty_cache() if torch.cuda.is_available() else None
            phobert_tfidf = HybridPhoBERTClassifier(method='tfidf', tfidf_features=10000, phobert_from_scratch=True)
            phobert_tfidf.train(X_train, y_train)

            tfidf_pred, tfidf_prob = phobert_tfidf.predict(X_test)

            results_summary['PhoBERT+TF-IDF'] = self.evaluator.evaluate_model(
                y_test, tfidf_pred, tfidf_prob, "PhoBERT+TF-IDF"
            )
            self.models['PhoBERT+TF-IDF'] = phobert_tfidf

        # 5. PhoBERT + Word2Vec
        if 'phobert_w2v' in models_to_run:
            print("\n" + "="*60)
            print("🎯 TRAINING PHOBERT + WORD2VEC (from scratch)")
            print("="*60)

            torch.cuda.empty_cache() if torch.cuda.is_available() else None
            phobert_w2v = HybridPhoBERTClassifier(method='word2vec', w2v_size=300, phobert_from_scratch=True)
            phobert_w2v.train(X_train, y_train)

            w2v_pred, w2v_prob = phobert_w2v.predict(X_test)

            results_summary['PhoBERT+Word2Vec'] = self.evaluator.evaluate_model(
                y_test, w2v_pred, w2v_prob, "PhoBERT+Word2Vec"
            )
            self.models['PhoBERT+Word2Vec'] = phobert_w2v

        # Final comparison
        print("\n" + "="*80)
        print("🏆 FINAL RESULTS COMPARISON")
        print("="*80)

        comparison_df = self.evaluator.compare_models()

        # Plot confusion matrices
        if len(self.evaluator.predictions) > 0:
            print("\n📊 Confusion Matrices:")
            self.evaluator.plot_confusion_matrices(y_test)

        # Plot ROC curves
        if len(self.evaluator.probabilities) > 0:
            print("\n📈 ROC Curves:")
            self.evaluator.plot_roc_curves(y_test)

        # Get best model
        best_model, best_score = self.evaluator.get_best_model('f1_weighted')

        return {
            'comparison_df': comparison_df,
            'best_model': best_model,
            'best_score': best_score,
            'X_test': X_test,
            'y_test': y_test,
            'models': self.models,
            'evaluator': self.evaluator
        }

print("✅ Comprehensive Pipeline (from scratch) defined!")


✅ Comprehensive Pipeline (from scratch) defined!


In [14]:
# ===============================
# 🎯 CELL 10: PREDICTION & UTILITY FUNCTIONS
# ===============================
def predict_single_text(pipeline, text, model_name=None):
    """Predict on single text with the best model or specified model"""

    if model_name is None:
        # Use best model
        best_model, _ = pipeline.evaluator.get_best_model('f1_weighted')
        model_name = best_model

    if model_name not in pipeline.models:
        print(f"❌ Model {model_name} not found!")
        available_models = list(pipeline.models.keys())
        print(f"Available models: {available_models}")
        return None

    model = pipeline.models[model_name]
    pred, prob = model.predict([text])

    label = "FAKE" if pred[0] == 1 else "REAL"
    confidence = max(prob[0])

    print(f"\n📝 Text: {text[:200]}...")
    print(f"🎯 Model: {model_name}")
    print(f"✅ Prediction: {label} (Confidence={confidence:.4f})")
    print(f"📊 Probabilities → Real={prob[0][0]:.4f}, Fake={prob[0][1]:.4f}")

    return pred[0], confidence, prob[0]


def save_models(pipeline, base_path="/content/drive/MyDrive/fake_news_models"):
    """Save all trained models"""
    os.makedirs(base_path, exist_ok=True)

    for model_name, model in pipeline.models.items():
        try:
            if hasattr(model, 'model') and hasattr(model, 'tokenizer'):
                # Transformer models
                model_path = os.path.join(base_path, model_name.lower().replace('+', '_'))
                os.makedirs(model_path, exist_ok=True)
                model.model.save_pretrained(model_path)
                model.tokenizer.save_pretrained(model_path)
                print(f"✅ {model_name} saved to {model_path}")
            else:
                # Hybrid models
                model_path = os.path.join(base_path, f"{model_name.lower().replace('+', '_')}.joblib")
                joblib.dump(model, model_path)
                print(f"✅ {model_name} saved to {model_path}")
        except Exception as e:
            print(f"❌ Error saving {model_name}: {e}")


def analyze_dataset_statistics(df):
    """Analyze dataset statistics"""
    print("📊 DATASET ANALYSIS")
    print("="*50)

    # Basic statistics
    print(f"Total articles: {len(df)}")
    print(f"Label distribution:\n{df['label'].value_counts()}")
    print(f"Label percentage:\n{df['label'].value_counts(normalize=True).round(4)}")

    # Text length analysis
    if 'combined_text' in df.columns:
        print(f"\nText length statistics:\n{df['combined_text'].str.len().describe()}")

    # Missing values
    print(f"\nMissing values:")
    for col in ['title', 'summary', 'content_html']:
        if col in df.columns:
            missing = df[col].isna().sum()
            print(f"{col}: {missing} ({missing/len(df)*100:.1f}%)")

    # Domain analysis if available
    if 'domain' in df.columns:
        print(f"\nTop 10 domains:\n{df['domain'].value_counts().head(10)}")

print("✅ Utility functions defined!")


✅ Utility functions defined!


In [15]:
# ===============================
# 🚀 CELL 11: MAIN EXECUTION
# ===============================
import pandas as pd
import numpy as np
import torch
import os
from sklearn.model_selection import train_test_split

def main(sample_size=None, models_to_run=None, balance_data=False, keep_full_data=True):
    """Main function to run the complete pipeline"""

    if models_to_run is None:
        # Default: run all models
        models_to_run = ['phobert', 'bert', 'roberta', 'phobert_tfidf', 'phobert_w2v']

    print("🚀 STARTING COMPREHENSIVE FAKE NEWS DETECTION PIPELINE")
    print("="*80)

    # Initialize pipeline
    pipeline = ComprehensiveFakeNewsPipeline()

    # Load data - UPDATE THESE PATHS
    fake_path = "/content/drive/MyDrive/NLP Project - Thay Khanh/ViFN-Vietnamese_Fake_New_Datasets_Ver3-main/processed/deduplicated_articles_fake.csv"
    real_path = "/content/drive/MyDrive/NLP Project - Thay Khanh/ViFN-Vietnamese_Fake_New_Datasets_Ver3-main/processed/deduplicated_articles_real.csv"

    df = pipeline.load_data(fake_path, real_path)
    if df is None:
        print("❌ Failed to load data. Please check your file paths!")
        return None

    # Analyze dataset
    analyze_dataset_statistics(df)

    # Run pipeline
    print("\n⚙️ Configuration:")
    print(f"📊 Sample size: {'Full dataset' if sample_size is None else sample_size}")
    print(f"🤖 Models to run: {models_to_run}")
    print(f"⚖️ Balance data: {balance_data}")
    print(f"📋 Keep full data: {keep_full_data}")

    results = pipeline.run_comprehensive_pipeline(
        df,
        balance_data=balance_data,
        models_to_run=models_to_run,
        keep_full_data=keep_full_data
    )

    if results is None:
        print("❌ Pipeline failed!")
        return None

    print("\n✅ Pipeline completed successfully!")
    return pipeline, results


In [16]:
# ===============================
# 🎯 CELL 12: RUN THE PIPELINE
# ===============================

# Configuration - MODIFY THESE AS NEEDED
SAMPLE_SIZE = None  # Full dataset
MODELS_TO_RUN = ['phobert', 'bert', 'roberta', 'phobert_tfidf', 'phobert_w2v']

# Run
pipeline, results = main(
    sample_size=SAMPLE_SIZE,
    models_to_run=MODELS_TO_RUN,
    balance_data=False,   # Set True nếu dữ liệu quá lệch
    keep_full_data=True
)


🚀 STARTING COMPREHENSIVE FAKE NEWS DETECTION PIPELINE
📂 Loading datasets...
📊 Fake news articles: 666
📊 Real news articles: 4795
📊 Combined dataset: 5461 articles
📊 Label distribution:
label
0    4795
1     666
Name: count, dtype: int64
📊 DATASET ANALYSIS
Total articles: 5461
Label distribution:
label
0    4795
1     666
Name: count, dtype: int64
Label percentage:
label
0    0.878
1    0.122
Name: proportion, dtype: float64

Missing values:
title: 0 (0.0%)
summary: 17 (0.3%)
content_html: 0 (0.0%)

Top 10 domains:
domain
baomoi.com       929
vietnamnet.vn    745
hanoimoi.vn      564
tienphong.vn     477
laodong.vn       367
tuoitre.vn       325
vov.vn           316
vnexpress.net    314
baocaobang.vn    297
thanhnien.vn     185
Name: count, dtype: int64

⚙️ Configuration:
📊 Sample size: Full dataset
🤖 Models to run: ['phobert', 'bert', 'roberta', 'phobert_tfidf', 'phobert_w2v']
⚖️ Balance data: False
📋 Keep full data: True
📋 Enhanced preprocessing...


Processing texts: 100%|██████████| 5461/5461 [01:48<00:00, 50.28it/s]


📊 Original dataset: 5461 samples
📊 After length filtering: 4888 samples
📊 Final dataset: 4888 samples
📊 Label distribution:
label
0    4251
1     637
Name: count, dtype: int64
📊 Text length statistics:
count    4888.000000
mean     3530.451718
std      1597.710313
min       183.000000
25%      2338.750000
50%      3303.500000
75%      4539.250000
max      7998.000000
Name: text_length, dtype: float64
📋 Enhanced preprocessing...


Processing texts: 100%|██████████| 5461/5461 [01:56<00:00, 46.68it/s]


📊 Original dataset: 5461 samples
📊 After length filtering: 4888 samples
📊 Final dataset: 4888 samples
📊 Label distribution:
label
0    4251
1     637
Name: count, dtype: int64
📊 Text length statistics:
count    4888.000000
mean     3530.451718
std      1597.710313
min       183.000000
25%      2338.750000
50%      3303.500000
75%      4539.250000
max      7998.000000
Name: text_length, dtype: float64

📊 Dataset splits:
Training samples: 3910
Test samples: 978
Train label distribution: {0: 3400, 1: 510}
Test label distribution: {0: 851, 1: 127}

🤖 TRAINING PHOBERT (from scratch)

🤖 Initializing vinai/phobert-base (from_scratch=True) ...
⚠️ Model vinai/phobert-base initialized with RANDOM weights.

🚀 Training vinai/phobert-base for 20 epochs (from_scratch=True)




📊 Epoch 1/20 | Loss=0.3983 | Acc=0.8701 | Prec=0.0000 | Rec=0.0000 | F1=0.0000 | AUC=0.8470




📊 Epoch 2/20 | Loss=0.2730 | Acc=0.9407 | Prec=0.7447 | Rec=0.8268 | F1=0.7836 | AUC=0.9669




📊 Epoch 3/20 | Loss=0.1029 | Acc=0.9571 | Prec=0.9126 | Rec=0.7402 | F1=0.8174 | AUC=0.9796




📊 Epoch 4/20 | Loss=0.0673 | Acc=0.9622 | Prec=0.9592 | Rec=0.7402 | F1=0.8356 | AUC=0.9685




📊 Epoch 5/20 | Loss=0.0415 | Acc=0.9581 | Prec=0.9674 | Rec=0.7008 | F1=0.8128 | AUC=0.9833




📊 Epoch 6/20 | Loss=0.0186 | Acc=0.9376 | Prec=1.0000 | Rec=0.5197 | F1=0.6839 | AUC=0.9677




KeyboardInterrupt: 

In [None]:
import pandas as pd

def evaluate_test_samples(pipeline, test_texts, true_labels):
    """
    Evaluate multiple test samples across all trained models.
    Show comparison table: True label + predictions of each model.
    """
    results = []
    models = list(pipeline.models.keys())  # ['PhoBERT', 'BERT', 'RoBERTa', 'PhoBERT+TF-IDF', 'PhoBERT+Word2Vec']

    for i, text in enumerate(test_texts, 1):
        row = {
            "Index": i,
            "Post/News": text[:120] + ("..." if len(text) > 120 else ""),
            "True Label": "FAKE" if true_labels[i-1] == 1 else "REAL"
        }
        for model_name in models:
            model = pipeline.models[model_name]
            pred, prob = model.predict([text])
            row[model_name] = "FAKE" if pred[0] == 1 else "REAL"
        results.append(row)

    df_results = pd.DataFrame(results)
    print("\n📊 Comparison Table across models:")
    display(df_results)
    return df_results


# ===============================
# 🧪 CELL 13: TEST PREDICTIONS (UPDATED)
# ===============================
if pipeline is not None:
    print("\n" + "="*50)
    print("🧪 TESTING PREDICTIONS (Comparison Table)")
    print("="*50)

    # 4 test samples
    test_texts = [
        "Chính phủ Việt Nam công bố chính sách mới về phát triển kinh tế số trong năm 2024",
        "NÓNG: Phát hiện loại thực phẩm có thể chữa khỏi mọi bệnh ung thư chỉ trong 3 ngày!",
        "Nghiên cứu khoa học mới cho thấy tác động tích cực của việc tập thể dục đối với sức khỏe",
        "SHOCK: Trái đất sẽ kết thúc vào năm 2025 theo lời tiên tri cổ đại!"
    ]

    # True labels for reference (1=FAKE, 0=REAL)
    true_labels = [1, 1, 0, 0]

    results_table = evaluate_test_samples(pipeline, test_texts, true_labels)

print("\n🎉 ALL DONE! Table ready for your report.")


In [None]:
import os
import joblib
import torch

# ===============================
# 📦 LƯU TF-IDF + LogisticRegression
# ===============================
def save_tfidf_model(tfidf_model, save_path="/content/drive/MyDrive/fake_news_models/tfidf_model.joblib"):
    os.makedirs(os.path.dirname(save_path), exist_ok=True)
    joblib.dump(tfidf_model, save_path)
    print(f"✅ TF-IDF model saved to {save_path}")

# ===============================
# 🤖 LƯU PHOBERT (HuggingFace)
# ===============================
def save_phobert_model(phobert_model, save_dir="/content/drive/MyDrive/fake_news_models/phobert_model"):
    os.makedirs(save_dir, exist_ok=True)
    phobert_model.model.save_pretrained(save_dir)
    phobert_model.tokenizer.save_pretrained(save_dir)
    print(f"✅ PhoBERT model saved to {save_dir}")


In [None]:
# Lưu TF-IDF
if 'TF-IDF' in pipeline.models:
    save_tfidf_model(pipeline.models['TF-IDF'])

# Lưu PhoBERT
if 'PhoBERT' in pipeline.models:
    save_phobert_model(pipeline.models['PhoBERT'])


In [None]:
# ===============================
# 📝 USAGE INSTRUCTIONS
# ===============================
print("""
📝 HƯỚNG DẪN SỬ DỤNG:

1. 📁 Đảm bảo đường dẫn file CSV chính xác trong CELL 11

2. ▶️ Chạy các cells theo thứ tự:
   - CELL 1-10: Setup và định nghĩa classes
   - CELL 11: Load dataset (theo cách bạn đã viết)
   - CELL 12: Định nghĩa main function
   - CELL 13: Chạy pipeline
   - CELL 14: Test predictions (optional)

3. 🔧 Tùy chỉnh trong CELL 13:
   - sample_size=2000: Test nhanh
   - sample_size=None: Dùng full dataset
   - run_phobert=False: Bỏ qua PhoBERT nếu muốn

4. 🧪 Test predictions:
   predict_single_text(pipeline, "Your text here", "PhoBERT")

⚡ Lưu ý:
- Dataset được load đúng như code gốc của bạn
- Có thể điều chỉnh sample_size trong CELL 13
- Sử dụng GPU runtime để train nhanh hơn
""")