In [None]:
!pip install transformers torch scikit-learn numpy pandas matplotlib seaborn underthesea vncorenlp -q

In [None]:
import torch
from transformers import AutoTokenizer, AutoModel, T5ForConditionalGeneration, BartForConditionalGeneration
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import re
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import time
from collections import Counter
try:
    from underthesea import word_tokenize, sent_tokenize
    HAS_UNDERTHESEA = True
except ImportError:
    HAS_UNDERTHESEA = False

In [None]:
# Notebook-wide plotting defaults: seaborn "whitegrid" theme, then a 12x8 default figure size.
sns.set_style("whitegrid")
plt.rcParams.update({'figure.figsize': (12, 8)})

In [None]:
def clean_text(text):
    """Normalize whitespace in ``text`` and remove URLs.

    Args:
        text: Input string.

    Returns:
        The string with leading/trailing whitespace trimmed, every run of
        whitespace collapsed to one space, and any ``http(s)://...`` or
        ``www....`` token removed.
    """
    def _squash(value):
        # Trim both ends, then fold each whitespace run into a single space.
        return re.sub(r'\s+', ' ', value.strip())

    without_urls = re.sub(r'https?://\S+|www\.\S+', '', _squash(text))
    # Removing a URL can leave a doubled or dangling space, so squash again.
    return _squash(without_urls)

In [None]:
def simple_sentence_tokenize(text):
    """Split ``text`` into sentences with a regex.

    A boundary is any whitespace run that directly follows ``.``, ``!`` or
    ``?``. Pieces are stripped and empty pieces are discarded.

    Args:
        text: Input string.

    Returns:
        List of non-empty, stripped sentence strings in original order.
    """
    pieces = []
    for chunk in re.split(r'(?<=[.!?])\s+', text):
        trimmed = chunk.strip()
        if trimmed:
            pieces.append(trimmed)
    return pieces

In [None]:
def get_sentence_tokenizer():
    """Pick the sentence tokenizer to use.

    Returns:
        underthesea's ``sent_tokenize`` when the optional import succeeded
        (``HAS_UNDERTHESEA`` is true), otherwise the regex-based
        ``simple_sentence_tokenize`` fallback.
    """
    return sent_tokenize if HAS_UNDERTHESEA else simple_sentence_tokenize

In [None]:
# Module-level sentence tokenizer, resolved once here; the summarizer classes
# call this name directly rather than re-selecting a tokenizer per call.
sentence_tokenize = get_sentence_tokenizer()

In [None]:
class BaseSummarizer:
    """Common interface shared by the summarizers in this notebook.

    Subclasses set a display ``name`` and override ``summarize``.
    """

    def __init__(self, name="Base"):
        """Store the display name used when the summarizer is printed."""
        self.name = name

    def __str__(self):
        """Return the display name."""
        return self.name

    def summarize(self, text, ratio=0.3):
        """Summarize ``text``; must be overridden.

        Args:
            text: Document to summarize.
            ratio: Fraction of the document to keep (interpretation is up to
                the subclass).

        Raises:
            NotImplementedError: Always, in this base class.
        """
        raise NotImplementedError("Each summarizer must implement this method")

In [None]:
class PhoBERTSummarizer(BaseSummarizer):
    """Extractive summarizer that ranks sentences by PhoBERT embedding similarity.

    Every sentence is embedded with ``vinai/phobert-base`` (hidden state of the
    first token), pairwise cosine similarities are computed, and the sentences
    with the largest total similarity to all sentences are kept, in document
    order.

    The display name "PhoBERT (VietAI)" is kept unchanged because the driver
    cell looks results up under exactly that key.
    """

    def __init__(self, device=None):
        """Load the PhoBERT tokenizer and encoder.

        Args:
            device: Torch device string. When falsy, "cuda" is used if
                available, otherwise "cpu".
        """
        super().__init__(name="PhoBERT (VietAI)")
        self.device = device if device else ("cuda" if torch.cuda.is_available() else "cpu")
        print(f"Initializing PhoBERT on {self.device}")

        # Load PhoBERT model
        self.model_name = "vinai/phobert-base"
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
        self.model = AutoModel.from_pretrained(self.model_name).to(self.device)

    @staticmethod
    def _segment_words(sentence):
        """Return ``sentence`` word-segmented for PhoBERT when possible.

        PhoBERT's documentation states that raw input must be word-segmented
        before tokenization (multi-syllable words joined with "_"). When
        underthesea is available its ``word_tokenize(..., format="text")`` is
        used for that; otherwise the raw sentence is passed through unchanged.
        NOTE(review): the PhoBERT authors recommend VnCoreNLP's segmenter;
        underthesea is used here because it is what this notebook imports.
        """
        if HAS_UNDERTHESEA:
            return word_tokenize(sentence, format="text")
        return sentence

    def get_sentence_embeddings(self, sentences):
        """Embed each sentence independently.

        Args:
            sentences: Iterable of raw sentence strings.

        Returns:
            ``np.ndarray`` with one row per sentence, each row being the
            first-token vector of the model's last hidden state.
        """
        embeddings = []
        for sentence in sentences:
            # PhoBERT is trained on word-segmented text, so segment before tokenizing.
            segmented = self._segment_words(sentence)
            inputs = self.tokenizer(segmented, return_tensors="pt", padding=True, truncation=True, max_length=256).to(self.device)
            with torch.no_grad():
                outputs = self.model(**inputs)

            # Use the first-token (classification position) embedding as the sentence embedding
            sentence_embedding = outputs.last_hidden_state[:, 0, :].cpu().numpy()
            embeddings.append(sentence_embedding[0])

        return np.array(embeddings)

    def summarize(self, text, ratio=0.3):
        """Select the most central sentences of ``text``.

        Args:
            text: Document to summarize.
            ratio: Fraction of sentences to keep; at least one is always kept.

        Returns:
            Tuple ``(summary, ranked_sentences, sim_matrix)``:
            ``summary`` is the selected sentences joined by spaces in document
            order; ``ranked_sentences`` is a list of ``(score, index, sentence)``
            sorted by descending score; ``sim_matrix`` is the pairwise cosine
            similarity matrix. For texts of two sentences or fewer the cleaned
            text is returned unchanged with ``[]`` and ``np.array([[1]])``.
        """
        # Clean and get sentences
        text = clean_text(text)
        sentences = sentence_tokenize(text)

        # Too short to rank: return the text itself with placeholder outputs.
        if len(sentences) <= 2:
            return text, [], np.array([[1]])

        # Get sentence embeddings (raw sentences are kept for the output text)
        embeddings = self.get_sentence_embeddings(sentences)

        # Compute cosine similarity between sentences
        sim_matrix = cosine_similarity(embeddings)

        # Score each sentence by its total similarity to all sentences
        # (a simple degree-centrality score, not an iterative PageRank).
        scores = np.sum(sim_matrix, axis=1)
        ranked_sentences = sorted(((scores[i], i, s) for i, s in enumerate(sentences)), reverse=True)

        # Select top sentences, then restore document order
        num_sentences = max(1, int(len(sentences) * ratio))
        selected_indices = sorted([item[1] for item in ranked_sentences[:num_sentences]])

        # Reconstruct the summary
        summary = " ".join([sentences[i] for i in selected_indices])

        return summary, ranked_sentences, sim_matrix

In [None]:
class ViT5Summarizer(BaseSummarizer):
    """Abstractive summarizer that generates a summary with a ViT5 model.

    Loads ``VietAI/vit5-base-vietnews-summarization`` and falls back to
    ``VietAI/vit5-base`` if that fails. Unlike the extractive summarizers, the
    ``ratio`` argument is accepted for interface compatibility but not used.
    """

    def __init__(self, device=None):
        """Load the tokenizer and model.

        Args:
            device: Torch device string. When falsy, "cuda" is used if
                available, otherwise "cpu".
        """
        super().__init__(name="ViT5")
        self.device = device if device else ("cuda" if torch.cuda.is_available() else "cpu")
        print(f"Initializing ViT5 on {self.device}")

        # Prefer the checkpoint fine-tuned for Vietnamese news summarization.
        self.model_name = "VietAI/vit5-base-vietnews-summarization"
        try:
            self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
            self.model = T5ForConditionalGeneration.from_pretrained(self.model_name).to(self.device)
            print("Successfully loaded ViT5 model")
        except Exception as e:
            print(f"Error loading ViT5 model: {e}")
            # Fallback to base model if specific one isn't available
            self.model_name = "VietAI/vit5-base"
            self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
            self.model = T5ForConditionalGeneration.from_pretrained(self.model_name).to(self.device)
            print("Using fallback ViT5 base model")

    @staticmethod
    def _postprocess(summary):
        """Clean a decoded summary.

        Drops characters outside ASCII / U+00C0-U+1EF9 / whitespace / basic
        punctuation, normalizes whitespace, then collapses a word or phrase
        that is repeated three or more times in a row into one occurrence.
        """
        summary = re.sub(r'[^\x00-\x7F\u00C0-\u1EF9\s.,!?:;]', '', summary)  # Remove invalid characters
        summary = re.sub(r'\s+', ' ', summary).strip()  # Fix spacing
        # De-duplicate on whole space-separated tokens only. A character-level
        # pattern such as (.+?)\1{2,} also matches runs inside a token and
        # would turn "1000" into "10" or "..." into ".".
        summary = re.sub(r'(?<!\S)(\S+(?: \S+)*?)(?: \1){2,}(?!\S)', r'\1', summary)
        return summary

    def summarize(self, text, ratio=0.3):
        """Generate an abstractive summary of ``text``.

        Several prompt prefixes are tried in order; the first one whose decoded
        output contains at least one lowercase Latin/Vietnamese letter is used.
        If a prompt raises, the error is printed and a fixed "unable to
        generate" message becomes the current summary while the next prompt is
        tried.

        Args:
            text: Document to summarize.
            ratio: Unused; present for interface compatibility.

        Returns:
            Tuple ``(summary, dummy_ranked, dummy_matrix)`` where
            ``dummy_ranked`` lists every source sentence with score 1.0 and
            ``dummy_matrix`` is an all-zero square matrix sized to the number of
            source sentences. Both are placeholders matching the shape of the
            extractive summarizers' return value.
        """
        # Clean text and handle encoding
        text = clean_text(text)

        # Prompt formats to try, in order.
        prompt_formats = [
            # NOTE(review): the model card's usage example for the vietnews
            # checkpoint prefixes inputs with "vietnews: " — confirm against
            # the card. The tokenizer appends the end-of-sequence token itself.
            f"vietnews: {text}",
            f"summarize: {text}",
            f"tóm tắt: {text}",
            text  # Sometimes no prefix works better
        ]

        # Try each prompt format until one works
        for prompt in prompt_formats:
            try:
                # A single sequence needs no padding; padding to max_length
                # would only add 1024-token encoder work per call.
                inputs = self.tokenizer(
                    prompt,
                    return_tensors="pt",
                    max_length=1024,
                    truncation=True,
                    add_special_tokens=True
                ).to(self.device)

                # Generate summary
                with torch.no_grad():
                    summary_ids = self.model.generate(
                        inputs.input_ids,
                        attention_mask=inputs.attention_mask,
                        max_length=256,  # Shorter max length to avoid repetition
                        min_length=30,   # Ensure a reasonable summary length
                        num_beams=4,     # Beam search for better quality
                        length_penalty=2.0,  # Encourage longer summaries
                        early_stopping=True,
                        no_repeat_ngram_size=3,  # Avoid repeated phrases
                        bad_words_ids=None,      # Don't explicitly ban any tokens
                        do_sample=False          # Deterministic generation
                    )

                # Decode summary
                summary = self.tokenizer.decode(
                    summary_ids[0],
                    skip_special_tokens=True,
                    clean_up_tokenization_spaces=True
                )

                # Check if summary looks valid (contains at least one lowercase letter)
                if any(char in summary for char in "abcdefghijklmnopqrstuvwxyzáàảãạăắằẳẵặâấầẩẫậéèẻẽẹêếềểễệíìỉĩịóòỏõọôốồổỗộơớờởỡợúùủũụưứừửữựýỳỷỹỵđ"):
                    break  # If valid, use this summary
            except Exception as e:
                print(f"Error with prompt format '{prompt[:20]}...': {e}")
                summary = "Không thể tạo tóm tắt (Unable to generate summary)"

        # For compatibility with visualization functions
        sentences = sentence_tokenize(text)
        dummy_matrix = np.zeros((len(sentences), len(sentences)))
        dummy_ranked = [(1.0, i, s) for i, s in enumerate(sentences)]

        # Perform basic post-processing to clean up summary
        summary = self._postprocess(summary)

        return summary, dummy_ranked, dummy_matrix

In [None]:
class TfidfSummarizer(BaseSummarizer):
    """Extractive summarizer that ranks sentences by TF-IDF cosine similarity."""

    def __init__(self):
        super().__init__(name="TF-IDF")

    def summarize(self, text, ratio=0.3):
        """Keep the sentences most similar to the rest of the document.

        Args:
            text: Document to summarize.
            ratio: Fraction of sentences to keep; at least one is kept.

        Returns:
            Tuple ``(summary, ranked_sentences, sim_matrix)``. For texts of two
            sentences or fewer: the cleaned text, ``[]`` and ``np.array([[1]])``.
            If vectorization raises ``ValueError``: the leading sentences, a
            ranking with every score set to 1.0, and an all-zero matrix.
        """
        cleaned = clean_text(text)
        sents = sentence_tokenize(cleaned)
        total = len(sents)

        # Nothing worth ranking in very short inputs.
        if total <= 2:
            return cleaned, [], np.array([[1]])

        vectorizer = TfidfVectorizer()
        try:
            tfidf = vectorizer.fit_transform(sents)
        except ValueError:
            # Vectorizer could not build a vocabulary: fall back to the lead sentences.
            lead_count = max(1, int(total * ratio))
            placeholder_ranking = [(1.0, idx, sent) for idx, sent in enumerate(sents)]
            return " ".join(sents[:lead_count]), placeholder_ranking, np.zeros((total, total))

        # Pairwise similarity; a sentence's score is the sum of its row.
        sim_matrix = cosine_similarity(tfidf)
        row_totals = np.sum(sim_matrix, axis=1)

        ranked_sentences = [(row_totals[idx], idx, sent) for idx, sent in enumerate(sents)]
        ranked_sentences.sort(reverse=True)

        # Take the best-scoring sentences and put them back in document order.
        keep = max(1, int(total * ratio))
        chosen = sorted(entry[1] for entry in ranked_sentences[:keep])
        summary = " ".join(sents[idx] for idx in chosen)

        return summary, ranked_sentences, sim_matrix

In [None]:
class MultiBERTSummarizer(BaseSummarizer):
    """Extractive summarizer that ranks sentences by multilingual-BERT embedding similarity."""

    def __init__(self, device=None):
        """Load ``bert-base-multilingual-cased``.

        Args:
            device: Torch device string. When falsy, "cuda" is used if
                available, otherwise "cpu".
        """
        super().__init__(name="mBERT")
        if device:
            self.device = device
        else:
            self.device = "cuda" if torch.cuda.is_available() else "cpu"
        print(f"Initializing mBERT on {self.device}")

        self.model_name = "bert-base-multilingual-cased"
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
        self.model = AutoModel.from_pretrained(self.model_name).to(self.device)

    def get_sentence_embeddings(self, sentences):
        """Embed each sentence independently.

        Returns:
            ``np.ndarray`` with one row per sentence: the first-token ([CLS])
            vector of the model's last hidden state.
        """
        def encode(sentence):
            batch = self.tokenizer(sentence, return_tensors="pt", padding=True, truncation=True, max_length=256).to(self.device)
            with torch.no_grad():
                result = self.model(**batch)
            # Position 0 is the [CLS] token; the batch holds exactly one sentence.
            cls_vectors = result.last_hidden_state[:, 0, :].cpu().numpy()
            return cls_vectors[0]

        return np.array([encode(sentence) for sentence in sentences])

    def summarize(self, text, ratio=0.3):
        """Keep the sentences with the highest total similarity to all sentences.

        Args:
            text: Document to summarize.
            ratio: Fraction of sentences to keep; at least one is kept.

        Returns:
            Tuple ``(summary, ranked_sentences, sim_matrix)``. For texts of two
            sentences or fewer: the cleaned text, ``[]`` and ``np.array([[1]])``.
        """
        cleaned = clean_text(text)
        sents = sentence_tokenize(cleaned)
        total = len(sents)

        if total <= 2:
            return cleaned, [], np.array([[1]])

        sim_matrix = cosine_similarity(self.get_sentence_embeddings(sents))

        # Score = row sum of the similarity matrix.
        row_totals = np.sum(sim_matrix, axis=1)
        ranked_sentences = [(row_totals[idx], idx, sent) for idx, sent in enumerate(sents)]
        ranked_sentences.sort(reverse=True)

        # Best-scoring sentences, restored to document order.
        keep = max(1, int(total * ratio))
        chosen = sorted(entry[1] for entry in ranked_sentences[:keep])
        summary = " ".join(sents[idx] for idx in chosen)

        return summary, ranked_sentences, sim_matrix

In [None]:
class PositionSummarizer(BaseSummarizer):
    """Baseline summarizer that prefers sentences appearing earlier in the text."""

    def __init__(self):
        super().__init__(name="Position-based")

    def summarize(self, text, ratio=0.3):
        """Keep the leading sentences of ``text``.

        Sentence ``i`` of ``n`` is scored ``1.0 - i/n``, so the score falls
        linearly from 1.0 for the first sentence.

        Args:
            text: Document to summarize.
            ratio: Fraction of sentences to keep; at least one is kept.

        Returns:
            Tuple ``(summary, ranked_sentences, sim_matrix)`` where
            ``sim_matrix`` is an all-zero placeholder. For texts of two
            sentences or fewer: the cleaned text, ``[]`` and ``np.array([[1]])``.
        """
        cleaned = clean_text(text)
        sents = sentence_tokenize(cleaned)
        count = len(sents)

        if count <= 2:
            return cleaned, [], np.array([[1]])

        # (score, index, sentence) triples, highest score first.
        ranked_sentences = [(1.0 - (pos / count), pos, sent) for pos, sent in enumerate(sents)]
        ranked_sentences.sort(reverse=True)

        keep = max(1, int(count * ratio))
        chosen = sorted(entry[1] for entry in ranked_sentences[:keep])
        summary = " ".join(sents[pos] for pos in chosen)

        # No similarities are computed here; zeros keep the return shape uniform.
        placeholder_matrix = np.zeros((count, count))

        return summary, ranked_sentences, placeholder_matrix

In [None]:
# Driver cell: builds the list of summarizers, runs them on two sample texts,
# prints comparison tables and summaries, and plots PhoBERT details.
#
# NOTE(review): this cell reads several names that are not defined in the
# visible cells: `device`, `vietnamese_tech_text`, `vietnamese_tourism_text`,
# `compare_summaries`, `display_comparison_table`, `plot_performance_comparison`,
# `plot_similarity_matrix` and `plot_sentence_scores`. They must be defined by
# an earlier-executed cell or this cell raises NameError on a fresh kernel —
# confirm. (Every summarizer constructor also accepts device=None and then
# picks cuda/cpu itself.)
if __name__ == "__main__":
    # Initialize summarizers
    summarizers = [
        TfidfSummarizer(),  # Classical TF-IDF approach
        PositionSummarizer(),  # Position-based simple baseline
        PhoBERTSummarizer(device),  # PhoBERT (vinai/phobert-base)
    ]

    # Optionally add more complex models if GPU available
    if torch.cuda.is_available():
        summarizers.append(MultiBERTSummarizer(device))  # Multilingual BERT
        try:
            # Flag for trying ViT5 on its own before including it in the comparison.
            # When True, ViT5 is only smoke-tested and NOT added to `summarizers`.
            test_vit5 = False  # Set to True to test ViT5 in isolation
            if test_vit5:
                print("\nTesting ViT5 summarizer in isolation...")
                vit5 = ViT5Summarizer(device)
                for text_name, text in [("Tech Article", vietnamese_tech_text), ("Tourism Article", vietnamese_tourism_text)]:
                    print(f"\nTesting on {text_name}...")
                    vit5_summary, _, _ = vit5.summarize(text)
                    print(f"ViT5 Summary ({len(vit5_summary)} chars):")
                    print("="*80)
                    print(vit5_summary)
                    print("="*80)
            else:
                # Add ViT5 to the main comparison
                summarizers.append(ViT5Summarizer(device))  # Vietnamese T5
        except Exception as e:
            # Reached only if ViT5Summarizer raised; its constructor already
            # retries with a fallback checkpoint before giving up.
            print(f"Note: ViT5 model could not be loaded: {e}")

            # Try MT5 as alternative
            try:
                print("\nTrying MT5 as alternative to ViT5...")
                from transformers import MT5ForConditionalGeneration

                class MT5Summarizer(BaseSummarizer):
                    """Abstractive fallback summarizer using google/mt5-small.

                    Defined inline because it is only needed when ViT5 fails.
                    """

                    def __init__(self, device=None):
                        super().__init__(name="MT5")
                        self.device = device if device else ("cuda" if torch.cuda.is_available() else "cpu")
                        print(f"Initializing MT5 on {self.device}")

                        # Load MT5 model
                        self.model_name = "google/mt5-small"
                        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
                        self.model = MT5ForConditionalGeneration.from_pretrained(self.model_name).to(self.device)

                    def summarize(self, text, ratio=0.3):
                        """Generate a summary; `ratio` is accepted but not used."""
                        # Clean text
                        text = clean_text(text)

                        # Prepare with Vietnamese prompt
                        input_text = f"summarize to Vietnamese: {text}"

                        # Tokenize input
                        inputs = self.tokenizer(input_text, return_tensors="pt", max_length=1024, truncation=True).to(self.device)

                        # Generate summary
                        with torch.no_grad():
                            summary_ids = self.model.generate(
                                inputs.input_ids,
                                max_length=150,
                                min_length=40,
                                num_beams=4,
                                early_stopping=True
                            )

                        # Decode summary
                        summary = self.tokenizer.decode(summary_ids[0], skip_special_tokens=True)

                        # For compatibility with visualization functions:
                        # placeholder ranking (all scores 1.0) and an all-zero matrix.
                        sentences = sentence_tokenize(text)
                        dummy_matrix = np.zeros((len(sentences), len(sentences)))
                        dummy_ranked = [(1.0, i, s) for i, s in enumerate(sentences)]

                        return summary, dummy_ranked, dummy_matrix

                summarizers.append(MT5Summarizer(device))
            except Exception as e:
                print(f"Note: MT5 model could not be loaded either: {e}")

    print(f"Initialized {len(summarizers)} summarization models")

    # Compare models on tech article
    print("\n" + "="*80)
    print("SAMPLE 1: VIETNAMESE TECHNOLOGY NEWS")
    print("="*80)
    print("ORIGINAL TEXT:")
    print("=" * 80)
    print(vietnamese_tech_text)
    print()
    # presumably returns a dict keyed by summarizer name whose values carry
    # "summary", "ranked_sentences" and "sim_matrix" — inferred from the
    # lookups below; verify against compare_summaries.
    tech_results = compare_summaries(vietnamese_tech_text, summarizers, ratio=0.4)

    # Display comparison table
    tech_comparison = display_comparison_table(tech_results, len(vietnamese_tech_text))
    print("\nModel Comparison for Technology Text:")
    print(tech_comparison)

    # Plot performance comparison
    plot_performance_comparison(tech_results)

    # Display summaries
    for model, result in tech_results.items():
        print(f"\n{model} Summary:")
        print("="*80)
        print(result["summary"])

    # Compare models on tourism article (same steps as for the tech article)
    print("\n" + "="*80)
    print("SAMPLE 2: VIETNAMESE TOURISM ARTICLE")
    print("="*80)
    print("ORIGINAL TEXT:")
    print("=" * 80)
    print(vietnamese_tourism_text)
    print()
    tourism_results = compare_summaries(vietnamese_tourism_text, summarizers, ratio=0.4)

    # Display comparison table
    tourism_comparison = display_comparison_table(tourism_results, len(vietnamese_tourism_text))
    print("\nModel Comparison for Tourism Text:")
    print(tourism_comparison)

    # Plot performance comparison
    plot_performance_comparison(tourism_results)

    # Display summaries
    for model, result in tourism_results.items():
        print(f"\n{model} Summary:")
        print("="*80)
        print(result["summary"])

    # Visualize PhoBERT results in detail (as it's our primary model).
    # The key must match the name PhoBERTSummarizer passes to BaseSummarizer.
    if "PhoBERT (VietAI)" in tech_results:
        print("\n" + "="*80)
        print("DETAILED ANALYSIS OF PHOBERT MODEL")
        print("="*80)

        # Tech text visualization. The text is re-cleaned and re-tokenized here
        # the same way the summarizer does, to supply the sentence list.
        phobert_tech = tech_results["PhoBERT (VietAI)"]
        plot_similarity_matrix(phobert_tech["sim_matrix"],
                              sentence_tokenize(clean_text(vietnamese_tech_text)),
                              "PhoBERT: Tech Article Similarity Matrix")

        plot_sentence_scores(phobert_tech["ranked_sentences"],
                            sentence_tokenize(clean_text(vietnamese_tech_text)),
                            "PhoBERT: Tech Article Sentence Scores")

        # Tourism text visualization
        phobert_tourism = tourism_results["PhoBERT (VietAI)"]
        plot_similarity_matrix(phobert_tourism["sim_matrix"],
                              sentence_tokenize(clean_text(vietnamese_tourism_text)),
                              "PhoBERT: Tourism Article Similarity Matrix")

        plot_sentence_scores(phobert_tourism["ranked_sentences"],
                            sentence_tokenize(clean_text(vietnamese_tourism_text)),
                            "PhoBERT: Tourism Article Sentence Scores")