# Comprehensive Summarization Evaluation
## Evaluating PEGASUS and PRIMERA on Multi-News Dataset
### For Publication in Top-Tier Journals

## 1. Install Required Packages
Run this cell first to install all dependencies.

In [None]:
!pip install datasets transformers torch evaluate rouge-score bert-score -q
!pip install summac -q
!pip install git+https://github.com/neulab/BARTScore.git -q
!pip install git+https://github.com/maszhongming/UniEval.git -q
!pip install git+https://github.com/W4ngatang/qags.git -q
!pip install factcc -q
!pip install qafacteval -q
!pip install alignscore -q

## 2. Download AlignScore Checkpoint

In [None]:
!wget -q https://huggingface.co/yzha/AlignScore/resolve/main/AlignScore-base.ckpt

## 3. Import Libraries

In [None]:
import os
import torch
import numpy as np
import pandas as pd
from tqdm import tqdm
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import warnings
warnings.filterwarnings('ignore')

# Report the PyTorch build and, when a GPU is visible, which device will be used.
cuda_ready = torch.cuda.is_available()
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {cuda_ready}")
if cuda_ready:
    print(f"CUDA device: {torch.cuda.get_device_name(0)}")

## 4. Define Evaluation Class

In [None]:
class SummarizationEvaluator:
    def __init__(self, num_samples=100):
        """Load the evaluation subset, register the models, and build the metrics.

        Args:
            num_samples: Upper bound on how many test examples to evaluate.
                The first ``num_samples`` rows of the test split are used
                (fewer if the split is smaller).
        """
        self.num_samples = num_samples

        # Prefer the GPU when one is visible, otherwise fall back to the CPU.
        if torch.cuda.is_available():
            backend = 'cuda'
        else:
            backend = 'cpu'
        self.device = torch.device(backend)
        print(f"Using device: {self.device}")

        # Fetch the test split and keep only its leading rows.
        print("Loading Multi-News dataset...")
        self.dataset = load_dataset("Awesome075/multi_news_parquet", split='test')
        subset_size = min(num_samples, len(self.dataset))
        self.test_data = self.dataset.select(range(subset_size))
        print(f"Loaded {len(self.test_data)} samples")

        # Display name -> checkpoint identifier for every system under test.
        self.models = dict(
            PEGASUS='google/pegasus-multi_news',
            PRIMERA='allenai/PRIMERA',
        )

        # Metric backends are built last.
        self.setup_metrics()
        
    def setup_metrics(self) -> None:
        """Instantiate every metric backend and store it on the evaluator.

        Each backend is imported inside this method, immediately before it is
        constructed, and a confirmation line is printed after each one loads.
        The resulting scorers are stored as attributes (``self.rouge``,
        ``self.bert_scorer``, ``self.bart_scorer``, ``self.summac_model``,
        ``self.factcc_model``, ``self.unieval``, ``self.alignscore``,
        ``self.qags_model``, ``self.qafacteval``) that the ``evaluate_*``
        methods read. A failure in any import or constructor propagates to
        the caller; nothing here is caught.
        """
        print("Setting up evaluation metrics...")

        # ROUGE (Rouge-1, Rouge-2, Rouge-L) via the `evaluate` metric loader
        from evaluate import load
        self.rouge = load('rouge')
        print("✓ ROUGE loaded")

        # BERTScore: English scorer with baseline rescaling enabled
        from bert_score import BERTScorer
        self.bert_scorer = BERTScorer(lang="en", rescale_with_baseline=True)
        print("✓ BERTScore loaded")

        # BARTScore backed by the facebook/bart-large-cnn checkpoint
        from bart_score import BARTScorer
        self.bart_scorer = BARTScorer(device=self.device, checkpoint='facebook/bart-large-cnn')
        print("✓ BARTScore loaded")

        # SummaC zero-shot variant, sentence granularity, "vitc" model
        from summac.model_summac import SummaCZS
        self.summac_model = SummaCZS(granularity="sentence", model_name="vitc", device=self.device)
        print("✓ SummaC loaded")

        # FactCC classifier: moved to the active device and switched to eval mode
        # NOTE(review): assumes a `factcc` package exposing `FactCCModel` with a
        # `.tokenizer` attribute (used by evaluate_factcc) — confirm it is installed.
        from factcc.modeling import FactCCModel
        self.factcc_model = FactCCModel.from_pretrained("manueldeprada/FactCC")
        self.factcc_model.to(self.device)
        self.factcc_model.eval()
        print("✓ FactCC loaded")

        # UniEval summarization evaluator
        # NOTE(review): `utils` and `metric.evaluator` are top-level module names;
        # presumably the UniEval repository root must be on sys.path — confirm.
        # `convert_to_json` is imported here but not used in this method.
        from utils import convert_to_json
        from metric.evaluator import get_evaluator
        self.unieval = get_evaluator('summarization')
        print("✓ UniEval loaded")

        # AlignScore: roberta-base backbone, weights read from the local
        # 'AlignScore-base.ckpt' file, evaluated in 'nli_sp' mode
        from alignscore import AlignScore
        self.alignscore = AlignScore(model='roberta-base', batch_size=32, device=self.device, 
                                     ckpt_path='AlignScore-base.ckpt', evaluation_mode='nli_sp')
        print("✓ AlignScore loaded")

        # QAGS, constructed with default arguments
        from qags.qags_model import QAGSModel
        self.qags_model = QAGSModel()
        print("✓ QAGS loaded")

        # QAFactEval, constructed with default arguments
        # NOTE(review): confirm this import path and the no-argument constructor
        # match the installed QAFactEval distribution.
        from QAFactEval.run_model import QAFactEval as QAFactEvalModel
        self.qafacteval = QAFactEvalModel()
        print("✓ QAFactEval loaded")

        print("\nAll metrics initialized successfully!")
        
    def generate_summaries(self, model_name, model_path):
        """Generate one summary per test sample with the given seq2seq checkpoint.

        The checkpoint is loaded, moved to ``self.device``, run over
        ``self.test_data`` one document at a time with the model's default
        generation settings, and then released again.

        Args:
            model_name: Display name used in log and progress-bar output.
            model_path: Identifier passed to ``from_pretrained`` for both the
                tokenizer and the model.

        Returns:
            list[str]: Decoded summaries, in the same order as ``self.test_data``.
        """
        print(f"\nGenerating summaries with {model_name}...")

        # Load model and tokenizer
        tokenizer = AutoTokenizer.from_pretrained(model_path)
        model = AutoModelForSeq2SeqLM.from_pretrained(model_path)
        model.to(self.device)
        model.eval()

        summaries = []

        try:
            with torch.no_grad():
                for item in tqdm(self.test_data, desc=f"Generating {model_name}"):
                    # Inputs longer than 1024 tokens are truncated.
                    # NOTE(review): the raw `document` string is passed through
                    # unchanged; confirm whether PRIMERA needs its own
                    # document-separator token / global attention here.
                    inputs = tokenizer(
                        item['document'],
                        max_length=1024,
                        truncation=True,
                        return_tensors='pt',
                    ).to(self.device)

                    # Vanilla generation (no decoding overrides). The attention
                    # mask is forwarded explicitly so generate() does not have
                    # to infer it from the input ids.
                    summary_ids = model.generate(
                        inputs['input_ids'],
                        attention_mask=inputs['attention_mask'],
                    )

                    summaries.append(
                        tokenizer.decode(summary_ids[0], skip_special_tokens=True)
                    )
        finally:
            # Release the model even when generation fails, so the next
            # checkpoint does not have to share device memory with this one.
            del model
            if torch.cuda.is_available():
                torch.cuda.empty_cache()

        return summaries
    
    def evaluate_rouge(self, predictions, references):
        """Evaluate ROUGE-1, ROUGE-2, ROUGE-L"""
        results = self.rouge.compute(predictions=predictions, references=references)
        return {
            'rouge1': results['rouge1'],
            'rouge2': results['rouge2'],
            'rougeL': results['rougeL']
        }
    
    def evaluate_bertscore(self, predictions, references):
        """Evaluate BERTScore"""
        P, R, F1 = self.bert_scorer.score(predictions, references)
        return {
            'bertscore_precision': P.mean().item(),
            'bertscore_recall': R.mean().item(),
            'bertscore_f1': F1.mean().item()
        }
    
    def evaluate_bartscore(self, predictions, references, sources):
        """Evaluate BARTScore"""
        # BARTScore ref-hyp (reference-based)
        ref_scores = self.bart_scorer.score(references, predictions)
        
        # BARTScore src-hyp (source-based faithfulness)
        src_scores = self.bart_scorer.score(sources, predictions)
        
        return {
            'bartscore_ref': np.mean(ref_scores),
            'bartscore_src': np.mean(src_scores)
        }
    
    def evaluate_summac(self, predictions, sources):
        """Evaluate SummaC (factual consistency)"""
        scores = []
        for pred, src in zip(predictions, sources):
            score = self.summac_model.score([src], [pred])
            scores.append(score['scores'][0])
        
        return {'summac': np.mean(scores)}
    
    def evaluate_factcc(self, predictions, sources):
        """Score each (source, prediction) pair with FactCC and average.

        The reported value is the mean probability the classifier assigns to
        its "CORRECT" class, so higher means more factually consistent.

        Args:
            predictions: Generated summaries.
            sources: Source documents, aligned with ``predictions``.

        Returns:
            dict: ``{'factcc': mean_probability_of_correct}``.
        """
        # Resolve the index of the "CORRECT" class from the model config
        # rather than hard-coding it. The FactCC label order is
        # CORRECT=0 / INCORRECT=1, so index 0 is also the fallback when the
        # wrapper exposes no label mapping.
        config = getattr(self.factcc_model, 'config', None)
        label2id = getattr(config, 'label2id', None) or {}
        correct_idx = label2id.get('CORRECT', 0)

        scores = []

        with torch.no_grad():
            for pred, src in tqdm(zip(predictions, sources), total=len(predictions), desc="FactCC"):
                # Encode the pair as (document, claim), capped at 512 tokens.
                # NOTE(review): truncation=True may also shorten a long
                # summary, not only the source — confirm this is acceptable.
                inputs = self.factcc_model.tokenizer(
                    src, pred, max_length=512, truncation=True,
                    return_tensors='pt'
                ).to(self.device)

                logits = self.factcc_model(**inputs).logits
                prob = torch.softmax(logits, dim=1)[0][correct_idx].item()
                scores.append(prob)

        return {'factcc': np.mean(scores)}
    
    def evaluate_unieval(self, predictions, references, sources):
        """Evaluate UniEval (coherence, consistency, fluency, relevance)"""
        # Prepare data in UniEval format
        data = []
        for src, ref, pred in zip(sources, references, predictions):
            data.append({
                'source': src,
                'reference': ref,
                'system_output': pred
            })
        
        # Evaluate all dimensions
        eval_results = self.unieval.evaluate(data, dims=['coherence', 'consistency', 
                                                         'fluency', 'relevance'], 
                                            overall=True)
        
        return {
            'unieval_coherence': np.mean([r['coherence'] for r in eval_results]),
            'unieval_consistency': np.mean([r['consistency'] for r in eval_results]),
            'unieval_fluency': np.mean([r['fluency'] for r in eval_results]),
            'unieval_relevance': np.mean([r['relevance'] for r in eval_results]),
            'unieval_overall': np.mean([r['overall'] for r in eval_results])
        }
    
    def evaluate_alignscore(self, predictions, sources):
        """Evaluate AlignScore (factual consistency)"""
        scores = self.alignscore.score(contexts=sources, claims=predictions)
        return {'alignscore': np.mean(scores)}
    
    def evaluate_qags(self, predictions, sources):
        """Score each (source, prediction) pair with QAGS and average.

        Returns:
            dict: ``{'qags': mean_score}``.
        """
        pairs = zip(predictions, sources)
        progress = tqdm(pairs, total=len(predictions), desc="QAGS")
        # One scorer call per pair, in input order.
        per_sample = [
            self.qags_model.score(source=document, summary=summary)
            for summary, document in progress
        ]
        return {'qags': np.mean(per_sample)}
    
    def evaluate_qafacteval(self, predictions, sources):
        """Evaluate QAFactEval"""
        results = self.qafacteval.score_batch_qafacteval(
            sources=sources,
            summaries=predictions
        )
        
        return {'qafacteval': np.mean(results)}
    
    def run_evaluation(self):
        """Run complete evaluation pipeline"""
        all_results = {}
        
        for model_name, model_path in self.models.items():
            print(f"\n{'='*60}")
            print(f"Evaluating {model_name}")
            print(f"{'='*60}")
            
            # Generate summaries
            predictions = self.generate_summaries(model_name, model_path)
            
            # Extract references and sources
            references = [item['summary'] for item in self.test_data]
            sources = [item['document'] for item in self.test_data]
            
            # Run all metrics
            results = {}
            
            print("\nEvaluating ROUGE...")
            results.update(self.evaluate_rouge(predictions, references))
            
            print("Evaluating BERTScore...")
            results.update(self.evaluate_bertscore(predictions, references))
            
            print("Evaluating BARTScore...")
            results.update(self.evaluate_bartscore(predictions, references, sources))
            
            print("Evaluating SummaC...")
            results.update(self.evaluate_summac(predictions, sources))
            
            print("Evaluating FactCC...")
            results.update(self.evaluate_factcc(predictions, sources))
            
            print("Evaluating UniEval...")
            results.update(self.evaluate_unieval(predictions, references, sources))
            
            print("Evaluating AlignScore...")
            results.update(self.evaluate_alignscore(predictions, sources))
            
            print("Evaluating QAGS...")
            results.update(self.evaluate_qags(predictions, sources))
            
            print("Evaluating QAFactEval...")
            results.update(self.evaluate_qafacteval(predictions, sources))
            
            all_results[model_name] = results
            
            # Display results for this model
            print(f"\n{model_name} Results:")
            for metric, value in results.items():
                print(f"  {metric}: {value:.4f}")
        
        return all_results
    
    def save_results(self, results, filename):
        """Save results to JSON file"""
        import json
        with open(filename, 'w') as f:
            json.dump(results, f, indent=2)
        print(f"\nResults saved to {filename}")
    
    def create_results_table(self, results):
        """Create formatted results table for publication"""
        df = pd.DataFrame(results).T
        
        # Reorder columns for better presentation
        metric_order = [
            'rouge1', 'rouge2', 'rougeL',
            'bertscore_precision', 'bertscore_recall', 'bertscore_f1',
            'bartscore_ref', 'bartscore_src',
            'summac', 'factcc', 'alignscore',
            'unieval_coherence', 'unieval_consistency', 'unieval_fluency', 
            'unieval_relevance', 'unieval_overall',
            'qags', 'qafacteval'
        ]
        
        df = df[[col for col in metric_order if col in df.columns]]
        
        # Round values for publication
        df = df.round(4)
        
        # Save as CSV
        df.to_csv('results_table.csv')
        
        # Print formatted table
        print("\n" + "="*80)
        print("FINAL RESULTS TABLE")
        print("="*80)
        print(df.to_string())
        print("\nTable saved to results_table.csv")
        
        # Create LaTeX table for publication
        latex_table = df.to_latex(float_format="%.4f")
        with open('results_table.tex', 'w') as f:
            f.write(latex_table)
        print("LaTeX table saved to results_table.tex")
        
        return df

## 5. Initialize Evaluator
This loads the first 100 test samples of the dataset and initializes all metrics.

In [None]:
# Build the evaluator on (at most) the first 100 test samples; constructing it
# also loads the dataset and initializes every metric backend.
evaluator = SummarizationEvaluator(num_samples=100)

## 6. Run Complete Evaluation
This will evaluate both PEGASUS and PRIMERA on all metrics. This may take several hours to complete.

In [None]:
# Generate summaries with every registered model and score them on all metrics.
# `results` maps each model name to a dict of metric name -> score.
results = evaluator.run_evaluation()

## 7. Save and Display Results

In [None]:
# Save the raw scores as JSON
evaluator.save_results(results, 'final_results.json')

# Build the model-by-metric table; this also prints it and writes
# results_table.csv and results_table.tex
results_df = evaluator.create_results_table(results)

## 8. Display Individual Model Results

In [None]:
# Print every PEGASUS score, one metric per line, in a fixed-width name column
print("PEGASUS Results:")
print("=" * 60)
pegasus_scores = results['PEGASUS']
for metric, value in pegasus_scores.items():
    print(f"{metric:30s}: {value:.4f}")

In [None]:
# Print every PRIMERA score, one metric per line, in a fixed-width name column
print("PRIMERA Results:")
print("=" * 60)
primera_scores = results['PRIMERA']
for metric, value in primera_scores.items():
    print(f"{metric:30s}: {value:.4f}")

## 9. Visualize Results (Optional)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Global plot styling
sns.set_style("whitegrid")
plt.rcParams['figure.figsize'] = (14, 8)

# Transpose so metrics run along the x-axis and each model is one bar series
results_df_T = results_df.T
ax = results_df_T.plot(kind='bar', width=0.8)

# Titles and axis labels
ax.set_title('Model Comparison Across All Metrics', fontsize=16, fontweight='bold')
ax.set_xlabel('Metrics', fontsize=12)
ax.set_ylabel('Score', fontsize=12)

# Slanted tick labels keep the metric names readable
plt.xticks(rotation=45, ha='right')
ax.legend(title='Models', fontsize=10)

# Lay out, write the figure to disk, then display it
plt.tight_layout()
plt.savefig('model_comparison.png', dpi=300, bbox_inches='tight')
plt.show()

print("\nVisualization saved to model_comparison.png")

## 10. Statistical Significance Testing (Optional)

In [None]:
# Placeholder for significance testing (paired t-test or Wilcoxon signed-rank).
# Both tests need one score per sample for each model; every evaluate_* method
# above returns only the mean over samples, so per-sample scores would have to
# be collected separately before this cell can do anything useful.

from scipy import stats

# Example: paired t-test on per-sample ROUGE-1 scores of the two models
# pegasus_rouge1_scores = [...]
# primera_rouge1_scores = [...]
# t_stat, p_value = stats.ttest_rel(pegasus_rouge1_scores, primera_rouge1_scores)
# print(f"T-statistic: {t_stat:.4f}, P-value: {p_value:.4f}")

print("Statistical testing can be added here with per-sample scores")

## Summary

This notebook evaluates PEGASUS and PRIMERA models on the Multi-News dataset using:

**Reference-based Metrics:**
- ROUGE-1, ROUGE-2, ROUGE-L
- BERTScore (Precision, Recall, F1)
- BARTScore (Reference-based)

**Factual Consistency Metrics:**
- BARTScore (Source-based)
- SummaC
- FactCC
- AlignScore
- QAGS
- QAFactEval

**Multi-dimensional Metrics:**
- UniEval (Coherence, Consistency, Fluency, Relevance, Overall)

All results are saved in publication-ready formats (CSV and LaTeX), and the raw scores are additionally written to `final_results.json`.