# Comprehensive Summarization Evaluation - Google Colab
## PEGASUS vs PRIMERA on Multi-News Dataset
### Full Metric Suite for Top-Tier Publication: ROUGE, BERTScore, BARTScore, SummaC, FactCC, UniEval, AlignScore, QAGS, QAFactEval

**⚡ IMPORTANT: Set Runtime to GPU** (Runtime → Change runtime type → T4 GPU or better)

In [None]:
# Check GPU: list the NVIDIA devices visible to this runtime.
# If this command fails, switch the runtime type to GPU before continuing.
!nvidia-smi

In [None]:
# Install core packages: dataset/model libraries, the pip-installable metrics
# (ROUGE, BERTScore, SummaC) and the plotting/statistics stack used below.
!pip install -q datasets transformers torch accelerate evaluate rouge-score bert-score pandas matplotlib seaborn scipy summac

In [None]:
# Clone metric repositories
!rm -rf BARTScore UniEval qags QAFactEval AlignScore
!git clone -q https://github.com/neulab/BARTScore.git
!git clone -q https://github.com/maszhongming/UniEval.git
!git clone -q https://github.com/W4ngatang/qags.git
!git clone -q https://github.com/salesforce/QAFactEval.git
!git clone -q https://github.com/yuh-zha/AlignScore.git

# Install additional dependencies
!pip install -q -r /content/UniEval/requirements.txt
!pip install -q spacy nltk questeval
!python -m spacy download en_core_web_sm

# Download AlignScore checkpoint
!wget -q https://huggingface.co/yzha/AlignScore/resolve/main/AlignScore-base.ckpt -O /content/AlignScore-base.ckpt

# Add to path
import sys
sys.path.insert(0, '/content/BARTScore')
sys.path.insert(0, '/content/UniEval')
sys.path.insert(0, '/content/qags')
sys.path.insert(0, '/content/QAFactEval')
sys.path.insert(0, '/content/AlignScore')

print("✓ All metrics installed")

In [None]:
# Imports
import os
import torch
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, AutoModelForSequenceClassification
import warnings
# Silence every Python warning for the rest of the notebook session.
warnings.filterwarnings('ignore')

# Report the PyTorch build and whether a CUDA device can be used.
_cuda_ok = torch.cuda.is_available()
print(f"PyTorch: {torch.__version__}")
print(f"CUDA: {_cuda_ok}")
if _cuda_ok:
    # Device 0 is the GPU the rest of the notebook runs on.
    print(f"GPU: {torch.cuda.get_device_name(0)}")

In [None]:
class SummarizationEvaluator:
    """Summarize Multi-News test documents with several models and score them.

    On construction the evaluator loads the first ``num_samples`` items of the
    Multi-News test split and tries to initialise every metric. A metric whose
    setup fails is set to ``None`` and its ``evaluate_*`` method then returns
    ``None`` values instead of raising, so one broken dependency does not
    abort the whole run.

    All metric values are returned as built-in ``float`` (or ``None``) so the
    result dictionaries can be serialised with ``json`` directly.

    NOTE(review): the ``qags_model`` and ``run_model`` modules imported in
    ``setup_metrics`` are expected to come from the cloned QAGS / QAFactEval
    repositories - confirm those modules and classes actually exist there.
    """

    def __init__(self, num_samples=100):
        """Load the data subset, register the models and set up the metrics.

        Args:
            num_samples: Maximum number of test items to evaluate; capped at
                the size of the test split.
        """
        self.num_samples = num_samples
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        print(f"Device: {self.device}")

        print("Loading Multi-News...")
        self.dataset = load_dataset("Awesome075/multi_news_parquet", split='test')
        self.test_data = self.dataset.select(range(min(num_samples, len(self.dataset))))
        print(f"Loaded {len(self.test_data)} samples")

        # Display name -> Hugging Face checkpoint of each system under test.
        self.models = {
            'PEGASUS': 'google/pegasus-multi_news',
            'PRIMERA': 'allenai/PRIMERA'
        }

        self.setup_metrics()

    @staticmethod
    def _mean(values):
        """Return the mean of *values* as a built-in float, or None if empty.

        ``np.mean`` can return ``np.float32`` (not JSON serialisable) and
        yields NaN with a warning on an empty sequence; this helper avoids both.
        """
        values = list(values)
        if not values:
            return None
        return float(np.mean(values))

    @staticmethod
    def _report_failure(name, err):
        """Print why an optional metric could not be initialised or computed."""
        print(f"✗ {name} failed: {type(err).__name__}: {err}")

    def setup_metrics(self):
        """Initialise every metric; optional ones degrade to ``None`` on failure.

        ROUGE and BERTScore are required (their errors propagate). All other
        metrics are wrapped in ``try``/``except Exception`` so that a missing
        repository or checkpoint is reported with its reason and skipped,
        while ``KeyboardInterrupt`` still interrupts the cell.
        """
        from evaluate import load as eval_load
        from bert_score import BERTScorer

        self.rouge = eval_load('rouge')
        self.bert_scorer = BERTScorer(lang="en", rescale_with_baseline=True, device=self.device)
        print("✓ ROUGE, BERTScore")

        try:
            from bart_score import BARTScorer
            self.bart_scorer = BARTScorer(device=self.device, checkpoint='facebook/bart-large-cnn')
            print("✓ BARTScore")
        except Exception as err:
            self.bart_scorer = None
            self._report_failure("BARTScore", err)

        try:
            from summac.model_summac import SummaCZS
            self.summac_model = SummaCZS(granularity="sentence", model_name="vitc", device=self.device)
            print("✓ SummaC")
        except Exception as err:
            self.summac_model = None
            self._report_failure("SummaC", err)

        try:
            self.factcc_tokenizer = AutoTokenizer.from_pretrained("manueldeprada/FactCC")
            self.factcc_model = AutoModelForSequenceClassification.from_pretrained("manueldeprada/FactCC")
            self.factcc_model.to(self.device).eval()
            print("✓ FactCC")
        except Exception as err:
            self.factcc_model = None
            self._report_failure("FactCC", err)

        try:
            from metric.evaluator import get_evaluator
            # Pass the device explicitly; the evaluator otherwise defaults to
            # 'cuda:0', which cannot work on a CPU-only runtime.
            self.unieval = get_evaluator('summarization', device=str(self.device))
            print("✓ UniEval")
        except Exception as err:
            self.unieval = None
            self._report_failure("UniEval", err)

        try:
            from alignscore import AlignScore
            self.alignscore = AlignScore(model='roberta-base', batch_size=32, device=self.device,
                                        ckpt_path='/content/AlignScore-base.ckpt', evaluation_mode='nli_sp')
            print("✓ AlignScore")
        except Exception as err:
            self.alignscore = None
            self._report_failure("AlignScore", err)

        try:
            from qags_model import QAGSModel
            self.qags_model = QAGSModel()
            print("✓ QAGS")
        except Exception as err:
            self.qags_model = None
            self._report_failure("QAGS", err)

        try:
            from run_model import QAFactEval as QAFactEvalModel
            self.qafacteval = QAFactEvalModel()
            print("✓ QAFactEval")
        except Exception as err:
            self.qafacteval = None
            self._report_failure("QAFactEval", err)

    def generate_summaries(self, model_name, model_path, max_input_length=1024):
        """Generate one summary per test item with the given checkpoint.

        Args:
            model_name: Display name used in the progress message.
            model_path: Hugging Face checkpoint to load.
            max_input_length: Token limit applied when truncating each source
                document (default 1024, the original hard-coded value).

        Returns:
            List of decoded summaries, in the order of ``self.test_data``.

        NOTE(review): generation uses the checkpoint's default generation
        settings, and documents are passed to the tokenizer unchanged -
        confirm this matches the input format each model was trained with.
        """
        print(f"\nGenerating {model_name} summaries...")
        tokenizer = AutoTokenizer.from_pretrained(model_path)
        model = AutoModelForSeq2SeqLM.from_pretrained(model_path).to(self.device).eval()

        summaries = []
        try:
            with torch.no_grad():
                for item in tqdm(self.test_data):
                    inputs = tokenizer(item['document'], max_length=max_input_length,
                                       truncation=True, return_tensors='pt').to(self.device)
                    # Forward the whole encoding so the attention mask
                    # accompanies the input ids.
                    summary_ids = model.generate(**inputs)
                    summaries.append(tokenizer.decode(summary_ids[0], skip_special_tokens=True))
        finally:
            # Release the model even if generation fails, so the next model
            # (or the metric models) can use the GPU memory.
            del model
            torch.cuda.empty_cache()
        return summaries

    def evaluate_rouge(self, predictions, references):
        """Return ROUGE-1/2/L of *predictions* against *references*."""
        r = self.rouge.compute(predictions=predictions, references=references)
        return {key: float(r[key]) for key in ('rouge1', 'rouge2', 'rougeL')}

    def evaluate_bertscore(self, predictions, references):
        """Return mean BERTScore precision, recall and F1."""
        P, R, F1 = self.bert_scorer.score(predictions, references)
        return {'bertscore_p': P.mean().item(), 'bertscore_r': R.mean().item(), 'bertscore_f1': F1.mean().item()}

    def evaluate_bartscore(self, predictions, references, sources):
        """Return mean BARTScore of the predictions given references and given sources."""
        if self.bart_scorer is None:
            return {'bartscore_ref': None, 'bartscore_src': None}
        return {'bartscore_ref': self._mean(self.bart_scorer.score(references, predictions)),
                'bartscore_src': self._mean(self.bart_scorer.score(sources, predictions))}

    def evaluate_summac(self, predictions, sources):
        """Return the mean SummaC score, scoring one (source, summary) pair at a time."""
        if self.summac_model is None:
            return {'summac': None}
        scores = [self.summac_model.score([src], [pred])['scores'][0]
                  for pred, src in zip(predictions, sources)]
        return {'summac': self._mean(scores)}

    def evaluate_factcc(self, predictions, sources):
        """Return the mean FactCC probability that a summary is consistent.

        The index of the CORRECT class is read from the model config; FactCC
        orders its labels (CORRECT, INCORRECT), so index 0 is the fallback.
        Reading index 1 would report the probability of being INCORRECT.
        """
        if self.factcc_model is None:
            return {'factcc': None}

        label2id = getattr(getattr(self.factcc_model, 'config', None), 'label2id', None) or {}
        correct_idx = next(
            (idx for label, idx in label2id.items() if str(label).upper() == 'CORRECT'), 0)

        scores = []
        for pred, src in tqdm(zip(predictions, sources), desc="FactCC", total=len(predictions)):
            inputs = self.factcc_tokenizer(src, pred, max_length=512, truncation=True,
                                           return_tensors='pt').to(self.device)
            with torch.no_grad():
                logits = self.factcc_model(**inputs).logits
                scores.append(torch.softmax(logits, dim=1)[0][correct_idx].item())
        return {'factcc': self._mean(scores)}

    def evaluate_unieval(self, predictions, references, sources):
        """Return the mean UniEval score for each dimension plus the overall score."""
        dims = ['coherence', 'consistency', 'fluency', 'relevance']
        keys = dims + ['overall']
        if self.unieval is None:
            return {f'unieval_{k}': None for k in keys}
        data = [{'source': s, 'reference': r, 'system_output': p}
                for s, r, p in zip(sources, references, predictions)]
        res = self.unieval.evaluate(data, dims=dims, overall=True)
        return {f'unieval_{k}': self._mean([row[k] for row in res]) for k in keys}

    def evaluate_alignscore(self, predictions, sources):
        """Return the mean AlignScore of each prediction against its source."""
        if self.alignscore is None:
            return {'alignscore': None}
        return {'alignscore': self._mean(self.alignscore.score(contexts=sources, claims=predictions))}

    def evaluate_qags(self, predictions, sources):
        """Return the mean QAGS score over the pairs that could be scored.

        Pairs whose scoring raises are skipped (best effort), but the number
        of skipped pairs and the last error are reported instead of being
        dropped silently.
        """
        if self.qags_model is None:
            return {'qags': None}
        scores = []
        failures = 0
        last_error = None
        for pred, src in tqdm(zip(predictions, sources), desc="QAGS", total=len(predictions)):
            try:
                scores.append(self.qags_model.score(source=src, summary=pred))
            except Exception as err:
                failures += 1
                last_error = err
        if failures:
            print(f"QAGS: skipped {failures} of {len(predictions)} samples "
                  f"(last error: {type(last_error).__name__}: {last_error})")
        return {'qags': self._mean(scores)}

    def evaluate_qafacteval(self, predictions, sources):
        """Return the mean QAFactEval score, or None if scoring fails."""
        if self.qafacteval is None:
            return {'qafacteval': None}
        try:
            batch_scores = self.qafacteval.score_batch_qafacteval(sources=sources, summaries=predictions)
            return {'qafacteval': self._mean(batch_scores)}
        except Exception as err:
            self._report_failure("QAFactEval", err)
            return {'qafacteval': None}

    def run_evaluation(self):
        """Generate and score summaries for every registered model.

        Returns:
            Dict mapping model name to a dict of metric name -> float or None.
        """
        # References and sources are the same for every model; build them once.
        references = [item['summary'] for item in self.test_data]
        sources = [item['document'] for item in self.test_data]

        all_results = {}
        for model_name, model_path in self.models.items():
            print(f"\n{'='*60}\n{model_name}\n{'='*60}")
            predictions = self.generate_summaries(model_name, model_path)

            results = {}
            results.update(self.evaluate_rouge(predictions, references))
            results.update(self.evaluate_bertscore(predictions, references))
            results.update(self.evaluate_bartscore(predictions, references, sources))
            results.update(self.evaluate_summac(predictions, sources))
            results.update(self.evaluate_factcc(predictions, sources))
            results.update(self.evaluate_unieval(predictions, references, sources))
            results.update(self.evaluate_alignscore(predictions, sources))
            results.update(self.evaluate_qags(predictions, sources))
            results.update(self.evaluate_qafacteval(predictions, sources))

            all_results[model_name] = results
            for metric, value in results.items():
                shown = 'None' if value is None else f'{value:.4f}'
                print(f"{metric:30s}: {shown}")
        return all_results

    def save_and_display(self, results):
        """Write results as JSON, CSV and LaTeX, print the table, and download it.

        Files ``results.json``, ``results.csv`` and ``results.tex`` are written
        to the working directory. The browser download is only attempted when
        ``google.colab`` is importable, so results are still saved elsewhere.

        Args:
            results: Mapping as returned by ``run_evaluation``.

        Returns:
            DataFrame with one row per model and one column per metric,
            rounded to four decimals; unavailable metrics are NaN.
        """
        import json
        try:
            from google.colab import files
        except ImportError:
            files = None

        # JSON
        clean = {k: {m: (v if v is not None else "N/A") for m, v in r.items()} for k, r in results.items()}
        with open('results.json', 'w') as f:
            json.dump(clean, f, indent=2)

        # DataFrame - coerce to numeric so missing metrics become NaN and
        # round() applies to every column.
        df = pd.DataFrame(results).T.apply(pd.to_numeric, errors='coerce').round(4)
        df.to_csv('results.csv')
        print("\n" + "="*80 + "\nRESULTS\n" + "="*80)
        print(df.to_string())

        # LaTeX
        with open('results.tex', 'w') as f:
            f.write(df.to_latex(float_format="%.4f", na_rep="N/A"))

        # Download
        if files is not None:
            for filename in ('results.json', 'results.csv', 'results.tex'):
                files.download(filename)
        else:
            print("google.colab not available - skipping browser download")

        return df

print("✓ Evaluator ready")

## Quick Test (5 samples)

In [None]:
# Smoke test: run the complete pipeline (load data, set up metrics, generate,
# score, save) on 5 samples to surface setup problems before the longer run.
test_eval = SummarizationEvaluator(num_samples=5)
test_results = test_eval.run_evaluation()
# Writes results.json / results.csv / results.tex and returns the results table.
test_eval.save_and_display(test_results)

## Full Evaluation (100 samples)

In [None]:
# Full run on 100 test samples.
evaluator = SummarizationEvaluator(num_samples=100)
results = evaluator.run_evaluation()
# `df` (one row per model, one column per metric) is reused by the
# visualization cell below.
df = evaluator.save_and_display(results)

## Visualization

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from google.colab import files

sns.set_style("whitegrid")

# Keep only metrics that have a numeric value for every model. Coercing first
# turns any non-numeric placeholder into NaN so the column is dropped rather
# than breaking the plot.
plot_df = df.apply(pd.to_numeric, errors='coerce').dropna(axis=1)

if len(plot_df.columns) > 0:
    # Create the axes explicitly and hand it to pandas. DataFrame.plot opens
    # its own figure when no `ax` is given, which would leave a separate
    # plt.figure(figsize=...) empty and ignore the requested size.
    fig, ax = plt.subplots(figsize=(16, 8))
    # Transpose so metrics are on the x-axis and each model is one bar series.
    plot_df.T.plot(kind='bar', width=0.75, edgecolor='black', ax=ax)
    ax.set_title('PEGASUS vs PRIMERA', fontsize=16, fontweight='bold')
    ax.set_xlabel('Metrics', fontsize=12)
    ax.set_ylabel('Score', fontsize=12)
    plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
    ax.legend(title='Models')
    fig.tight_layout()
    fig.savefig('comparison.png', dpi=300)
    plt.show()
    files.download('comparison.png')