# FactSum: Publication-Ready Evaluation

This notebook runs **complete factuality evaluation** for top-tier journal submission.

## Features
- ✅ **500+ samples** (configurable)
- ✅ **8 factuality metrics** (including QAFactEval)
- ✅ **3 baseline models** (PEGASUS, PRIMERA, Hierarchical)
- ✅ **GPU acceleration**
- ✅ **Results saved to Google Drive**

**Runtime**: ~2-4 hours depending on sample size and GPU availability

## 1. Setup & Installation

In [None]:
# Check GPU: run the `nvidia-smi` shell command and show its output in the cell.
# The leading `!` is notebook shell-escape syntax, so this line is not plain Python.
!nvidia-smi

In [None]:
# Install dependencies
# Each `!pip` line is a notebook shell escape; `-q` keeps pip's output quiet.
# Hugging Face stack plus the ROUGE / BERTScore / sentence-transformers packages.
!pip install -q transformers datasets evaluate rouge-score bert-score sentence-transformers
# NOTE(review): hosted GPU runtimes often ship with torch preinstalled — confirm
# this reinstall is needed and does not replace a CUDA-matched build.
!pip install -q torch torchvision torchaudio
# General utilities: sentence tokenization, tabular output, arrays, progress bars.
!pip install -q nltk pandas numpy tqdm

In [None]:
# Install QAFactEval (ONLY WORKS ON LINUX x86)
# NOTE(review): the metric cell constructs QAFactEval without any model paths —
# confirm whether the package also needs separately downloaded checkpoints
# before it can be instantiated.
!pip install qafacteval

In [None]:
# Mount Google Drive for saving results
# `google.colab` is imported here, so this cell can only run where that package exists.
from google.colab import drive
drive.mount('/content/drive')

# Create output directory
# OUTPUT_DIR is the destination for every CSV / JSON / LaTeX file written by later cells.
import os
OUTPUT_DIR = '/content/drive/MyDrive/FactSum_Results'
# exist_ok=True makes re-running this cell safe when the folder already exists.
os.makedirs(OUTPUT_DIR, exist_ok=True)
print(f"Results will be saved to: {OUTPUT_DIR}")

In [None]:
# Download the NLTK tokenizer resources (presumably needed by the
# `sent_tokenize` calls in the metric cells).
import nltk

for _nltk_resource in ('punkt', 'punkt_tab'):
    nltk.download(_nltk_resource)

## 2. Configuration

In [None]:
# ===== CONFIGURATION =====
# Adjust these for your needs

# Upper bound on the number of test examples drawn by the dataset cell.
NUM_SAMPLES = 500  # Recommended: 500 for short papers, 1000+ for main venues
# Seed passed to np.random.seed before sampling, so the subset is reproducible.
RANDOM_SEED = 42
# NOTE(review): BATCH_SIZE is not read by any cell visible here — confirm it is still needed.
BATCH_SIZE = 4

# Models to evaluate
# NOTE(review): the evaluation cells hard-code these checkpoint ids rather than
# reading MODELS, so editing this dict alone does not change what is evaluated.
MODELS = {
    'PEGASUS': 'google/pegasus-multi_news',
    'PRIMERA': 'allenai/PRIMERA',
    # Add your hierarchical model here if you upload it
}

# Metrics to use
# Each flag gates the matching "4.x" cell; a metric whose flag is False is never
# added to the `metrics` registry and therefore never scored.
USE_QAFACTEVAL = True  # Original Salesforce implementation
USE_SUMMAC = True
USE_FACTCC = True
USE_BARTSCORE = True
USE_BERTSCORE = True
USE_UNIEVAL = True
USE_ALIGNSCORE = True
USE_QAGS = True

## 3. Load Dataset

In [None]:
import numpy as np
from datasets import load_dataset

# Load the Multi-News test split and report its size.
# NOTE(review): confirm the "multi_news" dataset id still resolves with the
# installed `datasets` version.
print("Loading Multi-News dataset...")
dataset = load_dataset("multi_news", split="test")
total_examples = len(dataset)
print(f"Full test set size: {total_examples}")

# Sample: seed the global NumPy RNG, then draw distinct row indices (no
# replacement), capped at the size of the split.
np.random.seed(RANDOM_SEED)
subset_size = min(NUM_SAMPLES, total_examples)
indices = np.random.choice(total_examples, subset_size, replace=False)
samples = []
for row_index in indices:
    # NumPy integers are cast to plain int before indexing the dataset.
    samples.append(dataset[int(row_index)])
print(f"Selected {len(samples)} samples for evaluation")

## 4. Initialize Metrics

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoModelForSeq2SeqLM
from transformers import BartTokenizer, BartForConditionalGeneration
from sentence_transformers import CrossEncoder
import logging

# Notebook-wide logger; the root logger is configured to emit INFO and above.
logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)

# Prefer the GPU whenever torch reports CUDA as available.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print(f"Using device: {device}")

# Registry filled by the metric cells: metric name -> callable(source, summary).
metrics = dict()

In [None]:
# 4.1 QAFactEval (Original Salesforce)
if USE_QAFACTEVAL:
    print("Loading QAFactEval (Original)...")
    from qafacteval import QAFactEval

    # Device index handed to the metric: 0 with CUDA, otherwise -1
    # (presumably "run on CPU" — confirm against the package documentation).
    if torch.cuda.is_available():
        _qafe_cuda_index = 0
    else:
        _qafe_cuda_index = -1

    # NOTE(review): no model paths are supplied here — confirm this constructor
    # call matches the installed qafacteval API.
    qafacteval = QAFactEval(
        cuda_device=_qafe_cuda_index,
        use_lerc_quip=True,
        verbose=False
    )

    def score_qafacteval(source, summary):
        """Score one (source, summary) pair with QAFactEval.

        The pair is wrapped into a single-element batch and the ``'qa_f1'``
        entry of the first result is returned.
        NOTE(review): verify ``score_batch`` / ``'qa_f1'`` against the
        installed package; the result layout is not visible from this file.
        """
        batch_result = qafacteval.score_batch([source], [[summary]], return_qa_pairs=False)
        first_entry = batch_result[0]
        return first_entry['qa_f1']

    metrics['QAFactEval'] = score_qafacteval
    print("‚úì QAFactEval loaded")

In [None]:
# 4.2 SummaC-ZS
if USE_SUMMAC:
    print("Loading SummaC-ZS...")
    _SUMMAC_CHECKPOINT = "FacebookAI/roberta-large-mnli"
    summac_tokenizer = AutoTokenizer.from_pretrained(_SUMMAC_CHECKPOINT)
    summac_model = AutoModelForSequenceClassification.from_pretrained(_SUMMAC_CHECKPOINT)
    summac_model = summac_model.to(device)
    summac_model.eval()

    def score_summac(source, summary):
        """Return the mean entailment probability over the summary's sentences.

        Each summary sentence is paired with the first 1024 *characters* of
        the source and classified by the NLI model; class index 2 is read as
        "entailment". A summary with no sentences scores 0.0.
        """
        from nltk.tokenize import sent_tokenize

        premise = source[:1024]
        entailment_probs = []
        with torch.no_grad():
            for hypothesis in sent_tokenize(summary):
                encoded = summac_tokenizer(
                    premise, hypothesis,
                    return_tensors="pt", truncation=True, max_length=512,
                ).to(device)
                class_probs = torch.softmax(summac_model(**encoded).logits, dim=-1)
                entailment_probs.append(class_probs[0][2].item())  # Entailment
        if not entailment_probs:
            return 0.0
        return float(np.mean(entailment_probs))

    metrics['SummaC'] = score_summac
    print("‚úì SummaC loaded")

In [None]:
# 4.3 FactCC
if USE_FACTCC:
    print("Loading FactCC...")
    factcc_tokenizer = AutoTokenizer.from_pretrained("manueldeprada/FactCC")
    factcc_model = AutoModelForSequenceClassification.from_pretrained("manueldeprada/FactCC").to(device)
    factcc_model.eval()

    # Resolve the "consistent" class by label name instead of hard-coding an
    # index. FactCC's published label order is CORRECT=0 / INCORRECT=1, so
    # reading index 1 would report the probability of the summary being
    # *inconsistent*. Index 0 is the fallback if the config lacks named labels.
    _factcc_label_to_idx = {
        str(label).upper(): int(idx)
        for idx, label in (factcc_model.config.id2label or {}).items()
    }
    factcc_consistent_idx = _factcc_label_to_idx.get("CORRECT", 0)

    def score_factcc(source, summary):
        """Return the mean probability that each summary sentence is consistent.

        Each sentence of ``summary`` is paired with the first 1024 *characters*
        of ``source`` and classified by FactCC. The probability of the
        "CORRECT" class is averaged over sentences. A summary with no
        sentences scores 0.0.
        """
        from nltk.tokenize import sent_tokenize
        sentences = sent_tokenize(summary)
        if not sentences:
            return 0.0
        probs = []
        for sent in sentences:
            inputs = factcc_tokenizer(source[:1024], sent, return_tensors="pt", truncation=True, max_length=512).to(device)
            with torch.no_grad():
                outputs = factcc_model(**inputs)
                p = torch.softmax(outputs.logits, dim=-1)
                probs.append(p[0][factcc_consistent_idx].item())  # Consistent
        return float(np.mean(probs))

    metrics['FactCC'] = score_factcc
    print("‚úì FactCC loaded")

In [None]:
# 4.4 BARTScore
if USE_BARTSCORE:
    print("Loading BARTScore...")
    bart_tokenizer = BartTokenizer.from_pretrained('facebook/bart-large-cnn')
    bart_model = BartForConditionalGeneration.from_pretrained('facebook/bart-large-cnn').to(device)
    bart_model.eval()

    def score_bartscore(source, summary):
        """Return a [0, 1] score from BART's likelihood of ``summary`` given ``source``.

        The model's loss for generating the summary from the first 1024
        characters of the source is negated, then mapped linearly from
        [-5, 0] onto [0, 1] and clamped. Always returns a float.
        """
        source_ids = bart_tokenizer(source[:1024], return_tensors='pt', truncation=True, max_length=1024).to(device)
        # BART uses one tokenizer for inputs and targets, so the deprecated
        # `as_target_tokenizer()` context manager is unnecessary here.
        labels = bart_tokenizer(summary, return_tensors='pt', truncation=True, max_length=256).to(device)
        with torch.no_grad():
            outputs = bart_model(
                input_ids=source_ids['input_ids'],
                attention_mask=source_ids['attention_mask'],
                labels=labels['input_ids'],
            )
        score = -outputs.loss.item()
        # Float bounds keep the return type a float even when the clamp triggers.
        return float(max(0.0, min(1.0, (score + 5) / 5)))

    metrics['BARTScore'] = score_bartscore
    print("‚úì BARTScore loaded")

In [None]:
# 4.5 BERTScore
if USE_BERTSCORE:
    print("Loading BERTScore...")
    from bert_score import BERTScorer, score as bert_score_fn

    # Build the scorer once. The module-level `score` function loads the model
    # and tokenizer on every call, i.e. once per sample per system. The
    # trade-off is that the model stays resident on `device` for the whole run.
    bertscore_scorer = BERTScorer(model_type='microsoft/deberta-xlarge-mnli', device=device)

    def score_bertscore(source, summary):
        """Return the BERTScore F1 of ``summary`` against the source text.

        The summary is the candidate and the first 2000 characters of
        ``source`` are the reference.
        """
        P, R, F1 = bertscore_scorer.score([summary], [source[:2000]], verbose=False)
        return F1.item()

    metrics['BERTScore'] = score_bertscore
    print("‚úì BERTScore loaded")

In [None]:
# 4.6 UniEval-Fact
if USE_UNIEVAL:
    print("Loading UniEval-Fact...")
    _UNIEVAL_CHECKPOINT = "MingZhong/unieval-fact"
    unieval_tokenizer = AutoTokenizer.from_pretrained(_UNIEVAL_CHECKPOINT)
    unieval_model = AutoModelForSeq2SeqLM.from_pretrained(_UNIEVAL_CHECKPOINT)
    unieval_model = unieval_model.to(device)
    unieval_model.eval()

    def score_unieval(source, summary):
        """Ask the model a yes/no factuality question and map the answer to a score.

        Returns 1.0 if the generated text contains "yes", 0.0 if it contains
        "no" (checked in that order, case-insensitively), and 0.5 otherwise.
        Only the first 500 characters of ``source`` are placed in the prompt.
        NOTE(review): confirm this prompt matches the template the checkpoint
        was trained with.
        """
        prompt = f"question: Is this a factual summary? </s> document: {source[:500]} </s> summary: {summary}"
        encoded = unieval_tokenizer(prompt, return_tensors='pt', truncation=True, max_length=512).to(device)
        with torch.no_grad():
            generated = unieval_model.generate(encoded['input_ids'], max_length=10)
        answer = unieval_tokenizer.decode(generated[0], skip_special_tokens=True).lower()
        if "yes" in answer:
            return 1.0
        if "no" in answer:
            return 0.0
        return 0.5

    metrics['UniEval'] = score_unieval
    print("‚úì UniEval loaded")

In [None]:
# 4.7 AlignScore
if USE_ALIGNSCORE:
    print("Loading AlignScore...")
    # Best-effort: any failure while loading or registering the metric is
    # reported and the metric is simply left out of the registry.
    try:
        _ALIGN_CHECKPOINT = "yzha/AlignScore-base"
        align_tokenizer = AutoTokenizer.from_pretrained(_ALIGN_CHECKPOINT)
        align_model = AutoModelForSequenceClassification.from_pretrained(_ALIGN_CHECKPOINT)
        align_model = align_model.to(device)
        align_model.eval()

        def score_alignscore(source, summary):
            """Return the sigmoid of the classifier logit for (source, summary).

            Only the first 1024 characters of ``source`` are used.
            NOTE(review): ``.item()`` requires a single-element logits
            tensor — confirm the checkpoint has exactly one output.
            """
            encoded = align_tokenizer(
                source[:1024], summary,
                return_tensors='pt', truncation=True, max_length=512,
            ).to(device)
            with torch.no_grad():
                logits = align_model(**encoded).logits
            return torch.sigmoid(logits).item()

        metrics['AlignScore'] = score_alignscore
        print("‚úì AlignScore loaded")
    except Exception as e:
        print(f"‚ö† AlignScore not available: {e}")

In [None]:
# 4.8 QAGS (using our implementation)
if USE_QAGS:
    print("Loading QAGS...")
    from transformers import pipeline

    qags_qg_tokenizer = AutoTokenizer.from_pretrained("mrm8488/t5-base-finetuned-question-generation-ap")
    qags_qg_model = AutoModelForSeq2SeqLM.from_pretrained("mrm8488/t5-base-finetuned-question-generation-ap").to(device)
    qags_qa_pipeline = pipeline('question-answering', model='deepset/roberta-base-squad2', device=0 if torch.cuda.is_available() else -1)

    def _qags_answer_agreement(src_text, sum_text):
        """Return agreement in [0, 1] between the source answer and the summary answer."""
        # An empty source answer means the source could not answer the
        # question. It must not count as a match: "" is a substring of every
        # string, so the containment test below would otherwise award 1.0.
        if not src_text:
            return 0.0
        if src_text in sum_text or sum_text in src_text:
            return 1.0
        src_toks = set(src_text.split())
        sum_toks = set(sum_text.split())
        # Fraction of the summary answer's tokens that also occur in the source answer.
        return len(src_toks & sum_toks) / len(sum_toks) if sum_toks else 0.0

    def score_qags(source, summary):
        """Return a QAGS-style consistency score for ``summary`` against ``source``.

        One question is generated from each of the first five summary
        sentences. Each question is answered against the first 2000
        characters of the source and against the summary, and the two answers
        are compared. Questions whose summary answer is empty are skipped.
        Returns the mean agreement, or 0.0 when no question could be scored.
        """
        from nltk.tokenize import sent_tokenize
        sentences = sent_tokenize(summary)[:5]
        questions = []
        for sent in sentences:
            inputs = qags_qg_tokenizer.encode("generate questions: " + sent, return_tensors="pt", max_length=512, truncation=True).to(device)
            with torch.no_grad():
                outputs = qags_qg_model.generate(inputs, max_length=64, num_beams=4)
            q = qags_qg_tokenizer.decode(outputs[0], skip_special_tokens=True)
            # Keep only generations that look like a question.
            if q and "?" in q:
                questions.append(q)
        if not questions:
            return 0.0
        matches = []
        for q in questions:
            try:
                ans_src = qags_qa_pipeline(question=q, context=source[:2000], handle_impossible_answer=True)
                ans_sum = qags_qa_pipeline(question=q, context=summary, handle_impossible_answer=True)
                src_text = ans_src['answer'].lower().strip()
                sum_text = ans_sum['answer'].lower().strip()
            except Exception as e:
                # Still best-effort per question, but no longer silent, and
                # KeyboardInterrupt / SystemExit are no longer swallowed.
                logger.warning("QAGS: skipping question %r: %s", q, e)
                continue
            if not sum_text:
                continue
            matches.append(_qags_answer_agreement(src_text, sum_text))
        return float(np.mean(matches)) if matches else 0.0

    metrics['QAGS'] = score_qags
    print("‚úì QAGS loaded")

print(f"\nüìä Loaded {len(metrics)} metrics: {list(metrics.keys())}")

## 5. Generate Summaries & Evaluate

In [None]:
from transformers import PegasusTokenizer, PegasusForConditionalGeneration
from transformers import LEDTokenizer, LEDForConditionalGeneration
from tqdm.notebook import tqdm
from rouge_score import rouge_scorer
import pandas as pd
import json

# Shared ROUGE-1 / ROUGE-2 / ROUGE-L scorer (stemming enabled), used by evaluate_model.
rouge = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)

def generate_summary(model, tokenizer, text, model_type, max_length=256):
    """Summarize ``text`` with beam search and return the decoded string.

    Args:
        model: Seq2seq model exposing ``generate``.
        tokenizer: Tokenizer paired with ``model``.
        text: Source text to summarize.
        model_type: Model label. If it contains "pegasus" (case-insensitive)
            the input is truncated to 1024 tokens, otherwise to 4096.
        max_length: Maximum length of the generated summary, in tokens.

    Returns:
        The first generated sequence, decoded with special tokens removed.
    """
    if 'pegasus' in model_type.lower():
        input_token_budget = 1024
    else:
        input_token_budget = 4096

    encoded = tokenizer(
        text, return_tensors="pt", truncation=True, max_length=input_token_budget
    ).to(device)

    generation_options = dict(
        attention_mask=encoded["attention_mask"],
        max_length=max_length,
        min_length=50,
        num_beams=4,
        length_penalty=2.0,
        early_stopping=True,
    )
    with torch.no_grad():
        generated_ids = model.generate(encoded["input_ids"], **generation_options)

    best_sequence = generated_ids[0]
    return tokenizer.decode(best_sequence, skip_special_tokens=True)

def evaluate_model(model_name, model, tokenizer, samples, metrics):
    """Summarize every sample, then score it with ROUGE and each factuality metric.

    Args:
        model_name: Display name. It labels the progress bar and error
            messages, and is passed to ``generate_summary`` as ``model_type``.
        model: Summarization model handed to ``generate_summary``.
        tokenizer: Tokenizer paired with ``model``.
        samples: Iterable of records with ``'document'`` (source text) and
            ``'summary'`` (reference summary) keys.
        metrics: Mapping of metric name -> callable ``(source, summary) -> score``.

    Returns:
        dict with three parallel per-sample collections:
        ``'summaries'`` (generated text), ``'rouge'`` (dicts of rouge1/rouge2/
        rougeL F-measures) and ``'metrics'`` (metric name -> list of scores).
        A metric call that raises is recorded as ``nan``. Recording 0.0 would
        make a crash indistinguishable from a genuine worst-case score and
        silently lower the reported average. Aggregate with a NaN-aware mean.
    """
    results = {'summaries': [], 'rouge': [], 'metrics': {m: [] for m in metrics}}

    for sample in tqdm(samples, desc=f"{model_name}"):
        source = sample['document']
        reference = sample['summary']

        # Generate summary
        summary = generate_summary(model, tokenizer, source, model_name)
        results['summaries'].append(summary)

        # ROUGE is computed against the human reference, not the source.
        rouge_scores = rouge.score(reference, summary)
        results['rouge'].append({
            'rouge1': rouge_scores['rouge1'].fmeasure,
            'rouge2': rouge_scores['rouge2'].fmeasure,
            'rougeL': rouge_scores['rougeL'].fmeasure
        })

        # Factuality metrics compare the generated summary with the source.
        for name, metric_fn in metrics.items():
            try:
                score = metric_fn(source, summary)
            except Exception as e:
                # Keep the run going, but mark the sample as missing for this
                # metric so every list stays aligned with `summaries`.
                print(f"‚ö† {model_name} {name} error: {e}")
                score = float('nan')
            results['metrics'][name].append(score)

    return results

In [None]:
# Evaluate PEGASUS
_pegasus_banner = "=" * 70
print(_pegasus_banner)
print("EVALUATING PEGASUS-MULTI_NEWS")
print(_pegasus_banner)

# Load tokenizer and model from the same checkpoint, then move the model to `device`.
_pegasus_checkpoint = "google/pegasus-multi_news"
pegasus_tokenizer = PegasusTokenizer.from_pretrained(_pegasus_checkpoint)
pegasus_model = PegasusForConditionalGeneration.from_pretrained(_pegasus_checkpoint)
pegasus_model = pegasus_model.to(device)

pegasus_results = evaluate_model(
    "PEGASUS", pegasus_model, pegasus_tokenizer, samples, metrics
)

# Free memory: drop the model reference and release cached CUDA blocks.
del pegasus_model
torch.cuda.empty_cache()

In [None]:
# Evaluate PRIMERA
_primera_banner = "=" * 70
print(_primera_banner)
print("EVALUATING PRIMERA")
print(_primera_banner)

# Load tokenizer and model from the same checkpoint, then move the model to `device`.
# NOTE(review): confirm the raw Multi-News document separator is what this
# checkpoint expects between input documents.
_primera_checkpoint = "allenai/PRIMERA"
primera_tokenizer = LEDTokenizer.from_pretrained(_primera_checkpoint)
primera_model = LEDForConditionalGeneration.from_pretrained(_primera_checkpoint)
primera_model = primera_model.to(device)

primera_results = evaluate_model(
    "PRIMERA", primera_model, primera_tokenizer, samples, metrics
)

# Free memory: drop the model reference and release cached CUDA blocks.
del primera_model
torch.cuda.empty_cache()

## 6. Save Results

In [None]:
# Aggregate and save results
all_results = {
    'PEGASUS': pegasus_results,
    'PRIMERA': primera_results
}

# model name -> {column label -> corpus-level average}
summary_table = {}

for model_name, results in all_results.items():
    # Turn the list of per-sample ROUGE dicts into one list per ROUGE variant.
    rouge_columns = {
        key: [row[key] for row in results['rouge']]
        for key in ('rouge1', 'rouge2', 'rougeL')
    }
    # NaN-aware means: a single NaN per-sample score (e.g. a sample that could
    # not be scored) is ignored instead of turning the whole average into NaN.
    # Casting to float keeps the JSON output independent of NumPy scalar types.
    rouge_avg = {
        'ROUGE-1': float(np.nanmean(rouge_columns['rouge1'])),
        'ROUGE-2': float(np.nanmean(rouge_columns['rouge2'])),
        'ROUGE-L': float(np.nanmean(rouge_columns['rougeL']))
    }
    metric_avg = {name: float(np.nanmean(scores)) for name, scores in results['metrics'].items()}
    summary_table[model_name] = {**rouge_avg, **metric_avg}

    # Save detailed results: one row per sample, one column per score.
    df = pd.DataFrame({
        'generated': results['summaries'],
        'rouge1': rouge_columns['rouge1'],
        'rouge2': rouge_columns['rouge2'],
        'rougeL': rouge_columns['rougeL'],
        **{f'{m.lower()}_score': scores for m, scores in results['metrics'].items()}
    })
    df.to_csv(f"{OUTPUT_DIR}/{model_name.lower()}_detailed_results.csv", index=False)
    print(f"‚úì Saved {model_name} details")

# Save summary
with open(f"{OUTPUT_DIR}/complete_comparison.json", 'w') as f:
    json.dump(summary_table, f, indent=2)
print(f"‚úì Saved comparison summary")

In [None]:
# Print final results table
_table_rule = "=" * 100
print("\n" + _table_rule)
print("PUBLICATION-READY RESULTS TABLE")
print(_table_rule)

# Header: fixed ROUGE columns followed by one 8-character column per loaded
# metric (metric names are cut to their first 8 characters).
_header_cells = [f"{'Model':<12} {'R-1':>7} {'R-2':>7} {'R-L':>7}"]
_header_cells.extend(f" {m[:8]:>8}" for m in metrics)
print("".join(_header_cells))
print("-" * 100)

# One line per model; a metric missing from a model's averages prints as 0.
for model_name, scores in summary_table.items():
    _row_cells = [
        f"{model_name:<12} {scores['ROUGE-1']:>7.4f} {scores['ROUGE-2']:>7.4f} {scores['ROUGE-L']:>7.4f}"
    ]
    _row_cells.extend(f" {scores.get(m, 0):>8.4f}" for m in metrics)
    print("".join(_row_cells))

print(_table_rule)
print(f"\nüìÅ All results saved to: {OUTPUT_DIR}")

## 7. Generate LaTeX Table (For Paper)

In [None]:
# Generate LaTeX table for paper
# The table is assembled as a list of lines and joined once at the end.
# Column spec: one left-aligned model column, three ROUGE columns, then one
# centred column per loaded metric.
_latex_header = " & ".join(["Model", "R-1", "R-2", "R-L", *metrics])

_latex_rows = []
for model_name, scores in summary_table.items():
    _cells = [
        model_name,
        f"{scores['ROUGE-1']:.2f}",
        f"{scores['ROUGE-2']:.2f}",
        f"{scores['ROUGE-L']:.2f}",
    ]
    # A metric missing from a model's averages is rendered as 0.00.
    _cells.extend(f"{scores.get(m, 0):.2f}" for m in metrics)
    _latex_rows.append(" & ".join(_cells) + " \\\\")

_latex_lines = [
    "\\begin{table*}[t]",
    "\\centering",
    "\\caption{Comparison of summarization models on Multi-News test set (N=" + str(NUM_SAMPLES) + ").}",
    "\\label{tab:results}",
    "\\begin{tabular}{l|ccc|" + "c" * len(metrics) + "}",
    "\\toprule",
    _latex_header + " \\\\ \\midrule",
    *_latex_rows,
    "\\bottomrule",
    "\\end{tabular}",
    "\\end{table*}",
]
latex = "\n".join(_latex_lines)

print(latex)

# Save LaTeX
with open(f"{OUTPUT_DIR}/results_table.tex", 'w') as f:
    f.write(latex)
print(f"\n‚úì LaTeX table saved to {OUTPUT_DIR}/results_table.tex")