# Reproducing EMNLP 2024 Paper: Multi-Target Cross-Lingual Summarization

This notebook uses the SONAR encoder from Meta's Hugging Face repo, installed via the `sonar-space` package.

In [None]:
# ✅ Install dependencies
# None of the packages below is version-pinned, so a re-run may resolve to
# different releases than the ones this notebook was written against.
# NOTE(review): later cells also import sklearn, pandas, nltk and matplotlib,
# which are not installed here — presumably the runtime (e.g. Colab) already
# ships them; confirm.
!pip install -q transformers datasets evaluate rouge-score sentence-transformers
!pip install sonar-space
# The extra index URL selects the fairseq2 wheels under pt2.6.0/cu124.
# NOTE(review): presumably this has to match the PyTorch/CUDA build of the
# runtime — verify before running elsewhere.
!pip install fairseq2 --extra-index-url https://fair.pkg.atmeta.com/fairseq2/whl/pt2.6.0/cu124
!pip install comet_ml
!pip install fasttext

In [None]:
# ✅ Clone the MTXLSum repository
# NOTE(review): this cell does not look safe to re-run in the same session —
# `git clone` presumably fails once the MTXLSum directory exists, and `%cd`
# would then look for MTXLSum relative to the already-changed working
# directory; confirm or guard before re-running.
!git clone https://github.com/Priberam/MTXLSum.git
# Switch the kernel's working directory into the clone.
%cd MTXLSum


## 🔍 Load SONAR Encoder and Compute Similarity

In [None]:
from sonar.inference_pipelines.text import TextToEmbeddingModelPipeline
import torch
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# Build the SONAR text-embedding pipeline once; it is shared by every
# similarity computation in this notebook. The encoder and the tokenizer are
# both loaded under the same name, and the GPU is used whenever one is visible.
_SONAR_NAME = "text_sonar_basic_encoder"
_sonar_device_name = "cuda" if torch.cuda.is_available() else "cpu"

sonar_model = TextToEmbeddingModelPipeline(
    encoder=_SONAR_NAME,
    tokenizer=_SONAR_NAME,
    device=torch.device(_sonar_device_name),
)

def similarity_score(text1, text2, lang1="eng_Latn", lang2=None):
    """Return the cosine similarity between the SONAR embeddings of two texts.

    The language passed to the encoder used to be hard-coded to ``"eng_Latn"``
    even though this notebook compares Spanish, French, Korean, ... summaries.
    The language of each text can now be given explicitly; the defaults keep
    the previous behaviour (both texts encoded as ``"eng_Latn"``).

    Args:
        text1: First text.
        text2: Second text.
        lang1: Language code handed to the SONAR pipeline for ``text1``
            (same format as ``"eng_Latn"``).
        lang2: Language code for ``text2``. ``None`` means "same as ``lang1``".

    Returns:
        The cosine similarity of the two embeddings as a Python ``float``.
    """
    if lang2 is None or lang2 == lang1:
        # Same language: embed both texts in a single batch.
        embeddings = sonar_model.predict([text1, text2], source_lang=lang1)
        emb1, emb2 = embeddings[0], embeddings[1]
    else:
        # predict() takes one source language per call, so texts in different
        # languages have to be embedded separately.
        emb1 = sonar_model.predict([text1], source_lang=lang1)[0]
        emb2 = sonar_model.predict([text2], source_lang=lang2)[0]

    # cosine_similarity expects 2-D array-likes and returns a 1x1 matrix here.
    similarity = cosine_similarity([emb1.cpu().numpy()], [emb2.cpu().numpy()])
    return float(similarity[0][0])


## ✨ Example: Re-Rank Candidate Summaries Based on Semantic Coherence

In [None]:
# Placeholder data for the demonstration below: each key is a language code
# and each value is the list of candidate summaries written in that language.
# The re-ranker picks exactly one candidate from every list.
candidates = {
    'es': ['Bitcoin consume más energía que Argentina.', 'Bitcoin es muy costoso en electricidad.'],
    'fr': ['Le Bitcoin consomme plus que l’Argentine.', 'Bitcoin a un coût énergétique élevé.']
}

# Re-rank using similarity
from itertools import product

def neutral_rr(candidates, score_fn=None):
    """Select one candidate per language so that the selection is most coherent.

    Every combination that takes exactly one candidate from each language is
    scored by the mean of its pairwise similarities. The combination with the
    highest mean wins; on ties the first one encountered is kept.

    Args:
        candidates: Dict mapping a language key to the list of candidate
            summaries (strings) for that language.
        score_fn: Optional callable ``score_fn(text_a, text_b) -> float`` used
            to compare two summaries. Defaults to the notebook's
            ``similarity_score``.

    Returns:
        ``(best_set, best_score)``. ``best_set`` is a tuple holding one summary
        per language, in the iteration order of ``candidates``. When one of the
        candidate lists is empty there is no combination to score and
        ``(None, -1)`` is returned.

    Raises:
        ValueError: If fewer than two languages are given. There is then no
            pair to compare (this used to surface as a ZeroDivisionError).
    """
    from itertools import combinations

    if len(candidates) < 2:
        raise ValueError(
            "neutral_rr needs candidates for at least two languages to "
            "compute a pairwise similarity"
        )
    if score_fn is None:
        # Looked up at call time so the function can be defined before the scorer.
        score_fn = similarity_score

    # The same pair of summaries occurs in many combinations; score it once.
    pair_scores = {}

    def cached_score(text_a, text_b):
        key = (text_a, text_b)
        if key not in pair_scores:
            pair_scores[key] = score_fn(text_a, text_b)
        return pair_scores[key]

    best_set = None
    best_score = -1
    for combo in product(*candidates.values()):
        scores = [cached_score(a, b) for a, b in combinations(combo, 2)]
        avg_score = sum(scores) / len(scores)
        # Always accept the first combination: a fixed -1 floor would reject a
        # valid combination whose mean similarity is exactly -1.
        if best_set is None or avg_score > best_score:
            best_score = avg_score
            best_set = combo
    return best_set, best_score

# Re-rank the placeholder candidates, then list the winning summary for each
# language followed by the mean pairwise similarity of the winning set.
best_summaries, score = neutral_rr(candidates)

print('Best Semantically Coherent Summaries:')
# Iterating the dict yields its keys in the same order as the winning tuple.
for lang, summ in zip(candidates, best_summaries):
    print(f'{lang}: {summ}')

print(f'Average Pairwise Similarity: {score:.4f}')


## 🧠 Generate Summaries with mT5 (English → Multilingual)

In [None]:
import re
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Checkpoint used for generation: mT5 fine-tuned on CrossSum.
model_name = "csebuetnlp/mT5_m2m_crossSum"

# The model and its tokenizer are loaded from the same checkpoint name; the
# tokenizer is requested with use_fast=False.
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=False)

def WHITESPACE_HANDLER(k):
    """Trim ``k`` and collapse every run of whitespace into a single space.

    Args:
        k: Text to normalise.

    Returns:
        ``k`` without leading/trailing whitespace and with each internal run
        of whitespace (spaces, tabs, newlines, ...) replaced by one space.
    """
    # Raw string: '\s' inside a plain literal is an invalid escape sequence.
    # A separate '\n+' pass is unnecessary because '\s' already matches newlines.
    return re.sub(r'\s+', ' ', k.strip())

# Sample English news article kept as a ready-made input for generate_summary.
# NOTE(review): `article_text` is not referenced by any other cell visible in
# this notebook — confirm whether it is still needed.
article_text = """Videos that say approved vaccines are dangerous and cause autism, cancer or infertility are among those that will be taken down, the company said.  The policy includes the termination of accounts of anti-vaccine influencers.  Tech giants have been criticised for not doing more to counter false health information on their sites.  In July, US President Joe Biden said social media platforms were largely responsible for people's scepticism in getting vaccinated by spreading misinformation, and appealed for them to address the issue.  YouTube, which is owned by Google, said 130,000 videos were removed from its platform since last year, when it implemented a ban on content spreading misinformation about Covid vaccines.  In a blog post, the company said it had seen false claims about Covid jabs "spill over into misinformation about vaccines in general". The new policy covers long-approved vaccines, such as those against measles or hepatitis B.  "We're expanding our medical misinformation policies on YouTube with new guidelines on currently administered vaccines that are approved and confirmed to be safe and effective by local health authorities and the WHO," the post said, referring to the World Health Organization."""

def get_lang_id(lang):
    """Return the tokenizer id of the language token registered for ``lang``.

    The token is read from index 1 of the ``langid_map`` entry for ``lang`` in
    the model config's ``task_specific_params`` and converted to its id by the
    tokenizer.
    """
    lang_entry = model.config.task_specific_params["langid_map"][lang]
    lang_token = lang_entry[1]
    return tokenizer._convert_token_to_id(lang_token)

def generate_summary(text, target_lang, max_input_length=512, max_length=84,
                     num_beams=16, no_repeat_ngram_size=2):
    """Summarise ``text`` in ``target_lang`` with the loaded mT5 model.

    Args:
        text: Source document. Whitespace is normalised with
            ``WHITESPACE_HANDLER`` before tokenisation.
        target_lang: Language name understood by ``get_lang_id`` (for example
            ``"french"``); its language token starts the decoder.
        max_input_length: Length the tokenised input is padded/truncated to.
        max_length: Maximum length of the generated sequence.
        num_beams: Beam width used for beam search.
        no_repeat_ngram_size: Size of n-grams that may not repeat in the output.

    Returns:
        The decoded top beam as a string, with special tokens removed.
    """
    encoded = tokenizer(
        [WHITESPACE_HANDLER(text)],
        return_tensors="pt",
        padding="max_length",
        truncation=True,
        max_length=max_input_length,
    )
    # Keep inputs on the same device as the model so the call also works after
    # the model has been moved to a GPU.
    input_ids = encoded["input_ids"].to(model.device)
    # The input is padded up to max_input_length, so pass the mask explicitly
    # instead of leaving generate() to infer which positions are padding.
    attention_mask = encoded["attention_mask"].to(model.device)

    output_ids = model.generate(
        input_ids=input_ids,
        attention_mask=attention_mask,
        decoder_start_token_id=get_lang_id(target_lang),
        max_length=max_length,
        no_repeat_ngram_size=no_repeat_ngram_size,
        num_beams=num_beams,
    )[0]

    return tokenizer.decode(
        output_ids,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False
    )


In [None]:
# Example English news article
text = (
    "Bitcoin uses more electricity annually than the whole of Argentina, "
    "analysis by Cambridge University suggests. Mining for the cryptocurrency "
    "is power-hungry, involving heavy computer calculations to verify transactions."
)

# Target languages for the demo. Keys are two-letter language codes; values
# are the language names passed to generate_summary as `target_lang`.
langs = {
    "fr": "french",
    "es": "spanish",
    "pt": "portuguese",
    "pa": "punjabi",  # ISO 639-1 code for Punjabi is "pa" (was mistyped "pu")
    "ko": "korean"
}

# Only the language names are needed to drive generation.
for full_lang in langs.values():
    summary = generate_summary(text, target_lang=full_lang)
    print(f"{full_lang} summary: {summary}")


Now you can run these summaries through SONAR to check their semantic coherence using the NeutralRR scoring function.

## 📚 Load and Cluster CrossSum Dataset for Multi-Target Summarization

In [None]:
from datasets import load_dataset
from collections import defaultdict

# Load CrossSum dataset
def load_and_preview_crosssum(split_name="english-french", num_samples=5):
    """Load one CrossSum configuration and print a short preview of its test set.

    Args:
        split_name: Value handed to ``load_dataset`` as its second argument,
            e.g. ``"english-french"``.
        num_samples: Upper bound on how many test examples are previewed.

    Returns:
        The object returned by ``load_dataset``; it is indexed by split name
        (the preview reads its ``"test"`` entry).
    """
    dataset = load_dataset("csebuetnlp/CrossSum", split_name)

    print(f"\n✅ Loaded CrossSum split: {split_name}")
    print(f"Available splits: {list(dataset.keys())}")

    # Preview at most `num_samples` rows, or fewer when the test set is shorter.
    test_split = dataset["test"]
    preview_count = min(num_samples, len(test_split))
    for number in range(1, preview_count + 1):
        example = test_split[number - 1]
        print(f"\nSample {number}")
        print("📰 Source (English):", example["text"][:150], "...")
        print("📝 Reference Summary (Target language):", example["summary"][:150], "...")

    return dataset


With the loader defined, you can now pick one test document per language pair as input and apply mT5 + SONAR-based NeutralRR re-ranking.

## 🚀 Apply mT5 and SONAR Reranking on a CrossSum Cluster

In [None]:
def _load_test_sample(config_name, index):
    """Load a CrossSum configuration (printing a one-sample preview) and return
    ``(dataset, test_row)`` where ``test_row`` is the test example at ``index``."""
    dataset = load_and_preview_crosssum(config_name, num_samples=1)
    return dataset, dataset["test"][index]


# One test example per language pair; "text" holds the source document.
# NOTE(review): French and English read test rows 3 and 4 while every other
# pair reads row 0. The reason is not visible in this notebook — confirm the
# indices are intentional.
crosssum_french, source_sample_french = _load_test_sample("english-french", 3)
source_text_french = source_sample_french["text"]

crosssum_english, source_sample_english = _load_test_sample("english-english", 4)
source_text_english = source_sample_english["text"]

crosssum_spanish, source_sample_spanish = _load_test_sample("english-spanish", 0)
source_text_spanish = source_sample_spanish["text"]

crosssum_korean, source_sample_korean = _load_test_sample("english-korean", 0)
source_text_korean = source_sample_korean["text"]

crosssum_portuguese, source_sample_portuguese = _load_test_sample("english-portuguese", 0)
source_text_portuguese = source_sample_portuguese["text"]

crosssum_punjabi, source_sample_punjabi = _load_test_sample("english-punjabi", 0)
source_text_punjabi = source_sample_punjabi["text"]

# Show the first 300 characters of three of the source documents side by side.
for _header, _document in (
    ("\n📰 Source Document (English → English):\n", source_text_english),
    ("📰 Source Document (English → Spanish):\n", source_text_spanish),
    ("📰 Source Document (English → French):\n", source_text_french),
):
    print(_header, _document[:300], "...\n")

In [None]:
def _reference_vs_generated(sample, source_text, target_lang):
    """Pair a sample's reference summary with a freshly generated one.

    Returns ``(reference, generated, candidates)`` where ``candidates`` is the
    dict ``{"reference": [reference], "generated": [generated]}`` handed to
    ``neutral_rr`` further down.
    """
    reference = sample["summary"]
    generated = generate_summary(source_text, target_lang=target_lang)
    pair = {
        "reference": [reference],
        "generated": [generated]
    }
    return reference, generated, pair


# For every language: take the reference summary of the chosen sample,
# generate a summary of the same source text, and keep both as candidates.
spanish_reference, spanish_generated, candidates_spanish = _reference_vs_generated(
    source_sample_spanish, source_text_spanish, "spanish"
)

french_reference, french_generated, candidates_french = _reference_vs_generated(
    source_sample_french, source_text_french, "french"
)

english_reference, english_generated, candidates_english = _reference_vs_generated(
    source_sample_english, source_text_english, "english"
)

korean_reference, korean_generated, candidates_korean = _reference_vs_generated(
    source_sample_korean, source_text_korean, "korean"
)

portuguese_reference, portuguese_generated, candidates_portuguese = _reference_vs_generated(
    source_sample_portuguese, source_text_portuguese, "portuguese"
)

punjabi_reference, punjabi_generated, candidates_punjabi = _reference_vs_generated(
    source_sample_punjabi, source_text_punjabi, "punjabi"
)

def _report_neutral_rr(label, lang_candidates):
    """Run neutral_rr on one language's candidates and print the outcome.

    Args:
        label: Display name of the language, used in the printed headings.
        lang_candidates: Candidates dict for that language, as accepted by
            ``neutral_rr``.

    Returns:
        The ``(best_summaries, score)`` tuple returned by ``neutral_rr``.
    """
    best, score = neutral_rr(lang_candidates)
    print(f"\n {label} NeutralRR Result:")
    for role, summ in zip(lang_candidates.keys(), best):
        print(f"{role.capitalize()}: {summ}")
    print(f"Average Similarity ({label}): {score:.4f}")
    return best, score


# Each candidates dict built above holds a single reference and a single
# generated summary, so there is exactly one combination per language and the
# reported score is the similarity between those two texts.

# Apply NeutralRR to English summaries
best_english_summaries, english_score = _report_neutral_rr("English", candidates_english)

# Apply NeutralRR to Spanish summaries
best_spanish_summaries, spanish_score = _report_neutral_rr("Spanish", candidates_spanish)

# Apply NeutralRR to French summaries
best_french_summaries, french_score = _report_neutral_rr("French", candidates_french)

# Apply NeutralRR to Korean summaries
best_korean_summaries, korean_score = _report_neutral_rr("Korean", candidates_korean)

# Apply NeutralRR to Portuguese summaries
best_portuguese_summaries, portuguese_score = _report_neutral_rr("Portuguese", candidates_portuguese)

# Apply NeutralRR to Punjabi summaries
best_punjabi_summaries, punjabi_score = _report_neutral_rr("Punjabi", candidates_punjabi)




## 📏 Evaluate with ROUGE and Export Results

In [None]:
import unicodedata

from rouge_score import rouge_scorer
import pandas as pd


class _UnicodeWordTokenizer:
    """Script-agnostic tokenizer for ``RougeScorer``.

    rouge_score's default tokenizer lower-cases the text and keeps only the
    characters ``[a-z0-9]``. Korean or Punjabi text is therefore reduced to
    nothing (ROUGE is always 0) and accented Latin words are split apart.
    This tokenizer keeps every script intact.
    """

    def tokenize(self, text):
        """Lower-case ``text``, blank out punctuation and split on whitespace."""
        without_punctuation = "".join(
            " " if unicodedata.category(char).startswith("P") else char
            for char in text.lower()
        )
        return without_punctuation.split()


# Initialize ROUGE scorer.
# A custom tokenizer replaces the default one, including its optional Porter
# stemming, which is designed for English and was applied to every language.
scorer = rouge_scorer.RougeScorer(['rouge2'], tokenizer=_UnicodeWordTokenizer())

# Get references
reference_summary_fr = source_sample_french["summary"]
reference_summary_es = source_sample_spanish["summary"]
reference_summary_en = source_sample_english["summary"]
reference_summary_ko = source_sample_korean["summary"]
reference_summary_pt = source_sample_portuguese["summary"]
reference_summary_pu = source_sample_punjabi["summary"]

# (language name, candidates dict, NeutralRR selection, reference summary)
evaluation_inputs = [
    ("french", candidates_french, best_french_summaries, reference_summary_fr),
    ("spanish", candidates_spanish, best_spanish_summaries, reference_summary_es),
    ("english", candidates_english, best_english_summaries, reference_summary_en),
    ("korean", candidates_korean, best_korean_summaries, reference_summary_ko),
    ("portuguese", candidates_portuguese, best_portuguese_summaries, reference_summary_pt),
    ("punjabi", candidates_punjabi, best_punjabi_summaries, reference_summary_pu),
]

# Prepare output
records = []

for language, lang_candidates, lang_best, reference in evaluation_inputs:
    # The selection is ordered like the candidates dict keys; only the entry
    # stored under "generated" is scored against the reference.
    for role, summary in zip(lang_candidates.keys(), lang_best):
        if role != "generated":
            continue
        rouge_f1 = scorer.score(reference, summary)["rouge2"].fmeasure
        records.append({
            "language": language,
            "generated_summary": summary,
            "reference_summary": reference,
            "rouge2_f1": round(rouge_f1, 4)
        })

# Display results
df = pd.DataFrame(records)
df



In [None]:
# Export results to CSV
from pathlib import Path

# Keep writing to /content when that directory exists (the path this notebook
# was written for); otherwise fall back to the current working directory so
# the export does not fail on machines without /content.
_export_dir = Path("/content") if Path("/content").is_dir() else Path.cwd()
output_path = str(_export_dir / "neutralrr_results.csv")
df.to_csv(output_path, index=False)
print(f"📁 Results exported to: {output_path}")


## 🧪 Evaluate with BLEU Score

In [None]:
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

# Function to compute BLEU score
def compute_bleu(reference, prediction):
    """Sentence-level BLEU of ``prediction`` against one ``reference``.

    Both strings are tokenised by whitespace. The single reference is wrapped
    in a list because ``sentence_bleu`` takes a list of references, and
    ``SmoothingFunction().method4`` is used for smoothing.
    """
    hypothesis = prediction.split()
    references = [reference.split()]
    return sentence_bleu(
        references,
        hypothesis,
        smoothing_function=SmoothingFunction().method4,
    )

def _bleu_or_nan(row):
    """Return the rounded BLEU score for one results row.

    Rows without a real reference (a non-string value or the "N/A"
    placeholder) get NaN rather than the string "N/A", so the column stays
    numeric for plotting and export.
    """
    reference = row["reference_summary"]
    if isinstance(reference, str) and reference != "N/A":
        return round(compute_bleu(reference, row["generated_summary"]), 4)
    return float("nan")


# Add BLEU to DataFrame (only if reference is real, not placeholder)
df["bleu"] = df.apply(_bleu_or_nan, axis=1)

# Display final scores
df[["language", "rouge2_f1", "bleu"]]


In [None]:
# Re-export with BLEU
from pathlib import Path

# Keep writing to /content when that directory exists (the path this notebook
# was written for); otherwise fall back to the current working directory so
# the export does not fail on machines without /content.
_export_dir = Path("/content") if Path("/content").is_dir() else Path.cwd()
output_path_bleu = str(_export_dir / "neutralrr_results_bleu.csv")
df.to_csv(output_path_bleu, index=False)
print(f"📁 Results with BLEU exported to: {output_path_bleu}")


## 📊 Visualize ROUGE and BLEU Scores

In [None]:
import matplotlib.pyplot as plt

# Switch matplotlib to the ggplot theme before drawing.
plt.style.use("ggplot")

# ROUGE-2 F1 per language, drawn through the explicit Figure/Axes interface.
rouge_fig, rouge_ax = plt.subplots(figsize=(10, 5))
rouge_ax.bar(df["language"], df["rouge2_f1"], width=0.5)
rouge_ax.set_title("ROUGE-2 F1 Score per Language (NeutralRR)")
rouge_ax.set_ylabel("ROUGE-2 F1")
rouge_ax.set_xlabel("Language")
# Fix the y-axis to the 0-1 range.
rouge_ax.set_ylim(0, 1)
plt.show()


In [None]:
# BLEU per language, drawn through the explicit Figure/Axes interface.
bleu_fig, bleu_ax = plt.subplots(figsize=(10, 5))
bleu_ax.bar(df["language"], df["bleu"], width=0.5, color='skyblue')
bleu_ax.set_title("BLEU Score per Language (NeutralRR)")
bleu_ax.set_ylabel("BLEU")
bleu_ax.set_xlabel("Language")
# Fix the y-axis to the 0-1 range.
bleu_ax.set_ylim(0, 1)
plt.show()
