# Reproducing EMNLP 2024 Paper: Multi-Target Cross-Lingual Summarization



In [None]:
# ✅ Install dependencies
!pip install -q transformers datasets evaluate rouge-score sentence-transformers
!pip install sonar-space
!pip install fairseq2 --extra-index-url https://fair.pkg.atmeta.com/fairseq2/whl/pt2.6.0/cu124
!pip install comet_ml
!pip install fasttext

## 🔍 Load SONAR Encoder and Compute Similarity

In [None]:
from sonar.inference_pipelines.text import TextToEmbeddingModelPipeline
import torch
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# Build the SONAR text-to-embedding pipeline; it is placed on the GPU when
# CUDA is available and on the CPU otherwise.
sonar_model = TextToEmbeddingModelPipeline(
    encoder="text_sonar_basic_encoder",
    tokenizer="text_sonar_basic_encoder",
    device=torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu"),
)

def similarity_score(text1, text2, lang1="eng_Latn", lang2=None):
    """Return the cosine similarity between the SONAR embeddings of two texts.

    Args:
        text1: First text.
        text2: Second text.
        lang1: Language code passed to the encoder as ``source_lang`` for
            ``text1``. Defaults to ``"eng_Latn"``, the value this function
            previously hard-coded for every input.
        lang2: Language code for ``text2``. ``None`` (the default) means
            "same as ``lang1``".

    Returns:
        The cosine similarity as a Python ``float``.
    """
    if lang2 is None:
        lang2 = lang1

    if lang1 == lang2:
        # One language tag: encode both texts in a single batch, exactly as
        # the original implementation did.
        emb = sonar_model.predict([text1, text2], source_lang=lang1)
        emb1, emb2 = emb[0], emb[1]
    else:
        # A predict() call is given one source_lang here, so texts tagged
        # with different languages are encoded in separate calls.
        emb1 = sonar_model.predict([text1], source_lang=lang1)[0]
        emb2 = sonar_model.predict([text2], source_lang=lang2)[0]

    return float(cosine_similarity([emb1.cpu().numpy()], [emb2.cpu().numpy()])[0][0])


## ✨ Example: Re-Rank Candidate Summaries Based on Semantic Coherence

In [None]:
# Demo input: for each target-language code, the candidate summaries that
# the re-ranker has to choose between.
candidates = dict(
    es=[
        'Bitcoin consume más energía que Argentina.',
        'Bitcoin es muy costoso en electricidad.',
    ],
    fr=[
        'Le Bitcoin consomme plus que l’Argentine.',
        'Bitcoin a un coût énergétique élevé.',
    ],
)

# Re-rank using similarity
from itertools import product

def neutral_rr(candidates):
    """Pick one candidate per language so that the chosen set is most coherent.

    Every combination containing exactly one candidate from each entry of
    ``candidates`` is scored by the mean ``similarity_score`` over all
    unordered pairs of texts in it; the highest-scoring combination wins
    (the first one encountered on ties).

    Args:
        candidates: Mapping from a language key to a list of candidate texts.

    Returns:
        ``(best_set, best_score)``. ``best_set`` is a tuple holding one text
        per key, in the mapping's iteration order. A combination with fewer
        than two texts has no pairs to compare and is scored ``0.0``. When no
        combination exists at all (some candidate list is empty) the result
        is ``(None, -1)``.
    """
    from itertools import combinations

    # The same pair of texts recurs in many combinations; score it once and
    # reuse the value instead of re-encoding both texts every time.
    pair_scores = {}

    def pair_score(first, second):
        key = (first, second)
        if key not in pair_scores:
            pair_scores[key] = similarity_score(first, second)
        return pair_scores[key]

    best_set = None
    best_score = -1
    for combo in product(*candidates.values()):
        pairs = list(combinations(combo, 2))
        if pairs:
            avg_score = sum(pair_score(a, b) for a, b in pairs) / len(pairs)
        else:
            # Zero or one language: nothing to compare (this used to divide
            # by zero).
            avg_score = 0.0
        # `best_set is None` guarantees the first combination is accepted
        # even if its score is not strictly above the -1 starting value.
        if best_set is None or avg_score > best_score:
            best_score = avg_score
            best_set = combo
    return best_set, best_score

# Re-rank the demo candidates, then report the selected summary for each
# language followed by the mean pairwise similarity of the selected set.
best_summaries, score = neutral_rr(candidates)

print('Best Semantically Coherent Summaries:')
for lang, summ in zip(candidates, best_summaries):  # iterating a dict yields its keys
    print(f'{lang}: {summ}')
print(f'Average Pairwise Similarity: {score:.4f}')


## 🧠 Generate Summaries with mT5 (English → Multilingual)

In [None]:
import re
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# mT5 checkpoint fine-tuned on CrossSum; the seq2seq model and its tokenizer
# are both loaded from the same identifier.
model_name = "csebuetnlp/mT5_m2m_crossSum"
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    use_fast=False,  # ask for the non-fast tokenizer implementation
)

def WHITESPACE_HANDLER(k):
    """Strip ``k`` and collapse every run of whitespace inside it to one space."""
    # Raw string: '\s' inside a normal literal is an invalid escape sequence
    # and triggers a warning on current Python versions. `\s+` already
    # matches newlines, so a separate '\n+' pass is unnecessary.
    return re.sub(r'\s+', ' ', k.strip())

# Sample English news article (single paragraph).
# NOTE(review): no other visible cell references `article_text` — confirm
# whether it is still needed.
article_text = """Videos that say approved vaccines are dangerous and cause autism, cancer or infertility are among those that will be taken down, the company said.  The policy includes the termination of accounts of anti-vaccine influencers.  Tech giants have been criticised for not doing more to counter false health information on their sites.  In July, US President Joe Biden said social media platforms were largely responsible for people's scepticism in getting vaccinated by spreading misinformation, and appealed for them to address the issue.  YouTube, which is owned by Google, said 130,000 videos were removed from its platform since last year, when it implemented a ban on content spreading misinformation about Covid vaccines.  In a blog post, the company said it had seen false claims about Covid jabs "spill over into misinformation about vaccines in general". The new policy covers long-approved vaccines, such as those against measles or hepatitis B.  "We're expanding our medical misinformation policies on YouTube with new guidelines on currently administered vaccines that are approved and confirmed to be safe and effective by local health authorities and the WHO," the post said, referring to the World Health Organization."""

def get_lang_id(lang):
    """Return the tokenizer id of the language token registered for ``lang``.

    The token is element ``[1]`` of the ``lang`` entry in
    ``model.config.task_specific_params["langid_map"]``.
    """
    lang_token = model.config.task_specific_params["langid_map"][lang][1]
    return tokenizer._convert_token_to_id(lang_token)

def generate_summary(text, target_lang):
    """Summarize ``text`` in ``target_lang`` with the loaded seq2seq model.

    Args:
        text: Source article. Its whitespace is normalized with
            ``WHITESPACE_HANDLER`` before tokenization.
        target_lang: Language name resolved through ``get_lang_id``; the
            resulting token id is used as the decoder start token.

    Returns:
        The decoded summary string, with special tokens skipped.
    """
    cleaned_text = WHITESPACE_HANDLER(text)

    # One-item batch, padded / truncated to a length of 512.
    encoded = tokenizer(
        [cleaned_text],
        return_tensors="pt",
        padding="max_length",
        truncation=True,
        max_length=512,
    )

    generation_kwargs = dict(
        input_ids=encoded["input_ids"],
        decoder_start_token_id=get_lang_id(target_lang),
        max_length=168,
        no_repeat_ngram_size=2,
        num_beams=4,
    )
    generated = model.generate(**generation_kwargs)
    first_sequence = generated[0]  # only the first returned sequence is decoded

    return tokenizer.decode(
        first_sequence,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False,
    )


In [None]:
# Example English news article
text = "Bitcoin uses more electricity annually than the whole of Argentina, analysis by Cambridge University suggests. Mining for the cryptocurrency is powerhungry, involving heavy computer calculations to verify transactions. Cambridge researchers say it consumes around 121.36 terawatt-hours (TWh) a year - and is unlikely to fall unless the value of the currency slumps"

# Language code -> language name. Only the names are used: they are passed
# to generate_summary as `target_lang`.
langs = {
    "fr": "french",
    "es": "spanish",
    "pt": "portuguese",
    "pa": "punjabi",  # was "pu", which is not the ISO 639-1 code for Punjabi
    "ko": "korean"
}

# One generated summary per target language, keyed by language name.
summaries = {}
for full_lang in langs.values():
    summary = generate_summary(text, target_lang=full_lang)
    summaries[full_lang] = summary
    print(f"{full_lang} summary: {summary}")

In [None]:
from itertools import combinations

# Wrap every generated summary in a one-element list so the data has the
# {language: [candidate, ...]} shape that neutral_rr iterates over.
candidates = dict(
    (lang_name, [summary_text]) for lang_name, summary_text in summaries.items()
)

best_set, best_score = neutral_rr(candidates)

print(f"\nAverage NeutralRR similarity across all pairs: {best_score:.4f}")

## 🚀 Load CrossSum Cluster and Apply mT5 and SONAR Re-Ranking

In [None]:
!pip install unbabel-comet
import torch
from datasets import load_dataset
from itertools import product
from collections import defaultdict
from evaluate import load
from rouge_score import rouge_scorer
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

# Function to compute BLEU score
def compute_bleu(reference, prediction):
    """Sentence-level BLEU of ``prediction`` against a single ``reference``.

    Both strings are tokenized with ``str.split`` (whitespace) and scored by
    ``sentence_bleu`` using ``SmoothingFunction().method4``.

    NOTE(review): whitespace splitting does not segment text written without
    spaces (e.g. the chinese_simplified targets evaluated in this notebook) —
    confirm this is intended.
    """
    reference_tokens = reference.split()
    prediction_tokens = prediction.split()
    return sentence_bleu(
        [reference_tokens],
        prediction_tokens,
        smoothing_function=SmoothingFunction().method4,
    )

# Device index handed to the COMET metric loader: 0 when CUDA is available,
# -1 otherwise.
device = -1 if not torch.cuda.is_available() else 0

def _mean(values):
    """Return the arithmetic mean of ``values``, or NaN when the list is empty."""
    return sum(values) / len(values) if values else float("nan")


def compute_scores(configs,num_samples,source_lang):
    """Generate summaries for CrossSum test samples and score them per language.

    Args:
        configs: Mapping from target-language name (passed to
            ``generate_summary`` as ``target_lang``) to the CrossSum config
            name to load.
        num_samples: How many leading samples of each test split to evaluate.
        source_lang: Label used only in the header of the printed summary.

    Returns:
        ``{lang: metrics}`` where ``metrics`` maps ``"rouge2"``, ``"bleu"``,
        ``"blaser"`` and ``"comet"`` to the average over the evaluated
        samples (NaN when no sample was evaluated).
    """
    # First `num_samples` test examples of every requested config.
    data = {
        lang: load_dataset("csebuetnlp/CrossSum", cfg)["test"].select(range(num_samples))
        for lang, cfg in configs.items()
    }

    rouge = rouge_scorer.RougeScorer(["rouge2"], use_stemmer=True)
    # The `evaluate` BLEU metric used to be loaded here as well, but it was
    # never used (BLEU comes from compute_bleu) and its name was then
    # overwritten by a float inside the loop, so the load was dropped.
    comet = load("comet", config_name="wmt20-comet-da", device=device)

    scores = {lang: defaultdict(list) for lang in configs}

    for lang, ds in data.items():
        r2_scores, bleu_scores, bl_scores, comet_scores = [], [], [], []
        print(f"\nEvaluating language: {lang}")
        for idx, sample in enumerate(ds, 1):
            src, ref = sample["text"], sample["summary"]
            gen = generate_summary(src, target_lang=lang)

            # Reported under the "BLASER" label: the reference-vs-generated
            # similarity returned by neutral_rr for this single pair.
            _, bl = neutral_rr({"reference":[ref], "generated":[gen]})
            # NOTE(review): this metric is scaled by 10 while ROUGE-2 and
            # COMET are scaled by 100 and BLEU is left unscaled — confirm
            # the mixed scales are intended.
            bl_scores.append(bl*10)

            r2 = rouge.score(ref, gen)["rouge2"].fmeasure
            r2_scores.append(r2*100)

            bleu_score = compute_bleu(ref, gen)
            bleu_scores.append(bleu_score)

            c = comet.compute(predictions=[gen], references=[ref], sources=[src])["scores"][0]
            comet_scores.append(c*100)

            print(f"[{idx:02d}/{len(ds)}] R2={r2:.4f}, BLEU={bleu_score:.4f}, BLASER={bl:.4f}, COMET={c:.4f}")

        # store averages (_mean avoids a ZeroDivisionError when num_samples is 0)
        scores[lang]["rouge2"]  = _mean(r2_scores)
        scores[lang]["bleu"]    = _mean(bleu_scores)
        scores[lang]["blaser"]  = _mean(bl_scores)
        scores[lang]["comet"]   = _mean(comet_scores)

    # ─── Print summary ───────────────────────────────────────────────────────
    print(f"\nAverage scores over all languages for {source_lang}:")
    for lang, m in scores.items():
        print(f" • {lang.capitalize():7s} → "
              f"ROUGE-2 {m['rouge2']:.4f}, "
              f"BLEU {m['bleu']:.4f}, "
              f"BLASER {m['blaser']:.4f}, "
              f"COMET {m['comet']:.4f}")

    return scores



In [None]:
import matplotlib.pyplot as plt

def visualiseScores(scores):
    """Show a grouped bar chart of the four average metrics for each language.

    Args:
        scores: Mapping ``{language: metrics}`` where ``metrics`` has the
            keys ``"rouge2"``, ``"bleu"``, ``"blaser"`` and ``"comet"``.
    """
    # Prepare data for plotting
    languages = list(scores.keys())
    rouge2_vals = [scores[lang]["rouge2"] for lang in languages]
    bleu_vals   = [scores[lang]["bleu"] for lang in languages]
    blaser_vals = [scores[lang]["blaser"] for lang in languages]
    comet_vals  = [scores[lang]["comet"] for lang in languages]

    # Set bar width and position
    bar_width = 0.2
    x = range(len(languages))

    # Plotting: four bars per language, centred on the language's tick
    # (offsets of -1.5, -0.5, +0.5 and +1.5 bar widths).
    plt.figure(figsize=(12, 6))
    plt.bar([i - 1.5 * bar_width for i in x], rouge2_vals, width=bar_width, label='ROUGE-2')
    plt.bar([i - 0.5 * bar_width for i in x], bleu_vals,   width=bar_width, label='BLEU')
    plt.bar([i + 0.5 * bar_width for i in x], blaser_vals, width=bar_width, label='BLASER')
    plt.bar([i + 1.5 * bar_width for i in x], comet_vals,  width=bar_width, label='COMET')

    plt.xlabel("Language")
    plt.ylabel("Average Score")
    plt.title("Average Evaluation Metrics per Language")
    plt.xticks(ticks=x, labels=[lang.capitalize() for lang in languages])
    plt.legend()
    plt.tight_layout()
    plt.grid(axis='y', linestyle='--', alpha=0.5)
    plt.show()

In [None]:
# Target languages evaluated for every source language.
TARGET_LANGS = ["english", "spanish", "french", "arabic", "russian", "chinese_simplified"]


def _crosssum_configs(source):
    """Map each target language to its dataset config name ``"<source>-<target>"``."""
    return {target: f"{source}-{target}" for target in TARGET_LANGS}


configs_en = _crosssum_configs("english")
configs_zh = _crosssum_configs("chinese_simplified")
# The French table used to be a copy of the English one (plus one "spanish-"
# entry), so the French run never loaded French-source data.
configs_fr = _crosssum_configs("french")
configs_es = _crosssum_configs("spanish")

# Score and plot each source language in turn. The Chinese run evaluates 30
# samples per target language; the other runs evaluate 50.
scores_en = compute_scores(configs_en, num_samples=50, source_lang="English")
visualiseScores(scores_en)

scores_zh = compute_scores(configs_zh, num_samples=30, source_lang="Chinese (simplified)")
visualiseScores(scores_zh)

scores_fr = compute_scores(configs_fr, num_samples=50, source_lang="French")
visualiseScores(scores_fr)

scores_es = compute_scores(configs_es, num_samples=50, source_lang="Spanish")
visualiseScores(scores_es)