# Work Dir

In [None]:
from google.colab import drive
import os

# Mount Google Drive into the Colab filesystem at /content/drive.
drive.mount('/content/drive')

# Project folder inside Google Drive; edit this path to relocate the project.
project_path = '/content/drive/My Drive/AI6127_NLP_Project'

# Create the folder if it doesn't exist, then make it the working directory.
# The later cells use relative paths (input_xml, output_xml, ref_xml, result),
# which are resolved against this directory.
os.makedirs(project_path, exist_ok=True)
os.chdir(project_path)

# Optional: verify the current working directory
print("Current working directory:", os.getcwd())

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Current working directory: /content/drive/My Drive/AI6127_NLP_Project


# Model

In [None]:
# Reinstall transformers from scratch; no version is pinned here, so pip
# installs whatever release it resolves at run time.
!pip uninstall transformers -y
!pip install transformers
# Upgrade numpy to the newest release pip can resolve.
!pip install --upgrade numpy

# Pinned versions installed with --no-deps, so pip does not install or change
# their dependencies.
# NOTE(review): presumably pinned to stay compatible with the upgraded numpy - confirm.
!pip install tensorflow==2.18.0 --no-deps
!pip install numba==0.60.0 --no-deps



Found existing installation: transformers 4.51.2
Uninstalling transformers-4.51.2:
  Successfully uninstalled transformers-4.51.2
Collecting transformers
  Using cached transformers-4.51.2-py3-none-any.whl.metadata (38 kB)
Using cached transformers-4.51.2-py3-none-any.whl (10.4 MB)
Installing collected packages: transformers
Successfully installed transformers-4.51.2


In [None]:

from transformers import pipeline

# Smoke test: load Helsinki-NLP's opus-MT English-to-German model through the
# transformers translation pipeline and translate one short sentence.
translator = pipeline("translation", model="Helsinki-NLP/opus-mt-en-de")

# Sentence to translate (in English)
english_text = "Hello, world!"

# Translate the sentence; max_length=40 is passed through to the pipeline call
# and is ample for this short input.
result = translator(english_text, max_length=40)

# The pipeline returns a list with one dict per input; print its translation.
print(result[0]['translation_text'])

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/1.33k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/298M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/298M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/768k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/797k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.27M [00:00<?, ?B/s]

Device set to use cpu


Hallo, Welt!


# XML Translation

## Translator Class

In [None]:
import re
import xml.etree.ElementTree as ET
from pathlib import Path

import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from transformers import pipeline

class XMLTranslator:
    """Translate the text of selected XML elements with a translation pipeline.

    Elements whose tag is in ``translate_tags`` have their text translated,
    including the text that follows each of their direct children (the
    children's ``tail``), so mixed content such as
    ``<p>See <xref>1</xref> for details.</p>`` is translated on both sides of
    the inline element. Elements whose tag is in ``preserve_tags`` are never
    translated; their text is only trimmed of surrounding whitespace.

    Attributes:
        translator: callable translation pipeline; called with a string and
            expected to return ``[{'translation_text': ...}]``.
        translate_tags: set of tag names whose text is translated.
        preserve_tags: set of tag names whose text is kept untranslated.
        term_patterns: mapping of compiled regex -> replacement string,
            applied to the pipeline output in ``translate_text``.
    """

    def __init__(self, model_name="Helsinki-NLP/opus-mt-en-de"):
        """Load the translation pipeline for ``model_name`` (device=-1)."""
        self.translator = pipeline("translation",
                                   model=model_name,
                                   device=-1)

        # Configure elements to translate/preserve
        self.translate_tags = {'title', 'head', 'p', 'note'}
        self.preserve_tags = {'xref', 'date', 'classCode', 'bibl'}

        # Legal terminology regex patterns. They are matched against the
        # translated output, so they only fire where the model left the
        # English term in place.
        self.term_patterns = {
            re.compile(r'\bOfficial Journal\b', re.I): 'Amtsblatt',
            re.compile(r'\bpaper edition\b', re.I): 'Papierausgabe',
            re.compile(r'\bdeemed authentic\b', re.I): 'verbindlich gelten'
        }

    def translate_text(self, text):
        """Translate ``text`` and apply the terminology replacements.

        Returns ``text`` unchanged when it is empty/blank/None or when the
        pipeline raises (the failure is printed, not propagated).
        """
        # Nothing to translate: avoid sending blank input to the model.
        if not text or not text.strip():
            return text
        try:
            # First pass translation
            result = self.translator(text.strip(),
                                     max_length=1000,
                                     clean_up_tokenization_spaces=True,
                                     num_beams=5)[0]['translation_text']

            # Post-process terminology
            for pattern, replacement in self.term_patterns.items():
                result = pattern.sub(replacement, result)

            return result
        except Exception as e:
            print(f"Translation failed: {str(e)}")
            return text

    def process_xml(self, input_path, output_dir):
        """Translate one XML file and write the result into ``output_dir``.

        Args:
            input_path: path (``str`` or ``Path``) of the source XML file.
            output_dir: directory (``str`` or ``Path``) for the output file.

        The output file keeps the input file name with ``-en.xml`` replaced by
        ``-de.xml``. Any error is printed and swallowed so that a batch run
        continues with the next file.
        """
        # Accept plain strings as well as Path objects; the original code
        # required Path and failed inside its own error handler otherwise.
        input_path = Path(input_path)
        output_dir = Path(output_dir)
        try:
            tree = ET.parse(input_path)
            root = tree.getroot()

            # Update language attributes
            self.update_language_attributes(root)

            # Only text/tail values are modified while iterating; the tree
            # structure is left untouched.
            for elem in root.iter():
                if elem.tag in self.translate_tags:
                    self.process_element(elem)
                elif elem.tag in self.preserve_tags:
                    self.preserve_element(elem)

            new_name = input_path.name.replace("-en.xml", "-de.xml")

            # Save translated file
            output_path = output_dir / new_name
            self.write_xml(tree, output_path)
            print(f"Processed: {input_path.name}")

        except Exception as e:
            print(f"Error processing {input_path.name}: {str(e)}")

    def process_element(self, elem):
        """Translate the text content directly owned by a translatable element.

        That is ``elem.text`` plus the ``tail`` of every direct child (the
        text between/after inline children). Child elements themselves are
        handled when the document iteration reaches them.
        """
        if elem.text:
            elem.text = self._translate_segment(elem.text)

        for child in elem:
            if child.tail:
                # Tails are often just punctuation or whitespace; only send
                # them to the model when they contain at least one letter.
                child.tail = self._translate_segment(child.tail,
                                                     require_letters=True)

    def _translate_segment(self, text, require_letters=False):
        """Translate ``text`` while keeping its leading/trailing whitespace."""
        content = text.strip()
        if not content:
            return text
        if require_letters and not any(ch.isalpha() for ch in content):
            return text

        leading_ws = text[:len(text) - len(text.lstrip())]
        trailing_ws = text[len(text.rstrip()):]
        translated = self.translate_text(content)
        return f"{leading_ws}{translated}{trailing_ws}"

    def preserve_element(self, elem):
        """Keep a preserved element untranslated, trimming surrounding whitespace.

        Note that the text is stripped, so leading/trailing whitespace of the
        original element is not retained.
        """
        if elem.text:
            elem.text = elem.text.strip()

    def update_language_attributes(self, root):
        """Set ``lang="de"`` on the root and on the first ``teiHeader`` found."""
        for elem in [root, root.find('.//teiHeader')]:
            if elem is not None:
                elem.set('lang', 'de')

    def write_xml(self, tree, output_path):
        """Write ``tree`` as UTF-8 XML with a declaration and explicit end tags."""
        tree.write(output_path,
                   encoding='utf-8',
                   xml_declaration=True,
                   method='xml',
                   short_empty_elements=False)

## Batch Process


In [None]:
def batch_translate_xml(input_dir, output_dir, translator=None):
    """Translate every ``*.xml`` file directly inside ``input_dir``.

    Args:
        input_dir: directory containing the source XML files.
        output_dir: directory for the translated files; created (including
            missing parents) if it does not exist.
        translator: optional object exposing ``process_xml(path, output_dir)``.
            When omitted, a default ``XMLTranslator()`` is created. Passing
            one lets callers reuse an already loaded model.

    Files are processed in sorted name order so the run is reproducible.
    """
    input_dir = Path(input_dir)
    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    # glob() order is filesystem-dependent; sort for a deterministic run.
    xml_files = sorted(input_dir.glob("*.xml"))
    print(f"Found {len(xml_files)} files to process")
    if not xml_files:
        # Nothing to do: skip loading the translation model.
        return

    if translator is None:
        translator = XMLTranslator()

    for idx, xml_file in enumerate(xml_files, 1):
        print(f"Processing file {idx}/{len(xml_files)}")
        translator.process_xml(xml_file, output_dir)

# Prerequisite: create an `input_xml` folder (relative to the working directory)
# and place the source `*-en.xml` files in it. The generated `*-de.xml` files
# are written to `output_xml`.
INPUT_DIR = Path("input_xml")
OUTPUT_DIR = Path("output_xml")
batch_translate_xml(INPUT_DIR, OUTPUT_DIR)

Device set to use cpu


Found 5 files to process
Processing file 1/5
Processed: jrcC2006#294#39-en.xml
Processing file 2/5
Processed: jrcC2006#294#40-en.xml
Processing file 3/5
Processed: jrcC2006#294#41-en.xml
Processing file 4/5
Processed: jrcC2006#294#42-en.xml
Processing file 5/5
Processed: jrcC2006#294#43-en.xml


# Evaluation

## Evaluator Class

In [None]:
# Install the evaluation dependencies (no versions pinned).
# NOTE(review): `load` used by TranslationEvaluator below is presumably
# `evaluate.load`, but no cell imports it - confirm and add the import.
!pip install evaluate pandas transformers sentencepiece unbabel-comet sacrebleu

def extract_text_from_xml_etree(filepath):
    """Return the stripped text of every <p> and <head> element in the file.

    Elements with no text, or only whitespace, are skipped. If the file is not
    well-formed XML, a message is printed and an empty list is returned.
    """
    wanted_tags = ('p', 'head')

    try:
        document_root = ET.parse(filepath).getroot()
    except ET.ParseError as e:
        print(f"❌ Error parsing XML file {filepath}: {e}")
        return []

    collected = []
    for node in document_root.iter():
        if node.tag not in wanted_tags:
            continue
        raw_text = node.text
        if not raw_text:
            continue
        cleaned = raw_text.strip()
        if cleaned:
            collected.append(cleaned)
    return collected


class TranslationEvaluator:
    """Score a translated XML file against a reference translation.

    Four metrics are computed per file and returned under the keys ``bleu``,
    ``meteor``, ``ter`` and ``comet_qe``. The ``comet_qe`` value comes from the
    ``'comet'`` metric called with predictions, references and sources; the
    key name is kept so existing reports and CSV columns stay compatible.
    """

    def __init__(self):
        """Load the four metric objects."""
        # NOTE(review): `load` is not imported in this cell; presumably it is
        # `evaluate.load` (the `evaluate` package is pip-installed above) -
        # confirm and import it explicitly.
        self.metrics = {
            'bleu': load('bleu'),
            'meteor': load('meteor'),
            'ter': load('ter'),
            'comet_qe': load('comet')
        }

    def evaluate_file(self, ref_path, hyp_path, scs_path):
        """Compute all metrics for one reference/hypothesis/source file triple.

        Args:
            ref_path: reference translation XML file.
            hyp_path: machine-translated (hypothesis) XML file.
            scs_path: source-language XML file.

        Returns:
            dict of metric name -> score, or ``{}`` when no segments could be
            aligned (one of the files yielded no text).
        """
        ref_lines = extract_text_from_xml_etree(ref_path)
        hyp_lines = extract_text_from_xml_etree(hyp_path)
        scs_lines = extract_text_from_xml_etree(scs_path)

        aligned_refs, aligned_hyps, aligned_scs = self._align_lines(ref_lines, hyp_lines, scs_lines)

        if not aligned_refs:
            print("⚠️ No aligned lines found. Skipping evaluation.")
            return {}

        return self._calculate_metrics(
            self._filter_none(aligned_hyps),
            self._filter_none(aligned_refs),
            self._filter_none(aligned_scs),
        )

    def _align_lines(self, ref_lines, hyp_lines, scs_lines):
        """Align segments by position, truncating all lists to the shortest."""
        min_len = min(len(ref_lines), len(hyp_lines), len(scs_lines))
        return ref_lines[:min_len], hyp_lines[:min_len], scs_lines[:min_len]

    def _filter_none(self, items):
        """Replace ``None`` entries with empty strings (list length unchanged)."""
        return [item if item is not None else "" for item in items]

    def _safe_compute(self, metric_key, score_key, failure_label, **compute_kwargs):
        """Run one metric and return its score, or ``None`` if it fails."""
        try:
            output = self.metrics[metric_key].compute(**compute_kwargs)
            return output.get(score_key, None)
        except Exception as e:
            print(f"{failure_label} computation failed: {e}")
            return None

    def _calculate_metrics(self, hypotheses, references, sources):
        """Compute BLEU, METEOR, TER and COMET for aligned segments.

        Args:
            hypotheses: list of translated segments.
            references: list of reference segments; each entry is either a
                string or a list whose first item is the reference string.
            sources: list of source-language segments.

        Returns:
            dict with keys ``bleu``, ``meteor``, ``ter`` and ``comet_qe``. A
            metric that fails is reported and stored as ``None``.
        """
        # COMET needs one plain string per segment. The previous code always
        # took `r[0]`, which cut string references down to their first
        # character (and raised IndexError on an empty reference).
        flat_references = [
            ref if isinstance(ref, str) else ref[0] for ref in references
        ]

        return {
            'bleu': self._safe_compute(
                'bleu', 'bleu', "BLEU metric",
                predictions=hypotheses, references=references),
            'meteor': self._safe_compute(
                'meteor', 'meteor', "METEOR metric",
                predictions=hypotheses, references=references),
            'ter': self._safe_compute(
                'ter', 'score', "TER metric",
                predictions=hypotheses, references=references),
            'comet_qe': self._safe_compute(
                'comet_qe', 'mean_score', "COMET QE",
                predictions=hypotheses, references=flat_references,
                sources=sources),
        }




## Batch Process

In [None]:
def generate_report(input_dir, output_dir, ref_dir, output_csv, evaluator=None):
    """Evaluate every translated XML file and save the scores as a CSV report.

    Args:
        input_dir: directory with the source ``*-en.xml`` files.
        output_dir: directory with the translated (hypothesis) ``*.xml`` files.
        ref_dir: directory with reference translations, matched to the
            hypothesis files by identical file name.
        output_csv: path of the CSV report; missing parent directories are
            created.
        evaluator: optional object exposing
            ``evaluate_file(ref_path, hyp_path, scs_path) -> dict``. When
            omitted, a default ``TranslationEvaluator()`` is created.

    Returns:
        pandas.DataFrame with one row per evaluated file (metric columns plus
        ``file``). Hypothesis files without a reference are skipped, and files
        whose evaluation raises are reported and skipped.
    """
    if evaluator is None:
        evaluator = TranslationEvaluator()
    results = []

    output_path = Path(output_dir)
    print(f"📂 Scanning: {output_path.absolute()}")

    # glob() order is filesystem-dependent; sort so report rows are stable.
    for hyp_file in sorted(output_path.glob('*.xml')):
        scs_file = Path(input_dir) / hyp_file.name.replace("-de.xml", "-en.xml")
        ref_file = Path(ref_dir) / hyp_file.name

        print(f"\n🔍 Processing:\n- Hypothesis: {hyp_file.name}\n- Reference: {ref_file.name}")

        if not ref_file.exists():
            print(f"⚠️ Missing reference file: {ref_file.name} - skipping")
            continue

        try:
            res = evaluator.evaluate_file(ref_file, hyp_file, scs_file)
            res['file'] = hyp_file.name
            results.append(res)
            print("✅ Metrics:", res)
        except Exception as e:
            print(f"❌ Error processing {hyp_file.name}: {e}")
            continue

    if not results:
        print("⚠️ No results collected - CSV will be empty")
        # Keep the expected header so the CSV can still be read back; a CSV
        # without any columns makes pandas.read_csv raise.
        df = pd.DataFrame(columns=['bleu', 'meteor', 'ter', 'comet_qe', 'file'])
    else:
        print(f"\n✅ Completed evaluations for {len(results)} files")
        df = pd.DataFrame(results)

    print("\n📊 Preview of results:")
    print(df.head())

    # Make sure the report directory exists before writing.
    Path(output_csv).parent.mkdir(parents=True, exist_ok=True)
    df.to_csv(output_csv, index=False)
    print(f"\n📁 Report saved to: {output_csv}")
    return df


# === Run the evaluation ===
# Create the report folder first; the CSV below is written inside it.
result_dir = Path('result')
result_dir.mkdir(exist_ok=True)

# All directories are relative to the current working directory.
generate_report(
    input_dir='input_xml',
    output_dir='output_xml',
    ref_dir='ref_xml',
    output_csv='result/translation_quality.csv'
)

# === Summary ===
# Re-read the saved report and print the mean of each metric column.
# BLEU, METEOR and the COMET column are formatted as percentages; TER is
# printed as a plain number with two decimals.
df = pd.read_csv('result/translation_quality.csv')
print("\n📈 Averages:")
print(f"Average BLEU: {df.bleu.mean():.2%}")
print(f"Average METEOR: {df.meteor.mean():.2%}")
print(f"Average TER: {df.ter.mean():.2f}")
print(f"Average COMET-QE: {df.comet_qe.mean():.2%}")

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


Fetching 5 files:   0%|          | 0/5 [00:00<?, ?it/s]

INFO:pytorch_lightning.utilities.migration.utils:Lightning automatically upgraded your loaded checkpoint from v1.8.3.post1 to v2.5.1. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint ../../../../root/.cache/huggingface/hub/models--Unbabel--wmt22-comet-da/snapshots/2760a223ac957f30acfb18c8aa649b01cf1d75f2/checkpoints/model.ckpt`
/usr/local/lib/python3.11/dist-packages/pytorch_lightning/core/saving.py:195: Found keys that are not in the model state dict but in the checkpoint: ['encoder.model.embeddings.position_ids']


📂 Scanning: /content/drive/MyDrive/AI6127_NLP_Project/output_xml

🔍 Processing:
- Hypothesis: jrcC2006#294#39-de.xml
- Reference: jrcC2006#294#39-de.xml


INFO:pytorch_lightning.utilities.rank_zero:You are using the plain ModelCheckpoint callback. Consider using LitModelCheckpoint which with seamless uploading to Model registry.
INFO:pytorch_lightning.utilities.rank_zero:GPU available: False, used: False
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs


✅ Metrics: {'bleu': 0.6059316133102173, 'meteor': np.float64(0.644120289279557), 'ter': 33.74233128834356, 'comet_qe': 0.4355567531152205, 'file': 'jrcC2006#294#39-de.xml'}

🔍 Processing:
- Hypothesis: jrcC2006#294#40-de.xml
- Reference: jrcC2006#294#40-de.xml


INFO:pytorch_lightning.utilities.rank_zero:You are using the plain ModelCheckpoint callback. Consider using LitModelCheckpoint which with seamless uploading to Model registry.
INFO:pytorch_lightning.utilities.rank_zero:GPU available: False, used: False
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs


✅ Metrics: {'bleu': 0.7249280994389855, 'meteor': np.float64(0.7681369234766364), 'ter': 21.782178217821784, 'comet_qe': 0.4439805012482863, 'file': 'jrcC2006#294#40-de.xml'}

🔍 Processing:
- Hypothesis: jrcC2006#294#41-de.xml
- Reference: jrcC2006#294#41-de.xml


INFO:pytorch_lightning.utilities.rank_zero:You are using the plain ModelCheckpoint callback. Consider using LitModelCheckpoint which with seamless uploading to Model registry.
INFO:pytorch_lightning.utilities.rank_zero:GPU available: False, used: False
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs


✅ Metrics: {'bleu': 0.49557979789980283, 'meteor': np.float64(0.6235598670066361), 'ter': 45.3551912568306, 'comet_qe': 0.4634612301985423, 'file': 'jrcC2006#294#41-de.xml'}

🔍 Processing:
- Hypothesis: jrcC2006#294#42-de.xml
- Reference: jrcC2006#294#42-de.xml


INFO:pytorch_lightning.utilities.rank_zero:You are using the plain ModelCheckpoint callback. Consider using LitModelCheckpoint which with seamless uploading to Model registry.
INFO:pytorch_lightning.utilities.rank_zero:GPU available: False, used: False
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs


✅ Metrics: {'bleu': 0.4143938102678174, 'meteor': np.float64(0.5742125676206209), 'ter': 52.49110320284698, 'comet_qe': 0.443631311780528, 'file': 'jrcC2006#294#42-de.xml'}

🔍 Processing:
- Hypothesis: jrcC2006#294#43-de.xml
- Reference: jrcC2006#294#43-de.xml


INFO:pytorch_lightning.utilities.rank_zero:You are using the plain ModelCheckpoint callback. Consider using LitModelCheckpoint which with seamless uploading to Model registry.
INFO:pytorch_lightning.utilities.rank_zero:GPU available: False, used: False
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs


✅ Metrics: {'bleu': 0.5231306545647392, 'meteor': np.float64(0.5938008609620914), 'ter': 45.588235294117645, 'comet_qe': 0.46549390469278606, 'file': 'jrcC2006#294#43-de.xml'}

✅ Completed evaluations for 5 files

📊 Preview of results:
       bleu    meteor        ter  comet_qe                    file
0  0.605932  0.644120  33.742331  0.435557  jrcC2006#294#39-de.xml
1  0.724928  0.768137  21.782178  0.443981  jrcC2006#294#40-de.xml
2  0.495580  0.623560  45.355191  0.463461  jrcC2006#294#41-de.xml
3  0.414394  0.574213  52.491103  0.443631  jrcC2006#294#42-de.xml
4  0.523131  0.593801  45.588235  0.465494  jrcC2006#294#43-de.xml

📁 Report saved to: result/translation_quality.csv

📈 Averages:
Average BLEU: 55.28%
Average METEOR: 64.08%
Average TER: 39.79
Average COMET-QE: 45.04%
