In [None]:
# Install required packages when running the notebook standalone.
# Uncomment the next line to install them into the kernel's environment:
# %pip install transformers sentencepiece sacremoses

Collecting sentencepiece
  Downloading sentencepiece-0.2.1-cp311-cp311-win_amd64.whl.metadata (10 kB)
Collecting sacremoses
  Downloading sacremoses-0.1.1-py3-none-any.whl.metadata (8.3 kB)
Downloading sentencepiece-0.2.1-cp311-cp311-win_amd64.whl (1.1 MB)
   ---------------------------------------- 0.0/1.1 MB ? eta -:--:--
   ---------------------------------------- 0.0/1.1 MB ? eta -:--:--
   ---------------------------------------- 0.0/1.1 MB ? eta -:--:--
   --------- ------------------------------ 0.3/1.1 MB ? eta -:--:--
   ------------------- -------------------- 0.5/1.1 MB 2.4 MB/s eta 0:00:01
   ------------------- -------------------- 0.5/1.1 MB 2.4 MB/s eta 0:00:01
   ------------------- -------------------- 0.5/1.1 MB 2.4 MB/s eta 0:00:01
   ------------------- -------------------- 0.5/1.1 MB 2.4 MB/s eta 0:00:01
   ----------------------------- ---------- 0.8/1.1 MB 524.3 kB/s eta 0:00:01
   ----------------------------- ---------- 0.8/1.1 MB 524.3 kB/s eta 0:00:01
   ----



In [1]:
from __future__ import annotations

from dataclasses import dataclass
from typing import Dict, Optional
from abc import ABC, abstractmethod
import logging
from functools import lru_cache

from transformers import pipeline

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
@dataclass
class TranslationResult:
    """Outcome of a single translation request.

    Attributes:
        source_text: The text that was submitted for translation.
        translated_text: The text returned for that request.
        source_lang: Language code associated with ``source_text``.
        target_lang: Language code associated with ``translated_text``.
        provider: Identifier of whatever produced the result.
    """

    source_text: str
    translated_text: str
    source_lang: str
    target_lang: str
    provider: str

    def to_dict(self) -> Dict[str, str]:
        """Return the five result fields as a plain dict keyed by field name."""
        # Explicit tuple (rather than introspection) keeps the key set and
        # ordering fixed even if a subclass adds more fields.
        exported = (
            "source_text",
            "translated_text",
            "source_lang",
            "target_lang",
            "provider",
        )
        return {field_name: getattr(self, field_name) for field_name in exported}

In [3]:
class BaseTranslator(ABC):
    """Interface that every translation provider implements.

    The class cannot be instantiated directly because :meth:`translate` is
    abstract; concrete providers subclass it and override that method.
    """

    @abstractmethod
    def translate(
        self,
        text: str,
        source_lang: str,
        target_lang: str,
    ) -> TranslationResult:
        """Translate ``text`` from ``source_lang`` into ``target_lang``.

        Returns:
            A ``TranslationResult`` describing the translation.

        Raises:
            NotImplementedError: Always, when this base implementation itself
                is invoked (for example via ``super().translate(...)``).
        """
        raise NotImplementedError()

class HuggingFaceTranslator(BaseTranslator):
    """Translate text using Helsinki-NLP opus-mt models from Hugging Face.

    One ``transformers`` translation pipeline is created per language pair the
    first time that pair is requested and is then reused by later calls on the
    same instance.

    Args:
        model_template: ``str.format`` template containing ``{src}`` and
            ``{tgt}`` placeholders; it is filled with the normalized language
            codes to obtain the model name.
        max_length: Passed as ``max_length`` when each pipeline is created.
    """

    def __init__(self, model_template: str = "Helsinki-NLP/opus-mt-{src}-{tgt}", max_length: int = 400):
        self.model_template = model_template
        self.max_length = max_length
        # Loaded pipelines keyed by "<src>-<tgt>" (normalized codes).
        # NOTE(review): `pipeline` is the factory function rather than a class;
        # the values stored here are whatever objects that factory returns.
        self._pipelines: Dict[str, pipeline] = {}

    def _normalize(self, lang: str) -> str:
        """Return ``lang`` with surrounding whitespace removed, lower-cased.

        Raises:
            ValueError: If ``lang`` is not a string, is empty, or contains
                only whitespace.
        """
        # A plain truthiness check would let " " through and later produce a
        # malformed model name, so validate the stripped value instead.
        if not isinstance(lang, str):
            raise ValueError("Language codes must be provided")
        cleaned = lang.strip()
        if not cleaned:
            raise ValueError("Language codes must be provided")
        return cleaned.lower()

    def _model_name_for(self, source_lang: str, target_lang: str) -> str:
        """Fill ``model_template`` with the normalized source/target codes."""
        return self.model_template.format(src=self._normalize(source_lang), tgt=self._normalize(target_lang))

    def _get_pipeline(self, source_lang: str, target_lang: str):
        """Return the cached pipeline for a language pair, creating it if needed."""
        key = f"{self._normalize(source_lang)}-{self._normalize(target_lang)}"
        if key not in self._pipelines:
            model_name = self._model_name_for(source_lang, target_lang)
            logging.info("Loading translation model %s", model_name)
            self._pipelines[key] = pipeline("translation", model=model_name, max_length=self.max_length)
        return self._pipelines[key]

    def translate(self, text: str, source_lang: str, target_lang: str) -> TranslationResult:
        """Translate ``text`` from ``source_lang`` to ``target_lang``.

        When both language codes normalize to the same value the text is
        returned unchanged with ``provider="noop"`` and no model is loaded.
        Otherwise the pipeline for the pair is run and the ``"translation_text"``
        entry of its first result is used; if that key is absent the original
        text is used instead.

        Raises:
            ValueError: If ``text`` is not a string or is empty/whitespace-only,
                or if either language code is missing or blank.
        """
        # isinstance guard: a non-string truthy value would otherwise fail on
        # `.strip()` with an unrelated AttributeError.
        if not isinstance(text, str) or not text.strip():
            raise ValueError("Text to translate must be a non-empty string")
        normalized_source = self._normalize(source_lang)
        normalized_target = self._normalize(target_lang)
        if normalized_source == normalized_target:
            return TranslationResult(
                source_text=text,
                translated_text=text,
                source_lang=normalized_source,
                target_lang=normalized_target,
                provider="noop",
            )
        translator_pipeline = self._get_pipeline(normalized_source, normalized_target)
        result = translator_pipeline(text)[0]
        translated_text = result.get("translation_text", text)
        return TranslationResult(
            source_text=text,
            translated_text=translated_text,
            source_lang=normalized_source,
            target_lang=normalized_target,
            provider="huggingface",
        )

In [4]:
class TranslationService:
    """Facade that routes translation requests to a named provider.

    Args:
        translators: Mapping of provider name to translator. When omitted (or
            empty) a single ``"huggingface"`` entry backed by
            ``HuggingFaceTranslator`` is used.
        default_provider: Provider name used when a call does not specify one.
        language_detector: Optional object used by
            :meth:`detect_and_translate_auto`; it is called as
            ``detect_from_text(text)`` and the result's ``language_code``
            attribute is read.
        internal_language: Language code used as the "internal" side of
            :meth:`translate_to_internal_language` and
            :meth:`translate_to_user_language` when none is passed.
    """

    def __init__(
        self,
        translators: Optional[Dict[str, BaseTranslator]] = None,
        default_provider: str = "huggingface",
        language_detector: Optional[object] = None,
        internal_language: str = "en",
    ):
        if not translators:
            translators = {"huggingface": HuggingFaceTranslator()}
        self.translators = translators
        self.default_provider = default_provider
        self.language_detector = language_detector
        self.internal_language = internal_language

    @staticmethod
    def _passthrough(text: str, source_lang: str, target_lang: str) -> TranslationResult:
        """Build the ``provider="noop"`` result that returns ``text`` untouched."""
        return TranslationResult(
            source_text=text,
            translated_text=text,
            source_lang=source_lang,
            target_lang=target_lang,
            provider="noop",
        )

    def translate(
        self, text: str, source_lang: str, target_lang: str, provider: Optional[str] = None
    ) -> TranslationResult:
        """Translate with ``provider``, or with the default provider if falsy.

        Raises:
            ValueError: If the selected provider name is not registered.
        """
        chosen = provider if provider else self.default_provider
        if chosen in self.translators:
            backend = self.translators[chosen]
            return backend.translate(text, source_lang, target_lang)
        raise ValueError(f"Unknown translation provider: {chosen}")

    def translate_to_internal_language(
        self, text: str, detected_lang: str, internal_lang: Optional[str] = None
    ) -> TranslationResult:
        """Translate ``text`` from ``detected_lang`` into the internal language.

        Both codes are lower-cased first. If they are equal, a ``"noop"``
        result is returned without calling any translator; otherwise the
        default provider is used.
        """
        internal_target = (internal_lang if internal_lang else self.internal_language).lower()
        origin = detected_lang.lower()
        if origin != internal_target:
            return self.translate(text, origin, internal_target)
        return self._passthrough(text, origin, internal_target)

    def translate_to_user_language(
        self, text: str, user_lang: str, internal_lang: Optional[str] = None
    ) -> TranslationResult:
        """Translate ``text`` from the internal language into ``user_lang``.

        Both codes are lower-cased first. If they are equal, a ``"noop"``
        result is returned without calling any translator; otherwise the
        default provider is used.
        """
        origin = (internal_lang if internal_lang else self.internal_language).lower()
        destination = user_lang.lower()
        if origin != destination:
            return self.translate(text, origin, destination)
        return self._passthrough(text, origin, destination)

    def detect_and_translate_auto(
        self, text: str, target_lang: str, provider: Optional[str] = None
    ) -> TranslationResult:
        """Detect the language of ``text`` and translate it into ``target_lang``.

        Raises:
            RuntimeError: If no (truthy) language detector was configured.
        """
        detector = self.language_detector
        if not detector:
            raise RuntimeError("No language detector configured for automatic translation")
        detected_code = detector.detect_from_text(text).language_code
        return self.translate(text, detected_code, target_lang, provider=provider)

In [5]:
# Basic smoke tests for the translation service.
# Each check asserts on the result so a broken translation path fails the cell
# instead of relying on someone reading the printed output.
# NOTE(review): the first call per language pair loads a model (presumably
# fetching it if it is not already cached), so this cell can be slow.
translation_service = TranslationService()

sample_en_text = "Hello world, this is PolyLingua in action."
result_en_es = translation_service.translate(sample_en_text, "en", "es")
assert result_en_es.provider == "huggingface"
assert (result_en_es.source_lang, result_en_es.target_lang) == ("en", "es")
assert result_en_es.source_text == sample_en_text
assert result_en_es.translated_text.strip(), "en->es translation came back empty"
print("English to Spanish:", result_en_es.translated_text)

sample_es_text = "Hola mundo, esto es PolyLingua en acción."
result_es_en = translation_service.translate(sample_es_text, "es", "en")
assert result_es_en.provider == "huggingface"
assert (result_es_en.source_lang, result_es_en.target_lang) == ("es", "en")
assert result_es_en.source_text == sample_es_text
assert result_es_en.translated_text.strip(), "es->en translation came back empty"
print("Spanish to English:", result_es_en.translated_text)

Device set to use cuda:0


English to Spanish: Hola mundo, esto es PolyLingua en acción.


Device set to use cuda:0


Spanish to English: Hello world, this is PolyLingua in action.
