In [1]:
# Install required packages if not installed
# pip install transformers torch sentencepiece python-docx PyPDF2

import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from docx import Document
import PyPDF2

# -------------------------------
# Step 1: Load multilingual model once
# -------------------------------
# Checkpoint identifier handed to both from_pretrained() calls below.
model_name = "facebook/m2m100_418M"
# Tokenizer and model are created once at module level; translate_batch()
# reads these globals instead of reloading them on every call.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

# Use the CUDA device when torch reports one, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Move the model to the chosen device; translate_batch() moves its inputs
# to the same device before calling generate().
model.to(device)

# -------------------------------
# Step 2: Document reading functions
# -------------------------------
def read_docx(file_path):
    """Return the non-empty paragraphs of a .docx file as stripped strings.

    Args:
        file_path: Path handed to ``Document()``.

    Returns:
        A list with one whitespace-stripped string per paragraph, in document
        order; paragraphs that are empty after stripping are left out.
    """
    stripped_paragraphs = (
        paragraph.text.strip() for paragraph in Document(file_path).paragraphs
    )
    return [text for text in stripped_paragraphs if text]

def read_pdf(file_path):
    """Return the text of each PDF page as one stripped string per page.

    Args:
        file_path: Path handed to ``PyPDF2.PdfReader``.

    Returns:
        A list of whitespace-stripped page texts in page order; pages whose
        extracted text is empty after stripping are left out.
    """
    reader = PyPDF2.PdfReader(file_path)
    # One entry per page: the whole page becomes a single translation segment.
    page_texts = [page.extract_text() for page in reader.pages]
    return [text.strip() for text in page_texts if text.strip()]

def read_txt(file_path):
    """Return the non-blank lines of a UTF-8 text file, whitespace-stripped.

    Args:
        file_path: Path of the text file to read.

    Returns:
        A list with one stripped string per non-blank line, in file order.
    """
    # "utf-8-sig" decodes plain UTF-8 unchanged but also drops a leading
    # byte-order mark. With plain "utf-8" the BOM stays glued to the first
    # line (str.strip() does not treat U+FEFF as whitespace) and would be
    # passed on to the translator.
    with open(file_path, "r", encoding="utf-8-sig") as f:
        stripped_lines = (line.strip() for line in f)
        return [line for line in stripped_lines if line]

# -------------------------------
# Step 3: Batch translation function
# -------------------------------
def translate_batch(texts, target_lang="ta", batch_size=8, src_lang="en"):
    """Translate a sequence of strings with the module-level model and tokenizer.

    Args:
        texts: Strings to translate; each one is tokenized as a single input.
        target_lang: Language code resolved with ``tokenizer.get_lang_id()``
            and forced as the first generated token.
        batch_size: Number of strings sent to ``model.generate()`` per call.
            Must be at least 1.
        src_lang: Language code of the input text, assigned to
            ``tokenizer.src_lang``. Defaults to ``"en"``, the value that was
            previously hard-coded.

    Returns:
        The translated strings, in the same order as ``texts``. An empty
        input yields an empty list without touching the model.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1. A negative value
            used to produce an empty result silently, dropping all text.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    texts = list(texts)
    if not texts:
        return []

    tokenizer.src_lang = src_lang
    # The target language id is the same for every batch; look it up once.
    target_lang_id = tokenizer.get_lang_id(target_lang)

    translated_texts = []
    for start in range(0, len(texts), batch_size):
        batch = texts[start:start + batch_size]
        # truncation=True cuts inputs at 512 tokens, so the tail of a very
        # long segment is not translated.
        inputs = tokenizer(
            batch,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=512,
        ).to(device)
        generated_tokens = model.generate(
            **inputs,
            forced_bos_token_id=target_lang_id,
            max_length=600,
        )
        translated_texts.extend(
            tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)
        )
    return translated_texts

# -------------------------------
# Step 4: Translate entire document
# -------------------------------
def translate_document(file_path, target_languages=("ta", "fr", "es"), file_type="docx"):
    """Read a document and translate its text segments into each target language.

    Args:
        file_path: Path of the document to read.
        target_languages: Iterable of target language codes. A single code
            given as a plain string is treated as one language rather than
            being iterated character by character. The default is an
            immutable tuple so it cannot be altered between calls.
        file_type: One of ``"docx"``, ``"pdf"`` or ``"txt"``; selects the
            reader function used for ``file_path``.

    Returns:
        A dict mapping each language code to the list of translated segments
        returned by ``translate_batch()`` for that language.

    Raises:
        ValueError: If ``file_type`` is not one of the supported values.
    """
    if isinstance(target_languages, str):
        target_languages = [target_languages]

    # Read the document once; the same segments are reused for every language.
    if file_type == "docx":
        texts = read_docx(file_path)
    elif file_type == "pdf":
        texts = read_pdf(file_path)
    elif file_type == "txt":
        texts = read_txt(file_path)
    else:
        raise ValueError("Unsupported file type")

    return {
        lang: translate_batch(texts, target_lang=lang)
        for lang in target_languages
    }

# -------------------------------
# Step 5: Save translated documents (Optional)
# -------------------------------
def save_translations(translations, output_prefix="translated_doc"):
    """Write each language's translated segments to its own .docx file.

    Args:
        translations: Mapping of language code to a list of translated
            strings; each string becomes one paragraph.
        output_prefix: Leading part of every output file name; files are
            saved as ``<output_prefix>_<lang>.docx``.
    """
    for lang_code, paragraphs in translations.items():
        out_doc = Document()
        for paragraph_text in paragraphs:
            out_doc.add_paragraph(paragraph_text)
        output_path = f"{output_prefix}_{lang_code}.docx"
        out_doc.save(output_path)

# -------------------------------
# Example Usage
# -------------------------------
if __name__ == "__main__":
    # Demo run: translate one .docx into Tamil ("ta"), French ("fr") and
    # Spanish ("es"), then write one file per language named
    # insurance_translated_<lang>.docx.
    file_path = "insu_doc.docx"  # Replace with your file
    translations = translate_document(file_path, target_languages=["ta", "fr", "es"], file_type="docx")
    save_translations(translations, output_prefix="insurance_translated")
    # NOTE(review): the message hard-codes the language names and must be kept
    # in sync with target_languages above by hand.
    print("Translation completed for Tamil, French, Spanish.")


tokenizer_config.json:   0%|          | 0.00/298 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


config.json:   0%|          | 0.00/908 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


sentencepiece.bpe.model:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


pytorch_model.bin:   0%|          | 0.00/1.94G [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/1.94G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/233 [00:00<?, ?B/s]

Translation completed for Tamil, French, Spanish.
