In [None]:
import os
import shutil
from bs4 import BeautifulSoup
from deep_translator import GoogleTranslator

In [None]:
# Shared translator instance used by translate_chunk (English -> Vietnamese).
translator = GoogleTranslator(source="en", target="vi")
# Maps an original text snippet to its translation; filled and read by
# translate_html so a snippet seen again is not translated a second time.
translation_cache = {}  
# Default maximum chunk size, in characters, for split_text_into_chunks.
# translate_chunk also uses half of this value as its "worth splitting" threshold.
MAX_CHUNK_SIZE = 4000

In [None]:
def split_text_into_chunks(text_list, max_size=MAX_CHUNK_SIZE):
    """Group texts into newline-joined chunks of at most ``max_size`` characters.

    Texts are kept in order and never split. The size of a chunk is the
    length of the joined string, i.e. it includes the ``"\\n"`` separators.

    Args:
        text_list: Iterable of strings to pack.
        max_size: Maximum length, in characters, of each joined chunk.

    Returns:
        A list of strings, each the ``"\\n"``-join of one or more input
        texts. A single text longer than ``max_size`` is emitted as a chunk
        of its own (and therefore exceeds the limit). No empty chunk is ever
        produced by flushing an empty buffer.
    """
    chunks = []
    current_chunk = []
    current_size = 0
    for text in text_list:
        # Joining costs one extra "\n" for every text after the first.
        added_size = len(text) + (1 if current_chunk else 0)
        # Only flush a non-empty buffer; flushing an empty one would emit "".
        if current_chunk and current_size + added_size > max_size:
            chunks.append("\n".join(current_chunk))
            current_chunk = []
            current_size = 0
            added_size = len(text)
        current_chunk.append(text)
        current_size += added_size

    if current_chunk:
        chunks.append("\n".join(current_chunk))

    return chunks

def translate_chunk(chunk, max_attempts=3):
    """Translate a newline-joined chunk, retrying and splitting on failure.

    Args:
        chunk: Text to translate; lines are separated by ``"\\n"``.
        max_attempts: How many failed attempts (exception or empty result)
            are tolerated before giving up. Also applied to each half when
            the chunk is split.

    Returns:
        The translated text, or ``chunk`` unchanged when there is nothing to
        translate or every attempt failed.
    """
    # Nothing to translate: skip the request (and the retries) entirely.
    if not chunk.strip():
        return chunk

    text_lines = chunk.split("\n")
    attempt = 0
    while attempt < max_attempts:
        try:
            translated_chunk = translator.translate(chunk)
        except Exception as e:  # best-effort: any translator failure is retried
            print(f"Warning: Failed to translate chunk. Error: {e}")
            # Split only when there are at least two lines. With a single
            # line the "second half" would be the whole chunk again, and a
            # persistent error would recurse without bound.
            if len(chunk) > MAX_CHUNK_SIZE // 2 and len(text_lines) > 1:
                mid = len(text_lines) // 2
                first_half = "\n".join(text_lines[:mid])
                second_half = "\n".join(text_lines[mid:])
                print("  Splitting the chunk into smaller parts...")
                return (
                    translate_chunk(first_half, max_attempts)
                    + "\n"
                    + translate_chunk(second_half, max_attempts)
                )
        else:
            if translated_chunk:
                return translated_chunk
            print("Warning: Translation returned empty for a chunk. Retrying...")
        attempt += 1

    print("Warning: Maximum attempts reached. Keeping original text for this chunk.")
    return chunk

def translate_html(html_content):
    """Translate the visible text of an HTML document and return the new HTML.

    Text nodes whose direct parent is ``script``, ``style``, ``meta``,
    ``link``, ``a`` or ``code`` are left alone, as are comments, doctypes and
    other special string nodes. Each remaining non-blank text node is
    translated (through ``translation_cache`` when already known) and
    written back with its original leading and trailing whitespace.

    Line breaks inside a text node are collapsed to single spaces before
    translation, because texts are sent to the translator as one line each.

    Args:
        html_content: The HTML document as a string.

    Returns:
        The serialized document. If the translation step raises, a warning
        is printed and nodes not yet replaced keep their original text.
    """
    # Local import so this function does not depend on edits to the file's
    # import block; bs4 is already a dependency of this file.
    from bs4.element import PreformattedString

    soup = BeautifulSoup(html_content, 'html.parser')
    skipped_parents = ('script', 'style', 'meta', 'link', 'a', 'code')

    # One entry per node to translate:
    # (node, leading whitespace, one-line translation key, trailing whitespace)
    targets = []
    for element in soup.find_all(string=True):
        # Comment, Doctype, CData, ... are strings too; replacing them with
        # translated plain text would corrupt the document structure.
        if isinstance(element, PreformattedString):
            continue
        if element.parent.name in skipped_parents:
            continue
        raw_text = str(element)
        if not raw_text.strip():
            continue
        # The key must be a single line so that one input line maps to one
        # output line when chunks are joined with "\n".
        key = " ".join(
            line.strip() for line in raw_text.strip().splitlines() if line.strip()
        )
        leading = raw_text[:len(raw_text) - len(raw_text.lstrip())]
        trailing = raw_text[len(raw_text.rstrip()):]
        targets.append((element, leading, key, trailing))

    if targets:
        try:
            # dict.fromkeys de-duplicates while keeping first-seen order.
            to_translate = list(dict.fromkeys(
                key for _, _, key, _ in targets if key not in translation_cache
            ))

            for chunk in split_text_into_chunks(to_translate):
                if not chunk:
                    continue
                source_lines = chunk.split("\n")
                translated_lines = translate_chunk(chunk).split("\n")
                if len(translated_lines) != len(source_lines):
                    # The translator merged or split lines, so positions no
                    # longer correspond. Redo this chunk one text at a time
                    # rather than caching shifted translations.
                    translated_lines = [
                        translate_chunk(line).replace("\n", " ")
                        for line in source_lines
                    ]
                for original, translated in zip(source_lines, translated_lines):
                    # Never cache an empty translation; keep the source text.
                    translation_cache[original] = translated.strip() or original

            for element, leading, key, trailing in targets:
                translated_text = translation_cache.get(key)
                if translated_text is None:
                    continue  # no translation available: leave node untouched
                element.replace_with(leading + translated_text + trailing)
        except Exception as e:
            print(f"Warning: Translation process failed completely. Keeping original text. Error: {e}")

    return str(soup)

def process_folder(input_folder, output_folder):
    """Recursively translate HTML files and copy all other files.

    The directory tree under ``input_folder`` is mirrored under
    ``output_folder``. Files ending in ``.html`` or ``.htm`` (any case) are
    read as UTF-8, passed through ``translate_html`` and written as UTF-8;
    every other file is copied as-is.

    Args:
        input_folder: Root of the tree to read.
        output_folder: Root of the tree to write; created if missing.

    Notes:
        * If ``output_folder`` lies inside ``input_folder`` it is skipped
          during the walk, so earlier output is not processed again.
        * An HTML file that cannot be decoded as UTF-8 is reported and copied
          unchanged instead of aborting the whole run.
    """
    html_suffixes = ('.html', '.htm')
    output_root = os.path.abspath(output_folder)

    for root, dirs, files in os.walk(input_folder):
        # Prune the output folder in place so os.walk never descends into it.
        dirs[:] = [
            d for d in dirs
            if os.path.abspath(os.path.join(root, d)) != output_root
        ]

        relative_path = os.path.relpath(root, input_folder)
        output_path = os.path.join(output_folder, relative_path)
        os.makedirs(output_path, exist_ok=True)

        print(f"Processing folder: {relative_path}")

        for file in files:
            input_file_path = os.path.join(root, file)
            output_file_path = os.path.join(output_path, file)

            if not file.lower().endswith(html_suffixes):
                shutil.copy(input_file_path, output_file_path)
                continue

            print(f"  Translating: {file}")
            try:
                with open(input_file_path, 'r', encoding='utf-8') as f:
                    html_content = f.read()
            except UnicodeDecodeError as e:
                print(f"  Warning: {file} is not valid UTF-8; copying it unchanged. Error: {e}")
                shutil.copy(input_file_path, output_file_path)
                continue

            translated_html = translate_html(html_content)

            with open(output_file_path, 'w', encoding='utf-8') as f:
                f.write(translated_html)


In [None]:
# Set both paths before running this cell.
input_folder = ""  
output_folder = ""  

# os.walk silently yields nothing for a missing or empty path, which would
# otherwise end in a misleading "completed successfully" message.
if not input_folder or not output_folder:
    raise ValueError("Set input_folder and output_folder before running the translation.")
if not os.path.isdir(input_folder):
    raise FileNotFoundError(f"Input folder does not exist: {input_folder}")

print("\nStarting translation process...\n")
process_folder(input_folder, output_folder)
print("\nTranslation completed successfully!")