import environment

In [6]:
import asyncio
import shutil
import json
import re
import tempfile
from django.conf import settings
from semantic_kernel import Kernel
from semantic_kernel.connectors.ai.open_ai import (
    OpenAIChatCompletion,
    AzureChatCompletion,
)
from semantic_kernel.connectors.ai.prompt_execution_settings import (
    PromptExecutionSettings,
)
from semantic_kernel.contents.chat_history import ChatHistory
from semantic_kernel.core_plugins import ConversationSummaryPlugin
from semantic_kernel.prompt_template.input_variable import InputVariable
from semantic_kernel.prompt_template.prompt_template_config import PromptTemplateConfig
import hashlib
import os
import yaml
from urllib.parse import urlparse

async function translate read content from local file path, translate .md file content.

In [33]:
DEPLOYMENT_NAME_TEXT_TRANSLATION="yb521"
ENDPOINT_TEXT_TRANSLATION="https://yb521.openai.azure.com/"
API_KEY_TEXT_TRANSLATION="d72c1040736a4996ab0a244a8d28ff4c"

FONT_PATH="./NotoSans-Medium.ttf"
FONT_FOLDER_PATH="fonts"

async def translate(output_lang, input_file, output_file):
    def generate_prompt(output_lang, document_chunk):
        
        repo_root = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
        with open(os.path.join(repo_root, "font_language_mappings.yml"), "r") as file:
            mappings = yaml.safe_load(file)
        
        
        is_rtl = mappings.get(output_lang, {}).get('rtl', False)    

        if len(document_chunk.split("\n")) == 1:
            prompt = f"Translate the following text to {output_lang}. NEVER ADD ANY EXTRA CONTENT OUTSIDE THE TRANSLATION. TRANSLATE ONLY WHAT IS GIVEN TO YOU."

        else: 
            prompt = f"""
Translate the following markdown file to {output_lang}.
Make sure the translation does not sound too literal. Make sure you translate comments as well.
Do not translate any entities, such as variable names, function names, or class names, but keep them in the file.
Do not translate any urls or paths, but keep them in the file.
"""
        if is_rtl:
            prompt += "Please write the output from right to left, respecting that this is a right-to-left language.\n"
        else:
            prompt += "Please write the output from left to right.\n"

        prompt += "\n" + document_chunk

        return prompt

    async def run_prompt(prompt, thread_count, i):
        print(f"thread {i}/{thread_count}")
        prompt_template_config = PromptTemplateConfig(
            template=prompt,
            name="translate",
            description="Translate a text to another language",
            template_format="semantic-kernel",
            execution_settings=req_settings,
        )

        function = kernel.add_function(
            function_name="translate_function",
            plugin_name="translate_plugin",
            prompt_template_config=prompt_template_config,
        )

        result = await kernel.invoke(function)
        return result

    def update_image_paths(md_file_path):
        with open(md_file_path, "r") as file:
            content = file.read()

        pattern = r"(!\[.*?\]\()(\S+?)(\))"

        def add_parent_directory(match):
            return f"{match.group(1)}../{match.group(2)}{match.group(3)}"

        updated_content = re.sub(pattern, add_parent_directory, content)

        with open(md_file_path, "w") as file:
            file.write(updated_content)

    kernel = Kernel()

    service_id = "chat-gpt"


    # credentials    
    deployment_name = DEPLOYMENT_NAME_TEXT_TRANSLATION
    endpoint = ENDPOINT_TEXT_TRANSLATION
    api_key = API_KEY_TEXT_TRANSLATION

    kernel.add_service(
        AzureChatCompletion(
            service_id=service_id,
            deployment_name=deployment_name,
            endpoint=endpoint,
            api_key=api_key,
        )
    )

    # Define the request settings
    req_settings = kernel.get_prompt_execution_settings_from_service_id(service_id)
    req_settings.max_tokens = 4096
    req_settings.temperature = 0.7
    req_settings.top_p = 0.8

    with open(input_file, "r") as file:
        document = file.read().strip()

    if not document:
        # If document is empty, just copy the input file to output
        shutil.copyfile(input_file, output_file)
        return

    # Check if there is only one line in the document
    if document.count('\n') == 0:
        # Generate prompt for single line translation
        prompt = f"Translate the following text to {output_lang}. NEVER ADD ANY EXTRA CONTENT OUTSIDE THE TRANSLATION. TRANSLATE ONLY WHAT IS GIVEN TO YOU. MAINTAIN MARKDOWN FORMAT\n\n{document}"

        result = await run_prompt(prompt, 1, 1)

        with open(output_file, "w") as text_file:
            text_file.write(str(result))

        return

    # Split document into chunks (not needed in single line scenario)
    document_chunks = [document]
    
    prompts = [
        generate_prompt(output_lang=output_lang, document_chunk=document_chunk)
        for document_chunk in document_chunks
    ]

    with open("prompts.md", "w") as text_file:
        for i, prompt in enumerate(prompts):
            text_file.write(f"-------------- Prompt {i+1} ---------------\n")
            text_file.write(prompt)
            text_file.write("\n\n")

    thread_count = len(prompts)
    results = await asyncio.gather(*[run_prompt(prompt, thread_count, i+1) for i, prompt in enumerate(prompts)])

    with open(output_file, "w") as text_file:
        for result in results:
            text_file.write(str(result))
            text_file.write("\n")
    
        # Add Disclaimer
        text_file.write("\n\n")
        disclaimer_prompt = f""" Translate the following text to {output_lang}.

        Disclaimer: The translation was translated from its original by an AI model and may not be perfect. 
        Please review the output and make any necessary corrections."""
        disclaimer = await run_prompt(disclaimer_prompt, 'disclaimer prompt', 1)
        text_file.write(str(disclaimer))


here a example shows how the function works, it translate the original text into chinese.

In [11]:

original_text = "hello world, this is text before translation"
language = "chinese"



# Create a temporary input file
with tempfile.NamedTemporaryFile(
    delete=False, suffix=".md", mode="w"
) as temp_input_file:
    temp_input_file_path = temp_input_file.name
    temp_input_file.write(original_text)

# Create a temporary output file path
temp_output_file_path = tempfile.mktemp(suffix=".md")

# Call the translate function
await translate(language, temp_input_file_path, temp_output_file_path)

# Read the translated content from the temporary output file
with open(temp_output_file_path, "r") as temp_output_file:
    translated_content = temp_output_file.read()
    # translated_content = update_image_link(md_file_path, translated_content, language_code, docs_dir)



print("Translated content:", translated_content)


thread 1/1
Translated content: 你好世界，这是翻译前的文本。


### Link Update

link in markdown files needs to be handled. update_image_link provide new link point to translated images, and get_unique_id calculate hash value base on image's absolute path, to avoid filename conflict

In [27]:
SUPPORTED_IMAGE_EXTENSIONS = (".png", ".jpg", ".jpeg", ".gif", ".bmp", ".tiff", ".webp")

def update_image_link(md_file_path, markdown_string, language_code, docs_dir):
    print("UPDATING IMAGE LINKS")
    pattern = r'!\[(.*?)\]\((.*?)\)'  # Capture both alt text and link
    matches = re.findall(pattern, markdown_string)

    print(f"matches: {matches}")
    for alt_text, link in matches:
        parsed_url = urlparse(link)
        if parsed_url.scheme in ('http', 'https'):
            print(f"skipped {link} as it is a URL")
            continue  # Skip web URLs

        original_filename = os.path.basename(link).split('.')[0]
        file_ext = os.path.basename(link).split('.')[-1]

        print(f"link: {link}, original_filename: {original_filename}, file_ext: {file_ext}")
        print("#docs_dir:", docs_dir, "Doc?", link.startswith(f'{docs_dir}/'))

        if ("."+file_ext.lower()) in SUPPORTED_IMAGE_EXTENSIONS:

            if link.startswith(f'{docs_dir}'): # is a docs image
                # translated_folder_absolute = os.path.join(docs_dir, "translated_images")
                # count how many levels to go up
                rel_levels = os.path.relpath(os.path.dirname(md_file_path), docs_dir).count(os.path.sep) + 1
                # Create the relative path to the image directory
                translated_folder = ('../' * rel_levels) + 'translated_images'
            else:  # is an a readme image
                # translated_folder = os.path.join(os.path.dirname(link), "translations", "translated_images")
                translated_folder = "./translated_images"
            
            # print("###HASH in UPDATING LINKS:", f"{link} in {language_code}")
            print("###FINDING IMAGE HASH NAME FOR MARKDOWN")
            # hash = get_unique_id(f"{link}.{language_code}")
            md_file_dir = os.path.dirname(md_file_path)
            actual_image_path = os.path.normpath(os.path.join(md_file_dir, link))
            hash = get_unique_id(actual_image_path)
            new_filename = f"{original_filename}.{hash}.{language_code}.{file_ext}"
            updated_link = os.path.join(translated_folder, new_filename)
            if (not updated_link.startswith("/")) and (not updated_link.startswith(".")):
                updated_link = "/" + updated_link
                
            print(f"updated_link: {updated_link}")
            new_image_markup = f'![{alt_text}]({updated_link})'
            markdown_string = re.sub(rf'!\[{re.escape(alt_text)}\]\({re.escape(link)}\)', new_image_markup, markdown_string)
           
    return markdown_string


def get_unique_id(file_path):
    # Convert the file path to bytes
    file_path_bytes = file_path.encode('utf-8')
    
    # Create a SHA-256 hash object
    hash_object = hashlib.sha256()
    
    # Update the hash object with the bytes of the file path
    hash_object.update(file_path_bytes)
    
    # Generate the hexadecimal digest
    unique_identifier = hash_object.hexdigest()
    print("###HASH in GET UNIQUE ID for:", file_path, " HASH=", unique_identifier)
    return unique_identifier

Here's a example of how links are updated

In [30]:
string = "here's image of a cat: ![image](/docs/cat1.png) and image of a cat ![image](docs/cat1.png)"
ans = update_image_link(
    "/docs/test.md", 
    string,
    "zh",
    "/docs"
)
print(ans)

UPDATING IMAGE LINKS
matches: [('image', '/docs/cat1.png'), ('image', 'docs/cat1.png')]
link: /docs/cat1.png, original_filename: cat1, file_ext: png
#docs_dir: /docs Doc? True
###FINDING IMAGE HASH NAME FOR MARKDOWN
###HASH in GET UNIQUE ID for: \docs\cat1.png  HASH= 24c104f9950fcf0665017c035e5ce1abe77c35d1c369c0d722dd044dcf176270
updated_link: ../translated_images\cat1.24c104f9950fcf0665017c035e5ce1abe77c35d1c369c0d722dd044dcf176270.zh.png


error: bad escape \c at position 29

### High Level Function
function translate_string is a higher level function. To fit software structure of backend, input and output types are changed to str.

In [54]:
def translate_string(input_string, language, language_code, docs_dir,md_file_path):
    """
    Writes the input string to a temporary markdown file, translates it,
    and returns the translated string.

    Args:
    input_string (str): The string to be translated.
    language (str): The target language for translation.

    Returns:
    str: The translated string.
    """
    # Create a temporary input file
    with tempfile.NamedTemporaryFile(
        delete=False, suffix=".md", mode="w"
    ) as temp_input_file:
        temp_input_file_path = temp_input_file.name
        temp_input_file.write(input_string)

    # Create a temporary output file path
    temp_output_file_path = tempfile.mktemp(suffix=".md")

    # Call the translate function
    translate(language, temp_input_file_path, temp_output_file_path)

    # Read the translated content from the temporary output file
    with open(temp_output_file_path, "r") as temp_output_file:
        translated_content = temp_output_file.read()
        translated_content = update_image_link(md_file_path, translated_content, language_code, docs_dir)

    return translated_content

Here's a sample of how it works

In [55]:
string = "here's image of a cat: ![image](/docs/cat1.png) and image of a cat ![image](docs/cat1.png)"
ans = translate_string(
    string,
    language="chinese",
    language_code="zh",
    docs_dir="/docs",
    md_file_path="/docs/test.md"
)


print(ans)

  translate(language, temp_input_file_path, temp_output_file_path)
  for name, size in _cache_format[opname[deop]].items():


FileNotFoundError: [Errno 2] No such file or directory: 'C:\\Users\\junyi.wu\\AppData\\Local\\Temp\\tmpavb3td7j.md'