In [None]:
import re
import os
import glob
import json
import tqdm

from openai import OpenAI
from src.path_utils import get_project_root

In [None]:
# Load the subtask-1 test file of every non-English language into `test_data`,
# keyed by the name of the directory that contains the file.
test_data_path = os.path.join(get_project_root(), 'data', 'test_data')

test_data = {}

# sorted(): glob returns matches in arbitrary order; sorting keeps runs reproducible.
for language_test_path in sorted(glob.glob(f'{test_data_path}/**/*_subtask1*')):

    # The language is the parent directory name. Using os.path instead of
    # splitting on '/' keeps this correct on platforms whose separator is '\\'.
    language = os.path.basename(os.path.dirname(language_test_path))
    if language == 'english':
        # English needs no translation.
        continue
    print(f'Language : {language}')

    # Explicit UTF-8: the files hold non-English text and the platform default
    # encoding is not guaranteed to be able to decode it.
    with open(language_test_path, 'r', encoding='utf-8') as f:
        lang_data = json.load(f)

    test_data[language] = lang_data

# Show a per-language sample count rather than dumping every record into the
# notebook output.
print({language: len(lang_data) for language, lang_data in test_data.items()})

In [None]:
# Take the API key from the environment so that no credential is stored in the
# notebook (or in its version-control history).
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
if not OPENAI_API_KEY:
    raise RuntimeError(
        "OPENAI_API_KEY is not set; export it in the environment before running this notebook."
    )
client = OpenAI(api_key=OPENAI_API_KEY)

In [None]:
# Directory that receives one '<language>_translated.json' file per language.
out_path = os.path.join(get_project_root(), 'data', 'test_translations')
# The translation loop opens files inside this directory for writing; create it
# up front so open() does not fail when the directory does not exist yet.
os.makedirs(out_path, exist_ok=True)


In [None]:
# System prompt sent with every request.
# NOTE(review): "explainations" is misspelled; the text is left untouched here
# because any edit to the prompt changes what the model is asked.
sys_prompt = "You are a bilingual humorist, adept at translating meme text between languages while preserving the original humor, cultural nuances, and any slang or idiomatic expressions. Ensure the translation is accurate, contextually appropriate, and retains the meme's playful tone. Avoid adding explainations or additional commentary and provide only the translation."


def _strip_wrapping_quotes(text):
    """Remove one pair of double quotes when they sit at both ends of ``text``.

    The model sometimes wraps its answer in quotation marks. The quotes are
    removed only when the text both starts and ends with one, so a translation
    that merely begins or ends with quoted speech keeps its punctuation.
    """
    if len(text) >= 2 and text.startswith('"') and text.endswith('"'):
        return text[1:-1]
    return text


# Translate every sample of every language to English and store the results in
# '<out_path>/<lang>_translated.json' as a list of {'id', 'text'} records.
for lang, lang_data in test_data.items():

    print(f'Language: {lang}')

    lang_translations = []
    lang_out_file = os.path.join(out_path, f'{lang}_translated.json')

    for sample in tqdm.tqdm(lang_data, 'Translating memes'):

        # Collapse runs of whitespace (including newlines) into single spaces.
        sample_text = re.sub(r'\s+', ' ', sample['text']).strip()

        sample_prompt = f"Translate the following meme text from {lang.capitalize()} to English: {sample_text}"

        messages = [
            {"role": "system", "content": sys_prompt},
            {"role": "user", "content": sample_prompt}
        ]

        completion = client.chat.completions.create(
            model="gpt-4",
            messages=messages
        )

        # The API may return no text content; store an empty translation and
        # report it instead of crashing on the string methods below.
        message_content = completion.choices[0].message.content
        if message_content is None:
            print(f"No translation returned for sample {sample['id']} ({lang})")
            message_content = ''

        # Remove quotation marks introduced by the model around its answer.
        message_content = _strip_wrapping_quotes(message_content)

        lang_translations.append({
            'id': sample['id'],
            'text': message_content
        })

        # Checkpoint: the file is rewritten after every sample, so it always
        # holds the translations finished so far.
        with open(lang_out_file, 'w', encoding='utf-8') as f:
            json.dump(lang_translations, f, indent=4)
