In [37]:
import requests
import json
from tqdm.auto import tqdm
import glob
import os
from pathlib import Path
import concurrent.futures as cf
import tiktoken

In [38]:
# GPT-2 BPE encoding from tiktoken; cal_len uses it to measure text length in tokens.
# NOTE(review): requests are sent with model "gpt-3.5", whose tokenizer may differ
# from "gpt2", so the counts are presumably only an approximation — confirm.
enc = tiktoken.get_encoding("gpt2")

def cal_len(text):
    """Return the number of tokens the module-level `enc` produces for `text`.

    Args:
        text: The string to measure.

    Returns:
        int: Length of the token list returned by `enc.encode`.
    """
    # By default tiktoken raises ValueError when the input contains a
    # special-token string such as "<|endoftext|>". The text here is arbitrary
    # user content, so treat such strings as ordinary text instead of crashing.
    return len(enc.encode(text, disallowed_special=()))

In [39]:
# Raw strings keep the Windows backslashes literal. The previous plain literals
# only worked because `\C`, `\s`, `\j` and `\d` are *invalid* escape sequences
# that Python currently passes through (with a warning); the values are unchanged.
original_folder_path = r'D:\CodeNguon\selenium\box_chat\json'
translated_folder_path = r'D:\CodeNguon\selenium\box_chat\data'

# Load every JSON file in the source folder, remembering its bare file name so
# the translated copy can be written under the same name later.
all_data = []
for json_file in tqdm(glob.glob(os.path.join(original_folder_path, '*.json'))):
    # `with` guarantees the handle is closed even if parsing fails.
    with open(json_file, 'r', encoding='utf-8') as fh:
        data = json.load(fh)
    all_data.append(dict(name=os.path.basename(json_file), data=data))

  0%|          | 0/1 [00:00<?, ?it/s]

In [40]:
# Leave four cores free for the rest of the system, but never go below two
# worker threads. os.cpu_count() may return None when the count cannot be
# determined, so fall back to 1 instead of failing on `None - 4`.
max_workers = max(2, (os.cpu_count() or 1) - 4)
max_workers

8

In [41]:
# URL that do_translate POSTs its translation requests to.
api_endpoint = (
    'https://streaming.tenant-forefront-default.knative.chi.coreweave.com'
    '/free-chat'
)

def get_prompt(text):
    """Build the JSON payload for one translation request.

    The Vietnamese instruction ("translate the content below into Vietnamese,
    return only the translated content") is prepended to `text`; all other
    fields are fixed values.

    Args:
        text: Source text to be translated.

    Returns:
        dict: Request body, ready to be passed as `json=` to `requests.post`.
    """
    instruction = "Dịch nội dung dưới đây sang tiếng Việt, chỉ trả về nội dung đã được dịch:\n"
    payload = dict(
        text=instruction + text,
        action="noauth",
        id="",
        parentId="",
        workspaceId="",
        messagePersona="607e41fe-95be-497e-8e97-010a59b2e2c0",
        model="gpt-3.5",
        messages=[],
        internetMode="auto",
        hidden=False,
    )
    return payload

# HTTP headers sent with every request issued by do_translate.
headers = {
    "Content-Type": "application/json",
    "user-agent": "PostmanRuntime/7.32.2",
    "Accept-Encoding": "gzip, deflate, br",
    "Connection": "keep-alive",
    "Authorization": "Bearer null",
}

def do_translate(source_text, timeout=(10, 120)):
    """Translate one chunk of text by streaming a reply from `api_endpoint`.

    The payload built by `get_prompt` is POSTed and the raw response is read
    line by line. Lines 4, 7, 10, ... of the stream are each expected to be a
    6-character prefix followed by a JSON object with a `delta` field; the
    deltas are concatenated until an empty one is seen.
    NOTE(review): this looks like a three-line server-sent-event framing with
    a `data: ` prefix — confirm against the service.
    NOTE(review): `res.raw` is read without content decoding while the headers
    advertise gzip/deflate/br; presumably the server replies uncompressed —
    confirm.

    Args:
        source_text: Text to translate. Falsy input is not sent to the API.
        timeout: Passed to `requests.post`; (connect, read) seconds. Prevents
            a stalled stream from blocking a worker thread forever.

    Returns:
        str: The concatenated translation, or '\\n' when `source_text` is falsy.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.RequestException: On connection problems or timeouts.
    """
    if not source_text:
        return '\n'

    payload = get_prompt(source_text)
    _len = cal_len(source_text)
    trans = ''
    # The context manager releases the connection even when we stop reading
    # early (the `break` below) or an exception is raised.
    with requests.post(api_endpoint, json=payload, headers=headers,
                       stream=True, timeout=timeout) as res:
        # Fail loudly on 4xx/5xx instead of trying to parse an error page.
        res.raise_for_status()
        lines = tqdm(res.raw, desc=f'Translating chunk length {_len}')
        for line_no, val in enumerate(lines, start=1):
            # Only lines 4, 7, 10, ... carry a delta; the first three lines
            # and the two lines after every payload line are skipped.
            if line_no < 4 or line_no % 3 != 1:
                continue
            # Drop the 6-character prefix before parsing the JSON payload.
            txt = json.loads(val.decode('utf-8')[6:])['delta']
            if txt == '':
                # An empty delta marks the end of the answer.
                break
            trans += txt
    return trans

In [42]:
# Token threshold (measured with cal_len) that process_translate compares
# against when grouping lines into chunks for translation.
max_length = 2048

In [43]:
def process_translate(big_content):
    """Translate a long text by splitting it into line-aligned chunks.

    Lines are grouped greedily so that each chunk stays within `max_length`
    tokens (as measured by `cal_len`, plus one per line for the newline). A
    single line longer than `max_length` is sent as its own oversized chunk.
    Chunks are translated concurrently with `do_translate` and re-joined with
    newlines, so the line structure of the input is preserved.

    Args:
        big_content: The full text to translate.

    Returns:
        str: Translated chunks joined by '\\n', in the original order.
    """
    chunks = []
    current_lines = []
    current_len = 0
    for line in big_content.split('\n'):
        # Tokenize each line once and keep a running total instead of
        # re-encoding the whole growing chunk on every iteration.
        line_len = cal_len(line) + 1
        if current_lines and current_len + line_len > max_length:
            chunks.append('\n'.join(current_lines))
            current_lines, current_len = [], 0
        # The line that did not fit starts the next chunk; it is never dropped.
        current_lines.append(line)
        current_len += line_len
    # str.split always yields at least one element, so there is a final chunk.
    chunks.append('\n'.join(current_lines))

    print(f"Translating {len(chunks)} chunks.")

    def translate_chunk(chunk):
        # Whitespace-only chunks (runs of blank lines) are passed through
        # unchanged rather than being sent to the API.
        return do_translate(chunk) if chunk.strip() else chunk

    with cf.ThreadPoolExecutor(max_workers=max_workers) as exe:
        # Executor.map yields results in input order.
        translated = list(exe.map(translate_chunk, chunks))
    return '\n'.join(translated)

In [44]:
def process_examples(data):
    """Translate one record to Vietnamese in place, unless it already is.

    Args:
        data: Mapping with at least the keys 'language' and 'content'.

    Returns:
        The same `data` object. When its language was not 'vietnamese', its
        'content' has been replaced by the translation and its 'language'
        set to 'vietnamese'.
    """
    if data['language'] != 'vietnamese':
        # Translate first and only then relabel: if the translation raises,
        # the record keeps its original language tag and a later re-run will
        # pick it up again instead of treating it as already translated.
        translated_content = process_translate(data['content'])
        data['content'] = translated_content
        data['language'] = 'vietnamese'
    return data

In [45]:
# Make sure the destination folder exists before the (long) translation run,
# so the results are not lost to a missing-directory error at write time.
os.makedirs(translated_folder_path, exist_ok=True)

for file_data in tqdm(all_data):
    name = file_data['name']
    new_data = [
        process_examples(data)
        for data in tqdm(file_data['data'], desc=f'Translating {name}...')
    ]
    # Write under the same file name as the source; `with` guarantees the
    # file is flushed and closed once the dump finishes.
    out_path = os.path.join(translated_folder_path, name)
    with open(out_path, 'w', encoding='utf-8') as out_file:
        json.dump(new_data, out_file, ensure_ascii=False, indent=2)

  0%|          | 0/1 [00:00<?, ?it/s]

Translating data5.json...:   0%|          | 0/6909 [00:00<?, ?it/s]

Translating 1 chunks.


Translating chunk length 206: 0it [00:00, ?it/s]

Translating 1 chunks.


Translating chunk length 838: 0it [00:00, ?it/s]

Translating 1 chunks.


Translating chunk length 832: 0it [00:00, ?it/s]

Translating 1 chunks.


Translating chunk length 585: 0it [00:00, ?it/s]