<a href="https://colab.research.google.com/github/LC1332/Luotuo-Chinese-LLM/blob/main/notebook/improvedTranslation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 一个升级后的批量翻译代码

这个代码最初由黄泓森进行开发，由李鲁鲁转到colab并进行了更改

[骆驼项目主页](https://github.com/LC1332/Luotuo-Chinese-LLM)

如果你使用我们的代码获取了有用的数据，也欢迎分享给我们，或者告诉我们你公开后的github/huggingface链接

如果你使用我们的代码获取数据并发表了论文或者tech report，欢迎cite我们的github repo

## 安装环境

In [None]:
!pip install openai
!pip install aiofiles
!pip install tiktoken

In [None]:
import os
import json
import time
import openai
import asyncio
import aiohttp
import aiofiles
from functools import partial
from tqdm.asyncio import tqdm as tqdm
import tiktoken

# Tokenizer used to count the tokens of each English source text.
enc = tiktoken.get_encoding("cl100k_base")
# Multiplier applied to the English token count when computing the
# max_tokens budget for the Chinese completion.
max_zh_en_ratio = 2.3

## 输入你的 OpenAI API key

In [None]:
# Enter your OpenAI API key(s) here. The list is handed to KeyPool below.
# NOTE(review): avoid committing a real key to a shared notebook.

api_key = ["sk-DfFyR"]


class KeyPool:
    """Hand out API keys in least-recently-used order.

    Attributes:
        pool: The keys, in the order they were supplied.
        last_used: Maps each key to the stamp of its most recent hand-out,
            or -1 if the key has never been handed out.
    """

    def __init__(self, strings):
        """Store the keys from ``strings`` and mark all of them as unused."""
        self.pool = list(strings)
        # Build from self.pool rather than from ``strings`` so that a one-shot
        # iterable (e.g. a generator) is not consumed twice.
        self.last_used = {s: -1 for s in self.pool}
        self._last_stamp = -1

    def getKey(self):
        """Return the least recently used key and mark it as just used.

        Raises:
            ValueError: If the pool holds no keys.
        """
        if not self.last_used:
            raise ValueError("KeyPool is empty: no API key available")
        result = min(self.last_used, key=self.last_used.get)
        # Millisecond wall-clock stamps collide when several calls land in the
        # same millisecond, which made ``min`` keep returning the first key.
        # Forcing the stamp to be strictly increasing keeps the rotation fair.
        stamp = max(int(time.time() * 1000), self._last_stamp + 1)
        self._last_stamp = stamp
        self.last_used[result] = stamp
        return result

# Shared pool from which getTranslation draws an API key for each request.
pool = KeyPool(api_key)

## 指定工作目录



In [None]:
# Work from /content/ so that relative paths such as input_file resolve there.
os.chdir("/content/")

## 获取需要翻译的样本

这里我们使用WizardLM的样本

In [None]:
!wget https://raw.githubusercontent.com/LC1332/WizardLM/main/data/WizardLM_testset.jsonl -O WizardLM_testset.jsonl

--2023-05-12 03:22:05--  https://raw.githubusercontent.com/LC1332/WizardLM/main/data/WizardLM_testset.jsonl
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 81139 (79K) [text/plain]
Saving to: ‘WizardLM_testset.jsonl’


2023-05-12 03:22:05 (32.6 MB/s) - ‘WizardLM_testset.jsonl’ saved [81139/81139]



In [None]:
# Pause, in seconds, that main() inserts after each awaited task.
delay = 0.05

# Maximum number of items being translated at the same time.
concurrency_limit = 32

# Input data; main() reads it as a single JSON document or as JSON Lines.
input_file = "WizardLM_testset.jsonl"

# Cache directory: one JSON file per translated item.
temp_path = "/content/temp"

# Directory for the merged output files.
output_path = "/content/translate"

# File-name prefix shared by the cached items and the merged output files.
output_prefix = "WizardLM_tr"

# Size threshold in bytes (1 GiB) after which the merged output rolls over
# to a new file.
max_file_size = 1024**3

# Fields of each item that should be translated.
entries = ["Instruction"]

# Create both directories (including parents) if they do not exist yet.
# os.makedirs avoids building a shell command from the path strings and
# raises on failure instead of returning an ignored exit status.
for _dir in (temp_path, output_path):
    os.makedirs(_dir, exist_ok=True)

0

In [None]:
import re

def _strip_backticks(text):
    """Strip surrounding whitespace and one enclosing pair of backticks, if present."""
    text = text.strip()
    if len(text) > 1 and text[0] == text[-1] == '`':
        return text[1:-1]
    return text


async def getTranslation(item, entries: "list | None" = None):
    """Translate the given fields of ``item`` into Simplified Chinese.

    For every name in ``entries`` the text ``item[name]`` is sent to the
    chat-completion API and the translation is stored under ``f"{name}_zh"``.

    Args:
        item: Mapping that holds the source texts; updated in place on success.
        entries: Names of the fields to translate. ``None`` or an empty list
            means nothing is translated.

    Returns:
        ``item`` itself once every field was translated, or ``None`` as soon
        as one request fails. On failure ``item`` is left untouched.
    """
    async def get(text):
        """Return the translation of ``text``, or None if the request failed."""
        openai.api_key = pool.getKey()
        try:
            # Budget the completion length from the token count of the source.
            en_token_len = float(len(enc.encode(text)))
            max_zh_len = int(max_zh_en_ratio * en_token_len + 10)

            # One worked example (dog -> 狗) shows the model the expected
            # backtick-delimited answer format.
            messages = [
                {'role': 'system', 'content': '将反引号中的英文文本翻译成简体中文，并输出到一对反引号中，如`cat`->`猫`'},
                {'role': 'user', 'content': '将反引号中的指令翻译成中文:`dog`'},
                {'role': 'assistant', 'content': '`狗`'},
                {'role': 'user', 'content': f'将反引号中的指令翻译成中文:`{text}`'},
            ]

            resp = await openai.ChatCompletion.acreate(
                model="gpt-3.5-turbo",
                messages=messages,
                temperature=0,
                max_tokens=max_zh_len
            )
            if "choices" not in resp:
                raise Exception(f"Invalid API response: {resp}")
            return _strip_backticks(resp['choices'][0]['message']['content'])
        except Exception as e:
            # Best effort: report the failure and let the caller skip the item.
            print(f"[Error] {e}")
            return None

    # Collect all translations first so that a failure part-way through does
    # not leave ``item`` half-updated.
    translated = {}
    for entry in entries or ():
        trans = await get(item[entry])
        if trans is None:
            return None
        translated[f"{entry}_zh"] = trans
    item.update(translated)
    return item


async def process(id, item, semaphore):
    """Translate one item and cache the result as a JSON file.

    Args:
        id: Index of the item; it becomes part of the cache file name.
        item: The record to translate (passed on to getTranslation).
        semaphore: Limits how many items are processed at the same time.

    Returns:
        None. Failures are printed and the item is left without a cache file.
    """
    async with semaphore:
        file_name = f"{temp_path}/{output_prefix}_{id}.json"
        tmp_name = f"{file_name}.tmp"
        try:
            it = await getTranslation(item, entries)
            if it is None:
                # The translation failed; report which cache file is missing.
                print(f"Error saving item: {file_name}")
                return
            # Write to a sibling temp file and rename it afterwards, so that an
            # interrupted write never leaves a truncated .json file that a
            # later run would treat as already done.
            async with aiofiles.open(tmp_name, "w", encoding="utf-8") as f:
                await f.write(json.dumps(it, ensure_ascii=False, indent=4))
            os.replace(tmp_name, file_name)
        except Exception as e:
            print(f"Error saving item: {e}")


async def main():
    """Translate every item of ``input_file`` that has no cache file yet.

    The input is read as a single JSON document; if that fails to parse it is
    read as JSON Lines instead. One task is created per item whose cache file
    does not exist, and the tasks are awaited in order behind a progress bar.
    """
    try:
        with open(input_file, "r", encoding="utf-8") as file:
            data = json.load(file)
    except json.JSONDecodeError:
        # Not a single JSON document: fall back to one JSON object per line,
        # ignoring blank lines.
        with open(input_file, "r", encoding="utf-8") as file:
            data = [json.loads(line) for line in file if line.strip()]

    semaphore = asyncio.Semaphore(concurrency_limit)

    tasks = []
    skip_count = 0
    for idx, item in enumerate(data):
        # Items that already have a cache file were translated in an earlier run.
        if os.path.exists(f"{temp_path}/{output_prefix}_{idx}.json"):
            skip_count += 1
            continue
        tasks.append(asyncio.create_task(process(idx, item, semaphore)))

    print('skip ', skip_count )
    print('rest ', len(tasks))

    async for task in tqdm(tasks, total=len(tasks), desc="Processing items"):
        await task
        # asyncio.sleep yields to the event loop; time.sleep would block it
        # and stall every request that is still in flight.
        await asyncio.sleep(delay)

由于网络问题或 OpenAI 的限制，部分数据可能获取失败；此时脚本会跳过这部分数据。

重新运行下面的单元格即可补充获取失败的数据

In [None]:
# Safe to re-run: main() skips every item whose cache file already exists.
await main()

Processing items:  42%|████▏     | 91/218 [00:59<01:58,  1.07it/s]

[Error] That model is currently overloaded with other requests. You can retry your request, or contact us through our help center at help.openai.com if the error persists. (Please include the request ID cc8490f290fe5d719cfce5ae3f78a5d2 in your message.)
Error saving item: /content/temp/WizardLM_tr_172.json


Processing items: 100%|██████████| 218/218 [01:21<00:00,  2.68it/s]


## 合并所有翻译数据

In [None]:
# Collect every cached translation from temp_path into ``data``.
def _cache_sort_key(filename):
    """Order cache files by their numeric item id; other names sort last."""
    stem = filename[len(output_prefix) + 1:-len(".json")]
    if stem.isdecimal():
        return (0, int(stem), filename)
    return (1, 0, filename)


# os.listdir returns names in arbitrary order; sorting by item id keeps the
# merged data in the same order as the input.
cached_files = sorted(
    (
        name for name in os.listdir(temp_path)
        if name.startswith(output_prefix) and name.endswith(".json")
    ),
    key=_cache_sort_key,
)

data = []
for filename in tqdm(cached_files):
    with open(os.path.join(temp_path, filename), 'r', encoding='utf-8') as file:
        try:
            entry = json.load(file)
        except json.JSONDecodeError as e:
            # Report instead of silently dropping the item.
            print(f"[Warning] skipping unreadable cache file {filename}: {e}")
        else:
            data.append(entry)

100%|██████████| 217/217 [00:00<00:00, 15491.89it/s]


In [None]:
# Write ``data`` as JSON Lines, rolling over to a new numbered file once the
# current one has grown past max_file_size bytes.
file_counter = 1
current_file_size = 0
output_file = f"{output_path}/{output_prefix}_{file_counter}.jsonl"

out = open(output_file, 'w', encoding='utf-8')
try:
    for item in tqdm(data):
        if out is None:
            # The previous file filled up. Open the next one only now that
            # there is another item to write, so no empty file is left behind.
            file_counter += 1
            output_file = f"{output_path}/{output_prefix}_{file_counter}.jsonl"
            out = open(output_file, 'w', encoding='utf-8')
            current_file_size = 0
        item_json = json.dumps(item, ensure_ascii=False)
        item_size = len(item_json.encode('utf-8'))
        out.write(item_json + "\n")
        current_file_size += item_size
        if current_file_size > max_file_size:
            # Close explicitly: rebinding the name inside a ``with`` block
            # would leave the rolled-over files unclosed and possibly unflushed.
            out.close()
            out = None
finally:
    if out is not None:
        out.close()

100%|██████████| 217/217 [00:00<00:00, 50116.40it/s]


In [None]:
# Show the path of the most recently opened output file.
print(output_file)

/content/translate/WizardLM_tr_1.jsonl
