In [None]:
import json
from joblib import Parallel, delayed
from tqdm_joblib import tqdm_joblib
import time
from tqdm.auto import tqdm

In [2]:
from retrying import retry
import openai

@retry(stop_max_attempt_number=5, wait_exponential_multiplier=1000, wait_exponential_max=10000)
def generate(prompt, history=None):
    """Send ``prompt`` as a single-turn chat request and return the reply text.

    The function is wrapped by ``retry`` with ``stop_max_attempt_number=5`` and
    exponential wait settings, so a failing call is re-attempted by the
    decorator before the exception reaches the caller.

    Args:
        prompt: Text sent as the only ``user`` message of the conversation.
        history: Accepted for call compatibility but not used by the request.
            Defaults to ``None`` instead of a shared mutable list.

    Returns:
        ``completion.choices[0].message.content`` of the chat completion.
    """
    # NOTE(review): api_key / base_url / model are placeholder literals; supply
    # real values from configuration or the environment, never from the notebook.
    client = openai.OpenAI(api_key="sk-xxx", base_url="xxx")
    completion = client.chat.completions.create(
        model="xxx",
        messages=[{"role": "user", "content": prompt}],
        # temperature 0 keeps sampling as deterministic as the backend allows
        temperature=0.,
    )
    return completion.choices[0].message.content

In [3]:
# Load the alignment dataset; the "utf-8-sig" codec drops a leading BOM if one is present.
align_path = "../../data/align/Align_Minos.json"
with open(align_path, mode="r", encoding='utf-8-sig') as src:
    L = json.load(src)

In [4]:
# Project every loaded record onto the five fields used downstream,
# keeping the key order id, question, answer, response, rank.
input_data = [
    {key: record[key] for key in ('id', 'question', 'answer', 'response', 'rank')}
    for record in L
]

In [5]:
# Prompt template for generating five graded example answers per question.
# `{question}` and `{ref}` are literal placeholders filled in with str.replace
# (not str.format), which is why the JSON braces below are left unescaped.
# The JSON skeleton must name each of the five grade keys exactly once, since
# replies are validated against that exact key set.
prompt = '''请根据输入的开放式问题和参考答案，综合考虑答案的事实正确性、逻辑性、简洁性和清晰度，并结合你的思考，生成五个层次的答案，分别是优秀、良好、中等、较差、极差。

其中：*事实正确性*：分析回答提供的信息是否准确，并基于可信的事实和数据。*逻辑性*：分析回答是否逻辑清晰，推理合理，连贯一致。*简洁性*：分析回答是否简明扼要，避免冗长和不必要的细节。*清晰度*：分析回答是否表达清晰、易懂，语言是否简洁明了。

请遵循以下的 JSON 格式输出结果：
{
    "优秀":xxxx,
    "良好":xxxx,
    "中等":xxxx,
    "较差":xxxx,
    "极差":xxxx,
}

问题：{question}
参考答案：{ref}

输出生成的回答：
'''

In [6]:
import re

def format_check(input):
    """Return True only when ``input`` is a dict keyed by exactly the five grade labels."""
    expected_levels = {'优秀', '良好', '中等', '较差', '极差'}
    # Anything that is not a dict is rejected before its keys are inspected.
    if not isinstance(input, dict):
        return False
    return set(input) == expected_levels

def find_dicts(markdown):
    """Extract every flat dict literal embedded in a piece of text.

    Each brace-delimited span that contains no nested braces is parsed first
    as JSON and, if that fails, as a Python literal. Spans that do not parse,
    or that parse to something other than a dict, are skipped.

    Args:
        markdown: Text to scan, or ``None``.

    Returns:
        The parsed dicts in order of appearance; an empty list when
        ``markdown`` is ``None`` or no span yields a dict.
    """
    import ast  # local import: the surrounding cell only imports `re`

    if markdown is None:
        return []

    dicts = []
    # The character class excludes braces, so only innermost {...} spans match.
    for match in re.findall(r"\{[^{}]*\}", markdown):
        try:
            # JSON first, so that true / false / null are understood.
            d = json.loads(match)
        except ValueError:
            try:
                # Literal-only parsing is used instead of eval(): callers pass
                # model output here, and that text must never be executed.
                d = ast.literal_eval(match)
            except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
                # Not a literal at all; ignore this span.
                continue
        if isinstance(d, dict):
            dicts.append(d)
    return dicts

In [7]:
import traceback
import os
import json

import traceback
import threading

# Serializes appends to the shared output file: `deal` is dispatched on many
# worker threads and each record must land on its own intact line.
_WRITE_LOCK = threading.Lock()


def deal(item, file_path):
    """Generate graded example answers for one record and append it to a JSONL file.

    The prompt template is filled with ``item['question']`` and
    ``item['answer']`` and sent through ``generate``. The reply is parsed as
    JSON, falling back to the first dict found by ``find_dicts``. When the
    parsed value passes ``format_check`` it is stored under
    ``item['example']`` and the whole item is appended to ``file_path`` as one
    JSON line.

    Up to 10 attempts are made. The loop stops on the first accepted reply, or
    when an exception's text contains both 'InvalidRequestError' and
    'maximum context'. Exceptions mentioning only 'InvalidRequestError' are
    retried silently; any other exception has its traceback printed and is
    retried.

    Args:
        item: Mapping with at least 'question' and 'answer'; gains an
            'example' key in place on success.
        file_path: Path of the JSONL output file, opened in append mode.

    Returns:
        None.
    """
    max_try_retry = 10
    content = prompt.replace('{question}', item['question']).replace('{ref}', item['answer'])
    for _ in range(max_try_retry):
        try:
            response = generate(content)
            try:
                tmps = json.loads(response)
            except (TypeError, ValueError):
                # Reply is None or not pure JSON: use the first embedded dict,
                # or None so that format_check rejects it and we try again.
                candidates = find_dicts(response)
                tmps = candidates[0] if candidates else None

            if format_check(tmps):
                item['example'] = tmps
                line = json.dumps(item, ensure_ascii=False) + "\n"
                with _WRITE_LOCK:
                    # Closing the file flushes the line before the lock is released.
                    with open(file_path, "a+", encoding="utf8") as f:
                        f.write(line)
                break

        except Exception as e:
            str_e = str(e)
            # NOTE(review): this matches on the exception *message*; confirm the
            # client library in use actually puts this text there.
            if 'InvalidRequestError' in str_e:
                if 'maximum context' in str_e:
                    # Prompt too long for the model; retrying cannot help.
                    break
                continue
            traceback.print_exc()


In [None]:
file_path = "../../data/align/non_fact/examples.jsonl"

# Make sure the output file (and its directory) exists so the resume scan
# below can always open it.
out_dir = os.path.dirname(file_path)
if out_dir:
    os.makedirs(out_dir, exist_ok=True)
if not os.path.exists(file_path):
    with open(file_path, 'w') as f:
        pass

# Resume support: collect the ids already written by an earlier run.
# A dedicated name is used so the dataset held in `L` is not overwritten, and
# blank lines are skipped so they cannot break json.loads.
with open(file_path, 'r', encoding='utf-8') as f:
    finished_lines = [line for line in f if line.strip()]
# A set gives constant-time membership tests
# (assumes ids are hashable, e.g. str or int — TODO confirm).
finish_id = {json.loads(line)['id'] for line in finished_lines}

input_list = [i for i in input_data if i['id'] not in finish_id]

from tqdm.contrib import tzip
from joblib import Parallel, delayed
from tqdm_joblib import tqdm_joblib

num_worker = 32

# Fan the remaining records out over worker threads; each call to `deal`
# appends its own result to `file_path`.
with tqdm_joblib(desc="My calculation", total=len(input_list)) as progress_bar:
    Parallel(n_jobs=num_worker, prefer="threads")(
        [delayed(deal)(x, file_path=file_path) for x in input_list]
    )