In [9]:
import json
from joblib import Parallel, delayed
from tqdm_joblib import tqdm_joblib
import time
from tqdm.auto import tqdm

In [10]:
from retrying import retry
import openai

@retry(stop_max_attempt_number=5, wait_exponential_multiplier=1000, wait_exponential_max=10000)
def generate(prompt, history=None):
    """Send ``prompt`` as a single user message and return the reply text.

    The ``retry`` decorator re-invokes this function when it raises, using the
    attempt limit and exponential wait settings given in the decorator call.

    Args:
        prompt: Text placed in the one ``user`` message of the request.
        history: Accepted for call compatibility; it is not sent to the model.
            The default is ``None`` rather than a shared mutable list.

    Returns:
        ``completion.choices[0].message.content`` of the chat completion.
    """
    import os  # local import: this cell runs before the notebook's `import os`

    # Read credentials from the environment so real keys never have to be
    # pasted into the notebook; the fallbacks are the original placeholders.
    client = openai.OpenAI(
        api_key=os.environ.get("OPENAI_API_KEY", "sk-xxx"),
        base_url=os.environ.get("OPENAI_BASE_URL", "xxx"),
    )
    completion = client.chat.completions.create(
        model="xxx",
        messages=[{"role": "user", "content": prompt}],
        temperature=0.,  # zero temperature, as in the original request
    )
    return completion.choices[0].message.content

In [11]:
# Load the alignment dataset; the 'utf-8-sig' codec drops a leading BOM if any.
with open("../../data/align/Align_Minos.json", mode="r", encoding='utf-8-sig') as f:
    L = json.loads(f.read())

In [12]:
# Project every record onto the six fields used below, then keep only the
# records whose label equals 0.
input_data = [
    record
    for record in (
        {field: row[field] for field in ('id', 'question', 'answer', 'response', 'rank', 'label')}
        for row in L
    )
    if record['label'] == 0
]

In [13]:
# Prompt template for the list-wise ranking request. The {question}, {ref},
# {instance} and {model_input} markers are filled in by plain text substitution
# (not str.format), so the literal braces of the JSON example are safe.
# The JSON example uses straight double quotes so that the format the model is
# asked to imitate is parseable JSON.
prompt = '''请根据输入的开放式问题和参考答案,以及提供的不同层次的答案样例，根据回复的质量对输入的模型回复进行排序，顺序越靠前的质量越高。

答案样例的层次有五个，分别是优秀、良好、中等、较差、极差。

请遵循以下的示例 JSON 格式输出结果：
{
    "rank": ["模型x","xxx",…]
}

问题：{question}

参考答案：{ref}

不同层次的答案示例：
{instance}

模型回复：
{model_input}

输出：
'''

In [14]:
import re

def format_check(inputs, num):
    """Return True when ``inputs`` is a well-formed ranking reply.

    A reply is accepted only if it is a dict whose single key is ``'rank'`` and
    whose value is a list or tuple with exactly ``num`` entries. Anything else
    (wrong type, extra or missing keys, a non-sequence ``rank``, wrong length)
    yields False instead of raising.

    This is the only definition of ``format_check``; a previous same-named
    variant that checked for five grade keys was shadowed and never executed.

    Args:
        inputs: Parsed model output to validate.
        num: Expected number of ranked entries.

    Returns:
        bool: Whether ``inputs`` matches the expected shape.
    """
    if not isinstance(inputs, dict) or set(inputs) != {'rank'}:
        return False
    ranking = inputs['rank']
    # A string of the right length, or a bare number, is not a ranking.
    return isinstance(ranking, (list, tuple)) and len(ranking) == num

def find_dicts(markdown):
    """Extract every flat ``{...}`` span of ``markdown`` that parses as a dict.

    Candidate spans are found with a regex that matches a brace pair with no
    braces inside, so only innermost/flat objects are considered. Each span is
    parsed as JSON first and, failing that, as a Python literal (which accepts
    single-quoted keys). Text is never executed: the input comes from a model
    reply, so ``eval`` is deliberately not used.

    Args:
        markdown: Text to scan. ``None`` or any non-string yields ``[]``.

    Returns:
        list[dict]: The parsed dicts, in order of appearance.
    """
    import ast  # local import: `ast` is not imported elsewhere in the notebook

    if not isinstance(markdown, str):
        return []

    dicts = []
    # Brace pair without nested braces.
    pattern = r"\{[^{}]*\}"
    for match in re.findall(pattern, markdown):
        try:
            parsed = json.loads(match)
        except ValueError:
            # Not strict JSON; fall back to a safe Python-literal parse.
            try:
                parsed = ast.literal_eval(match)
            except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
                # Unparseable span: skip it.
                continue
        if isinstance(parsed, dict):
            dicts.append(parsed)
    return dicts

In [15]:
# Read the graded example file (one JSON object per line). Note that this
# rebinds the module-level name `L` to the list of raw lines.
with open("../../data/align/non_fact/examples.jsonl", mode='r', encoding='utf-8') as f:
    L = list(f)

# Map each record id to its 'example' field, pretty-printed as JSON text.
id2instance = {}
for record in map(json.loads, L):
    id2instance[record['id']] = json.dumps(record['example'], ensure_ascii=False, indent=4)

In [16]:
import traceback
import os
import json

import traceback
def deal(item,file_path):
    """Ask the model to rank one item's responses and append the result to a file.

    The model names in ``item['response']`` are replaced by the labels
    "模型1", "模型2", ... before being placed in the prompt, and the returned
    labels are mapped back to model names. On success ``item`` gains a
    ``'rank_our'`` key and is appended to ``file_path`` as one JSON line.
    After ``max_try_retry`` unsuccessful attempts nothing is written.

    Args:
        item: Record with 'id', 'question', 'answer' and 'response' keys
            ('response' maps model name -> reply text). Mutated in place.
        file_path: JSONL file the finished record is appended to.

    Returns:
        None.
    """
    max_try_retry = 5

    # Anonymous label <-> model name, numbered in response order.
    id2model = {'模型' + str(num + 1): model for num, model in enumerate(item['response'])}
    model2id = {model: label for label, model in id2model.items()}

    converted_output = "\n\n".join(
        f"{model2id[key]}: {value}" for key, value in item['response'].items()
    )

    # Fill all placeholders in one pass so that text inserted for one field is
    # never re-scanned for another field's placeholder.
    fields = {
        'question': item['question'],
        'ref': item['answer'],
        'model_input': converted_output,
        'instance': id2instance[item['id']],
    }
    content = re.sub(
        r'\{(question|ref|model_input|instance)\}',
        lambda m: fields[m.group(1)],
        prompt,
    )

    for _ in range(max_try_retry):
        try:
            response = generate(content)
            try:
                tmps = json.loads(response)
            except (ValueError, TypeError):
                # Reply is not bare JSON: look for an embedded object instead.
                candidates = find_dicts(response)
                if not candidates:
                    continue
                tmps = candidates[0]

            if not format_check(tmps, len(id2model)):
                continue

            ranking = tmps['rank']
            # Same length (checked above) plus same label set means every
            # label appears exactly once; otherwise ask again.
            if set(ranking) != set(id2model):
                continue

            item['rank_our'] = [id2model[label] for label in ranking]
            with open(file_path, "a", encoding="utf8") as f:
                f.write(json.dumps(item, ensure_ascii=False) + "\n")
                f.flush()
            return

        except Exception as e:
            # Include the class name: the exception type is not part of str(e).
            err_text = f"{type(e).__name__}: {e}"
            if 'maximum context' in err_text:
                # Prompt is too long for the model; retrying cannot help.
                break
            # NOTE(review): newer openai SDKs are understood to raise
            # BadRequestError rather than InvalidRequestError - confirm.
            if 'InvalidRequestError' in err_text or 'BadRequestError' in err_text:
                continue
            traceback.print_exc()

In [None]:
file_path ="../../data/align/non_fact/listwise_instance.jsonl"

# Make sure the results file exists so the resume logic below can read it.
if not os.path.exists(file_path):
    with open(file_path, 'w', encoding='utf-8') as f:
        pass

with open(file_path,'r',encoding='utf-8') as f:
    L = f.readlines()

# Ids already written by an earlier run; blank lines are skipped rather than
# passed to json.loads.
finish_id = [json.loads(i)['id'] for i in L if i.strip()]
# Set copy for constant-time membership tests while filtering.
finished_lookup = set(finish_id)

input_list = [i for i in input_data if i['id'] not in finished_lookup]

from tqdm.contrib import tzip
from joblib import Parallel, delayed
from tqdm_joblib import tqdm_joblib

num_worker = 16

# Each call to deal() appends its own result line to file_path, so the return
# value of Parallel is not needed.
with tqdm_joblib(desc="My calculation", total=len(input_list)) as progress_bar:
    Parallel(n_jobs=num_worker,prefer="threads")([delayed(deal)(x,file_path=file_path) for x in input_list])

In [22]:
# Read the ranking results back in (one JSON record per line).
with open(file_path, mode='r', encoding='utf-8-sig') as f:
    L = list(f)

LL = [json.loads(raw_line) for raw_line in L]

# Records ordered by their "id" field.
datas = sorted(LL, key=lambda record: record["id"])

In [23]:
model_name = "../../../models/mDeBERTa-v3-base-mnli-xnli"
# The results file is named after the last path component of the model path.
fact_results_path = "../../data/align/fact/align_points_" + model_name.rsplit('/', 1)[-1] + ".json"
with open(fact_results_path, mode='r', encoding='utf-8') as f:
    datas1 = json.load(f)
# Append every loaded record to `datas` (in place).
datas.extend(datas1)

In [None]:
import json
import numpy as np
from scipy.stats import spearmanr, pearsonr, kendalltau
import math

def score(l1, l2, p=0.5):
    """Calculate the extrapolated Ranked Biased Overlap (RBO) of two rankings.

    Args:
        l1: Ranked list 1 (``None`` is treated as an empty list). Elements are
            stored in sets, so they must be hashable.
        l2: Ranked list 2 (same conventions as ``l1``).
        p: Weight base applied as ``p ** depth``. ``p == 0`` is not supported
            because the result is divided by ``p``.

    Returns:
        ``0`` if either list is empty, otherwise the ``rbo_ext`` float.
    """
    if l1 is None: l1 = []
    if l2 is None: l2 = []

    # Order the two lists by length only. Sorting the (length, list) tuples
    # directly would fall back to comparing list contents on a length tie and
    # raise TypeError for elements that do not support "<".
    sl, ll = sorted([(len(l1), l1), (len(l2), l2)], key=lambda pair: pair[0])
    s, S = sl  # shorter list and its length
    l, L = ll  # longer list and its length
    if s == 0: return 0

    ss = set()  # elements of the shorter list seen up to depth i
    ls = set()  # elements of the longer list seen up to depth i
    x_d = {0: 0}  # x_d[d]: size of the overlap of both prefixes at depth d
    sum1 = 0.0
    for i in range(l):
        x = L[i]
        y = S[i] if i < s else None
        d = i + 1

        if x == y:
            # Same element at the same depth: the overlap grows by one.
            x_d[d] = x_d[d - 1] + 1.0
        else:
            ls.add(x)
            if y is not None: ss.add(y)
            x_d[d] = x_d[d - 1] + (1.0 if x in ss else 0.0) + (1.0 if y in ls else 0.0)
        sum1 += x_d[d] / d * pow(p, d)

    # Extra term for the depths that only the longer list reaches.
    sum2 = 0.0
    for i in range(l - s):
        d = s + i + 1
        sum2 += x_d[d] * (d - s) / (d * s) * pow(p, d)

    sum3 = ((x_d[l] - x_d[s]) / l + x_d[s] / s) * pow(p, l)

    rbo_ext = (1 - p) / p * (sum1 + sum2) + sum3
    return rbo_ext

def calculate_pearson_coefficient(seq1, seq2):
    """Pearson correlation of two sequences; 0 if either has fewer than two items."""
    if not (len(seq1) >= 2 and len(seq2) >= 2):
        return 0
    result = pearsonr(seq1, seq2)
    # First element is the coefficient; the second (p-value) is not used.
    return result[0]

def calculate_spearman_coefficient(seq1, seq2):
    """Spearman correlation of two sequences; 0 if either has fewer than two items."""
    if not (len(seq1) >= 2 and len(seq2) >= 2):
        return 0
    result = spearmanr(seq1, seq2)
    # First element is the coefficient; the second (p-value) is not used.
    return result[0]

def calculate_kendalltau_coefficient(seq1, seq2):
    """Kendall tau correlation of two sequences; 0 if either has fewer than two items."""
    if not (len(seq1) >= 2 and len(seq2) >= 2):
        return 0
    result = kendalltau(seq1, seq2)
    # First element is the coefficient; the second (p-value) is not used.
    return result[0]

def compute_rank_correlations(rank, rank_ours):
    """Compute Kendall, Spearman and Pearson correlations between two rankings.

    Both rankings are converted to 1-based positions and compared over the
    models listed in ``rank``, in ``rank`` order. A model that does not appear
    in ``rank_ours`` is given the position ``len(rank_ours) + 1`` (tied last);
    a placeholder of 0 would sort it ahead of the first-ranked model.

    Args:
        rank: Sequence of model names, best first.
        rank_ours: Second sequence of model names, best first.

    Returns:
        tuple: ``(kendall, spearman, pearson)``.
    """
    # Model name -> 1-based position in each ranking.
    rank_dict = {model: idx + 1 for idx, model in enumerate(rank)}
    rank_ours_dict = {model: idx + 1 for idx, model in enumerate(rank_ours)}

    # Position assigned to models absent from `rank_ours`.
    missing_position = len(rank_ours) + 1

    # Paired position sequences, aligned on the models of `rank`.
    rank_sequence = [rank_dict[model] for model in rank]
    rank_ours_sequence = [rank_ours_dict.get(model, missing_position) for model in rank]

    kendall = calculate_kendalltau_coefficient(rank_sequence, rank_ours_sequence)
    spearman = calculate_spearman_coefficient(rank_sequence, rank_ours_sequence)
    pearson = calculate_pearson_coefficient(rank_sequence, rank_ours_sequence)

    return kendall, spearman, pearson


# One accumulator per metric, filled with one value per record.
kendall_scores, spearman_scores, pearson_scores = [], [], []
rbo_0_5_scores, rbo_0_9_scores = [], []

print(len(datas))
# Score every record's predicted ranking against its reference ranking.
for data in datas:
    rank = data['rank']
    if "points_score" not in data:
        rank_our = data['rank_our']
    else:
        # Derive the ranking from the per-model scores, highest score first.
        by_score = sorted(data['points_score'].items(), key=lambda pair: pair[1], reverse=True)
        rank_our = [name for name, _ in by_score]

    rbo_0_5 = score(rank_our, rank, p=0.5)
    rbo_0_9 = score(rank_our, rank, p=0.9)
    kendall, spearman, pearson = compute_rank_correlations(rank_our, rank)

    # Record the five metric values only after all of them were computed.
    for bucket, value in (
        (kendall_scores, kendall),
        (spearman_scores, spearman),
        (pearson_scores, pearson),
        (rbo_0_5_scores, rbo_0_5),
        (rbo_0_9_scores, rbo_0_9),
    ):
        bucket.append(value)

# Mean of each metric over all records.
avg_kendall, avg_spearman, avg_pearson, avg_rbo_0_5, avg_rbo_0_9 = (
    np.mean(values)
    for values in (kendall_scores, spearman_scores, pearson_scores, rbo_0_5_scores, rbo_0_9_scores)
)

# Report the averages.
print("Kendall Tau: %.4f" % avg_kendall)
print("Spearman: %.4f" % avg_spearman)
print("Pearson: %.4f" % avg_pearson)
print(f"RBO (p=0.5): {avg_rbo_0_5:.4f}")
print(f"RBO (p=0.9): {avg_rbo_0_9:.4f}")