In [47]:
import os
# os.environ["CUDA_VISIBLE_DEVICES"] = "0"
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers import BertTokenizer, BertModel
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
import json
import random
import sys
import torch.nn.functional as F
import torch
import re
import jsonlines
import difflib
import warnings
import logging
import os
import numpy as np
from scipy.stats import spearmanr
from scipy.stats import pearsonr
from scipy.stats import kendalltau
from rouge import Rouge
rouge = Rouge()
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')

def _encode_for_bert(text, tokenizer, max_length=512):
    """Turn ``text`` into a ``(1, seq_len)`` tensor of token ids framed by [CLS] ... [SEP].

    Args:
        text: Raw input string.
        tokenizer: Object exposing ``tokenize(str)`` and ``convert_tokens_to_ids(list)``.
        max_length: Upper bound on the sequence length, special tokens included.

    Returns:
        torch.Tensor of token ids with a leading batch dimension of 1.
    """
    # Tokenize the raw text directly. Round-tripping it through
    # encode(add_special_tokens=True) -> decode -> tokenize leaves the special
    # tokens inside the token list, and they would then be added a second time below.
    tokens = tokenizer.tokenize(text)

    # Reserve two slots for [CLS] and [SEP]; never let the slice bound go negative.
    tokens = tokens[:max(max_length - 2, 0)]
    tokens = ['[CLS]'] + tokens + ['[SEP]']

    input_ids = tokenizer.convert_tokens_to_ids(tokens)

    # unsqueeze(0) adds the batch dimension.
    return torch.tensor(input_ids).unsqueeze(0)


def truncate_and_tokenize(text, max_length=512):
    """Tokenize ``text`` with the module-level ``tokenizer`` and return its id tensor.

    Args:
        text: Raw input string.
        max_length: Upper bound on the sequence length, special tokens included.

    Returns:
        torch.Tensor of shape ``(1, seq_len)`` with ``seq_len <= max(max_length, 2)``.

    Raises:
        NameError: if no global ``tokenizer`` has been defined before the call.
    """
    return _encode_for_bert(text, tokenizer, max_length)


def calculate_bert_score(sentence1, sentence2, model, tokenizer, max_length=512):
    """Cosine similarity between the first-position ([CLS]) embeddings of two sentences.

    Note that this is a plain sentence-embedding similarity; it does not call the
    ``bert_score`` package.

    Args:
        sentence1: First sentence.
        sentence2: Second sentence.
        model: Callable taking an id tensor and returning an object with a
            ``last_hidden_state`` attribute indexed as ``[batch, position, hidden]``.
        tokenizer: Tokenizer used for both sentences (previously this argument was
            ignored in favour of a module-level global).
        max_length: Maximum sequence length, special tokens included.

    Returns:
        The similarity as a Python float.
    """
    input_ids1 = _encode_for_bert(sentence1, tokenizer, max_length)
    input_ids2 = _encode_for_bert(sentence2, tokenizer, max_length)

    # Inference only: no gradients needed.
    with torch.no_grad():
        outputs1 = model(input_ids1)
        outputs2 = model(input_ids2)

    # Position 0 holds the [CLS] token, used here as the sentence representation.
    sentence_embedding1 = outputs1.last_hidden_state[:, 0, :]
    sentence_embedding2 = outputs2.last_hidden_state[:, 0, :]

    similarity = F.cosine_similarity(sentence_embedding1, sentence_embedding2)

    return similarity.item()

# Relative path of the BERT checkpoint; count_score passes it to bert_score as `model_type`.
model_name = "../../models/bert-base-uncased"

from bert_score import score as bscore

In [48]:
import json
import numpy as np
from scipy.stats import spearmanr, pearsonr, kendalltau
import math

def score(l1, l2, p=0.5):
    """
    Calculates the extrapolated Ranked Biased Overlap (RBO) score of two rankings.

    l1 -- Ranked List 1 (None is treated as an empty list)
    l2 -- Ranked List 2 (None is treated as an empty list)
    p  -- weight decay per depth; must be non-zero because the result divides by p

    Returns 0 when either list is empty, otherwise the extrapolated RBO as a float.
    Items must be hashable. The lists may have different lengths.
    """
    if l1 is None: l1 = []
    if l2 is None: l2 = []

    # Order the two lists by length only. Without the key, equal lengths make
    # sorted() fall through to comparing the lists themselves, which raises
    # TypeError when their items are not mutually orderable.
    (s, S), (l, L) = sorted([(len(l1), l1), (len(l2), l2)], key=lambda pair: pair[0])
    if s == 0: return 0

    ss = set([])  # contains elements from the smaller list till depth i
    ls = set([])  # contains elements from the longer list till depth i
    x_d = {0: 0}  # x_d[d] = size of the overlap of the two prefixes of depth d
    sum1 = 0.0
    for i in range(l):
        x = L[i]
        y = S[i] if i < s else None  # the shorter list may already be exhausted
        d = i + 1

        if x == y:
            # Same item at the same depth: overlap grows by one, no need to track it.
            x_d[d] = x_d[d - 1] + 1.0
        else:
            ls.add(x)
            if y is not None: ss.add(y)
            x_d[d] = x_d[d - 1] + (1.0 if x in ss else 0.0) + (1.0 if y in ls else 0.0)
        sum1 += x_d[d] / d * pow(p, d)

    # Contribution of the depths that only the longer list reaches.
    sum2 = 0.0
    for i in range(l - s):
        d = s + i + 1
        sum2 += x_d[d] * (d - s) / (d * s) * pow(p, d)

    # Extrapolation term evaluated at the depth of the longer list.
    sum3 = ((x_d[l] - x_d[s]) / l + x_d[s] / s) * pow(p, l)

    rbo_ext = (1 - p) / p * (sum1 + sum2) + sum3
    return rbo_ext

def calculate_pearson_coefficient(seq1, seq2):
    """Return the Pearson correlation of two sequences, or 0 if either has fewer than 2 items."""
    shortest = min(len(seq1), len(seq2))
    if shortest >= 2:
        # pearsonr yields (statistic, p-value); only the statistic is needed.
        return pearsonr(seq1, seq2)[0]
    return 0

def calculate_spearman_coefficient(seq1, seq2):
    """Return the Spearman rank correlation of two sequences, or 0 if either has fewer than 2 items."""
    shortest = min(len(seq1), len(seq2))
    if shortest >= 2:
        # spearmanr yields (statistic, p-value); only the statistic is needed.
        return spearmanr(seq1, seq2)[0]
    return 0

def calculate_kendalltau_coefficient(seq1, seq2):
    """Return Kendall's tau of two sequences, or 0 if either has fewer than 2 items."""
    shortest = min(len(seq1), len(seq2))
    if shortest >= 2:
        # kendalltau yields (statistic, p-value); only the statistic is needed.
        return kendalltau(seq1, seq2)[0]
    return 0

def compute_rank_correlations(rank, rank_ours):
    """Correlate two orderings of the same items.

    Each item is mapped to its 1-based position in ``rank`` and in ``rank_ours``;
    both position sequences are read off in the order of ``rank``. An item that
    does not occur in ``rank_ours`` is given position 0.

    Returns:
        A ``(kendall, spearman, pearson)`` tuple.
    """
    def positions(ordering):
        # 1-based position of every item; a repeated item keeps its last position.
        return {item: place for place, item in enumerate(ordering, start=1)}

    reference_positions = positions(rank)
    our_positions = positions(rank_ours)

    reference_sequence = []
    our_sequence = []
    for item in rank:
        reference_sequence.append(reference_positions.get(item, 0))
        our_sequence.append(our_positions.get(item, 0))

    return (
        calculate_kendalltau_coefficient(reference_sequence, our_sequence),
        calculate_spearman_coefficient(reference_sequence, our_sequence),
        calculate_pearson_coefficient(reference_sequence, our_sequence),
    )

In [49]:
import jieba
from rouge import Rouge
from nltk.translate.bleu_score import sentence_bleu
import numpy as np
import json
from tqdm import tqdm
from collections import defaultdict

def _rouge_f_scores(rouge, prediction, ground):
    """Return the (rouge-1, rouge-2, rouge-l) F-scores of ``prediction`` against ``ground``."""
    try:
        result = rouge.get_scores(prediction, ground)[0]
    except ValueError:
        # NOTE(review): the rouge package is expected to raise ValueError when the
        # hypothesis or reference is blank; such a pair has no overlap, so score it
        # as zero instead of aborting the whole run. Confirm against the installed version.
        return 0.0, 0.0, 0.0
    return result["rouge-1"]["f"], result["rouge-2"]["f"], result["rouge-l"]["f"]


def count_score(r, data_dir='../../data/antique/ANTIQUE_S5', device='cpu'):
    """Score every response of one sample file with the baseline metrics and report
    how well each metric's ranking agrees with the gold ranking.

    For each entry of ``<data_dir>/sample<r>.json`` the responses are scored against
    the first reference answer with BLEU, ROUGE-1/2/L (F-score) and BERTScore (last
    element of the ``bscore`` result). Responses are then ranked by score, best
    first, and compared with ``data['rank']`` using Kendall, Spearman, Pearson and
    RBO (p=0.5 and p=0.9). The averages over all entries are printed per metric.

    Args:
        r: Index of the sample file to load.
        data_dir: Directory holding the ``sample<r>.json`` files.
        device: Device string forwarded to ``bscore``.

    Returns:
        None; results are printed.
    """
    with open(data_dir + '/sample' + str(r) + '.json', 'r', encoding='utf-8-sig') as file:
        datas = json.load(file)

    metric_names = ['BLEU', 'ROUGE-1', 'ROUGE-2', 'ROUGE-L', 'BERTScore']

    # One (labels, gold_rank, {metric: [scores]}) record per entry. Records are kept
    # per entry rather than keyed by question text, so two entries that share a
    # question cannot merge their score lists or overwrite each other's gold rank.
    samples = []

    rouge = Rouge()

    for data in tqdm(datas):
        ground = data['answer'][0]  # first reference answer
        gold_rank = data['rank']
        names = list(data['response'].keys())
        doc_list = list(data['response'].values())
        if not doc_list:
            # Nothing to rank for this entry.
            continue

        # bscore returns several tensors; the last one is used as the score.
        bert_score_list = bscore(doc_list, [ground] * len(doc_list), model_type=model_name, device=device)[-1].tolist()

        ground_tokens = ground.split()
        metric_scores = {metric: [] for metric in metric_names}
        for prediction, bert_score in zip(doc_list, bert_score_list):
            rouge_1, rouge_2, rouge_l = _rouge_f_scores(rouge, prediction, ground)

            # BLEU on whitespace tokens.
            bleu_score = sentence_bleu([ground_tokens], prediction.split())

            metric_scores['BLEU'].append(bleu_score)
            metric_scores['ROUGE-1'].append(rouge_1)
            metric_scores['ROUGE-2'].append(rouge_2)
            metric_scores['ROUGE-L'].append(rouge_l)
            metric_scores['BERTScore'].append(bert_score)

        # Scores are produced in the iteration order of data['response'], so each one
        # must be labelled with its own response name. Labelling them positionally
        # with the gold ranking attaches scores to the wrong system whenever the two
        # orders differ.
        # NOTE(review): assumes data['rank'] lists the same identifiers that key
        # data['response']; when it does not, fall back to the previous positional pairing.
        labels = names if set(names) == set(gold_rank) else gold_rank
        samples.append((labels, gold_rank, metric_scores))

    for metric in metric_names:
        kendall_scores = []
        spearman_scores = []
        pearson_scores = []
        rbo_0_5_scores = []
        rbo_0_9_scores = []
        for labels, rank, metric_scores in samples:
            ranked_pairs = sorted(zip(labels, metric_scores[metric]), key=lambda pair: pair[1], reverse=True)
            rank_our = [label for label, _ in ranked_pairs]

            rbo_0_5_scores.append(score(rank_our, rank, p=0.5))
            rbo_0_9_scores.append(score(rank_our, rank, p=0.9))

            kendall, spearman, pearson = compute_rank_correlations(rank_our, rank)
            kendall_scores.append(kendall)
            spearman_scores.append(spearman)
            pearson_scores.append(pearson)

        avg_kendall = np.mean(kendall_scores)
        avg_spearman = np.mean(spearman_scores)
        avg_pearson = np.mean(pearson_scores)
        avg_rbo_0_5 = np.mean(rbo_0_5_scores)
        avg_rbo_0_9 = np.mean(rbo_0_9_scores)

        # Name the metric so the five result blocks can be told apart.
        print(metric)
        print("Kendall Tau: %.4f" % avg_kendall)
        print("Spearman: %.4f" % avg_spearman)
        print("Pearson: %.4f" % avg_pearson)
        print(f"RBO (p=0.5): {avg_rbo_0_5:.4f}")
        print(f"RBO (p=0.9): {avg_rbo_0_9:.4f}")

        print('*' * 40)


In [None]:
# Evaluate sample files 1 through 5, one count_score call per file index.
for r in range(1, 6):
    count_score(r)