## Evaluate Calibration effectiveness

In [30]:
import json
import re

def clean_reason(permutation):
  # 找到第一次出现的排名序列
  ranking_pattern = re.findall(r'\[\d+\] >', permutation)
  new_response = ""
  # 提取这些模式中的数字
  ranking_numbers = [int(num) for pattern in ranking_pattern for num in re.findall(r'\d+', pattern)]
  new_response = " ".join([str(i )for i in ranking_numbers])
  # 单独处理最后一个排名元素
  # 查找文本中的最后一个 [数字] 模式
  last_number_match = re.search(r'\[\d+\](?!.*\[\d+\])', permutation)
  if last_number_match:
      last_number = last_number_match.group().strip('[]')
      new_response +=  " " + last_number
      # ranking_numbers.append(str(last_number))
  return new_response
T5_path = "../data/T5_results/msmarco_test_t5.jsonl"
GPT2_path = "../data/GPT2_results/msmarco_test_gpt2.jsonl"
t5_ranks = []
gpt35_ranks = [] 
with open(T5_path, "r") as f:
    data = []
    for i in f.readlines():
        item = json.loads(i)
        data.append(item)
        
        res = clean_reason(item["gpt2_reason"])
        res = res.split()
        res = [int(i) for i in res]
        res = list(set(res))
        for num in range(1, 6):
            if num not in res:
                res.append(num)
        count_larger = 0
        for num in res:
            if num >5:
                res.remove(num)
                count_larger+=1
        if count_larger>=1:
            continue
        gptres = [int(i) for i in item["re_rank_id"]]
        gptres = list(set(gptres))
        for num in range(1, 6):
            if num not in gptres:
                gptres.append(num)
        gpt35_ranks.append(gptres)
        # print(res)
        t5_ranks.append(res)

# Kendall's Tau
from scipy.stats import kendalltau

taus = []
p_values = []
for gpt35, t5 in zip(gpt35_ranks,t5_ranks):
    # gpt_ranks = [1, 2, 4, 3, 5]  # GPT的排名
    # t5_ranks = [3, 1, 5, 4, 2]  # T5的排名
    
    # 计算Kendall's Tau系数
    tau, p_value = kendalltau(gpt35, t5)
    taus.append(tau)
    p_values.append(p_value)
print("Kendall's Tau Coefficient:", sum(taus)/len(taus))
print("P-value:", sum(p_values)/len(p_values))
# GPT2
# Kendall's Tau Coefficient: 0.652941176470588
# P-value: 0.31764705882352934
# T5 
# Kendall's Tau Coefficient: 0.802
# P-value: 0.15

Kendall's Tau Coefficient: 0.652941176470588
P-value: 0.31764705882352934


## Evaluate similiarity between different reasons

In [55]:
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

T5_path = "../data/T5_results/msmarco_test_t5.jsonl"
gpt3_texts = []
t5_texts = []
with open(T5_path, "r") as f:
    data = []
    for i in f.readlines():
        item = json.loads(i)
        data.append(item)
        t5_texts.append(item["t5_reason"])
        gpt3_texts.append(item["reason"])
        
# GPT 和 T5 的原始解释文本
# gpt_text = """reason"[1] > [2] > [4] > [3] > [5] I ranked passage [1] as the most relevant because it directly provides a definition of declaratory judgment, explaining that it declares the rights, duties, or obligations of each party in a dispute without ordering any action or awarding damages. Passage [2] is ranked next as it also defines declaratory judgment by stating that it resolves a dispute by stating a fact, such as ownership of property or patents. Passage [4] is ranked third as it further elaborates on the types of situations where declaratory judgments are sought, such as determining rights under specific laws or regulations. Passage [3] is ranked fourth as it discusses a factor that supports the dismissal of a declaratory judgment action, which is relevant but not as directly related to defining declaratory judgment as the previous passages. Passage [5] is ranked last as it provides options available to insurers in certain situations, which is relevant but not as directly related to defining declaratory judgment as the other passages."""
# t5_text = """Here is the ranking of the passages based on their relevance to the search query "definition declaratory judgment": [3] > [1] > [5] > [4] > [2] Reasoning: - [3] provides a clear definition of declaratory judgment, providing a clear definition of the term. - [1] discusses the definition of declaratory judgment, which is relevant to understanding the concept. - [5] discusses the role of insurers in defending the insured, which is relevant to understanding the concept. - [4] discusses the conduct of insurers in filing declaratory judgment actions, which is relevant to understanding the concept. - [2] discusses the definition of declaratory judgment, which is relevant to understanding the concept."""

# def extract_explanations_gpt3(text):
#     """ Extract explanations from the provided GPT text based on passage identifiers. """
#     explanations = {}
#     matches = re.finditer(r"Passage \[(\d+)\] (is ranked .*?)(?= Passage|$)", text, re.DOTALL)
#     for match in matches:
#         index = int(match.group(1))
#         explanation = match.group(2).strip()
#         explanations[index] = explanation
#     return explanations
def extract_explanations_gpt3(text):
    """ Extract explanations from the provided GPT text based on passage identifiers. """
    explanations = {}
    # 提取段落编号和对应的解释，处理多行解释的情况
    matches = re.finditer(r"\[(\d+)\]:\s*([^[]+)(?=\[\d+\]:|$)", text, re.DOTALL)
    for match in matches:
        index = int(match.group(1))
        explanation = " ".join(match.group(2).strip().split('\n'))  # 去除多余的换行符，并将解释整理为单行
        explanations[index] = explanation
    return explanations

def extract_explanations_t5(text):
    """ Extract explanations from the provided T5 text based on passage identifiers. """
    explanations = {}
    matches = re.finditer(r"\-\s+\[(\d+)\]\s+(.*?)(?=\s*\- \[|\s*$)", text, re.DOTALL)
    for match in matches:
        index = int(match.group(1))
        explanation = match.group(2).strip()
        explanations[index] = explanation
    return explanations

overall_sim = []
work_example = 0
for gpt3_text, t5_text in zip(gpt3_texts, t5_texts):
    # 提取解释
    gpt3_explanations = extract_explanations_gpt3(gpt3_text)
    t5_explanations = extract_explanations_t5(t5_text)
    # print(gpt3_explanations)
    # 计算余弦相似度
    vectorizer = TfidfVectorizer()
    all_texts = list(gpt3_explanations.values()) + list(t5_explanations.values())
    tfidf_matrix = vectorizer.fit_transform(all_texts)
    
    # 分配索引
    gpt3_indices = {index: i for i, index in enumerate(gpt3_explanations.keys())}
    t5_indices = {index: i + len(gpt3_explanations) for i, index in enumerate(t5_explanations.keys())}
    # print(gpt3_indices)
    sum_sim = []
    # 打印相似度结果
    for index in gpt3_indices:
        if index in t5_indices:
            gpt_idx = gpt3_indices[index]
            t5_idx = t5_indices[index]
            similarity = cosine_similarity(tfidf_matrix[gpt_idx:gpt_idx+1], tfidf_matrix[t5_idx:t5_idx+1])[0][0]
            sum_sim.append(similarity)
            # print(f"Similarity between GPT and T5 explanations for passage [{index}]: {similarity:.3f}")
    if len(sum_sim)==0:
        continue
    work_example+=1
    overall_sim.append(sum(sum_sim)/len(sum_sim))
print(overall_sim)

[0.246220399521403, 0.3769048847428245, 0.3481947502162826, 0.42543153250196997, 0.309233521418506, 0.2567397198976442, 0.2896656907634423, 0.2143500703590344, 0.3599341219059331, 0.35396645544195604, 0.2391733136396949, 0.24934297753445098, 0.30986212877568897, 0.20640245834866996, 0.2610240840814875, 0.1311377927328061, 0.36652489391592347, 0.3503583678833506, 0.3498927134375653, 0.2537443117512478, 0.27204758534413254, 0.36452848372868474, 0.24748125052269612, 0.34916042893236166, 0.28430560410243333, 0.25089044806864697, 0.37697756402323734, 0.3599045632904195, 0.26895074881080494, 0.33032869033150486, 0.18209529504901367, 0.4123686409037216, 0.23900270969558424, 0.44192292898227353, 0.41056051463836896, 0.3516133363074111, 0.47209832505949845, 0.25493537297882823, 0.2787231503699099, 0.2554791976832552, 0.3972712774371111, 0.28028283644511964, 0.30150706595716675, 0.26186615316185013, 0.2774183775063283, 0.30191148839391146, 0.4094107136879803, 0.20497882756027203, 0.3234050904768

In [57]:
sum(overall_sim)/len(overall_sim)

0.30040532299050265

In [44]:
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

gpt2_path = "../data/GPT2_results/msmarco_test_gpt2.jsonl"
gpt3_texts = []
gpt2_texts = []
with open(gpt2_path, "r") as f:
    data = []
    for i in f.readlines():
        item = json.loads(i)
        data.append(item)
        gpt2_texts.append(item["gpt2_reason"])
        gpt3_texts.append(item["reason"])
        
# GPT 和 T5 的原始解释文本
# gpt_text = """reason"[1] > [2] > [4] > [3] > [5] I ranked passage [1] as the most relevant because it directly provides a definition of declaratory judgment, explaining that it declares the rights, duties, or obligations of each party in a dispute without ordering any action or awarding damages. Passage [2] is ranked next as it also defines declaratory judgment by stating that it resolves a dispute by stating a fact, such as ownership of property or patents. Passage [4] is ranked third as it further elaborates on the types of situations where declaratory judgments are sought, such as determining rights under specific laws or regulations. Passage [3] is ranked fourth as it discusses a factor that supports the dismissal of a declaratory judgment action, which is relevant but not as directly related to defining declaratory judgment as the previous passages. Passage [5] is ranked last as it provides options available to insurers in certain situations, which is relevant but not as directly related to defining declaratory judgment as the other passages."""
# t5_text = """Here is the ranking of the passages based on their relevance to the search query "definition declaratory judgment": [3] > [1] > [5] > [4] > [2] Reasoning: - [3] provides a clear definition of declaratory judgment, providing a clear definition of the term. - [1] discusses the definition of declaratory judgment, which is relevant to understanding the concept. - [5] discusses the role of insurers in defending the insured, which is relevant to understanding the concept. - [4] discusses the conduct of insurers in filing declaratory judgment actions, which is relevant to understanding the concept. - [2] discusses the definition of declaratory judgment, which is relevant to understanding the concept."""

def extract_explanations_gpt3(text):
    """ Extract explanations from the provided GPT text based on passage identifiers. """
    explanations = {}
    matches = re.finditer(r"Passage \[(\d+)\] (is ranked .*?)(?= Passage|$)", text, re.DOTALL)
    for match in matches:
        index = int(match.group(1))
        explanation = match.group(2).strip()
        explanations[index] = explanation
    return explanations

def extract_explanations_gpt2(text):
    """ Extract explanations from the provided T5 text based on passage identifiers. """
    explanations = {}
    matches = re.finditer(r"\-\s+\[(\d+)\]\s+(.*?)(?=\s*\- \[|\s*$)", text, re.DOTALL)
    for match in matches:
        index = int(match.group(1))
        explanation = match.group(2).strip()
        explanations[index] = explanation
    return explanations

overall_sim = []
work_example = 0
for gpt3_text, gpt2_text in zip(gpt3_texts, gpt2_texts):
    # 提取解释
    gpt3_explanations = extract_explanations_gpt3(gpt3_text)
    gpt2_explanations = extract_explanations_gpt2(gpt2_text)

    # print("t5_explanations", t5_explanations)
    # 计算余弦相似度
    vectorizer = TfidfVectorizer()
    all_texts = list(gpt3_explanations.values()) + list(gpt2_explanations.values())
    tfidf_matrix = vectorizer.fit_transform(all_texts)
    
    # 分配索引
    gpt3_indices = {index: i for i, index in enumerate(gpt3_explanations.keys())}
    gpt2_indices = {index: i + len(gpt3_explanations) for i, index in enumerate(gpt2_explanations.keys())}

    sum_sim = []
    # 打印相似度结果
    for index in gpt3_indices:
        if index in gpt2_indices:
            gpt_idx = gpt3_indices[index]
            gpt2_idx = gpt2_indices[index]
            similarity = cosine_similarity(tfidf_matrix[gpt_idx:gpt_idx+1], tfidf_matrix[gpt2_idx:gpt2_idx+1])[0][0]
            sum_sim.append(similarity)
            # print(f"Similarity between GPT and T5 explanations for passage [{index}]: {similarity:.3f}")
    if len(sum_sim)==0:
        continue
    work_example+=1
    overall_sim.append(sum(sum_sim)/len(sum_sim))
print(overall_sim)

ValueError: empty vocabulary; perhaps the documents only contain stop words

In [63]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
t5_path = "../data/T5_results/msmarco_test_t5.jsonl"
gpt3_texts = []
t5_texts = []
with open(t5_path, "r") as f:
    data = []
    for i in f.readlines():
        item = json.loads(i)
        data.append(item)
        t5_texts.append(item["t5_reason"])
        gpt3_texts.append(item["reason"])
similarities = []        
for t5_text, gpt3_text in zip(t5_texts, gpt3_texts):

    # 计算余弦相似度
    vectorizer = TfidfVectorizer()
    texts = [gpt3_text, t5_text]
    tfidf_matrix = vectorizer.fit_transform(texts)
    
    similarity = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:2])[0][0]
    # print(f"Overall Cosine Similarity between the two reason texts: {similarity:.3f}")
    similarities.append(similarity)
print(sum(similarities)/len(similarities))

0.6767335681996629


In [62]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
gpt2_path = "../data/GPT2_results/msmarco_test_gpt2.jsonl"
gpt3_texts = []
gpt2_texts = []
with open(gpt2_path, "r") as f:
    data = []
    for i in f.readlines():
        item = json.loads(i)
        data.append(item)
        gpt2_texts.append(item["gpt2_reason"])
        gpt3_texts.append(item["reason"])
similarities = []        
for gpt2_text, gpt3_text in zip(gpt2_texts, gpt3_texts):

    # 计算余弦相似度
    vectorizer = TfidfVectorizer()
    texts = [gpt3_text, gpt2_text]
    tfidf_matrix = vectorizer.fit_transform(texts)
    
    similarity = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:2])[0][0]
    # print(f"Overall Cosine Similarity between the two reason texts: {similarity:.3f}")
    similarities.append(similarity)
print(sum(similarities)/len(similarities))

0.34289941380256433
