In [42]:
import os
import pandas as pd
from utils import partial_match_scores


# --- Configuration: which result directory to analyse ----------------------
# Previously `root` was assigned five times in a row and only the last
# assignment took effect; the choice is now a single explicit setting.
DATASET_ROOT = "../datasets/myriadlama"

# Result directories this notebook has been pointed at (one entry per former
# `root = ...` line).
MODEL_DIRS = (
    "llama3.2_1b_it",
    "llama3.2_3b_it",
    "llama3.1_8b_it",
    "qwen2.5_3b_it",
    "qwen2.5_7b_it",
)

# Directory analysed by this run; change only this line to switch.
MODEL_DIR = "qwen2.5_7b_it"
assert MODEL_DIR in MODEL_DIRS, f"unknown model directory: {MODEL_DIR!r}"

root = f"{DATASET_ROOT}/{MODEL_DIR}"

# --- Load the per-paraphrase confidence table ------------------------------
confi_df = pd.read_feather(os.path.join(root, "confidence.feather"))

# Convert every row of the nested lemma columns into plain nested Python
# lists (each inner sequence becomes a `list`).
# NOTE(review): presumably read_feather yields numpy arrays here -- confirm.
for column in ("sample_lemmas", "answer_lemmas"):
    confi_df[column] = confi_df[column].apply(lambda xs: [list(x) for x in xs])


In [43]:
from tqdm import tqdm
from utils import is_matched_str, partial_match

# Directory holding one .feather file per evaluated paraphrase combination.
droot = os.path.join(root, "diversity")

# Per-file metrics. Every list receives exactly one value per processed file,
# so index i always refers to the same file across all seven lists.
ensemble_scores = []
consistency_scores = []
or_matches_scores = []
diversity_scores = []
avg_match_scores = []
new_ratio_scores = []
moe_ratio_scores = []

# Build paraphrase -> value lookups once instead of boolean-masking the whole
# of `confi_df` for every paraphrase of every row of every file.
# keep="first" reproduces the former `...tolist()[0]` choice when a paraphrase
# occurs more than once. A paraphrase missing from `confi_df` now raises
# KeyError (it used to raise IndexError).
confi_first = confi_df.drop_duplicates(subset="paraphrase", keep="first")
greedy_by_paraphrase = dict(zip(confi_first["paraphrase"], confi_first["greedy_lemma"]))
answers_by_paraphrase = dict(zip(confi_first["paraphrase"], confi_first["answer_lemmas"]))

for fn in tqdm(os.listdir(droot)):
    if not fn.endswith(".feather"):
        continue

    # Only file names made of exactly two comma-separated parts are analysed.
    if len(fn.split(",")) != 2:
        continue

    df = pd.read_feather(os.path.join(droot, fn))
    if df.empty:
        # An empty file has no samples to average over; skip it instead of
        # dividing by zero below.
        continue
    df["answer_lemmas"] = df["answer_lemmas"].apply(lambda xs: [list(x) for x in xs])

    # Score of the ensemble prediction stored in this file.
    scores = partial_match_scores(df['predict_lemma'].tolist(), df["answer_lemmas"].tolist())

    # Four parallel lists, one entry per sample:
    #   [0] greedy prediction for the first paraphrase
    #   [1] greedy prediction for the second paraphrase
    #   [2] ensemble prediction from this file
    #   [3] gold answers, looked up via the sample's last paraphrase
    predict_by_set = [[], [], [], []]
    for paraphrases, predict_lemma in zip(df["paraphrases"].tolist(), df['predict_lemma'].tolist()):
        if len(paraphrases) != 2:
            # Slots 0 and 1 are filled by position; any other count would
            # silently misalign the four lists.
            raise ValueError(
                f"{fn}: expected exactly 2 paraphrases per sample, got {len(paraphrases)}"
            )
        for idx, paraphrase in enumerate(paraphrases):
            predict_by_set[idx].append(greedy_by_paraphrase[paraphrase].tolist())
        predict_by_set[-2].append(predict_lemma)
        # Explicit index instead of relying on the inner loop variable
        # leaking out of the `for` statement.
        predict_by_set[-1].append(answers_by_paraphrase[paraphrases[-1]])

    consistency_matches = []
    or_matches = []
    and_matches = []
    avg_matches = []
    ensemble_matches = []

    # NOTE: "birdirectional" (sic) is the keyword spelling this notebook
    # passes to the `utils` helpers; it is kept unchanged on purpose.
    for predict1, predict2, ensemble_predict, answer_lemmas in zip(*predict_by_set):
        match1 = partial_match(predict1, answer_lemmas, birdirectional=False)
        match2 = partial_match(predict2, answer_lemmas, birdirectional=False)
        or_matches.append(match1 or match2)
        and_matches.append(match1 and match2)
        avg_matches.append(float(int(match1) + int(match2)) / 2)
        ensemble_matches.append(partial_match(ensemble_predict, answer_lemmas, birdirectional=False))
        # Do the two single-paraphrase predictions agree with each other?
        consistency_matches.append(is_matched_str(predict1, predict2, birdirectional=True))

    # moe_cnt: exactly one single-paraphrase prediction matches and the
    #          ensemble matches too.
    # new_cnt: neither single-paraphrase prediction matches, yet the ensemble
    #          does.
    moe_cnt = 0
    new_cnt = 0
    for and_match, or_match, ensemble_match in zip(and_matches, or_matches, ensemble_matches):
        if not and_match and or_match and ensemble_match:
            moe_cnt += 1
        if not and_match and not or_match and ensemble_match:
            new_cnt += 1
    moe_ratio_scores.append(moe_cnt / len(and_matches))
    new_ratio_scores.append(new_cnt / len(and_matches))

    ensemble_scores.append(scores)
    consistency_scores.append(sum(consistency_matches) / len(consistency_matches))
    or_matches_scores.append(sum(or_matches) / len(or_matches))

    # Diversity is the OR/AND ratio (reciprocal of the AND/OR overlap). It is
    # undefined when no sample has both predictions correct; record NaN so the
    # lists stay aligned -- `Series.corr` excludes missing values.
    and_total = sum(and_matches)
    diversity_scores.append(sum(or_matches) / and_total if and_total else float("nan"))

    avg_match_scores.append(sum(avg_matches) / len(avg_matches))

if not ensemble_scores:
    # Without this guard the summary below fails with a bare ZeroDivisionError.
    raise FileNotFoundError(f"no non-empty two-part .feather files found in {droot!r}")

ensemble_series = pd.Series(ensemble_scores)
print("Pearson correlation between ensemble and consistency scores: ", ensemble_series.corr(pd.Series(consistency_scores)))
print("Pearson correlation between ensemble and OR match scores: ", ensemble_series.corr(pd.Series(or_matches_scores)))
print("Pearson correlation between ensemble and diversity scores: ", ensemble_series.corr(pd.Series(diversity_scores)))
print("Pearson correlation between ensemble and avg match scores: ", ensemble_series.corr(pd.Series(avg_match_scores)))
print(f"Mean of MOE ratio scores: {sum(moe_ratio_scores)/len(moe_ratio_scores):.4f}")
print(f"Mean of new ratio scores: {sum(new_ratio_scores)/len(new_ratio_scores):.4f}")

100%|██████████| 12/12 [00:10<00:00,  1.11it/s]

Pearson correlation between ensemble and consistency scores:  0.7181726579923353
Pearson correlation between ensemble and OR match scores:  0.8592995356762926
Pearson correlation between ensemble and diversity scores:  -0.6755791627609598
Pearson correlation between ensemble and avg match scores:  0.932651380517281
Mean of MOE ratio scores: 0.1323
Mean of new ratio scores: 0.0085



