In [17]:
# Vanilla testing

import pandas as pd

all_lms = ["qwen2_0B5", "qwen2_1B5", "qwen2_7B", "mistral03_7B", "llama3_8B", "gemma2_9B"]
result_403_path = "result/MTMT/{}/MR0_round_original.pickle"
result_s97_path = "result/MTMT/{}/S97_MR0_SNP_s06_original.pickle"

CHECK_LLM = 0

row_bugs = []
row_ETC = []
row_BTC = []
row_BSV = []
row_PRate = []

for chosen_lm in all_lms:
    df_403 = pd.read_pickle(result_403_path.format(chosen_lm))
    df_s97 = pd.read_pickle(result_s97_path.format(chosen_lm))

    df_full = pd.concat([df_403, df_s97], axis=0, ignore_index=True)

    df_full["score_semantic"]=[item[0] if item else None for item in df_full["score_semantic"]]
    Bugs = len(df_full[df_full["score_semantic"]<0.6])
    ETC = len(df_full[df_full["score_semantic"]<0.6]["source_uuid"].unique())
    BTC = Bugs/ETC
    PRate = len(df_full[df_full["score_semantic"]<0.6]) / 7983

    row_bugs.append(Bugs)
    row_ETC.append(ETC)
    row_BTC.append(BTC)
    row_PRate.append(PRate)

for row in [row_bugs, row_ETC, row_BTC, row_PRate]:
    row_text = " & ".join([str(round(item,4)) for item in row])
    print(row_text)

6654 & 5582 & 5028 & 5014 & 3042 & 1999
500 & 500 & 495 & 500 & 485 & 470
13.308 & 11.164 & 10.1576 & 10.028 & 6.2722 & 4.2532
0.8335 & 0.6992 & 0.6298 & 0.6281 & 0.3811 & 0.2504


# RQ1: Effectiveness

### MORTAR

In [24]:
import copy
import json 
import pandas as pd
import numpy as np
from itertools import combinations, product
from tqdm import tqdm
from multi_turn_test import MultiTurnConversation as MTC
score_tools = MTC.score_utils()

MORTAR_Perts = [
    "P1_round_shuffle",
    "P2_round_reduce",
    "P3_round_duplicate",
    "P4_round_reduce_shuffle",
    "P5_round_shuffle_duplicate",
    ]

dataset_jsons = {}
for this_pert in MORTAR_Perts:
    with open(f"data/{this_pert}.json", "r") as f:
        dataset_jsons[this_pert] = json.load(f)

mapping_original_round_appear_in_pert = {} # Oid-r1 -> [P1-r1, P2-r3 ...]
all_original_round_ids = []

dialogue_round_keys_403 = json.load(open("data/dialogue_round_keys_403.json", "r"))
for original_dialogue_key in dialogue_round_keys_403.keys():
    for original_round_key in dialogue_round_keys_403[original_dialogue_key]:
        original_round_uid = original_dialogue_key + "##" + original_round_key
        all_original_round_ids.append(original_round_uid)
        mapping_original_round_appear_in_pert[original_round_uid] = []

for pert_type in MORTAR_Perts:
    this_pert_dataset = dataset_jsons[pert_type]
    for pert_dialogue_key in this_pert_dataset.keys():
        for pert_round_key in this_pert_dataset[pert_dialogue_key].keys():
            pert_round_uid = pert_type + "#pert#" +pert_dialogue_key + "#pert#" + pert_round_key
            original_round = this_pert_dataset[pert_dialogue_key][pert_round_key]['is_original_round']
            original_round_uid = pert_dialogue_key + "##" + original_round # key in mapping_original_round_appear_in_pert
            mapping_original_round_appear_in_pert[original_round_uid].append(pert_round_uid)


list_pert_dialogue_usage_mr1 = {} # O1-r3 : [P1-O1-r2, P2...], (pert_round_uid), expect output(P1-O1-r2) == answer(O1-r3)
list_pert_dialogue_usage_mr2 = {} # O1-r2 : [P2-O1-r1, ...], (pert_round_uid), expect output(P2-O1-r1) != answer(O1-r2)

dict_pert_dialogue_usage_mr3 = {} # O1-r1 : {1:[P1-O1-r1, P2-O1-r3, ...]}, expect output(P1-O1-r1) == output(P2-O1-r3) and ...
dict_pert_dialogue_usage_mr4 = {} # O1-r1 : {0:[P2-O1-r1] 1:[P1-O1-r1, P2-O1-r3]}, expect output(P2-O1-r1) != output(P1-O1-r1) and output(P2-O1-r1) != output(P2-O1-r3)

for original_round_id in all_original_round_ids:
    list_pert_dialogue_usage_mr1[original_round_id] = []
    list_pert_dialogue_usage_mr2[original_round_id] = []

    this_ori_dialogue_appear_in_pert = mapping_original_round_appear_in_pert[original_round_id]

    pert_answerable_q_sum = 0
    dict_pert_answerability = {}

    for pert_round_id in this_ori_dialogue_appear_in_pert:
        pert_type = pert_round_id.split("#pert#")[0]
        pert_dialogue_key = pert_round_id.split("#pert#")[1]
        pert_round_key = pert_round_id.split("#pert#")[2]

        answerability_this_pert_round = (dataset_jsons[pert_type][pert_dialogue_key][pert_round_key]["Answer"] != "unknown") # changed answerability
        dict_pert_answerability[pert_round_key] = answerability_this_pert_round
        pert_answerable_q_sum += answerability_this_pert_round

        if answerability_this_pert_round:
            list_pert_dialogue_usage_mr1[original_round_id].append(pert_round_id)
        else:
            if dataset_jsons[pert_type][pert_dialogue_key][pert_round_key]["FLAG_answer_changed"]:
                list_pert_dialogue_usage_mr2[original_round_id].append(pert_round_id)

    if pert_answerable_q_sum == len(this_ori_dialogue_appear_in_pert) and pert_answerable_q_sum > 0: # all pert rounds are answerable
        dict_pert_dialogue_usage_mr3[original_round_id] = {1:[]}
        dict_pert_dialogue_usage_mr3[original_round_id][1] = copy.deepcopy(list_pert_dialogue_usage_mr1[original_round_id])
    elif 0<pert_answerable_q_sum<len(this_ori_dialogue_appear_in_pert): # some pert rounds are answerable, some are not
        dict_pert_dialogue_usage_mr4[original_round_id] = {0:[],1:[]}
        dict_pert_dialogue_usage_mr4[original_round_id][0] = copy.deepcopy(list_pert_dialogue_usage_mr2[original_round_id])
        dict_pert_dialogue_usage_mr4[original_round_id][1] = copy.deepcopy(list_pert_dialogue_usage_mr1[original_round_id])

for original_round_id in list_pert_dialogue_usage_mr1.copy().keys():
    if len(list_pert_dialogue_usage_mr1[original_round_id]) == 0:
        del list_pert_dialogue_usage_mr1[original_round_id]

for original_round_id in list_pert_dialogue_usage_mr2.copy().keys():
    if len(list_pert_dialogue_usage_mr2[original_round_id]) == 0:
        del list_pert_dialogue_usage_mr2[original_round_id]

for original_round_id in dict_pert_dialogue_usage_mr3.copy().keys():
    if len(dict_pert_dialogue_usage_mr3[original_round_id][1]) < 2:
        del dict_pert_dialogue_usage_mr3[original_round_id]

for original_round_id in dict_pert_dialogue_usage_mr4.copy().keys():
    if len(dict_pert_dialogue_usage_mr4[original_round_id][0]) == 0 or len(dict_pert_dialogue_usage_mr4[original_round_id][1]) == 0:
        del dict_pert_dialogue_usage_mr4[original_round_id]

all_lms = ["qwen2_0B5", "qwen2_1B5", "qwen2_7B", "mistral03_7B", "llama3_8B", "gemma2_9B"]

dist_LLM_DS_name = {"qwen2_0B5":"DS1", "qwen2_1B5":"DS2", "qwen2_7B":"DS3", "mistral03_7B":"DS4", "llama3_8B":"DS5", "gemma2_9B":"DS6"}

MORTAR_Perts = ["P1_round_shuffle", "P2_round_reduce", "P3_round_duplicate", "P4_round_reduce_shuffle", "P5_round_shuffle_duplicate"]
result_path = "result/MTMT/{}/{}.pickle"

dataset_original = json.load(open("data/500_MR0_SNP_s06_original.json", "r"))

MORTAR_result_DataFrames = {}
for chosen_lm in all_lms:
    for chosen_pert in MORTAR_Perts:
        df_result = pd.read_pickle(result_path.format(chosen_lm, chosen_pert))
        df_result = df_result[df_result["role"] == "assistant"]
        df_result["score_semantic"]=[item[0] if item else None for item in df_result["score_semantic"]]
        df_result["row_uid"] = [ (df_result.loc[row, "source_uuid"] + "##" + str(df_result.loc[row,"round"])) for row in df_result.index]
        df_result["id_pert_round"] = [chosen_pert + "#pert#" + df_result.loc[row, "source_uuid"] + "#pert#" + "Round " + str(df_result.loc[row,"round"]) for row in df_result.index]
        MORTAR_result_DataFrames[chosen_lm + "_" + chosen_pert] = copy.deepcopy(df_result)

def get_original_answer(id_pert_round):
    pert_dialogue_key = id_pert_round.split("#pert#")[1]
    original_round_key = id_pert_round.split("#pert#")[2]
    return dataset_original[pert_dialogue_key][original_round_key]["Answer"]

def get_pert_round_answer(id_pert_round):
    pert_type = id_pert_round.split("#pert#")[0]
    df_result = MORTAR_result_DataFrames[chosen_lm + "_" + pert_type]
    return df_result[df_result["id_pert_round"] == id_pert_round]["content"].values[0]

def get_pert_round_expected_answer(id_pert_round):
    pert_type = id_pert_round.split("#pert#")[0]
    df_result = MORTAR_result_DataFrames[chosen_lm + "_" + pert_type]
    return df_result[df_result["id_pert_round"] == id_pert_round]["expected_answer"].values[0][0]

def MORTAR_MR1_conflict_detector(id_pert_round):
    pert_type = id_pert_round.split("#pert#")[0]
    df_result = MORTAR_result_DataFrames[chosen_lm + "_" + pert_type]
    score = df_result[df_result["id_pert_round"] == id_pert_round]["score_semantic"].values[0]

    BUG_FLAG=False
    if score < 0.6:
        BUG_FLAG=True
    ERROR = abs((1 - score)/2)
    return BUG_FLAG, ERROR

def MORTAR_MR2_conflict_detector(id_pert_round):
    output = get_pert_round_answer(id_pert_round)
    answer_original = get_original_answer(id_pert_round)
    score = score_tools.semantic_similarity(output, answer_original)

    BUG_FLAG=False
    if score > 0.6:
        BUG_FLAG=True

    similarity_to_unknown = score_tools.semantic_similarity(output, "unknown")
    ERROR = abs((1 - similarity_to_unknown)/2)
    return BUG_FLAG, ERROR

def MORTAR_MR3_conflict_detector(list_pert_round_1):
    two_element_combinations = list(combinations(list_pert_round_1, 2))
    max_similarity = -1
    for pair in two_element_combinations:
        sentence_1 = get_pert_round_answer(pair[0])
        sentence_2 = get_pert_round_answer(pair[1])
        similarity = score_tools.semantic_similarity(sentence_1, sentence_2)

        if similarity > max_similarity:
            max_similarity = similarity

        if similarity < 0.6:
            ERROR = abs((1 - similarity)/2)
            return True, ERROR, pair
    return False, abs((1 - max_similarity)/2), None

def MORTAR_MR4_conflict_detector(list_pert_rounds_1, list_pert_rounds_0):
    max_similarity = -1
    pairs = product(list_pert_rounds_1, list_pert_rounds_0)
    for pair in pairs:
        id_pert_round_1 = pair[0]
        id_pert_round_0 = pair[1]
        output_1 = get_pert_round_answer(id_pert_round_1)
        output_0 = get_pert_round_answer(id_pert_round_0)

        similarity = score_tools.semantic_similarity(output_1, output_0)
        
        if similarity > max_similarity:
            max_similarity = similarity

        if similarity > 0.6:
            answer_1 = get_pert_round_expected_answer(id_pert_round_1)
            answer_0 = get_pert_round_expected_answer(id_pert_round_0)
            
            embed_1 = score_tools.encoder(output_1)-score_tools.encoder(output_0)
            embed_2 = score_tools.encoder(answer_1)-score_tools.encoder(answer_0)
            cos = score_tools.cosine_similarity(embed_1, embed_2)
            ERROR = abs((1 - cos)/2)
            return True, ERROR, pair
    
    return False, abs((-1 - max_similarity)/2), None


PAPER_RESULT_MORTAR = {}
FOUNDED_Bugs_MORTAR = {}

for chosen_lm in all_lms:

    MR1_TOOGLE = True
    MR2_TOOGLE = True
    MR3_TOOGLE = True
    MR4_TOOGLE = True

    MORTAR_Bugs = 0
    MORTAR_ETC = []
    MORTAR_BTC = 0
    MORTAR_BSV_sum = 0
    MORTAR_Amount_detection = 0

    # MR1
    MR1_Amount_detection = 0
    MR1_Bugs = []
    MR1_ETC = []
    MR1_BSV_sum = 0

    if MR1_TOOGLE:
        print("MR1")
        for original_round_id in tqdm(list_pert_dialogue_usage_mr1.keys()):
            original_dialogue_key = original_round_id.split("##")[0]

            for pert_round_id in list_pert_dialogue_usage_mr1[original_round_id]:
                pert_type = pert_round_id.split("#pert#")[0]
                pert_dialogue_key = pert_round_id.split("#pert#")[1]

                BUG_FLAG, ERROR = MORTAR_MR1_conflict_detector(pert_round_id)
                MR1_Amount_detection += 1
                if BUG_FLAG:
                    id_bug = "MR1#mr#" + original_round_id + "#bug#" + pert_round_id
                    MR1_Bugs.append(id_bug)
                    MR1_ETC.append(original_dialogue_key)
                    MR1_BSV_sum += ERROR

    # MR2
    MR2_Amount_detection = 0
    MR2_Bugs = []
    MR2_ETC = []
    MR2_BSV_sum = 0

    if MR2_TOOGLE:
        print("MR2")
        for original_round_id in tqdm(list_pert_dialogue_usage_mr2.keys()):
            original_dialogue_key = original_round_id.split("##")[0]

            for pert_round_id in list_pert_dialogue_usage_mr2[original_round_id]:
                pert_type = pert_round_id.split("#pert#")[0]
                pert_dialogue_key = pert_round_id.split("#pert#")[1]
                # pert_round_key = pert_round_id.split("#pert#")[2]

                # id_test_case = pert_type + "#testcase#"
                BUG_FLAG, ERROR = MORTAR_MR2_conflict_detector(pert_round_id)
                MR2_Amount_detection += 1
                if BUG_FLAG:
                    id_bug = "MR2#mr#" + original_round_id + "#bug#" + pert_round_id
                    MR2_Bugs.append(id_bug)
                    MR2_ETC.append(original_dialogue_key)
                    MR2_BSV_sum += ERROR

    # MR3
    MR3_Amount_detection = 0
    MR3_Bugs = []
    MR3_ETC = []
    MR3_BSV_sum = 0
    if MR3_TOOGLE:
        print("MR3")
        for original_round_id in tqdm(dict_pert_dialogue_usage_mr3.keys()):
            original_dialogue_key = original_round_id.split("##")[0]

            # id_test_case = pert_type + "#testcase#"
            BUG_FLAG, ERROR, Bug_pair = MORTAR_MR3_conflict_detector(dict_pert_dialogue_usage_mr3[original_round_id][1])
            MR3_Amount_detection += 1
            if BUG_FLAG:
                id_bug =  "MR3#mr#" + original_round_id + "#bug#" + Bug_pair[0] + "#bug#" + Bug_pair[1]
                MR3_Bugs.append(id_bug)
                MR3_ETC.append(original_dialogue_key)
                MR3_BSV_sum += ERROR

    # MR4
    MR4_Amount_detection = 0
    MR4_Bugs = []
    MR4_ETC = []
    MR4_BSV_sum = 0
    if MR4_TOOGLE:
        print("MR4")
        for original_round_id in tqdm(dict_pert_dialogue_usage_mr4.keys()):
            original_dialogue_key = original_round_id.split("##")[0]

            list_pert_round_id_1 = dict_pert_dialogue_usage_mr4[original_round_id][1]
            list_pert_round_id_0 = dict_pert_dialogue_usage_mr4[original_round_id][0]

            BUG_FLAG, ERROR, Bug_pair = MORTAR_MR4_conflict_detector(dict_pert_dialogue_usage_mr4[original_round_id][1], dict_pert_dialogue_usage_mr4[original_round_id][0])
            MR4_Amount_detection += 1
            if BUG_FLAG:
                id_bug = "MR4#mr#" + original_round_id + "#bug#" + Bug_pair[0] + "#bug#" + Bug_pair[1]
                MR4_Bugs.append(id_bug)
                MR4_ETC.append(original_dialogue_key)
                MR4_BSV_sum += ERROR

    MORTAR_Bug_founded = MR1_Bugs + MR2_Bugs + MR3_Bugs + MR4_Bugs
    MORTAR_Bugs = len(MORTAR_Bug_founded)
    MORTAR_ETC_ids = set(MR1_ETC + MR2_ETC + MR3_ETC + MR4_ETC)
    MORTAR_ETC = round(len(MORTAR_ETC_ids)/403*100, 1) # percentage
    MORTAR_BTC = MORTAR_Bugs/len(MORTAR_ETC_ids)
    MORTAR_BTS = MORTAR_Bugs/403
    MORTAR_BSV = (MR1_BSV_sum + MR2_BSV_sum + MR3_BSV_sum + MR4_BSV_sum)/MORTAR_Bugs
    MORTAR_Amount_detection = MR1_Amount_detection + MR2_Amount_detection + MR3_Amount_detection + MR4_Amount_detection
    MORTAR_Rate_Plus = round(MORTAR_Bugs/MORTAR_Amount_detection*100, 1) 
    MORTAR_BugScore = MORTAR_BSV * MORTAR_BTS

    PAPER_RESULT_MORTAR[chosen_lm] = {"Bugs":MORTAR_Bugs, "ETC":MORTAR_ETC, "BTC":round(MORTAR_BTC,3), "BSV":round(MORTAR_BSV,3), "Rate+":MORTAR_Rate_Plus, "BugScore":round(MORTAR_BugScore,3)}
    FOUNDED_Bugs_MORTAR[chosen_lm] = MORTAR_Bug_founded

    print("{} & {}  & {}\\%  & {}  & {}  & {}\\% & {} \\\\ ".format(dist_LLM_DS_name[chosen_lm], MORTAR_Bugs, MORTAR_ETC, round(MORTAR_BTC,3), round(MORTAR_BSV,3), MORTAR_Rate_Plus, round(MORTAR_BugScore,3)))


MR1


100%|██████████| 6375/6375 [00:16<00:00, 380.70it/s]


MR2


100%|██████████| 151/151 [00:03<00:00, 37.89it/s]


MR3


100%|██████████| 6194/6194 [01:56<00:00, 53.09it/s]


MR4


100%|██████████| 151/151 [00:05<00:00, 25.17it/s]


DS1 & 31942  & 100.0\%  & 79.261  & 0.375  & 86.5\% & 29.688 \\ 
MR1


100%|██████████| 6375/6375 [00:16<00:00, 389.57it/s]


MR2


100%|██████████| 151/151 [00:03<00:00, 41.56it/s]


MR3


100%|██████████| 6194/6194 [02:08<00:00, 48.33it/s]


MR4


100%|██████████| 151/151 [00:06<00:00, 23.61it/s]


DS2 & 27799  & 100.0\%  & 68.98  & 0.368  & 75.3\% & 25.378 \\ 
MR1


100%|██████████| 6375/6375 [00:16<00:00, 393.32it/s]


MR2


100%|██████████| 151/151 [00:03<00:00, 42.61it/s]


MR3


100%|██████████| 6194/6194 [04:38<00:00, 22.26it/s]


MR4


100%|██████████| 151/151 [00:06<00:00, 24.44it/s]


DS3 & 24634  & 100.0\%  & 61.127  & 0.35  & 66.7\% & 21.381 \\ 
MR1


100%|██████████| 6375/6375 [00:16<00:00, 388.38it/s]


MR2


100%|██████████| 151/151 [00:03<00:00, 40.85it/s]


MR3


100%|██████████| 6194/6194 [05:40<00:00, 18.20it/s]


MR4


100%|██████████| 151/151 [00:05<00:00, 28.74it/s]


DS4 & 22919  & 100.0\%  & 56.871  & 0.344  & 62.1\% & 19.551 \\ 
MR1


100%|██████████| 6375/6375 [00:16<00:00, 390.13it/s]


MR2


100%|██████████| 151/151 [00:03<00:00, 42.12it/s]


MR3


100%|██████████| 6194/6194 [05:01<00:00, 20.56it/s]


MR4


100%|██████████| 151/151 [00:05<00:00, 26.86it/s]


DS5 & 17264  & 100.0\%  & 42.839  & 0.346  & 46.8\% & 14.831 \\ 
MR1


100%|██████████| 6375/6375 [00:16<00:00, 393.96it/s]


MR2


100%|██████████| 151/151 [00:03<00:00, 42.67it/s]


MR3


100%|██████████| 6194/6194 [06:37<00:00, 15.59it/s]


MR4


100%|██████████| 151/151 [00:05<00:00, 27.42it/s]

DS6 & 12805  & 99.8\%  & 31.853  & 0.348  & 34.7\% & 11.067 \\ 





### METAL

In [23]:
import copy
import json 
import pandas as pd
import numpy as np
from itertools import combinations, product
from tqdm import tqdm
from multi_turn_test import MultiTurnConversation as MTC
score_tools = MTC.score_utils()

METAL_perts = [
    "500_MR10_SNP_s06_synonym_replacement",
    "500_MR13_SNP_s06_add_words",
    "500_MR11_SNP_s06_introduce_typos",
    "500_MR12_SNP_s06_to_leet",
]

dataset_jsons = {}

for this_pert in METAL_perts:
    with open(f"data/{this_pert}.json", "r") as f:
        dataset_jsons[this_pert] = json.load(f)

for this_pert in METAL_perts:
    id_pert_dialogues = []
    id_pert_dialogue_has_uanswerability_round = []
    id_pert_dialogue_usage_mr1 = []
    id_pert_dialogue_usage_mr2 = []

    id_pert_rounds = []
    id_pert_rounds_un_answerable = []
    id_pert_rounds_question_changed = []
    
    this_pert_dataset = dataset_jsons[this_pert]
    for dialogue_key in this_pert_dataset.keys():
        id_pert_dialogues.append(dialogue_key)
        for pert_round_key in this_pert_dataset[dialogue_key].keys():
            pert_round_uid = dialogue_key + "##" + pert_round_key
            id_pert_rounds.append(pert_round_uid)

            if this_pert_dataset[dialogue_key][pert_round_key]["Question"] != this_pert_dataset[dialogue_key][pert_round_key]["Original_question"]:
                id_pert_rounds_question_changed.append(pert_round_uid)
                id_pert_dialogue_usage_mr1.append(dialogue_key) if dialogue_key not in id_pert_dialogue_usage_mr1 else None

mapping_original_round_appear_in_pert = {} # Oid-r1 -> [P1-r1, P2-r3 ...]
all_original_round_ids = []
all_pert_round_ids = []

dialogue_round_keys_500 = json.load(open("data/dialogue_round_keys_500.json", "r"))
for original_dialogue_key in dialogue_round_keys_500.keys():
    for original_round_key in dialogue_round_keys_500[original_dialogue_key]:
        original_round_uid = original_dialogue_key + "##" + original_round_key
        all_original_round_ids.append(original_round_uid)
        mapping_original_round_appear_in_pert[original_round_uid] = []

for pert_type in METAL_perts:
    this_pert_dataset = dataset_jsons[pert_type]
    for pert_dialogue_key in this_pert_dataset.keys():
        for pert_round_key in this_pert_dataset[pert_dialogue_key].keys():
            id_pert_round = pert_type + "#pert#" +pert_dialogue_key + "#pert#" + pert_round_key
            all_pert_round_ids.append(id_pert_round)
            original_round = this_pert_dataset[pert_dialogue_key][pert_round_key]['is_original_round']
            original_round_uid = pert_dialogue_key + "##" + original_round # key in mapping_original_round_appear_in_pert
            mapping_original_round_appear_in_pert[original_round_uid].append(id_pert_round)


def get_original_round_id(id_pert_round):
    pert_type = id_pert_round.split("#pert#")[0]
    pert_dialogue_key = id_pert_round.split("#pert#")[1]
    pert_round_key = id_pert_round.split("#pert#")[2]
    this_pert_dataset = dataset_jsons[pert_type]
    return pert_dialogue_key + "##" + this_pert_dataset[pert_dialogue_key][pert_round_key]['is_original_round']

def get_following_round_ids(id_pert_round):
    pert_type = id_pert_round.split("#pert#")[0]
    pert_dialogue_key = id_pert_round.split("#pert#")[1]
    pert_round_key = id_pert_round.split("#pert#")[2]

    this_pert_dataset = dataset_jsons[pert_type]
    following_round_ids = []
    for round_key in this_pert_dataset[pert_dialogue_key].keys():
        if int(round_key.split(" ")[1]) > int(pert_round_key.split(" ")[1]):
            following_round_ids.append(pert_type + "#pert#" + pert_dialogue_key + "#pert#" + round_key)
    return following_round_ids

def question_changed(id_pert_round):
    pert_type = id_pert_round.split("#pert#")[0]
    pert_dialogue_key = id_pert_round.split("#pert#")[1]
    pert_round_key = id_pert_round.split("#pert#")[2]
    return dataset_jsons[pert_type][pert_dialogue_key][pert_round_key]["Question"] != dataset_jsons[pert_type][pert_dialogue_key][pert_round_key]["Original_question"]


dict_pert_dialogue_usage_mrt1 = {} # O1-r3 : [P1-O1-r2, P2...], (pert_round_uid), expect output(P1-O1-r2) == answer(O1-r3)

for original_round_id in all_original_round_ids:
    dict_pert_dialogue_usage_mrt1[original_round_id] = []


# unperturbed rounds after perturbed round will not be excluded
for id_pert_round in all_pert_round_ids:
    original_round_id = get_original_round_id(id_pert_round)
    if question_changed(id_pert_round):
        dict_pert_dialogue_usage_mrt1[original_round_id].append(id_pert_round)

for original_round_id in dict_pert_dialogue_usage_mrt1.copy().keys():
    if len(dict_pert_dialogue_usage_mrt1[original_round_id]) == 0:
        del dict_pert_dialogue_usage_mrt1[original_round_id]


for original_round_id in dict_pert_dialogue_usage_mrt1.copy().keys():
    new_list = [item.lstrip("500_") for item in dict_pert_dialogue_usage_mrt1[original_round_id]]
    dict_pert_dialogue_usage_mrt1[original_round_id] = new_list


all_lms = ["qwen2_0B5", "qwen2_1B5", "qwen2_7B", "mistral03_7B", "llama3_8B", "gemma2_9B"]

dist_LLM_DS_name = {"qwen2_0B5":"DS1", "qwen2_1B5":"DS2", "qwen2_7B":"DS3", "mistral03_7B":"DS4", "llama3_8B":"DS5", "gemma2_9B":"DS6"}

METAL_Perts = [
    "MR10_SNP_s06_synonym_replacement",
    "MR11_SNP_s06_introduce_typos",
    "MR12_SNP_s06_to_leet",
    "MR13_SNP_s06_add_words"]

METAL_Perts_S97 = [
    "S97_MR10_SNP_s06_synonym_replacement",
    "S97_MR11_SNP_s06_introduce_typos",
    "S97_MR12_SNP_s06_to_leet",
    "S97_MR13_SNP_s06_add_words"]


result_path = "result/MTMT/{}/{}.pickle"

dataset_original = json.load(open("data/500_MR0_SNP_s06_original.json", "r"))

METAL_result_DataFrames = {}
for chosen_lm in all_lms:
    for i_chosen_pert in range(len(METAL_Perts)):
        chosen_pert = METAL_Perts[i_chosen_pert]
        
        chosen_pert_s97 = METAL_Perts_S97[i_chosen_pert]
        df_result = pd.read_pickle(result_path.format(chosen_lm, chosen_pert))
        df_result_s97 = pd.read_pickle(result_path.format(chosen_lm, chosen_pert_s97))
        df_result = pd.concat([df_result, df_result_s97], ignore_index=True)

        df_result = df_result[df_result["role"] == "assistant"]
        df_result["score_semantic"]=[item[0] if item else None for item in df_result["score_semantic"]]
        df_result["row_uid"] = [ (df_result.loc[row, "source_uuid"] + "##" + str(df_result.loc[row,"round"])) for row in df_result.index]
        df_result["id_pert_round"] = [chosen_pert + "#pert#" + df_result.loc[row, "source_uuid"] + "#pert#" + "Round " + str(df_result.loc[row,"round"]) for row in df_result.index]
        METAL_result_DataFrames[chosen_lm + "_" + chosen_pert] = copy.deepcopy(df_result)

def get_original_answer(id_pert_round):
    pert_dialogue_key = id_pert_round.split("#pert#")[1]
    original_round_key = id_pert_round.split("#pert#")[2]
    return dataset_original[pert_dialogue_key][original_round_key]["Answer"]

def get_pert_round_answer(id_pert_round):
    pert_type = id_pert_round.split("#pert#")[0]
    df_result = METAL_result_DataFrames[chosen_lm + "_" + pert_type]
    return df_result[df_result["id_pert_round"] == id_pert_round]["content"].values[0]

def get_pert_round_expected_answer(id_pert_round):
    pert_type = id_pert_round.split("#pert#")[0]
    df_result = METAL_result_DataFrames[chosen_lm + "_" + pert_type]
    return df_result[df_result["id_pert_round"] == id_pert_round]["expected_answer"].values[0][0]

def METAL_MRT1_conflict_detector(id_pert_round):
    pert_type = id_pert_round.split("#pert#")[0]
    df_result = METAL_result_DataFrames[chosen_lm + "_" + pert_type]
    score = df_result[df_result["id_pert_round"] == id_pert_round]["score_semantic"].values[0]

    BUG_FLAG=False
    if score < 0.6:
        BUG_FLAG=True
    ERROR = abs((1 - score)/2)
    return BUG_FLAG, ERROR

PAPER_RESULT_METAL={}
FOUNDED_Bugs_METAL = {}

for chosen_lm in all_lms[:]:

    MR1_TOOGLE = True
    MR3_TOOGLE = True

    METAL_Bugs = 0
    METAL_ETC = []
    METAL_BTC = 0
    METAL_BSV_sum = 0
    METAL_Amount_detection = []

    # MR1
    MR1_Amount_detection = []
    MR1_Bugs = []
    MR1_ETC = []
    MR1_BSV_sum = 0

    if MR1_TOOGLE:
        print("MR1")
        for original_round_id in tqdm(dict_pert_dialogue_usage_mrt1.keys()):
            original_dialogue_key = original_round_id.split("##")[0]
            original_round_key = original_round_id.split("##")[1]

            for pert_round_id in dict_pert_dialogue_usage_mrt1[original_round_id]:
                pert_type = pert_round_id.split("#pert#")[0]
                pert_dialogue_key = pert_round_id.split("#pert#")[1]
 
                BUG_FLAG, ERROR = METAL_MRT1_conflict_detector(pert_round_id)
                id_MR_check = "MR1" + "#MRCheck#" + pert_round_id
                MR1_Amount_detection.append(id_MR_check)
                if BUG_FLAG:
                    id_bug = "MR1#mr#" + original_round_id + "#bug#" + pert_type + "#pert#" + pert_dialogue_key
                    MR1_Bugs.append(id_bug)
                    MR1_ETC.append(pert_dialogue_key)
                    MR1_BSV_sum += ERROR

    METAL_Bug_founded = MR1_Bugs
    METAL_Bugs = len(METAL_Bug_founded)
    METAL_ETC_ids = set(MR1_ETC)
    METAL_ETC = round(len(METAL_ETC_ids)/500*100, 1) # percentage
    METAL_BTC = METAL_Bugs/len(METAL_ETC_ids)
    METAL_BTS = METAL_Bugs/500
    METAL_BSV = (MR1_BSV_sum)/METAL_Bugs
    METAL_Amount_detection = len(MR1_Amount_detection)
    METAL_Rate_Plus = round(METAL_Bugs/METAL_Amount_detection*100, 1) 
    METAL_BugScore = METAL_BSV * METAL_BTS

    PAPER_RESULT_METAL[chosen_lm] = {"Bugs":METAL_Bugs, "ETC":METAL_ETC, "BTC":round(METAL_BTC,3), "BSV":round(METAL_BSV,3), "Rate+":METAL_Rate_Plus, "BugScore":round(METAL_BugScore,3)}
    FOUNDED_Bugs_METAL[chosen_lm] = METAL_Bug_founded

    print("{} & {}  & {}\\%  & {}  & {}  & {}\\% & {} \\\\ ".format(dist_LLM_DS_name[chosen_lm], METAL_Bugs, METAL_ETC, round(METAL_BTC,3), round(METAL_BSV,3), METAL_Rate_Plus, round(METAL_BugScore,3)))

MR1


100%|██████████| 6203/6203 [00:09<00:00, 686.29it/s]


DS1 & 12652  & 100.0\%  & 25.304  & 0.388  & 86.3\% & 9.823 \\ 
MR1


100%|██████████| 6203/6203 [00:09<00:00, 680.52it/s]


DS2 & 10710  & 100.0\%  & 21.42  & 0.37  & 73.0\% & 7.935 \\ 
MR1


100%|██████████| 6203/6203 [00:09<00:00, 677.06it/s]


DS3 & 9754  & 100.0\%  & 19.508  & 0.354  & 66.5\% & 6.9 \\ 
MR1


100%|██████████| 6203/6203 [00:09<00:00, 676.21it/s]


DS4 & 9876  & 100.0\%  & 19.752  & 0.357  & 67.3\% & 7.043 \\ 
MR1


100%|██████████| 6203/6203 [00:09<00:00, 683.73it/s]


DS5 & 6631  & 99.8\%  & 13.289  & 0.356  & 45.2\% & 4.718 \\ 
MR1


100%|██████████| 6203/6203 [00:08<00:00, 696.14it/s]

DS6 & 4199  & 98.2\%  & 8.552  & 0.346  & 28.6\% & 2.908 \\ 





### Result

In [26]:
import pickle

with open("RQ1_PAPER_RESULT_METAL.pickle", "rb") as f:
    PAPER_RESULT_METAL = pickle.load(f)
with open("RQ1_FOUNDED_Bugs_METAL.pickle", "rb") as f:
    FOUNDED_Bugs_METAL = pickle.load(f) 
with open("RQ1_PAPER_RESULT_MORTAR.pickle", "rb") as f:
    PAPER_RESULT_MORTAR = pickle.load(f)
with open("RQ1_FOUNDED_Bugs_MORTAR.pickle", "rb") as f:
    FOUNDED_Bugs_MORTAR = pickle.load(f)

In [29]:
all_lms = ["qwen2_0B5", "qwen2_1B5", "qwen2_7B", "mistral03_7B", "llama3_8B", "gemma2_9B"]

for chosen_lm in all_lms:
    print("{} & {}  & {}\\%  & {}  & {}\\%  & {}  & {}\\%  & {} & {}\\% \\\\ ".format(
        dist_LLM_DS_name[chosen_lm],
        PAPER_RESULT_METAL[chosen_lm]["Bugs"], 
        PAPER_RESULT_METAL[chosen_lm]["ETC"], 
        PAPER_RESULT_METAL[chosen_lm]["BTC"], 
        PAPER_RESULT_METAL[chosen_lm]["Rate+"], 
        PAPER_RESULT_MORTAR[chosen_lm]["Bugs"], 
        PAPER_RESULT_MORTAR[chosen_lm]["ETC"], 
        PAPER_RESULT_MORTAR[chosen_lm]["BTC"],  
        PAPER_RESULT_MORTAR[chosen_lm]["Rate+"]))
    
print("Average & {}  & {}\\%  & {}  & {}\\%  & {}  & {}\\%  & {}    & {}\\%   \\\\ ".format(
    int(np.mean([PAPER_RESULT_METAL[llm]["Bugs"] for llm in all_lms])),
    round(np.mean([PAPER_RESULT_METAL[llm]["ETC"] for llm in all_lms]), 1),
    round(np.mean([PAPER_RESULT_METAL[llm]["BTC"] for llm in all_lms]), 3),
    # round(np.mean([PAPER_RESULT_METAL[llm]["BSV"] for llm in all_lms]), 3),
    round(np.mean([PAPER_RESULT_METAL[llm]["Rate+"] for llm in all_lms]), 1),
    # round(np.mean([PAPER_RESULT_METAL[llm]["BugScore"] for llm in all_lms]), 3),
    
    int(np.mean([PAPER_RESULT_MORTAR[llm]["Bugs"] for llm in all_lms])),
    round(np.mean([PAPER_RESULT_MORTAR[llm]["ETC"] for llm in all_lms]), 1),
    round(np.mean([PAPER_RESULT_MORTAR[llm]["BTC"] for llm in all_lms]), 3),
    # round(np.mean([PAPER_RESULT_MORTAR[llm]["BSV"] for llm in all_lms]), 3),
    round(np.mean([PAPER_RESULT_MORTAR[llm]["Rate+"] for llm in all_lms]), 1),
    # round(np.mean([PAPER_RESULT_MORTAR[llm]["BugScore"] for llm in all_lms]), 3)
    )
)

DS1 & 12652  & 100.0\%  & 25.304  & 86.3\%  & 31942  & 100.0\%  & 79.261 & 86.5\% \\ 
DS2 & 10710  & 100.0\%  & 21.42  & 73.0\%  & 27799  & 100.0\%  & 68.98 & 75.3\% \\ 
DS3 & 9754  & 100.0\%  & 19.508  & 66.5\%  & 24634  & 100.0\%  & 61.127 & 66.7\% \\ 
DS4 & 9876  & 100.0\%  & 19.752  & 67.3\%  & 22919  & 100.0\%  & 56.871 & 62.1\% \\ 
DS5 & 6631  & 99.8\%  & 13.289  & 45.2\%  & 17264  & 100.0\%  & 42.839 & 46.8\% \\ 
DS6 & 4199  & 98.2\%  & 8.552  & 28.6\%  & 12805  & 99.8\%  & 31.853 & 34.7\% \\ 
Average & 8970  & 99.7\%  & 17.971  & 61.2\%  & 22893  & 100.0\%  & 56.822    & 62.0\%   \\ 


# RQ2: Bug Quality

## 2.1 Diversity

In [None]:
df_MORTAR_bugs = pd.read_pickle("RQ1_FOUNDED_Bugs_MORTAR_with_bug_type.pickle")
df_METAL_bugs = pd.read_pickle("RQ1_FOUNDED_Bugs_METAL_with_bug_type.pickle")

all_lms = ["qwen2_0B5", "qwen2_1B5", "qwen2_7B", "mistral03_7B", "llama3_8B", "gemma2_9B"]
dist_LLM_DS_name = {"qwen2_0B5":"DS1", "qwen2_1B5":"DS2", "qwen2_7B":"DS3", "mistral03_7B":"DS4", "llama3_8B":"DS5", "gemma2_9B":"DS6"}

for llm in all_lms:
    this_llm_l1_MORTAR = len(df_MORTAR_bugs[(df_MORTAR_bugs["SUT"] == llm) & (df_MORTAR_bugs["bug_type"] == 1)])
    this_llm_l2_MORTAR = len(df_MORTAR_bugs[(df_MORTAR_bugs["SUT"] == llm) & (df_MORTAR_bugs["bug_type"] == 2)])
    this_llm_l3_MORTAR = len(df_MORTAR_bugs[(df_MORTAR_bugs["SUT"] == llm) & (df_MORTAR_bugs["bug_type"] == 3)])

    this_llm_MORTAR_total = len(df_MORTAR_bugs[df_MORTAR_bugs["SUT"] == llm])

    this_llm_l1_METAL = len(df_METAL_bugs[(df_METAL_bugs["SUT"] == llm) & (df_METAL_bugs["bug_type"] == 1)])
    this_llm_l2_METAL = len(df_METAL_bugs[(df_METAL_bugs["SUT"] == llm) & (df_METAL_bugs["bug_type"] == 2)])
    this_llm_l3_METAL = len(df_METAL_bugs[(df_METAL_bugs["SUT"] == llm) & (df_METAL_bugs["bug_type"] == 3)])
    
    this_llm_METAL_total = len(df_METAL_bugs[df_METAL_bugs["SUT"] == llm])

    print("{} & {} & {} &{} & {} & {} &{} &{} & {} \\\\ ".format(
        dist_LLM_DS_name[llm],
        this_llm_l1_METAL,
        # round(this_llm_l1_METAL/this_llm_METAL_total*100,1),
        this_llm_l2_METAL,
        # round(this_llm_l2_METAL/this_llm_METAL_total*100,1),
        this_llm_l3_METAL,
        # round(this_llm_l3_METAL/this_llm_METAL_total*100,1),
        this_llm_METAL_total,
        this_llm_l1_MORTAR,
        # round(this_llm_l1_MORTAR/this_llm_MORTAR_total*100,1),
        this_llm_l2_MORTAR,
        # round(this_llm_l2_MORTAR/this_llm_MORTAR_total*100,1),
        this_llm_l3_MORTAR,
        # round(this_llm_l3_MORTAR/this_llm_MORTAR_total*100,1),
        this_llm_MORTAR_total
    ))
print("\\midrule")

print("Average & {} & {} &{} & {} & {} &{} &{} & {} \\\\ ".format(
    int(df_METAL_bugs["bug_type"].value_counts()[1]/6),
    int(df_METAL_bugs["bug_type"].value_counts()[2]/6),
    int(df_METAL_bugs["bug_type"].value_counts()[3]/6),
    len(df_METAL_bugs),
    int(df_MORTAR_bugs["bug_type"].value_counts()[1]/6),
    int(df_MORTAR_bugs["bug_type"].value_counts()[2]/6),
    int(df_MORTAR_bugs["bug_type"].value_counts()[3]/6),
    len(df_MORTAR_bugs)
))

DS1 & 10903 & 1749 &0 & 12652 & 27723 &4219 &0 & 31942 \\ 
DS2 & 8103 & 2607 &0 & 10710 & 21504 &6295 &0 & 27799 \\ 
DS3 & 7723 & 2000 &31 & 9754 & 19083 &5399 &152 & 24634 \\ 
DS4 & 7463 & 2413 &0 & 9876 & 18716 &4203 &0 & 22919 \\ 
DS5 & 4052 & 2490 &89 & 6631 & 10355 &6664 &245 & 17264 \\ 
DS6 & 2555 & 1573 &71 & 4199 & 6819 &5756 &230 & 12805 \\ 
\midrule
Average & 6799 & 2138 &31 & 53822 & 17366 &5422 &104 & 137363 \\ 


In [7]:
import numpy as np

def coefficient_of_variation(data, *, sample=True):
    values = np.asarray(data, dtype=float)
    mean = values.mean()
    ddof = 1 if sample else 0
    std = values.std(ddof=ddof)
    return std / mean if mean != 0 else np.nan

data = [
    [10903 , 1749 ,0 , 12652 , 27723 ,4219 ,0 , 31942 ],
    [8103 , 2607 ,0 , 10710 , 21504 ,6295 ,0 , 27799 ],
    [7723 , 2000 ,31 , 9754 , 19083 ,5399 ,152 , 24634 ],
    [7463 , 2413 ,0 , 9876 , 18716 ,4203 ,0 , 22919 ],
    [4052 , 2490 ,89 , 6631 , 10355 ,6664 ,245 , 17264 ],
    [2555 , 1573 ,71 , 4199 , 6819 ,5756 ,230 , 12805]
    ]

DS = ["DS1", "DS2", "DS3", "DS4", "DS5", "DS6"]

METAL_sum = 0
MORTAR_sum = 0

for i in range(6):
    METAL = coefficient_of_variation(data[i][0:2])
    MORTAR = coefficient_of_variation(data[i][4:6])
    print(f"{DS[i]}: METAL = {METAL:.3f}, MORTAR = {MORTAR:.3f}")   
    METAL_sum += METAL
    MORTAR_sum += MORTAR

print(f"METAL_sum = {METAL_sum/6:.3f}")
print(f"MORTAR_sum = {MORTAR_sum/6:.3f}")

DS1: METAL = 1.023, MORTAR = 1.041
DS2: METAL = 0.726, MORTAR = 0.774
DS3: METAL = 0.832, MORTAR = 0.790
DS4: METAL = 0.723, MORTAR = 0.896
DS5: METAL = 0.338, MORTAR = 0.307
DS6: METAL = 0.336, MORTAR = 0.120
METAL_sum = 0.663
MORTAR_sum = 0.654


## 2.2 Precision (Manual Check)

### MORTAR

In [None]:
import json

dataset_dev_full= json.load(open("data/extracted_dev_all_final_update_3.json"))
dataset_original = json.load(open("data/500_MR0_SNP_s06_original.json", "r"))

import json 
import pandas as pd
import copy

# MORTAR dataset
MORTAR_Perts = [
    "P1_round_shuffle",
    "P2_round_reduce",
    "P3_round_duplicate",
    "P4_round_reduce_shuffle",
    "P5_round_shuffle_duplicate",
    ]
MORTAR_dataset_jsons = {}
for this_pert in MORTAR_Perts:
    with open(f"data/{this_pert}.json", "r") as f:
        MORTAR_dataset_jsons[this_pert] = json.load(f)


# MORTAR result
all_lms = ["qwen2_0B5", "qwen2_1B5", "qwen2_7B", "mistral03_7B", "llama3_8B", "gemma2_9B"]
result_path = "result/MTMT/{}/{}.pickle"
dataset_original = json.load(open("data/500_MR0_SNP_s06_original.json", "r"))
MORTAR_result_DataFrames = {}
for chosen_lm in all_lms:
    for chosen_pert in MORTAR_Perts:
        df_result = pd.read_pickle(result_path.format(chosen_lm, chosen_pert))
        df_result = df_result[df_result["role"] == "assistant"]
        df_result["score_semantic"]=[item[0] if item else None for item in df_result["score_semantic"]]
        df_result["row_uid"] = [ (df_result.loc[row, "source_uuid"] + "##" + str(df_result.loc[row,"round"])) for row in df_result.index]
        df_result["id_pert_round"] = [chosen_pert + "#pert#" + df_result.loc[row, "source_uuid"] + "#pert#" + "Round " + str(df_result.loc[row,"round"]) for row in df_result.index]
        MORTAR_result_DataFrames[chosen_lm + "_" + chosen_pert] = copy.deepcopy(df_result)

def get_original_story(dialogue_key):
    return dataset_dev_full[dialogue_key]["story_material"]

def get_original_qa(dialogue_key):
    return dataset_dev_full[dialogue_key]['combined']

def get_perturbed_qa_MORTAR(dialogue_key, pert):
    return MORTAR_dataset_jsons[pert][dialogue_key]

def pert_to_readable(pert_rounds):
    result = ""
    all_rounds = pert_rounds.keys()
    for round in all_rounds:
        result += f"{round} - {pert_rounds[round]['Question']} \n {round} - {pert_rounds[round]['Answer']}\n"
    return result

df_MORTAR_bugs = pd.read_pickle("RQ1_FOUNDED_Bugs_MORTAR_df.pickle")
df_MORTAR_check_bugs = df_MORTAR_bugs.sample(100)
df_MORTAR_check_bugs.sort_values(by=["dialogue_key", "original_round_num", "SUT"], inplace=True)

temp_container = []
for index in df_MORTAR_check_bugs.index:
    row_bug = df_MORTAR_check_bugs.loc[index]
    bug_id = row_bug["bug_id"]
    use_mr = row_bug["use_mr"]
    use_pert = row_bug["use_pert_1"]
    dialogue_key = row_bug["dialogue_key"]
    story = get_original_story(dialogue_key)
    original_qa = get_original_qa(dialogue_key)

    target_round_1 = row_bug["pert_round_num_1"]
    pertuabed_1 = pert_to_readable(get_perturbed_qa_MORTAR(dialogue_key, row_bug["use_pert_1"]))
    pert_round_id = dialogue_key + "##" + target_round_1 # 3dr23u6we5exclen4th8uq9rb42tel##1
    temp_df = MORTAR_result_DataFrames[row_bug["SUT"] + "_" + use_pert]
    llm_output_1 = temp_df[temp_df["row_uid"] == pert_round_id]["content"].values[0]
    expect_output_1 = temp_df[temp_df["row_uid"] == pert_round_id]["expected_answer"].values[0][0]

    if use_mr in ["MR1", "MR2"]:
        temp_container.append({"bug_id":bug_id,
                               "dialogue_key":dialogue_key,
                               "story":story,
                               "story_cn":None,
                               "original_qa":original_qa,
                               "pertuabed_1":pertuabed_1,
                               "target_round_1":target_round_1,
                               "llm_output_1":llm_output_1,
                               "expect_output_1":expect_output_1,
                                "pertuabed_2":None,
                                "target_round_2":None,
                                "llm_output_2":None,
                                "expect_output_2":None,
                               "note": "Answer should be both correct and short.",
                               "isTruePositive":None}),

    elif use_mr in ["MR3","MR4"]:
        target_round_2 = row_bug["pert_round_num_2"]
        pertuabed_2 = pert_to_readable(get_perturbed_qa_MORTAR(dialogue_key, row_bug["use_pert_2"]))
        pert_round_id = dialogue_key + "##" + target_round_2
        temp_df = MORTAR_result_DataFrames[row_bug["SUT"] + "_" + row_bug["use_pert_2"]]
        llm_output_2 = temp_df[temp_df["row_uid"] == pert_round_id]["content"].values[0]
        expect_output_2 = temp_df[temp_df["row_uid"] == pert_round_id]["expected_answer"].values[0][0]
        temp_container.append({"bug_id":bug_id,
                               "dialogue_key":dialogue_key,
                               "story":story,
                               "story_cn":None,
                               "original_qa":original_qa,
                               "pertuabed_1":pertuabed_1,
                               "target_round_1":target_round_1,
                               "llm_output_1":llm_output_1,
                               "expect_output_1":expect_output_1,
                                "pertuabed_2":pertuabed_2,
                                "target_round_2":target_round_2,
                                "llm_output_2":llm_output_2,
                                "expect_output_2":expect_output_2,
                               "note": "Answer should be both correct and short.",
                               "isTruePositive":None}),

df_mc_MORTAR = pd.DataFrame(columns = ["bug_id","dialogue_key", "story", "story_cn", "original_qa", "pertuabed_1", "target_round_1", "llm_output_1", "expect_output_1", "pertuabed_2", "target_round_2", "llm_output_2", "expect_output_2", "note","isTruePositive"], data = temp_container)
df_mc_MORTAR.to_excel("RQ142_FOUNDED_Bugs_MORTAR_manual_check.xlsx", index=False)


### METAL

In [None]:
all_lms = ["qwen2_0B5", "qwen2_1B5", "qwen2_7B", "mistral03_7B", "llama3_8B", "gemma2_9B"]

# METAL dataset
METAL_perts = [
    "500_MR10_SNP_s06_synonym_replacement",
    "500_MR13_SNP_s06_add_words",
    "500_MR11_SNP_s06_introduce_typos",
    "500_MR12_SNP_s06_to_leet",
]

METAL_dataset_jsons = {}

for this_pert in METAL_perts:
    with open(f"data/{this_pert}.json", "r") as f:
        METAL_dataset_jsons[this_pert.lstrip("500_")] = json.load(f)

# result
METAL_Perts = [
    "MR10_SNP_s06_synonym_replacement",
    "MR11_SNP_s06_introduce_typos",
    "MR12_SNP_s06_to_leet",
    "MR13_SNP_s06_add_words"]

METAL_Perts_S97 = [
    "S97_MR10_SNP_s06_synonym_replacement",
    "S97_MR11_SNP_s06_introduce_typos",
    "S97_MR12_SNP_s06_to_leet",
    "S97_MR13_SNP_s06_add_words"]


result_path = "result/MTMT/{}/{}.pickle"
METAL_result_DataFrames = {}
for chosen_lm in all_lms:
    for i_chosen_pert in range(len(METAL_Perts)):
        chosen_pert = METAL_Perts[i_chosen_pert]
        
        chosen_pert_s97 = METAL_Perts_S97[i_chosen_pert]
        df_result = pd.read_pickle(result_path.format(chosen_lm, chosen_pert))
        df_result_s97 = pd.read_pickle(result_path.format(chosen_lm, chosen_pert_s97))
        df_result = pd.concat([df_result, df_result_s97], ignore_index=True)

        df_result = df_result[df_result["role"] == "assistant"]
        df_result["score_semantic"]=[item[0] if item else None for item in df_result["score_semantic"]]
        df_result["row_uid"] = [ (df_result.loc[row, "source_uuid"] + "##" + str(df_result.loc[row,"round"])) for row in df_result.index]
        df_result["id_pert_round"] = [chosen_pert + "#pert#" + df_result.loc[row, "source_uuid"] + "#pert#" + "Round " + str(df_result.loc[row,"round"]) for row in df_result.index]
        METAL_result_DataFrames[chosen_lm + "_" + chosen_pert] = copy.deepcopy(df_result)

def get_original_story(dialogue_key):
    return dataset_dev_full[dialogue_key]["story_material"]

def get_original_qa(dialogue_key):
    return dataset_dev_full[dialogue_key]['combined']

def get_perturbed_qa_METAL(dialogue_key, pert):
    return METAL_dataset_jsons[pert][dialogue_key]

def pert_to_readable(pert_rounds):
    result = ""
    all_rounds = pert_rounds.keys()
    for round in all_rounds:
        result += f"{round} - {pert_rounds[round]['Question']} \n {round} - {pert_rounds[round]['Answer']}\n"
    return result

df_METAL_bugs = pd.read_pickle("RQ1_FOUNDED_Bugs_METAL_df.pickle")
df_METAL_check_bugs = df_METAL_bugs.sample(100)
df_METAL_check_bugs.sort_values(by=["dialogue_key", "original_round_num", "SUT"], inplace=True)

temp_container = []
for index in df_METAL_check_bugs.index:
    row_bug = df_METAL_check_bugs.loc[index]
    bug_id = row_bug["bug_id"]
    use_mr = row_bug["use_mr"]
    use_pert = row_bug["use_pert_1"]
    dialogue_key = row_bug["dialogue_key"]
    story = get_original_story(dialogue_key)
    original_qa = get_original_qa(dialogue_key)

    target_round_1 = row_bug["pert_round_num_1"]
    pertuabed_1 = pert_to_readable(get_perturbed_qa_METAL(dialogue_key, row_bug["use_pert_1"]))
    pert_round_id = dialogue_key + "##" + target_round_1 # 3dr23u6we5exclen4th8uq9rb42tel##1
    temp_df = METAL_result_DataFrames[row_bug["SUT"] + "_" + use_pert]
    llm_output_1 = temp_df[temp_df["row_uid"] == pert_round_id]["content"].values[0]
    expect_output_1 = temp_df[temp_df["row_uid"] == pert_round_id]["expected_answer"].values[0][0]

    if use_mr in ["MR1", "MR2"]:
        temp_container.append({"bug_id":bug_id,
                               "dialogue_key":dialogue_key,
                               "story":story,
                               "story_cn":None,
                               "original_qa":original_qa,
                               "pertuabed_1":pertuabed_1,
                               "target_round_1":target_round_1,
                               "llm_output_1":llm_output_1,
                               "expect_output_1":expect_output_1,
                                "pertuabed_2":None,
                                "target_round_2":None,
                                "llm_output_2":None,
                                "expect_output_2":None,
                               "note": "Answer should be both correct and short.",
                               "isTruePositive":None}),

    elif use_mr in ["MR3","MR4"]:
        target_round_2 = row_bug["pert_round_num_2"]
        pertuabed_2 = pert_to_readable(get_perturbed_qa_METAL(dialogue_key, row_bug["use_pert_2"]))
        pert_round_id = dialogue_key + "##" + target_round_2
        temp_df = METAL_result_DataFrames[row_bug["SUT"] + "_" + row_bug["use_pert_2"]]
        llm_output_2 = temp_df[temp_df["row_uid"] == pert_round_id]["content"].values[0]
        expect_output_2 = temp_df[temp_df["row_uid"] == pert_round_id]["expected_answer"].values[0][0]
        temp_container.append({"bug_id":bug_id,
                               "dialogue_key":dialogue_key,
                               "story":story,
                               "story_cn":None,
                               "original_qa":original_qa,
                               "pertuabed_1":pertuabed_1,
                               "target_round_1":target_round_1,
                               "llm_output_1":llm_output_1,
                               "expect_output_1":expect_output_1,
                                "pertuabed_2":pertuabed_2,
                                "target_round_2":target_round_2,
                                "llm_output_2":llm_output_2,
                                "expect_output_2":expect_output_2,
                               "note": "Answer should be both correct and short.",
                               "isTruePositive":None}),

df_mc_METAL = pd.DataFrame(columns = ["bug_id","dialogue_key", "story", "story_cn", "original_qa", "pertuabed_1", "target_round_1", "llm_output_1", "expect_output_1", "pertuabed_2", "target_round_2", "llm_output_2", "expect_output_2", "note","isTruePositive"], data = temp_container)
df_mc_METAL.to_excel("RQ142_FOUNDED_Bugs_METAL_manual_check.xlsx", index=False)

### Result

In [10]:
import pandas as pd
from sklearn.metrics import accuracy_score, cohen_kappa_score, f1_score, matthews_corrcoef

df_mc_MORTAR = pd.read_excel("data/manual_check/Merge_RQ142_FOUNDED_Bugs_MORTAR_manual_check.xlsx")
print("MORTAR")
print("kappa:",cohen_kappa_score((df_mc_MORTAR["c1_llm_correct"]<=6).to_list(), (df_mc_MORTAR["c2_llm_correct"]<=6).to_list()))
print("TPR:",(len(df_mc_MORTAR[df_mc_MORTAR["c2_llm_correct"]<=6])+len(df_mc_MORTAR[df_mc_MORTAR["c1_llm_correct"]<=6]))/200)
print("METAL")
df_mc_METAL = pd.read_excel("data/manual_check/Merge_RQ142_FOUNDED_Bugs_METAL_manual_check.xlsx")
print("kappa:",cohen_kappa_score((df_mc_METAL["c1_llm_correct"]<=6).to_list(), (df_mc_METAL["c2_llm_correct"]<=6).to_list()))
print("TPR:",(len(df_mc_METAL[df_mc_METAL["c2_llm_correct"]<=6])+len(df_mc_METAL[df_mc_METAL["c1_llm_correct"]<=6]))/200)

MORTAR
kappa: 0.8318924111431316
TPR: 0.705
METAL
kappa: 0.7784045124899275
TPR: 0.455


## 2.3 Uniqueness

In [None]:
import pandas as pd

df_MORTAR_bugs = pd.read_pickle("RQ1_FOUNDED_Bugs_MORTAR_with_bug_type.pickle")
df_MORTAR_bugs["bug_round_seed"] = df_MORTAR_bugs["dialogue_key"] + "##" + df_MORTAR_bugs["original_round_num"]
df_METAL_bugs = pd.read_pickle("RQ1_FOUNDED_Bugs_METAL_with_bug_type.pickle")
df_METAL_bugs["bug_round_seed"] = df_METAL_bugs["dialogue_key"] + "##" + df_METAL_bugs["original_round_num"]

bug_levels = [1,2,3]
all_lms = ["qwen2_0B5", "qwen2_1B5", "qwen2_7B", "mistral03_7B", "llama3_8B", "gemma2_9B"]
dist_LLM_DS_name = {"qwen2_0B5":"DS1", "qwen2_1B5":"DS2", "qwen2_7B":"DS3", "mistral03_7B":"DS4", "llama3_8B":"DS5", "gemma2_9B":"DS6"}

venn_container = {}

MORTAR_unique_l1_bugs = []
MORTAR_unique_l2_bugs = []
MORTAR_unique_l3_bugs = []

METAL_unique_l1_bugs = []
METAL_unique_l2_bugs = []
METAL_unique_l3_bugs = []

for llm in all_lms:
    venn_container[llm] = {}
    for bug_level in bug_levels:
        venn_container[llm][bug_level] = {"MORTAR":[], "METAL":[], "Overlap":[]}

        df_MORTAR_llm_this_buglevel = df_MORTAR_bugs[
            (df_MORTAR_bugs["bug_type"] == bug_level) &
            (df_MORTAR_bugs["SUT"] == llm)
            ]
        list_MORTAR_llm_this_buglevel_bug_round_seed = df_MORTAR_llm_this_buglevel["bug_round_seed"].tolist()
        venn_container[llm][bug_level]["MORTAR"] = list_MORTAR_llm_this_buglevel_bug_round_seed

        df_METAL_llm_this_buglevel = df_METAL_bugs[
            (df_METAL_bugs["bug_type"] == bug_level) &
            (df_METAL_bugs["SUT"] == llm)
            ]
        list_METAL_llm_this_buglevel_bug_round_seed = df_METAL_llm_this_buglevel["bug_round_seed"].tolist()
        venn_container[llm][bug_level]["METAL"] = list_METAL_llm_this_buglevel_bug_round_seed

        venn_container[llm][bug_level]["Overlap"] = [item for item in list_MORTAR_llm_this_buglevel_bug_round_seed if item in list_METAL_llm_this_buglevel_bug_round_seed]

        this_llm_this_level_mortar_unique_bugs = [item for item in list_MORTAR_llm_this_buglevel_bug_round_seed if item not in list_METAL_llm_this_buglevel_bug_round_seed]
        this_llm_this_level_metal_unique_bugs = [item for item in list_METAL_llm_this_buglevel_bug_round_seed if item not in list_MORTAR_llm_this_buglevel_bug_round_seed]

        if bug_level == 1:
            MORTAR_unique_l1_bugs += this_llm_this_level_mortar_unique_bugs
            METAL_unique_l1_bugs += this_llm_this_level_metal_unique_bugs
        elif bug_level == 2:
            MORTAR_unique_l2_bugs += this_llm_this_level_mortar_unique_bugs
            METAL_unique_l2_bugs += this_llm_this_level_metal_unique_bugs
        elif bug_level == 3:
            MORTAR_unique_l3_bugs += this_llm_this_level_mortar_unique_bugs
            METAL_unique_l3_bugs += this_llm_this_level_metal_unique_bugs

# MORTAR

print("SUTs & L1-Bugs & L2-Bugs & L3-Bugs & Overall \\\\ ")
print("\\midrule")

for llm in all_lms:
    mortar_l1_unique_bugs = [item for item in venn_container[llm][1]["MORTAR"] if item not in venn_container[llm][1]["Overlap"]]
    mortar_l2_unique_bugs = [item for item in venn_container[llm][2]["MORTAR"] if item not in venn_container[llm][2]["Overlap"]]
    mortar_l3_unique_bugs = [item for item in venn_container[llm][3]["MORTAR"] if item not in venn_container[llm][3]["Overlap"]]
    print("{} & {}\% & {}\% & {}\% & {}\% \\\\ ".format(
        dist_LLM_DS_name[llm],
        round(len(mortar_l1_unique_bugs)/len(venn_container[llm][1]["MORTAR"])*100, 1),
        round(len(mortar_l2_unique_bugs)/len(venn_container[llm][2]["MORTAR"])*100, 1),
        round(len(mortar_l3_unique_bugs)/len(venn_container[llm][3]["MORTAR"])*100, 1) if len(venn_container[llm][3]["MORTAR"])!=0 else 0,
        round((len(mortar_l1_unique_bugs) + len(mortar_l2_unique_bugs) + len(mortar_l3_unique_bugs))/(len(venn_container[llm][1]["MORTAR"]) + len( venn_container[llm][2]["MORTAR"]) + len(venn_container[llm][3]["MORTAR"]))*100, 1)
    ))

print("\\midrule")
print("Average & {}\% & {}\% & {}\% & {}\% \\\\".format(
    round(len(MORTAR_unique_l1_bugs)/len(df_MORTAR_bugs[df_MORTAR_bugs["bug_type"] == 1])*100,1),
    round(len(MORTAR_unique_l2_bugs)/len(df_MORTAR_bugs[df_MORTAR_bugs["bug_type"] == 2])*100,1),
    round(len(MORTAR_unique_l3_bugs)/len(df_MORTAR_bugs[df_MORTAR_bugs["bug_type"] == 3])*100,1),
    round((len(MORTAR_unique_l1_bugs) + len(MORTAR_unique_l2_bugs) + len(MORTAR_unique_l3_bugs))/(len(df_MORTAR_bugs[df_MORTAR_bugs["bug_type"] == 1]) + len(df_MORTAR_bugs[df_MORTAR_bugs["bug_type"] == 2]) + len(df_MORTAR_bugs[df_MORTAR_bugs["bug_type"] == 3]))*100,1)
))

# METAL
print("SUTs & L1-Bugs & L2-Bugs & L3-Bugs & Overall \\\\ ")
print("\\midrule")

for llm in all_lms:
    metal_l1_unique_bugs = [item for item in venn_container[llm][1]["METAL"] if item not in venn_container[llm][1]["Overlap"]]
    metal_l2_unique_bugs = [item for item in venn_container[llm][2]["METAL"] if item not in venn_container[llm][2]["Overlap"]]
    metal_l3_unique_bugs = [item for item in venn_container[llm][3]["METAL"] if item not in venn_container[llm][3]["Overlap"]]
    print("{} & {}\% & {}\% & {}\% & {}\% \\\\ ".format(
        dist_LLM_DS_name[llm],
        round(len(metal_l1_unique_bugs)/len(venn_container[llm][1]["METAL"])*100, 1),
        round(len(metal_l2_unique_bugs)/len(venn_container[llm][2]["METAL"])*100, 1),
        round(len(metal_l3_unique_bugs)/len(venn_container[llm][3]["METAL"])*100, 1) if len(venn_container[llm][3]["METAL"])!=0 else 0,
        round((len(metal_l1_unique_bugs) + len(metal_l2_unique_bugs) + len(metal_l3_unique_bugs))/(len(venn_container[llm][1]["METAL"]) + len( venn_container[llm][2]["METAL"]) + len(venn_container[llm][3]["METAL"]))*100, 1)
    ))

print("\\midrule")
print("Average & {}\% & {}\% & {}\% & {}\% \\\\".format(
    round(len(METAL_unique_l1_bugs)/len(df_METAL_bugs[df_METAL_bugs["bug_type"] == 1])*100,1),
    round(len(METAL_unique_l2_bugs)/len(df_METAL_bugs[df_METAL_bugs["bug_type"] == 2])*100,1),
    round(len(METAL_unique_l3_bugs)/len(df_METAL_bugs[df_METAL_bugs["bug_type"] == 3])*100,1),
    round((len(METAL_unique_l1_bugs) + len(METAL_unique_l2_bugs) + len(METAL_unique_l3_bugs))/(len(df_METAL_bugs[df_METAL_bugs["bug_type"] == 1]) + len(df_METAL_bugs[df_METAL_bugs["bug_type"] == 2]) + len(df_METAL_bugs[df_METAL_bugs["bug_type"] == 3]))*100,1)
))

SUTs & L1-Bugs & L2-Bugs & L3-Bugs & Overall \\ 
\midrule
DS1 & 25.6\% & 29.5\% & 0\% & 26.1\% \\ 
DS2 & 30.0\% & 35.3\% & 0\% & 31.2\% \\ 
DS3 & 28.4\% & 47.9\% & 65.1\% & 32.9\% \\ 
DS4 & 30.9\% & 35.8\% & 0\% & 31.8\% \\ 
DS5 & 31.8\% & 57.4\% & 62.0\% & 42.1\% \\ 
DS6 & 36.5\% & 71.6\% & 77.0\% & 53.0\% \\ 
\midrule
Average & 29.3\% & 47.6\% & 68.3\% & 33.8\% \\
SUTs & L1-Bugs & L2-Bugs & L3-Bugs & Overall \\ 
\midrule
DS1 & 20.5\% & 22.9\% & 0\% & 20.9\% \\ 
DS2 & 20.6\% & 27.6\% & 0\% & 22.3\% \\ 
DS3 & 20.2\% & 32.1\% & 35.5\% & 22.7\% \\ 
DS4 & 21.5\% & 38.0\% & 0\% & 25.5\% \\ 
DS5 & 22.9\% & 43.6\% & 53.9\% & 31.1\% \\ 
DS6 & 24.4\% & 49.8\% & 63.4\% & 34.6\% \\ 
\midrule
Average & 21.1\% & 35.4\% & 54.5\% & 24.7\% \\


In [13]:
# unique 403

import json
import pandas as pd

all_403_dialogues = json.load(open("data/dialogue_round_keys_403.json", "r"))
all_403_dialogues.keys()


df_MORTAR_bugs_all = pd.read_pickle("RQ1_FOUNDED_Bugs_MORTAR_with_bug_type.pickle")
df_MORTAR_bugs = df_MORTAR_bugs_all[df_MORTAR_bugs_all["dialogue_key"].isin(all_403_dialogues.keys())]
df_MORTAR_bugs["bug_round_seed"] = df_MORTAR_bugs["dialogue_key"] + "##" + df_MORTAR_bugs["original_round_num"]
df_METAL_bugs = pd.read_pickle("RQ1_FOUNDED_Bugs_METAL_with_bug_type.pickle")
df_METAL_bugs = df_METAL_bugs[df_METAL_bugs["dialogue_key"].isin(all_403_dialogues.keys())]
df_METAL_bugs["bug_round_seed"] = df_METAL_bugs["dialogue_key"] + "##" + df_METAL_bugs["original_round_num"]

bug_levels = [1,2,3]
all_lms = ["qwen2_0B5", "qwen2_1B5", "qwen2_7B", "mistral03_7B", "llama3_8B", "gemma2_9B"]
dist_LLM_DS_name = {"qwen2_0B5":"DS1", "qwen2_1B5":"DS2", "qwen2_7B":"DS3", "mistral03_7B":"DS4", "llama3_8B":"DS5", "gemma2_9B":"DS6"}


# bug_level = bug_levels[0]


venn_container_403 = {}

MORTAR_unique_l1_bugs = []
MORTAR_unique_l2_bugs = []
MORTAR_unique_l3_bugs = []

METAL_unique_l1_bugs_403 = []
METAL_unique_l2_bugs_403 = []
METAL_unique_l3_bugs_403 = []

for llm in all_lms:
    venn_container_403[llm] = {}
    for bug_level in bug_levels:
        venn_container_403[llm][bug_level] = {"MORTAR":[], "METAL":[], "Overlap":[]}

        df_MORTAR_llm_this_buglevel = df_MORTAR_bugs[
            (df_MORTAR_bugs["bug_type"] == bug_level) &
            (df_MORTAR_bugs["SUT"] == llm)
            ]
        list_MORTAR_llm_this_buglevel_bug_round_seed = df_MORTAR_llm_this_buglevel["bug_round_seed"].tolist()
        venn_container_403[llm][bug_level]["MORTAR"] = list_MORTAR_llm_this_buglevel_bug_round_seed

        df_METAL_llm_this_buglevel = df_METAL_bugs[
            (df_METAL_bugs["bug_type"] == bug_level) &
            (df_METAL_bugs["SUT"] == llm)
            ]
        list_METAL_llm_this_buglevel_bug_round_seed = df_METAL_llm_this_buglevel["bug_round_seed"].tolist()
        venn_container_403[llm][bug_level]["METAL"] = list_METAL_llm_this_buglevel_bug_round_seed

        venn_container_403[llm][bug_level]["Overlap"] = [item for item in list_MORTAR_llm_this_buglevel_bug_round_seed if item in list_METAL_llm_this_buglevel_bug_round_seed]

        this_llm_this_level_mortar_unique_bugs = [item for item in list_MORTAR_llm_this_buglevel_bug_round_seed if item not in list_METAL_llm_this_buglevel_bug_round_seed]
        this_llm_this_level_metal_unique_bugs = [item for item in list_METAL_llm_this_buglevel_bug_round_seed if item not in list_MORTAR_llm_this_buglevel_bug_round_seed]

        if bug_level == 1:
            MORTAR_unique_l1_bugs += this_llm_this_level_mortar_unique_bugs
            METAL_unique_l1_bugs_403 += this_llm_this_level_metal_unique_bugs
        elif bug_level == 2:
            MORTAR_unique_l2_bugs += this_llm_this_level_mortar_unique_bugs
            METAL_unique_l2_bugs_403 += this_llm_this_level_metal_unique_bugs
        elif bug_level == 3:
            MORTAR_unique_l3_bugs += this_llm_this_level_mortar_unique_bugs
            METAL_unique_l3_bugs_403 += this_llm_this_level_metal_unique_bugs

# METAL
print("METAL SUTs & L1-Bugs & L2-Bugs & L3-Bugs & Overall \\\\ ")
print("\\midrule")

for llm in all_lms:
    metal_l1_unique_bugs_403 = [item for item in venn_container_403[llm][1]["METAL"] if item not in venn_container_403[llm][1]["Overlap"]]
    metal_l2_unique_bugs_403 = [item for item in venn_container_403[llm][2]["METAL"] if item not in venn_container_403[llm][2]["Overlap"]]
    metal_l3_unique_bugs_403 = [item for item in venn_container_403[llm][3]["METAL"] if item not in venn_container_403[llm][3]["Overlap"]]
    print("{} & {}\% & {}\% & {}\% & {}\% \\\\ ".format(
        dist_LLM_DS_name[llm],
        round(len(metal_l1_unique_bugs_403)/len(venn_container_403[llm][1]["METAL"])*100, 1),
        round(len(metal_l2_unique_bugs_403)/len(venn_container_403[llm][2]["METAL"])*100, 1),
        round(len(metal_l3_unique_bugs_403)/len(venn_container_403[llm][3]["METAL"])*100, 1) if len(venn_container_403[llm][3]["METAL"])!=0 else 0,
        round((len(metal_l1_unique_bugs_403) + len(metal_l2_unique_bugs_403) + len(metal_l3_unique_bugs_403))/(len(venn_container_403[llm][1]["METAL"]) + len( venn_container_403[llm][2]["METAL"]) + len(venn_container_403[llm][3]["METAL"]))*100, 1)
    ))

print("\\midrule")
print("Average & {}\% & {}\% & {}\% & {}\% \\\\".format(
    round(len(METAL_unique_l1_bugs_403)/len(df_METAL_bugs[df_METAL_bugs["bug_type"] == 1])*100,1),
    round(len(METAL_unique_l2_bugs_403)/len(df_METAL_bugs[df_METAL_bugs["bug_type"] == 2])*100,1),
    round(len(METAL_unique_l3_bugs_403)/len(df_METAL_bugs[df_METAL_bugs["bug_type"] == 3])*100,1),
    round((len(METAL_unique_l1_bugs_403) + len(METAL_unique_l2_bugs_403) + len(METAL_unique_l3_bugs_403))/(len(df_METAL_bugs[df_METAL_bugs["bug_type"] == 1]) + len(df_METAL_bugs[df_METAL_bugs["bug_type"] == 2]) + len(df_METAL_bugs[df_METAL_bugs["bug_type"] == 3]))*100,1)
))

METAL SUTs & L1-Bugs & L2-Bugs & L3-Bugs & Overall \\ 
\midrule
DS1 & 1.0\% & 2.7\% & 0\% & 1.2\% \\ 
DS2 & 1.3\% & 7.8\% & 0\% & 2.9\% \\ 
DS3 & 1.5\% & 17.9\% & 13.0\% & 5.0\% \\ 
DS4 & 2.6\% & 22.6\% & 0\% & 7.5\% \\ 
DS5 & 4.9\% & 30.4\% & 38.8\% & 14.9\% \\ 
DS6 & 5.5\% & 38.9\% & 43.5\% & 18.8\% \\ 
\midrule
Average & 2.1\% & 19.8\% & 36.0\% & 6.5\% \\


# RQ3: Component Contribution

In [18]:
import copy
import json 
import pandas as pd

MORTAR_Perts = [
    "P1_round_shuffle",
    "P2_round_reduce",
    "P3_round_duplicate",
    "P4_round_reduce_shuffle",
    "P5_round_shuffle_duplicate",
    ]

dataset_jsons = {}

for this_pert in MORTAR_Perts:
    with open(f"data/{this_pert}.json", "r") as f:
        dataset_jsons[this_pert] = json.load(f)

for this_pert in MORTAR_Perts:
    id_pert_dialogues = []
    id_pert_dialogue_has_uanswerability_round = []
    id_pert_dialogue_usage_mr1 = []
    id_pert_dialogue_usage_mr2 = []

    id_pert_rounds = []
    id_pert_rounds_un_answerable = []
    id_pert_rounds_answerability_changed = []
    
    this_pert_dataset = dataset_jsons[this_pert]
    for dialogue_key in this_pert_dataset.keys():
        id_pert_dialogues.append(dialogue_key)
        for pert_round_key in this_pert_dataset[dialogue_key].keys():
            pert_round_uid = dialogue_key + "##" + pert_round_key
            id_pert_rounds.append(pert_round_uid)

            if this_pert_dataset[dialogue_key][pert_round_key]["FLAG_answer_changed"]:
                id_pert_rounds_answerability_changed.append(pert_round_uid)
                id_pert_dialogue_usage_mr2.append(dialogue_key) if dialogue_key not in id_pert_dialogue_usage_mr2 else None

            if this_pert_dataset[dialogue_key][pert_round_key]["Answer"] == "unknown":
                id_pert_rounds_un_answerable.append(pert_round_uid)
                id_pert_dialogue_has_uanswerability_round.append(dialogue_key) if dialogue_key not in id_pert_dialogue_has_uanswerability_round else None
            else:
                id_pert_dialogue_usage_mr1.append(dialogue_key) if dialogue_key not in id_pert_dialogue_usage_mr1 else None
            
mapping_original_round_appear_in_pert = {} # Oid-r1 -> [P1-r1, P2-r3 ...]
all_original_round_ids = []

dialogue_round_keys_403 = json.load(open("data/dialogue_round_keys_403.json", "r"))
for original_dialogue_key in dialogue_round_keys_403.keys():
    for original_round_key in dialogue_round_keys_403[original_dialogue_key]:
        original_round_uid = original_dialogue_key + "##" + original_round_key
        all_original_round_ids.append(original_round_uid)
        mapping_original_round_appear_in_pert[original_round_uid] = []

for pert_type in MORTAR_Perts:
    this_pert_dataset = dataset_jsons[pert_type]
    for pert_dialogue_key in this_pert_dataset.keys():
        for pert_round_key in this_pert_dataset[pert_dialogue_key].keys():
            pert_round_uid = pert_type + "#pert#" +pert_dialogue_key + "#pert#" + pert_round_key
            original_round = this_pert_dataset[pert_dialogue_key][pert_round_key]['is_original_round']
            original_round_uid = pert_dialogue_key + "##" + original_round # key in mapping_original_round_appear_in_pert
            mapping_original_round_appear_in_pert[original_round_uid].append(pert_round_uid)


list_pert_dialogue_usage_mr1 = {} # O1-r3 : [P1-O1-r2, P2...], (pert_round_uid), expect output(P1-O1-r2) == answer(O1-r3)
list_pert_dialogue_usage_mr2 = {} # O1-r2 : [P2-O1-r1, ...], (pert_round_uid), expect output(P2-O1-r1) != answer(O1-r2)

dict_pert_dialogue_usage_mr3 = {} # O1-r1 : {1:[P1-O1-r1, P2-O1-r3, ...]}, expect output(P1-O1-r1) == output(P2-O1-r3) and ...
dict_pert_dialogue_usage_mr4 = {} # O1-r1 : {0:[P2-O1-r1] 1:[P1-O1-r1, P2-O1-r3]}, expect output(P2-O1-r1) != output(P1-O1-r1) and output(P2-O1-r1) != output(P2-O1-r3)

for original_round_id in all_original_round_ids:
    list_pert_dialogue_usage_mr1[original_round_id] = []
    list_pert_dialogue_usage_mr2[original_round_id] = []

    this_ori_dialogue_appear_in_pert = mapping_original_round_appear_in_pert[original_round_id]

    pert_answerable_q_sum = 0
    dict_pert_answerability = {}

    for pert_round_id in this_ori_dialogue_appear_in_pert:
        pert_type = pert_round_id.split("#pert#")[0]
        pert_dialogue_key = pert_round_id.split("#pert#")[1]
        pert_round_key = pert_round_id.split("#pert#")[2]

        answerability_this_pert_round = (dataset_jsons[pert_type][pert_dialogue_key][pert_round_key]["Answer"] != "unknown") # changed answerability
        dict_pert_answerability[pert_round_key] = answerability_this_pert_round
        pert_answerable_q_sum += answerability_this_pert_round

        if answerability_this_pert_round:
            list_pert_dialogue_usage_mr1[original_round_id].append(pert_round_id)
        else:
            if dataset_jsons[pert_type][pert_dialogue_key][pert_round_key]["FLAG_answer_changed"]:
                list_pert_dialogue_usage_mr2[original_round_id].append(pert_round_id)

    if pert_answerable_q_sum == len(this_ori_dialogue_appear_in_pert) and pert_answerable_q_sum > 0: # all pert rounds are answerable
        dict_pert_dialogue_usage_mr3[original_round_id] = {1:[]}
        dict_pert_dialogue_usage_mr3[original_round_id][1] = copy.deepcopy(list_pert_dialogue_usage_mr1[original_round_id])
    elif 0<pert_answerable_q_sum<len(this_ori_dialogue_appear_in_pert): # some pert rounds are answerable, some are not
        dict_pert_dialogue_usage_mr4[original_round_id] = {0:[],1:[]}
        dict_pert_dialogue_usage_mr4[original_round_id][0] = copy.deepcopy(list_pert_dialogue_usage_mr2[original_round_id])
        dict_pert_dialogue_usage_mr4[original_round_id][1] = copy.deepcopy(list_pert_dialogue_usage_mr1[original_round_id])

for original_round_id in list_pert_dialogue_usage_mr1.copy().keys():
    if len(list_pert_dialogue_usage_mr1[original_round_id]) == 0:
        del list_pert_dialogue_usage_mr1[original_round_id]

for original_round_id in list_pert_dialogue_usage_mr2.copy().keys():
    if len(list_pert_dialogue_usage_mr2[original_round_id]) == 0:
        del list_pert_dialogue_usage_mr2[original_round_id]

for original_round_id in dict_pert_dialogue_usage_mr3.copy().keys():
    if len(dict_pert_dialogue_usage_mr3[original_round_id][1]) < 2:
        del dict_pert_dialogue_usage_mr3[original_round_id]

for original_round_id in dict_pert_dialogue_usage_mr4.copy().keys():
    if len(dict_pert_dialogue_usage_mr4[original_round_id][0]) == 0 or len(dict_pert_dialogue_usage_mr4[original_round_id][1]) == 0:
        del dict_pert_dialogue_usage_mr4[original_round_id]


In [19]:
usage_mr1 = {
    "P1_round_shuffle":0,
    "P2_round_reduce":0,
    "P3_round_duplicate":0,
    "P4_round_reduce_shuffle":0,
    "P5_round_shuffle_duplicate":0,
}

for original_round_id in list_pert_dialogue_usage_mr1.keys():
    pert_round_ids = list_pert_dialogue_usage_mr1[original_round_id]
    for pert_round_id in pert_round_ids:
        pert_type = pert_round_id.split("#pert#")[0]
        usage_mr1[pert_type] += 1
print(usage_mr1["P1_round_shuffle"]+ usage_mr1["P2_round_reduce"]+ usage_mr1["P3_round_duplicate"]+ usage_mr1["P4_round_reduce_shuffle"]+ usage_mr1["P5_round_shuffle_duplicate"])
usage_mr1

30385


{'P1_round_shuffle': 6315,
 'P2_round_reduce': 4539,
 'P3_round_duplicate': 7548,
 'P4_round_reduce_shuffle': 4510,
 'P5_round_shuffle_duplicate': 7473}

In [20]:
usage_mr2 = {
    "P1_round_shuffle":0,
    "P2_round_reduce":0,
    "P3_round_duplicate":0,
    "P4_round_reduce_shuffle":0,
    "P5_round_shuffle_duplicate":0,
}

for original_round_id in list_pert_dialogue_usage_mr2.keys():
    pert_round_ids = list_pert_dialogue_usage_mr2[original_round_id]
    for pert_round_id in pert_round_ids:
        pert_type = pert_round_id.split("#pert#")[0]
        usage_mr2[pert_type] += 1

print(usage_mr2["P1_round_shuffle"]+usage_mr2["P2_round_reduce"]+usage_mr2["P3_round_duplicate"]+usage_mr2["P4_round_reduce_shuffle"]+usage_mr2["P5_round_shuffle_duplicate"])
usage_mr2

178


{'P1_round_shuffle': 60,
 'P2_round_reduce': 24,
 'P3_round_duplicate': 0,
 'P4_round_reduce_shuffle': 53,
 'P5_round_shuffle_duplicate': 41}

In [21]:
usage_mr3 = {
    "P1_round_shuffle":0,
    "P2_round_reduce":0,
    "P3_round_duplicate":0,
    "P4_round_reduce_shuffle":0,
    "P5_round_shuffle_duplicate":0,
}

for original_round_id in dict_pert_dialogue_usage_mr3.keys():
    true_pert_round_ids = dict_pert_dialogue_usage_mr3[original_round_id][1]
    for pert_round_id in true_pert_round_ids:
        pert_type = pert_round_id.split("#pert#")[0]
        usage_mr3[pert_type] += 1
print(len(dict_pert_dialogue_usage_mr3))
usage_mr3

6194


{'P1_round_shuffle': 6194,
 'P2_round_reduce': 4426,
 'P3_round_duplicate': 7347,
 'P4_round_reduce_shuffle': 4431,
 'P5_round_shuffle_duplicate': 7341}

In [22]:
usage_mr4 = {
    "P1_round_shuffle":0,
    "P2_round_reduce":0,
    "P3_round_duplicate":0,
    "P4_round_reduce_shuffle":0,
    "P5_round_shuffle_duplicate":0,
}

for original_round_id in dict_pert_dialogue_usage_mr4.keys():
    true_pert_round_ids = dict_pert_dialogue_usage_mr4[original_round_id][1]
    false_pert_round_ids = dict_pert_dialogue_usage_mr4[original_round_id][0]

    for pert_round_id in true_pert_round_ids:
        pert_type = pert_round_id.split("#pert#")[0]
        usage_mr4[pert_type] += 1
    for pert_round_id in false_pert_round_ids:
        pert_type = pert_round_id.split("#pert#")[0]
        usage_mr4[pert_type] += 1

print(len(dict_pert_dialogue_usage_mr4))
usage_mr4


151


{'P1_round_shuffle': 151,
 'P2_round_reduce': 114,
 'P3_round_duplicate': 177,
 'P4_round_reduce_shuffle': 116,
 'P5_round_shuffle_duplicate': 164}

In [None]:
import pandas as pd
df_MORTAR_bugs = pd.read_pickle("RQ1_FOUNDED_Bugs_MORTAR_with_bug_type.pickle")

all_lms = ["qwen2_0B5", "qwen2_1B5", "qwen2_7B", "mistral03_7B", "llama3_8B", "gemma2_9B"]
MORTAR_Perts = ["P1_round_shuffle","P2_round_reduce","P3_round_duplicate","P4_round_reduce_shuffle","P5_round_shuffle_duplicate"]
dist_LLM_DS_name = {"qwen2_0B5":"DS1", "qwen2_1B5":"DS2", "qwen2_7B":"DS3", "mistral03_7B":"DS4", "llama3_8B":"DS5", "gemma2_9B":"DS6"}

result_rq_321 = {
    "qwen2_0B5": {},
    "qwen2_1B5": {},
    "qwen2_7B": {},
    "mistral03_7B": {},
    "llama3_8B": {},
    "gemma2_9B": {}
}

for lm in all_lms:
    result_rq_321[lm] = {
        "P1_round_shuffle": 0,
        "P2_round_reduce": 0,
        "P3_round_duplicate": 0,
        "P4_round_reduce_shuffle": 0,
        "P5_round_shuffle_duplicate": 0,
    }
    for pert in MORTAR_Perts:
        result_rq_321[lm][pert] += len(df_MORTAR_bugs[(df_MORTAR_bugs["use_pert_1"] == pert) & (df_MORTAR_bugs["SUT"] == lm)])
        result_rq_321[lm][pert] += len(df_MORTAR_bugs[(df_MORTAR_bugs["use_pert_2"] == pert) & (df_MORTAR_bugs["SUT"] == lm)])
    
    print("{} &{} ({}\%) &{} ({}\%) &{} ({}\%) &{} ({}\%) &{} ({}\%) \\\\".format(
        dist_LLM_DS_name[lm],
        result_rq_321[lm]["P1_round_shuffle"],
        round(result_rq_321[lm]["P1_round_shuffle"]/len(df_MORTAR_bugs[df_MORTAR_bugs["SUT"] == lm])*100,1),
        result_rq_321[lm]["P2_round_reduce"],
        round(result_rq_321[lm]["P2_round_reduce"]/len(df_MORTAR_bugs[df_MORTAR_bugs["SUT"] == lm])*100,1),
        result_rq_321[lm]["P3_round_duplicate"],
        round(result_rq_321[lm]["P3_round_duplicate"]/len(df_MORTAR_bugs[df_MORTAR_bugs["SUT"] == lm])*100,1),
        result_rq_321[lm]["P4_round_reduce_shuffle"],
        round(result_rq_321[lm]["P4_round_reduce_shuffle"]/len(df_MORTAR_bugs[df_MORTAR_bugs["SUT"] == lm])*100,1),
        result_rq_321[lm]["P5_round_shuffle_duplicate"],
        round(result_rq_321[lm]["P5_round_shuffle_duplicate"]/len(df_MORTAR_bugs[df_MORTAR_bugs["SUT"] == lm])*100,1),)
    )

print("\midrule")
print("Average & {}({}\%) & {}({}\%) & {}({}\%) & {}({}\%) & {}({}\%) \\\\".format(
    
    round(sum([result_rq_321[lm]["P1_round_shuffle"] for lm in all_lms])/6),
    round(sum([result_rq_321[lm]["P1_round_shuffle"] for lm in all_lms])/len(df_MORTAR_bugs)*100,1),
    round(sum([result_rq_321[lm]["P2_round_reduce"] for lm in all_lms])/6),
    round(sum([result_rq_321[lm]["P2_round_reduce"] for lm in all_lms])/len(df_MORTAR_bugs)*100,1),
    round(sum([result_rq_321[lm]["P3_round_duplicate"] for lm in all_lms])/6),
    round(sum([result_rq_321[lm]["P3_round_duplicate"] for lm in all_lms])/len(df_MORTAR_bugs)*100,1),
    round(sum([result_rq_321[lm]["P4_round_reduce_shuffle"] for lm in all_lms])/6),
    round(sum([result_rq_321[lm]["P4_round_reduce_shuffle"] for lm in all_lms])/len(df_MORTAR_bugs)*100,1),
    round(sum([result_rq_321[lm]["P5_round_shuffle_duplicate"] for lm in all_lms])/6),
    round(sum([result_rq_321[lm]["P5_round_shuffle_duplicate"] for lm in all_lms])/len(df_MORTAR_bugs)*100,1),)
)


DS1 &11380 (35.6\%) &7110 (22.3\%) &8494 (26.6\%) &4155 (13.0\%) &6835 (21.4\%) \\
DS2 &10369 (37.3\%) &6296 (22.6\%) &7584 (27.3\%) &3604 (13.0\%) &5878 (21.1\%) \\
DS3 &8433 (34.2\%) &4864 (19.7\%) &6414 (26.0\%) &3511 (14.3\%) &5799 (23.5\%) \\
DS4 &7479 (32.6\%) &4499 (19.6\%) &6003 (26.2\%) &3448 (15.0\%) &5322 (23.2\%) \\
DS5 &6775 (39.2\%) &3807 (22.1\%) &4262 (24.7\%) &2599 (15.1\%) &3920 (22.7\%) \\
DS6 &5200 (40.6\%) &2623 (20.5\%) &3042 (23.8\%) &1856 (14.5\%) &3160 (24.7\%) \\
\midrule
Average & 8273(36.1\%) & 4866(21.3\%) & 5966(26.1\%) & 3196(14.0\%) & 5152(22.5\%) \\


In [4]:
# MR contribution to bug discovery

all_lms = ["qwen2_0B5", "qwen2_1B5", "qwen2_7B", "mistral03_7B", "llama3_8B", "gemma2_9B"]
MRs = ["MR1", "MR2", "MR3", "MR4"]
dist_LLM_DS_name = {"qwen2_0B5":"DS1", "qwen2_1B5":"DS2", "qwen2_7B":"DS3", "mistral03_7B":"DS4", "llama3_8B":"DS5", "gemma2_9B":"DS6"}

result_rq_322 = {
    "qwen2_0B5": {},
    "qwen2_1B5": {},
    "qwen2_7B": {},
    "mistral03_7B": {},
    "llama3_8B": {},
    "gemma2_9B": {}
}

for lm in all_lms:
    result_rq_322[lm] = {
        "MR1": 0,
        "MR2": 0,
        "MR3": 0,
        "MR4": 0,
    }
    for MR in MRs:
        result_rq_322[lm][MR] += len(df_MORTAR_bugs[(df_MORTAR_bugs["use_mr"] == MR) & (df_MORTAR_bugs["SUT"] == lm)])

    print("{} &{}({}\%) &{}({}\%) &{}({}\%) &{}({}\%) \\\\".format(
        dist_LLM_DS_name[lm],
        result_rq_322[lm]["MR1"],
        round(result_rq_322[lm]["MR1"]/len(df_MORTAR_bugs[df_MORTAR_bugs["SUT"] == lm])*100,1),
        result_rq_322[lm]["MR2"],
        round(result_rq_322[lm]["MR2"]/len(df_MORTAR_bugs[df_MORTAR_bugs["SUT"] == lm])*100,2),
        result_rq_322[lm]["MR3"],
        round(result_rq_322[lm]["MR3"]/len(df_MORTAR_bugs[df_MORTAR_bugs["SUT"] == lm])*100,1),
        result_rq_322[lm]["MR4"],
        round(result_rq_322[lm]["MR4"]/len(df_MORTAR_bugs[df_MORTAR_bugs["SUT"] == lm])*100,1),
    ))
print("\midrule")

print("Average & {}({}\%) & {}({}\%) & {}({}\%) & {}({}\%) \\\\".format(
    round(sum([result_rq_322[lm]["MR1"] for lm in all_lms])/6),
    round(sum([result_rq_322[lm]["MR1"] for lm in all_lms])/len(df_MORTAR_bugs)*100,1),
    round(sum([result_rq_322[lm]["MR2"] for lm in all_lms])/6),
    round(sum([result_rq_322[lm]["MR2"] for lm in all_lms])/len(df_MORTAR_bugs)*100,2),

    round(sum([result_rq_322[lm]["MR3"] for lm in all_lms])/6),
    round(sum([result_rq_322[lm]["MR3"] for lm in all_lms])/len(df_MORTAR_bugs)*100,1),

    round(sum([result_rq_322[lm]["MR4"] for lm in all_lms])/6),
    round(sum([result_rq_322[lm]["MR4"] for lm in all_lms])/len(df_MORTAR_bugs)*100,1),
    )
)


DS1 &25903(81.1\%) &7(0.02\%) &5966(18.7\%) &66(0.2\%) \\
DS2 &21861(78.6\%) &6(0.02\%) &5844(21.0\%) &88(0.3\%) \\
DS3 &20242(82.2\%) &5(0.02\%) &4275(17.4\%) &112(0.5\%) \\
DS4 &19074(83.2\%) &13(0.06\%) &3692(16.1\%) &140(0.6\%) \\
DS5 &13153(76.2\%) &12(0.07\%) &3975(23.0\%) &124(0.7\%) \\
DS6 &9722(75.9\%) &7(0.05\%) &2956(23.1\%) &120(0.9\%) \\
\midrule
Average & 18326(80.0\%) & 8(0.04\%) & 4451(19.4\%) & 108(0.5\%) \\


In [5]:
for MR in MRs:
    print("{} & {}\% & {}\% & {}\% \\\\ ".format(
        MR,
        
        round(len(df_MORTAR_bugs[(df_MORTAR_bugs["bug_type"] == 1) & (df_MORTAR_bugs["use_mr"] == MR)])/len(df_MORTAR_bugs[df_MORTAR_bugs["use_mr"] == MR])*100,1),
        
        round(len(df_MORTAR_bugs[(df_MORTAR_bugs["bug_type"] == 2) & (df_MORTAR_bugs["use_mr"] == MR)])/len(df_MORTAR_bugs[df_MORTAR_bugs["use_mr"] == MR])*100,1),
        
        round(len(df_MORTAR_bugs[(df_MORTAR_bugs["bug_type"] == 3) & (df_MORTAR_bugs["use_mr"] == MR)])/len(df_MORTAR_bugs[df_MORTAR_bugs["use_mr"] == MR])*100,1),
    ))

MR1 & 78.2\% & 21.4\% & 0.4\% \\ 
MR2 & 48.0\% & 52.0\% & 0.0\% \\ 
MR3 & 66.9\% & 32.4\% & 0.7\% \\ 
MR4 & 51.4\% & 47.5\% & 1.1\% \\ 
