In [None]:
import os
import csv
import json
import subprocess
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib_venn import venn2, venn3
from itertools import product
from pathlib import Path

In [None]:
from FLResultAnalyst import FLResultAnalyst
from Utils.STEnvConfig import get_pathConfig
from Utils.DatasetConfig import get_D4Jprojects, get_D4Jversions, get_SrcPath4D4J, get_TestCases4D4J 
from Utils.PandasHelper import move_column_to_pos, move_rows_with_value_to_end
from Utils.ColorPalette import GenshinImpactColorPalette as GIColorPalette

In [None]:
# Resolve the root directories used by the rest of the notebook from the
# environment configuration.
pathConfig = get_pathConfig()
if not pathConfig:
    # Without a configuration D4J / MBFL_Metric would stay undefined and the
    # prints below would fail with an unrelated-looking NameError.
    raise RuntimeError(
        "get_pathConfig() returned no configuration; cannot resolve the 'D4J' and 'MBFL_Metric' paths."
    )
D4J = Path(pathConfig["D4J"])
MBFL_Metric = Path(pathConfig["MBFL_Metric"])
print(D4J.as_posix())
print(MBFL_Metric.as_posix())

In [None]:
# Experiment grid: each name holds the values of one experimental dimension.
Granularity = ["Statement"]
Dataset = ["Defects4J"]

# Mutation tools grouped by mutation type; insertion order is significant
# because MutationType is derived from it.
Tool = dict(
    NeuralMutation=["mBERT"],
    TraditionalMutation=["major"],
    MergeMutation=["major_SmBERT", "mBERT_Smajor", "U_mBERT_major"],
    MergeSus=["SusDRankAvg"],
)
# The mutation types are exactly the keys of Tool, in declaration order.
MutationType = list(Tool)

Approach = ["FACombination"]
KillType = ["kill_type3"]
Aggregation = ["max"]
TieBreak = ["Avg"]
Metric = ["TopN", "EXAM", "MEAN"]
Formula = [
    "Dstar", "Ochiai", "Jaccard", "Op2",
    "Tarantula", "Gp13", "Muse",
]

## **MTP (Mutant Test Pair)**

In [None]:
def get_executed_mutant_count4mBERT(project, version):
    """Count the mBERT mutant result files stored for one project version.

    Args:
        project: Project name, used as a directory name.
        version: Version id; the directory looked up is ``{version}b``.

    Returns:
        The number of ``.json`` entries reported by ``get_files_with_suffix``
        under ``<NeuralMutationResult>/mBERT/Defects4J/result4FaultFile_json/
        {project}/{version}b``, or ``-1`` when that directory does not exist.

    Raises:
        RuntimeError: If ``get_pathConfig()`` returns no configuration.
    """
    from Utils.STEnvConfig import get_pathConfig
    from Utils.FileStatistic import get_files_with_suffix

    pathConfig = get_pathConfig()
    if not pathConfig:
        # Previously this fell through and failed later on an unbound local.
        raise RuntimeError("get_pathConfig() returned no configuration; 'NeuralMutationResult' is unavailable.")

    NeuralMutationResult = Path(pathConfig["NeuralMutationResult"])
    project_version_MutantRepoPath = (
        NeuralMutationResult / "mBERT" / "Defects4J/result4FaultFile_json" / f"{project}/{str(version)}b"
    )

    # -1 is the sentinel callers use for "no mutant results for this version".
    if not project_version_MutantRepoPath.exists():
        return -1
    return len(get_files_with_suffix(project_version_MutantRepoPath, suffix=".json"))

def get_executed_mutant_count4major(project, version):
    """Count the major mutant result files stored for one project version.

    Args:
        project: Project name, used as a directory name.
        version: Version id; the directory looked up is ``{version}b``.

    Returns:
        The number of ``.json`` entries reported by ``get_files_with_suffix``
        under ``<TraditionalMutationResult>/major/Defects4J/result4FaultFile_json/
        {project}/{version}b``, or ``-1`` when that directory does not exist.

    Raises:
        RuntimeError: If ``get_pathConfig()`` returns no configuration.
    """
    from Utils.STEnvConfig import get_pathConfig
    from Utils.FileStatistic import get_files_with_suffix

    pathConfig = get_pathConfig()
    if not pathConfig:
        # Previously this fell through and failed later on an unbound local.
        raise RuntimeError("get_pathConfig() returned no configuration; 'TraditionalMutationResult' is unavailable.")

    TraditionalMutationResult = Path(pathConfig["TraditionalMutationResult"])

    # Every project uses the same directory layout, so no per-project branch
    # is needed to build the path.
    project_version_MutantRepoPath = (
        TraditionalMutationResult / "major" / "Defects4J/result4FaultFile_json" / f"{project}/{str(version)}b"
    )

    # -1 is the sentinel callers use for "no mutant results for this version".
    if not project_version_MutantRepoPath.exists():
        return -1
    return len(get_files_with_suffix(project_version_MutantRepoPath, suffix=".json"))

In [None]:
# Compute, per project, the average number of mutant-test pairs (MTP =
# mutants * test cases, averaged over the versions that have mutant results)
# for mBERT and major, and persist the table to ./Results/MTP.csv.
projects = get_D4Jprojects(DatasetVersion="v2.0")

# One row per project: [project, mBERT average MTP, major average MTP].
results = []

for project in projects:

    versions = get_D4Jversions(project)

    mbert_total_MTP = 0
    major_total_MTP = 0
    # Start from every version and discount those without mutant results.
    mbert_num_versions = len(versions)
    major_num_versions = len(versions)

    for version in versions:
        print(f"---------{project} {version}---------")

        mbert_mutants = get_executed_mutant_count4mBERT(project, version)
        major_mutants = get_executed_mutant_count4major(project, version)

        version_testcases = len(get_TestCases4D4J(project, version))

        # -1 means "no result directory for this version": exclude it from the average.
        if mbert_mutants == -1:
            mbert_num_versions -= 1
        else:
            mbert_total_MTP += mbert_mutants * version_testcases

            print(f"mBERT [MTP:{mbert_total_MTP}] [mutants:{mbert_mutants}] [testcases:{version_testcases}] ")

        if major_mutants == -1:
            major_num_versions -= 1
        else:
            major_total_MTP += major_mutants * version_testcases

            # This line reports the major tool; it was previously labelled "mBERT".
            print(f"major [MTP:{major_total_MTP}] [mutants:{major_mutants}] [testcases:{version_testcases}] ")

    mbert_avg_MTP = mbert_total_MTP / mbert_num_versions if mbert_num_versions > 0 else 0
    major_avg_MTP = major_total_MTP / major_num_versions if major_num_versions > 0 else 0

    results.append([project, mbert_avg_MTP, major_avg_MTP])

# Overall value = unweighted mean of the per-project averages.
overall_mbert_avg_MTP = sum(row[1] for row in results) / len(results) if len(results) > 0 else 0
overall_major_avg_MTP = sum(row[2] for row in results) / len(results) if len(results) > 0 else 0

# Make sure the output directory exists before writing.
os.makedirs('./Results', exist_ok=True)

with open('./Results/MTP.csv', 'w', newline='') as csvfile:
    writer = csv.writer(csvfile)

    writer.writerow(['Project', 'mBERT Average MTP', 'Major Average MTP'])

    writer.writerows(results)

    # The plotting cell selects the row whose Project is 'Overall', so the
    # overall averages have to be written to the file as well.
    writer.writerow(['Overall', overall_mbert_avg_MTP, overall_major_avg_MTP])

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from scipy.stats import gaussian_kde

from Utils.ColorPalette import GenshinImpactColorPalette as GIColorPalette

In [None]:
# Font settings read by the plotting cells that follow.
plt.rcParams.update({'font.family': 'Times New Roman'})

title_fontsize, label_fontsize = 24, 20

In [None]:
# Hex colour palettes; palette_6 picks five Tighnari entries plus one Nilou
# entry, and palette_2 is its first two colours.
palette_Tighnari = GIColorPalette.get_palette('Tighnari', format="hex")
palette_Nilou = GIColorPalette.get_palette('Nilou', format="hex")
palette_6 = [palette_Tighnari[idx] for idx in (5, 2, 7, 1, 9)]
palette_6.append(palette_Nilou[4])
palette_2 = palette_6[:2]

In [None]:
# Create the result analyst and set the version recorded for its dataset to "v2.0".
flra = FLResultAnalyst()
flra.DatasetVersion[flra.dataset]="v2.0"

In [None]:
# Bar chart of the overall average MTP for Traditional-MBFL (major) vs. Neural-MBFL (mBERT).
data = pd.read_csv('./Results/MTP.csv')
measurement_cols = ['Major Average MTP', 'mBERT Average MTP']
overall_data = data[data['Project'] == 'Overall']
if overall_data.empty:
    # The CSV has no precomputed 'Overall' row: fall back to the unweighted
    # mean of the per-project averages instead of failing on .iloc[0].
    measurements = data[measurement_cols].mean()
else:
    measurements = overall_data[measurement_cols].iloc[0]
# Label order matches measurement_cols (major first, then mBERT).
labels = ['Traditional-MBFL', 'Neural-MBFL']
x = [0.5 * i for i in np.arange(len(labels))]
width = 0.35

fig, ax = plt.subplots(figsize=(6,6))
rects = ax.bar(x, measurements, width, color=palette_2)

ax.bar_label(rects, padding=3, fmt='%.0f', fontsize=label_fontsize)

ax.set_ylabel('Mutant-Test Pair (MTP)', fontsize=label_fontsize)
ax.set_xticks(x)
ax.set_xticklabels(labels, fontsize=label_fontsize)
ax.tick_params(axis='both', labelsize=label_fontsize)
ax.set_ylim(0, 1200000)
# Show plain numbers on the y axis rather than scientific notation.
ax.get_yaxis().get_major_formatter().set_scientific(False)

# Make sure the output directory exists before saving the figure.
Path("./Results/MTP").mkdir(parents=True, exist_ok=True)
plt.savefig("./Results/MTP/MTP.pdf", bbox_inches='tight', pad_inches=0.1)

## **Top-N, MAP**

In [None]:
# Fresh analyst plus the parameter grid for the Top-N / MAP comparison:
# every key maps to the list of values to include for that dimension.
flra = FLResultAnalyst()
param = {
    "Kill Type": ["kill_type3"],
    "Approach": ["FACombination"],
    "Mutation Type": ["NeuralMutation", "TraditionalMutation", "MergeMutation", "MergeSus"],
    "Mutation Method": ["major", "mBERT", "major_SmBERT", "mBERT_Smajor", "U_mBERT_major", "SusDRankAvg"],
    "Aggregation": ["max"],
    "Formula": ["Dstar", "Ochiai", "Jaccard", "Op2", "Tarantula", "Gp13", "Muse"],
    "Tie Break": ["Avg"]
}

In [None]:
# Set the version recorded for the analyst's dataset to "v2.0".
flra.DatasetVersion[flra.dataset]="v2.0"

In [None]:
# Column layout, display names and category orders for the Top-N / MAP table.
reorder_col = [
    'Dataset', 'Granularity', 'Project', 'Kill Type', 'Approach',
    'Aggregation', 'Tie Break', 'Formula', 'Mutation Type', 'Mutation Method',
    'top1', 'top3', 'top5', 'MAP',
]
rename_col = {
    'top1': 'Top1',
    'top3': 'Top3',
    'top5': 'Top5',
}
MutationMethodOrder = ["major", "mBERT", "major_SmBERT", "mBERT_Smajor", "U_mBERT_major", "SusDRankAvg"]
FormulaOrder = ["Dstar", "Ochiai", "Jaccard", "Op2", "Tarantula", "Gp13", "Muse"]

# Fetch the summary, fix the column order/names, make the two sort keys
# ordered categoricals, and sort by formula and then by mutation method.
all_summary_dfs = (
    flra.compare_topn_mean_summary_by_param(param)[reorder_col]
    .rename(columns=rename_col)
    .assign(**{
        "Mutation Method": lambda frame: pd.Categorical(
            frame["Mutation Method"],
            categories=MutationMethodOrder, ordered=True
        ),
        "Formula": lambda frame: pd.Categorical(
            frame["Formula"],
            categories=FormulaOrder, ordered=True
        ),
    })
    .sort_values(by=["Formula", "Mutation Method"], ascending=[True, True])
)

In [None]:
# Keep the rows whose Project is "Summary", then add one extra row per
# technique holding the mean of Top1/Top3/Top5/MAP across all formulas.
summary_dfs = all_summary_dfs[(all_summary_dfs["Project"] == "Summary")]
summary_dfs_grouped = summary_dfs.groupby(
    # 'Mutation Method'
    [
        'Dataset', 'Granularity', 'Project', 'Kill Type', 'Approach',
        'Aggregation', 'Tie Break', 'Mutation Type', 'Mutation Method',
    ]
    , observed=True
).agg({
    # The averaged rows are labelled with the pseudo-formula "Avg.".
    "Formula": lambda x: "Avg.", 
    "Top1": 'mean',
    "Top3": 'mean',
    "Top5": 'mean',
    "MAP": 'mean',
}).reset_index()
summary_dfs = pd.concat([summary_dfs, summary_dfs_grouped], ignore_index=True)

# Rebuild the Formula categorical so that "Avg." sorts after the real formulas.
FormulaOrder = ["Dstar", "Ochiai", "Jaccard", "Op2", "Tarantula", "Gp13", "Muse", "Avg."]
summary_dfs["Formula"] = pd.Categorical(
    summary_dfs["Formula"], 
    categories=FormulaOrder, ordered=True
)
summary_dfs = summary_dfs.sort_values(by=["Formula", "Mutation Method"], ascending=[True, True], ignore_index=True)

# Display names for the mutation methods.
MutationMethodNameMap = {
    "major":"Traditional-MBFL",
    "mBERT":"Neural-MBFL",
    "major_SmBERT":"NeuraIntegra-MBFL$_{Mutation}^{Traditional-Center}$",
    "mBERT_Smajor":"NeuraIntegra-MBFL$_{Mutation}^{Neural-Center}$",
    "U_mBERT_major":"NeuraIntegra-MBFL$_{Mutation}^{Union}$",
    "SusDRankAvg":"NeuraIntegra-MBFL$_{Suspiciousness}$",
}
# NOTE(review): the .cat accessor requires "Mutation Method" to still be
# categorical after the concat above - confirm on the pandas version in use.
summary_dfs["Mutation Method"] = summary_dfs["Mutation Method"].cat.rename_categories(MutationMethodNameMap)

summary_dfs = summary_dfs.rename(columns={"Mutation Method": "Technique"})

# Last expression of the cell: displays the resulting table.
summary_dfs[summary_dfs["Project"] == "Summary"]

In [None]:
# Export the Top-N / MAP table twice: for all six techniques, and for the
# two baseline techniques only. The aggregation name goes into the file name.
agg_type = param["Aggregation"][0].upper()
reported_cols = ['Formula', 'Technique', 'Top1', 'Top3', 'Top5', 'MAP']
summary_dfs_6 = summary_dfs.loc[:, reported_cols]
baseline_mask = summary_dfs_6["Technique"].isin(["Traditional-MBFL", "Neural-MBFL"])
summary_dfs_2 = summary_dfs_6.loc[baseline_mask]
summary_dfs_6.to_csv(f"./Results/TopN_MEAN_6_{agg_type}.csv", index=False)
summary_dfs_2.to_csv(f"./Results/TopN_MEAN_2_{agg_type}.csv", index=False)

## **EXAM Analysis**

In [None]:
# Fresh analyst plus the parameter grid for the EXAM comparison:
# every key maps to the list of values to include for that dimension.
flra = FLResultAnalyst()
param = {
    "Kill Type": ["kill_type3"],
    "Approach": ["FACombination"],
    "Mutation Type": ["NeuralMutation", "TraditionalMutation", "MergeMutation", "MergeSus"],
    "Mutation Method": ["major", "mBERT", "major_SmBERT", "mBERT_Smajor", "U_mBERT_major", "SusDRankAvg"],
    "Aggregation": ["max"],
    "Formula": ["Dstar", "Ochiai", "Jaccard", "Op2", "Tarantula", "Gp13", "Muse"],
    "Tie Break": ["Avg"]
}

In [None]:
# Set the version recorded for the analyst's dataset to "v2.0".
flra.DatasetVersion[flra.dataset]="v2.0"

In [None]:
# EXAM summary for the grid in `param`, compared across mutation methods, with rank columns dropped.
exam_summary_dfs = flra.compare_exam_summary_by_param(param, independent_variable="Mutation Method", drop_rank=True)

In [None]:
# Display the EXAM summary computed in the previous cell.
exam_summary_dfs

### **EXAM Distribution**

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from scipy.stats import gaussian_kde

from Utils.ColorPalette import GenshinImpactColorPalette as GIColorPalette

In [None]:
# Font settings read by the EXAM distribution plots below.
plt.rcParams.update({'font.family': 'Times New Roman'})
title_fontsize, label_fontsize = 36, 30

In [None]:
# Colour palettes in get_palette's default format; palette_6 picks five
# Tighnari entries plus one Nilou entry, palette_2 is its first two colours.
palette_Tighnari = GIColorPalette.get_palette('Tighnari')
palette_Nilou = GIColorPalette.get_palette('Nilou')
palette_6 = [palette_Tighnari[idx] for idx in (5, 2, 7, 1, 9)]
palette_6.append(palette_Nilou[4])
palette_2 = palette_6[:2]

In [None]:
# Preview the 'Tighnari' palette.
GIColorPalette.show_palette('Tighnari')

In [None]:
# Preview the 'Nilou' palette.
GIColorPalette.show_palette('Nilou')

In [None]:
# Preview the six colours selected for the technique plots.
sns.palplot(palette_6)
plt.title("Color Palette")

In [None]:
# Create the result analyst and set the version recorded for its dataset to "v2.0".
flra = FLResultAnalyst()
flra.DatasetVersion[flra.dataset]="v2.0"

In [None]:
# Display label for each EXAM column; the keys are the mutation-method
# names prefixed with "EXAM_".
TechniqueNameMap = {
    f"EXAM_{method}": label
    for method, label in [
        ("major", "Traditional-MBFL"),
        ("mBERT", "Neural-MBFL"),
        ("major_SmBERT", "NeuraIntegra-MBFL$_{Mutation}^{Traditional-Center}$"),
        ("mBERT_Smajor", "NeuraIntegra-MBFL$_{Mutation}^{Neural-Center}$"),
        ("U_mBERT_major", "NeuraIntegra-MBFL$_{Mutation}^{Union}$"),
        ("SusDRankAvg", "NeuraIntegra-MBFL$_{Suspiciousness}$"),
    ]
}

In [None]:
# For every formula, draw four panels comparing the EXAM distribution of the
# six techniques (KDE density, histogram, empirical CDF, KDE-smoothed CDF)
# and save the figure as ./Results/EXAM/EXAM_6_<formula>.pdf.
for formula_index, formula in enumerate(flra.Formula):
    param = {
        "Kill Type": ["kill_type3"],
        "Approach": ["FACombination"],
        "Mutation Type": ["MergeSus", "MergeMutation", "NeuralMutation", "TraditionalMutation"],
        "Mutation Method": ["major", "mBERT", "major_SmBERT", "mBERT_Smajor", "U_mBERT_major", "SusDRankAvg"],
        "Aggregation": ["max"],
        "Formula": [formula],
        "Tie Break": ["Avg"]
    }

    exam_summary_dfs = flra.compare_exam_summary_by_param(param, "Mutation Method", drop_rank=False)
    exam_columns = [col for col in exam_summary_dfs.columns if col.startswith('EXAM_')]
    # Missing EXAM values are replaced by 1 before plotting.
    exam_summary_dfs[exam_columns] = exam_summary_dfs[exam_columns].fillna(value=1)

    # Plotting order of the EXAM columns; position i is drawn with palette_6[i].
    # NOTE(review): these indices depend on the column order returned by
    # compare_exam_summary_by_param - confirm they select the intended techniques.
    columns4analysis = [exam_columns[i] for i in [5, 4, 1, 2, 3, 0]]

    fig, axs = plt.subplots(1, 4, figsize=(42, 6))

    for column_index, column in enumerate(columns4analysis):
        color = palette_6[column_index]
        data = exam_summary_dfs[column].dropna()

        # Evaluation grid for the smoothed CDF, padded by 3 on both sides of the data range.
        x = np.linspace(data.min() - 3, data.max() + 3, 1000)

        # Smoothed CDF: cumulative sum of the KDE density, normalised to end at 1.
        kde = gaussian_kde(data)
        pdf = kde.evaluate(x)
        cdf = np.cumsum(pdf)
        cdf /= cdf[-1]

        # Panel 0: kernel density estimate.
        sns.kdeplot(data, bw_adjust=0.5, label=TechniqueNameMap[column], fill=True, color=color, alpha=0.05, ax=axs[0])
        axs[0].set_title('Probability Density Function', fontsize=title_fontsize)
        axs[0].tick_params(axis='both', labelsize=label_fontsize)
        axs[0].set_xlabel('EXAM', fontsize=label_fontsize)
        axs[0].set_ylabel('Density', fontsize=label_fontsize)

        # Panel 1: histogram, each technique normalised on its own.
        bins = 25
        sns.histplot(data, stat='probability', bins=bins, label=TechniqueNameMap[column], element='step', common_norm=False, color=color, alpha=0.05, ax=axs[1])
        axs[1].set_title('Probability Distribution Function', fontsize=title_fontsize)
        axs[1].tick_params(axis='both', labelsize=label_fontsize)
        axs[1].set_xlabel('EXAM', fontsize=label_fontsize)
        axs[1].set_ylabel('Probability', fontsize=label_fontsize)

        # Panel 2: empirical CDF.
        sns.ecdfplot(data, label=TechniqueNameMap[column], color=color, alpha=0.5, ax=axs[2])
        axs[2].set_title('Empirical Cumulative Distribution Function', fontsize=title_fontsize)
        axs[2].tick_params(axis='both', labelsize=label_fontsize)
        axs[2].set_xlabel('EXAM', fontsize=label_fontsize)
        axs[2].set_ylabel('Cumulative Probability', fontsize=label_fontsize)

        # Panel 3: KDE-smoothed CDF, clipped to the area around [0, 1].
        axs[3].plot(x, cdf, label=TechniqueNameMap[column], color=color, alpha=0.5)
        axs[3].set_title('Smooth Cumulative Distribution Function', fontsize=title_fontsize)
        axs[3].tick_params(axis='both', labelsize=label_fontsize)
        axs[3].set_xlabel('EXAM', fontsize=label_fontsize)
        axs[3].set_ylabel('Cumulative Probability', fontsize=label_fontsize)
        axs[3].set_xlim(-0.25, 1.25)

    # Only the figure of the first formula carries the two legends.
    if formula_index == 0:
        handles, labels = axs[0].get_legend_handles_labels()
        fig.legend(handles, labels, loc='upper center', ncol=3, bbox_to_anchor=(0.25, 1.26), fontsize=label_fontsize)

        handles, labels = axs[2].get_legend_handles_labels()
        fig.legend(handles, labels, loc='upper center', ncol=3, bbox_to_anchor=(0.75, 1.26), fontsize=label_fontsize)

    plt.tight_layout(rect=[0, 0, 1, 0.95])

    plt.savefig(f'./Results/EXAM/EXAM_6_{formula}.pdf', bbox_inches='tight', pad_inches=0.1)
    

In [None]:
# For every formula, draw four panels comparing the EXAM distribution of
# Traditional-MBFL and Neural-MBFL and save ./Results/EXAM/EXAM_2_<formula>.pdf.
for formula_index, formula in enumerate(flra.Formula):
    param = {
        "Kill Type": ["kill_type3"],
        "Approach": ["FACombination"],
        "Mutation Type": ["TraditionalMutation", "NeuralMutation"],
        "Mutation Method": ["major", "mBERT"],
        "Aggregation": ["max"],
        "Formula": [formula],
        "Tie Break": ["Avg"],
    }

    exam_summary_dfs = flra.compare_exam_summary_by_param(param, "Mutation Method", drop_rank=False)
    exam_columns = [name for name in exam_summary_dfs.columns if name.startswith('EXAM_')]
    # Missing EXAM values are replaced by 1 before plotting.
    exam_summary_dfs[exam_columns] = exam_summary_dfs[exam_columns].fillna(value=1)

    fig, axs = plt.subplots(1, 4, figsize=(42, 6))
    pdf_ax, hist_ax, ecdf_ax, smooth_ax = axs

    for column_index, column in enumerate(exam_columns):
        color = palette_2[column_index]
        technique = TechniqueNameMap[column]
        data = exam_summary_dfs[column].dropna()

        # Smoothed CDF: cumulative sum of the KDE density on a padded grid,
        # normalised so that it ends at 1.
        x = np.linspace(data.min() - 3, data.max() + 3, 1000)
        kde = gaussian_kde(data)
        pdf = kde.evaluate(x)
        cdf = np.cumsum(pdf)
        cdf /= cdf[-1]
        bins = 25

        # Draw the technique on each of the four panels.
        sns.kdeplot(data, bw_adjust=0.5, label=technique, fill=True, color=color, alpha=0.05, ax=pdf_ax)
        sns.histplot(data, stat='probability', bins=bins, label=technique, element='step', common_norm=False, color=color, alpha=0.05, ax=hist_ax)
        sns.ecdfplot(data, label=technique, color=color, alpha=0.5, ax=ecdf_ax)
        smooth_ax.plot(x, cdf, label=technique, color=color, alpha=0.5)

        # Titles, tick sizes and axis labels are re-applied after every
        # technique, as the panels are independent of each other.
        panel_specs = (
            (pdf_ax, 'Probability Density Function', 'Density'),
            (hist_ax, 'Probability Distribution Function', 'Probability'),
            (ecdf_ax, 'Empirical Cumulative Distribution Function', 'Cumulative Probability'),
            (smooth_ax, 'Smooth Cumulative Distribution Function', 'Cumulative Probability'),
        )
        for panel, panel_title, panel_ylabel in panel_specs:
            panel.set_title(panel_title, fontsize=title_fontsize)
            panel.tick_params(axis='both', labelsize=label_fontsize)
            panel.set_xlabel('EXAM', fontsize=label_fontsize)
            panel.set_ylabel(panel_ylabel, fontsize=label_fontsize)

        # Fixed axis ranges.
        pdf_ax.set_ylim(0.0, 3.6)
        hist_ax.set_ylim(0.0, 0.27)
        smooth_ax.set_xlim(-0.25, 1.25)

    # Only the figure of the first formula carries the two legends.
    if formula_index == 0:
        handles, labels = axs[1].get_legend_handles_labels()
        fig.legend(handles, labels, loc='upper center', ncol=2, bbox_to_anchor=(0.25, 1.11), fontsize=label_fontsize)

        handles, labels = axs[3].get_legend_handles_labels()
        fig.legend(handles, labels, loc='upper center', ncol=2, bbox_to_anchor=(0.75, 1.11), fontsize=label_fontsize)

    plt.tight_layout(rect=[0, 0, 1, 0.95])

    plt.savefig(f'./Results/EXAM/EXAM_2_{formula}.pdf', bbox_inches='tight', pad_inches=0.1)
    

### **Correlation Analysis**

In [None]:
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from matplotlib.colors import LinearSegmentedColormap

import numpy as np
import seaborn as sns
import scipy.stats as stats
from scipy import stats
import dcor

In [None]:

# Font settings read by the correlation plots below.
plt.rcParams.update({'font.family': 'Times New Roman'})

title_fontsize, label_fontsize = 40, 36

In [None]:
import fitz  

def crop_pdf(input_pdf_path, output_pdf_path, left=0, right=0, top=0, bottom=0):
    """Shrink the crop box of every page of a PDF and save the result.

    Args:
        input_pdf_path: Path of the PDF to read.
        output_pdf_path: Path the cropped PDF is written to.
        left, right, top, bottom: Amount trimmed from the corresponding edge
            of each page rectangle, in the page's own coordinate units.
    """
    doc = fitz.open(input_pdf_path)
    try:
        for page in doc:
            rect = page.rect
            # Move each edge of the page rectangle inwards by the requested margin.
            new_rect = fitz.Rect(rect.x0 + left, rect.y0 + top, rect.x1 - right, rect.y1 - bottom)
            page.set_cropbox(new_rect)

        doc.save(output_pdf_path)
    finally:
        # Release the document even when cropping or saving fails.
        doc.close()

In [None]:
# For every formula, scatter EXAM of Neural-MBFL against EXAM of
# Traditional-MBFL (coloured by a 2-D KDE density) and annotate the plot with
# their distance correlation. One PDF per formula is written to ./Results/CorrAnalysis/.
for formula_index, formula in enumerate(flra.Formula):
    param = {
        "Kill Type": ["kill_type3"],
        "Approach": ["FACombination"],
        "Mutation Type": ["TraditionalMutation", "NeuralMutation"],
        "Mutation Method": ["major", "mBERT"],
        "Aggregation": ["max"],
        "Formula": [formula],
        "Tie Break": ["Avg"]
    }

    exam_summary_dfs = flra.compare_exam_summary_by_param(param, "Mutation Method", drop_rank=False)
    exam_columns = [col for col in exam_summary_dfs.columns if col.startswith('EXAM_')]
    # Missing EXAM values are replaced by 1 before the analysis.
    exam_summary_dfs[exam_columns] = exam_summary_dfs[exam_columns].fillna(value=1)
    
    # Kendall rank correlation (printed only).
    correlation = exam_summary_dfs['EXAM_mBERT'].corr(exam_summary_dfs['EXAM_major'], method='kendall')
    print(f'The number of point include in the analysis: {exam_summary_dfs.shape[0]}')
    print(f'The correlation between EXAM_mBERT and EXAM_major is: {correlation:.4f}')
    
    # Distance correlation (printed and shown in the figure annotation).
    distance_corr = dcor.distance_correlation(exam_summary_dfs['EXAM_mBERT'], exam_summary_dfs['EXAM_major'])
    print(f"Distance Correlation between EXAM_mBERT and EXAM_major: {distance_corr:.4f}")
    
    # 2-D KDE evaluated at the data points themselves; used as point colour.
    data = exam_summary_dfs[['EXAM_mBERT', 'EXAM_major']].values.T
    kde = stats.gaussian_kde(data)
    density = kde(data)
    vmin, vmax = density.min(), density.max()

    fig, ax = plt.subplots(1, 1, figsize=(16, 12))

    # NOTE(review): custom_cmap is not defined in any cell visible here - it
    # must exist in the kernel before this cell runs; confirm where it is created.
    reversed_cmap = custom_cmap.reversed()  # Assumes custom_cmap is the name of your original colormap variable
    
    sns.scatterplot(x='EXAM_mBERT', y='EXAM_major', data=exam_summary_dfs, alpha=1, s=100, c=density, cmap=reversed_cmap)
    
    # Colour bar built from a standalone mappable normalised to [0, vmax].
    norm = plt.Normalize(vmin=0, vmax=vmax)
    sm = plt.cm.ScalarMappable(cmap=reversed_cmap, norm=norm)
    sm.set_array([])
    cbar = plt.colorbar(sm, ax=ax, ticks=np.arange(0, vmax + 0.25, 0.25))
    cbar.set_label('Density', fontsize=label_fontsize)
    cbar.ax.tick_params(axis='both', labelsize=label_fontsize)

    plt.xlabel('EXAM of Neural-MBFL', fontsize=label_fontsize)
    plt.ylabel('EXAM of Traditional-MBFL', fontsize=label_fontsize)
    ax.tick_params(axis='both', labelsize=label_fontsize)
    plt.xlim(-0.02, 1.02)
    plt.ylim(-0.02, 1.02)
    
    # Annotation box placed above the axes (axes-fraction coordinates).
    textstr = (
        r"$\bf{Distance\ Correlation\ Analysis\ of\ EXAM}$" + "\n"
        "Neural-MBFL and Traditional-MBFL" + f": {distance_corr:.4f}"
    )

    ax.text(-0.05, 1.15, textstr, transform=ax.transAxes, fontsize=label_fontsize,
              verticalalignment='top', bbox=dict(boxstyle='round,pad=0.2', facecolor='white', edgecolor='black', alpha=0.5))
    
    plt.grid(True)
    plt.savefig(f'./Results/CorrAnalysis/CorrAnalysis_2_{formula}.pdf', bbox_inches='tight', pad_inches=0.15)
    plt.show()

In [None]:
# For every formula, draw a 3-D scatter of the EXAM scores of Neural-MBFL,
# Traditional-MBFL and NeuraIntegra-MBFL (Suspiciousness), coloured by a 3-D
# KDE density and annotated with distance correlations; the saved PDF is then cropped.
for formula_index, formula in enumerate(flra.Formula):
    param = {
        "Kill Type": ["kill_type3"],
        "Approach": ["FACombination"],
        "Mutation Type": ["TraditionalMutation", "NeuralMutation", "MergeSus"],
        "Mutation Method": ["major", "mBERT", "SusDRankAvg"],
        "Aggregation": ["max"],
        "Formula": [formula],
        "Tie Break": ["Avg"]
    }

    exam_summary_dfs = flra.compare_exam_summary_by_param(param, "Mutation Method", drop_rank=False)
    exam_columns = [col for col in exam_summary_dfs.columns if col.startswith('EXAM_')]
    # Missing EXAM values are replaced by 1 before the analysis.
    exam_summary_dfs[exam_columns] = exam_summary_dfs[exam_columns].fillna(value=1)

    x = exam_summary_dfs['EXAM_mBERT'].values
    y = exam_summary_dfs['EXAM_major'].values
    z = exam_summary_dfs['EXAM_SusDRankAvg'].values

    # Pearson correlations (printed only).
    corr_mbert_susdrank = exam_summary_dfs['EXAM_SusDRankAvg'].corr(exam_summary_dfs['EXAM_mBERT'], method='pearson')
    corr_major_susdrank = exam_summary_dfs['EXAM_SusDRankAvg'].corr(exam_summary_dfs['EXAM_major'], method='pearson')
    print(f'The number of point include in the analysis: {exam_summary_dfs.shape[0]}')
    print(f"Correlation between EXAM_mBERT and EXAM_SusDRankAvg: {corr_mbert_susdrank:.2f}")
    print(f"Correlation between EXAM_major and EXAM_SusDRankAvg: {corr_major_susdrank:.2f}")
    
    # Distance correlations: each single technique vs. SusDRankAvg, and the
    # (mBERT, major) pair jointly vs. SusDRankAvg.
    distance_corr_mbert_susdrank = dcor.distance_correlation(exam_summary_dfs['EXAM_SusDRankAvg'], exam_summary_dfs['EXAM_mBERT'])
    distance_corr_major_susdrank = dcor.distance_correlation(exam_summary_dfs['EXAM_SusDRankAvg'], exam_summary_dfs['EXAM_major'])
    distance_corr_mbertmajor_susdrank = dcor.distance_correlation(
        exam_summary_dfs[['EXAM_mBERT', 'EXAM_major']].values, 
        exam_summary_dfs['EXAM_SusDRankAvg'].values
    )
    print(f"Distance Correlation between EXAM_mBERT and EXAM_SusDRankAvg: {distance_corr_mbert_susdrank:.4f}")
    print(f"Distance Correlation between EXAM_major and EXAM_SusDRankAvg: {distance_corr_major_susdrank:.4f}")
    print(f"Distance Correlation between EXAM_mBERT with EXAM_major and EXAM_SusDRankAvg: {distance_corr_mbertmajor_susdrank:.4f}")
    
    
    correlation_matrix = exam_summary_dfs[['EXAM_mBERT', 'EXAM_major', 'EXAM_SusDRankAvg']].corr(method='pearson')
    print(correlation_matrix)

    # 3-D KDE over the three EXAM scores; evaluated at the data points for colouring.
    data = np.vstack([x, y, z])
    kde = stats.gaussian_kde(data)

    # NOTE(review): xi, yi, zi_levels and zi_values are not used anywhere in
    # this cell - possibly leftovers; confirm before removing.
    xi, yi = np.mgrid[0:1:100j, 0:1:100j]
    zi_levels = 5
    zi_values = [i / zi_levels for i in range(zi_levels + 1)]  

    fig = plt.figure(figsize=(30, 30))
    ax = fig.add_subplot(111, projection='3d')
    
    # NOTE(review): custom_cmap is not defined in any cell visible here - it
    # must exist in the kernel before this cell runs; confirm where it is created.
    reversed_cmap = custom_cmap.reversed()  
    
    scatter = ax.scatter(x, y, z, s=100, c=kde(data), cmap=reversed_cmap, alpha=1)
    cbar = fig.colorbar(scatter, ax=ax, pad=0.1, shrink=0.5)
    cbar.set_label('Density', fontsize=label_fontsize)
    cbar.ax.tick_params(axis='both', labelsize=label_fontsize)

    # Axis labels, with slightly different padding per axis.
    label_pad_var = 20
    ax.set_xlabel('EXAM of Neural-MBFL', fontsize=label_fontsize, labelpad=label_pad_var+4)
    ax.set_ylabel('EXAM of Traditional-MBFL', fontsize=label_fontsize, labelpad=label_pad_var+2)
    ax.set_zlabel('EXAM of NeuraIntegra-MBFL$_{Suspiciousness}$', fontsize=label_fontsize, labelpad=label_pad_var)
    ax.set_xlim(-0.02, 1.02)
    ax.set_ylim(-0.02, 1.02)
    ax.set_zlim(-0.02, 1.02)
    ax.view_init(elev=15, azim=240)
    
    ax.tick_params(axis='both', labelsize=label_fontsize)
    ax.zaxis.set_ticks_position('lower')
    ax.zaxis.set_label_position('lower')
    
    # Annotation box; the runs of spaces inside the literals align the three
    # rows visually and are part of the rendered text.
    textstr = (
        r"                       $\bf{Distance\ Correlation\ Analysis\ of\ EXAM}$" + "\n"
        "NeuraIntegra-MBFL$_{Suspiciousness}$ and                                    Neural-MBFL" + f": {distance_corr_mbert_susdrank:.4f}\n"
        "NeuraIntegra-MBFL$_{Suspiciousness}$ and                             Traditional-MBFL" + f": {distance_corr_major_susdrank:.4f}\n"
        "NeuraIntegra-MBFL$_{Suspiciousness}$ and (Neural-MBFL, Traditional-MBFL)" + f": {distance_corr_mbertmajor_susdrank:.4f}"
    )

    ax.text2D(0.045, 1.00, textstr, transform=ax.transAxes, fontsize=label_fontsize+8,
              verticalalignment='top', bbox=dict(boxstyle='round,pad=0.2', facecolor='white', edgecolor='black', alpha=0.5))


    plt.grid(True)
    # Save the raw figure first, then write a cropped copy (left and bottom margins trimmed).
    input_pdf = f'./Results/CorrAnalysis/CorrAnalysis_3D_{formula}_ori.pdf'
    output_pdf = f'./Results/CorrAnalysis/CorrAnalysis_3D_{formula}.pdf'
    plt.savefig(input_pdf, bbox_inches='tight', pad_inches=0.2)
    
    crop_pdf(input_pdf, output_pdf, left=50, right=0, top=0, bottom=140)

### **Statistical Analysis**

In [None]:
import pandas as pd
from scipy.stats import wilcoxon
from StatisticAnalysis import summary_statistics, cliffs_delta

In [None]:
# Create the result analyst and set the version recorded for its dataset to "v2.0".
flra = FLResultAnalyst()
flra.DatasetVersion[flra.dataset]="v2.0"

In [None]:
# Display label for each EXAM column; the keys are the mutation-method
# names prefixed with "EXAM_".
TechniqueNameMap = {
    f"EXAM_{method}": label
    for method, label in [
        ("major", "Traditional-MBFL"),
        ("mBERT", "Neural-MBFL"),
        ("major_SmBERT", "NeuraIntegra-MBFL$_{Mutation}^{Traditional-Center}$"),
        ("mBERT_Smajor", "NeuraIntegra-MBFL$_{Mutation}^{Neural-Center}$"),
        ("U_mBERT_major", "NeuraIntegra-MBFL$_{Mutation}^{Union}$"),
        ("SusDRankAvg", "NeuraIntegra-MBFL$_{Suspiciousness}$"),
    ]
}

In [None]:
# Wilcoxon signed-rank tests and Cliff's delta comparing the primary technique
# (first entry of the reordered EXAM columns) with each of the other five,
# per formula, restricted to the low-EXAM region (primary EXAM <= 0.2).
columns = ['Formula', 'Comparison', 
           'P-value (two-sided)', 'P-value (one-sided less)', 'P-value (one-sided greater)', 
           'Cliff\'s Delta']
# One dict per comparison; the frame is built once after the loop instead of
# being grown with pd.concat on every iteration.
result_records = []

for formula_index, formula in enumerate(flra.Formula):
    param = {
        "Kill Type": ["kill_type3"],
        "Approach": ["FACombination"],
        "Mutation Type": ["MergeSus", "MergeMutation", "NeuralMutation", "TraditionalMutation"],
        "Mutation Method": ["major", "mBERT", "major_SmBERT", "mBERT_Smajor", "U_mBERT_major", "SusDRankAvg"],
        "Aggregation": ["max"],
        "Formula": [formula],
        "Tie Break": ["Avg"]
    }

    exam_summary_dfs = flra.compare_exam_summary_by_param(param, "Mutation Method", drop_rank=False)
    exam_columns = [col for col in exam_summary_dfs.columns if col.startswith('EXAM_')]
    # NOTE(review): these indices depend on the column order returned by
    # compare_exam_summary_by_param - confirm which technique ends up first.
    exam_columns = [exam_columns[i] for i in [0, 5, 4, 1, 2, 3]]
    # Missing EXAM values are replaced by 1 before testing.
    exam_summary_dfs[exam_columns] = exam_summary_dfs[exam_columns].fillna(value=1)

    primary_data_name = exam_columns[0]
    primary_exam_data = exam_summary_dfs[primary_data_name].to_numpy()

    for data_name in exam_columns[1:]:
        exam_data = exam_summary_dfs[data_name].to_numpy()

        print('-'*15 + formula + '-'*15)
        print(f"{TechniqueNameMap[primary_data_name]} v.s. {TechniqueNameMap[data_name]}")
        print(f"{len(primary_exam_data)}, {len(exam_data)}")

        if len(primary_exam_data) == len(exam_data):
            # Keep the primary technique's values up to the 0.2 threshold and
            # the same number of smallest values of the other technique.
            # NOTE(review): both arrays are sorted independently, so the
            # signed-rank test pairs values by rank rather than by fault -
            # confirm this is the intended design.
            sorted_primary_exam_data = np.sort(primary_exam_data)
            filtered_primary_exam_data = sorted_primary_exam_data[sorted_primary_exam_data <= 0.2]

            filtered_length = len(filtered_primary_exam_data)

            sorted_exam_data = np.sort(exam_data)
            filtered_exam_data = sorted_exam_data[:filtered_length]

            p_value = wilcoxon(filtered_primary_exam_data, filtered_exam_data).pvalue
            p_value_less = wilcoxon(filtered_primary_exam_data, filtered_exam_data, alternative='less').pvalue
            p_value_greater = wilcoxon(filtered_primary_exam_data, filtered_exam_data, alternative='greater').pvalue
            effective_size = cliffs_delta(filtered_primary_exam_data, filtered_exam_data)

            comparison = f"{TechniqueNameMap[primary_data_name]} v.s. {TechniqueNameMap[data_name]}"

            result_records.append({
                'Formula': formula,
                'Comparison': comparison,
                'P-value (two-sided)': f"{p_value:.4E}",
                'P-value (one-sided less)': f"{p_value_less:.4E}",
                'P-value (one-sided greater)': f"{p_value_greater:.4E}",
                'Cliff\'s Delta': f"{effective_size:.4f}",
            })

            outputs = [
                f"P-value for two-sided test: {p_value:.4E}",
                f"P-value for one-sided test (less): {p_value_less:.4E}",
                f"P-value for one-sided test (greater): {p_value_greater:.4E}",
                f"Cliff's Delta (effect size): {effective_size:.4f}",
            ]

            output_results = "\n".join(outputs)
            print(output_results)
        else:
            print("Warning: data length is not same.")

# Passing `columns` keeps the header and column order even when no comparison ran.
results_df = pd.DataFrame(result_records, columns=columns)
results_df.to_csv('./Results/StatisticAnalysis_6_threshold.csv', index=False)

In [None]:
# Wilcoxon signed-rank test and Cliff's delta for a single pair of techniques
# (EXAM columns 4 and 5 of the summary), per formula, restricted to the
# low-EXAM region (primary EXAM <= 0.2).
columns = ['Formula', 'Comparison', 
           'P-value (two-sided)', 'P-value (one-sided less)', 'P-value (one-sided greater)', 
           'Cliff\'s Delta']
# One dict per comparison; the frame is built once after the loop instead of
# being grown with pd.concat on every iteration.
result_records = []

for formula_index, formula in enumerate(flra.Formula):
    param = {
        "Kill Type": ["kill_type3"],
        "Approach": ["FACombination"],
        "Mutation Type": ["MergeSus", "MergeMutation", "NeuralMutation", "TraditionalMutation"],
        "Mutation Method": ["major", "mBERT", "major_SmBERT", "mBERT_Smajor", "U_mBERT_major", "SusDRankAvg"],
        "Aggregation": ["max"],
        "Formula": [formula],
        "Tie Break": ["Avg"]
    }

    exam_summary_dfs = flra.compare_exam_summary_by_param(param, "Mutation Method", drop_rank=False)
    exam_columns = [col for col in exam_summary_dfs.columns if col.startswith('EXAM_')]
    # NOTE(review): these indices depend on the column order returned by
    # compare_exam_summary_by_param - confirm which two techniques they select.
    exam_columns = [exam_columns[i] for i in [4, 5]]
    # Missing EXAM values are replaced by 1 before testing.
    exam_summary_dfs[exam_columns] = exam_summary_dfs[exam_columns].fillna(value=1)

    primary_data_name = exam_columns[0]
    primary_exam_data = exam_summary_dfs[primary_data_name].to_numpy()

    for data_name in exam_columns[1:]:
        exam_data = exam_summary_dfs[data_name].to_numpy()

        print('-'*15 + formula + '-'*15)
        print(f"{TechniqueNameMap[primary_data_name]} v.s. {TechniqueNameMap[data_name]}")
        print(f"{len(primary_exam_data)}, {len(exam_data)}")

        if len(primary_exam_data) == len(exam_data):
            # Keep the primary technique's values up to the 0.2 threshold and
            # the same number of smallest values of the other technique.
            # NOTE(review): both arrays are sorted independently, so the
            # signed-rank test pairs values by rank rather than by fault -
            # confirm this is the intended design.
            sorted_primary_exam_data = np.sort(primary_exam_data)
            filtered_primary_exam_data = sorted_primary_exam_data[sorted_primary_exam_data <= 0.2]

            filtered_length = len(filtered_primary_exam_data)

            sorted_exam_data = np.sort(exam_data)
            filtered_exam_data = sorted_exam_data[:filtered_length]

            p_value = wilcoxon(filtered_primary_exam_data, filtered_exam_data).pvalue
            p_value_less = wilcoxon(filtered_primary_exam_data, filtered_exam_data, alternative='less').pvalue
            p_value_greater = wilcoxon(filtered_primary_exam_data, filtered_exam_data, alternative='greater').pvalue
            effective_size = cliffs_delta(filtered_primary_exam_data, filtered_exam_data)
            comparison = f"{TechniqueNameMap[primary_data_name]} v.s. {TechniqueNameMap[data_name]}"

            result_records.append({
                'Formula': formula,
                'Comparison': comparison,
                'P-value (two-sided)': f"{p_value:.4E}",
                'P-value (one-sided less)': f"{p_value_less:.4E}",
                'P-value (one-sided greater)': f"{p_value_greater:.4E}",
                'Cliff\'s Delta': f"{effective_size:.4f}",
            })

            outputs = [
                f"P-value for two-sided test: {p_value:.4E}",
                f"P-value for one-sided test (less): {p_value_less:.4E}",
                f"P-value for one-sided test (greater): {p_value_greater:.4E}",
                f"Cliff's Delta (effect size): {effective_size:.4f}",
            ]

            output_results = "\n".join(outputs)
            print(output_results)
        else:
            print("Warning: data length is not same.")

# Write the results to a CSV file; passing `columns` keeps the header and
# column order even when no comparison ran.
results_df = pd.DataFrame(result_records, columns=columns)
results_df.to_csv('./Results/StatisticAnalysis_2_threshold.csv', index=False)

## **Overlap Analysis**

In [None]:
# Hex palettes for the overlap plots: two Tighnari colours plus one Nilou
# colour; palette_2 is the first two of them.
palette_Tighnari, palette_Nilou = (
    GIColorPalette.get_palette(palette_name, format="hex")
    for palette_name in ('Tighnari', 'Nilou')
)
palette_3 = [palette_Tighnari[idx] for idx in (2, 5)]
palette_3.append(palette_Nilou[4])
palette_2 = palette_3[:2]

In [None]:
import pandas as pd

def get_MaxRankFaults(exam_summary_dfs, MaxRank, Rank_colname):
    """Collect identifiers of the faults that one technique ranks within ``MaxRank``.

    Args:
        exam_summary_dfs: DataFrame providing the ``Project``, ``Version`` and
            ``faulty_entity`` columns plus the rank column named by ``Rank_colname``.
        MaxRank: Inclusive upper bound; rows whose rank is ``<= MaxRank`` are kept.
        Rank_colname: Name of the rank column to filter on (e.g. ``'Rank_major'``).

    Returns:
        set: One ``"<Project>_<Version>_<faulty_entity>"`` string per kept row
        (identical rows collapse because a set is returned). The input frame is
        not modified.
    """
    within_rank = exam_summary_dfs[Rank_colname] <= MaxRank
    id_parts = exam_summary_dfs.loc[within_rank, ['Project', 'Version', 'faulty_entity']].astype(str)
    fault_ids = id_parts['Project'] + '_' + id_parts['Version'] + '_' + id_parts['faulty_entity']
    return set(fault_ids)

In [None]:
import matplotlib.pyplot as plt
from matplotlib_venn import venn2, venn3
# Typography for the overlap (Venn) figures drawn in the cells below.
plt.rcParams.update({'font.family': 'Times New Roman'})

# `title_fontsize` is used for the per-formula caption, `label_fontsize` for set/region labels.
title_fontsize, label_fontsize = 60, 40

In [None]:
# Fresh analyst instance used by the overlap-analysis cells below.
flra = FLResultAnalyst()
# Set the version recorded for the analyst's current dataset to "v2.0".
# NOTE(review): presumably this selects which result set FLResultAnalyst reads — confirm in its source.
flra.DatasetVersion[flra.dataset]="v2.0"

In [None]:
# Display name of each technique, keyed by the name of its rank column.
# The `$...$` parts are math markup for sub/superscripts in rendered figure text.
TechniqueNameMap = dict([
    ("Rank_major", "Traditional-MBFL"),
    ("Rank_mBERT", "Neural-MBFL"),
    ("Rank_major_SmBERT", "NeuraIntegra-MBFL$_{Mutation}^{Traditional-Center}$"),
    ("Rank_mBERT_Smajor", "NeuraIntegra-MBFL$_{Mutation}^{Neural-Center}$"),
    ("Rank_U_mBERT_major", "NeuraIntegra-MBFL$_{Mutation}^{Union}$"),
    ("Rank_SusDRankAvg", "NeuraIntegra-MBFL$_{Suspiciousness}$"),
])

In [None]:
# Three-set overlap analysis.
# For every Top-N cut-off, draw one Venn diagram per formula comparing the faults ranked
# within the cut-off by mBERT, major and SusDRankAvg, and save the row of diagrams as one PDF.

# Region ids follow the venn3 convention: one digit per set, in the order the sets are
# passed (here mBERT, major, SusDRankAvg); '1' means "inside that set".
venn3_region_ids = ['100', '010', '001', '110', '101', '011', '111']

# Hand-tuned positions for the region count labels; '110' keeps its default position.
venn3_label_positions = {
    '100': (-0.5, 0.22),
    '010': (0.5, 0.22),
    '101': (-0.35, -0.05),
    '111': (0.08, -0.05),
    '011': (0.43, -0.05),
    '001': (0.08, -0.48),
}

# Face colours of the three "exclusive" regions.
venn3_region_colors = {'100': palette_3[0], '010': palette_3[1], '001': palette_3[2]}

# savefig does not create missing directories.
Path("./Results/OverlapAnalysis").mkdir(parents=True, exist_ok=True)

for MaxRank in [1, 3, 5]:
    # One panel per formula; squeeze=False keeps `axs` an array even for a single formula.
    venn3_formulas = list(flra.Formula)
    fig, axs = plt.subplots(1, len(venn3_formulas), figsize=(10 * len(venn3_formulas), 10), squeeze=False)
    axs = axs.flatten()

    for formula_index, formula in enumerate(venn3_formulas):
        ax = axs[formula_index]
        param = {
            "Kill Type": ["kill_type3"],
            "Approach": ["FACombination"],
            "Mutation Type": ["MergeSus", "MergeMutation", "NeuralMutation", "TraditionalMutation"],
            "Mutation Method": ["major", "mBERT", "major_SmBERT", "mBERT_Smajor", "U_mBERT_major", "SusDRankAvg"],
            "Aggregation": ["max"],
            "Formula": [formula],
            "Tie Break": ["Avg"]
        }

        exam_summary_dfs = flra.compare_exam_summary_by_param(param, "Mutation Method", drop_rank=False)

        major_set = get_MaxRankFaults(exam_summary_dfs, MaxRank, Rank_colname='Rank_major')
        mBERT_set = get_MaxRankFaults(exam_summary_dfs, MaxRank, Rank_colname='Rank_mBERT')
        SusDRankAvg_set = get_MaxRankFaults(exam_summary_dfs, MaxRank, Rank_colname='Rank_SusDRankAvg')

        venn_diagram = venn3(
            [mBERT_set, major_set, SusDRankAvg_set],
            set_labels=(TechniqueNameMap["Rank_mBERT"], TechniqueNameMap["Rank_major"], TechniqueNameMap["Rank_SusDRankAvg"]),
            alpha=0.5,
            ax=ax
        )

        # Annotate every existing region with "count (share of the union)".
        # Label and patch lookups yield None for regions that are not drawn, so each
        # access is guarded instead of assuming all seven regions exist.
        total = len(major_set | mBERT_set | SusDRankAvg_set)
        for region in venn3_region_ids:
            label = venn_diagram.get_label_by_id(region)
            if label is None:
                continue
            label.set_fontsize(label_fontsize)
            count = int(label.get_text())
            percentage = (count / total) * 100 if total else 0.0
            label.set_text(f"{count}\n({percentage:.1f}%)")
            if region in venn3_label_positions:
                label.set_position(venn3_label_positions[region])

        for region, face_color in venn3_region_colors.items():
            patch = venn_diagram.get_patch_by_id(region)
            if patch is not None:
                patch.set_facecolor(face_color)
                patch.set_alpha(0.7)

        # The first two set-label objects are deliberately given each other's position AND
        # text: the caption drawn at x > 0 reads Traditional-MBFL (same side as the '010'
        # count) and the one at x < 0 reads Neural-MBFL (same side as the '100' count).
        # NOTE(review): presumably done to reuse each label object's default text
        # alignment — confirm before simplifying.
        positions = [(0.7, 0.53), (-0.7, 0.53), (0, -0.63)]
        for label, pos, label_text in zip(venn_diagram.set_labels, positions, (TechniqueNameMap["Rank_major"], TechniqueNameMap["Rank_mBERT"], TechniqueNameMap["Rank_SusDRankAvg"])):
            label.set_fontsize(label_fontsize)
            label.set_position(pos)
            label.set_text(label_text)

        # Re-enable the axes frame, write the formula name as an in-panel caption and fix
        # the view limits so every panel has the same framing.
        ax.set_axis_on()
        ax.text(0.5, 0.1, formula, fontsize=title_fontsize, ha='center', va='top', transform=ax.transAxes)
        ax.set_xlim(-0.73, 0.73)
        ax.set_ylim(-0.9, 0.63)

        for spine in ax.spines.values():
            spine.set_edgecolor('gray')
            spine.set_linewidth(3)

    plt.tight_layout(w_pad=2.5)
    plt.savefig(f"./Results/OverlapAnalysis/Venn_3_Top_{MaxRank}.pdf", format='pdf')
    plt.show()
    # Release the figure so the large canvases do not accumulate across cut-offs.
    plt.close(fig)

In [None]:
# Two-set overlap analysis.
# For every Top-N cut-off, draw one Venn diagram per formula comparing the faults ranked
# within the cut-off by mBERT and by major, and save the row of diagrams as one PDF.

# Region ids follow the venn2 convention: one digit per set, in the order the sets are
# passed (here mBERT, major); '1' means "inside that set".
venn2_label_positions = {
    '10': (-0.45, 0),
    '01': (0.48, 0),
    '11': (0.08, 0),
}

# Face colours of the two "exclusive" regions.
venn2_region_colors = {'10': palette_2[0], '01': palette_2[1]}

# savefig does not create missing directories.
Path("./Results/OverlapAnalysis").mkdir(parents=True, exist_ok=True)

for MaxRank in [1, 3, 5]:
    # One panel per formula; squeeze=False keeps `axs` an array even for a single formula.
    venn2_formulas = list(flra.Formula)
    fig, axs = plt.subplots(1, len(venn2_formulas), figsize=(10 * len(venn2_formulas), 10), squeeze=False)
    axs = axs.flatten()

    for formula_index, formula in enumerate(venn2_formulas):
        ax = axs[formula_index]
        param = {
            "Kill Type": ["kill_type3"],
            "Approach": ["FACombination"],
            "Mutation Type": ["MergeSus", "MergeMutation", "NeuralMutation", "TraditionalMutation"],
            "Mutation Method": ["major", "mBERT", "major_SmBERT", "mBERT_Smajor", "U_mBERT_major", "SusDRankAvg"],
            "Aggregation": ["max"],
            "Formula": [formula],
            "Tie Break": ["Avg"]
        }

        exam_summary_dfs = flra.compare_exam_summary_by_param(param, "Mutation Method", drop_rank=False)

        major_set = get_MaxRankFaults(exam_summary_dfs, MaxRank, Rank_colname='Rank_major')
        mBERT_set = get_MaxRankFaults(exam_summary_dfs, MaxRank, Rank_colname='Rank_mBERT')

        venn_diagram = venn2(
            [mBERT_set, major_set],
            set_labels=(TechniqueNameMap["Rank_mBERT"], TechniqueNameMap["Rank_major"]),
            alpha=0.5,
            ax=ax
        )

        # Annotate every existing region with "count (share of the union)".
        # Label and patch lookups yield None for regions that are not drawn (e.g. an empty
        # intersection), so each access is guarded instead of assuming all regions exist.
        total = len(major_set | mBERT_set)
        for region, label_position in venn2_label_positions.items():
            label = venn_diagram.get_label_by_id(region)
            if label is None:
                continue
            label.set_fontsize(label_fontsize + 4)
            count = int(label.get_text())
            percentage = (count / total) * 100 if total else 0.0
            label.set_text(f"{count}\n({percentage:.2f}%)")
            label.set_position(label_position)

        for region, face_color in venn2_region_colors.items():
            patch = venn_diagram.get_patch_by_id(region)
            if patch is not None:
                patch.set_facecolor(face_color)
                patch.set_alpha(0.7)

        # The two set-label objects are deliberately given each other's position AND text:
        # the caption drawn at x > 0 reads Traditional-MBFL (same side as the '01' count)
        # and the one at x < 0 reads Neural-MBFL (same side as the '10' count).
        # NOTE(review): presumably done to reuse each label object's default text
        # alignment — confirm before simplifying.
        positions = [(0.68, -0.5), (-0.68, -0.5)]
        for label, pos, label_text in zip(venn_diagram.set_labels, positions, (TechniqueNameMap["Rank_major"], TechniqueNameMap["Rank_mBERT"])):
            label.set_fontsize(label_fontsize + 4)
            label.set_position(pos)
            label.set_text(label_text)

        # Re-enable the axes frame, write the formula name as an in-panel caption and fix
        # the view limits so every panel has the same framing.
        ax.set_axis_on()
        ax.text(0.5, 0.1, formula, fontsize=title_fontsize, ha='center', va='top', transform=ax.transAxes)
        ax.set_xlim(-0.68, 0.68)
        ax.set_ylim(-0.8, 0.5)

        for spine in ax.spines.values():
            spine.set_edgecolor('gray')
            spine.set_linewidth(3)

    plt.tight_layout(w_pad=7.5)
    plt.savefig(f"./Results/OverlapAnalysis/Venn_2_Top_{MaxRank}.pdf", format='pdf')
    plt.show()
    # Release the figure so the large canvases do not accumulate across cut-offs.
    plt.close(fig)

## **Repair Pattern Analysis**

In [None]:
# Hex colour palettes for the repair-pattern charts; `palette_2` (the first two entries
# of `palette_3`) colours the two stacked bar series.
palette_Tighnari, palette_Nilou = [
    GIColorPalette.get_palette(palette_name, format="hex")
    for palette_name in ('Tighnari', 'Nilou')
]
palette_3 = [palette_Tighnari[2], palette_Tighnari[5]] + [palette_Nilou[4]]
palette_2 = palette_3[:2]

In [None]:
def count_patterns(localized_bugs, repair_patterns):
    """Tally repair patterns over a collection of localized bugs.

    Args:
        localized_bugs: Iterable of strings whose first two ``'_'``-separated fields
            are the project and the bug id; any further fields are ignored.
        repair_patterns: Mapping from ``"<project>_<bug_id>"`` to an iterable of
            pattern names. Bugs without an entry are skipped.

    Returns:
        tuple: ``(pattern_count, pattern_bugs)`` where ``pattern_count`` maps each
        pattern to the number of times it was encountered and ``pattern_bugs`` maps
        each pattern to the list of ``"<project>_<bug_id>"`` keys that carried it.

    Raises:
        IndexError: If an entry of ``localized_bugs`` contains no ``'_'``.
    """
    pattern_count = {}
    pattern_bugs = {}
    for bug in localized_bugs:
        fields = bug.split('_')
        key = f'{fields[0]}_{fields[1]}'
        for pattern in repair_patterns.get(key, ()):
            pattern_count[pattern] = pattern_count.get(pattern, 0) + 1
            pattern_bugs.setdefault(pattern, []).append(key)
    return pattern_count, pattern_bugs

In [None]:
# Load the repair annotations and index each record's pattern list by "<project>_<bugId>",
# the key format that count_patterns looks up.
with (D4J / "RepairInfo.json").open(mode="r") as fp:
    repair_json = json.load(fp)
repair_patterns = {
    f"{bug_info['project']}_{bug_info['bugId']}": bug_info['repairPatterns']
    for bug_info in repair_json
}

In [None]:
# NOTE(review): unlike the Overlap Analysis cell, this fresh analyst does not set
# flra.DatasetVersion[flra.dataset] = "v2.0" — confirm the default version is intended here.
flra = FLResultAnalyst()

# formula -> set of "<Project>_<Version>" bugs ranked within MaxRank by each technique.
mBERT_localized_bugs = {}
major_localized_bugs = {}

# Top-N cut-off: a bug counts as localized when its rank is <= MaxRank.
MaxRank = 5

for formula in flra.Formula:
    param = {
        "Kill Type": ["kill_type3"],
        "Approach": ["FACombination"],
        "Mutation Type": ["NeuralMutation", "TraditionalMutation"],
        "Mutation Method": ["mBERT", "major"],
        "Aggregation": ["max"],
        "Formula": [formula],
        "Tie Break": ["Avg"]
    }
    exam_summary_dfs = flra.compare_exam_summary_by_param(param, "Mutation Method", drop_rank=False)

    rank_columns = [col for col in exam_summary_dfs.columns if col.startswith('Rank_')]

    # Build the bug identifier column-wise rather than with a row-wise DataFrame.apply:
    # apply(axis=1) on an empty selection returns an empty DataFrame, and set() of a
    # DataFrame yields its column names instead of an empty set.
    bug_ids = exam_summary_dfs['Project'].astype(str) + '_' + exam_summary_dfs['Version'].astype(str)

    localized_bug_MaxRank_sets = {
        column: set(bug_ids[exam_summary_dfs[column] <= MaxRank])
        for column in rank_columns
    }

    mBERT_localized_bugs[formula] = localized_bug_MaxRank_sets['Rank_mBERT']
    major_localized_bugs[formula] = localized_bug_MaxRank_sets['Rank_major']

# Bugs that exactly one of the two techniques localizes, per formula.
mBERT_unique_bugs = {
    formula_name: mBERT_localized_bugs[formula_name] - major_localized_bugs[formula_name]
    for formula_name in mBERT_localized_bugs
}
major_unique_bugs = {
    formula_name: major_localized_bugs[formula_name] - mBERT_localized_bugs[formula_name]
    for formula_name in mBERT_localized_bugs
}

# Add two aggregate entries computed over the per-formula sets only: "Union" (unique under
# at least one formula) and "Intersection" (unique under every formula). Both are derived
# from a snapshot taken before the aggregate keys are inserted, and both fall back to an
# empty set when there are no formulas at all.
for unique_bugs in (mBERT_unique_bugs, major_unique_bugs):
    per_formula_sets = list(unique_bugs.values())
    unique_bugs["Union"] = set().union(*per_formula_sets)
    unique_bugs["Intersection"] = set.intersection(*per_formula_sets) if per_formula_sets else set()

In [None]:
import matplotlib.pyplot as plt

# Typography for the repair-pattern bar charts. These assignments rebind the same
# `title_fontsize` / `label_fontsize` names that the Venn-diagram cells read.
plt.rcParams.update({'font.family': 'Times New Roman'})

title_fontsize = label_fontsize = 36

In [None]:
# Repair-pattern comparison.
# For every key of `mBERT_unique_bugs` (the per-formula entries plus the "Union" and
# "Intersection" aggregates), count the repair patterns of the bugs that only one technique
# localizes and draw a 100%-stacked horizontal bar chart of each technique's share per pattern.

# savefig does not create missing directories.
Path("./Results/RepairPattern").mkdir(parents=True, exist_ok=True)

for formula in mBERT_unique_bugs.keys():
    mBERT_pattern_count, mBERT_pattern_bugs = count_patterns(mBERT_unique_bugs[formula], repair_patterns)
    major_pattern_count, major_pattern_bugs = count_patterns(major_unique_bugs[formula], repair_patterns)

    all_patterns = set(mBERT_pattern_count.keys()) | set(major_pattern_count.keys())
    # Fix one ordering up front so the three columns below are built from the same sequence.
    pattern_names = sorted(all_patterns)

    df = pd.DataFrame({
        'Pattern': pattern_names,
        'mBERT': [mBERT_pattern_count.get(pattern, 0) for pattern in pattern_names],
        'major': [major_pattern_count.get(pattern, 0) for pattern in pattern_names]
    })

    # Relative share of each technique per pattern. Every listed pattern was counted at
    # least once for one of the techniques, so 'Total' is never zero.
    df['Total'] = df['mBERT'] + df['major']
    df['mBERT_Rel_Percent'] = df['mBERT'] / df['Total'] * 100
    df['major_Rel_Percent'] = df['major'] / df['Total'] * 100

    # 'Pattern' is the final sort key, which makes the bar order fully deterministic.
    df_sorted_rel = df.sort_values(by=['mBERT_Rel_Percent', 'major_Rel_Percent', 'Pattern'], ascending=[True, False, False])

    fig, ax = plt.subplots(figsize=(16, 16))
    ax.barh(df_sorted_rel['Pattern'], df_sorted_rel['mBERT_Rel_Percent'], color=palette_2[0], label='Neural-MBFL')
    # The second series starts where the first ends, so each bar spans the full 100%.
    ax.barh(df_sorted_rel['Pattern'], df_sorted_rel['major_Rel_Percent'], left=df_sorted_rel['mBERT_Rel_Percent'], color=palette_2[1], label='Traditional-MBFL')

    ax.tick_params(axis='both', labelsize=label_fontsize)
    ax.set_xlabel('Percentage (%)', fontsize=label_fontsize)
    ax.set_ylim(-1, df.shape[0])

    handles, labels = ax.get_legend_handles_labels()
    fig.legend(handles, labels, loc='upper center', ncol=2, bbox_to_anchor=(0.627, 1.015), fontsize=label_fontsize)

    # Leave the top 5% of the canvas free for the figure-level legend.
    plt.tight_layout(rect=[0, 0, 1, 0.95])
    plt.savefig(f"./Results/RepairPattern/RepairPatterns_2_MAX_{formula}.pdf")
    plt.show()
    # Release the figure so figures do not accumulate across iterations.
    plt.close(fig)