In [None]:
import json
import numpy as np
import os

Evaluation Config

In [None]:
# Experiment selection. These three names are interpolated into the input and
# output paths, so together they pick which result file gets evaluated.
# Uncomment one of the alternatives to switch dataset or model.

# Dataset
dataset_name = "PeerRead_iclr_2017"
# dataset_name = "AgentReview"

# Model
model_name = "gpt-4o-mini"
# model_name = "Llama-3.3-70B"
# model_name = "gpt-4o"
# model_name = "Mistral-small-3.1"

# Attack
attack_name = "DeepWordBug"

Path

In [None]:
# Input file: JSON Lines, parsed one JSON object per line by the loading cell.
data_byline_path = f"../result_EMNLP/{dataset_name}/{model_name}/{attack_name}/{attack_name}_ExplainFalse.jsonl"

# Output locations: text/JSONL reports go to output_file_dir, figures to its
# "fig" subdirectory.
output_file_dir = f"result_score/{dataset_name}/{model_name}/{attack_name}/"
output_fig_dir = f"result_score/{dataset_name}/{model_name}/{attack_name}/fig/"

# exist_ok=True keeps the cell re-runnable and avoids the check-then-create
# race of testing os.path.exists() before calling os.makedirs().
for output_dir in (output_file_dir, output_fig_dir):
    os.makedirs(output_dir, exist_ok=True)

Read data

In [None]:
# Load the attack results: one JSON object per non-empty line (JSON Lines).
output_list = []
# Explicit UTF-8 so decoding does not depend on the platform's default locale.
with open(data_byline_path, "r", encoding="utf-8") as f:
    # Iterate the file lazily instead of materialising all lines with readlines().
    for line in f:
        line = line.strip()
        if not line:
            # Skip blank lines (e.g. a trailing empty line at end of file);
            # json.loads("") would raise JSONDecodeError.
            continue
        output_list.append(json.loads(line))

print(f"[RESULT INFO] Dataset: {dataset_name}, Model: {model_name}, Attack: {attack_name}")
print(f"[RESULT INFO] Total number of papers: {len(output_list)} \n")

Success Rate

In [None]:
# Aggregate the per-paper attack results and build the text report
# (`verbose_message`) that is printed here and written to disk later.

if not output_list:
    # Every rate/average below divides by len(output_list). Fail with a clear
    # message instead of a ZeroDivisionError raised inside the report f-string.
    raise ValueError("output_list is empty; there are no attack results to evaluate.")


def _count_tags(tags, polarity):
    """Return how many tag names in `tags` contain the substring `polarity`."""
    return sum(1 for tag in tags if polarity in tag)


success = 0
num_queries = 0
score_shift_total = []
original_score_total, attack_score_total = [], []
t_test_pairs_total = []  # initialised for compatibility; never filled in this cell

original_pos, original_neg = [], []
attack_pos, attack_neg = [], []
pos_shift, neg_shift = [], []

for entry in output_list:
    if entry["attack_success"]:
        success += 1
    num_queries += entry["num_queries"]

    # Element 0 of each output is a mapping whose keys are the predicted tags;
    # only the key names are inspected here.
    tags_original = entry["original_output"][0].keys()
    tags_original_pos = _count_tags(tags_original, "POSITIVE")
    tags_original_neg = _count_tags(tags_original, "NEGATIVE")
    original_pos.append(tags_original_pos)
    original_neg.append(tags_original_neg)

    tags_attack = entry["attacked_output"][0].keys()
    tags_attack_pos = _count_tags(tags_attack, "POSITIVE")
    tags_attack_neg = _count_tags(tags_attack, "NEGATIVE")
    attack_pos.append(tags_attack_pos)
    attack_neg.append(tags_attack_neg)

    # Scores and shifts (shift = attacked - original for the tag counts).
    original_score_total.append(entry["original_score"])
    attack_score_total.append(entry["attacked_score"])
    score_shift_total.append(entry["score_shift"])
    pos_shift.append(tags_attack_pos - tags_original_pos)
    neg_shift.append(tags_attack_neg - tags_original_neg)

score_shift_total = np.array(score_shift_total)

num_papers = len(output_list)

# Adjacent string literals are concatenated by the parser. This replaces the
# backslash line continuations inside one literal, which break silently if any
# whitespace follows a backslash. The resulting text is unchanged.
verbose_message = (
    f"Dataset: {dataset_name}, Model: {model_name}, Attack: {attack_name}\n"
    f"Total number of papers: {num_papers}\n\n"
    "[ASR] : \n"
    f"attack success rate: {success / num_papers:.4f}, total: {num_papers}, success: {success}\n"
    "[Score Prediction] : \n"
    f"average score shift in total: {np.mean(score_shift_total):.4f}\n"
    f"median score shift in total: {np.median(score_shift_total):.4f}\n"
    f"average score of original: {np.mean(original_score_total):.4f}\n"
    f"average score of attacked: {np.mean(attack_score_total):.4f}\n\n"
    "[Review Generation] : \n"
    f"average # of positive tag shift: {np.mean(pos_shift):.4f}\n"
    f"average # of negative tag shift: {np.mean(neg_shift):.4f}\n"
    f"average # of original tag : positive = {np.mean(original_pos):.4f}, negative = {np.mean(original_neg):.4f}\n"
    f"average # of attacked tag : positive = {np.mean(attack_pos):.4f}, negative = {np.mean(attack_neg):.4f}\n"
    f"median # of original tag : positive = {np.median(original_pos):.4f}, negative = {np.median(original_neg):.4f}\n"
    f"median # of attacked tag : positive = {np.median(attack_pos):.4f}, negative = {np.median(attack_neg):.4f}\n\n"
    "[Queries] : \n"
    f"average # of queries for each paper: {num_queries / num_papers:.4f}\n\n"
)

print(verbose_message)

Statistical Test (paired t-test if the score differences are normally distributed; otherwise Wilcoxon signed-rank test)

In [None]:
import math
from scipy.stats import wilcoxon, ttest_rel, shapiro, t

# Paired score differences (attacked - original), one per paper.
differences = [x - y for x, y in zip(attack_score_total, original_score_total)]

# Shapiro-Wilk normality test on the differences; its p-value selects which
# paired test is run below.
stat, p_value_normal = shapiro(differences)

alpha = 0.05  # significance level
confidence_level = 1 - alpha  # confidence level

confidence_interval_result = ""  # string to store confidence interval

# Both tests are one-sided (alternative="greater", attacked scores passed
# first): they ask whether attacked scores are higher than original scores.
if p_value_normal < alpha:
    print("The differences are not normally distributed. Use Wilcoxon Signed-Rank Test.")
    test_type = "Wilcoxon Signed-Rank Test"

    stat_total, p_value_total = wilcoxon(
        np.array(attack_score_total),
        np.array(original_score_total),
        alternative="greater",
    )

    # --- Confidence interval accompanying the Wilcoxon test ---
    # The point estimate is the plain sample median of the paired differences.
    # It is NOT the Hodges-Lehmann estimator (the median of all pairwise Walsh
    # averages), so it is labelled as a median in the report.
    median_diff = np.median(differences)

    # Percentile bootstrap CI for that median. The generator is seeded so the
    # interval is reproducible across runs.
    n_bootstraps = 10000
    rng = np.random.default_rng(seed=42)

    # Convert once instead of letting rng.choice re-convert the list on each
    # of the n_bootstraps iterations.
    differences_arr = np.asarray(differences)
    medians = []
    for _ in range(n_bootstraps):
        sample = rng.choice(differences_arr, size=differences_arr.size, replace=True)
        medians.append(np.median(sample))

    ci_lower = np.percentile(medians, 100 * (alpha / 2))
    ci_upper = np.percentile(medians, 100 * (1 - alpha / 2))

    # The CI level is derived from alpha rather than hard-coded as "95%".
    confidence_interval_result = (
        f"Median of paired differences: {median_diff}"
        f"\nBootstrap {confidence_level*100:.0f}% CI for the median: ({ci_lower:.4f}, {ci_upper:.4f})"
    )

else:
    print("The differences are normally distributed. Use Paired T-Test.")
    test_type = "Paired T-Test"

    stat_total, p_value_total = ttest_rel(
        np.array(attack_score_total),
        np.array(original_score_total),
        alternative="greater",
    )

    # --- Confidence interval for Paired T-Test ---
    mean_diff = np.mean(differences)
    std_diff = np.std(differences, ddof=1)  # ddof=1 for sample standard deviation
    n = len(differences)
    se_diff = std_diff / np.sqrt(n)

    df = n - 1
    # The interval is two-sided even though the test is one-sided, hence the
    # 1 - alpha/2 quantile of the t distribution.
    t_critical = t.ppf(1 - alpha / 2, df)

    lower_bound = mean_diff - t_critical * se_diff
    upper_bound = mean_diff + t_critical * se_diff

    confidence_interval_result = (
        f"Confidence Interval for Mean Difference ({confidence_level*100:.0f}%): "
        f"[{lower_bound:.2f}, {upper_bound:.2f}]"
    )

print(f"Test result: test statistic: {stat_total:.2f}, p-value: {p_value_total:.4e}")
print(confidence_interval_result)  # print confidence interval result

stat_verbose = "There is a significant difference in total scores before and after attack." if p_value_total < alpha else "There is no significant difference in total scores before and after attack."
print(stat_verbose)

In [None]:
# Recap the test outcome, then persist the full report to attack_result.txt.
print(f"Test result: test statistic: {stat_total}, p-value: {p_value_total:.4e}")
# Reuse the alpha defined in the statistical-test cell instead of re-hard-coding
# 0.05 here, so the verdict stays consistent with the tests and the confidence
# interval if alpha is changed there.
stat_verbose = "There is a significant difference in total scores before and after attack." if p_value_total < alpha else "There is no significant difference in total scores before and after attack."
print(stat_verbose)
print(confidence_interval_result)


# Explicit UTF-8 so the report's encoding does not depend on the platform locale.
with open(os.path.join(output_file_dir, "attack_result.txt"), "w", encoding="utf-8") as f:
    f.write(verbose_message)
    f.write("[Statistic Test] : \n")
    # `stat` / `p_value_normal` are the Shapiro-Wilk normality-test results.
    f.write(f"Test type : {test_type}, (by shapiro) test statistic:{stat}, p-value:{p_value_normal:.4e}\n")
    f.write(f"{test_type} test result: test statistic: {stat_total}, p-value: {p_value_total:.4e}, significant:{p_value_total<alpha}\n")
    f.write(confidence_interval_result + "\n")  # write confidence interval result to file
    f.write(stat_verbose)

[Optional] Figures and aspect-level analysis

In [None]:
import matplotlib
import matplotlib.pyplot as plt

# Non-interactive backend: figures are only written to disk, never displayed.
matplotlib.use("Agg")

# Box plot of the scores before and after the attack.
fig, ax = plt.subplots()
ax.boxplot([original_score_total, attack_score_total])
# Tick labels are set on the axes rather than through boxplot(labels=...),
# which is deprecated since Matplotlib 3.9 (renamed to tick_labels).
ax.set_xticklabels(["Original", "Attacked"])
ax.set_title("Boxplot of Scores Before and After Attack")
fig.savefig(os.path.join(output_fig_dir, f"{attack_name}_boxplot.png"))
# Close explicitly: with a non-interactive backend pyplot keeps every figure
# alive until it is closed, so repeated runs would accumulate figures.
plt.close(fig)

# Histogram of the score differences (attacked - original).
score_diff = [a - o for o, a in zip(original_score_total, attack_score_total)]
# range() only accepts ints; floor/ceil keeps the integer bin edges unchanged
# for integer scores and avoids a TypeError if scores are stored as floats.
min_diff = int(np.floor(min(score_diff)))
max_diff = int(np.ceil(max(score_diff)))

fig, ax = plt.subplots(figsize=(8, 6))
ax.hist(
    score_diff,
    # One unit-wide bin per integer difference; align="left" centres each
    # bar on its integer value.
    bins=range(min_diff, max_diff + 2),
    edgecolor="black",
    alpha=0.7,
    align="left",
)
# Show a tick at every integer difference.
ax.set_xticks(range(min_diff, max_diff + 1))
ax.set_title("Distribution of Score Differences (Attacked - Original)", fontsize=14)
ax.set_xlabel("Score Difference (Attacked - Original)", fontsize=12)
ax.set_ylabel("Frequency", fontsize=12)
ax.grid(True)
fig.savefig(os.path.join(output_fig_dir, f"{attack_name}_distribution.png"))
plt.close(fig)


# Aspect labels that are counted; any other tag is ignored.
aspects = [
    "NONE",
    "SUMMARY",
    "MOTIVATION POSITIVE",
    "MOTIVATION NEGATIVE",
    "SUBSTANCE POSITIVE",
    "SUBSTANCE NEGATIVE",
    "ORIGINALITY POSITIVE",
    "ORIGINALITY NEGATIVE",
    "SOUNDNESS POSITIVE",
    "SOUNDNESS NEGATIVE",
    "CLARITY POSITIVE",
    "CLARITY NEGATIVE",
    "REPLICABILITY POSITIVE",
    "REPLICABILITY NEGATIVE",
    "MEANINGFUL COMPARISON POSITIVE",
    "MEANINGFUL COMPARISON NEGATIVE",
    "IMPACT POSITIVE",
    "IMPACT NEGATIVE",
]
aspect_labels = {key: 0 for key in aspects}


# --- Aspect label distribution (bar plot) ---

# Count how often each known aspect label occurs among the keys of element 0
# of the attacked output. Underscores in tag names are treated as spaces.
for entry in output_list:
    if "attacked_output" not in entry:
        continue
    for tag in entry["attacked_output"][0].keys():
        tag = tag.replace("_", " ")
        if tag in aspect_labels:
            aspect_labels[tag] += 1

fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(list(aspect_labels.keys()), list(aspect_labels.values()))
ax.set_xlabel("Aspect Label")
ax.set_ylabel("Frequency")
ax.set_title("Aspect Label Distribution")
# Rotate the tick labels so the long aspect names do not overlap.
ax.tick_params(axis="x", labelrotation=90)
fig.tight_layout()
fig.savefig(os.path.join(output_fig_dir, f"{attack_name}_aspect_distribution.png"))
# Close explicitly so figures do not accumulate in pyplot across runs.
plt.close(fig)

# --- Aspect sentiment shift ---
# NOTE (original author): not sure whether this interpretation of the
# aspect sentiment shift is correct.


# Per-paper and average change (attacked - original) of each aspect score.
# Per-paper changes are written as JSON Lines; averages are printed.

os.makedirs(output_file_dir, exist_ok=True)

# Single source of truth for the aspect names used by both accumulators.
aspect_names = [
    "OVERALL",
    "SUBSTANCE",
    "APPROPRIATENESS",
    "COMPARISON",
    "CORRECTNESS",
    "ORIGINALITY",
    "CLARITY",
    "IMPACT",
]

# Running sums across papers; turned into averages at the end of the cell.
avg_aspect_score_changes = dict.fromkeys(aspect_names, 0)

with open(
    os.path.join(output_file_dir, f"{attack_name}_aspect_sentiment_shift.jsonl"),
    "w",
    encoding="utf-8",
) as f:
    for entry in output_list:
        aspect_score_changes = dict.fromkeys(aspect_names, 0)
        if "original_output" in entry and "attacked_output" in entry:
            # Element 1 of each output maps an aspect tag to its score.
            original_output_scores = entry["original_output"][1]
            attacked_output_scores = entry["attacked_output"][1]
            for raw_tag, original_value in original_output_scores.items():
                # Look scores up by the key as stored in the data; only the
                # accumulator key is normalised (underscores -> spaces).
                # An aspect missing from aspect_names still raises KeyError.
                aspect = raw_tag.replace("_", " ")
                change = attacked_output_scores[raw_tag] - original_value
                aspect_score_changes[aspect] += change
                avg_aspect_score_changes[aspect] += change
        output_data = {
            "paper_id": entry["paper_id"],
            "aspect_score_changes": aspect_score_changes,
        }

        json.dump(output_data, f)
        f.write("\n")

print("\naverage aspect sentiment shift :")
for key in avg_aspect_score_changes:
    # Entries without both outputs contribute 0 but still count in the denominator.
    avg_aspect_score_changes[key] /= len(output_list)
    print(f"{key} : {round(avg_aspect_score_changes[key], 6)}")