### Quantify the degree of changes.
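(1) **Modification Rate**: measures the fraction of characters changed between the original and the adversarial text, computed from a character-level diff and normalized by the average length of the two texts.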

(2) **Semantic Similarity**: measured by BERTScore, to assess how closely the adversarial texts preserve the original semantics.

Set Path

In [None]:
import difflib
import json
import os
import re

from typing import List, Tuple
from bert_score import score as bert_score

# Run configuration: selects which dataset / attack / model result file is
# evaluated and where the computed metrics are written.
DATASET_NAME = "PeerRead"
# Alternative dataset; uncomment to evaluate it instead of the line above.
# DATASET_NAME = "AgentReview"
# Dataset name as used in the results path: "PeerRead" maps to
# "PeerRead_iclr_2017"; any other value falls back to "AgentReview".
DATASET_FULL_NAME = "PeerRead_iclr_2017" if DATASET_NAME == "PeerRead" else "AgentReview"

# Attack and model identifiers; both are used as path components below.
ATTACK_NAME = "DeepWordBug"
MODEL_NAME = "gpt-4o-mini"

# Input JSONL file read by the evaluation step, and the directory that
# receives the metric report.
JSONL_PATH = f"../result_EMNLP/{DATASET_FULL_NAME}/{MODEL_NAME}/{ATTACK_NAME}/{ATTACK_NAME}_ExplainFalse.jsonl"
OUTPUT_DIR = f"../evaluation_EMNLP/result_modification/{DATASET_NAME}/{MODEL_NAME}/"

# Create the output directory (including missing parents); exist_ok=True
# makes this a no-op when it already exists.
os.makedirs(OUTPUT_DIR, exist_ok=True)

Get modification in content

In [None]:
def clean_parsed_text(text):
    """Strip the Unmodifiable markers from ``text`` and normalise its whitespace.

    The ``<UnmodifiableEnd>`` and ``<UnmodifiableStart>`` markers are removed,
    newlines become spaces, every run of whitespace is collapsed to a single
    space, and leading/trailing whitespace is trimmed.
    """
    # Same removal order as before: end marker first, then start marker.
    for marker in ("<UnmodifiableEnd>", "<UnmodifiableStart>"):
        text = text.replace(marker, "")

    flattened = text.replace("\n", " ").strip()
    collapsed = re.sub(r'\s+', ' ', flattened)
    return collapsed.strip()


def get_diff_sentences(original: str, attacked: str, max_diff=3):
    """Return up to ``max_diff`` sentence pairs that differ between two texts.

    Both texts are split into sentences at whitespace that follows one of
    ``. ! ?`` or their full-width forms ``。！？``. The sentences are then
    compared position by position.

    Note: pairing uses ``zip``, so it stops at the end of the shorter sentence
    list; trailing sentences that exist in only one text are not reported.

    Args:
        original: The unmodified text.
        attacked: The adversarially modified text.
        max_diff: Maximum number of differing pairs to return. Zero or a
            negative value yields an empty list.

    Returns:
        A list of ``(original_sentence, attacked_sentence)`` tuples, each
        stripped of surrounding whitespace, in document order.
    """
    # Honour the limit before collecting anything, so max_diff=0 returns no
    # pairs instead of one.
    if max_diff <= 0:
        return []

    # One shared pattern keeps the segmentation of both texts identical.
    sentence_boundary = re.compile(r'(?<=[。！？.!?])\s+')
    original_sents = sentence_boundary.split(original)
    attacked_sents = sentence_boundary.split(attacked)

    diff = []
    for o_sent, a_sent in zip(original_sents, attacked_sents):
        o_sent, a_sent = o_sent.strip(), a_sent.strip()
        if o_sent == a_sent:
            continue
        diff.append((o_sent, a_sent))
        if len(diff) >= max_diff:
            break
    return diff


def compute_char_changed_rate(original: str, attacked: str) -> float:
    """Compute the normalized character-level changed rate between two texts.

    The texts are aligned with ``difflib.SequenceMatcher``. Every non-equal
    opcode (replace / delete / insert) contributes the longer of its two
    spans, and the total is divided by the average length of the two texts.
    Because of that normalization the value can exceed 1.0 (for example, a
    non-empty text compared with an empty one gives 2.0).

    Args:
        original: The unmodified text.
        attacked: The adversarially modified text.

    Returns:
        The changed rate as a float; 0.0 when both texts are empty.
    """
    avg_len = (len(original) + len(attacked)) / 2
    if avg_len == 0:
        return 0.0

    # autojunk must be off for character-level diffs: with the default
    # heuristic, any character that is frequent in a 200+ character text is
    # dropped as a match anchor, which can make nearly identical long texts
    # look completely different. Disabling it is slower on very long inputs.
    matcher = difflib.SequenceMatcher(None, original, attacked, autojunk=False)

    changed_chars = sum(
        max(i2 - i1, j2 - j1)
        for tag, i1, i2, j1, j2 in matcher.get_opcodes()
        if tag != 'equal'
    )
    return changed_chars / avg_len

In [None]:
def evaluate_attack_metrics(pairs: List[Tuple[str, str]], lang='en') -> dict:
    """Average the BERTScore and character-level changed rate over text pairs.

    Args:
        pairs: ``(original, attacked)`` text pairs to evaluate.
        lang: Language code forwarded to ``bert_score``.

    Returns:
        A dict with the keys ``Avg_Char_Changed_Rate``, ``Avg_BERTScore_P``,
        ``Avg_BERTScore_R`` and ``Avg_BERTScore_F1``, each a mean over all
        pairs.

    Raises:
        ValueError: If ``pairs`` is empty, since no average can be computed.
    """
    # Materialise once so a one-shot iterable survives the passes below.
    pairs = list(pairs)
    if not pairs:
        raise ValueError(
            "evaluate_attack_metrics requires at least one (original, attacked) pair"
        )

    originals = [o for o, _ in pairs]
    attacks = [a for _, a in pairs]

    # BERTScore calculation (returns P/R/F1); the attacked texts are passed
    # first and the originals second.
    P, R, F1 = bert_score(attacks, originals, lang=lang, verbose=False)

    # Average char-based changed rate
    changed_rates = [compute_char_changed_rate(o, a) for o, a in pairs]

    return {
        "Avg_Char_Changed_Rate": sum(changed_rates) / len(changed_rates),
        "Avg_BERTScore_P": float(P.mean()),
        "Avg_BERTScore_R": float(R.mean()),
        "Avg_BERTScore_F1": float(F1.mean()),
    }

In [None]:
# Step 1: Read the JSONL results and collect the differing
# (original, attacked) sentence pairs from every record into one flat list.
all_diffs = []
with open(JSONL_PATH, 'r', encoding='utf-8') as f:
    for line in f:
        # Tolerate blank lines (e.g. a trailing empty line), which would
        # otherwise make json.loads fail.
        if not line.strip():
            continue
        data = json.loads(line)

        original = clean_parsed_text(data['original_content'])
        attacked = clean_parsed_text(data['attacked_content'])
        # At most 10 differing sentence pairs are kept per record.
        all_diffs.extend(get_diff_sentences(original, attacked, max_diff=10))

# Without any differing pair there is nothing to average; fail with a clear
# message instead of an obscure error from the metric computation.
if not all_diffs:
    raise ValueError(f"No differing sentence pairs found in {JSONL_PATH}")

# Step 2: Calculate the average character changed rate and BERTScore
result = evaluate_attack_metrics(all_diffs, lang='en')
verbose = (
    f"Avg_Char_Changed_Rate: {result['Avg_Char_Changed_Rate']:.4f}\n"
    f"Avg_BERTScore_P: {result['Avg_BERTScore_P']:.4f}\n"
    f"Avg_BERTScore_R: {result['Avg_BERTScore_R']:.4f}\n"
    f"Avg_BERTScore_F1: {result['Avg_BERTScore_F1']:.4f}\n"
)
print(verbose)

# Step 3: Save the same report that was printed
output_path = os.path.join(OUTPUT_DIR, f"{ATTACK_NAME}_modification_metric.txt")
with open(output_path, 'w', encoding='utf-8') as f:
    f.write(verbose)