In [None]:
# Mount Google Drive at /content/drive; the project paths used in this notebook live under it.
from google.colab import drive
drive.mount('/content/drive')


In [None]:
!apt-get install -y tesseract-ocr
!pip install pytesseract
!pip install pytesseract opencv-python-headless

In [3]:
import json
import cv2
import pytesseract
import pandas as pd
from tqdm import tqdm
from pathlib import Path
from collections import Counter
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# CONFIG
# Every dataset path is derived from BASE_PATH so the notebook can be re-pointed in one place.
BASE_PATH = "/content/drive/MyDrive/VLM_Project"
ATTACK_BASE = f"{BASE_PATH}/attack_dataset_all"
ATTACK_META = f"{ATTACK_BASE}/metadata/attacks.json"   # JSON list of attack records
ATTACK_IMG_DIR = f"{ATTACK_BASE}/attacked_images"      # images referenced by the records

# OCR config: Tesseract command-line flags (--oem = OCR engine mode, --psm = page segmentation mode).
TESS_CONFIG = "--oem 3 --psm 6"

In [None]:
from collections import Counter
import cv2
import pytesseract

def ocr_text_adaptive(img, attack_type, bbox=None):
    """Run Tesseract on an image, choosing preprocessing from the attack type.

    Args:
        img: Image array as returned by ``cv2.imread`` (BGR), an already
            single-channel image, or ``None``.
        attack_type: Attack label. Labels ending in ``"strong"`` are OCR'd on
            the plain grayscale image with ``--psm 6``; all others are
            Otsu-binarized and OCR'd with ``--psm 11`` (sparse text).
        bbox: Optional ``(x, y, w, h)`` region to crop before OCR. Values are
            rounded to integers and clamped to the image bounds.

    Returns:
        The recognized text, lower-cased and stripped. An empty string is
        returned when ``img`` is ``None`` or the (cropped) image has no pixels.
    """
    if img is None:
        return ""

    # If a bbox is given (boundary / covert attacks), crop to it first.
    if bbox is not None:
        x, y, w, h = (int(round(v)) for v in bbox)
        img_h, img_w = img.shape[:2]
        # Clamp to the image: a negative start would otherwise be interpreted
        # by NumPy as "count from the end" and silently crop the wrong region.
        x0, y0 = max(x, 0), max(y, 0)
        x1, y1 = min(x + w, img_w), min(y + h, img_h)
        if x1 <= x0 or y1 <= y0:
            # Nothing of the bbox lies inside the image; there is no text to read.
            return ""
        img = img[y0:y1, x0:x1]

    if img.size == 0:
        return ""

    # cvtColor(BGR2GRAY) rejects single-channel input, so pass grayscale through.
    gray = img if img.ndim == 2 else cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

    # Preprocessing and page-segmentation mode depend on the attack type.
    if attack_type.endswith("strong"):
        config = "--oem 3 --psm 6"
        proc = gray
    else:
        # Otsu picks the threshold automatically (the 0 passed here is ignored).
        proc = cv2.threshold(
            gray, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU
        )[1]
        config = "--oem 3 --psm 11"  # sparse text

    text = pytesseract.image_to_string(proc, config=config)
    return text.lower().strip()


# WORD RECALL
def word_recall(expected, recognized):
    """Fraction of expected words (counted with multiplicity) present in the OCR output.

    Both arguments are converted to ``str``, lower-cased and split on
    whitespace. Each expected word is credited at most as many times as it
    occurs in ``recognized``. Returns ``0.0`` when ``expected`` has no words.
    """
    wanted = Counter(str(expected).lower().split())
    total = sum(wanted.values())
    if total == 0:
        return 0.0

    found = Counter(str(recognized).lower().split())
    # Multiset intersection keeps min(expected_count, recognized_count) per word.
    matched = sum((wanted & found).values())
    return matched / total


In [None]:
# LOAD METADATA
# JSON is UTF-8 by specification; state the encoding explicitly instead of
# relying on the platform's default text encoding.
with open(ATTACK_META, "r", encoding="utf-8") as f:
    attack_metadata = json.load(f)

results = []

In [None]:
!ls "/content/drive/MyDrive/VLM_Project/attack_dataset_all/"
!ls "/content/drive/MyDrive/VLM_Project/attack_dataset_all/attacked_images/"


In [8]:
# Show the 'attacked_image' value of the first metadata record (the filename that is
# later joined onto ATTACK_IMG_DIR).
print(attack_metadata[0]['attacked_image'])

attack_0000_transparent_alpha90.jpg


In [None]:
results = []
# Paths cv2 could not load. They still get a row (empty OCR text -> recall 0),
# but are reported below so they are not mistaken for genuine OCR misses.
unreadable_images = []

for record in tqdm(attack_metadata, desc="Stage1 OCR"):
    attacked_path = f"{ATTACK_IMG_DIR}/{record['attacked_image']}"
    # cv2.imread returns None (it does not raise) for a missing or undecodable file.
    img = cv2.imread(attacked_path)
    if img is None:
        unreadable_images.append(attacked_path)

    # Ground truth: the explicit OCR text if present, otherwise the injected prompt.
    expected_text = record.get(
        "expected_ocr_text",
        record.get("injected_prompt", "")
    )

    bbox = record.get("bbox", None)

    recognized_text = ocr_text_adaptive(
        img,
        attack_type=record["attack_type"],
        bbox=bbox
    )

    recall = word_recall(expected_text, recognized_text)

    results.append({
        "attack_id": record.get("image_id", -1),
        "attack_type": record["attack_type"],
        "word_recall": recall
    })

if unreadable_images:
    print(
        f"WARNING: {len(unreadable_images)} of {len(attack_metadata)} images could not be read "
        f"and were scored with word_recall=0 (first: {unreadable_images[0]})"
    )


In [None]:
# One row per processed record: attack_id, attack_type, word_recall.
results_df = pd.DataFrame(results)
# Written under ATTACK_BASE without the DataFrame index column.
records_csv = f"{ATTACK_BASE}/results_stage1_ocr_all_records.csv"
results_df.to_csv(records_csv, index=False)
print(f"Stage1 OCR results saved to {records_csv}")


In [None]:
#GROUP STATISTICS
# Mean / std / record count of word recall for each attack type.
summary = results_df.groupby("attack_type")["word_recall"].agg(['mean','std','count']).reset_index()
summary_csv = f"{ATTACK_BASE}/results_stage1_ocr_all_summary.csv"
summary.to_csv(summary_csv, index=False)
print(f"Stage1 OCR summary saved to {summary_csv}")
print(summary)

# BAR PLOT
attack_types = summary['attack_type'].tolist()
ocr_means = summary['mean'].tolist()
# The sample std is NaN for an attack type with a single record. The CSV keeps
# that NaN, but for plotting it is drawn as a zero-length error bar so the axis
# limit and label positions below stay finite.
ocr_stds = summary['std'].fillna(0.0).tolist()

x = np.arange(len(attack_types))
width = 0.6

plt.figure(figsize=(10,6))
plt.bar(x, ocr_means, yerr=ocr_stds, capsize=5, width=width, color='salmon', edgecolor='black')
plt.xticks(x, attack_types, rotation=45, ha='right', fontsize=11)
plt.ylabel('OCR Word Recall', fontsize=12)
plt.title('Stage 1 OCR Visibility (All Attacks)', fontsize=14)
# Leave headroom above the tallest bar for the value labels; fall back to 1.0
# when every mean and std is zero so the axis is not collapsed to a point.
y_top = max(ocr_means) + max(ocr_stds) * 1.5
plt.ylim(0, y_top if y_top > 0 else 1.0)

# Print each mean just above the top of its error bar.
for i, (mean, std) in enumerate(zip(ocr_means, ocr_stds)):
    plt.text(i, mean + std + 0.01, f"{mean:.2f}", ha='center', va='bottom', fontsize=10)

plt.tight_layout()
plt.show()

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

# Read Stage1 OCR CSV
# Reload the saved per-attack-type summary so this cell can redraw the figure
# without re-running OCR.
OCR_CSV = "/content/drive/MyDrive/VLM_Project/attack_dataset_all/results_stage1_ocr_all_summary.csv"
ocr_stats = pd.read_csv(OCR_CSV, index_col="attack_type")

attack_types = ocr_stats.index.tolist()
ocr_means = ocr_stats['mean'].tolist()
# std is empty (NaN) in the CSV for attack types with a single record; draw
# those with a zero-length error bar so the axis limit and labels stay finite.
ocr_stds  = ocr_stats['std'].fillna(0.0).tolist()

# plot
x = np.arange(len(attack_types))
width = 0.6

plt.figure(figsize=(10,6))
plt.bar(x, ocr_means, yerr=ocr_stds, capsize=5, width=width, color='salmon', edgecolor='black')
plt.xticks(x, attack_types, rotation=45, ha='right', fontsize=11)
plt.ylabel('OCR Word Recall', fontsize=12)
# The file holds the summary for all attack types, so the title says so.
plt.title('Stage 1 OCR Visibility (All Attacks)', fontsize=14)
# Headroom for the value labels; fall back to 1.0 if every value is zero.
y_top = max(ocr_means) + max(ocr_stds) * 1.5
plt.ylim(0, y_top if y_top > 0 else 1.0)

# Print each mean just above the top of its error bar.
for i, (mean, std) in enumerate(zip(ocr_means, ocr_stds)):
    plt.text(i, mean + std + 0.01, f"{mean:.2f}", ha='center', va='bottom', fontsize=10)

plt.tight_layout()
plt.show()


In [None]:
import pandas as pd

# Reload the summary CSV with the default integer index (no index_col), which
# rebinds ocr_stats, then print its first rows and column labels.
OCR_CSV = "/content/drive/MyDrive/VLM_Project/attack_dataset_all/results_stage1_ocr_all_summary.csv"
ocr_stats = pd.read_csv(OCR_CSV)

print(ocr_stats.head())
print(ocr_stats.columns)
