In [None]:
from transformers import LlavaProcessor, LlavaForConditionalGeneration
import torch
from PIL import Image

# Hugging Face checkpoint id shared by the processor and the model.
model_id = "llava-hf/llava-1.5-7b-hf"

# The processor prepares the text prompt and the image for the model.
processor = LlavaProcessor.from_pretrained(model_id)

# Load the weights in float16 and let device_map="auto" decide placement.
_load_kwargs = dict(torch_dtype=torch.float16, device_map="auto")
model = LlavaForConditionalGeneration.from_pretrained(model_id, **_load_kwargs)

# Inference only: switch the module to evaluation mode.
model.eval()


Load Sentence-BERT

In [None]:
!pip install -q sentence-transformers


In [None]:
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

# Sentence-embedding model; the main loop calls embedder.encode() on caption
# pairs and compares the two vectors with cosine similarity.
embedder = SentenceTransformer("all-MiniLM-L6-v2")


Define the image-captioning function

In [None]:
def generate_caption(image_path, prompt="<image>\nDescribe the image in one sentence.", max_new_tokens=50):
    """Generate a caption for the image stored at ``image_path``.

    Uses the module-level ``processor`` and ``model``.

    Args:
        image_path: Path of the image file to caption.
        prompt: Text prompt passed to the processor together with the image.
        max_new_tokens: Upper bound on the number of generated tokens.

    Returns:
        The generated text only. The prompt that ``generate()`` echoes at the
        start of its output is removed, so captions of different images do not
        share a constant prefix that would inflate any text-similarity score
        computed on them.
    """
    # The context manager closes the file handle; convert() returns a loaded copy.
    with Image.open(image_path) as img:
        image = img.convert("RGB")

    inputs = processor(
        text=prompt,
        images=image,
        return_tensors="pt"
    ).to(model.device)

    with torch.no_grad():
        # Greedy decoding, stated explicitly so repeated runs yield the same caption.
        output = model.generate(**inputs, max_new_tokens=max_new_tokens, do_sample=False)

    # generate() output is assumed to start with the prompt ids (standard for
    # Hugging Face decoder-only generation); keep only what follows them.
    prompt_len = inputs["input_ids"].shape[1]
    generated_ids = output[0][prompt_len:]

    return processor.decode(generated_ids, skip_special_tokens=True).strip()


Load metadata

In [None]:
# Mount Google Drive at /content/drive; BASE_PATH (defined in a later cell)
# points into this mount.
from google.colab import drive
drive.mount('/content/drive')


In [None]:
import json

# Project root on the mounted Drive.
BASE_PATH = "/content/drive/MyDrive/VLM_Project"

# Parse the attack metadata JSON; the main loop iterates over its entries.
metadata_file = f"{BASE_PATH}/attack_dataset_diffusion/metadata/attacks_diffusion.json"
with open(metadata_file) as f:
    metadata = json.load(f)


Main Loop

In [None]:
import pandas as pd
from tqdm import tqdm

# One result row is collected per metadata entry. Captions of clean images are
# memoised by file name so each clean image is captioned at most once.
records = []
clean_cache = {}

for item in tqdm(metadata):
    clean_name = item['clean_image']
    clean_path = f"{BASE_PATH}/attack_dataset_diffusion/clean_images/{clean_name}"
    attacked_path = f"{BASE_PATH}/attack_dataset_diffusion/diffused_images/{item['diffusion_cleaned_image']}"

    # Clean baseline: caption on first sight, reuse the cached text afterwards.
    if clean_name in clean_cache:
        clean_text = clean_cache[clean_name]
    else:
        clean_text = clean_cache[clean_name] = generate_caption(clean_path)

    attacked_text = generate_caption(attacked_path)

    # Embed both captions in one batch, then take their cosine similarity.
    emb = embedder.encode([clean_text, attacked_text])
    clean_emb, attacked_emb = emb
    sim = cosine_similarity([clean_emb], [attacked_emb])[0][0]

    # behavior_shift is the complement of the similarity (0 = identical captions).
    row = dict(
        attack_id=item["image_id"],
        attack_type=item["attack_type"],
        clean_text=clean_text,
        attacked_text=attacked_text,
        semantic_similarity=sim,
        behavior_shift=1 - sim,
    )
    records.append(row)


Save and summarize results

In [None]:
# Turn the collected rows into a table and persist it as CSV (no index column).
save_path = f"{BASE_PATH}/attack_dataset_diffusion/results_stage2_behavior_shift_afterdiffusion.csv"
df = pd.DataFrame(records)
df.to_csv(save_path, index=False)

print(" Stage 2 results saved to:", save_path, sep="\n")

# Average behavior shift for each attack type.
mean_shift_by_type = df.groupby("attack_type")["behavior_shift"].mean()
print("\n Mean behavior shift by attack type:")
print(mean_shift_by_type)


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

# Read the Stage 2 results back from disk, so this cell can be run without
# re-running the captioning loop.
STAGE2_CSV = "/content/drive/MyDrive/VLM_Project/attack_dataset_diffusion/results_stage2_behavior_shift_afterdiffusion.csv"
df_stage2 = pd.read_csv(STAGE2_CSV)

# Mean and standard deviation of the behavior shift per attack type.
stats = df_stage2.groupby("attack_type")["behavior_shift"].agg(['mean', 'std']).reset_index()
# The sample std (ddof=1) is NaN for an attack type with a single row. A NaN
# would propagate into the axis limit and the label positions below, so such
# groups are drawn without an error bar instead.
stats['std'] = stats['std'].fillna(0.0)

attack_types = stats['attack_type'].tolist()
means = stats['mean'].tolist()
stds  = stats['std'].tolist()

if not attack_types:
    raise ValueError(f"No rows found in {STAGE2_CSV}; nothing to plot.")

# Plotting
LABEL_PAD = 0.002  # vertical gap between the top of an error bar and its label
x = np.arange(len(attack_types))
width = 0.6

fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(x, means, yerr=stds, capsize=5, width=width, color='skyblue', edgecolor='black')
ax.set_xticks(x)
ax.set_xticklabels(attack_types, rotation=45, ha='right', fontsize=11)
ax.set_ylabel('Mean Behavior Shift', fontsize=12)
ax.set_title('Stage 2: Behavior Shift After Diffusion-clean', fontsize=14)

# Same upper limit as before, but never less than the tallest bar + error bar
# plus some headroom, so the value labels stay inside the axes when all stds
# are (close to) zero.
tallest = max(m + s for m, s in zip(means, stds))
ax.set_ylim(0, max(max(means) + max(stds) * 1.5, tallest + 10 * LABEL_PAD))

# Print each mean just above its error bar.
for i, (mean, std) in enumerate(zip(means, stds)):
    ax.text(i, mean + std + LABEL_PAD, f"{mean:.3f}", ha='center', va='bottom', fontsize=10)

fig.tight_layout()
plt.show()
