### 1. Import packages

In [None]:
import numpy as np
from PIL import Image
import copy
import os
import random
import re
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor,Qwen2_5_VLForConditionalGeneration
from qwen_vl_utils import process_vision_info
import os, sys, time

### 2. Load the target model

In this workbook (and as reported in the paper), we demonstrate reliability concerns with how Qwen-based VLMs discern image realness/authenticity. The query functions are written with the default inference setup in mind. When changing the target model, be sure to read through the code and understand how the textual query and image inputs are processed. 

Failure to do the necessary checks may result in some errors.


In [None]:
# Hugging Face Hub repo id of the Qwen checkpoint under test; change it to target another model.
modelName = "Qwen/Qwen2.5-VL-3B-Instruct"

# Passing "auto" leaves the choice of dtype and device placement to the library.
model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    modelName,
    torch_dtype="auto",
    device_map="auto",
)

# The processor is loaded from the same repo id as the model.
processor = AutoProcessor.from_pretrained(modelName)

### 3. Define the helper functions and the experimental setup.

The noise_params dictionary object stores parameters used to control the perturbation strength. The "direction" of the perturbation is dependent on the current VLM score and the target threshold.

The repository contains a "test_images" directory with some images from the RGFreq dataset - which you can download from IEEE dataport.

In [None]:
def image_to_array(image):
    """Return `image` as a NumPy array cast to float32."""
    pixels = np.asarray(image)
    return pixels.astype(np.float32)

def array_to_image(array):
    """Build a PIL image from `array` after clamping to [0, 255] and casting to uint8."""
    clamped = np.clip(array, 0, 255)
    return Image.fromarray(clamped.astype(np.uint8))
def get_shuffled_image_list(directory, seed=42):
    """
    Return a reproducibly shuffled list of file names.

    Parameters:
    directory: Either an iterable of file names (the notebook passes the result of
        os.listdir) or a path to a directory whose entries should be listed.
    seed (int): Seed for the shuffle. The same seed and the same input order
        always give the same output order.

    Returns:
    list: A new, shuffled list. The caller's input is not modified.
    """
    if isinstance(directory, (str, os.PathLike)):
        # os.listdir returns entries in arbitrary order, so sort to give the
        # seeded shuffle a stable starting point.
        entries = sorted(os.listdir(directory))
    else:
        # Copy so the caller's list is not shuffled in place.
        entries = list(directory)

    # A private Random instance leaves the global `random` state untouched while
    # producing the same permutation as random.seed(seed) + random.shuffle(...).
    random.Random(seed).shuffle(entries)
    return entries

# Starting perturbation settings. The guided search functions overwrite
# "num_sparse_points" and "sparse_noise_std" per image; the band limits are passed through.
noise_params = dict(
    num_sparse_points=500,
    sparse_noise_std=15000,
    min_freq_band=0.85,  # lower bound for high freq region
    max_freq_band=1.00,  # upper bound for high freq region
)

# Folder holding the images to be tested and transformed.
imageDir = "./test_images/"

# Folder the perturbed images are written to.
targetDir = "./perturbed_images/realism/"

# Upper limit on processed directory entries, for quick testing of the code.
maxTestImages = 100

# Hyperparameters of the perturbation search.
TARGET_REALISM_THRESHOLD = 7
MAX_SEARCH_ITERATIONS = 5
CANDIDATES_PER_ITERATION = 10

### 4. Define VLM reliability implementation functions.

Here, our technical contributions leverage the following function blocks.
1. **query_model_realness**(img)
 - input is the test image. Outputs the VLM's realness prediction as an integer, given the prompt: "*What is the likelihood that this is a real image? Give me a score out of 10. Output a single number.*" The function constructs a Qwen-readable message, processing the text and image inputs, parsing this through the VLM.
    
2. **guided_frequency_search_increase_realism**(image: Image.Image, query_model_fn, target_threshold: int = 5, max_iters: int = 5, candidates_per_iter: int = 4, target_path = None, \*\*noise_params) -> (Image.Image, int)
 - This function handles the search for the optimal frequency perturbation, based on the target output and the VLM that is queried. The running code uses this function in the iterator. After spatial perturbations are applied, the VLM is queried to see if the realism likelihood has increased. This is the effective "search" taking place. As per the paper, we are demonstrating that imperceptible perturbations can move the VLM decision across "*binary*" decision boundaries

3. **add_sparse_fequency_domain_noise_patch_channel**(image: Image.Image, num_sparse_points: int = 100, sparse_noise_std: float = 1500, min_freq_band: float = 0.85, max_freq_band: float = 1.00) -> Image.Image:
 - This is the spatial frequency perturbation function. Given an image and perturbation parameters, it returns a perturbed image (with the candidate perturbation applied). The FFT and inverse FFT are called to transform the image to and from the frequency domain.

In [None]:
# First signed integer or decimal number appearing anywhere in a reply.
_SCORE_PATTERN = re.compile(r"-?\d+(?:\.\d+)?")


def _parse_realness_score(reply):
    """Extract an integer score from the model's reply text; return 0 if none is found."""
    text = reply.strip()

    # Try the whole reply first (covers "7", "7.5", "1e1"), then fall back to the
    # first number embedded in it (covers replies such as "7/10" or "8.").
    attempts = [text]
    embedded = _SCORE_PATTERN.search(text)
    if embedded is not None:
        attempts.append(embedded.group())

    for attempt in attempts:
        try:
            return int(attempt)
        except ValueError:
            pass
        try:
            # Parse as float, then truncate to int.
            return int(float(attempt))
        except (ValueError, OverflowError):
            # ValueError: not a number (or NaN); OverflowError: "inf".
            pass

    # NOTE(review): 0 is also the lowest possible score, so a search that minimises
    # the score cannot tell an unparsable reply from a genuine 0.
    print(f"Warning: Cannot convert {reply} to int. Returning 0")
    return 0


def query_model_realness(img):
    """
    Ask the VLM how likely `img` is to be a real image and return the score.

    Parameters:
    img: Image placed in the "image" entry of the chat message (the notebook
        passes PIL images).

    Returns:
    int: The number parsed from the model's reply, or 0 (after printing a warning)
    when no number can be extracted from it.
    """
    # This is the prompt used for the paper.
    prompt = "What is the likelihood that this is a real image? Give me a score out of 10. Output a single number."

    # Create a (batch of one) message structure for the VLM to read.
    messages = [[
        {
            "role": "user",
            "content": [
                {"type": "image", "image": img},
                {"type": "text", "text": prompt},
            ],
        }
    ]]

    # Process text and image inputs.
    texts = [
        processor.apply_chat_template(msg, tokenize=False, add_generation_prompt=True)
        for msg in messages
    ]
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=texts,
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )
    # Follow the model's own placement rather than hard-coding "cuda": the model
    # is loaded with device_map="auto", so it is not guaranteed to sit on a GPU.
    inputs = inputs.to(model.device)

    # Batch inference; drop the prompt tokens so only the newly generated ones are decoded.
    generated_ids = model.generate(**inputs, max_new_tokens=10)
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    output_texts = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )

    return _parse_realness_score(output_texts[0])

def guided_frequency_search_increase_realism(
    image: Image.Image,
    query_model_fn,
    target_threshold: int = 5,
    max_iters: int = 5,
    candidates_per_iter: int = 4,
    target_path = None,
    **noise_params
) -> (Image.Image, int):
    """
    Guided search using frequency-domain perturbations to boost likelihood.

    Parameters:
    image (Image.Image): Input PIL image.
    query_model_fn (function): Black-box model query function (image -> score).
    target_threshold (int): Desired likelihood score; the search stops once reached.
    max_iters (int): Max optimization iterations.
    candidates_per_iter (int): Number of perturbation candidates per iteration.
    target_path: Where the current best image is saved after every iteration.
        Nothing is saved when this is None.
    noise_params: Parameters for the noise function. "num_sparse_points" and
        "sparse_noise_std" are always recomputed from the image size here, so
        caller-supplied values for those two keys are ignored.

    Returns:
    (Image.Image, int): Best transformed image and final likelihood score.
    """
    start = time.time()

    # Hyper-parameters used for determining the sparsity and intensity of the perturbation on the image.
    # As per the paper: σ = 0.025 × H × W , i.e., 2.5% standard deviation, proportional to input size.
    #                   ρ = 0.1 × H × W , i.e., 10% data points transformed, proportional to input size.
    NPix = 0.1
    Nstd = 0.025

    # Retrieve the current state of the image and the corresponding base score
    current_image = image.copy()
    current_score = query_model_fn(current_image)

    # If the model already predicts that the image is likely to be real, exit early
    if current_score >= target_threshold:
        print(current_score, " is already above target threshold")
        return current_image, current_score

    # Define perturbation parameters and update the (function-local) noise parameter dictionary
    width, height = current_image.size
    noise_params["num_sparse_points"] = int(NPix*width*height)

    # If the image size is really small (like CIFAR-10), double the standard deviation factor
    if width*height < 100 * 100:
        noise_params["sparse_noise_std"] = int(Nstd*2*width*height)
    else:
        noise_params["sparse_noise_std"] = int(Nstd*width*height)

    # Else: continue to iterate through for 'max_iters' number of iterations
    print("original score:\t", current_score)
    for iteration in range(max_iters):
        # Candidate perturbations generated this iteration and the score each achieves
        candidates = []
        scores = []
        for _ in range(candidates_per_iter):
            candidate_img = add_sparse_fequency_domain_noise_patch_channel(current_image, **noise_params)
            candidates.append(candidate_img)
            scores.append(query_model_fn(candidate_img))
        print("iteration:\t", iteration)
        print("scores:\t\t", scores)

        # For increasing realism likelihood, the best score is the maximum value achieved
        best_idx = int(np.argmax(scores))
        best_candidate_score = scores[best_idx]

        # Move when the best candidate is at least as good as the current image.
        # Ties are accepted too, so perturbations can accumulate while the score is flat.
        if best_candidate_score >= current_score:
            current_image = candidates[best_idx]
            current_score = best_candidate_score

        print("current score:\t",current_score,"\n")
        if target_path is not None:
            try:
                current_image.save(target_path)
            except OSError:
                # Same fallback as the decrease-realism search: retry as RGB when the
                # image mode cannot be written to the target format.
                current_image = current_image.convert("RGB")
                current_image.save(target_path)

        # For debugging and logging implementation time (logged before the stop
        # check so the final iteration is reported as well).
        end = time.time()
        print(f"Took {end - start:.4f} seconds")

        # Stop if target is reached
        if current_score >= target_threshold:
            break
    return current_image, current_score

def add_sparse_fequency_domain_noise_patch_channel(
    image: Image.Image,
    num_sparse_points: int = 100,
    sparse_noise_std: float = 1500,
    min_freq_band: float = 0.85,
    max_freq_band: float = 1.00
) -> Image.Image:
    """
    Adds sparse noise in a selected frequency band of the image's frequency domain.

    Each channel is transformed with a 2-D FFT (centre-shifted), real-valued
    Gaussian noise is added to randomly chosen coefficients inside the band, and
    the magnitude of the inverse transform is written back as uint8.

    Parameters:
    image (Image.Image): Input PIL image. A 2-D (single channel) image is
        replicated to three channels first.
    num_sparse_points (int): Number of sparse frequency points to modify per
        channel (capped at the number of points inside the band).
    sparse_noise_std (float): Std dev of Gaussian noise to apply.
    min_freq_band (float): Min frequency band (as fraction of max radius).
    max_freq_band (float): Max frequency band (as fraction of max radius).

    Returns:
    Image.Image: Image with frequency-domain sparse noise applied.
    """
    image_np = np.array(image)
    if image_np.ndim == 2:
        # Single-channel input: replicate to three identical channels.
        image_np = np.stack([image_np] * 3, axis=-1)
    height, width, channels = image_np.shape
    cy, cx = height // 2, width // 2

    # Frequency band selection. The band depends only on the image size, so it
    # is computed once rather than once per channel.
    Y, X = np.ogrid[:height, :width]
    distance = np.sqrt((X - cx)**2 + (Y - cy)**2)
    max_radius = np.max(distance)
    band_mask = (distance >= min_freq_band * max_radius) & (distance <= max_freq_band * max_radius)
    band_indices = np.argwhere(band_mask)
    num_selected = min(num_sparse_points, len(band_indices))

    for c in range(channels):
        # DFT and center-shift
        dft_shift = np.fft.fftshift(np.fft.fft2(image_np[:, :, c]))

        # Select sparse positions to perturb (a fresh random draw for every channel)
        picked = band_indices[np.random.choice(
            len(band_indices),
            size=num_selected,
            replace=False
        )]

        # One vectorised draw replaces a Python loop over every selected point.
        # Positions are unique (replace=False), so the indexed += touches each once.
        noise = np.random.normal(0, sparse_noise_std, size=num_selected)
        dft_shift[picked[:, 0], picked[:, 1]] += noise

        # Inverse transform; the result is complex, so its magnitude is used.
        img_back = np.abs(np.fft.ifft2(np.fft.ifftshift(dft_shift)))
        # NOTE(review): astype(np.uint8) truncates rather than rounds.
        image_np[:, :, c] = np.clip(img_back, 0, 255).astype(np.uint8)

    return Image.fromarray(image_np)




### 5. Running Code (Increasing Realism Perception)

In [None]:
# Make sure the output folder exists; saving an image does not create missing directories.
os.makedirs(targetDir, exist_ok=True)

# os.listdir returns names in arbitrary order, so sort before the seeded shuffle
# to make the selection reproducible. Only the first maxTestImages shuffled entries
# are considered (entries that are not images still count towards that limit).
for fp in get_shuffled_image_list(sorted(os.listdir(imageDir)))[:maxTestImages]:
    if not fp.lower().endswith((".jpg", ".jpeg", ".png", ".bmp", ".gif")):
        continue

    # Load as RGB so palette (e.g. GIF) and alpha images are perturbed on their actual
    # colour channels; the context manager closes the file once the pixels are copied.
    with Image.open(os.path.join(imageDir, fp)) as opened_image:
        input_image = opened_image.convert("RGB")

    # Apply spatial frequency transformation on images, returning the optimal perturbed image
    output_path = os.path.join(targetDir, fp)
    transformed_image, final_score = guided_frequency_search_increase_realism(
        input_image,
        query_model_fn=query_model_realness,
        target_threshold=TARGET_REALISM_THRESHOLD,
        max_iters=MAX_SEARCH_ITERATIONS,
        candidates_per_iter=CANDIDATES_PER_ITERATION,
        target_path=output_path,
        **noise_params
    )
    if transformed_image is not None:
        print(f"Final Likelihood Score: {final_score}")
        transformed_image.save(output_path)

### 6. Decreasing Realism Perception

Evaluating the performance on generated images and pushing a VLM's perception towards 'more realistic' is one part of the problem. To ensure that the approach is valid, we also demonstrate how perceptions of realism can be reduced by applying transformations that move the VLM decision in the opposite direction. To do this, we need a second guided frequency search function that looks to minimize the realism likelihood score. Once this function is defined, we implement new running code on the test images.

In [None]:
def guided_frequency_search_decrease_realism(
    image: Image.Image,
    query_model_fn,
    target_threshold: int = 5,
    max_iters: int = 5,
    candidates_per_iter: int = 4,
    target_path = None,
    **noise_params
) -> (Image.Image, int):
    """
    Guided search using frequency-domain perturbations to lower likelihood.

    Parameters:
    image (Image.Image): Input PIL image.
    query_model_fn (function): Black-box model query function (image -> score).
    target_threshold (int): Likelihood score to get down to; the search stops
        once the score is at or below it.
    max_iters (int): Max optimization iterations.
    candidates_per_iter (int): Number of perturbation candidates per iteration.
    target_path: Where the current best image is saved after every iteration.
        Nothing is saved when this is None.
    noise_params: Parameters for the noise function. "num_sparse_points" and
        "sparse_noise_std" are always recomputed from the image size here, so
        caller-supplied values for those two keys are ignored.

    Returns:
    (Image.Image, int): Best transformed image and final likelihood score.
    """
    # Fractions of the pixel count used for the number of perturbed points (NPix)
    # and for the noise standard deviation (Nstd).
    NPix = 0.2
    Nstd = 0.025

    current_image = image.copy()
    current_score = query_model_fn(current_image)

    # If the score is already low enough, exit early
    if current_score <= target_threshold:
        print(current_score, " is already at or below target threshold")
        return current_image, current_score

    width, height = current_image.size
    noise_params["num_sparse_points"] = int(NPix*width*height)
    noise_params["sparse_noise_std"] = int(Nstd*width*height)

    print("original score:\t", current_score)
    for iteration in range(max_iters):
        # Candidate perturbations generated this iteration and the score each achieves
        candidates = []
        scores = []
        for _ in range(candidates_per_iter):
            candidate_img = add_sparse_fequency_domain_noise_patch_channel(current_image, **noise_params)
            candidates.append(candidate_img)
            scores.append(query_model_fn(candidate_img))
        print("iteration:\t", iteration)
        print("scores:\t\t", scores)

        # For decreasing realism likelihood, the best score is the minimum value achieved
        best_idx = int(np.argmin(scores))
        best_candidate_score = scores[best_idx]

        # Move when the best candidate is at least as low as the current score.
        # Ties are accepted too, so perturbations can accumulate while the score is flat.
        if best_candidate_score <= current_score:
            current_image = candidates[best_idx]
            current_score = best_candidate_score

        print("current score:\t",current_score,"\n")
        if target_path is not None:
            try:
                current_image.save(target_path)
            except OSError:
                # Retry as RGB when the image mode cannot be written to the target format.
                current_image = current_image.convert("RGB")
                current_image.save(target_path)

        # Stop if target is reached
        if current_score <= target_threshold:
            break

    return current_image, current_score

# updated perturbation search hyperparameters
TARGET_REALISM_THRESHOLD = 4
MAX_SEARCH_ITERATIONS = 5
CANDIDATES_PER_ITERATION = 10

# images to be tested and transformed 
imageDir = './test_images/'

# directory where the images will be saved
# NOTE(review): this is the same folder the increase-realism run writes to, so files
# with the same name are overwritten - confirm whether a separate folder is intended.
targetDir = "./perturbed_images/realism/"

# Make sure the output folder exists; saving an image does not create missing directories.
os.makedirs(targetDir, exist_ok=True)

# os.listdir returns names in arbitrary order, so sort before the seeded shuffle
# to make the selection reproducible. Only the first maxTestImages shuffled entries
# are considered (entries that are not images still count towards that limit).
for fp in get_shuffled_image_list(sorted(os.listdir(imageDir)))[:maxTestImages]:
    if not fp.lower().endswith((".jpg", ".jpeg", ".png", ".bmp", ".gif")):
        continue

    # Load as RGB so palette (e.g. GIF) and alpha images are perturbed on their actual
    # colour channels; the context manager closes the file once the pixels are copied.
    with Image.open(os.path.join(imageDir, fp)) as opened_image:
        input_image = opened_image.convert("RGB")

    # Apply spatial frequency transformation on images, returning the optimal perturbed image
    output_path = os.path.join(targetDir, fp)
    transformed_image, final_score = guided_frequency_search_decrease_realism(
        input_image,
        query_model_fn=query_model_realness,
        target_threshold=TARGET_REALISM_THRESHOLD,
        max_iters=MAX_SEARCH_ITERATIONS,
        candidates_per_iter=CANDIDATES_PER_ITERATION,
        target_path=output_path,
        **noise_params)
    if transformed_image is not None:
        print(f"Final Likelihood Score: {final_score}")
        transformed_image.save(output_path)