In [None]:
import os
import pandas as pd
from pathlib import Path

from data_download import MAttackDataPreparator

# Working directory of the kernel process; the data paths in this notebook are built from its parent.
cwd: str = os.getcwd()

In [None]:
# M-Attack data folder: lives next to the working directory, under its parent.
m_attack_data_path = Path(cwd).parent.joinpath("M-Attack-VLM")

# Notebook-wide settings: the dataset name and the variations to work with.
dataset = "bigscale_100"
variations = ["4", "8", "16"]

## Option №1 - just download before you start llamator attack

Under normal circumstances, the download is handled automatically. However, there is a small chance of an error in the dataset library, so it is useful to have a few fallback options for getting the data.

In [None]:
# Build the preparator for the configured dataset, then run its preparation step.
prep = MAttackDataPreparator(
    base_path=m_attack_data_path,
    dataset=dataset,
)
prep.prepare()

## Option №2 - check how the data looks inside the attack and save it in the required format

In [None]:
import logging

# how it looks inside attack

# basicConfig() with no arguments only attaches a stderr handler when the root
# logger has none, and it leaves the root level untouched, so records from
# other libraries are not made more verbose by this cell.
logging.basicConfig()

logger = logging.getLogger(__name__)
# Without an explicit level this logger inherits WARNING from the root logger,
# and the progress messages sent through logger.info(...) are silently dropped.
logger.setLevel(logging.INFO)

def imgpath2base64(image_path):
    """Read a file and return its contents as a base64-encoded string.

    Parameters
    ----------
    image_path : str or os.PathLike
        Path of the file to encode; it is opened in binary mode.

    Returns
    -------
    str
        Base64 encoding of the file's bytes, decoded to text as UTF-8.

    Raises
    ------
    OSError
        If the file cannot be opened or read.
    """
    # Imported here because the notebook's import cell does not bring in
    # ``base64``; without it every call fails with NameError.
    import base64

    with open(image_path, "rb") as image_file:
        encoded_string = base64.b64encode(image_file.read()).decode("utf-8")
    return encoded_string


def _load_attack_data(dataset: str, dataset_variations: list) -> pd.DataFrame:
    """Collect the M-Attack images of ``dataset`` and join them with their targets.

    The data is looked up under ``Path(cwd).parent / "M-Attack-VLM"``, where
    ``cwd`` is the notebook-level working directory. If any requested variation
    folder is missing, ``MAttackDataPreparator`` is asked to prepare the data
    first. Every ``*.png`` in each variation folder is base64-encoded via
    ``imgpath2base64`` and left-joined with the target table built from
    ``target/<dataset>/keywords.json`` and ``target/<dataset>/caption.json``.

    Parameters
    ----------
    dataset : str
        Name of the dataset sub-folder inside ``M-Attack-VLM``.
    dataset_variations : list
        Names of the variation sub-folders of the dataset to load.

    Returns
    -------
    pd.DataFrame
        One row per collected image with the columns ``image_path``,
        ``image_id``, ``dataset_variation`` and ``image_encoded``, followed by
        the columns of the target table. Rows are sorted by ``image_id`` and
        then ``dataset_variation``; the index is reset.

    Raises
    ------
    RuntimeError
        If no image could be collected from any variation folder.
    """
    base_dir = Path(cwd).parent

    m_attack_data_path = base_dir / "M-Attack-VLM"
    input_data_path = m_attack_data_path / dataset

    # download if no file found
    missing = [str(input_data_path / v) for v in dataset_variations if not (input_data_path / v).exists()]
    if missing:
        logger.warning(f"[WARN] Missing variations found: {missing}")
        logger.info("[INFO] Triggering M-Attack data download...")
        prep = MAttackDataPreparator(base_path=m_attack_data_path, dataset=dataset)
        prep.prepare(variations=dataset_variations)

    # load targets
    target_data_path = m_attack_data_path / "target" / dataset
    df_keywords = pd.read_json(target_data_path / "keywords.json")
    df_captions = pd.read_json(target_data_path / "caption.json")
    df_target = df_keywords.merge(df_captions, on="image")

    # The numeric file stem is the join key between targets and attack images.
    # The cast keeps the key integer-typed even when the target table is empty.
    df_target["image_id"] = df_target["image"].apply(lambda x: int(Path(x).stem)).astype(int)

    data = []
    for dataset_variation in dataset_variations:
        attack_data_path = input_data_path / dataset_variation
        if not attack_data_path.exists():
            logger.warning(f"[WARN] Skipping {attack_data_path} — folder does not exist.")
            continue

        files = list(attack_data_path.glob("*.png"))
        if not files:
            logger.warning(f"[WARN] No PNGs found in {attack_data_path}")
            continue

        logger.info(f"[INFO] Processing {len(files)} files from {attack_data_path}")
        for file in files:
            try:
                image_encoded = imgpath2base64(file)
                image_id = int(file.stem)
                data.append(
                    dict(
                        image_path=str(file.relative_to(m_attack_data_path.parent)),
                        image_id=image_id,
                        dataset_variation=dataset_variation,
                        image_encoded=image_encoded,
                    )
                )
            except (OSError, ValueError) as e:
                # Only per-file problems are skipped: unreadable files (OSError)
                # and non-numeric file names (ValueError). Programming errors
                # such as NameError must propagate instead of being hidden
                # behind the generic "No image data collected" failure below.
                logger.error(f"[ERROR] Failed to encode {file.name}: {e}")

    if not data:
        raise RuntimeError("No image data collected — check folder structure and file presence.")

    # image_id values are Python ints here, so the column is already integer-typed.
    df_data = pd.DataFrame(data)

    # The join below is a left join: images without a target row are kept with
    # empty target columns, so report them rather than letting them pass silently.
    unmatched = int((~df_data["image_id"].isin(df_target["image_id"])).sum())
    if unmatched:
        logger.warning(
            f"[WARN] {unmatched} image rows have no matching target entry; "
            "their target columns will be empty."
        )

    df_attack = df_data.merge(df_target, on="image_id", how="left")

    df_attack["image_id"] = df_attack["image_id"].astype(int)
    # dataset_variation is sorted by its stored values, so string variations
    # are ordered lexicographically.
    df_attack = df_attack.sort_values(["image_id", "dataset_variation"])

    logger.info(f"[INFO] Final dataset: {len(df_attack)} matched samples.")
    return df_attack.reset_index(drop=True)

In [None]:
# Same values as the global parameters defined earlier in the notebook.
dataset = "bigscale_100"
variations = ["4", "8", "16"]

# Build the attack frame exactly as the attack code would see it.
result = _load_attack_data(dataset=dataset, dataset_variations=variations)

print(result.shape)

result.head(5)

In [None]:
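# Uncomment to save the prepared frame as parquet (the relative path resolves against the current working directory):
# result.to_parquet("llm_m_attack_prepared.parquet")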

## Option 3 - load existing parquet

Note that this file contains just 4 images at different epsilon values, so it is more of a demo version.

In [None]:
# Read the already prepared frame from the parent of the working directory.
df = pd.read_parquet(Path(cwd).parent.joinpath("llm_m_attack_prepared.parquet"))

df.head(n=15)