In [1]:
import os
import torch
from sklearn.preprocessing import LabelEncoder
from transformers import (
    CLIPTokenizer,
    CLIPVisionModelWithProjection,
    CLIPTextModelWithProjection,
)

from tqdm import tqdm
import torch
import os
import argparse
import pickle

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
tokenizer = CLIPTokenizer.from_pretrained('openai/clip-vit-large-patch14')
model_id = 'openai/clip-vit-large-patch14'
clip_backbone = 'openai/clip-vit-large-patch14'
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
clip_text_model = CLIPTextModelWithProjection.from_pretrained(model_id).to(device).eval()
clip_vision_model = CLIPVisionModelWithProjection.from_pretrained(
    clip_backbone
).to(device).eval()

In [3]:
import os
import torch
from sklearn.preprocessing import LabelEncoder

# Path to your folder
folder_path = "/mnt/ssd1/mary/Diffusion-Models-Embedding-Space-Defense/EMBEDDINGS/clip_embeddings"
clip_embedding_files = []
# Find all files ending with '.pt' but NOT 'all_embeddings.pt'
for subfolder in os.listdir(folder_path):
    subfolder_path = os.path.join(folder_path, subfolder)
    if os.path.isdir(subfolder_path):
        print(f"Processing subfolder: {subfolder_path}")
        # Process each subfolder
        for file_name in os.listdir(subfolder_path):
            if file_name.endswith('.pt') and file_name != 'all_embeddings.pt':
                file_path = os.path.join(subfolder_path, file_name)
                clip_embedding_files.append(file_path)
                # print(f"Found embedding file: {file_path}")

all_tensors = []
all_strings = []
# Load and unpack each file
for file_path in clip_embedding_files:
    data = torch.load(file_path)
    for tensor, category_string in data:
        # Split the category string into multiple labels
        subcaptions = [label.strip() for label in category_string.split(',')]
        for label in subcaptions:
            all_tensors.append(tensor.clone())  # Avoid shared reference
            all_strings.append(label)

clip_tensors = all_tensors[:10_000]
clip_strings = all_strings[:10_000]
# Stack all tensors
clip_embedding_tensor = torch.stack(clip_tensors)

# Encode the labels into integers
le = LabelEncoder()
clip_labels = le.fit_transform(clip_strings)
clip_captions = torch.tensor(clip_labels, dtype=torch.long)
clip_captions_np = clip_captions.cpu().numpy()



Processing subfolder: /mnt/ssd1/mary/Diffusion-Models-Embedding-Space-Defense/EMBEDDINGS/clip_embeddings/visu
Processing subfolder: /mnt/ssd1/mary/Diffusion-Models-Embedding-Space-Defense/EMBEDDINGS/clip_embeddings/mma
Processing subfolder: /mnt/ssd1/mary/Diffusion-Models-Embedding-Space-Defense/EMBEDDINGS/clip_embeddings/mscoco
Processing subfolder: /mnt/ssd1/mary/Diffusion-Models-Embedding-Space-Defense/EMBEDDINGS/clip_embeddings/i2p


  data = torch.load(file_path)


In [4]:
# get only first 10k embeddings
print(f"Total embeddings loaded: {len(clip_embedding_tensor)}")
clip_embedding_tensor = clip_embedding_tensor[:10_000]
print(f"Total embeddings after slicing: {len(clip_embedding_tensor)}")

# get only first 10k labels
clip_captions = clip_captions[:10000]
print(f"Total labels after slicing: {len(clip_captions)}")
# Save the embeddings and labels

Total embeddings loaded: 10000
Total embeddings after slicing: 10000
Total labels after slicing: 10000


In [5]:
!pip install umap-learn



In [6]:
import umap.umap_ as umap
import torch


# Run UMAP with supervision using captions
clip_mapper = umap.UMAP(
    n_components=3,
    metric='euclidean',
    random_state=42
).fit(clip_embedding_tensor, y=clip_captions_np)



  warn(


In [16]:
prompt_caption = 'sports car'
prompt_1_caption = 'car'
prompt_2_caption = 'fast'

# embed the prompts
def embed_prompts(prompts):
    inputs = tokenizer(prompts, return_tensors='pt', padding=True, truncation=True).to(device)
    with torch.no_grad():
        outputs = clip_text_model(**inputs)
    return outputs.text_embeds

prompt_embeddings = embed_prompts([prompt_caption, prompt_1_caption, prompt_2_caption])

prompt = prompt_embeddings[0].unsqueeze(0)
prompt_1 = prompt_embeddings[1].unsqueeze(0)
prompt_2 = prompt_embeddings[2].unsqueeze(0)
prompt_3 = prompt_1 + prompt_2

# Get the UMAP coordinates for the prompts
prompt_umap = clip_mapper.transform(prompt.cpu().numpy())
prompt_1_umap = clip_mapper.transform(prompt_1.cpu().numpy())
prompt_2_umap = clip_mapper.transform(prompt_2.cpu().numpy())
prompt_3_umap = clip_mapper.transform(prompt_3.cpu().numpy())

In [None]:
# plot the 3 prompts in 3D with plotly  in a scatter plot
import plotly.graph_objects as go
# add the function to plot the prompts in 3D with the label
def plot_prompts_3d(prompts, colors, captions=None):
    fig = go.Figure(data=[go.Scatter3d(
        x=prompts[:, 0],
        y=prompts[:, 1],
        z=prompts[:, 2],
        mode='markers+text',
        marker=dict(
            size=5,
            color=colors,  # Use the provided colors
            opacity=0.8
        ),
        text=captions,  # Add captions as text labels
        textposition='top center'
    )])
    fig.update_layout(
        scene=dict(
            xaxis_title='UMAP Component 1',
            yaxis_title='UMAP Component 2',
            zaxis_title='UMAP Component 3'
        ),
        title='3D UMAP Projection of Prompts'
    )
    fig.show()



# Prepare the prompts for plotting
prompts = torch.stack([prompt.squeeze(), prompt_1.squeeze(), prompt_2.squeeze(), ])
prompts_umap = clip_mapper.transform(prompts.cpu().numpy())
# Define colors for the prompts
colors = ['red', 'green', 'blue', 'orange']  # Colors for each prompt
# define the captions
captions = ['sports car', 'car', 'fast', 'vehicle']
# Plot the prompts in 3D
plot_prompts_3d(prompts_umap, colors, captions)


In [18]:

# Prepare the prompts for plotting
prompts = torch.stack([prompt.squeeze(), prompt_2.squeeze(), prompt_3.squeeze()])
prompts_umap = clip_mapper.transform(prompts.cpu().numpy())
# Define colors for the prompts
colors = ['red', 'blue', 'orange']  # Colors for each prompt
# define the captions
captions = ['sports car', 'fast','P1' ]
# Plot the prompts in 3D
plot_prompts_3d(prompts_umap, colors, captions)


# Attack A1
given a concept c_t that is filtered the attack is in 2 phases.
1) the prompt 1 is c_t + a second concept c_d. The embedding of c_t+c_d is computed.
2) the embedding of soley the concept c_d is computed 
3) then you generate with the text encoding of emb(c_t+c_d) + emb(c_d)


In [20]:
p1 = ' a zebra standing in the field'
p2 = 'a cake in the shape of a zebra'
p3 = 'a cake'

# embed the prompts
prompt_embeddings = embed_prompts([p1, p2, p3])
prompt = prompt_embeddings[0].unsqueeze(0)
prompt_1 = prompt_embeddings[1].unsqueeze(0)
prompt_2 = prompt_embeddings[2].unsqueeze(0)
prompt_3 =  prompt_1 - prompt_2
# Get the UMAP coordinates for the prompts
prompt_umap = clip_mapper.transform(prompt.cpu().numpy())
prompt_1_umap = clip_mapper.transform(prompt_1.cpu().numpy())
prompt_2_umap = clip_mapper.transform(prompt_2.cpu().numpy())
prompt_3_umap = clip_mapper.transform(prompt_3.cpu().numpy())
# Prepare the prompts for plotting
prompts = torch.stack([prompt.squeeze(), prompt_1.squeeze(), prompt_2.squeeze(), prompt_3.squeeze()])
prompts_umap = clip_mapper.transform(prompts.cpu().numpy())
# Define colors for the prompts
colors = ['red', 'green', 'blue', 'orange']  # Colors for each prompt
# define the captions
captions = [p1, p2, p3, 'P1 + P2 - P3']
# Plot the prompts in 3D
plot_prompts_3d(prompts_umap, colors, captions)

In [21]:
p1 = 'nudity'
p2 = 'text that reads nudity'
p3 = 'text, written'

# embed the prompts
prompt_embeddings = embed_prompts([p1, p2, p3])
prompt = prompt_embeddings[0].unsqueeze(0)
prompt_1 = prompt_embeddings[1].unsqueeze(0)
prompt_2 = prompt_embeddings[2].unsqueeze(0)
prompt_3 = prompt_1 - prompt_2
# Get the UMAP coordinates for the prompts
prompt_umap = clip_mapper.transform(prompt.cpu().numpy())
prompt_1_umap = clip_mapper.transform(prompt_1.cpu().numpy())
prompt_2_umap = clip_mapper.transform(prompt_2.cpu().numpy())
prompt_3_umap = clip_mapper.transform(prompt_3.cpu().numpy())
# Prepare the prompts for plotting
prompts = torch.stack([prompt.squeeze(), prompt_1.squeeze(), prompt_2.squeeze(), prompt_3.squeeze()])
prompts_umap = clip_mapper.transform(prompts.cpu().numpy())
# Define colors for the prompts
colors = ['red', 'green', 'blue', 'orange']  # Colors for each prompt
# define the captions
captions = [p1, p2, p3, 'P2 - P3']
# Plot the prompts in 3D
plot_prompts_3d(prompts_umap, colors, captions)

# Attack A3
you have a non harmful content c_t and you add multiple concepts to the embedding

1) create the embedding of the concept c_t 
2) create the embedding of multiple harmful concepts c_d*
3) sum the 2 embeddingss: emb(c_t)+ emb(c_d*)


In [23]:
p1 = 'nudity'
p2 = 'woman'
p3 = 'nudity, sex, nsfw, naked, nude, porn'

# embed the prompts
prompt_embeddings = embed_prompts([p1, p2, p3])
prompt = prompt_embeddings[0].unsqueeze(0)
prompt_1 = prompt_embeddings[1].unsqueeze(0)
prompt_2 = prompt_embeddings[2].unsqueeze(0)
prompt_3 = prompt_1 + prompt_2
# Get the UMAP coordinates for the prompts
prompt_umap = clip_mapper.transform(prompt.cpu().numpy())
prompt_1_umap = clip_mapper.transform(prompt_1.cpu().numpy())
prompt_2_umap = clip_mapper.transform(prompt_2.cpu().numpy())
prompt_3_umap = clip_mapper.transform(prompt_3.cpu().numpy())
# Prepare the prompts for plotting
prompts = torch.stack([prompt.squeeze(), prompt_1.squeeze(), prompt_2.squeeze(), prompt_3.squeeze()])
prompts_umap = clip_mapper.transform(prompts.cpu().numpy())
# Define colors for the prompts
colors = ['red', 'green', 'blue', 'orange']  # Colors for each prompt
# define the captions
captions = [p1, p2, p3, 'P2 - P3']
# Plot the prompts in 3D
plot_prompts_3d(prompts_umap, colors, captions)