# Connection to Colab GPU

In [None]:
# !nvidia-smi


In [1]:
import torch
print("Torch version:", torch.__version__)
print("CUDA available:", torch.cuda.is_available())
print("GPU:", torch.cuda.get_device_name(0))


Torch version: 2.9.0+cu126
CUDA available: True
GPU: NVIDIA A100-SXM4-40GB


## STEP 2 — Install required VLM packages

In [2]:
!pip install -q torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121
!pip install -q open_clip_torch transformers sentence-transformers accelerate \
              huggingface_hub pillow matplotlib seaborn pandas safetensors einops


## STEP 3 — Mount your Google Drive (to load your repo + dataset)

In [10]:
# from google.colab import drive
# drive.mount('/content/drive')

# import os
# root = "/content/drive/MyDrive/VLM_Zheyu/Visionencoder_Rep_Energyfilter"
# os.chdir(root)
# print("Current working directory:", os.getcwd())


## STEP 4 — Download your manager’s training dataset

In [11]:
!pip install -q gdown
!gdown --id 1hw6pERdzH22THudGYoHkJOaZylkh8lu3 -O /content/dataset_big


Downloading...
From (original): https://drive.google.com/uc?id=1hw6pERdzH22THudGYoHkJOaZylkh8lu3
From (redirected): https://drive.google.com/uc?id=1hw6pERdzH22THudGYoHkJOaZylkh8lu3&confirm=t&uuid=bcfa42b2-9a1d-49b2-a95d-39dba69081ca
To: /content/dataset_big
100% 2.94G/2.94G [00:41<00:00, 70.9MB/s]


In [3]:
!ls -lh /content

total 2.8G
drwxr-xr-x 3 root root 4.0K Dec 12 06:13 dataset
-rw-r--r-- 1 root root 2.8G Nov 22 08:56 dataset_big
drwxr-xr-x 4 root root 4.0K Dec 12 06:11 hf_cache
-rw-r--r-- 1 root root 175K Dec 12 06:49 hm_pairs_train_small.json
-rw-r--r-- 1 root root    0 Dec 12 06:51 leace_results_hm_k5.json
drwxr-xr-x 1 root root 4.0K Dec  9 14:42 sample_data


## STEP 5 — Test OpenCLIP on A100

In [4]:

import torch, os
print("Torch:", torch.__version__)
print("CUDA available:", torch.cuda.is_available())
print("GPU:", torch.cuda.get_device_name(0))
os.environ["HF_HOME"] = "/content/hf_cache"


Torch: 2.9.0+cu126
CUDA available: True
GPU: NVIDIA A100-SXM4-40GB


In [14]:
# 2. Simple OpenCLIP test
import open_clip
from PIL import Image
import requests
from io import BytesIO

device = "cuda"

model, _, preprocess = open_clip.create_model_and_transforms(
    "ViT-L-14",
    pretrained="openai"
)
model = model.to(device).eval()

tokenizer = open_clip.get_tokenizer("ViT-L-14")

# download a test image
url = "https://images.pexels.com/photos/1108099/pexels-photo-1108099.jpeg"
img = Image.open(BytesIO(requests.get(url).content)).convert("RGB")

image_tensor = preprocess(img).unsqueeze(0).to(device)
text_tokens = tokenizer(["a dog playing in the snow", "a car in a city"]).to(device)

with torch.no_grad():
    img_feat = model.encode_image(image_tensor)
    txt_feat = model.encode_text(text_tokens)

    img_feat = img_feat / img_feat.norm(dim=-1, keepdim=True)
    txt_feat = txt_feat / txt_feat.norm(dim=-1, keepdim=True)

    sims = (img_feat @ txt_feat.T).squeeze().tolist()

print("Similarities:", sims)


Error while fetching `HF_TOKEN` secret value from your vault: 'Requesting secret HF_TOKEN timed out. Secrets can only be fetched when running from the Colab UI.'.
You are not authenticated with the Hugging Face Hub in this notebook.
If the error persists, please let us know by opening an issue on GitHub (https://github.com/huggingface/huggingface_hub/issues/new).


open_clip_model.safetensors:   0%|          | 0.00/1.71G [00:00<?, ?B/s]



Similarities: [0.13606832921504974, 0.08283153176307678]


## Inspect the training set

In [15]:
!file /content/dataset_big

/content/dataset_big: Zip archive data, at least v2.0 to extract, compression method=store


In [16]:
!mkdir -p /content/dataset
!unzip -q /content/dataset_big -d /content/dataset
!ls -R /content/dataset | head -40


/content/dataset:
HatefulMemes

/content/dataset/HatefulMemes:
data

/content/dataset/HatefulMemes/data:
dev.jsonl
img
LICENSE.txt
README.md
test.jsonl
train.jsonl

/content/dataset/HatefulMemes/data/img:
01235.png
01236.png
01243.png
01245.png
01247.png
01256.png
01258.png
01264.png
01268.png
01269.png
01274.png
01275.png
01276.png
01284.png
01293.png
01295.png
01324.png
01325.png
01327.png
01329.png
01348.png
01349.png
01359.png
01364.png
01379.png


In [5]:
import json
path = "/content/dataset/HatefulMemes/data/train.jsonl"

with open(path, "r") as f:
    for i, line in enumerate(f):
        if i >= 3:
            break
        print(i, json.loads(line))


0 {'id': 42953, 'img': 'img/42953.png', 'label': 0, 'text': 'its their character not their color that matters'}
1 {'id': 23058, 'img': 'img/23058.png', 'label': 0, 'text': "don't be afraid to love again everyone is not like your ex"}
2 {'id': 13894, 'img': 'img/13894.png', 'label': 0, 'text': 'putting bows on your pet'}


In [15]:
in_path = "/content/dataset/HatefulMemes/data/train.jsonl"
out_path = "/content/drive/MyDrive/VLM_project/train_small_500.jsonl"

count = 0
with open(in_path, "r") as fin, open(out_path, "w") as fout:
    for line in fin:
        fout.write(line)
        count += 1
        if count >= 500:
            break

print("Wrote", count, "examples to", out_path)


FileNotFoundError: [Errno 2] No such file or directory: '/content/drive/MyDrive/VLM_project/train_small_500.jsonl'

In [9]:
# from google.colab import drive
# drive.mount('/content/drive')

# import os
# os.chdir('/content/drive/MyDrive/VLM_project')
# !pwd
# !ls


In [10]:
import json, os

in_path = "/content/dataset/HatefulMemes/data/train_small_500.jsonl"
img_root = "/content/dataset/HatefulMemes/data"   # root that contains "img/"
out_pairs = "/content/hm_pairs_train_500.json"

pairs = []

with open(in_path, "r", encoding="utf-8") as f:
    for line in f:
        ex = json.loads(line)
        img_rel = ex["img"]           # e.g. "img/42953.png"
        caption = ex["text"]
        label_id = ex["label"]        # 0 or 1

        label = "safe" if label_id == 0 else "unsafe"
        full_img_path = os.path.join(img_root, img_rel)
        pairs.append({
            "image": full_img_path,
            "caption": caption,
            "label": label
        })

print("Built", len(pairs), "pairs")
with open(out_pairs, "w", encoding="utf-8") as f:
    json.dump(pairs, f, ensure_ascii=False, indent=2)

print("Saved to", out_pairs)


Built 500 pairs
Saved to /content/hm_pairs_train_500.json


In [11]:
with open("/content/hm_pairs_train_500.json", "r") as f:
    tmp = json.load(f)
tmp[:3]


[{'image': '/content/dataset/HatefulMemes/data/img/42953.png',
  'caption': 'its their character not their color that matters',
  'label': 'safe'},
 {'image': '/content/dataset/HatefulMemes/data/img/23058.png',
  'caption': "don't be afraid to love again everyone is not like your ex",
  'label': 'safe'},
 {'image': '/content/dataset/HatefulMemes/data/img/13894.png',
  'caption': 'putting bows on your pet',
  'label': 'safe'}]

In [12]:
!pip install -q git+https://github.com/EleutherAI/concept-erasure.git


  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
  Building wheel for concept-erasure (pyproject.toml) ... [?25l[?25hdone


In [None]:
!python leace_vlm_eval.py /content/hm_pairs_train_500.json --k 5 --device cuda > /content/leace_results_hm_500_k5.json


python3: can't open file '/content/leace_vlm_eval.py': [Errno 2] No such file or directory
