In [None]:
import torch
from torch.utils.data import DataLoader
from torchvision import transforms
from torchvision.datasets import CIFAR10
from transformers import AutoTokenizer, TrainingArguments, Trainer, CLIPProcessor, CLIPModel
from peft import get_peft_model, LoraConfig, TaskType, RandLoRAConfig
from peft.tuners.lora import LoraLayer

from datasets import Dataset
import numpy as np
from math import floor

In [None]:
import os
import shutil

def create_split(dtd_root, split_file, split_name):
    with open(os.path.join(dtd_root, "labels", split_file), "r") as f:
        lines = [line.strip() for line in f]

    for rel_path in lines:
        class_name, filename = rel_path.split("/")
        src = os.path.join(dtd_root, "images", class_name, filename)
        dst = os.path.join(dtd_root, split_name, class_name, filename)
        os.makedirs(os.path.dirname(dst), exist_ok=True)
        if not os.path.exists(dst):
            shutil.copy2(src, dst)

# Set this to your actual dataset path
dtd_root = "/home/guyb/Documents/data/dtd"

create_split(dtd_root, "train1.txt", "train")
create_split(dtd_root, "val1.txt", "val")

print("✅ DTD train/val directories created successfully.")


In [None]:
import os
import subprocess
import zipfile
import shutil
import requests
from tqdm import tqdm
from sklearn.model_selection import train_test_split

# Correct path for training script compatibility
base_path = os.path.expanduser("~/Documents/data")  # This is where the model expects it

# Ensure base folders exist
os.makedirs(base_path, exist_ok=True)

# Config
download_url = "https://zenodo.org/record/7711810/files/EuroSAT_RGB.zip"
zip_path = base_path + "/EuroSAT_RGB.zip"
extract_to = base_path + "/EuroSAT_raw"
dataset_root = os.path.join(extract_to, "EuroSAT_RGB")
split_output = base_path + "/EuroSAT_splits"
val_ratio = 0.2
seed = 42

# Step 1: Download if not already present
if not os.path.exists(zip_path):
    print("Downloading EuroSAT_RGB.zip...")
    response = requests.get(download_url, stream=True)
    total_size = int(response.headers.get('content-length', 0))
    with open(zip_path, "wb") as f, tqdm(
        desc="Downloading",
        total=total_size,
        unit="B",
        unit_scale=True,
        unit_divisor=1024,
    ) as bar:
        for chunk in response.iter_content(chunk_size=1024):
            if chunk:
                f.write(chunk)
                bar.update(len(chunk))
else:
    print("✅ EuroSAT_RGB.zip already exists.")

# Step 2: Extract
if not os.path.exists(dataset_root):
    print("Extracting...")
    subprocess.run(["unzip", "-oq", zip_path, "-d", extract_to], check=True)
else:
    print("✅ Dataset already extracted.")

# Step 3: Split and organize
for split in ['train', 'val']:
    os.makedirs(os.path.join(split_output, split), exist_ok=True)

for class_name in os.listdir(dataset_root):
    full_class_path = os.path.join(dataset_root, class_name)
    if not os.path.isdir(full_class_path):
        continue

    images = os.listdir(full_class_path)
    train_imgs, val_imgs = train_test_split(images, test_size=val_ratio, random_state=seed)

    for split, img_list in zip(["train", "val"], [train_imgs, val_imgs]):
        class_output = os.path.join(split_output, split, class_name)
        os.makedirs(class_output, exist_ok=True)
        for img in img_list:
            src = os.path.join(full_class_path, img)
            dst = os.path.join(class_output, img)
            shutil.copyfile(src, dst)

print("✅ Finished preparing EuroSAT_splits with train/val folders.")


In [None]:
# --------------------------------------------------------------------------------------------------
# Build RESISC‑45 exactly the way the vision_language repo expects it
# --------------------------------------------------------------------------------------------------
import os, shutil, hashlib
from pathlib import Path
from datasets import load_dataset
from tqdm.auto import tqdm
from PIL import Image

# ── config ─────────────────────────────────────────────────────────────────────────
BASE_DIR    = Path("~/Documents/data").expanduser()   # <‑‑ change if you want
RES_ROOT    = BASE_DIR / "resisc45"                   # root given to your script
IMG_ROOT    = RES_ROOT / "NWPU-RESISC45"              # where ImageFolder will read
MAKE_LINKS  = True                                    # True = symlink, False = copy
SPLITS      = ("train", "test")                # HF splits match paper splits
# ----------------------------------------------------------------------------------

def safe_mkdir(p: Path):
    p.mkdir(parents=True, exist_ok=True)

def reset_dir(p: Path):
    if p.exists():
        shutil.rmtree(p)
    safe_mkdir(p)

# 1️⃣ start fresh (optional – comment out if you DON’T want to erase previous work)
reset_dir(IMG_ROOT)
safe_mkdir(RES_ROOT)          # keep split files outside the image tree

# 2️⃣ load HF dataset (caches under ~/.cache/huggingface by default, so 2nd run is instant)
ds = load_dataset("timm/resisc45")

# 3️⃣ write images + gather split file names
split_lists = {s: [] for s in SPLITS}

for split in SPLITS:
    print(f"⚙️  processing {split} split …")
    for ex in tqdm(ds[split]):
        label_idx  = ex["label"]
        cls_name   = ds[split].features["label"].int2str(label_idx)
        img: Image.Image = ex["image"]

        # filename: keep original SHA‑1 HF hash to avoid name clashes
        sha1 = hashlib.sha1(img.tobytes()).hexdigest()[:16]
        fname = f"{cls_name}_{sha1}.jpg"

        # destination path
        dest_dir  = IMG_ROOT / cls_name
        dest_path = dest_dir / fname
        safe_mkdir(dest_dir)

        if not dest_path.exists():
            if MAKE_LINKS and "file" in ex and Path(ex["file"]).is_file():
                # create a *symlink* to the cached file
                os.symlink(os.path.abspath(ex["file"]), dest_path)
            else:
                # fall back to save/copy
                img.save(dest_path)

        split_lists[split].append(fname)

# 4️⃣ write split text files (filenames only, one per line)
for split, names in split_lists.items():
    txt_path = RES_ROOT / f"resisc45-{split}.txt"
    with open(txt_path, "w") as f:
        f.write("\n".join(names))
    print(f"✅ wrote {txt_path.relative_to(BASE_DIR)}  ({len(names)} images)")

print("\n🎉  RESISC‑45 ready at:", RES_ROOT)
print("   Image root for ImageFolder :", IMG_ROOT)


In [None]:
import os, shutil, hashlib
from pathlib import Path
from datasets import load_dataset
from tqdm.auto import tqdm
from PIL import Image

# ── config ─────────────────────────────────────────────────────────────
BASE_DIR    = Path("~/Documents/data").expanduser()
SUN_ROOT    = BASE_DIR / "sun397"
IMG_ROOT    = SUN_ROOT / "SUN397"
SPLIT_DIRS  = {s: SUN_ROOT / s for s in ("train", "test")}
SPLITS      = ("train", "test")
MAKE_LINKS  = True  # True = symlink original, False = save image
# ───────────────────────────────────────────────────────────────────────

def safe_mkdir(p: Path):
    p.mkdir(parents=True, exist_ok=True)

def reset_dir(p: Path):
    if p.exists():
        shutil.rmtree(p)
    safe_mkdir(p)

# 1️⃣ Clean previous work (optional)
reset_dir(IMG_ROOT)
for p in SPLIT_DIRS.values():
    reset_dir(p)
safe_mkdir(SUN_ROOT)

# 2️⃣ Load HF dataset
ds = load_dataset("tanganke/sun397")

# 3️⃣ Save images and create class folders
split_lists = {s: [] for s in SPLITS}
for split in SPLITS:
    print(f"Processing split: {split}")
    for ex in tqdm(ds[split]):
        label_idx = ex["label"]
        cls = ds[split].features["label"].int2str(label_idx)
        img: Image.Image = ex["image"]
        sha1 = hashlib.sha1(img.tobytes()).hexdigest()[:16]
        fname = f"{cls}_{sha1}.jpg"

        src_dir = IMG_ROOT / cls
        dst_dir = SPLIT_DIRS[split] / cls
        src_path = src_dir / fname
        dst_path = dst_dir / fname

        safe_mkdir(src_dir)
        safe_mkdir(dst_dir)

        if not src_path.exists():
            if MAKE_LINKS and "file" in ex and Path(ex["file"]).is_file():
                os.symlink(os.path.abspath(ex["file"]), src_path)
            else:
                if img.mode != "RGB":
                    img = img.convert("RGB")
                img.save(src_path)

        if not dst_path.exists():
            os.symlink(src_path, dst_path)

        split_lists[split].append(fname)

# 4️⃣ Save list of image files (optional)
for split, names in split_lists.items():
    txt_path = SUN_ROOT / f"sun397-{split}.txt"
    with open(txt_path, "w") as f:
        f.write("\n".join(names))
    print(f"✅ wrote {txt_path.relative_to(BASE_DIR)}  ({len(names)} images)")
    print(f"✅ ready at split folder: {SPLIT_DIRS[split].relative_to(BASE_DIR)}")

print("\n🎉  SUN397 ready at:", SUN_ROOT)
print("   Use these in ImageFolder:")
print("    ", SPLIT_DIRS["train"])
print("    ", SPLIT_DIRS["test"])


In [None]:
import os, shutil, hashlib
from pathlib import Path
from datasets import load_dataset
from tqdm.auto import tqdm
from PIL import Image

# ── config ─────────────────────────────────────────────────────────────
BASE_DIR   = Path("~/Documents/data").expanduser()
SUN_ROOT   = BASE_DIR / "sun397_val_only"
IMG_ROOT   = SUN_ROOT / "SUN397"
SPLIT      = "val"
SPLIT_DIR  = SUN_ROOT / SPLIT
MAKE_LINKS = True  # True = symlink original, False = save image
# ───────────────────────────────────────────────────────────────────────

def safe_mkdir(p: Path):
    p.mkdir(parents=True, exist_ok=True)

def reset_dir(p: Path):
    if p.exists():
        shutil.rmtree(p)
    safe_mkdir(p)

# 1️⃣ Clean previous work
reset_dir(IMG_ROOT)
reset_dir(SPLIT_DIR)
safe_mkdir(SUN_ROOT)

# 2️⃣ Load HF dataset
ds = load_dataset("tanganke/sun397")

# 3️⃣ Save images and create class folders
split_list = []
print(f"Processing split: {SPLIT}")
for ex in tqdm(ds[SPLIT]):
    label_idx = ex["label"]
    cls = ds[SPLIT].features["label"].int2str(label_idx)
    img: Image.Image = ex["image"]
    sha1 = hashlib.sha1(img.tobytes()).hexdigest()[:16]
    fname = f"{cls}_{sha1}.jpg"

    src_dir = IMG_ROOT / cls
    dst_dir = SPLIT_DIR / cls
    src_path = src_dir / fname
    dst_path = dst_dir / fname

    safe_mkdir(src_dir)
    safe_mkdir(dst_dir)

    if not src_path.exists():
        if MAKE_LINKS and "file" in ex and Path(ex["file"]).is_file():
            os.symlink(os.path.abspath(ex["file"]), src_path)
        else:
            if img.mode != "RGB":
                img = img.convert("RGB")
            img.save(src_path)

    if not dst_path.exists():
        os.symlink(src_path, dst_path)

    split_list.append(fname)

# 4️⃣ Save list of image files (optional)
txt_path = SUN_ROOT / f"sun397-{SPLIT}.txt"
with open(txt_path, "w") as f:
    f.write("\n".join(split_list))
print(f"✅ wrote {txt_path.relative_to(BASE_DIR)}  ({len(split_list)} images)")
print(f"✅ ready at split folder: {SPLIT_DIR.relative_to(BASE_DIR)}")

print("\n🎉  SUN397 val split ready at:", SUN_ROOT)
print("   Use this in ImageFolder:")
print("    ", SPLIT_DIR)


In [7]:
import os
import requests
import tarfile
import zipfile
from pathlib import Path

def download_file(url: str, dest_path: Path):
    print(f"⬇️ Downloading from:\n{url}")
    with requests.get(url, stream=True, allow_redirects=True) as r:
        r.raise_for_status()
        with open(dest_path, 'wb') as f:
            for chunk in r.iter_content(chunk_size=8192):
                f.write(chunk)

def extract_file(archive_path: Path, dest_dir: Path):
    print(f"🗂️ Extracting {archive_path.name} into {dest_dir}...")
    if archive_path.suffix == ".zip":
        with zipfile.ZipFile(archive_path, 'r') as zip_ref:
            zip_ref.extractall(dest_dir)
    elif archive_path.suffix == ".tar":
        with tarfile.open(archive_path, "r:") as tar_ref:
            tar_ref.extractall(dest_dir)
    else:
        raise ValueError(f"Unsupported archive type: {archive_path}")

def setup_caltech101(root: Path):
    root.mkdir(parents=True, exist_ok=True)

    files = {
        "caltech-101.zip": "https://data.caltech.edu/records/mzrjq-6wc02/files/caltech-101.zip",
        "Annotations.tar": "https://www.vision.caltech.edu/Image_Datasets/Caltech101/Annotations.tar"
    }


    for fname, url in files.items():
        dest_file = root / fname
        if not dest_file.exists():
            download_file(url, dest_file)
        else:
            print(f"✅ Already downloaded: {fname}")
        extract_file(dest_file, root)

if __name__ == "__main__":
    ROOT = Path.home() / "data" / "caltech101"
    setup_caltech101(ROOT)
    print("✅ Dataset ready at", ROOT)


✅ Already downloaded: caltech-101.zip
🗂️ Extracting caltech-101.zip into /home/guyb/data/caltech101...
⬇️ Downloading from:
https://www.vision.caltech.edu/Image_Datasets/Caltech101/Annotations.tar


HTTPError: 404 Client Error: Not Found for url: https://www.vision.caltech.edu/Image_Datasets/Caltech101/Annotations.tar

In [None]:
# 1. Load CIFAR10
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize((0.5,), (0.5,)),
])

trainset = CIFAR10(root='./data', train=True, download=True, transform=transform)
testset = CIFAR10(root='./data', train=False, download=True, transform=transform)

In [None]:
# 2. Wrap into HF dataset for Trainer compatibility
def convert_to_hf(ds):
    return Dataset.from_dict({
        'pixel_values': [img.numpy() for img, _ in ds],
        'labels': [label for _, label in ds],
    })

train_dataset = convert_to_hf(trainset)
eval_dataset = convert_to_hf(testset)

In [None]:
# 3. Load CLIP (ViT-B-32) model & processor
model_name = "openai/clip-vit-base-patch32"
model = CLIPModel.from_pretrained(model_name)
processor = CLIPProcessor.from_pretrained(model_name)

In [None]:
# 4. Add RandLoRA
peft_config = RandLoRAConfig(
    r=6,
    target_modules=["q_proj", "v_proj"],  # Modify as needed for CLIP
    task_type=TaskType.VISION,
    rank_pattern="normal",  # optional
)

model = get_peft_model(model, peft_config)

In [None]:
# 5. Data preprocessing
def preprocess(batch):
    inputs = processor(images=[torch.tensor(x) for x in batch["pixel_values"]], return_tensors="pt", padding=True)
    inputs["labels"] = batch["labels"]
    return inputs

train_dataset = train_dataset.map(preprocess, batched=True)
eval_dataset = eval_dataset.map(preprocess, batched=True)

In [None]:

# Assume batch_size is 32 to compute gradient accumulation steps (128 // 32 = 4)
batch_size = 32
num_grad_accumulation = floor(128 / batch_size)

training_args = TrainingArguments(
    output_dir="./randlora_cifar10",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=5,                           # or set dynamically from args.epochs
    learning_rate=1e-3,                           # match your default classifier LR (if not using lp_clip)
    weight_decay=0.01,                            # match args.wd (adjust if different)
    gradient_accumulation_steps=num_grad_accumulation,
    lr_scheduler_type="cosine",                   # match your cosine_lr() scheduler
    logging_steps=10,
    report_to="none",
    remove_unused_columns=False,
    fp16=False,                                   # use bf16 in your script, so keep this False unless you prefer fp16
    bf16=True if torch.cuda.is_available() else False,
)


In [None]:
# 7. Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
)

In [None]:
# 8. Train
trainer.train()