In [31]:
import os
import torch
import torchvision.transforms as transforms
from torchvision import models
from PIL import Image
import numpy as np
import pickle
from transformers import ViTModel, ViTFeatureExtractor

In [32]:
# load list of charter img paths

folders = [
    'images/preprocessed_imgs/binarized_imgs/writable_area_goettweig',
    'images/preprocessed_imgs/binarized_imgs/writable_area_lambach',
    'images/preprocessed_imgs/binarized_imgs/writable_area_lilienfeld'
]
if isinstance(folders, str):
    folders = [folders]
charter_list = []
for folder in folders:
    files = os.listdir(folder)
    full_paths = [os.path.abspath(os.path.join(folder, f)) for f in files]
    charter_list.extend(full_paths)

charter_list

['/home/tschernn/becore-clustering/images/preprocessed_imgs/binarized_imgs/writable_area_goettweig/9a46a8a7fff18858c10c19cccd27c3f6.jpg',
 '/home/tschernn/becore-clustering/images/preprocessed_imgs/binarized_imgs/writable_area_goettweig/a98966f6b468309f575b4ee0d4ee3347.jpg',
 '/home/tschernn/becore-clustering/images/preprocessed_imgs/binarized_imgs/writable_area_goettweig/43f55a9807deb4045481bd892d7bf183.jpg',
 '/home/tschernn/becore-clustering/images/preprocessed_imgs/binarized_imgs/writable_area_goettweig/ac6db7b8bb8db9d2faed9aa8867c8b6e.jpg',
 '/home/tschernn/becore-clustering/images/preprocessed_imgs/binarized_imgs/writable_area_goettweig/2b3d6b207d7b5feab5eab4893a033f80.jpg',
 '/home/tschernn/becore-clustering/images/preprocessed_imgs/binarized_imgs/writable_area_goettweig/077b6733bfd190e5eb147af31d809642.jpg',
 '/home/tschernn/becore-clustering/images/preprocessed_imgs/binarized_imgs/writable_area_goettweig/a931fc89cf18f1a693b4eca75e3dae40.jpg',
 '/home/tschernn/becore-clustering

In [33]:
# check CUDA
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [34]:
# Define transformation (same for all models)
transform = transforms.Compose([
    transforms.Resize((512, 512)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

In [35]:
# Define feature extraction functions for different models

### 🔹 1. ResNet-50 (Modified for 512x512) with CUDA
def extract_features_resnet(image_paths):
    model = models.resnet50(pretrained=True)
    model.conv1 = torch.nn.Conv2d(3, 64, kernel_size=9, stride=1, padding=4, bias=False)  # Adjust first conv layer
    model.maxpool = torch.nn.Identity()  # Remove max pooling if needed
    model = torch.nn.Sequential(*list(model.children())[:-1])  # Remove final FC layer
    model = model.to(device).eval()  # Move to GPU

    features = []
    for img_path in image_paths:
        img = Image.open(img_path).convert("RGB")
        img = transform(img).unsqueeze(0).to(device)  # Move input to GPU
        with torch.no_grad():
            feat = model(img).squeeze().flatten().cpu().numpy()  # Flatten and move to CPU
        features.append(feat)

    return np.array(features)


### 🔹 2. EfficientNet-B4 (Handles 512x512 Natively) with CUDA
def extract_features_efficientnet(image_paths):
    model = models.efficientnet_b4(pretrained=True)
    model = torch.nn.Sequential(*list(model.children())[:-1])  # Remove final FC layer
    model = model.to(device).eval()  # Move to GPU

    features = []
    for img_path in image_paths:
        img = Image.open(img_path).convert("RGB")
        img = transform(img).unsqueeze(0).to(device)  # Move input to GPU
        with torch.no_grad():
            feat = model(img).squeeze().flatten().cpu().numpy()
        features.append(feat)

    return np.array(features)


### 🔹 3. Vision Transformer (DINO ViT) with CUDA
def extract_features_vit(image_paths):
    model = ViTModel.from_pretrained("facebook/dino-vitb16").to(device).eval()
    feature_extractor = ViTFeatureExtractor.from_pretrained("facebook/dino-vitb16")

    features = []
    for img_path in image_paths:
        img = Image.open(img_path).convert("RGB")
        inputs = feature_extractor(images=img, return_tensors="pt").to(device)  # Move input to GPU
        with torch.no_grad():
            feat = model(**inputs).last_hidden_state[:, 0, :].squeeze().cpu().numpy()  # CLS token feature
        features.append(feat)

    return np.array(features)

# Choose the model to run
model_choice = "resnet"  # Change to "efficientnet" or "vit"

if model_choice == "resnet":
    features = extract_features_resnet(charter_list)
elif model_choice == "efficientnet":
    features = extract_features_efficientnet(charter_list)
elif model_choice == "vit":
    features = extract_features_vit(charter_list)
else:
    raise ValueError("Invalid model choice!")

# save features
with open(f'pickles/features_{model_choice}.pkl', 'wb') as handle:
    pickle.dump(features, handle, protocol=pickle.HIGHEST_PROTOCOL)

print(f'saved features in pickles/features_{model_choice}.pkl')

saved features in pickles/features_resnet.pkl
