In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torchvision import transforms
from torch.utils.data import DataLoader, Dataset
from PIL import Image
import numpy as np

In [4]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torchvision import transforms
from torch.utils.data import DataLoader, Dataset
from PIL import Image
import numpy as np
from glob import glob

class UnsupervisedFineTuner(nn.Module):
    def __init__(self, encoder_type='dinov2', freeze=False):
        super().__init__()
        
        if encoder_type == 'dinov2':
            self.encoder = torch.hub.load('facebookresearch/dinov2', 'dinov2_vitb14')
            dim = 768
        else:
            from efficientnet_pytorch import EfficientNet
            self.encoder = EfficientNet.from_pretrained('efficientnet-b5')
            dim = 2048
            
        if freeze:
            for p in self.encoder.parameters(): p.requires_grad = False
        
        self.encoder_type = encoder_type
        self.decoder = nn.Sequential(
            nn.Linear(dim, 512*7*7), nn.ReLU(), nn.Unflatten(1, (512,7,7)),
            nn.ConvTranspose2d(512,256,3,2,1,1), nn.BatchNorm2d(256), nn.ReLU(),
            nn.ConvTranspose2d(256,128,3,2,1,1), nn.BatchNorm2d(128), nn.ReLU(),
            nn.ConvTranspose2d(128,64,3,2,1,1), nn.BatchNorm2d(64), nn.ReLU(),
            nn.ConvTranspose2d(64,32,3,2,1,1), nn.BatchNorm2d(32), nn.ReLU(),
            nn.ConvTranspose2d(32,3,3,2,1,1), nn.Sigmoid()
        )
    
    def encode(self, x):
        if self.encoder_type == 'dinov2':
            return self.encoder(x)
        f = self.encoder.extract_features(x)
        return F.adaptive_avg_pool2d(f, 1).flatten(1)
    
    def forward(self, x):
        f = self.encode(x)
        return self.decoder(f), f

class UnlabeledDataset(Dataset):
    def __init__(self, paths, transform):
        self.paths, self.transform = paths, transform
    def __len__(self): return len(self.paths)
    def __getitem__(self, i):
        return self.transform(Image.open(self.paths[i]).convert('RGB'))

def train(model, loader, epochs, lr, device, freeze_enc=True):
    model.to(device)
    params = model.decoder.parameters() if freeze_enc else model.parameters()
    opt = torch.optim.Adam(params, lr=lr)
    
    for e in range(epochs):
        total = 0
        for imgs in loader:
            imgs = imgs.to(device)
            recon, _ = model(imgs)
            loss = F.mse_loss(recon, imgs)
            opt.zero_grad()
            loss.backward()
            nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            opt.step()
            total += loss.item()
        if (e+1) % 5 == 0: print(f"Epoch {e+1}: {total/len(loader):.6f}")
    return model

def main():
    # Load all images from car-img-hr directory
    img_dir = 'car-img-hr'
    paths = glob(f'{img_dir}/*.jpg') + glob(f'{img_dir}/*.png') + \
            glob(f'{img_dir}/*.jpeg') + glob(f'{img_dir}/*.JPG')
    print(f"Found {len(paths)} images in {img_dir}")
    
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    transform = transforms.Compose([
        transforms.Resize(256), transforms.CenterCrop(224), transforms.ToTensor()
    ])
    
    loader = DataLoader(UnlabeledDataset(paths, transform), 
                       batch_size=16, shuffle=True, num_workers=4)
    
    model = UnsupervisedFineTuner('dinov2', freeze=True)
    model = train(model, loader, 20, 1e-3, device, freeze_enc=True)
    
    for p in model.encoder.parameters(): p.requires_grad = True
    model = train(model, loader, 30, 1e-4, device, freeze_enc=False)
    
    torch.save(model.encoder.state_dict(), 'encoder.pth')
    return model

def extract_features(model, img_dir='car-img-hr', device='cuda'):
    paths = glob(f'{img_dir}/*.jpg') + glob(f'{img_dir}/*.png') + \
            glob(f'{img_dir}/*.jpeg') + glob(f'{img_dir}/*.JPG')
    
    model.eval().to(device)
    transform = transforms.Compose([
        transforms.Resize(256), transforms.CenterCrop(224), transforms.ToTensor()
    ])
    
    features = []
    for i in range(0, len(paths), 32):
        batch = torch.stack([transform(Image.open(p).convert('RGB')) 
                            for p in paths[i:i+32]]).to(device)
        with torch.no_grad():
            features.append(model.encode(batch).cpu().numpy())
    return np.vstack(features)

if __name__ == '__main__':
    model = main()
    features = extract_features(model)
    
    # Train classifier on extracted features
    from sklearn.neural_network import MLPClassifier
    # MLPClassifier((256,128), max_iter=500).fit(features, labels)


Found 57 images in car-img-hr


Using cache found in /home/soham/.cache/torch/hub/facebookresearch_dinov2_main


Epoch 5: 0.062256
Epoch 10: 0.031049
Epoch 15: 0.019045
Epoch 20: 0.014042
Epoch 5: 0.051431
Epoch 10: 0.041459
Epoch 15: 0.031801
Epoch 20: 0.023122
Epoch 25: 0.018062
Epoch 30: 0.013927


In [8]:
import torch
from torchvision import transforms
from PIL import Image
from pathlib import Path
import numpy as np

# Setup
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Transform
transform = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
])

# Get images
image_paths = list(Path('path/to/images').glob('*.jpg'))

# Extract features
all_features = []
with torch.no_grad():
    for i in range(0, len(image_paths), 32):
        batch_paths = image_paths[i:i+32]
        batch = torch.stack([transform(Image.open(p).convert('RGB')) for p in batch_paths]).to(device)
        features = model(batch)
        all_features.append(features.cpu().numpy())

features_array = np.vstack(all_features)
print(f"Features shape: {features_array.shape}")


Using cache found in /home/soham/.cache/torch/hub/facebookresearch_dinov2_main


ValueError: need at least one array to concatenate