# Implementation of the SIMCLR with resnet50 backbone

In [3]:
from dotenv import load_dotenv
load_dotenv()       # reads .env and sets os.environ
import wandb
wandb.login()

[34m[1mwandb[0m: Currently logged in as: [33manaliju[0m ([33manaliju-paris[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

In [4]:
import os
import ssl
import zipfile
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import transforms, datasets, models
from torch.utils.data import DataLoader, random_split, Dataset, Subset
from torchvision.models import resnet50
from PIL import Image
import numpy as np
from sklearn.model_selection import StratifiedShuffleSplit
from torch.optim.lr_scheduler import LinearLR, CosineAnnealingLR, SequentialLR
import seaborn as sns

from utils.version_utils import print_versions, configure_gpu_device, set_seed


In [5]:

print_versions()
set_seed(seed=42)

TARGET_GPU_INDEX = 2

DEVICE = configure_gpu_device(TARGET_GPU_INDEX)

Conda version: 25.5.1
Python version: 3.10.16
PyTorch version: 2.5.1
CUDA available: True
CUDA device count: 3
Torchvision version: 0.20.1
Successfully set to use GPU: 2 (Quadro RTX 5000)
Final DEVICE variable is set to: cuda:2
Current PyTorch default device: 2
Current PyTorch default device (after set_device): 2
Dummy tensor is on device: cuda:2


In [4]:

# Prevent nondeterminism
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'
torch.backends.cudnn.enabled = False

CONFIG = {
    "LOCAL_OR_COLAB": "LOCAL",
    "DATA_DIR_LOCAL": "/share/DEEPLEARNING/carvalhj/EuroSAT_RGB/",
    "DATA_DIR_COLAB": "/content/EuroSAT_RGB",
    "ZIP_PATH": "/content/EuroSAT.zip",
    "EUROSAT_URL": "https://madm.dfki.de/files/sentinel/EuroSAT.zip",
    "SEED": 42,  
    "BATCH_SIZE": 128,
    "LR": 3.75e-4,
    "WD": 0.5,
    "LR_LINEAR": 3.75e-4,
    "EPOCHS_SIMCLR": 30,
    "EPOCHS_LINEAR": 10,
    "PROJ_DIM": 128,
    "FEATURE_DIM": 2048, # ResNet50 feature dimension = 2048
}

DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# split fractions
TRAIN_FRAC = 0.8
VAL_FRAC   = 0.1
TEST_FRAC  = 0.1

SEED = CONFIG["SEED"]

PRETRAINED = False

TEMPERATURE = 0.2

BETAS=(0.9,0.98)
EPS = 1e-8

LINEAR_PROB_TRAIN_SPLIT = 0.75

GLOBAL_SEED = CONFIG["SEED"]
NUM_WORKERS = 4

In [None]:


def prepare_data():
    if CONFIG["LOCAL_OR_COLAB"] == "LOCAL":
        return CONFIG["DATA_DIR_LOCAL"]

    if not os.path.exists(CONFIG["DATA_DIR_COLAB"]):
        print("Downloading EuroSAT RGB...")
        ssl._create_default_https_context = ssl._create_unverified_context
        urllib.request.urlretrieve(CONFIG["EUROSAT_URL"], CONFIG["ZIP_PATH"])
        with zipfile.ZipFile(CONFIG["ZIP_PATH"], 'r') as zip_ref:
            zip_ref.extractall("/content")
        os.rename("/content/2750", CONFIG["DATA_DIR_COLAB"])
        print("EuroSAT RGB dataset downloaded and extracted.")
    return CONFIG["DATA_DIR_COLAB"]


def compute_mean_std(dataset, batch_size):
    loader = DataLoader(dataset, batch_size, shuffle=False, num_workers=2)
    mean = 0.0
    std = 0.0
    n_samples = 0

    for data, _ in loader:
        batch_samples = data.size(0)
        data = data.view(batch_samples, data.size(1), -1)  # (B, C, H*W)
        mean += data.mean(2).sum(0)
        std += data.std(2).sum(0)
        n_samples += batch_samples

    mean /= n_samples
    std /= n_samples
    return mean.tolist(), std.tolist()


class TwoCropsTransform:
    def __init__(self, base_transform):
        self.base_transform = base_transform

    def __call__(self, x):
        return [self.base_transform(x), self.base_transform(x)]
    
class SimCLRDataset(Dataset):
    def __init__(self, dataset, transform):
        self.dataset = dataset
        self.transform = transform

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        x, _ = self.dataset[idx]
        x1, x2 = self.transform(x)
        return x1, x2


def get_proportion(num_classes, dataset):
    return np.bincount(np.array(dataset.dataset.targets)[dataset.indices], minlength=num_classes) / len(dataset)

def get_split_indexes(labels, total_count):
    n_train = int(np.floor(TRAIN_FRAC * total_count))
    n_temp = total_count - n_train   # this is val + test

    sss1 = StratifiedShuffleSplit(
        n_splits=1,
        train_size=n_train,
        test_size=n_temp,
        random_state=SEED
    )
    # Train and temp(val+test) indices
    train_idx, temp_idx = next(sss1.split(np.zeros(total_count), labels))

    n_val = int(np.floor(VAL_FRAC * total_count))
    n_test = total_count - n_train - n_val
    assert n_temp == n_val + n_test, "Fractions must sum to 1."

    labels_temp = labels[temp_idx]

    sss2 = StratifiedShuffleSplit(
        n_splits=1,
        train_size=n_val,
        test_size=n_test,
        random_state=SEED
    )
    val_idx_in_temp, test_idx_in_temp = next(sss2.split(np.zeros(len(temp_idx)), labels_temp))

    val_idx = temp_idx[val_idx_in_temp]
    test_idx = temp_idx[test_idx_in_temp]

    assert len(train_idx) == n_train
    assert len(val_idx) == n_val
    assert len(test_idx) == n_test

    print(f"Stratified split sizes: train={len(train_idx)}, val={len(val_idx)}, test={len(test_idx)}")
    return train_idx,val_idx,test_idx

def get_data_loaders(data_dir, batch_size):

    dataset_for_stats = datasets.ImageFolder(
        root=data_dir,
        transform=transforms.ToTensor()
    )
    total_len = len(dataset_for_stats)
    labels = np.array(dataset_for_stats.targets)
    num_classes = len(dataset_for_stats.classes)
    print(f"Total samples in folder: {total_len}, classes: {dataset_for_stats.classes}")

    train_indices, val_indices, test_indices = get_split_indexes(labels, total_len)

    train_for_stats_subset = Subset(dataset_for_stats, train_indices)
    mean, std = compute_mean_std(train_for_stats_subset, batch_size)
    print(f"Computed mean: {mean}")
    print(f"Computed std:  {std}")

    dataset_train_no_transform = datasets.ImageFolder(
        root=data_dir,
        transform=None
    )
    train_subset_no_transform = Subset(dataset_train_no_transform, train_indices)

    eval_transform = transforms.Compose([
        transforms.Resize(72),
        transforms.CenterCrop(64),
        transforms.ToTensor(),
        transforms.Normalize(mean=mean, std=std),
    ])
    dataset_eval = datasets.ImageFolder(
        root=data_dir,
        transform=eval_transform
    )
    val_subset = Subset(dataset_eval, val_indices)
    test_subset = Subset(dataset_eval, test_indices)

    normalize = transforms.Normalize(mean=mean, std=std)
    augment_transform = transforms.Compose([
        transforms.RandomResizedCrop(64, scale=(0.5, 1.0)),
        transforms.RandomHorizontalFlip(),
        transforms.RandomGrayscale(p=0.2),
        transforms.ToTensor(),
        normalize,
    ])
    simclr_transform = TwoCropsTransform(augment_transform)
    train_ds_simclr = SimCLRDataset(train_subset_no_transform, simclr_transform)

    train_loader = DataLoader(
        train_ds_simclr,
        batch_size=batch_size,
        shuffle=True,
        drop_last=True,
        num_workers=NUM_WORKERS,
        generator=torch.Generator().manual_seed(SEED)
    )
    val_loader = DataLoader(
        val_subset,
        batch_size=batch_size,
        shuffle=False,
        num_workers=NUM_WORKERS,
        generator=torch.Generator().manual_seed(SEED)
    )
    test_loader = DataLoader(
        test_subset,
        batch_size=batch_size,
        shuffle=False,
        num_workers=NUM_WORKERS,
        generator=torch.Generator().manual_seed(SEED)
    )

    print(f"Train/Val/Test loaders: {len(train_loader)}/{len(val_loader)}/{len(test_loader)} batches")

    return train_loader, val_loader, test_loader, num_classes
class ProjectionHead(nn.Module):
    def __init__(self, input_dim, proj_dim=128, hidden_dim=2048):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.BatchNorm1d(hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, proj_dim)
        )

    def forward(self, x):
        return self.net(x)

class SimCLRModel(nn.Module):
    def __init__(self, base_encoder, proj_dim=128):
        super().__init__()
        self.encoder = base_encoder
        self.encoder.fc = nn.Identity()
        self.projection_head = ProjectionHead(input_dim=CONFIG["FEATURE_DIM"], proj_dim=proj_dim)

    def forward(self, x):
        feat = self.encoder(x)
        proj = self.projection_head(feat)
        return feat, proj

class NTXentLoss(nn.Module):
    def __init__(self, batch_size, temperature=0.5, device='cuda'):
        super().__init__()
        self.temperature = temperature
        self.batch_size = batch_size
        self.device = device
        self.criterion = nn.CrossEntropyLoss()

    def forward(self, zis, zjs):
        N = zis.size(0)
        z = F.normalize(torch.cat([zis, zjs], dim=0), dim=1)
        sim = torch.matmul(z, z.T) / self.temperature
        mask = torch.eye(2 * N, dtype=torch.bool).to(self.device)
        sim = sim.masked_fill(mask, -1e9)
        labels = torch.cat([torch.arange(N, 2 * N), torch.arange(0, N)]).to(self.device)
        return self.criterion(sim, labels)

def train_simclr(model, train_loader, val_loader, optimizer, criterion, device, epochs, scheduler=None):
    model.train()
    model.to(device)
    # Start a W&B run here
   
    for epoch in range(epochs):
        total_loss = 0
        for (x1, x2) in train_loader:
            x1, x2 = x1.to(device), x2.to(device)
            _, z1 = model(x1)
            _, z2 = model(x2)
            loss = criterion(z1, z2)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            if scheduler:
                scheduler.step()
            total_loss += loss.item()
        avg = total_loss / len(train_loader)

        print(f"[SimCLR] Epoch {epoch+1}/{epochs} - Train Loss: {avg:.4f}%")
        
    print("Finished SimCLR pretraining.")

def evaluate(classifier, backbone, loader, device):
    classifier.eval()
    correct, total = 0, 0
    with torch.no_grad():
        for images, labels in loader:
            images, labels = images.to(device), labels.to(device)
            features = backbone(images)
            outputs = classifier(features)
            total += labels.size(0)
            correct += (outputs.argmax(1) == labels).sum().item()
    return correct / total * 100

def make_optimizer_scheduler(params, lr, wd, steps_per_epoch, epochs):
    total_steps  = epochs * steps_per_epoch
    warmup_steps = steps_per_epoch
    opt = optim.AdamW(params, lr=lr, betas=(0.9,0.98), eps=1e-8, weight_decay=wd)
    sched = SequentialLR(
        opt,
        schedulers=[
            LinearLR(opt,  start_factor=1e-6, end_factor=1.0, total_iters=warmup_steps),
            CosineAnnealingLR(opt, T_max=total_steps - warmup_steps)
        ],
        milestones=[warmup_steps]
    )
    return opt, sched

In [None]:
def train_linear_probe(backbone, train_loader, val_loader, device, epochs, lr, run_id):
    for p in backbone.parameters():
        p.requires_grad = False

    base_ds = train_loader.dataset
    while isinstance(base_ds, Subset):
        base_ds = base_ds.dataset
    num_classes = len(base_ds.classes)

    classifier = nn.Linear(CONFIG["FEATURE_DIM"], num_classes).to(device)

    optimizer, sched = make_optimizer_scheduler(
        classifier.parameters(),
        lr=lr,
        wd=CONFIG["WD"],
        steps_per_epoch=len(train_loader),
        epochs=epochs
    )
    wandb.init(
        project="EuroSAT_SimCLR_LinearProbe",
        name=f"LinearProbe_Seed{run_id}",
        config=CONFIG,
    )
    wandb.watch(classifier, log="all", log_freq=100)
    # Define the loss function

    criterion = nn.CrossEntropyLoss()


    for epoch in range(epochs):
        classifier.train()
        correct, total = 0, 0
        for images, labels in train_loader:
            images, labels = images.to(device), labels.to(device)
            features = backbone(images)
            outputs = classifier(features)
            loss = criterion(outputs, labels)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            if sched:
                sched.step()
            total += labels.size(0)
            correct += (outputs.argmax(1) == labels).sum().item()

        train_acc = correct / total * 100
        val_acc = evaluate(classifier, backbone, val_loader, device)
        print(f"[Linear] Epoch {epoch+1}/{epochs} - "
              f"Train Acc: {train_acc:.2f}%, Val Acc: {val_acc:.2f}%")

    torch.save(classifier.state_dict(), f"models/linear_probe_seed{run_id}.pth")
    return val_acc

def load_evaluate_model(model_path, device, data_dir, seed):

    wandb.init(
        project="EuroSAT_SimCLR_LinearProbe",
        name=f"LinearProbe_Seed{seed}",
        config=CONFIG,
    )
    results = []

    backbone = resnet50(weights=None if not PRETRAINED else "DEFAULT")
    backbone.fc = nn.Identity()  # same as in SimCLRModel
    backbone.load_state_dict(torch.load(model_path), strict=False)
    backbone.to(device)    
    backbone.eval()
    for p in backbone.parameters():
        p.requires_grad = False

    # (Make sure 'data_dir' and 'seed' are in scope; if not, pass them in.)
    _, _, test_loader, _ = get_data_loaders(data_dir, CONFIG["BATCH_SIZE"])
    print(f"Starting linear probe on EuroSAT test split (seed={seed})...")

    # Split EuroSAT test‐subset into 80%/20% for probe‐train vs. probe‐val
    full_test_ds = test_loader.dataset  # this is a Subset of dataset_eval
    train_size = int(LINEAR_PROB_TRAIN_SPLIT * len(full_test_ds))
    val_size = len(full_test_ds) - train_size
    train_dataset, val_dataset = random_split(full_test_ds, [train_size, val_size])

    train_loader_from_test = DataLoader(
        train_dataset,
        batch_size=CONFIG["BATCH_SIZE"],
        shuffle=True,
        num_workers=2
    )
    val_loader_from_test = DataLoader(
        val_dataset,
        batch_size=CONFIG["BATCH_SIZE"],
        shuffle=False,
        num_workers=2
    )

    linear_probe_val_acc = train_linear_probe(
        backbone,
        train_loader_from_test,
        val_loader_from_test,
        DEVICE,
        epochs=CONFIG["EPOCHS_LINEAR"],
        lr=CONFIG["LR_LINEAR"],
        run_id=seed
    )
    wandb.log({"linear_probe_val_acc": linear_probe_val_acc})
    wandb.finish()
    print(f"[Linear‐Probe on EuroSAT test] Final Val Acc = {linear_probe_val_acc:.2f}%\n")

    results.append({
        "seed": seed,
        "val_acc": linear_probe_val_acc
    })
    with open("models/constrastive_linear_probe_results.txt", "a") as f:
        f.write(f"Seed: {seed}, Val Acc: {linear_probe_val_acc:.2f}%\n")
    print("Results saved to linear_probe_results.txt")
    return results

In [None]:

seeds = [GLOBAL_SEED]

for seed in seeds:
    print(f"\n=== Starting run with seed {seed} ===")
    set_seed(seed)
    
    data_dir = prepare_data()
    train_loader, val_loader, test_loader, num_classes = get_data_loaders(data_dir, CONFIG["BATCH_SIZE"])

    # Initialize base encoder and SimCLR model
    base_encoder = resnet50(weights=None)
    simclr_model = SimCLRModel(base_encoder, proj_dim=CONFIG["PROJ_DIM"])
    # optimizer = optim.Adam(simclr_model.parameters(), lr=CONFIG["LR"])
    wd =  0.5 
    optimizer, scheduler = make_optimizer_scheduler(
        simclr_model.parameters(),
        CONFIG["LR"],
        CONFIG["WD"],
        len(train_loader),
        CONFIG["EPOCHS_SIMCLR"]
        )
    
    bs = CONFIG["BATCH_SIZE"]
    loss_fn = NTXentLoss(bs, temperature=TEMPERATURE, device=DEVICE)

    print("Starting SimCLR training...")
    lr = CONFIG["LR"]
    wandb_run = wandb.init(
        project="eurosat-contrastive-scratch-grid-search",
        name=f"BS{bs}_LR{lr:.0e}_SEED{seed}_TEMPERATURE{TEMPERATURE}",
        config={
            "seed": seed,
            "temperature": TEMPERATURE,
            "model": "SimCLR",
            "dataset": "EuroSAT",
            "batch_size": bs,
            "learning_rate": CONFIG["LR"],
            "epochs": CONFIG["EPOCHS_SIMCLR"],
            "proj_dim": CONFIG["PROJ_DIM"],
            "feature_dim": CONFIG["FEATURE_DIM"],
            "pretrained": PRETRAINED,
        }
    )
    train_simclr(simclr_model, train_loader, val_loader, optimizer, loss_fn, DEVICE, CONFIG["EPOCHS_SIMCLR"], scheduler=scheduler)
    wandb_run.finish()

    print("Saving encoder...")
    torch.save(simclr_model.state_dict(), f"simclr_model_seed{seed}_temperature{TEMPERATURE}_bs{bs}.pth")



=== Starting run with seed 42 ===
Total samples in folder: 27000, classes: ['AnnualCrop', 'Forest', 'HerbaceousVegetation', 'Highway', 'Industrial', 'Pasture', 'PermanentCrop', 'Residential', 'River', 'SeaLake']
Stratified split sizes: train=21600, val=2700, test=2700
Computed mean: [0.3441457748413086, 0.38009852170944214, 0.40766361355781555]
Computed std:  [0.09299740195274353, 0.06464490294456482, 0.054139167070388794]
Train/Val/Test loaders: 168/22/22 batches
Starting SimCLR training...


[SimCLR] Epoch 1/30 - Train Loss: 3.5113%




[SimCLR] Epoch 2/30 - Train Loss: 2.1347%
[SimCLR] Epoch 3/30 - Train Loss: 1.7883%
[SimCLR] Epoch 4/30 - Train Loss: 1.6446%


In [None]:
import torch
import os

print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available (torch): {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"CUDA version PyTorch was built with: {torch.version.cuda}")
    print(f"Number of GPUs: {torch.cuda.device_count()}")
    for i in range(torch.cuda.device_count()):
        print(f"  GPU {i}: {torch.cuda.get_device_name(i)}")
else:
    print("PyTorch reports CUDA is NOT available.")

# Check environment variables directly from Python
print(f"CUDA_HOME: {os.environ.get('CUDA_HOME')}")
print(f"PATH: {os.environ.get('PATH')}")
print(f"LD_LIBRARY_PATH: {os.environ.get('LD_LIBRARY_PATH')}")

PyTorch version: 2.5.1
CUDA available (torch): True
CUDA version PyTorch was built with: 12.1
Number of GPUs: 3
  GPU 0: Quadro RTX 5000
  GPU 1: Quadro RTX 5000
  GPU 2: Quadro RTX 5000
CUDA_HOME: None
PATH: /users/c/carvalhj/miniconda3/envs/myenv/bin:/users/c/carvalhj/.vscode-server/cli/servers/Stable-dfaf44141ea9deb3b4096f7cd6d24e00c147a4b1/server/bin/remote-cli:/users/c/carvalhj/miniconda3/envs/myenv/bin:/users/c/carvalhj/miniconda3/condabin:/users/c/carvalhj/bin:/usr/local/bin:/usr/bin:/bin
LD_LIBRARY_PATH: None


In [None]:

# Run the evaluation
for seed in seeds:
    results = load_evaluate_model(f"simclr_model_seed{seed}.pth", DEVICE, data_dir, seed)
    print(f"Results for seed {seed}: {results}")
    

  backbone.load_state_dict(torch.load(model_path), strict=False)


Total samples in folder: 27000, classes: ['AnnualCrop', 'Forest', 'HerbaceousVegetation', 'Highway', 'Industrial', 'Pasture', 'PermanentCrop', 'Residential', 'River', 'SeaLake']
Stratified split sizes: train=21600, val=2700, test=2700
Computed mean: [0.3441457748413086, 0.38009852170944214, 0.40766361355781555]
Computed std:  [0.09299740195274353, 0.06464490294456482, 0.054139167070388794]
Train/Val/Test loaders: 168/22/22 batches
Starting linear probe on EuroSAT test split (seed=42)...




[Linear] Epoch 1/10 - Train Acc: 9.73%, Val Acc: 12.00%
[Linear] Epoch 2/10 - Train Acc: 11.36%, Val Acc: 17.33%
[Linear] Epoch 3/10 - Train Acc: 14.07%, Val Acc: 13.63%
[Linear] Epoch 4/10 - Train Acc: 15.41%, Val Acc: 14.22%
[Linear] Epoch 5/10 - Train Acc: 19.26%, Val Acc: 17.48%
[Linear] Epoch 6/10 - Train Acc: 21.43%, Val Acc: 23.11%
[Linear] Epoch 7/10 - Train Acc: 22.37%, Val Acc: 22.96%
[Linear] Epoch 8/10 - Train Acc: 24.10%, Val Acc: 23.11%
[Linear] Epoch 9/10 - Train Acc: 23.26%, Val Acc: 22.81%
[Linear] Epoch 10/10 - Train Acc: 24.64%, Val Acc: 22.81%


0,1
linear_probe_val_acc,▁

0,1
linear_probe_val_acc,22.81481


[Linear‐Probe on EuroSAT test] Final Val Acc = 22.81%

Results saved to linear_probe_results.txt
Results for seed 42: [{'seed': 42, 'val_acc': 22.814814814814817}]
