# SimCLR 

This notebook is partially based on the source code from: https://github.com/The-AI-Summer/simclr

## Imports

In [None]:
from utils.SimCLR import *

## Data

In [None]:
# Seed NumPy and PyTorch (CPU and the current CUDA device) with one value.
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)

# cuDNN: request deterministic algorithms and disable the benchmark auto-tuner.
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

# NOTE(review): `device` is not defined in this notebook; presumably it is
# provided by the star import from utils.SimCLR.
print(f"Detected Device: {device}")

In [None]:
# Read the mapping CSV, shuffle all of its rows (frac=1) and renumber the
# index from 0.
data_df = (
    pd.read_csv("data/nearest_places_mapping.csv")
    .sample(frac=1)
    .reset_index(drop=True)
)

## Plots

In [None]:
# Preview a sample batch built without a transform (transform=None).
# The Augment instance that used to be created here was never passed to the
# loader, so it has been dropped.
loader = get_data_loader(
    data_df,
    BATCH_SIZE,
    base_path='Eynsham/Images',
    transform=None,
    shuffle=True,
)
plot_sample_batch(loader, num_examples=4, num_views=5)

In [None]:
# Preview a sample batch with the Augment(img_size=224) transform applied by
# the loader.
augmenter = Augment(img_size=224)
loader = get_data_loader(
    data_df,
    BATCH_SIZE,
    base_path='Eynsham/Images',
    transform=augmenter,
    shuffle=True,
)
plot_sample_batch(loader, num_examples=4, num_views=5)

## Large Embedding - 64

In [None]:
# Fresh copy of the mapping CSV for pre-training: all rows, shuffled, with a
# new 0-based index.
data_df = (
    pd.read_csv("data/nearest_places_mapping.csv")
    .sample(frac=1)
    .reset_index(drop=True)
)

In [None]:
# Pre-train SimCLR with the large embedding size on the full shuffled table,
# then store the weights under models/.
model = SimCLR_pl(
    EMBEDDING_SIZE_LARGE,
    MLP_DIM,
    parallel_views=True,
    use_adapter=False,
)

# Number of CUDA devices visible to PyTorch; passed to the Trainer as `devices`.
available_gpus = torch.cuda.device_count()

transform = Augment(img_size=224)
data_loader = get_data_loader(
    data_df,
    BATCH_SIZE,
    base_path='Eynsham/Images',
    transform=transform,
    shuffle=True,
)

# Accumulate gradients over GRADIENT_ACCUMULATION_STEPS batches from epoch 0 on.
accumulator = GradientAccumulationScheduler(scheduling={0: GRADIENT_ACCUMULATION_STEPS})
trainer = Trainer(
    max_epochs=MAX_EPOCHS,
    accelerator='gpu',
    devices=available_gpus,
    amp_backend="native",
    callbacks=[accumulator],
)
trainer.fit(model, data_loader)

# Move the trained module to the CPU before serialising its state dict.
model_path = "models/simclr_weights_large_parallel.pth"
os.makedirs("models/", exist_ok=True)
model = model.to("cpu")
torch.save(model.state_dict(), model_path)
print(f"Model weights saved to {model_path}")

# Drop the reference; later cells reload the model from disk.
del model

### Fine-tuning

In [None]:
# Reload the mapping CSV and split it into fine-tuning and held-out test rows.
# The shuffle is pinned to SEED so the split does not depend on how much of
# the global NumPy RNG earlier cells consumed: re-running this cell (or only
# the results cells after a restart) always yields the same test rows.
data_df = pd.read_csv("data/nearest_places_mapping.csv")
data_df = data_df.sample(frac=1, random_state=SEED).reset_index(drop=True)

# The last 300 shuffled rows form the test set, everything before them the
# training set. Positional slices keep the two parts disjoint even if the
# table has fewer than 300 rows (train is then empty instead of overlapping).
train_df = data_df.iloc[:-300]
test_df = data_df.iloc[-300:]
del data_df

In [None]:
# Rebuild the large-embedding model and restore the pre-trained weights
# written by the pre-training cell.
model_path = "models/simclr_weights_large_parallel.pth"
model = SimCLR_pl(
    EMBEDDING_SIZE_LARGE,
    MLP_DIM,
    parallel_views=True,
    use_adapter=False,
)
model.load_state_dict(torch.load(model_path))

In [None]:
# Build the multi-view fine-tuning dataset (5 views, no transform) over the
# training rows and plot the views of its first item.
dataset = FineTuniningMultiViewImageDataset(
    train_df,
    base_path='Eynsham/Images',
    transform=None,
    num_views=5,
)
plot_views_from_finetunedataset(dataset, index=0)

In [None]:
# Fine-tune the restored large-embedding model on train_df, evaluate it on
# test_df and store the resulting weights.
# NOTE(review): set_fine_tuning / freeze_backbone are project helpers from
# utils.SimCLR; presumably they switch the module to its fine-tuning objective
# and stop backbone updates. Confirm there.
model.set_fine_tuning(True)
model.freeze_backbone()

# Number of CUDA devices visible to PyTorch; passed to the Trainer as `devices`.
available_gpus = torch.cuda.device_count()

transform = TestAugment(img_size=224)
train_loader = get_data_loader(
    train_df,
    BATCH_SIZE,
    base_path='Eynsham/Images',
    transform=transform,
    shuffle=True,
    fine_tune=True,
)
# Evaluation data is read in a fixed order: shuffling the test loader makes
# batch composition (and therefore batch-dependent metrics) vary between runs.
test_loader = get_data_loader(
    test_df,
    BATCH_SIZE,
    base_path='Eynsham/Images',
    transform=transform,
    shuffle=False,
    fine_tune=True,
)

# Accumulate gradients over GRADIENT_ACCUMULATION_STEPS batches from epoch 0 on.
accumulator = GradientAccumulationScheduler(scheduling={0: GRADIENT_ACCUMULATION_STEPS})
trainer = Trainer(callbacks=[accumulator],
                  accelerator='gpu',
                  devices=available_gpus,
                  max_epochs=MAX_EPOCHS)

trainer.fit(model, train_loader)
model.eval()
trainer.test(model, dataloaders=test_loader)

# Move the module to the CPU before serialising its state dict.
model = model.to("cpu")
os.makedirs("models/", exist_ok=True)
model_path = "models/simclr_weights_large_parallel_fine_tuned.pth"
torch.save(model.state_dict(), model_path)
print(f"Model weights saved to {model_path}")
del model

### Results

In [None]:
# Rebuild the large-embedding model and restore the fine-tuned weights from
# disk for evaluation.
model_path = "models/simclr_weights_large_parallel_fine_tuned.pth"
model = SimCLR_pl(
    EMBEDDING_SIZE_LARGE,
    MLP_DIM,
    parallel_views=True,
    use_adapter=False,
)
model.load_state_dict(torch.load(model_path))

In [None]:
# Plot the 2-D embedding match for the held-out rows. The helper receives a
# copy of test_df, so the original frame is left as-is for the next cell.
transform = TestAugment(img_size=224)
plot_embedding_match_2d(
    model,
    test_df.copy(),
    base_path='Eynsham/Images',
    device=device,
    batch_size=BATCH_SIZE,
    transform=transform,
)

In [None]:
# Retrieval accuracy of the fine-tuned large-embedding model on the test rows.
# NOTE(review): the `...small` variable names are kept as in the original even
# though this is the large-embedding section.
top1small, top5small, top10small, distance = evaluate_embedding_accuracy(
    model,
    test_df,
    base_path='Eynsham/Images',
    transform=transform,
)
print(
    "Accuracy for concatenated embeddings SimCLR: "
    f"Top 1: {top1small*100:.4f}%, "
    f"Top 5: {top5small*100:.4f}%, "
    f"Top 10: {top10small*100:.4f}%, "
    f"Mean Distance to target: {distance:.4f}"
)

## Small Embedding - 2

In [None]:
# Fresh copy of the mapping CSV for the small-embedding run: all rows,
# shuffled, with a new 0-based index.
data_df = (
    pd.read_csv("data/nearest_places_mapping.csv")
    .sample(frac=1)
    .reset_index(drop=True)
)

In [None]:
# Pre-train SimCLR with the small embedding size on the full shuffled table,
# then store the weights under models/.
model = SimCLR_pl(
    EMBEDDING_SIZE_SMALL,
    MLP_DIM,
    parallel_views=True,
    use_adapter=False,
)

# Number of CUDA devices visible to PyTorch; passed to the Trainer as `devices`.
available_gpus = torch.cuda.device_count()

transform = Augment(img_size=224)
data_loader = get_data_loader(
    data_df,
    BATCH_SIZE,
    base_path='Eynsham/Images',
    transform=transform,
    shuffle=True,
)

# Accumulate gradients over GRADIENT_ACCUMULATION_STEPS batches from epoch 0 on.
accumulator = GradientAccumulationScheduler(scheduling={0: GRADIENT_ACCUMULATION_STEPS})
trainer = Trainer(
    max_epochs=MAX_EPOCHS,
    accelerator='gpu',
    devices=available_gpus,
    amp_backend="native",
    callbacks=[accumulator],
)
trainer.fit(model, data_loader)

# Move the trained module to the CPU before serialising its state dict.
model_path = "models/simclr_weights_small_parallel.pth"
os.makedirs("models/", exist_ok=True)
model = model.to("cpu")
torch.save(model.state_dict(), model_path)
print(f"Model weights saved to {model_path}")

# Drop the reference; later cells reload the model from disk.
del model

### Fine-tuning

In [None]:
# Reload the mapping CSV and split it into fine-tuning and held-out test rows.
# The shuffle is pinned to SEED so the split does not depend on how much of
# the global NumPy RNG earlier cells consumed: re-running this cell (or only
# the results cells after a restart) always yields the same test rows.
data_df = pd.read_csv("data/nearest_places_mapping.csv")
data_df = data_df.sample(frac=1, random_state=SEED).reset_index(drop=True)

# The last 300 shuffled rows form the test set, everything before them the
# training set. Positional slices keep the two parts disjoint even if the
# table has fewer than 300 rows (train is then empty instead of overlapping).
train_df = data_df.iloc[:-300]
test_df = data_df.iloc[-300:]
del data_df

In [None]:
# Rebuild the small-embedding model and restore the pre-trained weights
# written by the pre-training cell.
model_path = "models/simclr_weights_small_parallel.pth"
model = SimCLR_pl(
    EMBEDDING_SIZE_SMALL,
    MLP_DIM,
    parallel_views=True,
    use_adapter=False,
)
model.load_state_dict(torch.load(model_path))

In [None]:
# Build the multi-view fine-tuning dataset (5 views, no transform) over the
# training rows and plot the views of its first item.
dataset = FineTuniningMultiViewImageDataset(
    train_df,
    base_path='Eynsham/Images',
    transform=None,
    num_views=5,
)
plot_views_from_finetunedataset(dataset, index=0)

In [None]:
# Fine-tune the restored small-embedding model on train_df, evaluate it on
# test_df and store the resulting weights.
# NOTE(review): set_fine_tuning / freeze_backbone are project helpers from
# utils.SimCLR; presumably they switch the module to its fine-tuning objective
# and stop backbone updates. Confirm there.
model.set_fine_tuning(True)
model.freeze_backbone()

# Number of CUDA devices visible to PyTorch; passed to the Trainer as `devices`.
available_gpus = torch.cuda.device_count()

transform = TestAugment(img_size=224)
train_loader = get_data_loader(
    train_df,
    BATCH_SIZE,
    base_path='Eynsham/Images',
    transform=transform,
    shuffle=True,
    fine_tune=True,
)
# Evaluation data is read in a fixed order: shuffling the test loader makes
# batch composition (and therefore batch-dependent metrics) vary between runs.
test_loader = get_data_loader(
    test_df,
    BATCH_SIZE,
    base_path='Eynsham/Images',
    transform=transform,
    shuffle=False,
    fine_tune=True,
)

# Accumulate gradients over GRADIENT_ACCUMULATION_STEPS batches from epoch 0 on.
accumulator = GradientAccumulationScheduler(scheduling={0: GRADIENT_ACCUMULATION_STEPS})
trainer = Trainer(callbacks=[accumulator],
                  accelerator='gpu',
                  devices=available_gpus,
                  max_epochs=MAX_EPOCHS)

trainer.fit(model, train_loader)
model.eval()
trainer.test(model, dataloaders=test_loader)

# Move the module to the CPU before serialising its state dict.
model = model.to("cpu")
os.makedirs("models/", exist_ok=True)
model_path = "models/simclr_weights_small_parallel_fine_tuned.pth"
torch.save(model.state_dict(), model_path)
print(f"Model weights saved to {model_path}")
del model

### Results

In [None]:
# Rebuild the small-embedding model and restore the fine-tuned weights from
# disk for evaluation.
model_path = "models/simclr_weights_small_parallel_fine_tuned.pth"
model = SimCLR_pl(
    EMBEDDING_SIZE_SMALL,
    MLP_DIM,
    parallel_views=True,
    use_adapter=False,
)
model.load_state_dict(torch.load(model_path))

In [None]:
# Plot the embedding match for the held-out rows without PCA (use_pca=False).
# A copy of test_df is passed, as in the large-embedding results cell, so the
# frame evaluated in the next cell stays untouched regardless of whether the
# plotting helper modifies its input.
transform = TestAugment(img_size=224)
plot_embedding_match_2d(
    model,
    test_df.copy(),
    base_path='Eynsham/Images',
    transform=transform,
    use_pca=False,
)

In [None]:
# Retrieval accuracy of the fine-tuned small-embedding model on the test rows.
top1small, top5small, top10small, distance = evaluate_embedding_accuracy(
    model,
    test_df,
    base_path='Eynsham/Images',
    transform=transform,
)
print(
    "Accuracy for concatenated embeddings SimCLR: "
    f"Top 1: {top1small*100:.4f}%, "
    f"Top 5: {top5small*100:.4f}%, "
    f"Top 10: {top10small*100:.4f}%, "
    f"Mean Distance to target: {distance:.4f}"
)