In [1]:
# Enable IPython's autoreload extension in mode 2 so that edits to locally
# imported modules are picked up live, without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import sys
sys.path.append("../model/")
sys.path.append("../tools/")
from constants import *
from MLP_classifier import MultiClassClassifier
from dataset import FlickrAndPairs, TaskA, TaskAWithLabel, SimpleDataset
import torch
from torch.utils.data import DataLoader
import torch.nn as nn

  from .autonotebook import tqdm as notebook_tqdm
  deprecate("Transformer2DModelOutput", "1.0.0", deprecation_message)
Loading pipeline components...: 100%|██████████| 7/7 [00:00<00:00, 11.06it/s]


In [7]:
# Pre-computed feature datasets. Each split is loaded twice: once from the
# DINOv2 feature file and once from the un-suffixed file, which the rest of the
# notebook treats as the CLIP features.
taska_label_csv = "../../docs/scan.csv"  # same label CSV for both test sets

train_data_dino = FlickrAndPairs(
    path="/data4/saland/data/flickr_and_pairs_DinoV2.pt",
    load_from_disk=True,
)
train_data_clip = FlickrAndPairs(
    path="/data4/saland/data/flickr_and_pairs.pt",
    load_from_disk=True,
)

test_data_dino = TaskAWithLabel(
    path_to_csv=taska_label_csv,
    path_to_taskA="/data4/saland/data/taskA_dinoV2.pt",
)
test_data_clip = TaskAWithLabel(
    path_to_csv=taska_label_csv,
    path_to_taskA="/data4/saland/data/taskA.pt",
)

100%|██████████| 3333/3333 [00:00<00:00, 4989.15it/s]
100%|██████████| 3333/3333 [00:00<00:00, 4935.96it/s]


### CLIP vs DINO

In [13]:
# --- Setup for the CLIP-vs-DINO comparison ---------------------------------
# Two classifiers with identical hyper-parameters are built, one per feature
# type, so that only the input features differ between the two runs.

# Use the first GPU when one is present; otherwise fall back to the CPU so the
# notebook still runs on machines without CUDA.
device = "cuda:0" if torch.cuda.is_available() else "cpu"

# Hyper-parameters shared by both models.
lr = 1e-3
batch_size = 64
n_epochs = 1000

# Seed torch's global RNG before the models are constructed so that anything
# drawn from it (e.g. default layer initialisation) is repeatable after a
# kernel restart.
torch.manual_seed(SEED)
model_dino = MultiClassClassifier(n_features=DINO_FEATURE_DIM, n_classes=2).to(device)
model_clip = MultiClassClassifier(n_features=CLIP_FEATURE_DIM, n_classes=2).to(device)
model_dino.train()
model_clip.train()

loss_fn = nn.CrossEntropyLoss()
optimizer_clip = torch.optim.SGD(model_clip.parameters(), lr=lr)
optimizer_dino = torch.optim.SGD(model_dino.parameters(), lr=lr)

# One generator per loader, both seeded identically. With a single shared
# generator the CLIP shuffling order would depend on how many batches the DINO
# loader had already drawn; separate generators give both models the same,
# repeatable sequence of shuffles.
rng_dino = torch.Generator().manual_seed(SEED)
rng_clip = torch.Generator().manual_seed(SEED)
rng = rng_dino  # original name kept for any code that still refers to `rng`

train_loader_dino = DataLoader(train_data_dino, batch_size=batch_size, shuffle=True, generator=rng_dino)
train_loader_clip = DataLoader(train_data_clip, batch_size=batch_size, shuffle=True, generator=rng_clip)

In [6]:
def train_classifier(model, loader, optimizer, tag):
    """Train ``model`` for ``n_epochs`` passes over ``loader``.

    Relies on the notebook-level ``n_epochs``, ``device`` and ``loss_fn``.

    Args:
        model: classifier to optimise; expected to already live on ``device``.
        loader: iterable of dict batches exposing "features" and "label".
        optimizer: optimiser built over ``model``'s parameters.
        tag: prefix of the progress line printed every 10th epoch.

    Returns:
        The loss (float) of the last mini-batch of the final epoch, or ``None``
        when no batch was ever produced.
    """
    # Do not depend on which cells were run before this one: make sure the
    # model is in training mode.
    model.train()
    last_loss = None
    for epoch in range(1, n_epochs + 1):
        batch_loss = None
        for batch in loader:
            features = batch["features"].to(device)
            # CrossEntropyLoss with class-index targets needs int64 labels;
            # cast here so every dataset is handled the same way.
            labels = batch["label"].to(device=device, dtype=torch.long)

            # Clear gradients first so that leftovers from an interrupted
            # cell run cannot leak into this step.
            optimizer.zero_grad()
            batch_loss = loss_fn(model(features), labels)
            batch_loss.backward()
            optimizer.step()

        if batch_loss is None:
            # Empty loader: nothing was trained, nothing to report.
            continue

        # The reported value is the loss of the epoch's final mini-batch.
        last_loss = batch_loss.item()
        if epoch % 10 == 0:
            print(f"{tag}: {last_loss:>7f} [{epoch:>5d}/{n_epochs:>5d}]")
    return last_loss


# Train the DINO classifier first, then the CLIP one, as before.
loss_dino = train_classifier(model_dino, train_loader_dino, optimizer_dino, "loss_dino")
loss_clip = train_classifier(model_clip, train_loader_clip, optimizer_clip, "loss_clip")

loss_dino: 0.559759 [   10/ 1000]
loss_dino: 0.488277 [   20/ 1000]
loss_dino: 0.309040 [   30/ 1000]
loss_dino: 0.295351 [   40/ 1000]
loss_dino: 0.287999 [   50/ 1000]
loss_dino: 0.141933 [   60/ 1000]
loss_dino: 0.193949 [   70/ 1000]
loss_dino: 0.131623 [   80/ 1000]
loss_dino: 0.099305 [   90/ 1000]
loss_dino: 0.062152 [  100/ 1000]
loss_dino: 0.108954 [  110/ 1000]
loss_dino: 0.072467 [  120/ 1000]
loss_dino: 0.078572 [  130/ 1000]
loss_dino: 0.075661 [  140/ 1000]
loss_dino: 0.071094 [  150/ 1000]
loss_dino: 0.087684 [  160/ 1000]
loss_dino: 0.073516 [  170/ 1000]
loss_dino: 0.057642 [  180/ 1000]
loss_dino: 0.057338 [  190/ 1000]
loss_dino: 0.073112 [  200/ 1000]
loss_dino: 0.043730 [  210/ 1000]
loss_dino: 0.041186 [  220/ 1000]
loss_dino: 0.031449 [  230/ 1000]
loss_dino: 0.048077 [  240/ 1000]
loss_dino: 0.037844 [  250/ 1000]
loss_dino: 0.030416 [  260/ 1000]
loss_dino: 0.033182 [  270/ 1000]
loss_dino: 0.018958 [  280/ 1000]
loss_dino: 0.017487 [  290/ 1000]
loss_dino: 0.0

In [9]:
# Score each classifier on the test set built from its own feature type.
eval_options = dict(binary_model=True, device=device)

acc_clip = model_clip.get_model_accuracy_binary(
    test_data_clip.features,
    test_data_clip.label,
    **eval_options,
)
acc_dino = model_dino.get_model_accuracy_binary(
    test_data_dino.features,
    test_data_dino.label,
    **eval_options,
)

print("accuracy clip:", acc_clip)
print("accuracy dino:", acc_dino)

accuracy clip: 0.860185980796814
accuracy dino: 0.7680767774581909


### Concatenation of CLIP and DINOV2 features

In [8]:
# Build datasets whose feature rows are the CLIP features followed by the
# DINOv2 features (concatenation along dim=1); labels are taken from the CLIP
# datasets.
# NOTE(review): this assumes the CLIP and DINOv2 datasets list the same samples
# in the same order — TODO confirm.
train_features_cat = torch.cat(
    (train_data_clip.features, train_data_dino.features),
    dim=1,
)
train_clip_dino = SimpleDataset(
    features=train_features_cat,
    label=train_data_clip.label,
)

# The test features are moved to `device` before concatenation; the label
# tensor is passed through as loaded.
test_features_cat = torch.cat(
    (test_data_clip.features.to(device), test_data_dino.features.to(device)),
    dim=1,
)
test_clip_dino = SimpleDataset(
    features=test_features_cat,
    label=test_data_clip.label,
)

In [9]:
# Display the shape of the concatenated training feature tensor as a quick check.
train_clip_dino.features.shape

torch.Size([3999, 1536])

In [12]:
# --- Train one classifier on the concatenated CLIP + DINOv2 features --------

# Use the first GPU when one is present; otherwise fall back to the CPU so the
# notebook still runs on machines without CUDA.
device = "cuda:0" if torch.cuda.is_available() else "cpu"

# Same hyper-parameters as the single-feature models above.
lr = 1e-3
batch_size = 64
n_epochs = 1000

# Seed torch's global RNG before the model is constructed so that anything
# drawn from it (e.g. default layer initialisation) is repeatable after a
# kernel restart.
torch.manual_seed(SEED)
model = MultiClassClassifier(n_features=CLIP_FEATURE_DIM + DINO_FEATURE_DIM, n_classes=2).to(device)
model.train()

loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=lr)

# Dedicated, freshly seeded generator so the shuffling order is repeatable.
rng = torch.Generator().manual_seed(SEED)
train_loader = DataLoader(train_clip_dino, batch_size=batch_size, shuffle=True, generator=rng)

for epoch in range(1, n_epochs + 1):
    for batch in train_loader:
        pred = model(batch["features"].to(device))
        loss = loss_fn(pred, batch["label"].to(device))

        # Clear gradients first so that leftovers from an interrupted cell run
        # cannot leak into this step.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    # Progress line every 10th epoch; the value is the loss of the epoch's
    # final mini-batch.
    if epoch % 10 == 0:
        print(f"loss_clip_dino: {loss.item():>7f} [{epoch:>5d}/{n_epochs:>5d}]")

loss_clip: 0.406928 [   10/ 1000]
loss_clip: 0.256964 [   20/ 1000]
loss_clip: 0.213967 [   30/ 1000]
loss_clip: 0.170856 [   40/ 1000]
loss_clip: 0.105218 [   50/ 1000]
loss_clip: 0.038156 [   60/ 1000]
loss_clip: 0.093853 [   70/ 1000]
loss_clip: 0.053000 [   80/ 1000]
loss_clip: 0.047475 [   90/ 1000]
loss_clip: 0.020674 [  100/ 1000]
loss_clip: 0.056921 [  110/ 1000]
loss_clip: 0.037268 [  120/ 1000]
loss_clip: 0.032698 [  130/ 1000]
loss_clip: 0.050717 [  140/ 1000]
loss_clip: 0.028283 [  150/ 1000]
loss_clip: 0.030035 [  160/ 1000]
loss_clip: 0.045557 [  170/ 1000]
loss_clip: 0.016609 [  180/ 1000]
loss_clip: 0.031243 [  190/ 1000]
loss_clip: 0.027374 [  200/ 1000]
loss_clip: 0.011715 [  210/ 1000]
loss_clip: 0.015760 [  220/ 1000]
loss_clip: 0.010196 [  230/ 1000]
loss_clip: 0.020012 [  240/ 1000]
loss_clip: 0.012137 [  250/ 1000]
loss_clip: 0.009863 [  260/ 1000]
loss_clip: 0.019553 [  270/ 1000]
loss_clip: 0.005883 [  280/ 1000]
loss_clip: 0.014183 [  290/ 1000]
loss_clip: 0.0

In [13]:
# Accuracy of the classifier trained on the concatenated features, measured on
# the concatenated test set; the bare name on the last line displays it.
acc_clip_dino = model.get_model_accuracy_binary(
    test_clip_dino.features,
    test_clip_dino.label,
    binary_model=True,
    device=device,
)
acc_clip_dino

0.8955895304679871