In [1]:
import pandas as pd
import glob
import albumentations as A
from albumentations.pytorch.transforms import ToTensorV2
import utilities
import cv2
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset
import timm
import math
from transformers import (get_linear_schedule_with_warmup, 
                          get_cosine_schedule_with_warmup, 
                          get_cosine_with_hard_restarts_schedule_with_warmup,
                          get_constant_schedule_with_warmup)
from tqdm import tqdm
import faiss
import random
import gc
import transformers
from transformers import CLIPProcessor, CLIPVisionModel,  CLIPVisionConfig
from PIL import Image
from torchvision import transforms
from pytorch_metric_learning import losses
import open_clip
import sys

In [2]:
class CFG:
    model_name = 'ViT-H-14' 
    model_data = 'laion2b_s32b_b79k'
    samples_per_class = 50
    min_samples = 4
    image_size = 224 
    seed = 5
    workers = 6
    train_batch_size = 8
    valid_batch_size = 32 
    emb_size = 512
    vit_bb_lr = {'10': 1.25e-6, '20': 2.5e-6, '26': 5e-6, '32': 10e-6} 
    vit_bb_wd = 1e-3
    hd_lr = 3e-4
    hd_wd = 1e-5
    autocast = True
    n_warmup_steps = 1000
    n_epochs = 1
    device = torch.device('cuda')
    s=30.
    m=.45
    m_min=.05
    acc_steps = 4
    global_step = 0

In [3]:
utilities.set_seed(CFG.seed)

In [4]:
open_clip.list_pretrained()

[('RN50', 'openai'),
 ('RN50', 'yfcc15m'),
 ('RN50', 'cc12m'),
 ('RN50-quickgelu', 'openai'),
 ('RN50-quickgelu', 'yfcc15m'),
 ('RN50-quickgelu', 'cc12m'),
 ('RN101', 'openai'),
 ('RN101', 'yfcc15m'),
 ('RN101-quickgelu', 'openai'),
 ('RN101-quickgelu', 'yfcc15m'),
 ('RN50x4', 'openai'),
 ('RN50x16', 'openai'),
 ('RN50x64', 'openai'),
 ('ViT-B-32', 'openai'),
 ('ViT-B-32', 'laion400m_e31'),
 ('ViT-B-32', 'laion400m_e32'),
 ('ViT-B-32', 'laion2b_e16'),
 ('ViT-B-32', 'laion2b_s34b_b79k'),
 ('ViT-B-32-quickgelu', 'openai'),
 ('ViT-B-32-quickgelu', 'laion400m_e31'),
 ('ViT-B-32-quickgelu', 'laion400m_e32'),
 ('ViT-B-16', 'openai'),
 ('ViT-B-16', 'laion400m_e31'),
 ('ViT-B-16', 'laion400m_e32'),
 ('ViT-B-16', 'laion2b_s34b_b88k'),
 ('ViT-B-16-plus-240', 'laion400m_e31'),
 ('ViT-B-16-plus-240', 'laion400m_e32'),
 ('ViT-L-14', 'openai'),
 ('ViT-L-14', 'laion400m_e31'),
 ('ViT-L-14', 'laion400m_e32'),
 ('ViT-L-14', 'laion2b_s32b_b82k'),
 ('ViT-L-14-336', 'openai'),
 ('ViT-H-14', 'laion2b_s32b_

In [5]:
vit_backbone, model_transforms, _ = open_clip.create_model_and_transforms(CFG.model_name, pretrained=CFG.model_data)

In [15]:
#landmarks = random.sample(glob.glob('../data/landmark-2020/*'), 9691)
products = glob.glob('../products-10k/train/*')

In [16]:
train_globs = []
train_globs.append(products)

In [21]:
def get_samples(paths):

    value_counts = []
    folder_count = 0
    maps = []
    for i, var in enumerate(paths):
        dataset_maps = []
        for j, folder_path in enumerate(var): 
            folder_contents = glob.glob(folder_path + '/*')
            print(folder_path, folder_contents)
            length = len(folder_contents)
            if length >= CFG.min_samples:
                folder_size = 0
                for file_path in folder_contents[:CFG.samples_per_class]:
                        folder_size += 1
                        dataset_maps.append((file_path, folder_count))
                folder_count += 1
                value_counts.append(folder_size)
            break
        break
        maps.append(dataset_maps)

    total = 0
    print(maps)
    for i, var in enumerate(maps):
        length = len(var)
        total += length
        print('samples in dataset', length)

    for i, var in enumerate(maps):
        print('percentage of samples of dataset', len(var)/total)

    total_samples = []
    for i, dataset_map in enumerate(maps):
        for j, map in enumerate(dataset_map):
            total_samples.append(map)
    
    return total_samples, folder_count, np.array(value_counts)

In [22]:
print('train:')
train_samples, train_classes, value_counts = get_samples(train_globs)

train:
../products-10k/train/106801.jpg []
[]


In [10]:
CFG.n_classes = train_classes
print(CFG.n_classes)

17642


In [11]:
prev_num = 0
switches = 0
count = 0
for var in train_samples:
    current_num = var[-1]

    if current_num != prev_num:
        assert count >= CFG.min_samples
        count = 0
        switches+= 1
    count+=1
    
    prev_num = current_num
print(switches)
print(train_classes)
assert switches+1 == train_classes

17641
17642


In [12]:
valid_paths = glob.glob('../data/objectNET-4-of-10/*')
valid_samples = [(file_path, i) for i, folder_path in enumerate(valid_paths) for file_path in glob.glob(folder_path + '/*')]
print(len(valid_samples))

20111


In [13]:
print(len(train_samples))

285752


In [14]:
class ImageNet_DS(Dataset):
    def __init__(self, map, transforms):
        self.map=map
        self.transforms=transforms

    def __getitem__(self, index):
        
        image_path, label = self.map[index] 
        
        label = torch.tensor(label, dtype=torch.long)

        images = self.transforms(image_path, CFG.image_size)

        images = transforms.functional.resize(images, size=[CFG.image_size, CFG.image_size]) 

        return {
            'images': images,
            'labels': label
        }

    def __len__(self):
        return len(self.map)

In [15]:
class Head(nn.Module):
    def __init__(self, hidden_size):
        super(Head, self).__init__()

        self.emb = nn.Linear(hidden_size, CFG.emb_size, bias=False)
        self.arc = utilities.ArcMarginProduct_subcenter(CFG.emb_size, CFG.n_classes)
        self.dropout = utilities.Multisample_Dropout()

    def forward(self, x):
        embeddings = self.dropout(x, self.emb)
        
        output = self.arc(embeddings)

        return output, F.normalize(embeddings)

In [16]:
class Model(nn.Module):
    def __init__(self, vit_backbone):
        super(Model, self).__init__()

        self.vit_backbone = vit_backbone

        self.head = Head(1024)

    def forward(self, images):

        x = transforms.functional.resize(images, size=[CFG.image_size, CFG.image_size]) 
        x = x/255
        x = transforms.functional.normalize(x,  
                                             mean=model_transforms.transforms[-1].mean, 
                                             std=model_transforms.transforms[-1].std)

        x = self.vit_backbone.encode_image(x)
        
        return self.head(x)

    def get_parameters(self):

        parameter_settings = [] 
        parameter_settings.extend(self.get_parameter_section([(n, p) for n, p in self.vit_backbone.named_parameters()], lr=CFG.vit_bb_lr, wd=CFG.vit_bb_wd)) 

        parameter_settings.extend(self.get_parameter_section([(n, p) for n, p in self.head.named_parameters()], lr=CFG.hd_lr, wd=CFG.hd_wd)) 

        return parameter_settings

    def get_parameter_section(self, parameters, lr=None, wd=None): 
        parameter_settings = []


        lr_is_dict = isinstance(lr, dict)
        wd_is_dict = isinstance(wd, dict)

        layer_no = None
        for no, (n,p) in enumerate(parameters):
            
            for split in n.split('.'):
                if split.isnumeric():
                    layer_no = int(split)
            
            if not layer_no:
                layer_no = 0
            
            if lr_is_dict:
                for k,v in lr.items():
                    if layer_no < int(k):
                        temp_lr = v
                        break
            else:
                temp_lr = lr

            if wd_is_dict:
                for k,v in wd.items():
                    if layer_no < int(k):
                        temp_wd = v
                        break
            else:
                temp_wd = wd

            weight_decay = 0.0 if 'bias' in n else temp_wd

            parameter_setting = {"params" : p, "lr" : temp_lr, "weight_decay" : temp_wd}

            parameter_settings.append(parameter_setting)

            #print(f'no {no} | params {n} | lr {temp_lr} | weight_decay {weight_decay} | requires_grad {p.requires_grad}')

        return parameter_settings


In [17]:
train_dataset = ImageNet_DS(train_samples, utilities.transforms_auto_augment)
valid_dataset = ImageNet_DS(valid_samples, utilities.transforms_valid)

In [18]:
train_loader = DataLoader(train_dataset, batch_size = CFG.train_batch_size, num_workers=CFG.workers, shuffle=True, drop_last=True)
valid_loader = DataLoader(valid_dataset, batch_size = CFG.valid_batch_size, num_workers=CFG.workers, shuffle=False, drop_last=False)

In [19]:
def ArcFace_criterion(logits_m, target, margins):
    arc = utilities.ArcFaceLossAdaptiveMargin(margins=margins, s=CFG.s)
    loss_m = arc(logits_m, target, CFG.n_classes)
    return loss_m

In [20]:
def train(model, train_loader, optimizer, scaler, scheduler, epoch):
    model.train()
    loss_metrics = utilities.AverageMeter()
    criterion = ArcFace_criterion

    tmp = np.sqrt(1 / np.sqrt(value_counts))
    margins = (tmp - tmp.min()) / (tmp.max() - tmp.min()) * CFG.m + CFG.m_min
        
    bar = tqdm(train_loader)
    for step, data in enumerate(bar):
        step += 1
        images = data['images'].to(CFG.device, dtype=torch.float)
        labels = data['labels'].to(CFG.device)
        batch_size = labels.size(0)

        with torch.cuda.amp.autocast(enabled=CFG.autocast):
            outputs, features = model(images)

        loss = criterion(outputs, labels, margins)
        loss_metrics.update(loss.item(), batch_size)
        loss = loss / CFG.acc_steps
        scaler.scale(loss).backward()

        if step % CFG.acc_steps == 0 or step == len(bar):
            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad()
            scheduler.step()
            CFG.global_step += 1
            
        lrs = utilities.get_lr_groups(optimizer.param_groups)

        loss_avg = loss_metrics.avg

        bar.set_postfix(loss=loss_avg, epoch=epoch, lrs=lrs, step=CFG.global_step)

In [21]:
def eval(model, valid_loader, epoch):
    with torch.no_grad():
        model.eval() 

        all_embeddings = []
        all_labels = [] 

        bar = tqdm(valid_loader)
        for i, data in enumerate(bar):
            images = data['images'].to(CFG.device, dtype=torch.float)
            labels = data['labels'].to(CFG.device)

            outputs, embeddings = model(images)

            all_embeddings.append(embeddings.detach().cpu().numpy())
            all_labels.append(labels.detach().cpu().numpy())

            bar.set_postfix(epoch=epoch)

    all_embeddings = np.concatenate(all_embeddings, axis=0)
    all_labels = np.concatenate(all_labels, axis=0)

    return all_embeddings, all_labels

In [22]:
model = Model(vit_backbone).to(CFG.device)

In [23]:
#model.get_parameters()

In [24]:
optimizer = torch.optim.AdamW(model.get_parameters())
 
scaler = torch.cuda.amp.GradScaler(enabled=CFG.autocast)

steps_per_epoch = math.ceil(len(train_loader) / CFG.acc_steps)

num_training_steps = math.ceil(CFG.n_epochs * steps_per_epoch)

scheduler = get_cosine_schedule_with_warmup(optimizer,
                                            num_training_steps=num_training_steps,
                                            num_warmup_steps=CFG.n_warmup_steps)   

CFG.global_step = 0                   
for epoch in range(math.ceil(CFG.n_epochs)):
    
    train(model, train_loader, optimizer, scaler, scheduler, epoch)
    embeddings, labels = eval(model, valid_loader, epoch)

    gc.collect()
    torch.cuda.empty_cache() 

    scores, indices = utilities.get_similiarity(embeddings, 6)
    indices = indices[:,1:] 
    labels, indices = labels.tolist(), indices.tolist() 
    preds = utilities.convert_indices_to_labels(indices, labels)
    score = utilities.map_per_set(labels, preds)
    print('score : ', score)

  0%|          | 37/35719 [00:11<2:33:22,  3.88it/s, epoch=0, loss=15.7, lrs=['1.125000e-08', '2.250000e-08', '4.500000e-08', '9.000000e-08', '2.700000e-06'], step=9]

In [None]:
model_name = CFG.model_name.replace('/','-')
torch.save(model.state_dict(), f'../models/{model_name}')