In [1]:
import glob
import sys
import os
import time
import random
import math

# DATALOADER
import cv2
from PIL import Image
import numpy as np
import albumentations as A
import torchvision.transforms as T
from PIL import Image
import pandas as pd

# BUILDING MODEL
import torch
import torch.nn as nn
import torch.nn.functional as F

# TRAINING
from torch.utils.data import DataLoader, Dataset
import faiss
from tqdm import tqdm_notebook as tqdm

# OTHER STUFF
import timm
from transformers import (get_linear_schedule_with_warmup, 
                          get_cosine_schedule_with_warmup, 
                          get_cosine_with_hard_restarts_schedule_with_warmup,
                          get_constant_schedule_with_warmup)
import gc
import transformers
from transformers import CLIPProcessor, CLIPVisionModel,  CLIPVisionConfig
from pytorch_metric_learning import losses
import open_clip

# UTILS
import utilities

%load_ext autoreload
%autoreload 2

In [2]:
torch.cuda.is_available()

True

In [3]:
open_clip.list_pretrained()

[('RN50', 'openai'),
 ('RN50', 'yfcc15m'),
 ('RN50', 'cc12m'),
 ('RN50-quickgelu', 'openai'),
 ('RN50-quickgelu', 'yfcc15m'),
 ('RN50-quickgelu', 'cc12m'),
 ('RN101', 'openai'),
 ('RN101', 'yfcc15m'),
 ('RN101-quickgelu', 'openai'),
 ('RN101-quickgelu', 'yfcc15m'),
 ('RN50x4', 'openai'),
 ('RN50x16', 'openai'),
 ('RN50x64', 'openai'),
 ('ViT-B-32', 'openai'),
 ('ViT-B-32', 'laion400m_e31'),
 ('ViT-B-32', 'laion400m_e32'),
 ('ViT-B-32', 'laion2b_e16'),
 ('ViT-B-32', 'laion2b_s34b_b79k'),
 ('ViT-B-32-quickgelu', 'openai'),
 ('ViT-B-32-quickgelu', 'laion400m_e31'),
 ('ViT-B-32-quickgelu', 'laion400m_e32'),
 ('ViT-B-16', 'openai'),
 ('ViT-B-16', 'laion400m_e31'),
 ('ViT-B-16', 'laion400m_e32'),
 ('ViT-B-16', 'laion2b_s34b_b88k'),
 ('ViT-B-16-plus-240', 'laion400m_e31'),
 ('ViT-B-16-plus-240', 'laion400m_e32'),
 ('ViT-L-14', 'openai'),
 ('ViT-L-14', 'laion400m_e31'),
 ('ViT-L-14', 'laion400m_e32'),
 ('ViT-L-14', 'laion2b_s32b_b82k'),
 ('ViT-L-14-336', 'openai'),
 ('ViT-H-14', 'laion2b_s32b_

In [4]:
class CFG:
    model_name = 'ViT-H-14' 
    model_data = 'laion2b_s32b_b79k'
    samples_per_class = 50
    n_classes = 0
    min_samples = 4
    image_size = 224 
    hidden_layer = 1024
    seed = 5
    workers = 6
    train_batch_size = 8
    valid_batch_size = 32 
    emb_size = 512
    vit_bb_lr = {'10': 1.25e-6, '20': 2.5e-6, '26': 5e-6, '32': 10e-6} 
    vit_bb_wd = 1e-3
    hd_lr = 3e-4
    hd_wd = 1e-5
    autocast = True
    n_warmup_steps = 1000
    n_epochs = 10
    device = torch.device('cuda')
    s=30.
    m=.45
    m_min=.05
    acc_steps = 4
    global_step = 0

In [5]:
CFG.device

device(type='cuda')

In [6]:
utilities.set_seed(CFG.seed)

In [7]:
def get_data(df):
    df_g = df.groupby('class', group_keys=True).apply(lambda x: x)

    training_samples = []
    num_classes = 0
    new_cls_id = 0
    value_counts = []
    for cls in tqdm(set(df_g['class'])):
        paths = list(df_g.path[df_g['class'] == cls])
        if len(paths) >= CFG.min_samples:
            num_classes += 1

            random.shuffle(paths)

            paths_ = [ 
                (p, new_cls_id) 
                for p in paths[:CFG.samples_per_class]
            ]
            
            # some classes will be neglect it
            new_cls_id += 1
            
            value_counts.append(len(paths_))
            training_samples.extend(paths_)
            
    return training_samples, value_counts, num_classes

In [8]:
# merge the test and train csv files to one dataframe
train_df = pd.read_csv('../products-10k/train.csv')
train_df['path'] = train_df.apply(lambda x: '../products-10k/train' + '/' + x['name'], axis=1)


# remove ../products-10k/test/9397815.jpg from the list!
test_df = pd.read_csv('../products-10k/test_kaggletest.csv')
test_df = test_df.drop(test_df[test_df.name == '9397815.jpg'].index) # smt wrong with this img
test_df['path'] = test_df.apply(lambda x: '../products-10k/test' + '/' + x['name'], axis=1)

all_df = pd.concat([
    test_df[['class','path']],
    train_df[['class', 'path']]
])

data_train, value_counts_train, num_classes_train = get_data(all_df)
value_counts = np.array(value_counts_train)
CFG.n_classes = num_classes_train

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for cls in tqdm(set(df_g['class'])):


  0%|          | 0/9691 [00:00<?, ?it/s]

In [9]:
len(data_train), CFG.n_classes

(196944, 9691)

In [10]:
class Head(nn.Module):
    def __init__(self, hidden_size, k=3):
        super(Head, self).__init__()

        self.emb = nn.Linear(hidden_size, CFG.emb_size, bias=False)
        self.arc = utilities.ArcMarginProduct_subcenter(CFG.emb_size, CFG.n_classes, k)
        self.dropout = utilities.Multisample_Dropout()

    def forward(self, x):
        embeddings = self.dropout(x, self.emb)
        
        output = self.arc(embeddings)

        return output, F.normalize(embeddings)

In [11]:
class Model(nn.Module):
    def __init__(self, vit_backbone, head_size, version='v1', k=3):
        super(Model, self).__init__()
        if version == 'v1':
            self.encoder = vit_backbone.visual
        elif version == 'v2':
            self.encoder = vit_backbone.visual.trunk
        else:
            self.encoder = vit_backbone.visual
        self.head = Head(head_size, k)

    def forward(self, x):
        x = self.encoder(x)
        return self.head(x)

    def get_parameters(self):

        parameter_settings = [] 
        parameter_settings.extend(
            self.get_parameter_section(
                [(n, p) for n, p in self.encoder.named_parameters()], 
                lr=CFG.vit_bb_lr, 
                wd=CFG.vit_bb_wd
            )
        ) 

        parameter_settings.extend(
            self.get_parameter_section(
                [(n, p) for n, p in self.head.named_parameters()], 
                lr=CFG.hd_lr, 
                wd=CFG.hd_wd
            )
        ) 

        return parameter_settings

    def get_parameter_section(self, parameters, lr=None, wd=None): 
        parameter_settings = []


        lr_is_dict = isinstance(lr, dict)
        wd_is_dict = isinstance(wd, dict)

        layer_no = None
        for no, (n,p) in enumerate(parameters):
            
            for split in n.split('.'):
                if split.isnumeric():
                    layer_no = int(split)
            
            if not layer_no:
                layer_no = 0
            
            if lr_is_dict:
                for k,v in lr.items():
                    if layer_no < int(k):
                        temp_lr = v
                        break
            else:
                temp_lr = lr

            if wd_is_dict:
                for k,v in wd.items():
                    if layer_no < int(k):
                        temp_wd = v
                        break
            else:
                temp_wd = wd

            weight_decay = 0.0 if 'bias' in n else temp_wd

            parameter_setting = {"params" : p, "lr" : temp_lr, "weight_decay" : temp_wd}

            parameter_settings.append(parameter_setting)

            #print(f'no {no} | params {n} | lr {temp_lr} | weight_decay {weight_decay} | requires_grad {p.requires_grad}')

        return parameter_settings

In [12]:
def ArcFace_criterion(logits_m, target, margins):
    arc = utilities.ArcFaceLossAdaptiveMargin(margins=margins, s=CFG.s)
    loss_m = arc(logits_m, target, CFG.n_classes)
    return loss_m

In [13]:
def train(model, train_loader, optimizer, scaler, scheduler, epoch):
    model.train()
    loss_metrics = utilities.AverageMeter()
    criterion = ArcFace_criterion

    tmp = np.sqrt(1 / np.sqrt(value_counts))
    margins = (tmp - tmp.min()) / (tmp.max() - tmp.min()) * CFG.m + CFG.m_min
        
    bar = tqdm(train_loader)
    for step, data in enumerate(bar):
        step += 1
        images = data['images'].to(CFG.device, dtype=torch.float)
        labels = data['labels'].to(CFG.device)
        batch_size = labels.size(0)

        with torch.cuda.amp.autocast(enabled=CFG.autocast):
            outputs, features = model(images)

        loss = criterion(outputs, labels, margins)
        loss_metrics.update(loss.item(), batch_size)
        loss = loss / CFG.acc_steps
        scaler.scale(loss).backward()

        if step % CFG.acc_steps == 0 or step == len(bar):
            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad()
            scheduler.step()
            CFG.global_step += 1
            
        lrs = utilities.get_lr_groups(optimizer.param_groups)

        loss_avg = loss_metrics.avg

        bar.set_postfix(loss=loss_avg, epoch=epoch, lrs=lrs, step=CFG.global_step)

@torch.no_grad()
def val(model, valid_loader):
    model.eval() 

    all_embeddings = []
    all_labels = [] 

    for data in tqdm(valid_loader):
        images = data['images'].to(CFG.device, dtype=torch.float)
        labels = data['labels'].to(CFG.device)

        _, embeddings = model(images)

        all_embeddings.append(embeddings.detach().cpu().numpy())
        all_labels.append(labels.detach().cpu().numpy())


    all_embeddings = np.concatenate(all_embeddings, axis=0)
    all_labels = np.concatenate(all_labels, axis=0)

    return all_embeddings, all_labels

def training(train_loader, gallery_loader, query_loader, experiment_folder, version='v1', k=3):
    
    os.makedirs(experiment_folder, exist_ok=True)
    
    backbone, _, _ = open_clip.create_model_and_transforms(CFG.model_name, CFG.model_data)
    if version == 'v1':
        CFG.hidden_layer = 640
    elif version == 'v2':
        CFG.hidden_layer = 1024
    model = Model(backbone, CFG.hidden_layer, version, k).to(CFG.device)
    
    optimizer = torch.optim.AdamW(model.get_parameters())
 
    scaler = torch.cuda.amp.GradScaler(enabled=CFG.autocast)

    steps_per_epoch = math.ceil(len(train_loader) / CFG.acc_steps)

    num_training_steps = math.ceil(CFG.n_epochs * steps_per_epoch)

    scheduler = get_cosine_schedule_with_warmup(optimizer,
                                                num_training_steps=num_training_steps,
                                                num_warmup_steps=CFG.n_warmup_steps)  

    best_score = 0
    best_updated_ = 0
    CFG.global_step = 0                   
    for epoch in range(math.ceil(CFG.n_epochs)):
        print(f'starting epoch {epoch}')

        # train of product-10k
        train(model, train_loader, optimizer, scaler, scheduler, epoch)

        # aicrowd test data
        print('gallery embeddings')
        embeddings_gallery, labels_gallery = val(model, gallery_loader)
        print('query embeddings')
        embeddings_query, labels_query = val(model, query_loader)

        # idk why it is needed
        gc.collect()
        torch.cuda.empty_cache() 

        # calculate validation score
        _, indices = utilities.get_similiarity_l2(embeddings_gallery, embeddings_query, 1000)


        indices = indices.tolist()
        labels_gallery = labels_gallery.tolist()
        labels_query = labels_query.tolist()

        preds = utilities.convert_indices_to_labels(indices, labels_gallery)
        score = utilities.map_per_set(labels_query, preds)
        print('validation score', score)

        # save model
        torch.save({
                'epoch': epoch,
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'scheduler': scheduler.state_dict(),
                }, f'{experiment_folder}/model_epoch_{epoch+1}_mAP5_{score:.2f}.pt')

        # save the best model
        if score > best_score:
            torch.save({
                'epoch': epoch,
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'scheduler': scheduler.state_dict(),
            }, f'{experiment_folder}/model_best_epoch_{epoch+1}_mAP5_{score:.2f}.pt')
            best_updated_ = 0
            best_score = score

        best_updated_ += 1

        if best_updated_ >= 3:
            print('no improvement done training....')
            break


In [14]:
# data loader

def read_img(img_path, is_gray=False):
    mode = cv2.IMREAD_COLOR if not is_gray else cv2.IMREAD_GRAYSCALE
    img = cv2.imread(img_path, mode)
    if not is_gray:
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    return img

def get_final_transform():  
    final_transform = T.Compose([
            T.Resize(
                size=(CFG.image_size, CFG.image_size), 
                interpolation=T.InterpolationMode.BICUBIC,
                antialias=True),
            T.ToTensor(), 
            T.Normalize(
                mean=(0.48145466, 0.4578275, 0.40821073), 
                std=(0.26862954, 0.26130258, 0.27577711)
            )
        ])
    return final_transform

class ProductDataset(Dataset):
    def __init__(self, 
                 data, 
                 transform=None, 
                 final_transform=None):
        self.data = data
        self.transform = transform
        self.final_transform = final_transform
            
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
       
        img = read_img(self.data[idx][0])            
        
        if self.transform is not None:
            if isinstance(self.transform, A.Compose):
                img = self.transform(image=img)['image']
            else:
                img = self.transform(img)
        
        if self.final_transform is not None:
            if isinstance(img, np.ndarray):
                img =  Image.fromarray(img)
            img = self.final_transform(img)
            
        product_id = self.data[idx][1]
        return {"images": img, "labels": product_id}
    
def get_product_10k_dataloader(data_train, data_aug='image_net'):
    
    transform = None
    if data_aug == 'image_net':
        transform = T.Compose([
            T.ToPILImage(),
            T.AutoAugment(T.AutoAugmentPolicy.IMAGENET)
        ])
    elif data_aug == 'cut_out':
        aug8p3 = A.OneOf([
            A.Sharpen(p=0.3),
            A.ToGray(p=0.3),
            A.CLAHE(p=0.3),
        ], p=0.5)

        transform = A.Compose([
            A.ShiftScaleRotate(rotate_limit=15, scale_limit=0.1, border_mode=cv2.BORDER_REFLECT, p=0.5),
            A.Resize(CFG.image_size, CFG.image_size),
            aug8p3,
            A.HorizontalFlip(p=0.5),
            A.ColorJitter(brightness=0.1, contrast=0.1, saturation=0.1)
        ])
    
    elif data_aug == 'happy_whale':        
        transform = A.Compose([
            A.HorizontalFlip(p=0.5),
            A.ImageCompression(quality_lower=99, quality_upper=100),
            A.ShiftScaleRotate(shift_limit=0.2, scale_limit=0.2, rotate_limit=10, border_mode=0, p=0.7),
            A.Resize(CFG.image_size, CFG.image_size),
            A.Cutout(max_h_size=int(CFG.image_size * 0.4), 
                     max_w_size=int(CFG.image_size * 0.4), 
                     num_holes=1, p=0.5),
        ])
        
    final_transform = get_final_transform()
    train_dataset = ProductDataset(data_train, 
                                   transform, 
                                   final_transform)
    train_loader = DataLoader(train_dataset, 
                              batch_size = CFG.train_batch_size, 
                              num_workers=CFG.workers, 
                              shuffle=True, 
                              drop_last=True)
    print(f'Training Data -> Dataset Length ({len(train_dataset)})')
    return train_loader

def aicrowd_data_loader(csv_path, img_dir='../development_test_data'):
    df = pd.read_csv(csv_path)
    df_ = df[['img_path', 'product_id']]
    df_['img_path'] = df_.apply(lambda x: img_dir + '/' + x['img_path'], axis=1)
    data_ = np.array(df_).tolist()
    
    final_transform = get_final_transform()
    dataset = ProductDataset(data_, None, final_transform)
    data_loader = DataLoader(dataset, 
                             batch_size = CFG.valid_batch_size, 
                             num_workers=CFG.workers, 
                             shuffle=False, 
                             drop_last=False)
    
    print(f'{csv_path} -> Dataset Length ({len(dataset)})')
    return data_loader

In [15]:
# aicrowd datasets
gallery_loader = aicrowd_data_loader('../development_test_data/gallery.csv') 
query_loader = aicrowd_data_loader('../development_test_data/queries.csv')

../development_test_data/gallery.csv -> Dataset Length (1067)
../development_test_data/queries.csv -> Dataset Length (1935)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_['img_path'] = df_.apply(lambda x: img_dir + '/' + x['img_path'], axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_['img_path'] = df_.apply(lambda x: img_dir + '/' + x['img_path'], axis=1)


In [16]:
data_aug = 'image_net'
train_loader = get_product_10k_dataloader(data_train, data_aug)
experiment_folder = f'my_experiments/{CFG.model_name}-{CFG.model_data}-{str(data_aug)}-product-10k-all'
training(train_loader, gallery_loader, query_loader, experiment_folder, version=None)
# idk why it is needed
gc.collect()
torch.cuda.empty_cache() 

Training Data -> Dataset Length (196944)
starting epoch 0


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  bar = tqdm(train_loader)


  0%|          | 0/24618 [00:00<?, ?it/s]



gallery embeddings


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for data in tqdm(valid_loader):


  0%|          | 0/34 [00:00<?, ?it/s]

query embeddings


  0%|          | 0/61 [00:00<?, ?it/s]

Processing indices...
Finished processing indices, took 0.16538691520690918s
validation score 0.5494832041343669
starting epoch 1


  0%|          | 0/24618 [00:00<?, ?it/s]

gallery embeddings


  0%|          | 0/34 [00:00<?, ?it/s]

query embeddings


  0%|          | 0/61 [00:00<?, ?it/s]

Processing indices...
Finished processing indices, took 0.15622305870056152s
validation score 0.5463910422049957
starting epoch 2


  0%|          | 0/24618 [00:00<?, ?it/s]

gallery embeddings


  0%|          | 0/34 [00:00<?, ?it/s]

query embeddings


  0%|          | 0/61 [00:00<?, ?it/s]

Processing indices...
Finished processing indices, took 0.15944647789001465s
validation score 0.5474246339362618
no improvement done training....


In [None]:
data_aug = 'image_net'
train_loader = get_product_10k_dataloader(data_train, data_aug)
experiment_folder = f'my_experiments/{CFG.model_name}-{CFG.model_data}-{str(data_aug)}-product-10k-all-ArcFace(k=1)'
training(train_loader, gallery_loader, query_loader, experiment_folder, version=None, k=1)
# idk why it is needed
gc.collect()
torch.cuda.empty_cache() 

In [20]:
CFG.train_batch_size = 100
data_aug = None
train_loader = get_product_10k_dataloader(data_train, data_aug)

for bs in tqdm(train_loader):
    pass

Training Data -> Dataset Length (196944)


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for bs in tqdm(train_loader):


  0%|          | 0/1969 [00:00<?, ?it/s]