### Imports

In [1]:
import pandas as pd

import numpy as np

import cv2
import os

import faiss

import torch
from torch.utils.data import Dataset, DataLoader

from tqdm import tqdm

from sklearn.preprocessing import LabelEncoder, normalize
from sklearn.model_selection import StratifiedKFold

import albumentations as A
from albumentations.pytorch import ToTensorV2

# Utils
from utils import set_seed

# Dataset
# from dataset import HappyWhaleDataset

# Model
from model import HappyWhaleModel

import warnings
warnings.filterwarnings("ignore")

### Configure

In [2]:
# Checkpoint with the trained weights that are loaded into the model
weights_file = 'output/Loss13.7040_epoch16.bin'

# Set useful directories.
# The image folders are derived from ROOT_DIR so that relocating the dataset
# only requires editing a single line.
ROOT_DIR = "/home/jean/datas/happy-whale-and-dolphin"
TRAIN_DIR = f"{ROOT_DIR}/train_images"
TEST_DIR = f"{ROOT_DIR}/test_images"

def get_train_file_path(id):
    """Return the location of training image ``id`` inside ``TRAIN_DIR``.

    ``id`` is the image file name; it is appended to ``TRAIN_DIR`` with a
    ``/`` separator.
    """
    return "{}/{}".format(TRAIN_DIR, id)

# Central settings for this notebook. The whole dict is also handed to
# HappyWhaleModel, so some keys are consumed there rather than here.
CONFIG = {"seed": 2022,  # forwarded to set_seed
          "img_size": 448,  # images are resized to img_size x img_size
          "model_name": "tf_efficientnet_b0_ns",  # passed to HappyWhaleModel
          "num_classes": 15587,  # not read in this notebook; presumably used inside the model - TODO confirm
          "embedding_size": 512,  # passed to the model and used as the FAISS index dimension
          "train_batch_size": 64,
          "valid_batch_size": 64,
          "n_fold": 5,  # number of StratifiedKFold splits
          "device": torch.device("cuda:0" if torch.cuda.is_available() else "cpu"),
          # ArcFace Hyperparameters
          # (not read in this notebook; presumably consumed by the model - TODO confirm)
          "s": 30.0, 
          "m": 0.30,
          "ls_eps": 0.0,
          "easy_margin": False
          }



# Set seed
set_seed(CONFIG['seed'])

# Fallback individual ids: get_predictions appends them when an image has
# fewer than five candidates.
sample_list = ['938b7e931166', '5bf17305f073', '7593d2aee842', '7362d7a01d00','956562ff2888']


### Read and customize the data

In [3]:
# Load the training annotations that live next to the image folders.
df = pd.read_csv(f"{ROOT_DIR}/train.csv")
# Resolve every image file name to its full path on disk.
df['file_path'] = df['image'].apply(get_train_file_path)
df.head()

Unnamed: 0,image,species,individual_id,file_path
0,00021adfb725ed.jpg,melon_headed_whale,cadddb1636b9,/home/jean/datas/happy-whale-and-dolphin/train...
1,000562241d384d.jpg,humpback_whale,1a71fbb72250,/home/jean/datas/happy-whale-and-dolphin/train...
2,0007c33415ce37.jpg,false_killer_whale,60008f293a2b,/home/jean/datas/happy-whale-and-dolphin/train...
3,0007d9bca26a99.jpg,bottlenose_dolphin,4b00fe572063,/home/jean/datas/happy-whale-and-dolphin/train...
4,00087baf5cef7a.jpg,humpback_whale,8e5253662392,/home/jean/datas/happy-whale-and-dolphin/train...


In [4]:
# Replace the string individual ids by integer class indices. The fitted
# encoder is kept so the indices can be mapped back to the original ids later.
encoder = LabelEncoder()
df["individual_id"] = encoder.fit_transform(df["individual_id"])

# Create folds
# No shuffle / random_state is given, so StratifiedKFold's defaults apply.
# NOTE(review): these folds should match the ones used when the checkpoint was
# trained, otherwise the validation fold is not really held out - confirm.
skf = StratifiedKFold(n_splits=CONFIG["n_fold"])

# Only the validation indices of each split are needed: every row is tagged
# with the fold in which it serves as validation data.
for fold, (_, val_) in enumerate(skf.split(X=df, y=df.individual_id)):
    df.loc[val_, "kfold"] = fold

### Set data augmentation

In [5]:
def _resize_normalize_to_tensor():
    """Build a fresh resize -> normalize -> tensor pipeline."""
    steps = [
        # Square resize to the configured side length
        A.Resize(CONFIG['img_size'], CONFIG['img_size']),
        # Per-channel standardisation of 8-bit pixel values
        A.Normalize(
            mean=[0.485, 0.456, 0.406],
            std=[0.229, 0.224, 0.225],
            max_pixel_value=255.0,
            p=1.0,
        ),
        ToTensorV2(),
    ]
    return A.Compose(steps, p=1.)


# Both splits get the same pipeline (no random augmentation); each key owns
# its own Compose instance.
data_transforms = {
    split: _resize_normalize_to_tensor() for split in ("train", "valid")
}

### Get the model

In [6]:
# Build the network and restore the trained weights.
model = HappyWhaleModel(CONFIG['model_name'], CONFIG['embedding_size'],CONFIG)
# map_location remaps the stored tensors onto the configured device, so a
# checkpoint saved from a GPU run still loads when CONFIG['device'] is the CPU.
model.load_state_dict(torch.load(weights_file, map_location=CONFIG['device']))
model.to(CONFIG['device'])

HappyWhaleModel(
  (model): EfficientNet(
    (conv_stem): Conv2dSame(3, 32, kernel_size=(3, 3), stride=(2, 2), bias=False)
    (bn1): BatchNorm2d(32, eps=0.001, momentum=0.1, affine=True, track_running_stats=True)
    (act1): SiLU(inplace=True)
    (blocks): Sequential(
      (0): Sequential(
        (0): DepthwiseSeparableConv(
          (conv_dw): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=32, bias=False)
          (bn1): BatchNorm2d(32, eps=0.001, momentum=0.1, affine=True, track_running_stats=True)
          (act1): SiLU(inplace=True)
          (se): SqueezeExcite(
            (conv_reduce): Conv2d(32, 8, kernel_size=(1, 1), stride=(1, 1))
            (act1): SiLU(inplace=True)
            (conv_expand): Conv2d(8, 32, kernel_size=(1, 1), stride=(1, 1))
            (gate): Sigmoid()
          )
          (conv_pw): Conv2d(32, 16, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn2): BatchNorm2d(16, eps=0.001, momentum=0.1, affine=True, track_

### Get the embeddings

In [7]:
@torch.inference_mode()
def get_embeddings(model, dataloader, device):
    """Run ``model.extract`` over every batch and gather the results.

    Parameters
    ----------
    model : object
            Must expose ``eval()`` and ``extract(images)``.
    dataloader : sized iterable
            Yields dict batches with the keys ``'image'``, ``'label'`` and ``'id'``.
    device : torch.device
            Device the image batch is moved to before calling the model.

    Returns
    -------
    EMBEDS : numpy.ndarray
            Outputs of ``model.extract`` for all batches, stacked row-wise.
    LABELS : numpy.ndarray
            Concatenated labels, in the same order as ``EMBEDS``.
    IDS : numpy.ndarray
            Concatenated ids, in the same order as ``EMBEDS``.
    """
    model.eval()

    LABELS = []
    EMBEDS = []
    IDS = []

    for data in tqdm(dataloader, total=len(dataloader)):
        images = data['image'].to(device, dtype=torch.float)
        # Labels are only collected, never fed to the model, so they are not
        # transferred to the compute device.
        labels = data['label'].to(dtype=torch.long)
        ids = data['id']

        outputs = model.extract(images)

        LABELS.append(labels.cpu().numpy())
        EMBEDS.append(outputs.cpu().numpy())
        IDS.append(ids)

    EMBEDS = np.vstack(EMBEDS)
    LABELS = np.concatenate(LABELS)
    IDS = np.concatenate(IDS)

    return EMBEDS, LABELS, IDS

In [8]:
class HappyWhaleDataset(Dataset):
    """Dataset returning one image, its label and its image id per row.

    Parameters
    ----------
    df : pandas.DataFrame
            Must provide the columns ``image``, ``file_path`` and ``individual_id``.
    transforms : callable, optional
            Called as ``transforms(image=img)``; the ``"image"`` entry of its
            result replaces the raw RGB array.
    """

    def __init__(self, df, transforms=None):
        self.df = df
        self.ids = df['image'].values
        self.file_names = df['file_path'].values
        self.labels = df['individual_id'].values
        self.transforms = transforms

    def __len__(self):
        """Number of rows in the underlying DataFrame."""
        return len(self.df)

    def __getitem__(self, index):
        """Load sample ``index`` as a dict with keys 'image', 'label' and 'id'.

        Raises
        ------
        FileNotFoundError
                If the image file is missing or cannot be decoded.
        """
        idx = self.ids[index]
        img_path = self.file_names[index]
        img = cv2.imread(img_path)
        # cv2.imread signals failure by returning None instead of raising;
        # fail here with the offending path rather than inside cvtColor.
        if img is None:
            raise FileNotFoundError(f"Could not read image: {img_path}")
        # OpenCV decodes to BGR channel order; convert to RGB
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
        label = self.labels[index]

        if self.transforms:
            img = self.transforms(image=img)["image"]

        return {
            'image': img,
            'label': torch.tensor(label, dtype=torch.long),
            'id': idx
        }

In [9]:
def prepare_loaders(df, fold, shuffle_train=False, drop_last_train=False):
    """Build the loaders for the training part and the validation part of a fold.

    Rows with ``kfold != fold`` feed the train loader, rows with
    ``kfold == fold`` feed the valid loader.

    Parameters
    ----------
    df : pandas.DataFrame
            Annotations with a ``kfold`` column plus the columns required by
            ``HappyWhaleDataset``.
    fold : int
            Fold used as validation data.
    shuffle_train : bool, default False
            Shuffle the train loader.
    drop_last_train : bool, default False
            Drop the last incomplete batch of the train loader.

    Returns
    -------
    train_loader, valid_loader : DataLoader

    Notes
    -----
    In this notebook the train loader is only used to extract embeddings for
    the retrieval index. Both flags therefore default to ``False``: with
    ``drop_last=True`` up to ``batch_size - 1`` training images would be
    silently missing from the index. Pass ``True`` for both to get the
    training-style loader.
    """
    df_train = df[df.kfold != fold].reset_index(drop=True)
    df_valid = df[df.kfold == fold].reset_index(drop=True)

    train_dataset = HappyWhaleDataset(df_train, transforms=data_transforms["train"])
    valid_dataset = HappyWhaleDataset(df_valid, transforms=data_transforms["valid"])

    train_loader = DataLoader(
        train_dataset,
        batch_size=CONFIG["train_batch_size"],
        num_workers=2,
        shuffle=shuffle_train,
        pin_memory=True,
        drop_last=drop_last_train,
    )
    valid_loader = DataLoader(
        valid_dataset,
        batch_size=CONFIG["valid_batch_size"],
        num_workers=2,
        shuffle=False,
        pin_memory=True,
    )

    return train_loader, valid_loader


In [10]:
# compute embeddings
train_loader, valid_loader = prepare_loaders(df, fold=0)
train_embeds, train_labels, train_ids = get_embeddings(model, train_loader, CONFIG['device'])
valid_embeds, valid_labels, valid_ids = get_embeddings(model, valid_loader, CONFIG['device'])

100%|█████████████████████████████████████████| 637/637 [21:23<00:00,  2.01s/it]
100%|█████████████████████████████████████████| 160/160 [06:24<00:00,  2.40s/it]


In [11]:
# Scale every embedding row to unit L2 norm, so that the inner-product search
# used below ranks neighbours by cosine similarity.
train_embeds = normalize(train_embeds, axis=1, norm='l2')
valid_embeds = normalize(valid_embeds, axis=1, norm='l2')

In [12]:
# Map the integer class indices back to the original individual_id strings.
train_labels = encoder.inverse_transform(train_labels)
valid_labels = encoder.inverse_transform(valid_labels)

In [13]:
# Flat inner-product index holding the training-fold embeddings.
index = faiss.IndexFlatIP(CONFIG['embedding_size'])
index.add(train_embeds)

In [14]:
# For every validation image fetch its 50 best matches: D holds the
# inner-product scores, I the row positions into the indexed embeddings
# (I is used further down to look up train_labels).
D, I = index.search(valid_embeds, k=50)

# Individuals present in the reference set, i.e. the only ids that a
# neighbour lookup can return.
allowed_targets = np.unique(train_labels)


In [15]:
# Ground truth per validation image.
val_targets_df = pd.DataFrame(np.stack([valid_ids, valid_labels], axis=1), columns=['image','target'])
# An individual that never occurs in the reference set cannot be retrieved,
# so its expected answer becomes 'new_individual'.
val_targets_df.loc[~val_targets_df.target.isin(allowed_targets), 'target'] = 'new_individual'
val_targets_df.target.value_counts()

new_individual    1854
37c7aba965a5        80
114207cab555        34
a6e325d8e924        31
19fbb960f07d        31
                  ... 
e05030227860         1
2faf1bcb2167         1
c511fbe2acd1         1
b90f72a6be9b         1
26145086bca6         1
Name: target, Length: 4008, dtype: int64

In [16]:
# Collect, per validation image, a small frame with the individual id and the
# score of each retrieved neighbour. valid_df is a list of frames here; it is
# concatenated into a single DataFrame in the next cell.
valid_df = []
for i, val_id in tqdm(enumerate(valid_ids)):
    # I[i] are row positions into the indexed embeddings -> neighbour ids
    targets = train_labels[I[i]]
    distances = D[i]
    subset_preds = pd.DataFrame(np.stack([targets,distances],axis=1),columns=['target','distances'])
    subset_preds['image'] = val_id
    valid_df.append(subset_preds)

10207it [00:04, 2252.06it/s]


In [17]:
# Merge the per-image frames, then keep one row per (image, candidate
# individual) carrying the highest score among that individual's neighbours.
valid_df = pd.concat(valid_df).reset_index(drop=True)
valid_df = valid_df.groupby(['image','target']).distances.max().reset_index()
valid_df.head()

Unnamed: 0,image,target,distances
0,00021adfb725ed.jpg,0594dd3c4e6b,0.999316
1,00021adfb725ed.jpg,06b287d73a9f,0.999054
2,00021adfb725ed.jpg,0f529de1b7a0,0.999198
3,00021adfb725ed.jpg,1394c7e5fb92,0.99935
4,00021adfb725ed.jpg,164a5c36c8a1,0.999033


In [18]:
# Sort all candidates by descending score. get_predictions walks the rows in
# this order and treats the first row seen for an image as its best match.
valid_df = valid_df.sort_values('distances', ascending=False).reset_index(drop=True)
# Persist the neighbour table (the index is written as the first column).
valid_df.to_csv('val_neighbors.csv')

In [19]:
def get_predictions(test_df, threshold=0.2):
    """Turn a ranked neighbour table into up to five predictions per image.

    Parameters
    ----------
    test_df : pandas.DataFrame
            Columns ``image``, ``target`` and ``distances``. Rows must be
            ordered from best to worst score, because the first row seen for
            an image is taken as its best match.
    threshold : float
            If the best score of an image is strictly above this value the
            matched individual is ranked first and 'new_individual' second;
            otherwise the two are swapped.

    Returns
    -------
    predictions : dict
            Maps each image to a list of at most five predicted ids. Lists
            with fewer than five candidates are padded with the entries of
            ``sample_list`` that they do not already contain.
    """
    predictions = {}
    rows = zip(test_df['image'], test_df['target'], test_df['distances'])
    for image, target, distance in tqdm(rows, total=len(test_df)):
        ranked = predictions.get(image)
        if ranked is not None:
            # Image already seen: add further candidates until five are reached
            if len(ranked) < 5:
                ranked.append(target)
        elif distance > threshold:
            predictions[image] = [target, 'new_individual']
        else:
            predictions[image] = ['new_individual', target]

    for image in tqdm(predictions):
        ranked = predictions[image]
        if len(ranked) < 5:
            # Compare against this image's own predictions (not against the
            # dict of image names) so no fallback id is listed twice.
            padding = [y for y in sample_list if y not in ranked]
            predictions[image] = (ranked + padding)[:5]

    return predictions


def map_per_image(label, predictions):
    """Score a single image by the reciprocal rank of its true label.

    Parameters
    ----------
    label : string
            The true label of the image
    predictions : list
            Predicted labels, best first; only the first five are considered

    Returns
    -------
    score : double
            ``1 / rank`` of the first occurrence of ``label`` within the first
            five predictions, or ``0.0`` when it is not among them.
    """
    top_five = predictions[:5]
    if label not in top_five:
        return 0.0
    rank = top_five.index(label) + 1
    return 1 / rank

In [20]:
# Sweep the 'new_individual' threshold from 0.0 to 1.0 and keep the value with
# the best mean per-image score on the validation fold.
best_th = 0
best_cv = 0
# x / 10 yields clean values (0.3 rather than 0.30000000000000004); they are
# also used as column labels of val_targets_df.
for th in [x / 10 for x in range(11)]:
    all_preds = get_predictions(valid_df, threshold=th)
    # Score every validation image in one pass and store the scores in a
    # column named after the threshold.
    val_targets_df[th] = [
        map_per_image(target, all_preds[image])
        for image, target in zip(val_targets_df['image'], val_targets_df['target'])
    ]
    cv = val_targets_df[th].mean()
    print(f"CV at threshold {th}: {cv}")
    if cv > best_cv:
        best_th = th
        best_cv = cv

383040it [00:18, 20496.34it/s]
100%|████████████████████████████████| 10207/10207 [00:00<00:00, 2402158.06it/s]


CV at threshold 0.0: 0.20598118937983734


383040it [00:18, 21177.09it/s]
100%|████████████████████████████████| 10207/10207 [00:00<00:00, 2551478.69it/s]


CV at threshold 0.1: 0.20598118937983734


383040it [00:17, 22100.32it/s]
100%|████████████████████████████████| 10207/10207 [00:00<00:00, 2590853.36it/s]


CV at threshold 0.2: 0.20598118937983734


383040it [00:16, 22688.11it/s]
100%|████████████████████████████████| 10207/10207 [00:00<00:00, 2294156.85it/s]


CV at threshold 0.30000000000000004: 0.20598118937983734


383040it [00:17, 22403.61it/s]
100%|████████████████████████████████| 10207/10207 [00:00<00:00, 2210868.67it/s]


CV at threshold 0.4: 0.20598118937983734


383040it [00:16, 22625.69it/s]
100%|████████████████████████████████| 10207/10207 [00:00<00:00, 2173601.79it/s]


CV at threshold 0.5: 0.20598118937983734


383040it [00:17, 21867.18it/s]
100%|████████████████████████████████| 10207/10207 [00:00<00:00, 2492359.60it/s]


CV at threshold 0.6000000000000001: 0.20598118937983734


383040it [00:16, 22600.81it/s]
100%|████████████████████████████████| 10207/10207 [00:00<00:00, 2704951.09it/s]


CV at threshold 0.7000000000000001: 0.20598118937983734


383040it [00:16, 22719.46it/s]
100%|████████████████████████████████| 10207/10207 [00:00<00:00, 2635674.50it/s]


CV at threshold 0.8: 0.20598118937983734


383040it [00:16, 22892.14it/s]
100%|████████████████████████████████| 10207/10207 [00:00<00:00, 2470355.51it/s]


CV at threshold 0.9: 0.20598118937983734


383040it [00:16, 22984.89it/s]
100%|████████████████████████████████| 10207/10207 [00:00<00:00, 2543898.09it/s]


CV at threshold 1.0: 0.25046046830606444


In [21]:
# Report the threshold with the best raw validation score, then summarise the
# per-threshold score columns stored in val_targets_df.
print("Best threshold", best_th)
print("Best cv", best_cv)
val_targets_df.describe()

Best threshold 1.0
Best cv 0.25046046830606444


Unnamed: 0,0.0,0.1,0.2,0.30000000000000004,0.4,0.5,0.6000000000000001,0.7000000000000001,0.8,0.9,1.0
count,10207.0,10207.0,10207.0,10207.0,10207.0,10207.0,10207.0,10207.0,10207.0,10207.0,10207.0
mean,0.205981,0.205981,0.205981,0.205981,0.205981,0.205981,0.205981,0.205981,0.205981,0.205981,0.25046
std,0.319504,0.319504,0.319504,0.319504,0.319504,0.319504,0.319504,0.319504,0.319504,0.319504,0.385362
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [22]:
# Split the validation scores into images of known individuals and images
# whose expected answer is 'new_individual'.
val_targets_df['is_new_individual'] = val_targets_df.target=='new_individual'
print(val_targets_df.is_new_individual.value_counts().to_dict())
# numeric_only=True restricts the mean to the per-threshold score columns;
# without it recent pandas raises a TypeError on the string columns
# 'image' and 'target'.
val_scores = val_targets_df.groupby('is_new_individual').mean(numeric_only=True).T
# Re-weight both groups to 10% new / 90% known individuals.
# NOTE(review): presumably the share of new individuals expected in the test
# set - confirm where these weights come from.
val_scores['adjusted_cv'] = val_scores[True]*0.1+val_scores[False]*0.9
best_threshold_adjusted = val_scores['adjusted_cv'].idxmax()
print("best_threshold",best_threshold_adjusted)
val_scores

{False: 8353, True: 1854}
best_threshold 0.0


is_new_individual,False,True,adjusted_cv
0.0,0.140722,0.5,0.17665
0.1,0.140722,0.5,0.17665
0.2,0.140722,0.5,0.17665
0.3,0.140722,0.5,0.17665
0.4,0.140722,0.5,0.17665
0.5,0.140722,0.5,0.17665
0.6000000000000001,0.140722,0.5,0.17665
0.7000000000000001,0.140722,0.5,0.17665
0.8,0.140722,0.5,0.17665
0.9,0.140722,0.5,0.17665


In [23]:
# Add the validation fold to the reference set used for the test predictions.
# NOTE(review): this overwrites train_embeds / train_labels in place, so
# running the cell a second time appends the validation rows again.
train_embeds = np.concatenate([train_embeds, valid_embeds])
train_labels = np.concatenate([train_labels, valid_labels])
print(train_embeds.shape,train_labels.shape)

(50975, 512) (50975,)


In [24]:
# Rebuild the inner-product index, now over training plus validation embeddings.
index = faiss.IndexFlatIP(CONFIG['embedding_size'])
index.add(train_embeds)

In [25]:
# Describe the test images with the same columns HappyWhaleDataset expects.
test = pd.DataFrame()
# List the folder through TEST_DIR so the listing and the file paths always
# point at the same directory.
test["image"] = os.listdir(TEST_DIR)
test["file_path"] = test["image"].apply(lambda x: f"{TEST_DIR}/{x}")
test["individual_id"] = -1  #dummy value
test.head()

Unnamed: 0,image,file_path,individual_id
0,4780170daf460a.jpg,/home/jean/datas/happy-whale-and-dolphin/test_...,-1
1,d9ece950295bc7.jpg,/home/jean/datas/happy-whale-and-dolphin/test_...,-1
2,3491858282dc25.jpg,/home/jean/datas/happy-whale-and-dolphin/test_...,-1
3,37269dbb569b3e.jpg,/home/jean/datas/happy-whale-and-dolphin/test_...,-1
4,fe39a9c93ac13e.jpg,/home/jean/datas/happy-whale-and-dolphin/test_...,-1


In [26]:
# Test images go through the same preprocessing as the validation fold and
# are read in order (no shuffling).
test_dataset = HappyWhaleDataset(test, transforms=data_transforms["valid"])
test_loader = DataLoader(test_dataset, batch_size=CONFIG['valid_batch_size'], 
                         num_workers=2, shuffle=False, pin_memory=True)


In [27]:
# The returned labels are just the dummy -1 values of the test frame, so they
# are discarded.
test_embeds, _, test_ids = get_embeddings(model, test_loader, CONFIG['device'])
# Same unit-norm scaling as applied to the reference embeddings.
test_embeds = normalize(test_embeds, axis=1, norm='l2')

100%|█████████████████████████████████████████| 437/437 [16:51<00:00,  2.31s/it]


In [28]:
# 50 best matches per test image; note that D and I from the validation
# search are overwritten here.
D, I = index.search(test_embeds, k=50)

In [29]:
# Build the neighbour table for the test images, following the same steps as
# for the validation fold.
test_df = []
for i, test_id in tqdm(enumerate(test_ids)):
    # I[i] are row positions into the indexed embeddings -> neighbour ids
    targets = train_labels[I[i]]
    distances = D[i]
    subset_preds = pd.DataFrame(np.stack([targets, distances], axis=1), columns=['target','distances'])
    subset_preds['image'] = test_id
    test_df.append(subset_preds)
    
test_df = pd.concat(test_df).reset_index(drop=True)
# One row per (image, candidate individual) with that individual's best score
test_df = test_df.groupby(['image','target']).distances.max().reset_index()
# Best candidates first, as get_predictions expects
test_df = test_df.sort_values('distances', ascending=False).reset_index(drop=True)
test_df.to_csv('test_neighbors.csv')

27956it [00:11, 2358.31it/s]


In [30]:
# Rank the candidates of every test image using the threshold selected on the
# re-weighted validation score.
predictions = get_predictions(test_df, best_threshold_adjusted)

# Dict -> two-column frame: the image name and its predictions.
predictions = pd.Series(predictions).reset_index()
predictions.columns = ['image','predictions']
# Join each list of predictions into one space-separated string
predictions['predictions'] = predictions['predictions'].apply(lambda x: ' '.join(x))
predictions.to_csv('submission.csv',index=False)
predictions.head()

1076896it [00:46, 22992.33it/s]
100%|████████████████████████████████| 27956/27956 [00:00<00:00, 2028790.27it/s]


Unnamed: 0,image,predictions
0,491c56c1a3baac.jpg,5fc809d9e819 new_individual babd014300b7 6c029...
1,b53f392ea1409b.jpg,29623de1f9a5 new_individual 91970ecd7d55 babd0...
2,6ba6bc50e83c7c.jpg,3ee0ab1d8230 new_individual 86257eaa613b 45fc3...
3,1b557f16c8366a.jpg,262f464ee602 new_individual 35bc67b3b071 5fc80...
4,f1367ddbab5c50.jpg,12f2a4406d7e new_individual 79c94459ece0 84502...
