## Sprint

- [ ] ...

## Backlog

- [ ] ...

## Tested / Implemented

- [x] ...


## Setup & Config

In [None]:

print("\n... IMPORTS STARTING ...\n")

print("\n... PIP/APT INSTALLS AND DOWNLOADS/ZIP STARTING ...")
!pip install -q efficientnet
!pip install tensorflow_addons
#!pip install -q "tf-models-official==2.7.0"
!pip install transformers
print("... PIP/APT INSTALLS COMPLETE ...\n")

print("\n\tVERSION INFORMATION")
import tensorflow as tf; print(f"\t\t– TENSORFLOW VERSION: {tf.__version__}");
import tensorflow_addons as tfa; print(f"\t\t– TENSORFLOW ADDONS VERSION: {tfa.__version__}");
import tensorflow_hub as tfhub;
from tensorflow.keras import backend as K;
import efficientnet.tfkeras as efn;
import pandas as pd; pd.options.mode.chained_assignment = None;
import numpy as np; print(f"\t\t– NUMPY VERSION: {np.__version__}");
import sklearn; print(f"\t\t– SKLEARN VERSION: {sklearn.__version__}");
from sklearn.preprocessing import RobustScaler, PolynomialFeatures
#from pandarallel import pandarallel; pandarallel.initialize();
from sklearn.model_selection import GroupKFold, StratifiedKFold;
from sklearn.neighbors import NearestNeighbors;
#from official.nlp import optimization;
import albumentations as A; 
from transformers import AdamWeightDecay


# RAPIDS
#import cudf, cupy, cuml

# Built In Imports
from collections import Counter
from datetime import datetime
from glob import glob
import warnings
import requests
import hashlib
import imageio
import IPython
import sklearn
import urllib
import zipfile
import pickle
import random
import shutil
import string
import json
import math
import time
from tqdm.auto import tqdm
import gzip
import ast
import sys
import io
import os
import gc
import re

# Visualization Imports
from matplotlib.colors import ListedColormap
import matplotlib.patches as patches
import plotly.graph_objects as go
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm; tqdm.pandas();
#import plotly.express as px
import seaborn as sns
from PIL import Image, ImageEnhance
import matplotlib; print(f"\t\t– MATPLOTLIB VERSION: {matplotlib.__version__}");
from matplotlib import animation, rc; rc('animation', html='jshtml')
import plotly
import PIL
import cv2
    
print("\n\n... IMPORTS COMPLETE ...\n")

In [None]:
!pip install timm
!pip install tfimm
import tfimm
import timm

In [None]:
# check if colab 
IS_COLAB = not os.path.exists('/kaggle/input')
print('Colab: ', IS_COLAB)

# mount google drive in colab
if IS_COLAB:
    from google.colab import drive
    drive.mount('/content/drive')
    with open('./drive/MyDrive/int2str.json', 'r') as fp:
        int2str = json.load(fp)
    with open('./drive/MyDrive/str2int.json', 'r') as fp:
        str2int = json.load(fp)
    int2str = {int(k):v for k,v in int2str.items()}
else:
  from kaggle_datasets import KaggleDatasets

In [None]:
print(f"\n... ACCELERATOR SETUP STARTING ...\n")

# Detect hardware, return appropriate distribution strategy
try:
    # TPU detection. No parameters necessary if TPU_NAME environment variable is set. On Kaggle this is always the case.
    TPU = tf.distribute.cluster_resolver.TPUClusterResolver()  
except ValueError:
    TPU = None

if TPU:
    print(f"\n... RUNNING ON TPU - {TPU.master()}...")
    tf.config.experimental_connect_to_cluster(TPU)
    tf.tpu.experimental.initialize_tpu_system(TPU)
    strategy = tf.distribute.experimental.TPUStrategy(TPU)
else:
    print(f"\n... RUNNING ON CPU/GPU ...")
    strategy = tf.distribute.get_strategy() 

N_REPLICAS = strategy.num_replicas_in_sync
    
print(f"... # OF REPLICAS: {N_REPLICAS} ...\n")
print(f"\n... ACCELERATOR SETUP COMPLTED ...\n")

In [None]:
# set up save directory 

save_dir = '.'
EXPERIMENT = 4
run_ts = datetime.now().strftime('%Y%m%d-%H%M%S')
print(run_ts)
descr = 'full'
if IS_COLAB:
    save_dir = f'/content/drive/MyDrive/Kaggle/HappyWhale-2022/experiments-{EXPERIMENT}/{descr}_{run_ts}'
    !mkdir -p {save_dir}

In [None]:
class config:
    
    
    SEED = 5329
    FOLD_TO_RUN = 0
    FOLDS = 5
    DEBUG = False
    EVALUATE = True
    RESUME = False
    RESUME_EPOCH = None
    
    ### Dataset
    DATA_SOURCE = 'backfintfrecords' # 'backfintfrecords', 'happywhale-tfrecords-fullbody', 'happywhale-tfrecords-bb', 'happywhale-tfr-normal'
    BATCH_SIZE = 8 * strategy.num_replicas_in_sync
    IMAGE_SIZE = 768
    N_CLASSES = 15587
    if DATA_SOURCE == 'happywhale-tfrecords-bb':
      bounding_box = True
    else:
      bounding_box = False
    
    ### Model
    model_type = 'effnetv1'  # effnetv1, swin, hybrid
    EFF_NET = 5
    EFF_NETV2 = 's-21k-ft1k'
    FREEZE_BATCH_NORM = False
    pool = None # None, 'gem'
    concat = False
    dropout = 0.2
    embed_size = 512 # 512
    neck_activation = None # 'prelu'
    head = 'arcface' # 'subcenter_arcface', 'arcface'
    arcface_m = 0.2 # 0.3 
    arcface_s = 20 # 30
    subarcface_k = 3

    ### Optimizer
    optimizer = 'AdamW' # 'AdamW'
    EPOCHS = 25
    LR = 0.0003 #0.0001 #0.0003
    label_smoothing = False
    smoothing_param = 0.05

    message='higher_drop'
    
    ### Augmentations
    CUTOUT = False
    
    ### Save-Directory
    save_dir = save_dir
    
    ### Inference
    KNN = 100
    
def count_data_items(filenames):
    n = [int(re.compile(r"-([0-9]*)\.").search(filename).group(1)) 
         for filename in filenames]
    return np.sum(n)

# Function to seed everything
def seed_everything(seed):
    random.seed(seed)
    np.random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    tf.random.set_seed(seed)
    
def is_interactive():
    return 'runtime'    in get_ipython().config.IPKernelApp.connection_file
IS_INTERACTIVE = is_interactive()
print(IS_INTERACTIVE)

AUTO = tf.data.experimental.AUTOTUNE

In [None]:
MODEL_NAME = None
if config.model_type == 'effnetv1':
    MODEL_NAME = f'effnetv1_b{config.EFF_NET}'
elif config.model_type == 'effnetv2':
    MODEL_NAME = f'effnetv2_{config.EFF_NETV2}'
elif config.model_type == 'swin':
    MODEL_NAME = 'SwinTransformer'
    config.IMAGE_SIZE = 384

config.MODEL_NAME = MODEL_NAME
print(MODEL_NAME)

In [8]:
# save config as file
with open(config.save_dir+'/config.json', 'w') as fp:
    json.dump({x:dict(config.__dict__)[x] for x in dict(config.__dict__) if not x.startswith('_')}, fp)

## Augmentation

In [None]:
def albumentations_aug(image):
    transform = A.Compose([
                      A.ToGray(p=0.01),
                      A.OneOf([
                              A.GaussNoise(var_limit=[10, 50]),
                              #A.GaussianBlur(),
                              A.MotionBlur(),
                              A.MedianBlur(),
                              ], p=0.2),
                      A.OneOf([
                              A.OpticalDistortion(distort_limit=1.0),
                              A.GridDistortion(num_steps=5, distort_limit=1.),
                              A.ElasticTransform(alpha=3),
                              ], p=0.2),
                      A.OneOf([
                              A.CLAHE(),
                              A.RandomBrightnessContrast(),
                              ], p=0.25),
                      A.HueSaturationValue(p=0.25),
                      A.ShiftScaleRotate(p=0.5, shift_limit=0.0625, scale_limit=0.2, rotate_limit=20),
                      #A.Cutout(max_h_size=int(input_size * 0.1), max_w_size=int(input_size * 0.1), num_holes=5, p=0.5),
                      #A.Normalize(),
                      ])
    aug_img = image #.numpy()
    aug_img = transform(aug_img)["image"]
    #aug_img = tf.cast(aug_img, tf.float32)
    return aug_img

def data_augment_albumentations(posting_id, image, label_group, matches):
    image = tf.numpy_function(func=albumentations_aug, inp=[image], Tout=tf.float32)
    #image = albumentations_aug(image)
    return posting_id, image, label_group, matches

## Data Retrieval

<h3 style="font-family: Verdana; font-size: 20px; font-style: normal; font-weight: normal; text-decoration: none; text-transform: none; letter-spacing: 2px; color: #3eb489; background-color: #ffffff;">2.2 COMPETITION DATA ACCESS</h3>

---

TPUs read data must be read directly from **G**oogle **C**loud **S**torage **(GCS)**. Kaggle provides a utility library – **`KaggleDatasets`** – which has a utility function **`.get_gcs_path`** that will allow us to access the location of our input datasets within **GCS**.<br><br>

<div class="alert alert-block alert-info" style="margin: 2em; line-height: 1.7em; font-family: Verdana;">
    <b style="font-size: 16px;">📌 &nbsp; TIPS:</b><br><br>- If you have multiple datasets attached to the notebook, you should pass the name of a specific dataset to the <b><code>`get_gcs_path()`</code></b> function. <i>In our case, the name of the dataset is the name of the directory the dataset is mounted within.</i><br><br>
</div>

In [None]:
# get array of filepaths for train and test (tfrecords)
if config.DATA_SOURCE == 'happywhale-tfrecords-bb':
  GCS_PATH = 'gs://kds-a5eeea32d4c32a10dc6b97fbb717078dc399034f3ea0eaf98adea861' 
if config.DATA_SOURCE == 'backfintfrecords':
  GCS_PATH = 'gs://kds-94784845edbaab59b6c263479b01221bbb4d752853f35846a3485146'
if config.DATA_SOURCE == 'happywhale-tfr-normal':
  GCS_PATH = 'gs://kds-984c91f885ed5d26ae9b4af1ee7fbf0d10c6a49739a78532070597ec'  # Get GCS Path from kaggle notebook if GCS Path is expired
if config.DATA_SOURCE == 'happywhale-tfrecords-fullbody':
  GCS_PATH = 'gs://kds-18f0566db9d63fdf865734f94afc3ad154574ec54ce6b3e42ef9008d'
if not IS_COLAB:
    GCS_PATH = KaggleDatasets().get_gcs_path(config.DATA_SOURCE)
    
train_files = np.sort(np.array(tf.io.gfile.glob(GCS_PATH + '/happywhale-2022-train*.tfrec')))
test_files = np.sort(np.array(tf.io.gfile.glob(GCS_PATH + '/happywhale-2022-test*.tfrec')))
print(GCS_PATH)
print(len(train_files),len(test_files),count_data_items(train_files),count_data_items(test_files))

In [10]:
# ADJUST ORDER OF REPEAT & SHUFFLE


# utility to format inputs for calling arcface
def arcface_format(posting_id, image, label_group, matches):
    return posting_id, {'inp1': image, 'inp2': label_group}, label_group, matches

def arcface_inference_format(posting_id, image, label_group, matches):
    return image,posting_id

def arcface_eval_format(posting_id, image, label_group, matches):
    return image,label_group

# Data augmentation function
def data_augment(posting_id, image, label_group, matches):

    ### CUTOUT
    if tf.random.uniform([])>0.5 and config.CUTOUT:
      N_CUTOUT = 6
      for cutouts in range(N_CUTOUT):
        if tf.random.uniform([])>0.5:
           DIM = config.IMAGE_SIZE
           CUTOUT_LENGTH = DIM//8
           x1 = tf.cast( tf.random.uniform([],0,DIM-CUTOUT_LENGTH),tf.int32)
           x2 = tf.cast( tf.random.uniform([],0,DIM-CUTOUT_LENGTH),tf.int32)
           filter_ = tf.concat([tf.zeros((x1,CUTOUT_LENGTH)),tf.ones((CUTOUT_LENGTH,CUTOUT_LENGTH)),tf.zeros((DIM-x1-CUTOUT_LENGTH,CUTOUT_LENGTH))],axis=0)
           filter_ = tf.concat([tf.zeros((DIM,x2)),filter_,tf.zeros((DIM,DIM-x2-CUTOUT_LENGTH))],axis=1)
           cutout = tf.reshape(1-filter_,(DIM,DIM,1))
           image = cutout*image

    image = tf.image.random_flip_left_right(image)
    # image = tf.image.random_flip_up_down(image)
    image = tf.image.random_hue(image, 0.01) #0.01
    image = tf.image.random_saturation(image, 0.70, 1.30) #0.70, 1.30
    image = tf.image.random_contrast(image, 0.80, 1.20) #0.80, 1.20
    image = tf.image.random_brightness(image, 0.10) #0.10
    image = image / 255.0
    return posting_id, image, label_group, matches

# Function to decode our images
if config.bounding_box:
    def decode_image(image_data, box):
      if box is not None and box[0] != -1:
          left, top, right, bottom = box[0], box[1], box[2], box[3]
          bbs = tf.convert_to_tensor([top, left, bottom - top, right - left])
          image = tf.io.decode_and_crop_jpeg(image_data, bbs, channels=3)
      else:
          image = tf.image.decode_jpeg(image_data, channels = 3)

      image = tf.image.resize(image, [config.IMAGE_SIZE,config.IMAGE_SIZE])
      image = tf.cast(image, tf.float32) #/ 255.0
      return image

    def read_labeled_tfrecord(example):
      LABELED_TFREC_FORMAT = {
          "image_name": tf.io.FixedLenFeature([], tf.string),
          "image": tf.io.FixedLenFeature([], tf.string),
          "target": tf.io.FixedLenFeature([], tf.int64),
          'detic_box': tf.io.FixedLenFeature([4], tf.int64),
          # 'yolov5_box': tf.io.FixedLenFeature([4], tf.int64),
      }

      example = tf.io.parse_single_example(example, LABELED_TFREC_FORMAT)
      posting_id = example['image_name']
      bb = tf.cast(example['detic_box'], tf.int32)
      image = decode_image(example['image'], bb)
      #label_group = tf.one_hot(tf.cast(example['label_group'], tf.int32), depth = N_CLASSES)
      label_group = tf.cast(example['target'], tf.int32)
      #matches = example['matches']
      matches = 1
      return posting_id, image, label_group, matches

else:
    def decode_image(image_data):
        image = tf.image.decode_jpeg(image_data, channels = 3)
        image = tf.image.resize(image, [config.IMAGE_SIZE,config.IMAGE_SIZE])
        image = tf.cast(image, tf.float32) / 255.0
        return image

    # This function parse our images and also get the target variable
    def read_labeled_tfrecord(example):
        LABELED_TFREC_FORMAT = {
            "image_name": tf.io.FixedLenFeature([], tf.string),
            "image": tf.io.FixedLenFeature([], tf.string),
            "target": tf.io.FixedLenFeature([], tf.int64),
    #         "matches": tf.io.FixedLenFeature([], tf.string)
        }

        example = tf.io.parse_single_example(example, LABELED_TFREC_FORMAT)
        posting_id = example['image_name']
        image = decode_image(example['image'])
    #     label_group = tf.one_hot(tf.cast(example['label_group'], tf.int32), depth = N_CLASSES)
        label_group = tf.cast(example['target'], tf.int32)
    #     matches = example['matches']
        matches = 1
        return posting_id, image, label_group, matches

# This function loads TF Records and parse them into tensors
def load_dataset(filenames, ordered = False):
    
    ignore_order = tf.data.Options()
    if not ordered:
        ignore_order.experimental_deterministic = False 
        
    dataset = tf.data.TFRecordDataset(filenames, num_parallel_reads = AUTO)
#     dataset = dataset.cache()
    dataset = dataset.with_options(ignore_order)
    dataset = dataset.map(read_labeled_tfrecord, num_parallel_calls = AUTO) 
    return dataset

# This function is to get our training tensors
def get_training_dataset(filenames):
    dataset = load_dataset(filenames, ordered = False)
    # look into data augmentation
    dataset = dataset.map(data_augment, num_parallel_calls = AUTO) #data_augment_albumentations
    #dataset = dataset.map(data_augment_albumentations, num_parallel_calls = AUTO)
    dataset = dataset.map(arcface_format, num_parallel_calls = AUTO)
    dataset = dataset.map(lambda posting_id, image, label_group, matches: (image, label_group))
    # shuffle before repeating
    dataset = dataset.repeat()
    dataset = dataset.shuffle(2048)
    dataset = dataset.batch(config.BATCH_SIZE)
    dataset = dataset.prefetch(AUTO)
    return dataset

# This function is to get our training tensors
def get_val_dataset(filenames):
    dataset = load_dataset(filenames, ordered = True)
    # look into augmentation
    dataset = dataset.map(data_augment, num_parallel_calls = AUTO)
    #dataset = dataset.map(data_augment_albumentations, num_parallel_calls = AUTO)
    dataset = dataset.map(arcface_format, num_parallel_calls = AUTO)
    dataset = dataset.map(lambda posting_id, image, label_group, matches: (image, label_group))
    dataset = dataset.batch(config.BATCH_SIZE)
    dataset = dataset.prefetch(AUTO)
    return dataset

# This function is to get our training tensors
def get_eval_dataset(filenames, get_targets = True):
    dataset = load_dataset(filenames, ordered = True)
    # look into augmentation
    dataset = dataset.map(data_augment, num_parallel_calls = AUTO)
    #dataset = dataset.map(data_augment_albumentations, num_parallel_calls = AUTO)
    dataset = dataset.map(arcface_eval_format, num_parallel_calls = AUTO)
    if not get_targets:
        dataset = dataset.map(lambda image, target: image)
    dataset = dataset.batch(config.BATCH_SIZE)
    dataset = dataset.prefetch(AUTO)
    return dataset

# This function is to get our training tensors
def get_test_dataset(filenames, get_names = True):
    dataset = load_dataset(filenames, ordered = True)
    # look into augmentation
    dataset = dataset.map(data_augment, num_parallel_calls = AUTO)
    #dataset = dataset.map(data_augment_albumentations, num_parallel_calls = AUTO)
    dataset = dataset.map(arcface_inference_format, num_parallel_calls = AUTO)
    if not get_names:
        dataset = dataset.map(lambda image, posting_id: image)
    dataset = dataset.batch(config.BATCH_SIZE)
    dataset = dataset.prefetch(AUTO)
    return dataset

In [None]:
# plot training images 

row = 10; col = 8;
row = min(row,config.BATCH_SIZE//col)
N_TRAIN = count_data_items(train_files)
print(N_TRAIN)
ds = get_training_dataset(train_files)

for (sample,label) in ds:
    img = sample['inp1']
    plt.figure(figsize=(25,int(25*row/col)))
    for j in range(row*col):
        plt.subplot(row,col,j+1)
        plt.title(label[j].numpy())
        plt.axis('off')
        plt.imshow(img[j,])
    plt.show()
    break
print(img.shape)

In [None]:
# plot test images

row = 10; col = 8;
row = min(row,config.BATCH_SIZE//col)
N_TEST = count_data_items(test_files)
print(N_TEST)
ds = get_test_dataset(test_files)

for (img,label) in ds:
    plt.figure(figsize=(25,int(25*row/col)))
    for j in range(row*col):
        plt.subplot(row,col,j+1)
        plt.title(label[j].numpy())
        plt.axis('off')
        plt.imshow(img[j,])
    plt.show()
    break
print(img.shape)

## XLA Optimization

In [None]:
print(f"... XLA OPTIMIZATIONS STARTING ...")

print(f"... CONFIGURE JIT (JUST IN TIME) COMPILATION ...")
# enable XLA optmizations (10% speedup when using @tf.function calls)
tf.config.optimizer.set_jit(True)

print(f"... XLA OPTIMIZATIONS COMPLETED ...")

## Model

In [12]:
# Arcmarginproduct class keras layer
class ArcMarginProduct(tf.keras.layers.Layer):
    '''
    Implements large margin arc distance.

    Reference:
        https://arxiv.org/pdf/1801.07698.pdf
        https://github.com/lyakaap/Landmark2019-1st-and-3rd-Place-Solution/
            blob/master/src/modeling/metric_learning.py
    '''
    def __init__(self, n_classes, s=30, m=0.50, easy_margin=False,
                 ls_eps=0.0, **kwargs):

        super(ArcMarginProduct, self).__init__(**kwargs)

        self.n_classes = n_classes
        self.s = s
        self.m = m
        self.ls_eps = ls_eps
        self.easy_margin = easy_margin
        self.cos_m = tf.math.cos(m)
        self.sin_m = tf.math.sin(m)
        self.th = tf.math.cos(math.pi - m)
        self.mm = tf.math.sin(math.pi - m) * m

    def get_config(self):

        config = super().get_config().copy()
        config.update({
            'n_classes': self.n_classes,
            's': self.s,
            'm': self.m,
            'ls_eps': self.ls_eps,
            'easy_margin': self.easy_margin,
        })
        return config

    def build(self, input_shape):
        super(ArcMarginProduct, self).build(input_shape[0])

        self.W = self.add_weight(
            name='W',
            shape=(int(input_shape[0][-1]), self.n_classes),
            initializer='glorot_uniform',
            dtype='float32',
            trainable=True,
            regularizer=None)

    def call(self, inputs):
        X, y = inputs
        y = tf.cast(y, dtype=tf.int32)
        cosine = tf.matmul(
            tf.math.l2_normalize(X, axis=1),
            tf.math.l2_normalize(self.W, axis=0)
        )
        sine = tf.math.sqrt(1.0 - tf.math.pow(cosine, 2))
        phi = cosine * self.cos_m - sine * self.sin_m
        if self.easy_margin:
            phi = tf.where(cosine > 0, phi, cosine)
        else:
            phi = tf.where(cosine > self.th, phi, cosine - self.mm)
        one_hot = tf.cast(
            tf.one_hot(y, depth=self.n_classes),
            dtype=cosine.dtype
        )
        if self.ls_eps > 0:
            one_hot = (1 - self.ls_eps) * one_hot + self.ls_eps / self.n_classes

        output = (one_hot * phi) + ((1.0 - one_hot) * cosine)
        output *= self.s
        return output

In [13]:
class SubCenterArcMargin(tf.keras.layers.Layer):
    def __init__(self, n_classes, k=3, s=30, m=0.50, easy_margin=False,
                 ls_eps=0.0, **kwargs):

        super(SubCenterArcMargin, self).__init__(**kwargs)

        self.n_classes = n_classes
        self.k = k
        self.s = s
        self.m = m
        self.ls_eps = ls_eps
        self.easy_margin = easy_margin
        self.cos_m = tf.math.cos(m)
        self.sin_m = tf.math.sin(m)
        self.th = tf.math.cos(math.pi - m)
        self.mm = tf.math.sin(math.pi - m) * m

    def get_config(self):

        config = super().get_config().copy()
        config.update({
            'n_classes': self.n_classes,
            's': self.s,
            'm': self.m,
            'k': self.k,
            'ls_eps': self.ls_eps,
            'easy_margin': self.easy_margin,
        })
        return config

    def build(self, input_shape):
        super(SubCenterArcMargin, self).build(input_shape[0])

        self.W = self.add_weight(
            name='W',
            shape=(int(input_shape[0][-1]), self.n_classes*self.k),
            initializer='glorot_uniform',
            dtype='float32',
            trainable=True,
            regularizer=None)

    def call(self, inputs):
        X, y = inputs
        y = tf.cast(y, dtype=tf.int32)
        cosine = tf.matmul(
            tf.math.l2_normalize(X, axis=1),
            tf.math.l2_normalize(self.W, axis=0)
        )
        ##########
        cosine = tf.reshape(cosine, shape=[-1, self.n_classes, self.k]) #check if works (batch size included?!)
        # cosine = tf.keras.layers.Reshape((self.n_classes, self.k))(cosine) 
        cosine = tf.math.reduce_max(cosine, -1)
        ##########
        
        sine = tf.math.sqrt(1.0 - tf.math.pow(cosine, 2))
        phi = cosine * self.cos_m - sine * self.sin_m
        if self.easy_margin:
            phi = tf.where(cosine > 0, phi, cosine)
        else:
            phi = tf.where(cosine > self.th, phi, cosine - self.mm)
        one_hot = tf.cast(
            tf.one_hot(y, depth=self.n_classes),
            dtype=cosine.dtype
        )
        if self.ls_eps > 0:
            one_hot = (1 - self.ls_eps) * one_hot + self.ls_eps / self.n_classes

        output = (one_hot * phi) + ((1.0 - one_hot) * cosine)
        output *= self.s
        return output

In [14]:
class GeMPoolingLayer(tf.keras.layers.Layer):
    def __init__(self, p=3., train_p=False):
        super().__init__()
        if train_p:
          self.p = tf.Variable(p, dtype=tf.float32)
        else:
          self.p = p
        self.eps = 1e-6

    def call(self, inputs: tf.Tensor, **kwargs):
        inputs = tf.clip_by_value(inputs, clip_value_min=self.eps, clip_value_max=tf.reduce_max(inputs))
        inputs = tf.pow(inputs, self.p)
        inputs = tf.reduce_mean(inputs, axis=[1, 2], keepdims=False) 
        # tf.keras.layers.GlobalAveragePooling2D(data_format='channels_last', keepdims=False)(inputs)
        outputs = tf.pow(inputs, 1./self.p)
        return outputs

In [15]:
class CategoricalCrossentropyLS(tf.keras.losses.Loss):
  def call(self, y_true, y_pred):
    #y_pred = tf.convert_to_tensor_v2(y_pred)
    y_true = tf.squeeze(tf.one_hot(tf.cast(y_true, tf.int32), config.N_CLASSES))
    y_true = tf.cast(y_true, y_pred.dtype)
    return tf.keras.metrics.categorical_crossentropy(y_true, y_pred, label_smoothing = config.smoothing_param)

def scce_with_ls(y_true, y_pred):
      y_true = tf.one_hot(tf.cast(y_true, tf.int32), config.N_CLASSES)
      y_true = tf.reshape(y_true, shape=[-1, config.N_CLASSES])
      return tf.keras.losses.categorical_crossentropy(y_true, y_pred, label_smoothing = config.smoothing_param)

In [127]:
class HybridEmbedding(tf.keras.layers.Layer):
    def __init__(self, backbone, img_size=224, patch_size=1, feature_size=None, in_channels=3, embed_dim=768):
        super().__init__()
        self.img_size = img_size
        self.patch_size = patch_size
        self.backbone = backbone
        self.grid_size = (feature_size // patch_size, feature_size // patch_size)
        self.num_patches = self.grid_size[0] * self.grid_size[1]
        self.proj = tf.keras.layers.Conv2D(embed_dim, kernel_size=patch_size, strides=patch_size)

    def call(self, inputs: tf.Tensor):
        outputs = self.backbone(inputs)
        outputs = self.proj(outputs)
        batch_size, height, width, dim = tf.unstack(tf.shape(outputs)[:4])
        outputs = tf.reshape(tensor=outputs, shape=(batch_size, height * width, -1))
        return outputs

In [129]:
EFNS = [efn.EfficientNetB0, efn.EfficientNetB1, efn.EfficientNetB2, efn.EfficientNetB3, 
        efn.EfficientNetB4, efn.EfficientNetB5, efn.EfficientNetB6, efn.EfficientNetB7]

class HybridModel(tf.keras.layers.Layer):
    def __init__(self, backbone='swin_base_patch4_window12_384_in22k', img_size=config.IMAGE_SIZE, patch_size=1):
        super().__init__()
        embedder = EFNS[config.EFF_NET](weights = 'noisy-student', include_top = False,input_shape = [config.IMAGE_SIZE, config.IMAGE_SIZE, 3])
        self.embedder = tf.keras.Model(inputs=embedder.input, outputs=embedder.layers[37].output)
        self.backbone = tfimm.create_model(backbone, pretrained="timm", in_channels=3, nb_classes=0)
        self.backbone.cfg.patch_size = patch_size
        #self.backbone.cfg.patch_resolution = 224 // patch_size
        self.backbone.patch_embed = HybridEmbedding(self.embedder,img_size=img_size, 
                                              patch_size=patch_size, 
                                              feature_size= self.backbone.cfg.input_size[0] // self.backbone.cfg.patch_size, 
                                              in_channels=3, 
                                              embed_dim=self.backbone.cfg.embed_dim)
        
    def call(self, inputs: tf.Tensor):
        outputs = self.backbone(inputs)
        return outputs

In [130]:
# model

EFNS = [efn.EfficientNetB0, efn.EfficientNetB1, efn.EfficientNetB2, efn.EfficientNetB3, 
        efn.EfficientNetB4, efn.EfficientNetB5, efn.EfficientNetB6, efn.EfficientNetB7]

def freeze_BN(model):
    # Unfreeze layers while leaving BatchNorm layers frozen
    for layer in model.layers:
        if not isinstance(layer, tf.keras.layers.BatchNormalization):
            layer.trainable = True
        else:
            layer.trainable = False

# Function to create our EfficientNetB3 model
def get_model(optimizer):

    if config.head=='arcface':
      head = ArcMarginProduct(n_classes=config.N_CLASSES, s=config.arcface_s, m=config.arcface_m, 
                              name=f'head/{config.head}', dtype='float32')
    elif config.head=='subcenter_arcface':
      head = SubCenterArcMargin(n_classes=config.N_CLASSES, k=config.subarcface_k, s=config.arcface_s, m=config.arcface_m, 
                                name=f'head/{config.head}', dtype='float32')
    else:
        assert 1==2, "INVALID HEAD"
      
    if config.pool == 'gem':
      pool = GeMPoolingLayer(train_p=True)
    else:
      pool = tf.keras.layers.GlobalAveragePooling2D()
    
    #with strategy.scope():

    inp = tf.keras.layers.Input(shape = [config.IMAGE_SIZE, config.IMAGE_SIZE, 3], name = 'inp1')
    label = tf.keras.layers.Input(shape = (), name = 'inp2')
        
    if config.model_type == 'effnetv1':
        x = EFNS[config.EFF_NET](weights = 'noisy-student', include_top = False)(inp)
        if config.concat:
          inp = EFNS[config.EFF_NET](weights = 'noisy-student', include_top = False,input_shape = [config.IMAGE_SIZE, config.IMAGE_SIZE, 3])
          inp.layers[0]._name = 'inp1'
          x1=pool(inp.layers[-1].output)
          x2=pool(inp.layers[-5].output)
          x3=pool(inp.layers[-7].output)
          x4=pool(inp.layers[-13].output)
          embed =  tf.concat([x1,x2,x3,x4],axis = 1)
        else:
          embed = pool(x)
    elif config.model_type == 'effnetv2':
        FEATURE_VECTOR = f'{EFFNETV2_ROOT}/tfhub_models/efficientnetv2-{config.EFF_NETV2}/feature_vector'
        embed = tfhub.KerasLayer(FEATURE_VECTOR, trainable=True)(inp)
    #elif config.model_type == 'swin':
        #embed = tfimm.create_model("swin_base_patch4_window12_384_in22k", pretrained="timm", in_channels=3, nb_classes=0)(inp)
    elif config.model_type == 'hybrid':
        embed = HybridModel(backbone='swin_base_patch4_window7_224', patch_size=4)(inp)
        
            
    embed = tf.keras.layers.Dropout(config.dropout)(embed)
    if config.concat:
      embed = tf.keras.layers.Dense(2048)(embed) #2048
    else:
      embed = tf.keras.layers.Dense(config.embed_size)(embed)

    if config.neck_activation == 'prelu':
      embed = tf.keras.layers.BatchNormalization()(embed)
      embed = tf.keras.layers.PReLU()(embed)
    embed = tf.keras.layers.BatchNormalization()(embed)
    embed = tf.keras.layers.ReLU()(embed)
    x = head([embed, label])
        
    output = tf.keras.layers.Softmax(dtype='float32')(x)
        
    if config.concat:
      model = tf.keras.models.Model(inputs = [inp.input, label], outputs = [output])
      embed_model = tf.keras.models.Model(inputs = inp.input, outputs = embed)  
    else:
      model = tf.keras.models.Model(inputs = [inp, label], outputs = [output])
      embed_model = tf.keras.models.Model(inputs = inp, outputs = embed)  
        
    if config.FREEZE_BATCH_NORM:
        freeze_BN(model)

    if config.label_smoothing:
      loss = scce_with_ls
    else:
      loss = tf.keras.losses.SparseCategoricalCrossentropy()

    model.compile(
        optimizer = optimizer,
        loss = [loss], #tf.keras.losses.SparseCategoricalCrossentropy()
        metrics = [tf.keras.metrics.SparseCategoricalAccuracy(),tf.keras.metrics.SparseTopKCategoricalAccuracy(k=5)]
        #,steps_per_execution=64
        ) 
        
    return model,embed_model

In [None]:
def get_lr_callback(plot=False):
    lr_start   = 0.000001
    lr_max     = 0.000005 * config.BATCH_SIZE
    lr_min     = 0.000001
    lr_ramp_ep = 4
    lr_sus_ep  = 0
    lr_decay   = 0.9
   
    def lrfn(epoch):
        if config.RESUME:
            epoch = epoch + config.RESUME_EPOCH
        if epoch < lr_ramp_ep:
            lr = (lr_max - lr_start) / lr_ramp_ep * epoch + lr_start
            
        elif epoch < lr_ramp_ep + lr_sus_ep:
            lr = lr_max
            
        else:
            lr = (lr_max - lr_min) * lr_decay**(epoch - lr_ramp_ep - lr_sus_ep) + lr_min
            
        return lr
        
    if plot:
        epochs = list(range(config.EPOCHS))
        learning_rates = [lrfn(x) for x in epochs]
        plt.scatter(epochs,learning_rates)
        plt.show()

    lr_callback = tf.keras.callbacks.LearningRateScheduler(lrfn, verbose=False)
    return lr_callback

get_lr_callback(plot=True)

In [132]:
# Lr Schedule & Optimizer

class WarmUpandDecaySchedule(tf.keras.optimizers.schedules.LearningRateSchedule):

  def __init__(
      self,
      initial_learning_rate: float,
      decay_schedule_fn: callable,
      warmup_steps: int,
      power: float = 1.0,
      name: str = None,
  ):
      super().__init__()
      self.initial_learning_rate = initial_learning_rate
      self.warmup_steps = warmup_steps
      self.power = power
      self.decay_schedule_fn = decay_schedule_fn
      self.name = name

  def __call__(self, step):
      with tf.name_scope(self.name or "WarmUp") as name:
          global_step_float = tf.cast(step, tf.float32)
          warmup_steps_float = tf.cast(self.warmup_steps, tf.float32)
          warmup_percent_done = global_step_float / warmup_steps_float
          warmup_learning_rate = self.initial_learning_rate * tf.math.pow(warmup_percent_done, self.power)
          return tf.cond(
              global_step_float < warmup_steps_float,
              lambda: warmup_learning_rate,
              lambda: self.decay_schedule_fn(step - self.warmup_steps),
              name=name,
          )

#plt.plot([custom_schedule(n) for n in range(10000)])
#plt.show()

def get_optimizer(learning_rate, max_epochs, warmup_epochs, steps_per_epoch, epsilon=1e-6, weight_decay_rate=0.0001, 
                  warmup_power=1, alpha=0.0):
  decay_steps = steps_per_epoch * (max_epochs-warmup_epochs)
  warmup_steps = warmup_epochs * steps_per_epoch
  decay_schedule = tf.keras.optimizers.schedules.CosineDecay(initial_learning_rate=learning_rate, decay_steps=decay_steps, alpha=alpha)
  custom_schedule = WarmUpandDecaySchedule(learning_rate, decay_schedule, warmup_steps, warmup_power)
  optimizer = AdamWeightDecay(learning_rate=custom_schedule, weight_decay_rate=weight_decay_rate, epsilon=epsilon,
        exclude_from_weight_decay=['LayerNorm', 'layer_norm', 'bias'])
  return optimizer

In [133]:
class Snapshot(tf.keras.callbacks.Callback):
    
    def __init__(self,fold,snapshot_epochs=[]):
        super(Snapshot, self).__init__()
        self.snapshot_epochs = snapshot_epochs
        self.fold = fold
        
        
    def on_epoch_end(self, epoch, logs=None):
        # logs is a dictionary
#         print(f"epoch: {epoch}, train_acc: {logs['acc']}, valid_acc: {logs['val_acc']}")
        if epoch in self.snapshot_epochs: # your custom condition         
            self.model.save_weights(config.save_dir+f"/EF{config.MODEL_NAME}_epoch{epoch}.h5")
        self.model.save_weights(config.save_dir+f"/{config.MODEL_NAME}_last.h5")

In [None]:
TRAINING_FILENAMES = [x for i,x in enumerate(train_files) if i%config.FOLDS!=config.FOLD_TO_RUN]
VALIDATION_FILENAMES = [x for i,x in enumerate(train_files) if i%config.FOLDS==config.FOLD_TO_RUN]
print(len(TRAINING_FILENAMES),len(VALIDATION_FILENAMES),count_data_items(TRAINING_FILENAMES),count_data_items(VALIDATION_FILENAMES))

In [135]:
if config.DEBUG:
    TRAINING_FILENAMES = [TRAINING_FILENAMES[0]]
    VALIDATION_FILENAMES = [VALIDATION_FILENAMES[0]]
    print(len(TRAINING_FILENAMES),len(VALIDATION_FILENAMES),count_data_items(TRAINING_FILENAMES),count_data_items(VALIDATION_FILENAMES))
    test_files = [test_files[0]]

In [None]:
seed_everything(config.SEED)
VERBOSE = 1
train_dataset = get_training_dataset(TRAINING_FILENAMES)
val_dataset = get_val_dataset(VALIDATION_FILENAMES)
STEPS_PER_EPOCH = count_data_items(TRAINING_FILENAMES) // config.BATCH_SIZE
train_logger = tf.keras.callbacks.CSVLogger(config.save_dir+'/training-log-fold-%i.h5.csv'%config.FOLD_TO_RUN)
# SAVE BEST MODEL EACH FOLD        
sv_loss = tf.keras.callbacks.ModelCheckpoint(
    config.save_dir+f"/{config.MODEL_NAME}_loss.h5", monitor='val_loss', verbose=0, save_best_only=True,
    save_weights_only=True, mode='min', save_freq='epoch')
# BUILD MODEL
K.clear_session()
if config.optimizer == 'AdamW':
  opt = get_optimizer(config.LR, max_epochs=config.EPOCHS, warmup_epochs=4, steps_per_epoch=STEPS_PER_EPOCH, epsilon=1e-6, weight_decay_rate=0.0001, 
                  warmup_power=1, alpha=0.0)
  callbacks = [train_logger, sv_loss]
else:
  opt = tf.keras.optimizers.Adam(learning_rate = config.LR)
  callbacks = [get_lr_callback(), train_logger, sv_loss]
with strategy.scope():
  model,embed_model = get_model(opt)
snap = Snapshot(fold=config.FOLD_TO_RUN,snapshot_epochs=[5,8])
model.summary()

if config.RESUME:   
    model.load_weights(config.resume_model_wts)

In [None]:
print('#### Image Size %i with EfficientNet B%i and batch_size %i'%
      (config.IMAGE_SIZE,config.EFF_NET,config.BATCH_SIZE))

history = model.fit(train_dataset,
                validation_data = val_dataset,
                steps_per_epoch = STEPS_PER_EPOCH,
                epochs = config.EPOCHS,
                callbacks = callbacks, #[get_lr_callback(), train_logger, sv_loss]
                verbose = VERBOSE)

In [None]:
model.load_weights(config.save_dir+f"/{config.MODEL_NAME}_loss.h5")

## Evaluation Metric

In [None]:
if config.DATA_SOURCE in ['happywhale-tfrecords-bb', 'backfintfrecords', 'happywhale-tfrecords-fullbody']:
    with open('/content/drive/MyDrive/Kaggle/HappyWhale-2022/individual_ids.json', 'r') as fp:
      int2str = json.loads(fp.read())
    int2str = {int2str[x]:x for x in int2str}

In [None]:
def get_ids(filename):
    ds = get_test_dataset([filename],get_names=True).map(lambda image, image_name: image_name).unbatch()
    NUM_IMAGES = count_data_items([filename])
    ids = next(iter(ds.batch(NUM_IMAGES))).numpy().astype('U')
    return ids

def get_targets(filename):
    ds = get_eval_dataset([filename],get_targets=True).map(lambda image, target: target).unbatch()
    NUM_IMAGES = count_data_items([filename])
    ids = next(iter(ds.batch(NUM_IMAGES))).numpy()
    return ids

def get_embeddings(filename):
    ds = get_test_dataset([filename],get_names=False)
    if config.ensemble:
      embeddings = np.mean(np.stack([embed_models[x][1].predict(ds,verbose=0) for x in range(len(embed_models))]), axis=0)
    else:
      embeddings = embed_model.predict(ds,verbose=0) 
    return embeddings

def get_preds(filename):
    ds = get_test_dataset([filename],get_names=False)
    preds = model.predict(ds,verbose=0)
    return preds

def get_predictions(test_df,threshold=0.2):
    predictions = {}
    for i,row in tqdm(test_df.iterrows()):
        if row.image in predictions:
            if len(predictions[row.image])==5: 
                continue
            predictions[row.image].append(row.target)
        elif row.confidence>threshold:
            predictions[row.image] = [row.target,'new_individual']
        else:
            predictions[row.image] = ['new_individual',row.target]

    for x in tqdm(predictions):
        if len(predictions[x])<5:
            remaining = [y for y in sample_list if y not in predictions]
            predictions[x] = predictions[x]+remaining
            predictions[x] = predictions[x][:5]
        
    return predictions

def map_per_image(label, predictions):
    """Computes the precision score of one image.

    Parameters
    ----------
    label : string
            The true label of the image
    predictions : list
            A list of predicted elements (order does matter, 5 predictions allowed per image)

    Returns
    -------
    score : double
    """    
    try:
        return 1 / (predictions[:5].index(label) + 1)
    except ValueError:
        return 0.0
    
def map_per_set(labels, predictions):
    """Computes the average over multiple images.

    Parameters
    ----------
    labels : list
             A list of the true labels. (Only one true label per images allowed!)
    predictions : list of list
             A list of predicted elements (order does matter, 5 predictions allowed per image)

    Returns
    -------
    score : double
    """
    return np.mean([map_per_image(l, p) for l,p in zip(labels, predictions)])
    
sample_list = ['938b7e931166', '5bf17305f073', '7593d2aee842', '7362d7a01d00','956562ff2888']

In [None]:
train_targets = []
train_embeddings = []
train_ids = []
for filename in tqdm(TRAINING_FILENAMES):
    embeddings = get_embeddings(filename)
    targets = get_targets(filename)
    train_embeddings.append(embeddings)
    train_targets.append(targets)
    ids = get_ids(filename)
    train_ids.append(ids)
train_embeddings = np.concatenate(train_embeddings)
train_targets = np.concatenate(train_targets)
train_ids = np.concatenate(train_ids)

In [None]:
neigh = NearestNeighbors(n_neighbors=config.KNN,metric='cosine')
neigh.fit(train_embeddings)

In [None]:

val_ids = []
test_nn_distances = []
test_nn_idxs = []
val_targets = []
val_embeddings = []
# val_preds = [] # new
for filename in tqdm(VALIDATION_FILENAMES):
    embeddings = get_embeddings(filename)
    targets = get_targets(filename)
    # preds = get_preds(filename) # new
    ids = get_ids(filename)
    distances,idxs = neigh.kneighbors(embeddings, config.KNN, return_distance=True)
    val_ids.append(ids)
    test_nn_idxs.append(idxs)
    test_nn_distances.append(distances)
    val_embeddings.append(embeddings)
    val_targets.append(targets)
    # val_preds.append(preds) # new
test_nn_distances = np.concatenate(test_nn_distances)
test_nn_idxs = np.concatenate(test_nn_idxs)
val_ids = np.concatenate(val_ids)
val_embeddings = np.concatenate(val_embeddings)
val_targets = np.concatenate(val_targets)
# val_preds = np.concatenate(val_preds) # new

  0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
allowed_targets = set([int2str[x] for x in np.unique(train_targets)]) 
val_targets_df = pd.DataFrame(np.stack([val_ids,val_targets],axis=1),columns=['image','target'])
val_targets_df['target'] = val_targets_df['target'].astype(int).map(int2str)
val_targets_df.loc[~val_targets_df.target.isin(allowed_targets),'target'] = 'new_individual'
val_targets_df.target.value_counts()

In [None]:
test_df = []
for i in tqdm(range(len(val_ids))):
    id_ = val_ids[i]
    targets = train_targets[test_nn_idxs[i]]
    distances = test_nn_distances[i]
    subset_preds = pd.DataFrame(np.stack([targets,distances],axis=1),columns=['target','distances'])
    subset_preds['image'] = id_
    test_df.append(subset_preds)
test_df = pd.concat(test_df).reset_index(drop=True)
test_df['confidence'] = 1-test_df['distances']
test_df = test_df.groupby(['image','target']).confidence.max().reset_index() 
test_df = test_df.sort_values('confidence',ascending=False).reset_index(drop=True)
test_df['target'] = test_df['target'].map(int2str)
test_df.to_csv('val_neighbors.csv')
test_df.image.value_counts().value_counts()

In [None]:
def get_predictions_2(test_df,threshold=0.2):
    predictions = {}
    for i,row in tqdm(test_df.iterrows()):
        if row.image in predictions:
            if len(predictions[row.image])==5:
                continue
            elif row.confidence<=threshold:
                if 'new_individual' not in predictions[row.image]:
                    predictions[row.image].append('new_individual')
                    if len(predictions[row.image])==5:
                        continue
            predictions[row.image].append(row.target)
        elif row.confidence>threshold:
            predictions[row.image] = [row.target]
        else:
            predictions[row.image] = ['new_individual',row.target]

    for x in tqdm(predictions):
        if len(predictions[x])<5:
            remaining = [y for y in sample_list if y not in predictions]
            predictions[x] = predictions[x]+remaining
            predictions[x] = predictions[x][:5]
        
    return predictions

In [None]:
## Compute CV
best_th = 0
best_cv = 0
for th in [0.05*x for x in range(21)]:
    all_preds = get_predictions_2(test_df,threshold=th) # switch function
    cv = 0
    for i,row in val_targets_df.iterrows():
        target = row.target
        preds = all_preds[row.image]
        val_targets_df.loc[i,th] = map_per_image(target,preds)
    cv = val_targets_df[th].mean()
    print(f"CV at threshold {th}: {cv}")
    if cv>best_cv:
        best_th = th
        best_cv = cv

In [None]:
print("Best threshold",best_th)
print("Best cv",best_cv)
val_targets_df.describe()

In [None]:
# Adjustment: Since Public lb has nearly 10% 'new_individual' (Be Careful for private LB)
val_targets_df['is_new_individual'] = val_targets_df.target=='new_individual'
print(val_targets_df.is_new_individual.value_counts().to_dict())
val_scores = val_targets_df.groupby('is_new_individual').mean().T
val_scores['adjusted_cv'] = val_scores[True]*0.1+val_scores[False]*0.9
best_threshold_adjusted = val_scores['adjusted_cv'].idxmax()
print("best_threshold",best_threshold_adjusted)
val_scores

## Inference

In [None]:
train_embeddings_2 = np.concatenate([train_embeddings,val_embeddings])
train_targets_2 = np.concatenate([train_targets,val_targets])
print(train_embeddings_2.shape,train_targets_2.shape)

In [None]:
from sklearn.neighbors import NearestNeighbors
neigh = NearestNeighbors(n_neighbors=config.KNN,metric='cosine')
neigh.fit(train_embeddings_2)

In [None]:
test_ids = []
test_nn_distances = []
test_nn_idxs = []
test_embeddings = []
for filename in tqdm(test_files):
    embeddings = get_embeddings(filename)
    ids = get_ids(filename)
    distances,idxs = neigh.kneighbors(embeddings, config.KNN, return_distance=True)
    test_ids.append(ids)
    test_nn_idxs.append(idxs)
    test_nn_distances.append(distances)
    test_embeddings.append(embeddings)
test_nn_distances = np.concatenate(test_nn_distances)
test_nn_idxs = np.concatenate(test_nn_idxs)
test_ids = np.concatenate(test_ids)
test_embeddings = np.concatenate(test_embeddings)

  0%|          | 0/10 [00:00<?, ?it/s]

In [None]:
test_df = []
for i in tqdm(range(len(test_ids))):
    id_ = test_ids[i]
    targets = train_targets_2[test_nn_idxs[i]]
    distances = test_nn_distances[i]
    subset_preds = pd.DataFrame(np.stack([targets,distances],axis=1),columns=['target','distances'])
    subset_preds['image'] = id_
    test_df.append(subset_preds)
test_df = pd.concat(test_df).reset_index(drop=True)
test_df['confidence'] = 1-test_df['distances']
test_df = test_df.groupby(['image','target']).confidence.max().reset_index()
test_df = test_df.sort_values('confidence',ascending=False).reset_index(drop=True)
test_df['target'] = test_df['target'].map(int2str)
test_df.to_csv('test_neighbors.csv')
test_df.image.value_counts().value_counts()

In [None]:
sample_list = ['938b7e931166', '5bf17305f073', '7593d2aee842', '7362d7a01d00','956562ff2888']

In [None]:
predictions = {}
for i,row in tqdm(test_df.iterrows()):
    if row.image in predictions:
        if len(predictions[row.image])==5:
            continue
        predictions[row.image].append(row.target)
    elif row.confidence>best_threshold_adjusted:
        predictions[row.image] = [row.target,'new_individual']
    else:
        predictions[row.image] = ['new_individual',row.target]
        
for x in tqdm(predictions):
    if len(predictions[x])<5:
        remaining = [y for y in sample_list if y not in predictions]
        predictions[x] = predictions[x]+remaining
        predictions[x] = predictions[x][:5]
    predictions[x] = ' '.join(predictions[x])
    
predictions = pd.Series(predictions).reset_index()
predictions.columns = ['image','predictions']
predictions.to_csv('submission.csv',index=False)
predictions.head()

## Merge Predictions for backfin datasets

In [None]:
df2 = pd.read_csv("/content/drive/MyDrive/Kaggle/HappyWhale-2022/ArcFaceEffB6InferBaseline.csv") #prev submission
#whales without backfins determined with a simple classifier
ids = np.load("/content/drive/MyDrive/Kaggle/HappyWhale-2022/ids_without_backfin.npy", allow_pickle = True) 
ids2 = df2["image"][~df2["image"].isin(predictions["image"])] #images without a bounding box
submission = pd.concat([
    predictions[~(predictions["image"].isin(ids))],
    df2[df2["image"].isin(ids)],
    df2[df2["image"].isin(ids2)]
])
submission = submission.drop_duplicates()
submission.to_csv('submission2.csv',index=False)
submission.head()

## Save train & test embeddings

In [None]:
## TO DO 
with open(f'{config.MODEL_NAME}_600fullbody_s20m2_{config.FOLD_TO_RUN}.npy', 'wb') as fh:
    np.save(fh, np.concatenate([train_ids,val_ids]))
    np.save(fh, np.concatenate([train_embeddings,val_embeddings]))
    np.save(fh, np.concatenate([train_targets,val_targets]))
    np.save(fh, test_ids)
    np.save(fh, test_embeddings)

In [None]:
print('train ids: ', train_ids.shape, val_ids.shape)
print('train embeds: ', train_embeddings.shape, val_embeddings.shape)
print('train targets: ', train_targets.shape, val_targets.shape)
print('test ids: ', test_ids.shape)
print('test embeds: ', test_embeddings.shape)

## Logging

In [None]:
df_log = pd.read_pickle('./drive/MyDrive/happywhale_log.pkl')
log = {}
log['model_description'] = ['eff5_25ep_768size_100nn_s20m2_embed512_f1'] 
log['model_head'] = ['arcface']
log['config'] = [config.save_dir+'/config.json']
log['data'] = ['fullbody'] 
log['stratify_level'] = ['individual']
log['test_metric'] = [0.4517] # 
log['val_metric'] = [0.7767] # 
log['public_metric'] = [0.730] # 
log['post_processing'] = ['threshold 0.60']
df_log_new = pd.DataFrame(log)
df_log = pd.concat([df_log, df_log_new], ignore_index=True)
df_log.to_pickle('./drive/MyDrive/happywhale_log.pkl')
df_log