In [2]:
from src.train import SphereClassifier, WhaleDataModule
from src.dataset import load_df
import torch
from torch.utils.data import DataLoader
import torch.nn.functional as F
from tqdm import tqdm
import numpy as np
from config.config import Config, load_config

# Device handle for the first CUDA GPU. In this notebook it is referenced only
# by the commented-out model-loading and embedding cells.
cuda = torch.device("cuda:0")

## Load model

In [3]:
# Disabled: the notebook reads precomputed embeddings from
# "whale_train_emb.npz" in a later cell instead of running the model.
# Uncomment this cell (and the embedding loop further down) to recompute them.
# model = SphereClassifier.load_from_checkpoint(
#     checkpoint_path="/app/sandbox/happy_whale/kaggle-happywhale-1st-place/result/b6_new/1/last.ckpt"
# )
# model.to(cuda)
# model.eval()


# image = torch.rand(1, 3, 528, 528).to(cuda)
# logits_ids, logits_species = model(image)

## Compute embeddings on the train dataset

In [4]:
# Load the experiment config. Presumably the second file supplies defaults for
# keys missing from the first (the call prints "used default config ..." lines)
# — TODO confirm against load_config.
cfg = load_config("config/efficientnet_b6_new.yaml", "config/default.yaml")

used default config lr_backbone: 0.0016
used default config lr_head: 0.016
used default config lr_decay_scale: 0.01
used default config num_classes: 15587
used default config num_species_classes: 26
used default config pretrained: True
used default config val_bbox: fullbody
used default config test_bboxes: ['fullbody', 'fullbody_charm']
used default config bboxes: {'fullbody_charm': 0.15, 'fullbody': 0.6, 'backfin': 0.15, 'detic': 0.05, 'none': 0.05}
used default config bbox_conf_threshold: 0.01
used default config n_data: -1
used default config global_pool: {'arch': 'GeM', 'p': 3, 'train': False}
used default config normalization: batchnorm
used default config optimizer: AdamW
used default config loss_fn: CrossEntropy
used default config loss_id_ratio: 0.437338
used default config margin_coef_id: 0.27126
used default config margin_coef_species: 0.226253
used default config margin_power_id: -0.364399
used default config margin_power_species: -0.720133
used default config s_id: 20.9588


In [5]:
# Read the training metadata and wrap it in the project's data module.
# NOTE(review): the meaning of the positional `True` and `-1` arguments is not
# visible from this notebook — confirm against load_df / WhaleDataModule.
df = load_df("input", cfg, "train.csv", True)
data_module = WhaleDataModule(
    df,
    cfg,
    "input/train_images",
    cfg.val_bbox,
    -1,
)

detic low conf: 0 / 51033
fullbody low conf: 0 / 51033
fullbody_charm low conf: 10 / 51033
backfin low conf: 1587 / 51033


In [6]:
# Build the dataset from the full dataframe and iterate it in a fixed order:
# shuffle=False keeps batches in dataset order and drop_last=False keeps the
# final, possibly smaller, batch.
train_dataset = data_module.get_dataset(df, False)
train_loader = DataLoader(
    dataset=train_dataset,
    batch_size=8,
    num_workers=32,
    pin_memory=True,
    shuffle=False,
    drop_last=False,
)

In [7]:
# Disabled: one-off pass over train_loader that L2-normalised the model
# features of every batch and concatenated them into `embs`. The call that
# saved them to "whale_train_emb.npz" is commented out as well.
# predictions = []
# model.eval()

# for batch in tqdm(train_loader):
#     images = batch['image'].to(cuda)
#     feats = F.normalize(model.get_feat(images), p=2.0, dim=1)
#     predictions.append(feats.detach().cpu())
# embs = torch.cat(predictions, axis=0).numpy()
# #np.savez(f"whale_train_emb.npz", embs=embs)

## Create Whale Train OSFR protocol

In [8]:
# Bare expression kept as an annotation; it is not the last line of the cell,
# so nothing is displayed.
train_dataset.ids  # per-image identity label (shared by all images of one individual); original note: id \in [0, 15587)
# Precomputed embeddings cached on disk; the array is read back via a["embs"].
a = np.load("whale_train_emb.npz")

In [12]:
# Peek at the image file names stored on the dataset.
train_dataset.x_paths

array(['00021adfb725ed.jpg', '000562241d384d.jpg', '0007c33415ce37.jpg',
       ..., 'fff94675cc1aef.jpg', 'fffbc5dd642d8c.jpg',
       'fffdcd42312777.jpg'], dtype=object)

In [9]:
# Shape of the cached embedding matrix.
a["embs"].shape

(51033, 776)

In [19]:
# Split identities by how many images each one has.
unique_ids, count_ids = np.unique(train_dataset.ids, return_counts=True)

# np.unique reports every count as >= 1, so "not a singleton" is exactly
# "more than one image".
singleton_mask = count_ids == 1
out_of_gallery_ids = unique_ids[singleton_mask]  # single image ids
in_gallery_ids = unique_ids[~singleton_mask]
# 15587 matches the num_classes value printed when the config was loaded.
assert len(in_gallery_ids) + len(out_of_gallery_ids) == 15587

In [23]:
# Count distinct image paths. The path -> id dictionaries built in the next
# cell are keyed by path, so duplicates would silently collapse entries.
np.unique(train_dataset.x_paths).shape

(51033,)

In [24]:
from pathlib import Path

# Construct gallery and probe templates.
#
# Identities with several images are split in dataset order: the first
# `image_count // 2` images form the identity's gallery template and the
# remaining images form its ("known") probe template. Identities with a single
# image get no gallery template and contribute one ("unknown") probe template.
#
# Each entry of the three template lists is a tuple
# (image_paths, template_id, subject_id).
image_path_to_template_id = {}
image_path_to_subject_id = {}

gallery_templates = []
known_probe_templates = []
unknown_probe_templates = []

# Gallery template ids count up from 0 and probe template ids from this offset,
# so the two ranges are disjoint only while the gallery has fewer templates
# than the offset. Check that before any id is handed out.
PROBE_TEMPLATE_ID_OFFSET = 10000
assert (
    len(in_gallery_ids) < PROBE_TEMPLATE_ID_OFFSET
), "gallery template ids would collide with probe template ids"

subject_id = 0
gallery_template_id = 0
probe_template_id = PROBE_TEMPLATE_ID_OFFSET
for subject in in_gallery_ids:
    subject_images_paths = train_dataset.x_paths[train_dataset.ids == subject]
    image_count = len(subject_images_paths)
    gallery_images_paths = subject_images_paths[: image_count // 2]
    probe_images_paths = subject_images_paths[image_count // 2 :]

    # Dictionary keys are the raw x_paths entries, which is what the export
    # cell uses when it looks the ids up again.
    for image_path in gallery_images_paths:
        image_path_to_subject_id[image_path] = subject_id
        image_path_to_template_id[image_path] = gallery_template_id
    for image_path in probe_images_paths:
        image_path_to_subject_id[image_path] = subject_id
        image_path_to_template_id[image_path] = probe_template_id

    gallery_templates.append((gallery_images_paths, gallery_template_id, subject_id))
    known_probe_templates.append((probe_images_paths, probe_template_id, subject_id))
    gallery_template_id += 1
    probe_template_id += 1
    subject_id += 1

# Unknown probes continue the subject-id and probe-template-id sequences, so
# their ids never overlap with those of the known identities.
for probe_subject in out_of_gallery_ids:
    probe_images_paths = train_dataset.x_paths[train_dataset.ids == probe_subject]
    for image_path in probe_images_paths:
        image_path_to_subject_id[image_path] = subject_id
        image_path_to_template_id[image_path] = probe_template_id
    unknown_probe_templates.append((probe_images_paths, probe_template_id, subject_id))
    probe_template_id += 1
    subject_id += 1

In [27]:
# Invariants of the template construction:
#   * every image received a template id and a subject id;
#   * every identity received its own subject id;
#   * template ids are distinct: one probe template per identity plus one
#     gallery template per multi-image identity.
image_total = len(train_dataset.x_paths)
assert image_total == len(image_path_to_template_id)
assert image_total == len(image_path_to_subject_id)

distinct_subject_ids = set(image_path_to_subject_id.values())
distinct_template_ids = set(image_path_to_template_id.values())
assert len(unique_ids) == len(distinct_subject_ids)
assert len(in_gallery_ids) + len(unique_ids) == len(distinct_template_ids)

In [28]:
# Template counts: gallery, known probes, unknown probes.
len(gallery_templates), len(known_probe_templates), len(unknown_probe_templates)

(6329, 6329, 9258)

In [38]:
# Fraction of probe templates that belong to identities without a gallery
# template.
len(unknown_probe_templates) / (
    len(known_probe_templates) + len(unknown_probe_templates)
)

0.5939565022133829

In [40]:
# Total number of probe templates (known + unknown).
len(known_probe_templates) + len(unknown_probe_templates)

15587

In [29]:
import pandas as pd

def _flatten_templates(templates):
    """Expand (image_paths, template_id, subject_id) tuples into per-image columns.

    Returns three equally long lists, in template order: the template id, the
    subject id and the file name (the part after the last "/") of every image.
    """
    template_ids, subject_ids, file_names = [], [], []
    for image_paths, template_id, subject_id in templates:
        template_ids.extend([template_id] * len(image_paths))
        subject_ids.extend([subject_id] * len(image_paths))
        file_names.extend(path.split("/")[-1] for path in image_paths)
    return template_ids, subject_ids, file_names


ds_name = "whale"

# Create the meta files under <identification_ds_path>/meta.
identification_ds_path = Path("/app/datasets/whale_train")
meta_path = identification_ds_path / "meta"
# parents=True creates identification_ds_path and any missing ancestors too,
# so the cell also works when the dataset root does not exist yet.
meta_path.mkdir(parents=True, exist_ok=True)

img_names = train_dataset.x_paths
# names = [x.split("/")[-1] for x in img_names]
names = img_names
mids = np.arange(len(img_names))  # running index of the image in the dataset
tids = [image_path_to_template_id[image_path] for image_path in img_names]
sids = [image_path_to_subject_id[image_path] for image_path in img_names]

# One line per image: "<name> <template id> <mid> <subject id>".
out_file_tid_mid = meta_path / f"{ds_name}_face_tid_mid.txt"
with open(out_file_tid_mid, "w") as fd:
    for name, tid, mid, sid in zip(names, tids, mids, sids):
        fd.write(f"{name} {tid} {mid} {sid}\n")

out_file_probe = meta_path / f"{ds_name}_1N_probe_mixed.csv"
out_file_gallery = meta_path / f"{ds_name}_1N_gallery_G1.csv"

# Probe file lists the known probes first, followed by the unknown ones.
tids_probe, sids_probe, names_probe = _flatten_templates(
    known_probe_templates + unknown_probe_templates
)
tids_gallery, sids_gallery, names_gallery = _flatten_templates(gallery_templates)

# Every image must land in exactly one of the two files.
assert len(tids_gallery) + len(tids_probe) == len(img_names)
probe = pd.DataFrame(
    {
        "TEMPLATE_ID": tids_probe,
        "SUBJECT_ID": sids_probe,
        "FILENAME": names_probe,
    }
)
gallery = pd.DataFrame(
    {
        "TEMPLATE_ID": tids_gallery,
        "SUBJECT_ID": sids_gallery,
        "FILENAME": names_gallery,
    }
)

probe.to_csv(out_file_probe, sep=",", index=False)
gallery.to_csv(out_file_gallery, sep=",", index=False)

In [37]:
# Write the cached arrays back out next to the meta files, adding an `unc`
# array that holds the constant 30 once per embedding row.
# NOTE(review): the meaning of the value 30 is not visible here — confirm.
emb_dir = identification_ds_path / "embeddings"
emb_dir.mkdir(exist_ok=True)
n_embs = a["embs"].shape[0]
unc = np.ones((n_embs, 1)) * 30
np.savez(emb_dir / "b6_embs_whale.npz", **a, unc=unc)

In [36]:
# Shape check for the array stored as `unc` alongside the embeddings.
np.ones((a["embs"].shape[0], 1)).shape

(51033, 1)