In [None]:
!export CUDA_LAUNCH_BLOCKING=1 # for tracing issues
!pip install timm
!pip install torchinfo
!pip install --upgrade torchmetrics

import numpy as np
import pandas as pd
import gc
import torch
import random
from typing import Callable
from typing import Dict
from typing import Optional
from PIL import Image
from PIL import ImageFile
ImageFile.LOAD_TRUNCATED_IMAGES = True # fixes a weird issue

from sklearn.preprocessing import LabelEncoder
from torch import nn
import torch.nn.functional as F
from torch.utils.data import DataLoader
from torch.utils.data import Dataset
import torchmetrics
from torchinfo import summary
import pytorch_lightning as pl

import timm
from timm.data.transforms_factory import create_transform
from timm.optim import create_optimizer_v2

# Configuration

In [None]:
class CFG:
    """Single place for every tunable value used in the notebook."""

    # --- Reproducibility ---
    SEED = 42

    # --- Dataset ---
    N_FOLDS = 5
    NUM_WORKERS = 6  # worker count intended for the DataLoaders

    # --- Model ---
    MODEL_NAME = "tf_efficientnet_b4"
    PRETRAINED = True
    IMAGE_SIZE = 380
    EMBEDDING_SIZE = 512
    DROPOUT = 0.0

    # --- ArcFace head ---
    S = 10   # scale applied to the logits
    M = 0.1  # angular margin added to the target class

    # --- Training ---
    OPTIMIZER = "adamW"
    MODEL_PATH = "model.ckpt"  # file the model checkpoints are stored in
    # Effective batch size is BATCH_SIZE * ACCUMULATE_GRAD_BATCHES.
    BATCH_SIZE = 32
    ACCUMULATE_GRAD_BATCHES = 1  # 1 => the model is updated after every batch
    NUM_EPOCHS = 30
    LR = 1e-3
    WEIGHT_DECAY = 1e-6

    # --- Debugging ---
    DEBUG = False  # use a smaller set while debugging

In [None]:
# Make a deterministic pipeline
def fix_seed(seed):
    """Seed every random number generator the notebook relies on with ``seed``.

    Covers Python's ``random``, NumPy, and PyTorch (CPU and all CUDA devices),
    switches cuDNN to deterministic mode, and finally calls Lightning's own
    seeding helper.
    """
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed)

    # make the GPU (cuDNN) side deterministic as well
    torch.backends.cudnn.deterministic = True

    pl.seed_everything(seed)

# Seed all RNGs once, up front, with the seed from the configuration class.
fix_seed(CFG.SEED)

# Dataset

In [None]:
# Dataset PATHs

# --- Inputs (read-only Kaggle datasets) ---
BASE_PATH = '../input/happywhale-enhanced-luke'
DATA_PATH = '../input/happy-whale-and-dolphin'
CHECKPOINTS_PATH = '../input/hwmodelcheckpoint/tf_efficientnet_b4_380.ckpt'
TRAIN_CSV_PATH = '../input/hwmodelcheckpoint/train.csv'
TEST_CSV_PATH = '../input/hwmodelcheckpoint/test.csv'

# Image folders derived from the dataset roots above.
TRAIN_DIR = BASE_PATH + "/train_images"
TRAIN_DIR2 = BASE_PATH + "/imgextra (1)"
TEST_DIR = DATA_PATH + "/test_images"

# --- Outputs (everything the notebook writes lives under OUTPUT_DIR) ---
OUTPUT_DIR = '/kaggle/working'
TRAIN_CSV_OUTPUT_PATH = OUTPUT_DIR + "/train.csv"
TEST_CSV_OUTPUT_PATH = OUTPUT_DIR + "/test.csv"
ENCODER_CLASSES_PATH = OUTPUT_DIR + "/encoder_classes.npy"
CHECKPOINTS_DIR = OUTPUT_DIR + "/checkpoints"
SUBMISSION_CSV_PATH = OUTPUT_DIR + "/submission.csv"

In [None]:
# get encoder classes
!cp ../input/hwmodelcheckpoint/encoder_classes.npy ./encoder_classes.npy 

# get train csv
!cp ../input/hwmodelcheckpoint/train.csv ./train.csv

train_df = pd.read_csv(TRAIN_CSV_PATH)

N_CLASSES = len(train_df["individual_id"].unique())

train_df.head()

# get test csv
!cp ../input/hwmodelcheckpoint/test.csv ./test.csv

test_df = pd.read_csv(TEST_CSV_PATH)
test_df.head()

In [None]:
def pil_loader(path):
    """Load the image stored at ``path`` and return it in RGB mode.

    The conversion is needed because some images are grayscale.
    """
    # Go through an explicit file handle so it is closed deterministically and
    # no ResourceWarning is raised
    # (https://github.com/python-pillow/Pillow/issues/835).
    with open(path, 'rb') as handle:
        return Image.open(handle).convert('RGB')

In [None]:
class HappyWhaleDataset(Dataset):
    """
    Map-style dataset backed by a DataFrame.

    The frame must provide the columns ``image`` (file name), ``image_path``
    (location of the file) and ``individual_id`` (value turned into a long
    tensor target). An optional ``transform`` is applied to every loaded image.
    """
    def __init__(self, df: pd.DataFrame, transform: Optional[Callable] = None):
        self.df = df
        self.transform = transform

        # Pull the needed columns out once as arrays for cheap per-item access.
        self.image_names, self.image_paths, self.targets = (
            df[column].values for column in ("image", "image_path", "individual_id")
        )

    def __getitem__(self, index: int) -> Dict[str, torch.Tensor]:
        """Return a dict with the keys ``image_name``, ``image`` and ``target``."""
        image = pil_loader(self.image_paths[index])
        if self.transform:
            image = self.transform(image)

        return {
            "image_name": self.image_names[index],
            "image": image,
            "target": torch.tensor(self.targets[index], dtype=torch.long),
        }

    def __len__(self) -> int:
        """Number of rows in the backing DataFrame."""
        return len(self.df)

In [None]:
class LitDataModule(pl.LightningDataModule):
    """
    Lightning data module serving the training, validation and test splits.

    The training CSV must contain a ``folds`` column; rows whose fold equals
    ``val_fold`` become the validation set, all other rows the training set.
    """
    def __init__(
        self,
        train_csv: str,
        test_csv: str,
        val_fold: float,
        image_size: int,
        batch_size: int,
        num_workers: int,
    ):
        super().__init__()

        # Exposes every constructor argument through self.hparams.
        self.save_hyperparameters()

        self.train_df, self.test_df = pd.read_csv(train_csv), pd.read_csv(test_csv)

        side = self.hparams.image_size
        # Open question from the author: should pre-fetch=True be used since
        # the data goes through a DataLoader?
        # NOTE(review): is_training is not passed for the training transform,
        # so timm's default applies - confirm that is the intended pipeline.
        self.transform_train = create_transform(input_size=(side, side), crop_pct=1.0)
        self.transform_eval = create_transform(
            input_size=(side, side),
            crop_pct=1.0,
            is_training=False,
        )

    def setup(self, stage: Optional[str] = None):
        """Build the datasets for ``stage`` ("fit", "test", or both when None)."""
        if stage in ("fit", None):
            # Hold out the rows of the validation fold, train on the rest.
            held_out = self.hparams.val_fold
            train_rows = self.train_df[self.train_df.folds != held_out]
            val_rows = self.train_df[self.train_df.folds == held_out]

            self.train_dataset = HappyWhaleDataset(
                train_rows.reset_index(drop=True), transform=self.transform_train
            )
            self.val_dataset = HappyWhaleDataset(
                val_rows.reset_index(drop=True), transform=self.transform_eval
            )

        if stage in ("test", None):
            self.test_dataset = HappyWhaleDataset(self.test_df, transform=self.transform_eval)

    def train_dataloader(self) -> DataLoader:
        return self._dataloader(self.train_dataset, train=True)

    def val_dataloader(self) -> DataLoader:
        return self._dataloader(self.val_dataset)

    def test_dataloader(self) -> DataLoader:
        return self._dataloader(self.test_dataset)

    def _dataloader(self, dataset: HappyWhaleDataset, train: bool = False) -> DataLoader:
        """Wrap ``dataset`` in a DataLoader; shuffle and drop the last partial batch only when training."""
        # The same batch size is used for training and evaluation.
        loader_options = dict(
            batch_size=self.hparams.batch_size,
            shuffle=train,
            num_workers=self.hparams.num_workers,
            pin_memory=True,
            drop_last=train,
        )
        return DataLoader(dataset, **loader_options)

In [None]:
class SoftMax(nn.Module):
    """
    Plain softmax head: a bias-free linear layer producing class logits.

    ``label`` and ``device`` are accepted by ``forward`` but not used.
    """
    def __init__(self, num_features: int, num_classes: int):
        super().__init__()

        self.num_features, self.n_classes = num_features, num_classes

        # One weight row per class, Xavier-uniform initialised.
        weight = torch.empty(num_classes, num_features, dtype=torch.float32)
        self.W = nn.Parameter(nn.init.xavier_uniform_(weight))

    def forward(self, input: torch.Tensor, label: torch.Tensor, device: str = "cuda") -> torch.Tensor:
        """Return ``input @ W.T``."""
        return F.linear(input, self.W)

In [None]:
class ArcFace(nn.Module):
    """
    ArcFace head.

    Produces cosine similarities between L2-normalised features and
    L2-normalised class weights. When labels are supplied, the angular margin
    ``m`` is added to the target class angle and the result is scaled by ``s``.
    The ``device`` argument of ``forward`` is accepted but not used.
    """
    def __init__(self, num_features: int, num_classes: int, s: float, m: float):
        super().__init__()

        self.num_features, self.n_classes = num_features, num_classes
        self.s, self.m = s, m

        # One weight row per class, Xavier-uniform initialised.
        weight = torch.empty(num_classes, num_features, dtype=torch.float32)
        self.W = nn.Parameter(nn.init.xavier_uniform_(weight))

    def forward(self, input: torch.Tensor, label: torch.Tensor, device: str = "cuda") -> torch.Tensor:
        """Return raw cosine logits if ``label`` is None, else margin-adjusted logits times ``s``."""
        # cosine similarity = dot product of unit-length features and weights
        cosine = F.linear(F.normalize(input), F.normalize(self.W))

        # Without labels there is no target class to penalise.
        if label is None:
            return cosine

        # Keep the argument of acos strictly inside (-1, 1) before adding the margin.
        angle = torch.acos(cosine.clamp(-1.0 + 1e-7, 1.0 - 1e-7))
        with_margin = torch.cos(angle + self.m)

        # 1 at each sample's target class, 0 elsewhere
        target_mask = torch.zeros_like(cosine).scatter_(1, label.view(-1, 1).long(), 1)

        # margin-adjusted value for the target class, plain cosine for the rest
        blended = cosine * (1 - target_mask) + with_margin * target_mask

        # feature re-scale
        return blended * self.s

In [None]:
class LitModule(pl.LightningModule):
    """
    Lightning module: timm backbone -> linear embedding -> ArcFace head.

    ``forward`` returns the embeddings. The training and validation steps feed
    them, together with the targets, through the ArcFace head and optimise
    cross-entropy. ``predict_step`` returns the top-5 cosine similarities and
    class indices for each image.
    """
    def __init__(
        self,
        model_name: str,
        pretrained: bool,
        drop_rate: float,
        embedding_size: int,
        num_classes: int,
        arc_s: float,
        arc_m: float,
        optimizer: str,
        learning_rate: float,
        weight_decay: float,
        len_train_dl: int,
        epochs:int
    ):
        super().__init__()

        # Exposes every constructor argument through self.hparams.
        self.save_hyperparameters()

        self.model = timm.create_model(model_name, pretrained=pretrained, drop_rate=drop_rate)
        # Read the classifier's input width *before* the classifier is reset.
        self.embedding = nn.Linear(self.model.get_classifier().in_features, embedding_size)
        self.model.reset_classifier(num_classes=0, global_pool="avg")

        # The SoftMax head defined in this notebook is the margin-free alternative.
        self.arc = ArcFace(
            num_features=embedding_size,
            num_classes=num_classes,
            s=arc_s,
            m=arc_m,
        )

        self.loss_fn = F.cross_entropy
        # NOTE(review): recent torchmetrics releases appear to require a
        # `task=` argument for these constructors - confirm against the
        # version that `pip install --upgrade torchmetrics` resolves to.
        self.train_acc = torchmetrics.Accuracy()
        self.train_top_5_acc = torchmetrics.Accuracy(top_k=5)
        self.train_f1 = torchmetrics.F1Score(num_classes=num_classes)
        self.val_acc = torchmetrics.Accuracy()
        self.val_top_5_acc = torchmetrics.Accuracy(top_k=5)
        self.val_f1 = torchmetrics.F1Score(num_classes=num_classes)

    def forward(self, images: torch.Tensor) -> torch.Tensor:
        """Return the embedding of every image in the batch."""
        features = self.model(images)
        embeddings = self.embedding(features)

        return embeddings

    def configure_optimizers(self):
        """Build the timm optimizer and a per-step OneCycleLR schedule."""
        optimizer = create_optimizer_v2(
            self.parameters(),
            opt=self.hparams.optimizer,
            lr=self.hparams.learning_rate,
            weight_decay=self.hparams.weight_decay,
        )

        # The configured learning rate doubles as the peak (max_lr) of the cycle.
        scheduler = torch.optim.lr_scheduler.OneCycleLR(
            optimizer,
            self.hparams.learning_rate,
            steps_per_epoch=self.hparams.len_train_dl,
            epochs=self.hparams.epochs,
        )
        # OneCycleLR has to be stepped after every batch, not every epoch.
        scheduler = {"scheduler": scheduler, "interval": "step"}

        return [optimizer], [scheduler]

    def _shared_step(self, batch: Dict[str, torch.Tensor], stage: str) -> torch.Tensor:
        """Compute the loss for one batch and log loss and metrics under the ``<stage>_`` prefix."""
        images, targets = batch["image"], batch["target"]

        embeddings = self(images)
        outputs = self.arc(embeddings, targets, self.device)
        loss = self.loss_fn(outputs, targets)

        # Report the size of *this* batch instead of a notebook-level constant:
        # the loader may be built with a different batch size, and the last
        # batch may be partial.
        batch_size = images.size(0)

        self.log(f"{stage}_loss", loss, batch_size=batch_size)
        for suffix in ("acc", "top_5_acc", "f1"):
            metric = getattr(self, f"{stage}_{suffix}")
            metric(outputs, targets)
            self.log(f"{stage}_{suffix}", metric, batch_size=batch_size)

        return loss

    def training_step(self, batch: Dict[str, torch.Tensor], batch_idx: int) -> torch.Tensor:
        """Return the training loss; logs train_loss, train_acc, train_top_5_acc and train_f1."""
        return self._shared_step(batch, "train")

    def validation_step(self, batch: Dict[str, torch.Tensor], batch_idx: int) -> torch.Tensor:
        """Return the validation loss; logs val_loss, val_acc, val_top_5_acc and val_f1."""
        return self._shared_step(batch, "val")

    def predict_step(self, batch: Dict[str, torch.Tensor], batch_idx: int):
        """Return ``torch.topk(similarities, 5)`` (fields ``values`` and ``indices``) for the batch."""
        images = batch["image"]

        embeddings = self(images)
        # Without labels the head returns plain cosine similarities
        # (no margin, no scaling).
        pred = self.arc(embeddings, label=None, device=self.device)

        # values and indices of the top-5 predictions
        predtop = torch.topk(pred, 5)

        return predtop

In [None]:
# Release cached GPU memory and collect garbage before loading the model.
torch.cuda.empty_cache()
gc.collect()

# load weights from checkpoint
# No constructor arguments are passed here.
# NOTE(review): presumably they are restored from the hyperparameters saved in
# the checkpoint - confirm.
module = LitModule.load_from_checkpoint(CHECKPOINTS_PATH)

# create dataloaders
# The CSVs are read from the working-directory copies. Inference uses twice
# the configured batch size and two loader workers.
datamodule = LitDataModule(
    train_csv=str(TRAIN_CSV_OUTPUT_PATH),
    test_csv=str(TEST_CSV_OUTPUT_PATH),
    val_fold=2.0,
    image_size=CFG.IMAGE_SIZE,
    batch_size=CFG.BATCH_SIZE*2,
    num_workers=2,
)
# With no stage given, setup() builds the train/val datasets as well as the
# test dataset; only the test dataset is used below.
datamodule.setup()

test_dataloader = datamodule.test_dataloader()

trainer = pl.Trainer(deterministic=True, gpus=1, max_epochs=1)

# make raw predictions
# The next cell reads `.values` and `.indices` from each entry of `predictions`.
predictions = trainer.predict(module, dataloaders=test_dataloader)

In [None]:
# Put the raw predictions in a usable format: each entry of `predictions`
# carries the top-k similarity `values` and class `indices` of one batch.
pred_vals = [batch_topk.values for batch_topk in predictions]
pred_inds = [batch_topk.indices for batch_topk in predictions]

predictions_values = torch.cat(pred_vals, 0)
predictions_indices = torch.cat(pred_inds, 0)

# Convert to numpy; .cpu() is a no-op for tensors that already live on the CPU.
predictions_val_np = predictions_values.cpu().numpy()
predictions_ind_np = predictions_indices.cpu().numpy()

# Label the rank columns with strings ('0', '1', ...). The later cells address
# columns by string label, and this is also what a round trip through the CSVs
# written below produces, so in-memory and reloaded frames behave the same.
rank_columns = [str(rank) for rank in range(predictions_ind_np.shape[1])]
predictions_val_df = pd.DataFrame(predictions_val_np, columns=rank_columns)
predictions_ind_df = pd.DataFrame(predictions_ind_np, columns=rank_columns)

# Save them so the prediction step does not have to be repeated.
predictions_val_df.to_csv('prediction_values.csv')
predictions_ind_df.to_csv('prediction_indices.csv')

# Reload the label encoder from the classes saved during training.
# NOTE: allow_pickle=True can execute arbitrary code - only load trusted files.
encoder = LabelEncoder()
encoder.classes_ = np.load(ENCODER_CLASSES_PATH, allow_pickle=True)

# Map the class indices of every rank column back to individual IDs.
predictions_ids_df = predictions_ind_df.copy()
for column in rank_columns:
    predictions_ids_df[column] = encoder.inverse_transform(predictions_ind_df[column])

# save them
predictions_ids_df.to_csv('prediction_ids.csv')


In [None]:
# predictions_val_df = pd.read_csv('../input/hwmodelcheckpoint/prediction_values.csv', index_col=None)
# predictions_ids_df = pd.read_csv('../input/hwmodelcheckpoint/prediction_ids.csv', index_col=None)
# predictions_val_df = predictions_val_df.drop(['Unnamed: 0'], axis=1)
# predictions_ids_df = predictions_ids_df.drop(['Unnamed: 0'], axis=1)

In [None]:
# Sample submission from the competition data; later cells copy it and
# overwrite its "predictions" column.
sub_df = pd.read_csv("../input/happy-whale-and-dolphin/sample_submission.csv")

In [None]:
# Baseline check: shuffle the predicted rows and put "new_individual" first in
# every row. The recorded score was 0.113, which the author reads as 11.3% of
# the test data being new individuals.
predictions_ids_dfnew = predictions_ids_df.sample(frac=1).reset_index(drop=True)

# Overwrite the first prediction by *position*. A label lookup such as
# .loc[:, '0'] silently appends a new column when the labels are integers.
predictions_ids_dfnew.iloc[:, 0] = "new_individual"

# Submission format: the predictions of a row joined by single spaces.
pred_seriesnew = predictions_ids_dfnew.apply(lambda x: " ".join(x), axis=1)
submission_dfnew = sub_df.copy()
submission_dfnew["predictions"] = pred_seriesnew
submission_dfnew.to_csv("submissionnew.csv", index=False)

In [None]:
# Thresholds that were tried. Recorded scores: 0.284 at 0.3 (best), 0.254 at
# 0.1 and 0.251 at 0.5.
# NOTE(review): those scores were obtained with the previous implementation of
# this cell, which mis-shifted rows when the insertion point was not the first
# column and skipped the row following a hit in the last column - re-score.
th = [0.1, 0.3, 0.5, 0.7]


def insert_new_individual(ids, similarities, min_similarity):
    """Insert "new_individual" before the first prediction below ``min_similarity``.

    Args:
        ids: predicted individual IDs of one image, best first.
        similarities: similarity of each prediction, aligned with ``ids``.
        min_similarity: predictions scoring below this are considered too
            weak to outrank "new_individual".

    Returns:
        A list with the same length as ``ids``. If some similarity is below
        the threshold, "new_individual" is placed at the first such position,
        the remaining predictions are shifted one place to the right and the
        last one is dropped; otherwise ``ids`` is returned unchanged.
    """
    ranked = list(ids)
    for position, similarity in enumerate(similarities):
        if similarity < min_similarity:
            return ranked[:position] + ["new_individual"] + ranked[position:-1]
    return ranked


predictions_ids_df1 = predictions_ids_df.copy()

# A prediction must reach a similarity of at least 1 - threshold.
min_similarity = 1 - th[0]

# Rows are paired by position, so this works whether the columns are labelled
# with integers (fresh predictions) or strings (reloaded from CSV).
new_predictions_ids_df1 = pd.DataFrame(
    [
        insert_new_individual(ids, similarities, min_similarity)
        for ids, similarities in zip(
            predictions_ids_df1.to_numpy(), predictions_val_df.to_numpy()
        )
    ],
    index=predictions_ids_df1.index,
    columns=predictions_ids_df1.columns,
)

# Submission format: the predictions of a row joined by single spaces.
pred_series1 = new_predictions_ids_df1.apply(lambda x: " ".join(x), axis=1)
submission_df1 = sub_df.copy()
submission_df1["predictions"] = pred_series1
submission_df1.to_csv("submission01.csv", index=False)