# <b><span style='color:#F1A424'>|</span> Import Libraries</b><a class='anchor' id='import_libraries'></a> [↑](#top)

---

Import all the required libraries for this notebook.


In [1]:
import albumentations as A
import gc
import matplotlib.pyplot as plt
import math
import multiprocessing
import numpy as np
import os
import pandas as pd
import random
import time
import timm
import torch
import torch.nn as nn


from albumentations.pytorch import ToTensorV2
from glob import glob
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm
from typing import Dict, List

# Restrict the process to GPUs 0 and 1. NOTE(review): this is set after
# `import torch`; it presumably only takes effect because CUDA has not been
# initialised yet at this point — confirm if the import order changes.
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1"
# Use the first visible GPU when CUDA is available, otherwise run on the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print("Using", torch.cuda.device_count(), "GPU(s)")



Using 1 GPU(s)


In [2]:
from kaggle_secrets import (
    UserSecretsClient,
)  # see https://www.kaggle.com/discussions/product-feedback/114053 for more info
import sys


# Fetch the secret stored under the label "git-pat"; it is interpolated into the
# `git clone` URL in the next cell, so it must never be printed or logged.
user_secrets = UserSecretsClient()
personal_token = user_secrets.get_secret("git-pat")

In [3]:
# Alternative (default branch):
# !git clone https://{personal_token}@github.com/JulianRodd/MLiP_group_10_task1_HMS.git
# Clone the selected branch of the project repository using the secret token.
# NOTE(review): the token is part of the clone URL, so it is presumably persisted
# as the remote URL inside the cloned repository — do not publish /kaggle/working.
branch = "preprocessing"
!git clone -b {branch} https://{personal_token}@github.com/JulianRodd/MLiP_group_10_task1_HMS.git # for branch

# Work from inside the clone and make its packages (e.g. `utils`) importable.
os.chdir("/kaggle/working/MLiP_group_10_task1_HMS")
sys.path.insert(1, "/kaggle/working/MLiP_group_10_task1_HMS") # pos 1 to avoid conflicts

Cloning into 'MLiP_group_10_task1_HMS'...
remote: Enumerating objects: 780, done.[K
remote: Counting objects: 100% (195/195), done.[K
remote: Compressing objects: 100% (144/144), done.[K
remote: Total 780 (delta 94), reused 130 (delta 50), pack-reused 585[K
Receiving objects: 100% (780/780), 8.21 MiB | 26.19 MiB/s, done.
Resolving deltas: 100% (477/477), done.


In [4]:
from utils.data_preprocessing_utils import filter_by_agreement, filter_by_annotators

In [5]:
from torch.utils.tensorboard import SummaryWriter
from sklearn.metrics import mean_squared_error


# Create the TensorBoard log root; `exist_ok=True` makes re-running the cell safe
# and avoids the check-then-create race of `exists()` + `mkdir()`.
os.makedirs("tensorboard", exist_ok=True)
# NOTE(review): the run directory is named "shufflenet small" while config.MODEL
# selects "tf_efficientnet_b0" — confirm the intended run name.
WRITER = SummaryWriter(log_dir=os.path.join("tensorboard", "shufflenet small"))

# <b><span style='color:#F1A424'>|</span> Configuration</b><a class='anchor' id='configuration'></a> [↑](#top)

---


In [6]:
class config:
    """Hyper-parameters and switches read by the dataset, model and training cells."""

    # Enables the GradScaler and autocast context in `train_epoch`.
    AMP = True
    # Batch sizes of the training and validation DataLoaders.
    BATCH_SIZE_TRAIN = 32
    BATCH_SIZE_VALID = 32
    # Number of epochs per fold; also sizes the OneCycleLR schedule.
    EPOCHS = 10
    # Number of GroupKFold splits.
    FOLDS = 5
    # When True, the first NUM_FROZEN_LAYERS named parameters of the backbone
    # get `requires_grad = False` in `CustomModel`.
    FREEZE = False
    # The optimizer steps once every this many batches.
    GRADIENT_ACCUMULATION_STEPS = 1
    # Threshold passed to `clip_grad_norm_`.
    MAX_GRAD_NORM = 1e7
    # Backbone name: names starting with "tf_" are created through timm, any
    # other name is loaded through torch.hub from pytorch/vision.
    MODEL = "tf_efficientnet_b0"  # "shufflenet_v2_x1_0"
    NUM_FROZEN_LAYERS = 39
    # Worker processes used by every DataLoader.
    NUM_WORKERS = 0  # multiprocessing.cpu_count()
    # Progress is printed every PRINT_FREQ steps (and on the last step).
    PRINT_FREQ = 20
    # Seed handed to `seed_everything`.
    SEED = 20
    # True -> train once on all rows; False -> run the cross-validation loop.
    TRAIN_FULL_DATA = False
    # Toggles the example plots (spectrogram and DataLoader previews).
    VISUALIZE = True
    # Weight decay of the AdamW optimizer.
    WEIGHT_DECAY = 0.01
    # True -> classifier head with an extra 256-unit hidden layer.
    LARGE_CLASSIFIER = False

    # Late filter: applied per fold inside `train_loop` on the `n_annot` column.
    n_annot_late = False
    # Early filter: applied to the raw CSV rows through `filter_by_annotators`.
    n_annot_early = True  # this is before splitting train and val! --> will do full set
    # Bounds on the annotation count; the late filter keeps min <= n_annot < max.
    # NOTE(review): bound semantics of the early filter live in
    # `filter_by_annotators` (not visible here) — confirm they match.
    train_n_annot_min = 7
    train_n_annot_max = np.inf
    val_n_annot_min = 0
    val_n_annot_max = np.inf


class paths:
    """File-system locations used by the notebook (Kaggle input/working directories)."""

    # Destination of log files, checkpoints and the out-of-fold CSV.
    OUTPUT_DIR = "/kaggle/working/"
    # Pickled dict of EEG spectrograms, loaded when READ_EEG_SPEC_FILES is False.
    PRE_LOADED_EEGS = (
        "/kaggle/input/final-preprocessed-data/eeg_specs_normalized_final.npy"
    )
    # Pickled dict of Kaggle spectrograms, loaded when READ_SPEC_FILES is False.
    PRE_LOADED_SPECTOGRAMS = (
        "/kaggle/input/final-preprocessed-data/kaggle_specs_normalized_final.npy"
    )
    # Competition metadata with one row per labelled window.
    TRAIN_CSV = "/kaggle/input/hms-harmful-brain-activity-classification/train.csv"
    # Directory globbed for per-EEG "*.npy" spectrogram files.
    TRAIN_EEGS = "/kaggle/input/brain-eeg-spectrograms/EEG_Spectrograms/"
    # Directory globbed for the competition's "*.parquet" spectrogram files.
    TRAIN_SPECTOGRAMS = (
        "/kaggle/input/hms-harmful-brain-activity-classification/train_spectrograms/"
    )

# <b><span style='color:#F1A424'>|</span> Utils</b><a class='anchor' id='utils'></a> [↑](#top)

---

Utility functions.


In [7]:
class AverageMeter(object):
    """Running weighted mean: tracks the latest value, the weighted sum, the
    total weight and their ratio."""

    def __init__(self):
        self.reset()

    def reset(self):
        """Set every tracked statistic back to zero."""
        self.val = self.avg = self.sum = self.count = 0

    def update(self, val, n=1):
        """Record ``val`` as observed ``n`` times and refresh the running mean."""
        self.val = val
        self.count += n
        self.sum += val * n
        self.avg = self.sum / self.count


def asMinutes(s: float):
    """Format a duration in seconds as ``"<minutes>m <seconds>s"`` (both truncated to integers)."""
    whole_minutes = math.floor(s / 60)
    leftover_seconds = s - whole_minutes * 60
    return "%dm %ds" % (whole_minutes, leftover_seconds)


def timeSince(since: float, percent: float):
    """Return ``"<elapsed> (remain <remaining>)"`` for a task started at ``since``.

    :param since: start timestamp as returned by ``time.time()``.
    :param percent: completed fraction of the task (must be non-zero); the total
        duration is extrapolated linearly from it.
    """
    elapsed = time.time() - since
    projected_total = elapsed / percent
    return "%s (remain %s)" % (
        asMinutes(elapsed),
        asMinutes(projected_total - elapsed),
    )


def get_logger(filename=paths.OUTPUT_DIR):
    """Return this module's logger, writing bare messages to the console and a file.

    The file handler writes to ``f"{filename}.log"``; with the default argument
    ("/kaggle/working/") that is the hidden file "/kaggle/working/.log".

    Handlers left over from an earlier call are removed first: the logger is
    looked up by name, so re-running the notebook cell would otherwise attach a
    second pair of handlers and emit every message twice.

    :param filename: path prefix of the log file (".log" is appended).
    :return: the configured ``logging.Logger``.
    """
    from logging import getLogger, INFO, StreamHandler, FileHandler, Formatter

    logger = getLogger(__name__)
    logger.setLevel(INFO)

    # Drop (and close) handlers installed by a previous call.
    for stale_handler in list(logger.handlers):
        logger.removeHandler(stale_handler)
        stale_handler.close()

    message_only = Formatter("%(message)s")
    for handler in (StreamHandler(), FileHandler(filename=f"{filename}.log")):
        handler.setFormatter(message_only)
        logger.addHandler(handler)
    return logger


def plot_spectrogram(spectrogram_path: str):
    """
    Source: https://www.kaggle.com/code/mvvppp/hms-eda-and-domain-journey
    Visualize spectrogram recordings from a parquet file.

    The columns are split into four panels by their name prefix (LL, RL, RP, LP)
    and the natural log of each panel is drawn on a 2x2 grid.

    :param spectrogram_path: path to the spectrogram parquet.
    """
    sample_spect = pd.read_parquet(spectrogram_path)

    split_spect = {
        prefix: sample_spect.filter(regex=f"^{prefix}", axis=1)
        for prefix in ("LL", "RL", "RP", "LP")
    }

    fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(15, 12))
    label_interval = 5  # label every 5th column on the y axis
    for ax, (split_name, region) in zip(axes.flatten(), split_spect.items()):
        img = ax.imshow(
            np.log(region).T,
            cmap="viridis",
            aspect="auto",
            origin="lower",
        )
        cbar = fig.colorbar(img, ax=ax)
        cbar.set_label("Log(Value)")
        ax.set_title(split_name)
        ax.set_ylabel("Frequency (Hz)")
        ax.set_xlabel("Time")

        # Tick labels are the column names without their first three characters
        # (presumably the "LL_"-style prefix). Ticks are set once, directly at
        # the final spacing, instead of labelling every column and overwriting.
        frequencies = [column_name[3:] for column_name in region.columns]
        ax.set_yticks(np.arange(0, len(frequencies), label_interval))
        ax.set_yticklabels(frequencies[::label_interval])
    plt.tight_layout()
    plt.show()


def seed_everything(seed: int):
    """Seed the global RNGs of ``random``, NumPy and PyTorch and export PYTHONHASHSEED.

    :param seed: integer seed shared by all generators.
    """
    os.environ["PYTHONHASHSEED"] = str(seed)
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(seed)


def sep():
    """Print a horizontal rule made of 100 dashes."""
    rule = "-" * 100
    print(rule)


# Prediction column names: each vote column name with a "_pred" suffix.
target_preds = [
    f"{vote_column}_pred"
    for vote_column in (
        "seizure_vote",
        "lpd_vote",
        "gpd_vote",
        "lrda_vote",
        "grda_vote",
        "other_vote",
    )
]
# Class name <-> class index lookups (indices follow the order listed here).
label_to_num = dict(zip(["Seizure", "LPD", "GPD", "LRDA", "GRDA", "Other"], range(6)))
num_to_label = {number: label for label, number in label_to_num.items()}
# Notebook-wide logger and RNG seeding.
LOGGER = get_logger()
seed_everything(config.SEED)

# <b><span style='color:#F1A424'>|</span> Load Data</b><a class='anchor' id='load_data'></a> [↑](#top)

---

Load the competition's data.


In [8]:
df = pd.read_csv(paths.TRAIN_CSV)
# The six vote columns are the last six columns of train.csv.
label_cols = df.columns[-6:]
print(f"Train dataframe shape is: {df.shape}")
print(f"Labels: {list(label_cols)}")

# Early annotator filter: applied to the raw rows, i.e. before the per-eeg_id
# aggregation and before the train/validation split.
if config.n_annot_early:
    df = filter_by_annotators(df, config.train_n_annot_min, config.train_n_annot_max)

# Optional agreement filter (disabled).
filter_by_agree = False
if filter_by_agree:
    df = filter_by_agreement(df, 0)
print(f"Train dataframe shape after filtering is: {df.shape}")

df.head()

Train cataframe shape is: (106800, 15)
Labels: ['seizure_vote', 'lpd_vote', 'gpd_vote', 'lrda_vote', 'grda_vote', 'other_vote']
Train cataframe shape is: (39949, 15)


Unnamed: 0,eeg_id,eeg_sub_id,eeg_label_offset_seconds,spectrogram_id,spectrogram_sub_id,spectrogram_label_offset_seconds,label_id,patient_id,expert_consensus,seizure_vote,lpd_vote,gpd_vote,lrda_vote,grda_vote,other_vote
9,2277392603,0,0.0,924234,0,0.0,1978807404,30539,GPD,0,0,5,0,1,5
10,2277392603,1,2.0,924234,1,2.0,134339127,30539,GPD,0,0,5,0,1,5
11,722738444,0,0.0,999431,0,0.0,557980729,56885,LRDA,0,1,0,14,0,1
12,722738444,1,2.0,999431,1,2.0,1949834128,56885,LRDA,0,1,0,14,0,1
13,722738444,2,4.0,999431,2,4.0,3790867376,56885,LRDA,0,1,0,14,0,1


## Non-Overlapping EEG specs


In [9]:
# Collapse the labelled windows into one row per eeg_id.
per_eeg = df.groupby("eeg_id")

# First spectrogram id, smallest/largest spectrogram label offset, and patient.
# (The "spectogram_id" spelling is relied upon by the dataset class.)
train_df = per_eeg.agg(
    spectogram_id=("spectrogram_id", "first"),
    min=("spectrogram_label_offset_seconds", "min"),
    max=("spectrogram_label_offset_seconds", "max"),
    patient_id=("patient_id", "first"),
)

# Votes summed over every row of an eeg_id, then normalised to a distribution.
# NOTE(review): `n_annot` is therefore the vote total over all rows of the
# eeg_id, not the number of annotators of a single window — confirm that this
# is what the n_annot_* thresholds expect.
vote_totals = per_eeg[label_cols].sum()
n_votes = vote_totals.sum(axis=1)
train_df = train_df.join(vote_totals.div(n_votes, axis=0))
train_df["n_annot"] = n_votes

# Hard label: the expert consensus of the first row of each eeg_id.
train_df["target"] = per_eeg["expert_consensus"].first()

train_df = train_df.reset_index()
print("Train non-overlapp eeg_id shape:", train_df.shape)
train_df.head()

Train non-overlapp eeg_id shape: (5939, 13)


Unnamed: 0,eeg_id,spectogram_id,min,max,patient_id,seizure_vote,lpd_vote,gpd_vote,lrda_vote,grda_vote,other_vote,n_annot,target
0,568657,789577333,0.0,16.0,20654,0.0,0.0,0.25,0.0,0.166667,0.583333,48,Other
1,582999,1552638400,0.0,38.0,20230,0.0,0.857143,0.0,0.071429,0.0,0.071429,154,LPD
2,1895581,128369999,1138.0,1138.0,47999,0.076923,0.0,0.0,0.0,0.076923,0.846154,13,Other
3,2482631,978166025,1902.0,1944.0,20606,0.0,0.0,0.133333,0.066667,0.133333,0.666667,105,Other
4,2521897,673742515,0.0,4.0,62117,0.0,0.0,0.083333,0.083333,0.333333,0.5,24,Other


### <b><span style='color:#F1A424'>Read Train Spectrograms</span></b>


In [None]:
%%time
READ_SPEC_FILES = False

paths_spectograms = glob(paths.TRAIN_SPECTOGRAMS + "*.parquet")
print(f'There are {len(paths_spectograms)} spectrogram parquets')

if READ_SPEC_FILES:    
    all_spectrograms = {}
    for file_path in tqdm(paths_spectograms):
        aux = pd.read_parquet(file_path)
        name = int(file_path.split("/")[-1].split('.')[0])
        all_spectrograms[name] = aux.iloc[:,1:].values
        del aux
else:
    all_spectrograms = np.load(paths.PRE_LOADED_SPECTOGRAMS, allow_pickle=True).item()
    
if config.VISUALIZE:
    idx = np.random.randint(0,len(paths_spectograms))
    spectrogram_path = paths_spectograms[idx]
    plot_spectrogram(spectrogram_path)

There are 11138 spectrogram parquets


### <b><span style='color:#F1A424'>Read EEG Spectrograms</span></b>

The resulting `all_eegs` dictionary contains `eeg_id` as keys (`int` keys) and the values are the eeg sequences (as 3-dimensional `np.array`) of shape `(128, 256, 4)`.


In [None]:
%%time
READ_EEG_SPEC_FILES = False

paths_eegs = glob(paths.TRAIN_EEGS + "*.npy")
print(f'There are {len(paths_eegs)} EEG spectograms')

if READ_EEG_SPEC_FILES:
    all_eegs = {}
    for file_path in tqdm(paths_eegs):
        eeg_id = file_path.split("/")[-1].split(".")[0]
        eeg_spectogram = np.load(file_path)
        all_eegs[eeg_id] = eeg_spectogram
else:
    all_eegs = np.load(paths.PRE_LOADED_EEGS, allow_pickle=True).item()

# <b><span style='color:#F1A424'>|</span> Validation</b><a class='anchor' id='validation'></a> [↑](#top)

---

We train using `GroupKFold` on `patient_id`.


In [None]:
from sklearn.model_selection import KFold, GroupKFold


# Patient-grouped folds: all rows of a patient end up in the same fold.
gkf = GroupKFold(n_splits=config.FOLDS)
fold_splits = gkf.split(train_df, train_df.target, train_df.patient_id)
for fold, (train_index, valid_index) in enumerate(fold_splits):
    # Tag each row with the fold in which it is held out for validation.
    train_df.loc[valid_index, "fold"] = int(fold)

# Fold sizes, a separator line, then a preview of the frame.
display(train_df.groupby("fold").size())
sep()
display(train_df.head())

# <b><span style='color:#F1A424'>|</span> Dataset</b><a class='anchor' id='dataset'></a> [↑](#top)

---

Create a custom `Dataset` to load data.

Our dataloader outputs both the Kaggle spectrograms and the EEG spectrograms as a single 8-channel image of size `(128, 256, 8)`.

[1]: https://www.kaggle.com/code/cdeotte/efficientnetb0-starter-lb-0-43/comments#2617811


In [None]:
class CustomDataset(Dataset):
    """Per-row dataset yielding an input of shape (128, 256, 8) and a 6-value target.

    Channels 0-3 hold the four 100-column regions of the Kaggle spectrogram
    (log-transformed, standardised, cropped and zero-padded); channels 4-7 hold
    the EEG spectrogram stored for the row's ``eeg_id``.
    """

    def __init__(
        self,
        df: pd.DataFrame,
        config,
        augment: bool = False,
        mode: str = "train",
        specs: Dict[int, np.ndarray] = all_spectrograms,
        eeg_specs: Dict[int, np.ndarray] = all_eegs,
    ):
        """
        :param df: one row per sample; needs ``spectogram_id``, ``eeg_id`` and,
            unless ``mode == "test"``, ``min``, ``max`` and the label columns.
        :param config: configuration object (``BATCH_SIZE_TRAIN`` is read).
        :param augment: apply the random horizontal flip when True.
        :param mode: ``"test"`` reads from offset 0 and returns a zero target.
        :param specs: spectrogram arrays keyed by spectrogram id.
        :param eeg_specs: EEG spectrogram arrays keyed by eeg id.
        """
        self.df = df
        self.config = config
        self.batch_size = self.config.BATCH_SIZE_TRAIN
        self.augment = augment
        self.mode = mode
        # Use the injected dictionary; the `specs` argument used to be ignored
        # in favour of the global `all_spectrograms`.
        self.spectograms = specs
        self.eeg_spectograms = eeg_specs

    def __len__(self):
        """
        Number of samples (rows of ``df``).
        """
        return len(self.df)

    def __getitem__(self, index):
        """
        Return sample ``index`` as a pair of float32 tensors ``(X, y)``.
        """
        X, y = self.__data_generation(index)
        if self.augment:
            X = self.__transform(X)
        return torch.tensor(X, dtype=torch.float32), torch.tensor(
            y, dtype=torch.float32
        )

    def __data_generation(self, index):
        """
        Build the (128, 256, 8) input array and the 6-value target for one row.
        """
        X = np.zeros((128, 256, 8), dtype="float32")
        y = np.zeros(6, dtype="float32")
        row = self.df.iloc[index]
        if self.mode == "test":
            r = 0
        else:
            # Start row of the 300-row window. NOTE(review): presumably the mean
            # label offset in seconds converted to 2-second spectrogram rows —
            # confirm against the data description.
            r = int((row["min"] + row["max"]) // 4)

        spectrogram = self.spectograms[row.spectogram_id]
        for region in range(4):
            # 300 time rows x 100 columns of this region, transposed to (100, 300).
            img = spectrogram[r : r + 300, region * 100 : (region + 1) * 100].T

            # Log transform spectogram
            img = np.clip(img, np.exp(-4), np.exp(8))
            img = np.log(img)

            # Standarize per image
            ep = 1e-6
            mu = np.nanmean(img.flatten())
            std = np.nanstd(img.flatten())
            img = (img - mu) / (std + ep)
            img = np.nan_to_num(img, nan=0.0)
            # Crop 300 -> 256 columns and centre the 100 rows inside the 128-row plane.
            X[14:-14, :, region] = img[:, 22:-22] / 2.0

        # The EEG planes and the target do not depend on the region, so they are
        # filled once here instead of on every loop iteration.
        X[:, :, 4:] = self.eeg_spectograms[row.eeg_id]
        if self.mode != "test":
            y = row[label_cols].values.astype(np.float32)

        return X, y

    def __transform(self, img):
        """Apply the augmentation pipeline (a horizontal flip with p=0.5)."""
        transforms = A.Compose(
            [
                A.HorizontalFlip(p=0.5),
            ]
        )
        return transforms(image=img)["image"]

# <b><span style='color:#F1A424'>|</span> DataLoader</b><a class='anchor' id='dataloader'></a> [↑](#top)

---

Below we display example dataloader spectrogram images.


In [None]:
# Demo dataset/loader over every row of train_df, without augmentation.
train_dataset = CustomDataset(train_df, config, mode="train")
# shuffle=False keeps the batches in train_df row order; drop_last=True discards
# a trailing incomplete batch.
train_loader = DataLoader(
    train_dataset,
    batch_size=config.BATCH_SIZE_TRAIN,
    shuffle=False,
    num_workers=config.NUM_WORKERS,
    pin_memory=True,
    drop_last=True,
)
# Sanity check on a single sample.
X, y = train_dataset[0]
print(f"X shape: {X.shape}")
print(f"y shape: {y.shape}")

### <b><span style='color:#F1A424'> Visualize DataLoader</span></b>


In [None]:
# Show channel 0 of the first ROWS * COLS samples of the first batch.
if config.VISUALIZE:
    ROWS = 2
    COLS = 3
    for X, y in train_loader:
        plt.figure(figsize=(20, 8))
        for row in range(ROWS):
            for col in range(COLS):
                sample = row * COLS + col  # position inside the batch
                plt.subplot(ROWS, COLS, sample + 1)
                t = y[sample]
                img = X[sample, :, :, 0]
                # Min-max scale to [0, 1] for display.
                mn = img.flatten().min()
                mx = img.flatten().max()
                img = (img - mn) / (mx - mn)
                plt.imshow(img)
                tars = f"[{t[0]:0.2f}"
                for s in t[1:]:
                    tars += f", {s:0.2f}"
                # train_loader is not shuffled and only its first batch is drawn,
                # so the batch position is also the row position in train_df.
                # (The previous index added row * BATCH_SIZE_TRAIN and labelled
                # the second row of plots with the wrong eeg_id.)
                eeg = train_df.eeg_id.values[sample]
                plt.title(f"EEG = {eeg}\nTarget = {tars}", size=12)
                plt.yticks([])
                plt.ylabel("Frequencies (Hz)", size=14)
                plt.xlabel("Time (sec)", size=16)
        plt.show()
        break

# <b><span style='color:#F1A424'>|</span> Model</b><a class='anchor' id='model'></a> [↑](#top)

---

We will be using the [timm](https://github.com/huggingface/pytorch-image-models) library for our models.

Our model receives both the Kaggle spectrograms and the EEG spectrograms from our data loader. We then reshape these 8 spectrograms into 1 large flat image and feed it into EfficientNet.


In [None]:
class CustomModel(nn.Module):
    """Image backbone plus a small classification head over the tiled spectrograms."""

    def __init__(self, config, num_classes: int = 6, pretrained: bool = True):
        """
        :param config: configuration object; ``MODEL``, ``FREEZE``,
            ``NUM_FROZEN_LAYERS`` and (optionally) ``LARGE_CLASSIFIER`` are read.
        :param num_classes: size of the output layer.
        :param pretrained: load pretrained backbone weights (previously this
            argument was ignored and weights were always loaded).
        """
        super(CustomModel, self).__init__()
        self.config = config
        self.num_classes = num_classes
        self.USE_KAGGLE_SPECTROGRAMS = True
        self.USE_EEG_SPECTROGRAMS = True
        # "tf_*" names are created through timm, everything else via torch.hub.
        self.model = (
            timm.create_model(
                config.MODEL,
                pretrained=pretrained,
                drop_rate=0.1,
                drop_path_rate=0.2,
            )
            if config.MODEL.startswith("tf_")
            else torch.hub.load(
                "pytorch/vision:v0.10.0", config.MODEL, pretrained=pretrained
            )
        )

        if config.FREEZE:
            # Freeze the first NUM_FROZEN_LAYERS named parameters of the backbone.
            for _, param in list(self.model.named_parameters())[
                0 : config.NUM_FROZEN_LAYERS
            ]:
                param.requires_grad = False

        self.features = self.set_feature_layers()
        self.custom_layers = self.set_custom_layers()

    def set_custom_layers(self):
        """Build the pooling + linear head matching the backbone's feature width.

        :raises NotImplementedError: for an unsupported ``config.MODEL`` prefix.
        """
        # this should probs become a dict once we know which sizes we are going to use
        if self.config.MODEL.startswith("tf_"):
            num_features = self.model.num_features
        elif self.config.MODEL.startswith("shufflenet"):
            num_features = 1024  # need to make this better
        elif self.config.MODEL.startswith("resnet"):
            num_features = 2048
        else:
            raise NotImplementedError("Model not implemented - check model name.")

        if getattr(
            self.config, "LARGE_CLASSIFIER", False
        ):  # not all will have attribute so to not break it return False if attr does not exist
            return nn.Sequential(
                nn.AdaptiveAvgPool2d(1),
                nn.Flatten(),
                nn.Linear(num_features, 256),
                nn.BatchNorm1d(
                    256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True
                ),
                nn.ReLU(inplace=True),
                nn.Linear(256, self.num_classes),
            )

        else:
            return nn.Sequential(
                nn.AdaptiveAvgPool2d(1),
                nn.Flatten(),
                nn.Linear(num_features, self.num_classes),
            )

    def set_feature_layers(self):
        """Return the backbone without its trailing head modules.

        The last two children are dropped for "tf_"/"resnet" models and the last
        child for "shufflenet" models.

        :raises NotImplementedError: for an unsupported ``config.MODEL`` prefix
            (previously ``None`` was returned silently).
        """
        if self.config.MODEL.startswith("tf_") or self.config.MODEL.startswith(
            "resnet"
        ):
            return nn.Sequential(*list(self.model.children())[:-2])

        elif self.config.MODEL.startswith("shufflenet"):
            return nn.Sequential(*list(self.model.children())[:-1])

        raise NotImplementedError("Model not implemented - check model name.")

    def __reshape_input(self, x):
        """
        Tile the 8 planes of a (B, 128, 256, 8) batch into a (B, 3, 512, 512) image.

        Planes 0-3 are stacked vertically into the left half, planes 4-7 into the
        right half; the single-channel result is repeated on 3 channels.
        """
        # === Get spectograms ===
        spectograms = [x[:, :, :, i : i + 1] for i in range(4)]
        spectograms = torch.cat(spectograms, dim=1)

        # === Get EEG spectograms ===
        eegs = [x[:, :, :, i : i + 1] for i in range(4, 8)]
        eegs = torch.cat(eegs, dim=1)

        # === Reshape (512,512,3) ===
        if self.USE_KAGGLE_SPECTROGRAMS and self.USE_EEG_SPECTROGRAMS:
            x = torch.cat([spectograms, eegs], dim=2)
        elif self.USE_EEG_SPECTROGRAMS:
            x = eegs
        else:
            x = spectograms

        # Repeat the plane on 3 channels, then move channels first.
        x = torch.cat([x, x, x], dim=3)
        x = x.permute(0, 3, 1, 2)
        return x

    def forward(self, x):
        """Map a (B, 128, 256, 8) batch to (B, num_classes) logits."""
        x = self.__reshape_input(x)
        x = self.features(x)
        x = self.custom_layers(x)
        return x

# <b><span style='color:#F1A424'>|</span> Scheduler</b><a class='anchor' id='scheduler'></a> [↑](#top)

---

We train our model with a one-cycle learning-rate schedule (`OneCycleLR` with cosine annealing): the learning rate warms up to a peak of 1e-3 during the first part of training and is then annealed towards a much smaller value by the end of the last epoch. The cell below plots the learning rate at every optimizer step for the configured number of epochs.


In [None]:
from torch.optim.lr_scheduler import OneCycleLR

# Preview of the learning-rate schedule used for training.
EPOCHS = config.EPOCHS
BATCHES = len(train_loader)
steps = []
lrs = []
# A single dummy parameter is enough to drive the scheduler; building the full
# pretrained backbone just to plot a curve is unnecessary.
schedule_probe = nn.Parameter(torch.zeros(1))
optimizer = torch.optim.AdamW([schedule_probe], lr=1e-4)
# Same OneCycleLR settings as in `train_loop` (pct_start=0.1), so the plot shows
# the schedule shape that is actually trained with.
scheduler = OneCycleLR(
    optimizer,
    max_lr=1e-3,
    epochs=EPOCHS,
    steps_per_epoch=BATCHES,
    pct_start=0.1,
    anneal_strategy="cos",
    final_div_factor=100,
)
for epoch in range(EPOCHS):
    for batch in range(BATCHES):
        # Step the optimizer first (a no-op without gradients) so PyTorch does
        # not warn about calling scheduler.step() before optimizer.step().
        optimizer.step()
        scheduler.step()
        lrs.append(scheduler.get_last_lr()[0])
        steps.append(epoch * BATCHES + batch)

max_lr = max(lrs)
min_lr = min(lrs)
print(f"Maximum LR: {max_lr} | Minimum LR: {min_lr}")
plt.figure()
plt.plot(steps, lrs, label="OneCycle")
plt.ticklabel_format(axis="y", style="sci", scilimits=(0, 0))
plt.xlabel("Step")
plt.ylabel("Learning Rate")
plt.show()

# <b><span style='color:#F1A424'>|</span> Loss Function</b><a class='anchor' id='loss'></a> [↑](#top)

---

In PyTorch's [KLDivLoss][1], the reduction parameter determines how the loss is aggregated across different dimensions. Two common options are `mean` and `batchmean`.

- `reduction`='mean': When reduction is set to "mean", the Kullback-Leibler Divergence loss is computed and then averaged over all the elements in the input tensor. The result is a scalar value representing the mean loss.
- `reduction`='batchmean': When reduction is set to "batchmean", the Kullback-Leibler Divergence loss is computed independently for each item in the batch, and then the mean is taken over the batch dimension. This is useful when you have a batch of samples, and you want the average loss per sample.

[1]: https://pytorch.org/docs/stable/generated/torch.nn.KLDivLoss.html


In [None]:
import torch.nn.functional as F

# === Reduction = "mean" ===
# Random (6, 2) log-probabilities against random (6, 2) probabilities; each row
# is normalised along dim=1.
criterion = nn.KLDivLoss(reduction="mean")
y_pred = F.log_softmax(torch.randn(6, 2, requires_grad=True), dim=1)
y_true = F.softmax(torch.rand(6, 2), dim=1)
print(f"Predictions: {y_pred}")
print(f"Targets: {y_true}")
output = criterion(y_pred, y_true)
print(f"Output: {output}")

print("\n", "=" * 100, "\n")

# === Reduction = "batchmean" ===
# Same experiment with (2, 6) tensors; this is the reduction the training code
# uses. The input must hold log-probabilities, the target plain probabilities.
criterion = nn.KLDivLoss(reduction="batchmean")
y_pred = F.log_softmax(torch.randn(2, 6, requires_grad=True), dim=1)
y_true = F.softmax(torch.rand(2, 6), dim=1)
print(f"Predictions: {y_pred}")
print(f"Targets: {y_true}")
output = criterion(y_pred, y_true)
print(f"Output: {output}")

# <b><span style='color:#F1A424'>|</span> Train and Validation Functions</b><a class='anchor' id='functions'></a> [↑](#top)

---

We train using Group KFold on patient id. If `LOAD_MODELS_FROM = None`, then we will train new models in this notebook version. Otherwise we will load saved models from the path `LOAD_MODELS_FROM`.


In [None]:
def train_epoch(train_loader, model, criterion, optimizer, epoch, scheduler, device):
    """One epoch training pass.

    :param train_loader: iterable of ``(X, y)`` batches.
    :param model: network returning raw logits.
    :param criterion: loss taking ``(log_probabilities, target_probabilities)``;
        the passed object is used (it used to be overwritten locally).
    :param optimizer: optimizer stepped every
        ``config.GRADIENT_ACCUMULATION_STEPS`` batches.
    :param epoch: zero-based epoch index, used for logging only.
    :param scheduler: LR scheduler stepped together with the optimizer.
    :param device: device the batches are moved to.
    :return: sample-weighted average of the per-batch loss.
    """
    model.train()
    # NOTE: the scaler is re-created on every call, so its loss scale restarts
    # from the default each epoch; keeping it across epochs would require
    # passing it in.
    scaler = torch.cuda.amp.GradScaler(enabled=config.AMP)
    losses = AverageMeter()
    start = time.time()
    accumulation_steps = config.GRADIENT_ACCUMULATION_STEPS
    # Reported until the first optimizer step when accumulating gradients.
    grad_norm = float("nan")
    # Start from clean gradients (an incomplete accumulation window at the end
    # of the previous epoch would otherwise leak into this one).
    optimizer.zero_grad()

    # ========== ITERATE OVER TRAIN BATCHES ============
    with tqdm(train_loader, unit="train_batch", desc="Train") as tqdm_train_loader:
        for step, (X, y) in enumerate(tqdm_train_loader):
            X = X.to(device)
            y = y.to(device)
            batch_size = y.size(0)
            with torch.cuda.amp.autocast(enabled=config.AMP):
                y_preds = model(X)
                loss = criterion(F.log_softmax(y_preds, dim=1), y)
            # Track the true batch loss; only the back-propagated value is
            # divided by the number of accumulation steps.
            losses.update(loss.item(), batch_size)
            scaler.scale(loss / accumulation_steps).backward()

            if (step + 1) % accumulation_steps == 0:
                # Gradients must be unscaled before clipping, otherwise the
                # threshold (and the logged norm) apply to scaled gradients.
                scaler.unscale_(optimizer)
                grad_norm = torch.nn.utils.clip_grad_norm_(
                    model.parameters(), config.MAX_GRAD_NORM
                )
                scaler.step(optimizer)
                scaler.update()
                optimizer.zero_grad()
                scheduler.step()

            # ========== LOG INFO ==========
            if step % config.PRINT_FREQ == 0 or step == (len(train_loader) - 1):
                print(
                    "Epoch: [{0}][{1}/{2}] "
                    "Elapsed {remain:s} "
                    "Loss: {loss.avg:.4f} "
                    "Grad: {grad_norm:.4f}  "
                    "LR: {lr:.8f}  ".format(
                        epoch + 1,
                        step,
                        len(train_loader),
                        remain=timeSince(start, float(step + 1) / len(train_loader)),
                        loss=losses,
                        grad_norm=grad_norm,
                        lr=scheduler.get_last_lr()[0],
                    )
                )

    return losses.avg


def valid_epoch(valid_loader, model, criterion, device):
    """One evaluation pass without gradient tracking.

    :param valid_loader: iterable of ``(X, y)`` batches.
    :param model: network returning raw logits.
    :param criterion: loss taking ``(log_probabilities, target_probabilities)``.
    :param device: device the batches are moved to.
    :return: ``(average_loss, {"predictions": ndarray})`` where the predictions
        are softmax probabilities, one row per sample in loader order.
    """
    model.eval()
    losses = AverageMeter()
    prediction_dict = {}
    preds = []
    start = time.time()
    with tqdm(valid_loader, unit="valid_batch", desc="Validation") as tqdm_valid_loader:
        for step, (X, y) in enumerate(tqdm_valid_loader):
            X = X.to(device)
            y = y.to(device)
            batch_size = y.size(0)
            with torch.no_grad():
                y_preds = model(X)
                loss = criterion(F.log_softmax(y_preds, dim=1), y)
                probabilities = F.softmax(y_preds, dim=1)
            # The validation loss is reported as-is: dividing it by the gradient
            # accumulation factor (as before) only mis-scaled the metric.
            losses.update(loss.item(), batch_size)
            preds.append(probabilities.to("cpu").numpy())

            # ========== LOG INFO ==========
            if step % config.PRINT_FREQ == 0 or step == (len(valid_loader) - 1):
                print(
                    "EVAL: [{0}/{1}] "
                    "Elapsed {remain:s} "
                    "Loss: {loss.avg:.4f} ".format(
                        step,
                        len(valid_loader),
                        remain=timeSince(start, float(step + 1) / len(valid_loader)),
                        loss=losses,
                    )
                )

    prediction_dict["predictions"] = np.concatenate(preds)
    return losses.avg, prediction_dict

# <b><span style='color:#F1A424'>|</span> Train Loop</b><a class='anchor' id='train_loop'></a> [↑](#top)

---


In [None]:
def train_loop(df, fold):
    """Train on every fold except ``fold`` and validate on ``fold``.

    The weights with the lowest validation loss are saved to
    ``paths.OUTPUT_DIR`` together with that epoch's validation predictions.

    :param df: frame with a ``fold`` column plus everything ``CustomDataset`` reads.
    :param fold: index of the held-out fold.
    :return: the validation rows with the best epoch's class probabilities added
        in the ``target_preds`` columns.
    :raises RuntimeError: if no epoch produced a finite, improving validation loss.
    """

    LOGGER.info(f"========== Fold: {fold} training ==========")

    # ======== SPLIT ==========
    train_folds = df[df["fold"] != fold].reset_index(drop=True)
    valid_folds = df[df["fold"] == fold].reset_index(drop=True)
    if config.n_annot_late:
        # Re-index after filtering so both frames are independent copies with a
        # clean 0..n-1 index (the prediction columns are assigned to one later).
        train_folds = train_folds[
            (train_folds["n_annot"] >= config.train_n_annot_min)
            & (train_folds["n_annot"] < config.train_n_annot_max)
        ].reset_index(drop=True)
        valid_folds = valid_folds[
            (valid_folds["n_annot"] >= config.val_n_annot_min)
            & (valid_folds["n_annot"] < config.val_n_annot_max)
        ].reset_index(drop=True)

    LOGGER.info(f"Training on {len(train_folds)} samples.")
    LOGGER.info(f"Validating on {len(valid_folds)} samples.")

    # ======== DATASETS ==========
    # The validation set also uses mode="train" so that its targets are returned.
    train_dataset = CustomDataset(train_folds, config, mode="train", augment=True)
    valid_dataset = CustomDataset(valid_folds, config, mode="train", augment=False)

    # ======== DATALOADERS ==========
    train_loader = DataLoader(
        train_dataset,
        batch_size=config.BATCH_SIZE_TRAIN,
        shuffle=True,
        num_workers=config.NUM_WORKERS,
        pin_memory=True,
        drop_last=True,
    )
    valid_loader = DataLoader(
        valid_dataset,
        batch_size=config.BATCH_SIZE_VALID,
        shuffle=False,
        num_workers=config.NUM_WORKERS,
        pin_memory=True,
        drop_last=False,
    )

    # ======== MODEL ==========
    model = CustomModel(config)
    print(f"Model: {model.config.MODEL}, Large CF: {model.config.LARGE_CLASSIFIER}")
    model.to(device)

    # NOTE(review): lr=0.1 is presumably only a placeholder — OneCycleLR sets the
    # learning rate itself from max_lr; confirm.
    optimizer = torch.optim.AdamW(
        model.parameters(), lr=0.1, weight_decay=config.WEIGHT_DECAY
    )
    scheduler = OneCycleLR(
        optimizer,
        max_lr=1e-3,
        epochs=config.EPOCHS,
        steps_per_epoch=len(train_loader),
        pct_start=0.1,
        anneal_strategy="cos",
        final_div_factor=100,
    )

    # ======= LOSS ==========
    criterion = nn.KLDivLoss(reduction="batchmean")

    checkpoint_path = os.path.join(
        paths.OUTPUT_DIR,
        f"{config.MODEL.replace('/', '_')}_fold_{fold}_best_{config.LARGE_CLASSIFIER}.pth",
    )

    best_loss = np.inf
    best_predictions = None
    # ====== ITERATE EPOCHS ========
    for epoch in range(config.EPOCHS):
        start_time = time.time()

        # ======= TRAIN ==========
        avg_train_loss = train_epoch(
            train_loader, model, criterion, optimizer, epoch, scheduler, device
        )

        # ======= EVALUATION ==========
        avg_val_loss, prediction_dict = valid_epoch(
            valid_loader, model, criterion, device
        )
        predictions = prediction_dict["predictions"]

        # ======= SCORING ==========
        elapsed = time.time() - start_time

        LOGGER.info(
            f"Epoch {epoch+1} - avg_train_loss: {avg_train_loss:.4f}  avg_val_loss: {avg_val_loss:.4f}  time: {elapsed:.0f}s"
        )

        if avg_val_loss < best_loss:
            best_loss = avg_val_loss
            best_predictions = predictions
            LOGGER.info(f"Epoch {epoch+1} - Save Best Loss: {best_loss:.4f} Model")
            torch.save(
                {"model": model.state_dict(), "predictions": predictions},
                checkpoint_path,
            )

    # Use the predictions kept in memory instead of re-loading the checkpoint:
    # this avoids unpickling a numpy array through torch.load and can never pick
    # up a stale file left behind by an earlier run.
    if best_predictions is None:
        raise RuntimeError(
            f"Fold {fold}: no epoch improved the validation loss "
            "(is the loss NaN?); no checkpoint was written."
        )
    valid_folds[target_preds] = best_predictions

    torch.cuda.empty_cache()
    gc.collect()

    return valid_folds

# <b><span style='color:#F1A424'>|</span> Train Full Data</b><a class='anchor' id='train_full'></a> [↑](#top)

---


In [None]:
def train_loop_full_data(df):
    """Train on every row of ``df`` (no validation), saving a checkpoint per epoch.

    :param df: frame with everything ``CustomDataset`` reads.
    :return: None. (The function used to end with ``return _``, a name that is
        not defined inside the function.)
    """
    train_dataset = CustomDataset(df, config, mode="train", augment=True)
    # Shuffle the training data, as `train_loop` does for its training loader.
    train_loader = DataLoader(
        train_dataset,
        batch_size=config.BATCH_SIZE_TRAIN,
        shuffle=True,
        num_workers=config.NUM_WORKERS,
        pin_memory=True,
        drop_last=True,
    )
    model = CustomModel(config)
    model.to(device)
    # NOTE(review): lr=0.1 is presumably only a placeholder — OneCycleLR sets the
    # learning rate itself from max_lr; confirm.
    optimizer = torch.optim.AdamW(
        model.parameters(), lr=0.1, weight_decay=config.WEIGHT_DECAY
    )
    scheduler = OneCycleLR(
        optimizer,
        max_lr=1e-3,
        epochs=config.EPOCHS,
        steps_per_epoch=len(train_loader),
        pct_start=0.1,
        anneal_strategy="cos",
        final_div_factor=100,
    )
    criterion = nn.KLDivLoss(reduction="batchmean")
    for epoch in range(config.EPOCHS):
        start_time = time.time()
        avg_train_loss = train_epoch(
            train_loader, model, criterion, optimizer, epoch, scheduler, device
        )
        elapsed = time.time() - start_time
        LOGGER.info(
            f"Epoch {epoch+1} - avg_train_loss: {avg_train_loss:.4f}  time: {elapsed:.0f}s"
        )
        # One checkpoint per epoch; the file name carries the zero-based epoch.
        torch.save(
            {"model": model.state_dict()},
            os.path.join(
                paths.OUTPUT_DIR,
                f"{config.MODEL.replace('/', '_')}_epoch_{epoch}.pth",
            ),
        )
    torch.cuda.empty_cache()
    gc.collect()
    return None

# <b><span style='color:#F1A424'>|</span> Train</b><a class='anchor' id='train'></a> [↑](#top)

---


In [None]:
def get_result(oof_df):
    """KL divergence (batchmean) between the vote distributions and the predictions.

    The ``target_preds`` columns already hold softmax probabilities (written by
    ``valid_epoch``), so they are turned into log-probabilities with a plain
    logarithm. Applying ``log_softmax`` to them, as before, normalised the
    probabilities a second time and reported a wrong score.

    :param oof_df: frame with the label columns and the ``target_preds`` columns.
    :return: scalar tensor with the KL divergence.
    """
    kl_loss = nn.KLDivLoss(reduction="batchmean")
    labels = torch.tensor(oof_df[label_cols].values, dtype=torch.float64)
    probabilities = torch.tensor(oof_df[target_preds].values, dtype=torch.float64)
    # Clamp before the log so an exact-zero probability cannot produce -inf.
    log_probabilities = torch.log(probabilities.clamp_min(1e-15))
    return kl_loss(log_probabilities, labels)


if not config.TRAIN_FULL_DATA:
    # Folds to train in this run; every other fold is skipped.
    FOLDS_TO_RUN = [4]  # subset of [0, 1, 2, 3, 4]
    fold_frames = []
    for fold in range(config.FOLDS):
        if fold not in FOLDS_TO_RUN:
            print(f"Skipping fold {fold}")
            continue
        _oof_df = train_loop(train_df, fold)
        fold_frames.append(_oof_df)
        # Score the fold once and reuse the value for both outputs.
        fold_result = get_result(_oof_df)
        LOGGER.info(f"========== Fold {fold} result: {fold_result} ==========")
        print(f"========== Fold {fold} result: {fold_result} ==========")
    # Concatenate once instead of growing a DataFrame inside the loop.
    oof_df = pd.concat(fold_frames).reset_index(drop=True)
    LOGGER.info(f"========== CV: {get_result(oof_df)} ==========")
    oof_df.to_csv(paths.OUTPUT_DIR + "/oof_df.csv", index=False)
else:
    train_loop_full_data(train_df)