# About this notebook ...

## Library

In [1]:
import glob
import json
import math
import os
import random
import time
import warnings
from collections import defaultdict
from contextlib import contextmanager

import numpy as np
import optuna
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedKFold
from torch.optim import SGD, Adam
from torch.optim.lr_scheduler import CosineAnnealingLR, CosineAnnealingWarmRestarts, ReduceLROnPlateau
from torch.utils.data import DataLoader, Dataset
from tqdm.notebook import tqdm

In [2]:
warnings.filterwarnings("ignore")

## Config

In [3]:
class Config:
    seed = 440

    n_class = 4
    n_fold = 10

    geese_net_layers = 12
    geese_net_filters = 32

    gradient_accumulation_steps = 1
    max_grad_norm = 1000

    num_workers = 4
    batch_size = 3200

    scheduler = "CosineAnnealingWarmRestarts"
    # factor = 0.2  # ReduceLROnPlateau
    # patience = 4  # ReduceLROnPlateau
    # eps = 1e-6  # ReduceLROnPlateau
    # T_max = 10  # CosineAnnealingLR
    T_0 = 10  # CosineAnnealingWarmRestarts

    criterion = "CrossEntropyLoss"
    lr = 1e-3
    min_lr = 1e-4
    weight_decay = 1e-5

    epochs = 10
    model_name = "geese_net"
    pre_train_file = ""

    print_freq = 100

    train = True
    tuning = False
    debug = False
    apex = False

In [4]:
if Config.tuning:
    Config.epochs = 2

if Config.debug:
    Config.epochs = 1

In [5]:
if Config.apex:
    from apex import amp

In [6]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

## Load Data

In [7]:
BASE_DIR = "../input/hungrygeeseepisode/hungry-geese-episode/"
OUTPUT_DIR = "pre-models/"

if not os.path.exists(OUTPUT_DIR):
    os.makedirs(OUTPUT_DIR)

In [8]:
paths = [path for path in glob.glob(BASE_DIR + "*.json") if "info" not in path]
print(len(paths))

22251


In [9]:
# fit for memory size...
# paths = paths[-12000:]
paths = paths[:-12000]
print(len(paths))

10251


In [10]:
if Config.debug:
    paths = paths[:10]

## Utils

In [11]:
@contextmanager
def timer(name):
    t0 = time.time()
    LOGGER.info(f"[{name}] start")
    yield
    LOGGER.info(f"[{name}] done in {time.time() - t0:.0f} s.")


def init_logger(log_file=OUTPUT_DIR + "train.log"):
    from logging import INFO, FileHandler, Formatter, StreamHandler, getLogger

    logger = getLogger(__name__)
    logger.setLevel(INFO)
    handler1 = StreamHandler()
    handler1.setFormatter(Formatter("%(message)s"))
    handler2 = FileHandler(filename=log_file)
    handler2.setFormatter(Formatter("%(message)s"))
    logger.addHandler(handler1)
    logger.addHandler(handler2)
    return logger


LOGGER = init_logger()


def seed_torch(seed=42):
    random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True


seed_torch(seed=Config.seed)

In [12]:
def reverse_ns(y):
    if y == 0:
        return 1
    if y == 1:
        return 0
    return y


def reverse_we(y):
    if y == 2:
        return 3
    if y == 3:
        return 2
    return y


def reverse_nswe(y):
    return reverse_ns(reverse_we(y))

## Observation

In [13]:
next_position_map = {}
for pos in range(77):
    position = []
    position.append((11 * (1 + pos // 11) + pos % 11) % 77)
    position.append((11 * (-1 + pos // 11) + pos % 11) % 77)
    position.append((11 * (pos // 11) + (pos + 1) % 11) % 77)
    position.append((11 * (pos // 11) + (pos - 1) % 11) % 77)
    next_position_map[pos] = set(position)

In [14]:
def make_input(obses):
    b = np.zeros((17, 7 * 11), dtype=np.float16)
    obs = obses[-1]

    for p, pos_list in enumerate(obs["geese"]):
        pid = (p - obs["index"]) % 4

        # head position
        for pos in pos_list[:1]:
            b[0 + pid, pos] = 1
        # tip position
        for pos in pos_list[-1:]:
            b[4 + pid, pos] = 1
        # whole position
        for pos in pos_list:
            b[8 + pid, pos] = 1

    # previous head position
    if len(obses) > 1:
        obs_prev = obses[-2]
        for p, pos_list in enumerate(obs_prev["geese"]):
            for pos in pos_list[:1]:
                b[12 + (p - obs["index"]) % 4, pos] = 1

    # food
    for pos in obs["food"]:
        b[16, pos] = 1

    return b.reshape(-1, 7, 11)

In [15]:
def get_reverse_cube(obses):
    """
    尻尾から順番に 1, 0.9, 0.8, ... という並び
    """
    b = np.zeros((4, 7 * 11), dtype=np.float16)
    obs = obses[-1]

    for p, geese in enumerate(obs["geese"]):
        # whole position reverse
        for num_reverse, pos in enumerate(geese[::-1]):
            b[(p - obs["index"]) % 4, pos] = 1 - num_reverse * 0.1

    return b.reshape(-1, 7, 11)

In [16]:
def get_next_disappear_cube(obses):
    """
    次になくなる場所: 1
    次になくなる可能性のある場所: 0.5
    """
    b = np.zeros((4, 7 * 11), dtype=np.float16)
    obs = obses[-1]
    step = obs["step"]

    # foodを食べる可能性があるか。
    eat_food_possibility = defaultdict(int)
    for p, geese in enumerate(obs["geese"]):
        for pos in geese[:1]:
            if not next_position_map[pos].isdisjoint(obs["food"]):
                eat_food_possibility[p] = 1

    if (step % 40) == 39:  # 1つ短くなる
        for p, geese in enumerate(obs["geese"]):
            if eat_food_possibility[p]:  # 尻尾が1、尻尾の１つ前0.5
                for pos in geese[-1:]:
                    b[(p - obs["index"]) % 4, pos] = 1
                for pos in geese[-2:-1]:
                    b[(p - obs["index"]) % 4, pos] = 0.5
            else:  # 食べる可能性なし -> 尻尾が1, 尻尾の1つ前1
                for pos in geese[-2:]:
                    b[(p - obs["index"]) % 4, pos] = 1
    else:  # 1つ短くならない
        for p, geese in enumerate(obs["geese"]):
            if eat_food_possibility[p]:  # 食べる可能性があり -> 尻尾を0.5
                for pos in geese[-1:]:
                    b[(p - obs["index"]) % 4, pos] = 0.5
            else:  # 食べる可能性なし # 尻尾を1
                for pos in geese[-1:]:
                    b[(p - obs["index"]) % 4, pos] = 1

    return b.reshape(-1, 7, 11)

In [17]:
def get_features(obses):
    b = np.zeros((7 * 11), dtype=np.float16)
    obs = obses[-1]
    step = obs["step"]

    my_goose = obs["geese"][obs["index"]]
    my_length = len(my_goose)

    # num step
    b[0] = step % 200
    b[1] = step % 40

    """
    2-5: geese length
    6-8: 1 if my_length is greater than opponent length
    9-11: difference between my_length and opponent length
    """
    for p, pos_list in enumerate(obs["geese"]):
        pid = (p - obs["index"]) % 4

        p_length = len(pos_list)
        b[2 + pid] = p_length

        if pid == 0:
            continue

        if my_length > p_length:
            b[5 + pid] = 1
            b[8 + pid] = my_length - p_length
        else:
            b[5 + pid] = 0
            b[8 + pid] = p_length - my_length

    """
    12-14: difference between my head position and opponent one
    """
    if my_length != 0:

        for p, pos_list in enumerate(obs["geese"]):
            pid = (p - obs["index"]) % 4

            if pid == 0 or len(pos_list) == 0:
                continue

            diff = abs(my_goose[0] - pos_list[0])
            x_ = diff % 11
            x = min(x_, 11 - x_)
            y_ = diff // 11
            y = min(y_, 7 - y_)
            b[11 + pid] = x + y

    return b.reshape(1, 7, 11)

## Data

In [18]:
def create_dataset_from_json(filepath, json_object=None, standing=0):
    if json_object is None:
        json_open = open(path, "r")
        json_load = json.load(json_open)
    else:
        json_load = json_object

    try:
        winner_index = np.argmax(np.argsort(json_load["rewards"]) == 3 - standing)

        obses = []
        X = []
        y = []
        actions = {"NORTH": 0, "SOUTH": 1, "WEST": 2, "EAST": 3}

        for i in range(len(json_load["steps"]) - 1):
            if json_load["steps"][i][winner_index]["status"] == "ACTIVE":
                y_ = json_load["steps"][i + 1][winner_index]["action"]
                if y_ is not None:
                    step = json_load["steps"][i]
                    step[winner_index]["observation"]["geese"] = step[0]["observation"]["geese"]
                    step[winner_index]["observation"]["food"] = step[0]["observation"]["food"]
                    step[winner_index]["observation"]["step"] = step[0]["observation"]["step"]
                    obses.append(step[winner_index]["observation"])
                    y.append(actions[y_])

                    y.append(reverse_ns(actions[y_]))  # 上下反転
                    y.append(reverse_we(actions[y_]))  # 左右反転
                    y.append(reverse_nswe(actions[y_]))  # 上下左右反転

        for j in range(len(obses)):
            # X_ = make_input(obses[: j + 1])

            # 反転可能な特徴量
            X_ = []
            X_.append(make_input(obses[: j + 1]))
            X_.append(get_reverse_cube(obses[: j + 1]))
            X_.append(get_next_disappear_cube(obses[: j + 1]))

            # 反転不可能な特徴量
            X_i = []
            X_i.append(get_features(obses[: j + 1]))

            X_ = np.concatenate(X_)
            X_i = np.concatenate(X_i)

            X.append(np.concatenate([X_, X_i]))
            X.append(np.concatenate([X_[:, ::-1, :], X_i]))  # 上下反転
            X.append(np.concatenate([X_[:, :, ::-1], X_i]))  # 左右反転
            X.append(np.concatenate([X_[:, ::-1, ::-1], X_i]))  # 上下左右反転

        X = np.array(X, dtype=np.float16)  # [starting_step:]
        y = np.array(y, dtype=np.uint8)  # [starting_step:]

        return X, y
    except Exception as e:
        if Config.debug:
            raise Exception from e
        return 0, 0

In [19]:
X_train = []
y_train = []

for path in tqdm(paths[: int(len(paths))]):
    X, y = create_dataset_from_json(path, standing=0)  # use only winners' moves
    if X is not 0:
        X_train.append(X)
        y_train.append(y)

X_train = np.concatenate(X_train)
y_train = np.concatenate(y_train)

print(f"Num episode: {len(X_train)}")

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=10251.0), HTML(value='')))


Num episode: 6381992


In [20]:
unique_ = False

In [21]:
# %%time

# X_train, unique_index = np.unique(X_train, axis=0, return_index=True)  # remove duplicate
# y_train = y_train[unique_index]

# y_train = np.eye(4, dtype="uint8")[y_train]  # to categorical

# print(f"Num episode: {len(X_train)}")

In [22]:
if unique_:
    X_train_sum_obs = X_train.reshape(X_train.shape[0], -1).sum(1)
    X_train_group = np.unique(X_train_sum_obs)
    X_train_group.shape

In [23]:
if unique_:
    X_train_unique = []
    y_train_unique = []
    for group in tqdm(X_train_group):
        group_index = np.where(X_train_sum_obs == group)

        X_train_ = X_train[group_index]
        y_train_ = y_train[group_index]

        X_train_, unique_index = np.unique(X_train_, axis=0, return_index=True)  # remove duplicate
        y_train_ = y_train_[unique_index]

        X_train_unique.append(X_train_)
        y_train_unique.append(y_train_)

    X_train = np.concatenate(X_train_unique)
    y_train = np.concatenate(y_train_unique)

    print(f"Num episode: {len(X_train)}")

In [24]:
if unique_:
    del X_train_sum_obs
    del X_train_group
    del X_train_unique
    del y_train_unique
    del X_train_
    del y_train_
    del group_index
    del unique_index

In [25]:
X_train = X_train.astype(np.float32)
X_train.dtype

dtype('float32')

In [26]:
if Config.debug:
    X_train = X_train[:1000]
    y_train = y_train[:1000]

In [27]:
y_df = pd.DataFrame(y_train, dtype=np.uint8)
y_df.columns = ["action"]
y_df

Unnamed: 0,action
0,3
1,3
2,2
3,2
4,3
...,...
6381987,2
6381988,3
6381989,3
6381990,2


## CV Split

In [28]:
folds = y_df.copy()
Fold = StratifiedKFold(n_splits=Config.n_fold, shuffle=True, random_state=Config.seed)
for n, (train_index, val_index) in enumerate(Fold.split(folds, folds["action"])):
    folds.loc[val_index, "fold"] = int(n)
folds["fold"] = folds["fold"].astype(np.uint8)
print(folds.groupby(["fold", "action"]).size())

fold  action
0     0         152973
      1         152972
      2         166127
      3         166128
1     0         152973
      1         152972
      2         166127
      3         166128
2     0         152972
      1         152972
      2         166127
      3         166128
3     0         152972
      1         152972
      2         166127
      3         166128
4     0         152972
      1         152972
      2         166128
      3         166127
5     0         152972
      1         152972
      2         166128
      3         166127
6     0         152972
      1         152972
      2         166128
      3         166127
7     0         152972
      1         152972
      2         166128
      3         166127
8     0         152972
      1         152973
      2         166127
      3         166127
9     0         152972
      1         152973
      2         166127
      3         166127
dtype: int64


## Dataset

In [29]:
class TrainDataset(Dataset):
    def __init__(self, array, label):
        self.array = array
        self.label = label

    def __len__(self):
        return self.array.shape[0]

    def __getitem__(self, idx):
        return self.array[idx], torch.tensor(self.label[idx]).long()


class TestDataset(Dataset):
    def __init__(self, array):
        self.array = array

    def __len__(self):
        return self.array.shape[0]

    def __getitem__(self, idx):
        return self.array[idx]

In [30]:
# Test

if Config.debug or False:
    train_ds = TrainDataset(X_train, y_train)

    for i in range(1):
        obs, action = train_ds[i]
        print(obs.shape, action)

## Model

In [31]:
class TorusConv2d(nn.Module):
    def __init__(self, input_dim, output_dim, kernel_size, bn):
        super().__init__()
        self.edge_size = (kernel_size[0] // 2, kernel_size[1] // 2)
        self.conv = nn.Conv2d(input_dim, output_dim, kernel_size=kernel_size)
        self.bn = nn.BatchNorm2d(output_dim) if bn else None

    def forward(self, x):
        h = torch.cat([x[:, :, :, -self.edge_size[1] :], x, x[:, :, :, : self.edge_size[1]]], dim=3)
        h = torch.cat([h[:, :, -self.edge_size[0] :], h, h[:, :, : self.edge_size[0]]], dim=2)
        h = self.conv(h)
        h = self.bn(h) if self.bn is not None else h
        return h

In [32]:
class GeeseNetAlpha(nn.Module):
    def __init__(self):
        super().__init__()

        layers = Config.geese_net_layers
        filters = Config.geese_net_filters
        dim = 324

        self.embed_step = nn.Embedding(200, 11)
        self.embed_hunger = nn.Embedding(40, 6)
        self.embed_length = nn.Embedding(100, 7)
        self.embed_diff_len = nn.Embedding(100, 8)
        self.embed_diff_head = nn.Embedding(9, 5)

        self.conv0 = TorusConv2d(25, filters, (3, 3), True)
        self.blocks = nn.ModuleList([TorusConv2d(filters, filters, (3, 3), True) for _ in range(layers)])

        self.attention = nn.MultiheadAttention(dim, 1)

        self.head_p1 = nn.Linear(dim, dim // 2, bias=False)
        self.head_p2 = nn.Linear(dim // 2, 4, bias=False)
        self.head_v1 = nn.Linear(dim, dim // 2, bias=False)
        self.head_v2 = nn.Linear(dim // 2, 1, bias=False)

    def forward(self, x, _=None):
        x_feats = x[:, -1].view(x.size(0), -1).long()

        # Embedding for features
        e_step = self.embed_step(x_feats[:, 0])
        e_hung = self.embed_hunger(x_feats[:, 1])
        e_leng = self.embed_length(x_feats[:, 2:6]).view(x.size(0), -1)
        e_diff_lb = x_feats[:, 6:9]
        e_diff_l = self.embed_diff_len(x_feats[:, 9:12]).view(x.size(0), -1)
        e_diff_h = self.embed_diff_head(x_feats[:, 12:15]).view(x.size(0), -1)

        x = x[:, :-1].float()

        # CNN for observation
        h = F.relu_(self.conv0(x))
        for block in self.blocks:
            h = F.relu_(h + block(h))

        # Extract head position
        h_head = (h * x[:, :1]).view(h.size(0), h.size(1), -1).sum(-1)
        h_head2 = (h * x[:, 1:2]).view(h.size(0), h.size(1), -1).sum(-1)
        h_head3 = (h * x[:, 2:3]).view(h.size(0), h.size(1), -1).sum(-1)
        h_head4 = (h * x[:, 3:4]).view(h.size(0), h.size(1), -1).sum(-1)
        h_avg1 = h.view(h.size(0), h.size(1), -1).mean(-1)
        h_avg2 = h.view(h.size(0), h.size(1), -1).mean(1)

        # Merge features
        h = torch.cat(
            [
                h_head,
                h_head2,
                h_head3,
                h_head4,
                h_avg1,
                h_avg2,
                e_step,
                e_hung,
                e_leng,
                e_diff_lb,
                e_diff_l,
                e_diff_h,
            ],
            1,
        ).view(1, h.size(0), -1)

        h, _ = self.attention(h, h, h)

        h_p = F.relu_(self.head_p1(h.view(x.size(0), -1)))
        p = self.head_p2(h_p)

        h_v = F.relu_(self.head_v1(h.view(x.size(0), -1)))
        v = torch.tanh(self.head_v2(h_v))

        return {"policy": p, "value": v}

In [33]:
# Test

if Config.debug or False:
    model = GeeseNetAlpha()
    # print(model)

    params = sum(p.numel() for p in model.parameters())
    print(f"params: {params:,}")

    train_ds = TrainDataset(X_train, y_train)
    train_loader = DataLoader(train_ds, batch_size=4, shuffle=True, num_workers=4, pin_memory=True, drop_last=True)

    for obs, action in train_loader:
        print(f"input shape: {obs.shape}")
        output = model(obs)
        print(output)
        print(f"{torch.argmax(output['policy'], dim=1)}")
        break

## Loss

## Scoring

In [34]:
def get_score(y_true, y_pred):
    return accuracy_score(y_true, y_pred)

In [35]:
def get_result(result_df):
    preds = result_df["preds"].values
    labels = result_df["action"].values
    score = get_score(labels, preds)
    LOGGER.info(f"Score: {score:<.5f}")
    return score

## Helper functions

In [36]:
class AverageMeter(object):
    """Computes and stores the average and current value"""

    def __init__(self):
        self.reset()

    def reset(self):
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count


def asMinutes(s):
    m = math.floor(s / 60)
    s -= m * 60
    return "%dm %ds" % (m, s)


def timeSince(since, percent):
    now = time.time()
    s = now - since
    es = s / (percent)
    rs = es - s
    return "%s (remain %s)" % (asMinutes(s), asMinutes(rs))

In [37]:
def train_fn(train_loader, model, criterion, optimizer, epoch, scheduler, device):
    losses = AverageMeter()

    # switch to train mode
    model.train()
    start = time.time()

    for step, (obs, action) in enumerate(train_loader):
        obs = obs.to(device)
        action = action.to(device)
        batch_size = action.size(0)

        y_preds = model(obs)["policy"]

        loss = criterion(y_preds, action)
        losses.update(loss.item(), batch_size)
        if Config.gradient_accumulation_steps > 1:
            loss = loss / Config.gradient_accumulation_steps
        if Config.apex:
            with amp.scale_loss(loss, optimizer) as scaled_loss:
                scaled_loss.backward()
        else:
            loss.backward()

        grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), Config.max_grad_norm)

        if (step + 1) % Config.gradient_accumulation_steps == 0:
            optimizer.step()
            optimizer.zero_grad()

        if step % Config.print_freq == 0 or step == (len(train_loader) - 1):
            print(
                f"Epoch: [{epoch + 1}][{step}/{len(train_loader)}] "
                f"Elapsed {timeSince(start, float(step + 1) / len(train_loader)):s} "
                f"Loss avg.: {losses.avg:.4f} "
                f"Grad: {grad_norm:.4f} "
                f"LR: {scheduler.get_last_lr()[0]:.5f}  "
            )

    return losses.avg

In [38]:
def valid_fn(valid_loader, model, criterion, device):
    losses = AverageMeter()

    # switch to evaluation mode
    model.eval()
    preds = []
    start = time.time()

    for step, (obs, action) in enumerate(valid_loader):
        obs = obs.to(device)
        action = action.to(device)
        batch_size = action.size(0)

        # compute loss
        with torch.no_grad():
            y_preds = model(obs)["policy"]

        loss = criterion(y_preds, action)
        losses.update(loss.item(), batch_size)

        # record accuracy
        preds.append(y_preds.softmax(1).to("cpu").numpy())
        if Config.gradient_accumulation_steps > 1:
            loss = loss / Config.gradient_accumulation_steps

        if step % Config.print_freq == 0 or step == (len(valid_loader) - 1):
            print(
                f"Eval: [{step}/{len(valid_loader)}] "
                f"Elapsed {timeSince(start, float(step + 1) / len(valid_loader)):s} "
                f"Loss avg.: {losses.avg:.4f} "
            )
    predictions = np.concatenate(preds)
    return losses.avg, predictions

## Train loop

In [39]:
def train_loop(folds, fold):

    LOGGER.info(f"========== fold: {fold} training ==========")

    # ====================================================
    # Data Loader
    # ====================================================
    # X_train_folds = X_train[folds["fold"] != fold]
    # X_valid_folds = X_train[folds["fold"] == fold]

    # y_train_folds = y_train[folds["fold"] != fold]
    y_valid_folds = y_train[folds["fold"] == fold]

    # y_df_train_folds = y_df[folds["fold"] != fold]
    y_df_valid_folds = y_df[folds["fold"] == fold]

    # train_dataset = TrainDataset(X_train[folds["fold"] != fold], y_train[folds["fold"] != fold])
    # valid_dataset = TrainDataset(X_train[folds["fold"] == fold], y_valid_folds)

    train_loader = DataLoader(
        TrainDataset(X_train[folds["fold"] != fold], y_train[folds["fold"] != fold]),
        batch_size=Config.batch_size,
        shuffle=True,
        num_workers=Config.num_workers,
        pin_memory=True,
        drop_last=True,
    )
    valid_loader = DataLoader(
        TrainDataset(X_train[folds["fold"] == fold], y_valid_folds),
        batch_size=Config.batch_size,
        shuffle=False,
        num_workers=Config.num_workers,
        pin_memory=True,
        drop_last=False,
    )

    # ====================================================
    # Scheduler
    # ====================================================
    def get_scheduler(optimizer):
        if Config.scheduler == "ReduceLROnPlateau":
            scheduler = ReduceLROnPlateau(
                optimizer, mode="min", factor=Config.factor, patience=Config.patience, verbose=True, eps=Config.eps
            )
        elif Config.scheduler == "CosineAnnealingLR":
            scheduler = CosineAnnealingLR(optimizer, T_max=Config.T_max, eta_min=Config.min_lr, last_epoch=-1)
        elif Config.scheduler == "CosineAnnealingWarmRestarts":
            scheduler = CosineAnnealingWarmRestarts(
                optimizer, T_0=Config.T_0, T_mult=1, eta_min=Config.min_lr, last_epoch=-1
            )
        return scheduler

    # ====================================================
    # model & optimizer
    # ====================================================
    model = GeeseNetAlpha()
    try:
        model.load_state_dict(torch.load(os.path.join(OUTPUT_DIR, Config.pre_train_file)))
    except:
        print(f"Failed to load pre-train weight.")

    # Disable training for value network
    for param in model.head_v1.parameters():
        param.requires_grad = False
    for param in model.head_v2.parameters():
        param.requires_grad = False

    model.to(device)

    # Use multi GPU
    if device == torch.device("cuda") and not Config.apex:
        model = torch.nn.DataParallel(model)  # make parallel

    optimizer = Adam(model.parameters(), lr=Config.lr, weight_decay=Config.weight_decay, amsgrad=False)
    scheduler = get_scheduler(optimizer)

    # ====================================================
    # apex
    # ====================================================
    if Config.apex:
        model, optimizer = amp.initialize(model, optimizer, opt_level="O1", verbosity=0)

    # ====================================================
    # Criterion
    # ====================================================
    def get_criterion():
        if Config.criterion == "CrossEntropyLoss":
            criterion = nn.CrossEntropyLoss()
        return criterion

    criterion = get_criterion()

    # ====================================================
    # loop
    # ====================================================
    best_score = 0.0
    best_loss = np.inf
    best_preds = None

    for epoch in range(Config.epochs):

        start_time = time.time()

        # train
        avg_loss = train_fn(train_loader, model, criterion, optimizer, epoch, scheduler, device)

        # eval
        avg_val_loss, preds = valid_fn(valid_loader, model, criterion, device)

        if isinstance(scheduler, ReduceLROnPlateau):
            scheduler.step(avg_val_loss)
        elif isinstance(scheduler, CosineAnnealingLR):
            scheduler.step()
        elif isinstance(scheduler, CosineAnnealingWarmRestarts):
            scheduler.step()

        # scoring
        score = get_score(y_valid_folds, preds.argmax(1))

        elapsed = time.time() - start_time

        LOGGER.info(
            f"Epoch {epoch+1} - avg_train_loss: {avg_loss:.4f}  avg_val_loss: {avg_val_loss:.4f}  time: {elapsed:.0f}s"
        )
        LOGGER.info(f"Epoch {epoch+1} - Accuracy: {score}")

        if score > best_score:
            best_score = score
            LOGGER.info(f"Epoch {epoch+1} - Save Best Score: {best_score:.4f} Model")
            torch.save(model.module.state_dict(), OUTPUT_DIR + f"{Config.model_name}_fold{fold}_best.pth")
            best_preds = preds

        if epoch == Config.epochs - 1:
            LOGGER.info(f"Epoch {epoch+1} - Save final model")
            torch.save(model.module.state_dict(), OUTPUT_DIR + f"{Config.model_name}_fold{fold}_final.pth")

    if Config.train:
        y_df_valid_folds[[str(c) for c in range(Config.n_class)]] = best_preds
        y_df_valid_folds["preds"] = best_preds.argmax(1)

        return y_df_valid_folds

    if Config.tuning:
        score = get_score(y_df_valid_folds["action"].values, best_preds.argmax(1))
        return score

In [40]:
def objective(trial):
    Config.geese_net_layers = trial.suggest_int("layers", 6, 18)
    Config.geese_net_filters = trial.suggest_int("filters", 32, 128)

    score = train_loop(folds, 0)
    return score

## Main


In [41]:
def main():
    if Config.train:
        # train
        oof_df = pd.DataFrame()
        for fold in range(Config.n_fold):
            _oof_df = train_loop(folds, fold)
            oof_df = pd.concat([oof_df, _oof_df])
            LOGGER.info(f"========== fold: {fold} result ==========")
            get_result(_oof_df)
            break  # fold 1つだけ
        # CV result
        # LOGGER.info(f"========== CV ==========")
        # get_result(oof_df)
        # save result
        oof_df.to_csv(OUTPUT_DIR + "oof_df.csv", index=False)

    if Config.tuning:
        study = optuna.create_study(direction="maximize")
        study.optimize(objective, n_trials=10)

        trial = study.best_trial
        print("Best trial:")
        print("  Value: ", trial.value)
        print("  Params: ")
        for key, value in trial.params.items():
            print("    {}: {}".format(key, value))

In [42]:
if __name__ == "__main__":
    main()



Failed to load pre-train weight.
Epoch: [1][0/1794] Elapsed 0m 3s (remain 106m 4s) Loss avg.: 1.4126 Grad: 1.0745 LR: 0.00100  
Epoch: [1][100/1794] Elapsed 0m 32s (remain 9m 2s) Loss avg.: 0.7706 Grad: 1.0980 LR: 0.00100  
Epoch: [1][200/1794] Elapsed 1m 1s (remain 8m 4s) Loss avg.: 0.6814 Grad: 0.9063 LR: 0.00100  
Epoch: [1][300/1794] Elapsed 1m 30s (remain 7m 26s) Loss avg.: 0.6432 Grad: 0.8570 LR: 0.00100  
Epoch: [1][400/1794] Elapsed 1m 59s (remain 6m 53s) Loss avg.: 0.6189 Grad: 0.9146 LR: 0.00100  
Epoch: [1][500/1794] Elapsed 2m 28s (remain 6m 22s) Loss avg.: 0.6014 Grad: 1.3915 LR: 0.00100  
Epoch: [1][600/1794] Elapsed 2m 57s (remain 5m 51s) Loss avg.: 0.5878 Grad: 1.3533 LR: 0.00100  
Epoch: [1][700/1794] Elapsed 3m 26s (remain 5m 21s) Loss avg.: 0.5772 Grad: 1.1665 LR: 0.00100  
Epoch: [1][800/1794] Elapsed 3m 55s (remain 4m 51s) Loss avg.: 0.5686 Grad: 0.7836 LR: 0.00100  
Epoch: [1][900/1794] Elapsed 4m 24s (remain 4m 22s) Loss avg.: 0.5611 Grad: 0.7663 LR: 0.00100  
Ep

Epoch 1 - avg_train_loss: 0.5267  avg_val_loss: 0.4827  time: 541s
Epoch 1 - Accuracy: 0.7898088373550611
Epoch 1 - Save Best Score: 0.7898 Model


Epoch: [2][0/1794] Elapsed 0m 1s (remain 41m 26s) Loss avg.: 0.5057 Grad: 0.7053 LR: 0.00098  
Epoch: [2][100/1794] Elapsed 0m 30s (remain 8m 27s) Loss avg.: 0.4805 Grad: 0.8124 LR: 0.00098  
Epoch: [2][200/1794] Elapsed 0m 59s (remain 7m 48s) Loss avg.: 0.4793 Grad: 0.6930 LR: 0.00098  
Epoch: [2][300/1794] Elapsed 1m 28s (remain 7m 17s) Loss avg.: 0.4804 Grad: 0.4962 LR: 0.00098  
Epoch: [2][400/1794] Elapsed 1m 57s (remain 6m 46s) Loss avg.: 0.4805 Grad: 0.5619 LR: 0.00098  
Epoch: [2][500/1794] Elapsed 2m 26s (remain 6m 16s) Loss avg.: 0.4803 Grad: 0.4345 LR: 0.00098  
Epoch: [2][600/1794] Elapsed 2m 55s (remain 5m 47s) Loss avg.: 0.4798 Grad: 0.5748 LR: 0.00098  
Epoch: [2][700/1794] Elapsed 3m 24s (remain 5m 18s) Loss avg.: 0.4790 Grad: 0.4678 LR: 0.00098  
Epoch: [2][800/1794] Elapsed 3m 53s (remain 4m 49s) Loss avg.: 0.4788 Grad: 0.6104 LR: 0.00098  
Epoch: [2][900/1794] Elapsed 4m 22s (remain 4m 19s) Loss avg.: 0.4783 Grad: 0.6998 LR: 0.00098  
Epoch: [2][1000/1794] Elapsed 4m

Epoch 2 - avg_train_loss: 0.4758  avg_val_loss: 0.4695  time: 538s
Epoch 2 - Accuracy: 0.7964854277655907
Epoch 2 - Save Best Score: 0.7965 Model


Epoch: [3][0/1794] Elapsed 0m 1s (remain 43m 55s) Loss avg.: 0.4700 Grad: 0.5338 LR: 0.00091  
Epoch: [3][100/1794] Elapsed 0m 30s (remain 8m 31s) Loss avg.: 0.4667 Grad: 0.5614 LR: 0.00091  
Epoch: [3][200/1794] Elapsed 0m 59s (remain 7m 53s) Loss avg.: 0.4664 Grad: 0.5605 LR: 0.00091  
Epoch: [3][300/1794] Elapsed 1m 28s (remain 7m 20s) Loss avg.: 0.4658 Grad: 0.6559 LR: 0.00091  
Epoch: [3][400/1794] Elapsed 1m 57s (remain 6m 49s) Loss avg.: 0.4659 Grad: 0.4052 LR: 0.00091  
Epoch: [3][500/1794] Elapsed 2m 26s (remain 6m 19s) Loss avg.: 0.4664 Grad: 0.3396 LR: 0.00091  
Epoch: [3][600/1794] Elapsed 2m 56s (remain 5m 49s) Loss avg.: 0.4662 Grad: 0.5704 LR: 0.00091  
Epoch: [3][700/1794] Elapsed 3m 25s (remain 5m 19s) Loss avg.: 0.4657 Grad: 0.4209 LR: 0.00091  
Epoch: [3][800/1794] Elapsed 3m 54s (remain 4m 50s) Loss avg.: 0.4656 Grad: 0.4056 LR: 0.00091  
Epoch: [3][900/1794] Elapsed 4m 23s (remain 4m 20s) Loss avg.: 0.4658 Grad: 0.5724 LR: 0.00091  
Epoch: [3][1000/1794] Elapsed 4m

Epoch 3 - avg_train_loss: 0.4655  avg_val_loss: 0.4647  time: 540s
Epoch 3 - Accuracy: 0.7987151363209025
Epoch 3 - Save Best Score: 0.7987 Model


Epoch: [4][0/1794] Elapsed 0m 1s (remain 41m 21s) Loss avg.: 0.4556 Grad: 0.3991 LR: 0.00081  
Epoch: [4][100/1794] Elapsed 0m 30s (remain 8m 28s) Loss avg.: 0.4588 Grad: 0.3424 LR: 0.00081  
Epoch: [4][200/1794] Elapsed 0m 59s (remain 7m 50s) Loss avg.: 0.4600 Grad: 0.3638 LR: 0.00081  
Epoch: [4][300/1794] Elapsed 1m 28s (remain 7m 17s) Loss avg.: 0.4602 Grad: 0.5142 LR: 0.00081  
Epoch: [4][400/1794] Elapsed 1m 57s (remain 6m 47s) Loss avg.: 0.4600 Grad: 0.4954 LR: 0.00081  
Epoch: [4][500/1794] Elapsed 2m 26s (remain 6m 16s) Loss avg.: 0.4600 Grad: 0.4509 LR: 0.00081  
Epoch: [4][600/1794] Elapsed 2m 55s (remain 5m 47s) Loss avg.: 0.4604 Grad: 0.4398 LR: 0.00081  
Epoch: [4][700/1794] Elapsed 3m 24s (remain 5m 18s) Loss avg.: 0.4599 Grad: 0.6407 LR: 0.00081  
Epoch: [4][800/1794] Elapsed 3m 53s (remain 4m 49s) Loss avg.: 0.4599 Grad: 0.4257 LR: 0.00081  
Epoch: [4][900/1794] Elapsed 4m 22s (remain 4m 20s) Loss avg.: 0.4599 Grad: 0.3944 LR: 0.00081  
Epoch: [4][1000/1794] Elapsed 4m

Epoch 4 - avg_train_loss: 0.4595  avg_val_loss: 0.4634  time: 539s
Epoch 4 - Accuracy: 0.7991162644938891
Epoch 4 - Save Best Score: 0.7991 Model


Epoch: [5][0/1794] Elapsed 0m 1s (remain 44m 48s) Loss avg.: 0.4691 Grad: 0.4792 LR: 0.00069  
Epoch: [5][100/1794] Elapsed 0m 30s (remain 8m 32s) Loss avg.: 0.4534 Grad: 0.3588 LR: 0.00069  
Epoch: [5][200/1794] Elapsed 0m 59s (remain 7m 53s) Loss avg.: 0.4544 Grad: 0.4103 LR: 0.00069  
Epoch: [5][300/1794] Elapsed 1m 28s (remain 7m 20s) Loss avg.: 0.4544 Grad: 0.5985 LR: 0.00069  
Epoch: [5][400/1794] Elapsed 1m 58s (remain 6m 50s) Loss avg.: 0.4539 Grad: 0.4865 LR: 0.00069  
Epoch: [5][500/1794] Elapsed 2m 27s (remain 6m 19s) Loss avg.: 0.4543 Grad: 0.3719 LR: 0.00069  
Epoch: [5][600/1794] Elapsed 2m 56s (remain 5m 49s) Loss avg.: 0.4544 Grad: 0.3574 LR: 0.00069  
Epoch: [5][700/1794] Elapsed 3m 25s (remain 5m 20s) Loss avg.: 0.4546 Grad: 0.3612 LR: 0.00069  
Epoch: [5][800/1794] Elapsed 3m 54s (remain 4m 50s) Loss avg.: 0.4546 Grad: 0.5201 LR: 0.00069  
Epoch: [5][900/1794] Elapsed 4m 23s (remain 4m 21s) Loss avg.: 0.4544 Grad: 0.3487 LR: 0.00069  
Epoch: [5][1000/1794] Elapsed 4m

Epoch 5 - avg_train_loss: 0.4547  avg_val_loss: 0.4560  time: 539s
Epoch 5 - Accuracy: 0.8029034785333751
Epoch 5 - Save Best Score: 0.8029 Model


Epoch: [6][0/1794] Elapsed 0m 1s (remain 42m 26s) Loss avg.: 0.4402 Grad: 0.3685 LR: 0.00055  
Epoch: [6][100/1794] Elapsed 0m 30s (remain 8m 26s) Loss avg.: 0.4516 Grad: 0.3568 LR: 0.00055  
Epoch: [6][200/1794] Elapsed 0m 59s (remain 7m 48s) Loss avg.: 0.4502 Grad: 0.3990 LR: 0.00055  
Epoch: [6][300/1794] Elapsed 1m 28s (remain 7m 16s) Loss avg.: 0.4498 Grad: 0.3793 LR: 0.00055  
Epoch: [6][400/1794] Elapsed 1m 57s (remain 6m 46s) Loss avg.: 0.4500 Grad: 0.4265 LR: 0.00055  
Epoch: [6][500/1794] Elapsed 2m 26s (remain 6m 17s) Loss avg.: 0.4501 Grad: 0.4202 LR: 0.00055  
Epoch: [6][600/1794] Elapsed 2m 55s (remain 5m 47s) Loss avg.: 0.4502 Grad: 0.3246 LR: 0.00055  
Epoch: [6][700/1794] Elapsed 3m 24s (remain 5m 18s) Loss avg.: 0.4503 Grad: 0.5350 LR: 0.00055  
Epoch: [6][800/1794] Elapsed 3m 53s (remain 4m 49s) Loss avg.: 0.4505 Grad: 0.3006 LR: 0.00055  
Epoch: [6][900/1794] Elapsed 4m 22s (remain 4m 20s) Loss avg.: 0.4505 Grad: 0.5226 LR: 0.00055  
Epoch: [6][1000/1794] Elapsed 4m

Epoch 6 - avg_train_loss: 0.4503  avg_val_loss: 0.4544  time: 540s
Epoch 6 - Accuracy: 0.8039235349420244
Epoch 6 - Save Best Score: 0.8039 Model


Epoch: [7][0/1794] Elapsed 0m 1s (remain 43m 46s) Loss avg.: 0.4374 Grad: 0.3070 LR: 0.00041  
Epoch: [7][100/1794] Elapsed 0m 30s (remain 8m 35s) Loss avg.: 0.4470 Grad: 0.4263 LR: 0.00041  
Epoch: [7][200/1794] Elapsed 0m 59s (remain 7m 53s) Loss avg.: 0.4478 Grad: 0.4581 LR: 0.00041  
Epoch: [7][300/1794] Elapsed 1m 28s (remain 7m 20s) Loss avg.: 0.4473 Grad: 0.3623 LR: 0.00041  
Epoch: [7][400/1794] Elapsed 1m 57s (remain 6m 49s) Loss avg.: 0.4477 Grad: 0.4550 LR: 0.00041  
Epoch: [7][500/1794] Elapsed 2m 26s (remain 6m 18s) Loss avg.: 0.4470 Grad: 0.3230 LR: 0.00041  
Epoch: [7][600/1794] Elapsed 2m 55s (remain 5m 48s) Loss avg.: 0.4466 Grad: 0.3533 LR: 0.00041  
Epoch: [7][700/1794] Elapsed 3m 24s (remain 5m 19s) Loss avg.: 0.4468 Grad: 0.5908 LR: 0.00041  
Epoch: [7][800/1794] Elapsed 3m 53s (remain 4m 49s) Loss avg.: 0.4467 Grad: 0.4448 LR: 0.00041  
Epoch: [7][900/1794] Elapsed 4m 23s (remain 4m 20s) Loss avg.: 0.4467 Grad: 0.4171 LR: 0.00041  
Epoch: [7][1000/1794] Elapsed 4m

Epoch 7 - avg_train_loss: 0.4467  avg_val_loss: 0.4513  time: 540s
Epoch 7 - Accuracy: 0.8050219366969602
Epoch 7 - Save Best Score: 0.8050 Model


Epoch: [8][0/1794] Elapsed 0m 1s (remain 40m 45s) Loss avg.: 0.4174 Grad: 0.2912 LR: 0.00029  
Epoch: [8][100/1794] Elapsed 0m 30s (remain 8m 27s) Loss avg.: 0.4425 Grad: 0.3412 LR: 0.00029  
Epoch: [8][200/1794] Elapsed 0m 59s (remain 7m 49s) Loss avg.: 0.4427 Grad: 0.3515 LR: 0.00029  
Epoch: [8][300/1794] Elapsed 1m 28s (remain 7m 17s) Loss avg.: 0.4425 Grad: 0.3788 LR: 0.00029  
Epoch: [8][400/1794] Elapsed 1m 57s (remain 6m 47s) Loss avg.: 0.4424 Grad: 0.2969 LR: 0.00029  
Epoch: [8][500/1794] Elapsed 2m 26s (remain 6m 17s) Loss avg.: 0.4426 Grad: 0.3366 LR: 0.00029  
Epoch: [8][600/1794] Elapsed 2m 55s (remain 5m 48s) Loss avg.: 0.4429 Grad: 0.3967 LR: 0.00029  
Epoch: [8][700/1794] Elapsed 3m 24s (remain 5m 18s) Loss avg.: 0.4428 Grad: 0.4355 LR: 0.00029  
Epoch: [8][800/1794] Elapsed 3m 53s (remain 4m 49s) Loss avg.: 0.4427 Grad: 0.3034 LR: 0.00029  
Epoch: [8][900/1794] Elapsed 4m 22s (remain 4m 20s) Loss avg.: 0.4431 Grad: 0.5039 LR: 0.00029  
Epoch: [8][1000/1794] Elapsed 4m

Epoch 8 - avg_train_loss: 0.4434  avg_val_loss: 0.4501  time: 539s
Epoch 8 - Accuracy: 0.8054810404261987
Epoch 8 - Save Best Score: 0.8055 Model


Eval: [199/200] Elapsed 0m 16s (remain 0m 0s) Loss avg.: 0.4501 
Epoch: [9][0/1794] Elapsed 0m 1s (remain 44m 0s) Loss avg.: 0.4340 Grad: 0.4222 LR: 0.00019  
Epoch: [9][100/1794] Elapsed 0m 30s (remain 8m 36s) Loss avg.: 0.4383 Grad: 0.3655 LR: 0.00019  
Epoch: [9][200/1794] Elapsed 0m 59s (remain 7m 54s) Loss avg.: 0.4398 Grad: 0.4049 LR: 0.00019  
Epoch: [9][300/1794] Elapsed 1m 28s (remain 7m 20s) Loss avg.: 0.4409 Grad: 0.2714 LR: 0.00019  
Epoch: [9][400/1794] Elapsed 1m 57s (remain 6m 49s) Loss avg.: 0.4404 Grad: 0.4010 LR: 0.00019  
Epoch: [9][500/1794] Elapsed 2m 26s (remain 6m 19s) Loss avg.: 0.4405 Grad: 0.3322 LR: 0.00019  
Epoch: [9][600/1794] Elapsed 2m 56s (remain 5m 49s) Loss avg.: 0.4405 Grad: 0.3308 LR: 0.00019  
Epoch: [9][700/1794] Elapsed 3m 25s (remain 5m 19s) Loss avg.: 0.4407 Grad: 0.4179 LR: 0.00019  
Epoch: [9][800/1794] Elapsed 3m 54s (remain 4m 50s) Loss avg.: 0.4407 Grad: 0.3431 LR: 0.00019  
Epoch: [9][900/1794] Elapsed 4m 23s (remain 4m 21s) Loss avg.: 0.

Epoch 9 - avg_train_loss: 0.4407  avg_val_loss: 0.4488  time: 540s
Epoch 9 - Accuracy: 0.8062112190535882
Epoch 9 - Save Best Score: 0.8062 Model


Epoch: [10][0/1794] Elapsed 0m 1s (remain 40m 56s) Loss avg.: 0.4314 Grad: 0.3215 LR: 0.00012  
Epoch: [10][100/1794] Elapsed 0m 30s (remain 8m 29s) Loss avg.: 0.4403 Grad: 0.3150 LR: 0.00012  
Epoch: [10][200/1794] Elapsed 0m 59s (remain 7m 50s) Loss avg.: 0.4385 Grad: 0.2875 LR: 0.00012  
Epoch: [10][300/1794] Elapsed 1m 28s (remain 7m 18s) Loss avg.: 0.4385 Grad: 0.3171 LR: 0.00012  
Epoch: [10][400/1794] Elapsed 1m 57s (remain 6m 48s) Loss avg.: 0.4389 Grad: 0.3276 LR: 0.00012  
Epoch: [10][500/1794] Elapsed 2m 26s (remain 6m 18s) Loss avg.: 0.4386 Grad: 0.4329 LR: 0.00012  
Epoch: [10][600/1794] Elapsed 2m 55s (remain 5m 49s) Loss avg.: 0.4387 Grad: 0.4370 LR: 0.00012  
Epoch: [10][700/1794] Elapsed 3m 25s (remain 5m 19s) Loss avg.: 0.4388 Grad: 0.4442 LR: 0.00012  
Epoch: [10][800/1794] Elapsed 3m 54s (remain 4m 50s) Loss avg.: 0.4386 Grad: 0.3318 LR: 0.00012  
Epoch: [10][900/1794] Elapsed 4m 23s (remain 4m 20s) Loss avg.: 0.4385 Grad: 0.3075 LR: 0.00012  
Epoch: [10][1000/1794]

Epoch 10 - avg_train_loss: 0.4388  avg_val_loss: 0.4480  time: 540s
Epoch 10 - Accuracy: 0.8065042306486995
Epoch 10 - Save Best Score: 0.8065 Model
Epoch 10 - Save final model
Score: 0.80650
