# About this notebook ...

## Library

In [1]:
import glob
import json
import math
import os
import random
import time
import warnings
from collections import defaultdict
from contextlib import contextmanager

import numpy as np
import optuna
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedKFold
from torch.optim import SGD, Adam
from torch.optim.lr_scheduler import CosineAnnealingLR, CosineAnnealingWarmRestarts, ReduceLROnPlateau
from torch.utils.data import DataLoader, Dataset
from tqdm.notebook import tqdm

In [2]:
warnings.filterwarnings("ignore")

## Config

In [3]:
class Config:
    seed = 440

    n_class = 4
    n_fold = 10

    geese_net_layers = 12
    geese_net_filters = 64

    gradient_accumulation_steps = 1
    max_grad_norm = 1000

    num_workers = 4
    batch_size = 3200

    scheduler = "CosineAnnealingWarmRestarts"
    # factor = 0.2  # ReduceLROnPlateau
    # patience = 4  # ReduceLROnPlateau
    # eps = 1e-6  # ReduceLROnPlateau
    # T_max = 10  # CosineAnnealingLR
    T_0 = 10  # CosineAnnealingWarmRestarts

    criterion = "CrossEntropyLoss"
    lr = 1e-3
    min_lr = 1e-4
    weight_decay = 1e-5

    epochs = 10
    model_name = "geese_net"
    pre_train_file = "geese_net_fold0_best_64_bbe3e7ad850239e497e7b15eb2c6caaf2fe33938.pth"

    print_freq = 100

    train = True
    tuning = False
    debug = False
    apex = False

In [4]:
if Config.tuning:
    Config.epochs = 2

if Config.debug:
    Config.epochs = 1

In [5]:
if Config.apex:
    from apex import amp

In [6]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

## Load Data

In [7]:
BASE_DIR = "../input/hungrygeeseepisode/hungry-geese-episode/"
OUTPUT_DIR = "pre-models/"

if not os.path.exists(OUTPUT_DIR):
    os.makedirs(OUTPUT_DIR)

In [8]:
paths = [path for path in glob.glob(BASE_DIR + "*.json") if "info" not in path]
print(len(paths))

18380


In [9]:
# fit for memory size...
paths = paths[-11000:]
# paths = paths[:-11000]
print(len(paths))

11000


In [10]:
if Config.debug:
    paths = paths[:10]

## Utils

In [11]:
@contextmanager
def timer(name):
    t0 = time.time()
    LOGGER.info(f"[{name}] start")
    yield
    LOGGER.info(f"[{name}] done in {time.time() - t0:.0f} s.")


def init_logger(log_file=OUTPUT_DIR + "train.log"):
    from logging import INFO, FileHandler, Formatter, StreamHandler, getLogger

    logger = getLogger(__name__)
    logger.setLevel(INFO)
    handler1 = StreamHandler()
    handler1.setFormatter(Formatter("%(message)s"))
    handler2 = FileHandler(filename=log_file)
    handler2.setFormatter(Formatter("%(message)s"))
    logger.addHandler(handler1)
    logger.addHandler(handler2)
    return logger


LOGGER = init_logger()


def seed_torch(seed=42):
    random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True


seed_torch(seed=Config.seed)

In [12]:
def reverse_ns(y):
    if y == 0:
        return 1
    if y == 1:
        return 0
    return y


def reverse_we(y):
    if y == 2:
        return 3
    if y == 3:
        return 2
    return y


def reverse_nswe(y):
    return reverse_ns(reverse_we(y))

## Observation

In [13]:
next_position_map = {}
for pos in range(77):
    position = []
    position.append((11 * (1 + pos // 11) + pos % 11) % 77)
    position.append((11 * (-1 + pos // 11) + pos % 11) % 77)
    position.append((11 * (pos // 11) + (pos + 1) % 11) % 77)
    position.append((11 * (pos // 11) + (pos - 1) % 11) % 77)
    next_position_map[pos] = set(position)

In [14]:
def make_input(obses):
    b = np.zeros((17, 7 * 11), dtype=np.float32)
    obs = obses[-1]

    for p, pos_list in enumerate(obs["geese"]):
        pid = (p - obs["index"]) % 4

        # head position
        for pos in pos_list[:1]:
            b[0 + pid, pos] = 1
        # tip position
        for pos in pos_list[-1:]:
            b[4 + pid, pos] = 1
        # whole position
        for pos in pos_list:
            b[8 + pid, pos] = 1

    # previous head position
    if len(obses) > 1:
        obs_prev = obses[-2]
        for p, pos_list in enumerate(obs_prev["geese"]):
            for pos in pos_list[:1]:
                b[12 + (p - obs["index"]) % 4, pos] = 1

    # food
    for pos in obs["food"]:
        b[16, pos] = 1

    return b.reshape(-1, 7, 11)

In [15]:
def get_reverse_cube(obses):
    """
    尻尾から順番に 1, 0.9, 0.8, ... という並び
    """
    b = np.zeros((4, 7 * 11), dtype=np.float32)
    obs = obses[-1]

    for p, geese in enumerate(obs["geese"]):
        # whole position reverse
        for num_reverse, pos in enumerate(geese[::-1]):
            b[(p - obs["index"]) % 4, pos] = 1 - num_reverse * 0.1

    return b.reshape(-1, 7, 11)

In [16]:
def get_next_disappear_cube(obses):
    """
    次になくなる場所: 1
    次になくなる可能性のある場所: 0.5
    """
    b = np.zeros((4, 7 * 11), dtype=np.float32)
    obs = obses[-1]
    step = obs["step"]

    # foodを食べる可能性があるか。
    eat_food_possibility = defaultdict(int)
    for p, geese in enumerate(obs["geese"]):
        for pos in geese[:1]:
            if not next_position_map[pos].isdisjoint(obs["food"]):
                eat_food_possibility[p] = 1

    if (step % 40) == 39:  # 1つ短くなる
        for p, geese in enumerate(obs["geese"]):
            if eat_food_possibility[p]:  # 尻尾が1、尻尾の１つ前0.5
                for pos in geese[-1:]:
                    b[(p - obs["index"]) % 4, pos] = 1
                for pos in geese[-2:-1]:
                    b[(p - obs["index"]) % 4, pos] = 0.5
            else:  # 食べる可能性なし -> 尻尾が1, 尻尾の1つ前1
                for pos in geese[-2:]:
                    b[(p - obs["index"]) % 4, pos] = 1
    else:  # 1つ短くならない
        for p, geese in enumerate(obs["geese"]):
            if eat_food_possibility[p]:  # 食べる可能性があり -> 尻尾を0.5
                for pos in geese[-1:]:
                    b[(p - obs["index"]) % 4, pos] = 0.5
            else:  # 食べる可能性なし # 尻尾を1
                for pos in geese[-1:]:
                    b[(p - obs["index"]) % 4, pos] = 1

    return b.reshape(-1, 7, 11)

In [17]:
def get_step_cube_v2(obses):
    """
    step0: 0, step199: 1
    step0: 0, step39 + 40n: 1
    """
    b = np.zeros((1, 7, 11), dtype=np.float32)
    obs = obses[-1]
    step = obs["step"]

    b[:, :, :5] = (step % 200) / 199
    b[:, :, 5:] = (step % 40) / 39

    return b

In [18]:
def get_length_cube(obses):
    b = np.zeros((2, 7, 11), dtype=np.float32)
    obs = obses[-1]

    my_length = len(obs["geese"][obs["index"]])
    opposite1_length = len(obs["geese"][(obs["index"] + 1) % 4])
    opposite2_length = len(obs["geese"][(obs["index"] + 2) % 4])
    opposite3_length = len(obs["geese"][(obs["index"] + 3) % 4])

    b[0] = my_length / 10
    max_opposite_length = max(opposite1_length, opposite2_length, opposite3_length)
    b[1, :, 0:2] = (my_length - max_opposite_length) / 10
    b[1, :, 2:5] = (my_length - opposite1_length) / 10
    b[1, :, 5:8] = (my_length - opposite2_length) / 10
    b[1, :, 8:11] = (my_length - opposite3_length) / 10

    return b

## Data

In [19]:
def create_dataset_from_json(filepath, json_object=None, standing=0):
    if json_object is None:
        json_open = open(path, "r")
        json_load = json.load(json_open)
    else:
        json_load = json_object

    try:
        winner_index = np.argmax(np.argsort(json_load["rewards"]) == 3 - standing)

        obses = []
        X = []
        y = []
        actions = {"NORTH": 0, "SOUTH": 1, "WEST": 2, "EAST": 3}

        for i in range(len(json_load["steps"]) - 1):
            if json_load["steps"][i][winner_index]["status"] == "ACTIVE":
                y_ = json_load["steps"][i + 1][winner_index]["action"]
                if y_ is not None:
                    step = json_load["steps"][i]
                    step[winner_index]["observation"]["geese"] = step[0]["observation"]["geese"]
                    step[winner_index]["observation"]["food"] = step[0]["observation"]["food"]
                    step[winner_index]["observation"]["step"] = step[0]["observation"]["step"]
                    obses.append(step[winner_index]["observation"])
                    y.append(actions[y_])

                    y.append(reverse_ns(actions[y_]))  # 上下反転
                    y.append(reverse_we(actions[y_]))  # 左右反転
                    y.append(reverse_nswe(actions[y_]))  # 上下左右反転

        for j in range(len(obses)):
            # X_ = make_input(obses[: j + 1])

            X_ = []
            X_.append(make_input(obses[: j + 1]))
            X_.append(get_reverse_cube(obses[: j + 1]))
            X_.append(get_next_disappear_cube(obses[: j + 1]))
            X_.append(get_step_cube_v2(obses[: j + 1]))
            X_.append(get_length_cube(obses[: j + 1]))
            X_ = np.concatenate(X_)

            X.append(X_)

            X.append(X_[:, ::-1, :])  # 上下反転
            X.append(X_[:, :, ::-1])  # 左右反転
            X.append(X_[:, ::-1, ::-1])  # 上下左右反転

        X = np.array(X, dtype=np.float16)  # [starting_step:]
        y = np.array(y, dtype=np.uint8)  # [starting_step:]

        return X, y
    except Exception as e:
        # raise Exception from e
        return 0, 0

In [20]:
X_train = []
y_train = []

for path in tqdm(paths[: int(len(paths))]):
    X, y = create_dataset_from_json(path, standing=0)  # use only winners' moves
    if X is not 0:
        X_train.append(X)
        y_train.append(y)

X_train = np.concatenate(X_train)
y_train = np.concatenate(y_train)

print(f"Num episode: {len(X_train)}")

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=11000.0), HTML(value='')))


Num episode: 6885564


In [21]:
unique_ = False

In [22]:
# %%time

# X_train, unique_index = np.unique(X_train, axis=0, return_index=True)  # remove duplicate
# y_train = y_train[unique_index]

# y_train = np.eye(4, dtype="uint8")[y_train]  # to categorical

# print(f"Num episode: {len(X_train)}")

In [23]:
if unique_:
    X_train_sum_obs = X_train.reshape(X_train.shape[0], -1).sum(1)
    X_train_group = np.unique(X_train_sum_obs)
    X_train_group.shape

In [24]:
if unique_:
    X_train_unique = []
    y_train_unique = []
    for group in tqdm(X_train_group):
        group_index = np.where(X_train_sum_obs == group)

        X_train_ = X_train[group_index]
        y_train_ = y_train[group_index]

        X_train_, unique_index = np.unique(X_train_, axis=0, return_index=True)  # remove duplicate
        y_train_ = y_train_[unique_index]

        X_train_unique.append(X_train_)
        y_train_unique.append(y_train_)

    X_train = np.concatenate(X_train_unique)
    y_train = np.concatenate(y_train_unique)

    print(f"Num episode: {len(X_train)}")

In [25]:
if unique_:
    del X_train_sum_obs
    del X_train_group
    del X_train_unique
    del y_train_unique
    del X_train_
    del y_train_
    del group_index
    del unique_index

In [26]:
X_train = X_train.astype(np.float32)
X_train.dtype

dtype('float32')

In [27]:
if Config.debug:
    X_train = X_train[:1000]
    y_train = y_train[:1000]

In [28]:
y_df = pd.DataFrame(y_train, dtype=np.uint8)
y_df.columns = ["action"]
y_df

Unnamed: 0,action
0,1
1,0
2,1
3,0
4,2
...,...
6885559,3
6885560,1
6885561,0
6885562,1


## CV Split

In [29]:
folds = y_df.copy()
Fold = StratifiedKFold(n_splits=Config.n_fold, shuffle=True, random_state=Config.seed)
for n, (train_index, val_index) in enumerate(Fold.split(folds, folds["action"])):
    folds.loc[val_index, "fold"] = int(n)
folds["fold"] = folds["fold"].astype(np.uint8)
print(folds.groupby(["fold", "action"]).size())

fold  action
0     0         164414
      1         164414
      2         179864
      3         179865
1     0         164414
      1         164414
      2         179864
      3         179865
2     0         164413
      1         164414
      2         179865
      3         179865
3     0         164413
      1         164414
      2         179865
      3         179865
4     0         164413
      1         164414
      2         179865
      3         179864
5     0         164413
      1         164414
      2         179865
      3         179864
6     0         164414
      1         164413
      2         179865
      3         179864
7     0         164414
      1         164413
      2         179865
      3         179864
8     0         164414
      1         164413
      2         179864
      3         179865
9     0         164414
      1         164413
      2         179864
      3         179865
dtype: int64


## Dataset

In [30]:
class TrainDataset(Dataset):
    def __init__(self, array, label):
        self.array = array
        self.label = label

    def __len__(self):
        return self.array.shape[0]

    def __getitem__(self, idx):
        return self.array[idx], torch.tensor(self.label[idx]).long()


class TestDataset(Dataset):
    def __init__(self, array):
        self.array = array

    def __len__(self):
        return self.array.shape[0]

    def __getitem__(self, idx):
        return self.array[idx]

In [31]:
# Test

if Config.debug or False:
    train_ds = TrainDataset(X_train, y_train)

    for i in range(1):
        obs, action = train_ds[i]
        print(obs.shape, action)

## Model

In [32]:
class TorusConv2d(nn.Module):
    def __init__(self, input_dim, output_dim, kernel_size, bn):
        super().__init__()
        self.edge_size = (kernel_size[0] // 2, kernel_size[1] // 2)
        self.conv = nn.Conv2d(input_dim, output_dim, kernel_size=kernel_size)
        self.bn = nn.BatchNorm2d(output_dim) if bn else None

    def forward(self, x):
        h = torch.cat([x[:, :, :, -self.edge_size[1] :], x, x[:, :, :, : self.edge_size[1]]], dim=3)
        h = torch.cat([h[:, :, -self.edge_size[0] :], h, h[:, :, : self.edge_size[0]]], dim=2)
        h = self.conv(h)
        h = self.bn(h) if self.bn is not None else h
        return h

In [33]:
class GeeseNetAlpha(nn.Module):
    def __init__(self):
        super().__init__()

        layers = Config.geese_net_layers
        filters = Config.geese_net_filters

        self.conv0 = TorusConv2d(28, filters, (3, 3), True)
        self.blocks = nn.ModuleList([TorusConv2d(filters, filters, (3, 3), True) for _ in range(layers)])

        self.conv_p = TorusConv2d(filters, filters, (3, 3), True)
        self.conv_v = TorusConv2d(filters, filters, (3, 3), True)

        self.head_p1 = nn.Linear(filters * 5 + 77, filters * 3, bias=False)
        self.head_p2 = nn.Linear(filters * 3, 4, bias=False)
        self.head_v1 = nn.Linear(filters * 5 + 77, filters * 3, bias=False)
        self.head_v2 = nn.Linear(filters * 3, 1, bias=False)

    def forward(self, x, _=None):
        h = F.relu_(self.conv0(x))
        for block in self.blocks:
            h = F.relu_(h + block(h))

        h_p = F.relu_(self.conv_p(h))
        h_head_p = (h_p * x[:, :1]).view(h_p.size(0), h_p.size(1), -1).sum(-1)
        h_head_p2 = (h_p * x[:, 1:2]).view(h_p.size(0), h_p.size(1), -1).sum(-1)
        h_head_p3 = (h_p * x[:, 2:3]).view(h_p.size(0), h_p.size(1), -1).sum(-1)
        h_head_p4 = (h_p * x[:, 3:4]).view(h_p.size(0), h_p.size(1), -1).sum(-1)
        h_avg_p1 = h_p.view(h_p.size(0), h_p.size(1), -1).mean(-1)
        h_avg_p2 = h_p.view(h_p.size(0), h_p.size(1), -1).mean(1)

        h_p = F.relu_(self.head_p1(torch.cat([h_head_p, h_head_p2, h_head_p3, h_head_p4, h_avg_p1, h_avg_p2], 1)))
        p = self.head_p2(h_p)

        h_v = F.relu_(self.conv_v(h))
        h_head_v = (h_v * x[:, :1]).view(h_v.size(0), h_v.size(1), -1).sum(-1)
        h_head_v2 = (h_v * x[:, 1:2]).view(h_v.size(0), h_v.size(1), -1).sum(-1)
        h_head_v3 = (h_v * x[:, 2:3]).view(h_v.size(0), h_v.size(1), -1).sum(-1)
        h_head_v4 = (h_v * x[:, 3:4]).view(h_v.size(0), h_v.size(1), -1).sum(-1)
        h_avg_v1 = h_v.view(h_v.size(0), h_v.size(1), -1).mean(-1)
        h_avg_v2 = h_v.view(h_v.size(0), h_v.size(1), -1).mean(1)

        h_v = F.relu_(self.head_v1(torch.cat([h_head_v, h_head_v2, h_head_v3, h_head_v4, h_avg_v1, h_avg_v2], 1)))
        v = torch.tanh(self.head_v2(h_v))

        return {"policy": p, "value": v}

In [34]:
# Test

if Config.debug or False:
    model = GeeseNetAlpha()
    # print(model)

    params = sum(p.numel() for p in model.parameters())
    print(f"params: {params:,}")

    train_ds = TrainDataset(X_train, y_train)
    train_loader = DataLoader(train_ds, batch_size=4, shuffle=True, num_workers=4, pin_memory=True, drop_last=True)

    for obs, action in train_loader:
        output = model(obs)
        print(output)
        print(f"{torch.argmax(output['policy'], dim=1)}")
        break

## Loss

## Scoring

In [35]:
def get_score(y_true, y_pred):
    return accuracy_score(y_true, y_pred)

In [36]:
def get_result(result_df):
    preds = result_df["preds"].values
    labels = result_df["action"].values
    score = get_score(labels, preds)
    LOGGER.info(f"Score: {score:<.5f}")
    return score

## Helper functions

In [37]:
class AverageMeter(object):
    """Computes and stores the average and current value"""

    def __init__(self):
        self.reset()

    def reset(self):
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count


def asMinutes(s):
    m = math.floor(s / 60)
    s -= m * 60
    return "%dm %ds" % (m, s)


def timeSince(since, percent):
    now = time.time()
    s = now - since
    es = s / (percent)
    rs = es - s
    return "%s (remain %s)" % (asMinutes(s), asMinutes(rs))

In [38]:
def train_fn(train_loader, model, criterion, optimizer, epoch, scheduler, device):
    losses = AverageMeter()

    # switch to train mode
    model.train()
    start = time.time()

    for step, (obs, action) in enumerate(train_loader):
        obs = obs.to(device)
        action = action.to(device)
        batch_size = action.size(0)

        y_preds = model(obs.float())["policy"]

        loss = criterion(y_preds, action)
        losses.update(loss.item(), batch_size)
        if Config.gradient_accumulation_steps > 1:
            loss = loss / Config.gradient_accumulation_steps
        if Config.apex:
            with amp.scale_loss(loss, optimizer) as scaled_loss:
                scaled_loss.backward()
        else:
            loss.backward()

        grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), Config.max_grad_norm)

        if (step + 1) % Config.gradient_accumulation_steps == 0:
            optimizer.step()
            optimizer.zero_grad()

        if step % Config.print_freq == 0 or step == (len(train_loader) - 1):
            print(
                f"Epoch: [{epoch + 1}][{step}/{len(train_loader)}] "
                f"Elapsed {timeSince(start, float(step + 1) / len(train_loader)):s} "
                f"Loss avg.: {losses.avg:.4f} "
                f"Grad: {grad_norm:.4f} "
                f"LR: {scheduler.get_last_lr()[0]:.5f}  "
            )

    return losses.avg

In [39]:
def valid_fn(valid_loader, model, criterion, device):
    losses = AverageMeter()

    # switch to evaluation mode
    model.eval()
    preds = []
    start = time.time()

    for step, (obs, action) in enumerate(valid_loader):
        obs = obs.to(device)
        action = action.to(device)
        batch_size = action.size(0)

        # compute loss
        with torch.no_grad():
            y_preds = model(obs)["policy"]

        loss = criterion(y_preds, action)
        losses.update(loss.item(), batch_size)

        # record accuracy
        preds.append(y_preds.softmax(1).to("cpu").numpy())
        if Config.gradient_accumulation_steps > 1:
            loss = loss / Config.gradient_accumulation_steps

        if step % Config.print_freq == 0 or step == (len(valid_loader) - 1):
            print(
                f"Eval: [{step}/{len(valid_loader)}] "
                f"Elapsed {timeSince(start, float(step + 1) / len(valid_loader)):s} "
                f"Loss avg.: {losses.avg:.4f} "
            )
    predictions = np.concatenate(preds)
    return losses.avg, predictions

## Train loop

In [40]:
def train_loop(folds, fold):

    LOGGER.info(f"========== fold: {fold} training ==========")

    # ====================================================
    # Data Loader
    # ====================================================
    # X_train_folds = X_train[folds["fold"] != fold]
    # X_valid_folds = X_train[folds["fold"] == fold]

    # y_train_folds = y_train[folds["fold"] != fold]
    y_valid_folds = y_train[folds["fold"] == fold]

    # y_df_train_folds = y_df[folds["fold"] != fold]
    y_df_valid_folds = y_df[folds["fold"] == fold]

    # train_dataset = TrainDataset(X_train[folds["fold"] != fold], y_train[folds["fold"] != fold])
    # valid_dataset = TrainDataset(X_train[folds["fold"] == fold], y_valid_folds)

    train_loader = DataLoader(
        TrainDataset(X_train[folds["fold"] != fold], y_train[folds["fold"] != fold]),
        batch_size=Config.batch_size,
        shuffle=True,
        num_workers=Config.num_workers,
        pin_memory=True,
        drop_last=True,
    )
    valid_loader = DataLoader(
        TrainDataset(X_train[folds["fold"] == fold], y_valid_folds),
        batch_size=Config.batch_size,
        shuffle=False,
        num_workers=Config.num_workers,
        pin_memory=True,
        drop_last=False,
    )

    # ====================================================
    # Scheduler
    # ====================================================
    def get_scheduler(optimizer):
        if Config.scheduler == "ReduceLROnPlateau":
            scheduler = ReduceLROnPlateau(
                optimizer, mode="min", factor=Config.factor, patience=Config.patience, verbose=True, eps=Config.eps
            )
        elif Config.scheduler == "CosineAnnealingLR":
            scheduler = CosineAnnealingLR(optimizer, T_max=Config.T_max, eta_min=Config.min_lr, last_epoch=-1)
        elif Config.scheduler == "CosineAnnealingWarmRestarts":
            scheduler = CosineAnnealingWarmRestarts(
                optimizer, T_0=Config.T_0, T_mult=1, eta_min=Config.min_lr, last_epoch=-1
            )
        return scheduler

    # ====================================================
    # model & optimizer
    # ====================================================
    model = GeeseNetAlpha()
    try:
        model.load_state_dict(torch.load(os.path.join(OUTPUT_DIR, Config.pre_train_file)))
    except:
        print(f"Failed to load pre-train weight.")

    # Disable training for value network
    for param in model.conv_v.parameters():
        param.requires_grad = False
    for param in model.head_v1.parameters():
        param.requires_grad = False
    for param in model.head_v2.parameters():
        param.requires_grad = False

    model.to(device)

    # Use multi GPU
    if device == torch.device("cuda") and not Config.apex:
        model = torch.nn.DataParallel(model)  # make parallel

    optimizer = Adam(model.parameters(), lr=Config.lr, weight_decay=Config.weight_decay, amsgrad=False)
    scheduler = get_scheduler(optimizer)

    # ====================================================
    # apex
    # ====================================================
    if Config.apex:
        model, optimizer = amp.initialize(model, optimizer, opt_level="O1", verbosity=0)

    # ====================================================
    # Criterion
    # ====================================================
    def get_criterion():
        if Config.criterion == "CrossEntropyLoss":
            criterion = nn.CrossEntropyLoss()
        return criterion

    criterion = get_criterion()

    # ====================================================
    # loop
    # ====================================================
    best_score = 0.0
    best_loss = np.inf
    best_preds = None

    for epoch in range(Config.epochs):

        start_time = time.time()

        # train
        avg_loss = train_fn(train_loader, model, criterion, optimizer, epoch, scheduler, device)

        # eval
        avg_val_loss, preds = valid_fn(valid_loader, model, criterion, device)

        if isinstance(scheduler, ReduceLROnPlateau):
            scheduler.step(avg_val_loss)
        elif isinstance(scheduler, CosineAnnealingLR):
            scheduler.step()
        elif isinstance(scheduler, CosineAnnealingWarmRestarts):
            scheduler.step()

        # scoring
        score = get_score(y_valid_folds, preds.argmax(1))

        elapsed = time.time() - start_time

        LOGGER.info(
            f"Epoch {epoch+1} - avg_train_loss: {avg_loss:.4f}  avg_val_loss: {avg_val_loss:.4f}  time: {elapsed:.0f}s"
        )
        LOGGER.info(f"Epoch {epoch+1} - Accuracy: {score}")

        if score > best_score:
            best_score = score
            LOGGER.info(f"Epoch {epoch+1} - Save Best Score: {best_score:.4f} Model")
            torch.save(model.module.state_dict(), OUTPUT_DIR + f"{Config.model_name}_fold{fold}_best.pth")
            best_preds = preds

        if epoch == Config.epochs - 1:
            LOGGER.info(f"Epoch {epoch+1} - Save final model")
            torch.save(model.module.state_dict(), OUTPUT_DIR + f"{Config.model_name}_fold{fold}_final.pth")

    if Config.train:
        y_df_valid_folds[[str(c) for c in range(Config.n_class)]] = best_preds
        y_df_valid_folds["preds"] = best_preds.argmax(1)

        return y_df_valid_folds

    if Config.tuning:
        score = get_score(y_df_valid_folds["action"].values, best_preds.argmax(1))
        return score

In [41]:
def objective(trial):
    Config.geese_net_layers = trial.suggest_int("layers", 6, 18)
    Config.geese_net_filters = trial.suggest_int("filters", 32, 128)

    score = train_loop(folds, 0)
    return score

## Main


In [42]:
def main():
    if Config.train:
        # train
        oof_df = pd.DataFrame()
        for fold in range(Config.n_fold):
            _oof_df = train_loop(folds, fold)
            oof_df = pd.concat([oof_df, _oof_df])
            LOGGER.info(f"========== fold: {fold} result ==========")
            get_result(_oof_df)
            break  # fold 1つだけ
        # CV result
        # LOGGER.info(f"========== CV ==========")
        # get_result(oof_df)
        # save result
        oof_df.to_csv(OUTPUT_DIR + "oof_df.csv", index=False)

    if Config.tuning:
        study = optuna.create_study(direction="maximize")
        study.optimize(objective, n_trials=10)

        trial = study.best_trial
        print("Best trial:")
        print("  Value: ", trial.value)
        print("  Params: ")
        for key, value in trial.params.items():
            print("    {}: {}".format(key, value))

In [43]:
if __name__ == "__main__":
    main()



Epoch: [1][0/1936] Elapsed 0m 5s (remain 176m 17s) Loss avg.: 0.4478 Grad: 0.2341 LR: 0.00100  
Epoch: [1][100/1936] Elapsed 1m 17s (remain 23m 22s) Loss avg.: 0.4619 Grad: 0.3070 LR: 0.00100  
Epoch: [1][200/1936] Elapsed 2m 28s (remain 21m 24s) Loss avg.: 0.4617 Grad: 0.2188 LR: 0.00100  
Epoch: [1][300/1936] Elapsed 3m 40s (remain 19m 59s) Loss avg.: 0.4609 Grad: 0.3170 LR: 0.00100  
Epoch: [1][400/1936] Elapsed 4m 52s (remain 18m 39s) Loss avg.: 0.4609 Grad: 0.2148 LR: 0.00100  
Epoch: [1][500/1936] Elapsed 6m 4s (remain 17m 22s) Loss avg.: 0.4606 Grad: 0.2329 LR: 0.00100  
Epoch: [1][600/1936] Elapsed 7m 15s (remain 16m 7s) Loss avg.: 0.4602 Grad: 0.2047 LR: 0.00100  
Epoch: [1][700/1936] Elapsed 8m 27s (remain 14m 53s) Loss avg.: 0.4599 Grad: 0.2868 LR: 0.00100  
Epoch: [1][800/1936] Elapsed 9m 38s (remain 13m 40s) Loss avg.: 0.4598 Grad: 0.3099 LR: 0.00100  
Epoch: [1][900/1936] Elapsed 10m 50s (remain 12m 27s) Loss avg.: 0.4594 Grad: 0.2018 LR: 0.00100  
Epoch: [1][1000/1936] E

Epoch 1 - avg_train_loss: 0.4576  avg_val_loss: 0.4611  time: 1439s
Epoch 1 - Accuracy: 0.799509699269632
Epoch 1 - Save Best Score: 0.7995 Model


Epoch: [2][0/1936] Elapsed 0m 1s (remain 64m 29s) Loss avg.: 0.4712 Grad: 0.1920 LR: 0.00098  
Epoch: [2][100/1936] Elapsed 1m 13s (remain 22m 20s) Loss avg.: 0.4498 Grad: 0.2377 LR: 0.00098  
Epoch: [2][200/1936] Elapsed 2m 25s (remain 20m 55s) Loss avg.: 0.4512 Grad: 0.2140 LR: 0.00098  
Epoch: [2][300/1936] Elapsed 3m 37s (remain 19m 39s) Loss avg.: 0.4512 Grad: 0.1962 LR: 0.00098  
Epoch: [2][400/1936] Elapsed 4m 48s (remain 18m 25s) Loss avg.: 0.4510 Grad: 0.1867 LR: 0.00098  
Epoch: [2][500/1936] Elapsed 6m 0s (remain 17m 13s) Loss avg.: 0.4513 Grad: 0.1819 LR: 0.00098  
Epoch: [2][600/1936] Elapsed 7m 12s (remain 16m 0s) Loss avg.: 0.4513 Grad: 0.1808 LR: 0.00098  
Epoch: [2][700/1936] Elapsed 8m 24s (remain 14m 48s) Loss avg.: 0.4514 Grad: 0.2159 LR: 0.00098  
Epoch: [2][800/1936] Elapsed 9m 35s (remain 13m 35s) Loss avg.: 0.4514 Grad: 0.1916 LR: 0.00098  
Epoch: [2][900/1936] Elapsed 10m 47s (remain 12m 23s) Loss avg.: 0.4515 Grad: 0.1855 LR: 0.00098  
Epoch: [2][1000/1936] El

Epoch 2 - avg_train_loss: 0.4515  avg_val_loss: 0.4544  time: 1436s
Epoch 2 - Accuracy: 0.8024912970168047
Epoch 2 - Save Best Score: 0.8025 Model


Epoch: [3][0/1936] Elapsed 0m 2s (remain 65m 8s) Loss avg.: 0.4496 Grad: 0.1786 LR: 0.00091  
Epoch: [3][100/1936] Elapsed 1m 13s (remain 22m 19s) Loss avg.: 0.4475 Grad: 0.2071 LR: 0.00091  
Epoch: [3][200/1936] Elapsed 2m 25s (remain 20m 55s) Loss avg.: 0.4462 Grad: 0.2117 LR: 0.00091  
Epoch: [3][300/1936] Elapsed 3m 37s (remain 19m 38s) Loss avg.: 0.4460 Grad: 0.2263 LR: 0.00091  
Epoch: [3][400/1936] Elapsed 4m 48s (remain 18m 25s) Loss avg.: 0.4460 Grad: 0.2244 LR: 0.00091  
Epoch: [3][500/1936] Elapsed 6m 0s (remain 17m 12s) Loss avg.: 0.4465 Grad: 0.2627 LR: 0.00091  
Epoch: [3][600/1936] Elapsed 7m 11s (remain 15m 59s) Loss avg.: 0.4466 Grad: 0.1825 LR: 0.00091  
Epoch: [3][700/1936] Elapsed 8m 23s (remain 14m 47s) Loss avg.: 0.4470 Grad: 0.2040 LR: 0.00091  
Epoch: [3][800/1936] Elapsed 9m 35s (remain 13m 35s) Loss avg.: 0.4475 Grad: 0.2215 LR: 0.00091  
Epoch: [3][900/1936] Elapsed 10m 47s (remain 12m 23s) Loss avg.: 0.4476 Grad: 0.1812 LR: 0.00091  
Epoch: [3][1000/1936] El

Epoch 3 - avg_train_loss: 0.4477  avg_val_loss: 0.4536  time: 1435s
Epoch 3 - Accuracy: 0.8032740935027892
Epoch 3 - Save Best Score: 0.8033 Model


Epoch: [4][0/1936] Elapsed 0m 1s (remain 63m 10s) Loss avg.: 0.4657 Grad: 0.2071 LR: 0.00081  
Epoch: [4][100/1936] Elapsed 1m 13s (remain 22m 22s) Loss avg.: 0.4442 Grad: 0.1747 LR: 0.00081  
Epoch: [4][200/1936] Elapsed 2m 25s (remain 20m 56s) Loss avg.: 0.4432 Grad: 0.2139 LR: 0.00081  
Epoch: [4][300/1936] Elapsed 3m 37s (remain 19m 39s) Loss avg.: 0.4427 Grad: 0.2007 LR: 0.00081  
Epoch: [4][400/1936] Elapsed 4m 48s (remain 18m 25s) Loss avg.: 0.4430 Grad: 0.2252 LR: 0.00081  
Epoch: [4][500/1936] Elapsed 6m 0s (remain 17m 12s) Loss avg.: 0.4435 Grad: 0.1961 LR: 0.00081  
Epoch: [4][600/1936] Elapsed 7m 12s (remain 15m 59s) Loss avg.: 0.4437 Grad: 0.2427 LR: 0.00081  
Epoch: [4][700/1936] Elapsed 8m 23s (remain 14m 47s) Loss avg.: 0.4438 Grad: 0.1674 LR: 0.00081  
Epoch: [4][800/1936] Elapsed 9m 35s (remain 13m 35s) Loss avg.: 0.4436 Grad: 0.1667 LR: 0.00081  
Epoch: [4][900/1936] Elapsed 10m 47s (remain 12m 23s) Loss avg.: 0.4439 Grad: 0.1885 LR: 0.00081  
Epoch: [4][1000/1936] E

Epoch 4 - avg_train_loss: 0.4442  avg_val_loss: 0.4518  time: 1436s
Epoch 4 - Accuracy: 0.8039697512333764
Epoch 4 - Save Best Score: 0.8040 Model


Epoch: [5][0/1936] Elapsed 0m 1s (remain 63m 46s) Loss avg.: 0.4601 Grad: 0.2066 LR: 0.00069  
Epoch: [5][100/1936] Elapsed 1m 13s (remain 22m 17s) Loss avg.: 0.4415 Grad: 0.1952 LR: 0.00069  
Epoch: [5][200/1936] Elapsed 2m 25s (remain 20m 53s) Loss avg.: 0.4404 Grad: 0.1905 LR: 0.00069  
Epoch: [5][300/1936] Elapsed 3m 37s (remain 19m 38s) Loss avg.: 0.4401 Grad: 0.1554 LR: 0.00069  
Epoch: [5][400/1936] Elapsed 4m 48s (remain 18m 24s) Loss avg.: 0.4400 Grad: 0.1550 LR: 0.00069  
Epoch: [5][500/1936] Elapsed 6m 0s (remain 17m 11s) Loss avg.: 0.4399 Grad: 0.1891 LR: 0.00069  
Epoch: [5][600/1936] Elapsed 7m 11s (remain 15m 59s) Loss avg.: 0.4398 Grad: 0.1952 LR: 0.00069  
Epoch: [5][700/1936] Elapsed 8m 23s (remain 14m 46s) Loss avg.: 0.4400 Grad: 0.1751 LR: 0.00069  
Epoch: [5][800/1936] Elapsed 9m 34s (remain 13m 34s) Loss avg.: 0.4401 Grad: 0.1506 LR: 0.00069  
Epoch: [5][900/1936] Elapsed 10m 46s (remain 12m 22s) Loss avg.: 0.4401 Grad: 0.1785 LR: 0.00069  
Epoch: [5][1000/1936] E

Epoch 5 - avg_train_loss: 0.4408  avg_val_loss: 0.4486  time: 1435s
Epoch 5 - Accuracy: 0.8054089930100196
Epoch 5 - Save Best Score: 0.8054 Model


Epoch: [6][0/1936] Elapsed 0m 1s (remain 62m 34s) Loss avg.: 0.4273 Grad: 0.2411 LR: 0.00055  
Epoch: [6][100/1936] Elapsed 1m 13s (remain 22m 18s) Loss avg.: 0.4349 Grad: 0.1760 LR: 0.00055  
Epoch: [6][200/1936] Elapsed 2m 25s (remain 20m 54s) Loss avg.: 0.4351 Grad: 0.2150 LR: 0.00055  
Epoch: [6][300/1936] Elapsed 3m 36s (remain 19m 38s) Loss avg.: 0.4344 Grad: 0.1798 LR: 0.00055  
Epoch: [6][400/1936] Elapsed 4m 48s (remain 18m 24s) Loss avg.: 0.4348 Grad: 0.3159 LR: 0.00055  
Epoch: [6][500/1936] Elapsed 6m 0s (remain 17m 11s) Loss avg.: 0.4350 Grad: 0.2057 LR: 0.00055  
Epoch: [6][600/1936] Elapsed 7m 12s (remain 15m 59s) Loss avg.: 0.4354 Grad: 0.1798 LR: 0.00055  
Epoch: [6][700/1936] Elapsed 8m 23s (remain 14m 47s) Loss avg.: 0.4356 Grad: 0.2162 LR: 0.00055  
Epoch: [6][800/1936] Elapsed 9m 35s (remain 13m 35s) Loss avg.: 0.4358 Grad: 0.2098 LR: 0.00055  
Epoch: [6][900/1936] Elapsed 10m 46s (remain 12m 23s) Loss avg.: 0.4358 Grad: 0.1857 LR: 0.00055  
Epoch: [6][1000/1936] E

Epoch 6 - avg_train_loss: 0.4371  avg_val_loss: 0.4468  time: 1436s
Epoch 6 - Accuracy: 0.8064009225089571
Epoch 6 - Save Best Score: 0.8064 Model


Epoch: [7][0/1936] Elapsed 0m 1s (remain 62m 21s) Loss avg.: 0.4284 Grad: 0.1852 LR: 0.00041  
Epoch: [7][100/1936] Elapsed 1m 13s (remain 22m 18s) Loss avg.: 0.4310 Grad: 0.2139 LR: 0.00041  
Epoch: [7][200/1936] Elapsed 2m 25s (remain 20m 54s) Loss avg.: 0.4314 Grad: 0.1808 LR: 0.00041  
Epoch: [7][300/1936] Elapsed 3m 37s (remain 19m 38s) Loss avg.: 0.4316 Grad: 0.2047 LR: 0.00041  
Epoch: [7][400/1936] Elapsed 4m 48s (remain 18m 25s) Loss avg.: 0.4315 Grad: 0.1875 LR: 0.00041  
Epoch: [7][500/1936] Elapsed 6m 0s (remain 17m 12s) Loss avg.: 0.4316 Grad: 0.1849 LR: 0.00041  
Epoch: [7][600/1936] Elapsed 7m 12s (remain 15m 59s) Loss avg.: 0.4322 Grad: 0.2330 LR: 0.00041  
Epoch: [7][700/1936] Elapsed 8m 23s (remain 14m 47s) Loss avg.: 0.4323 Grad: 0.2176 LR: 0.00041  
Epoch: [7][800/1936] Elapsed 9m 35s (remain 13m 35s) Loss avg.: 0.4322 Grad: 0.2275 LR: 0.00041  
Epoch: [7][900/1936] Elapsed 10m 47s (remain 12m 23s) Loss avg.: 0.4324 Grad: 0.2219 LR: 0.00041  
Epoch: [7][1000/1936] E

Epoch 7 - avg_train_loss: 0.4331  avg_val_loss: 0.4462  time: 1435s
Epoch 7 - Accuracy: 0.8071662912438622
Epoch 7 - Save Best Score: 0.8072 Model


Epoch: [8][0/1936] Elapsed 0m 1s (remain 63m 8s) Loss avg.: 0.4204 Grad: 0.1881 LR: 0.00029  
Epoch: [8][100/1936] Elapsed 1m 13s (remain 22m 19s) Loss avg.: 0.4272 Grad: 0.1965 LR: 0.00029  
Epoch: [8][200/1936] Elapsed 2m 25s (remain 20m 56s) Loss avg.: 0.4283 Grad: 0.1775 LR: 0.00029  
Epoch: [8][300/1936] Elapsed 3m 37s (remain 19m 39s) Loss avg.: 0.4284 Grad: 0.1960 LR: 0.00029  
Epoch: [8][400/1936] Elapsed 4m 48s (remain 18m 25s) Loss avg.: 0.4286 Grad: 0.2252 LR: 0.00029  
Epoch: [8][500/1936] Elapsed 6m 0s (remain 17m 12s) Loss avg.: 0.4286 Grad: 0.2100 LR: 0.00029  
Epoch: [8][600/1936] Elapsed 7m 12s (remain 16m 0s) Loss avg.: 0.4283 Grad: 0.1855 LR: 0.00029  
Epoch: [8][700/1936] Elapsed 8m 23s (remain 14m 47s) Loss avg.: 0.4285 Grad: 0.1920 LR: 0.00029  
Epoch: [8][800/1936] Elapsed 9m 35s (remain 13m 35s) Loss avg.: 0.4284 Grad: 0.1992 LR: 0.00029  
Epoch: [8][900/1936] Elapsed 10m 47s (remain 12m 23s) Loss avg.: 0.4284 Grad: 0.1991 LR: 0.00029  
Epoch: [8][1000/1936] Ela

Epoch 8 - avg_train_loss: 0.4290  avg_val_loss: 0.4462  time: 1436s
Epoch 8 - Accuracy: 0.8073333071917067
Epoch 8 - Save Best Score: 0.8073 Model


Epoch: [9][0/1936] Elapsed 0m 2s (remain 72m 58s) Loss avg.: 0.4319 Grad: 0.1814 LR: 0.00019  
Epoch: [9][100/1936] Elapsed 1m 14s (remain 22m 25s) Loss avg.: 0.4265 Grad: 0.1940 LR: 0.00019  
Epoch: [9][200/1936] Elapsed 2m 25s (remain 20m 57s) Loss avg.: 0.4240 Grad: 0.2050 LR: 0.00019  
Epoch: [9][300/1936] Elapsed 3m 37s (remain 19m 40s) Loss avg.: 0.4241 Grad: 0.2091 LR: 0.00019  
Epoch: [9][400/1936] Elapsed 4m 49s (remain 18m 27s) Loss avg.: 0.4241 Grad: 0.1983 LR: 0.00019  
Epoch: [9][500/1936] Elapsed 6m 0s (remain 17m 13s) Loss avg.: 0.4242 Grad: 0.2078 LR: 0.00019  
Epoch: [9][600/1936] Elapsed 7m 12s (remain 16m 0s) Loss avg.: 0.4247 Grad: 0.1884 LR: 0.00019  
Epoch: [9][700/1936] Elapsed 8m 24s (remain 14m 48s) Loss avg.: 0.4247 Grad: 0.2178 LR: 0.00019  
Epoch: [9][800/1936] Elapsed 9m 35s (remain 13m 35s) Loss avg.: 0.4246 Grad: 0.2100 LR: 0.00019  
Epoch: [9][900/1936] Elapsed 10m 47s (remain 12m 23s) Loss avg.: 0.4245 Grad: 0.2236 LR: 0.00019  
Epoch: [9][1000/1936] El

Epoch 9 - avg_train_loss: 0.4250  avg_val_loss: 0.4451  time: 1436s
Epoch 9 - Accuracy: 0.8076470067111364
Epoch 9 - Save Best Score: 0.8076 Model


Epoch: [10][0/1936] Elapsed 0m 1s (remain 62m 21s) Loss avg.: 0.4161 Grad: 0.2179 LR: 0.00012  
Epoch: [10][100/1936] Elapsed 1m 13s (remain 22m 18s) Loss avg.: 0.4183 Grad: 0.1958 LR: 0.00012  
Epoch: [10][200/1936] Elapsed 2m 25s (remain 20m 54s) Loss avg.: 0.4196 Grad: 0.1963 LR: 0.00012  
Epoch: [10][300/1936] Elapsed 3m 37s (remain 19m 38s) Loss avg.: 0.4201 Grad: 0.2054 LR: 0.00012  
Epoch: [10][400/1936] Elapsed 4m 48s (remain 18m 25s) Loss avg.: 0.4204 Grad: 0.2254 LR: 0.00012  
Epoch: [10][500/1936] Elapsed 6m 0s (remain 17m 12s) Loss avg.: 0.4204 Grad: 0.2252 LR: 0.00012  
Epoch: [10][600/1936] Elapsed 7m 12s (remain 16m 0s) Loss avg.: 0.4207 Grad: 0.2209 LR: 0.00012  
Epoch: [10][700/1936] Elapsed 8m 23s (remain 14m 47s) Loss avg.: 0.4208 Grad: 0.1987 LR: 0.00012  
Epoch: [10][800/1936] Elapsed 9m 35s (remain 13m 35s) Loss avg.: 0.4212 Grad: 0.2005 LR: 0.00012  
Epoch: [10][900/1936] Elapsed 10m 47s (remain 12m 23s) Loss avg.: 0.4213 Grad: 0.2116 LR: 0.00012  
Epoch: [10][10

Epoch 10 - avg_train_loss: 0.4220  avg_val_loss: 0.4456  time: 1436s
Epoch 10 - Accuracy: 0.807446587573723
Epoch 10 - Save final model
Score: 0.80765
