# About this notebook ...

## Library

In [1]:
import glob
import json
import math
import os
import random
import time
import warnings
from contextlib import contextmanager

import numpy as np
import optuna
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedKFold
from torch.optim import SGD, Adam
from torch.optim.lr_scheduler import CosineAnnealingLR, CosineAnnealingWarmRestarts, ReduceLROnPlateau
from torch.utils.data import DataLoader, Dataset
from tqdm.notebook import tqdm

In [2]:
warnings.filterwarnings("ignore")

## Load Data

In [3]:
BASE_DIR = "../input/hungrygeeseepisode/hungry-geese-episode/"
OUTPUT_DIR = "pre-models/"

if not os.path.exists(OUTPUT_DIR):
    os.makedirs(OUTPUT_DIR)

In [4]:
paths = [path for path in glob.glob(BASE_DIR + "*.json") if "info" not in path]
print(len(paths))

16521


## Config

In [5]:
class Config:
    seed = 440

    n_class = 4
    n_fold = 10

    geese_net_layers = 12
    geese_net_filters = 96

    gradient_accumulation_steps = 1
    max_grad_norm = 1000

    num_workers = 4
    batch_size = 3200

    scheduler = "CosineAnnealingWarmRestarts"
    # factor = 0.2  # ReduceLROnPlateau
    # patience = 4  # ReduceLROnPlateau
    # eps = 1e-6  # ReduceLROnPlateau
    # T_max = 10  # CosineAnnealingLR
    T_0 = 10  # CosineAnnealingWarmRestarts

    criterion = "CrossEntropyLoss"
    lr = 1e-3
    min_lr = 1e-4
    weight_decay = 1e-5

    epochs = 10
    model_name = "geese_net"

    print_freq = 100

    train = True
    tuning = False
    debug = False
    apex = False

In [6]:
if Config.tuning:
    Config.epochs = 2

if Config.debug:
    Config.epochs = 1

In [7]:
if Config.apex:
    from apex import amp

In [8]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

## Utils

In [9]:
@contextmanager
def timer(name):
    t0 = time.time()
    LOGGER.info(f"[{name}] start")
    yield
    LOGGER.info(f"[{name}] done in {time.time() - t0:.0f} s.")


def init_logger(log_file=OUTPUT_DIR + "train.log"):
    from logging import INFO, FileHandler, Formatter, StreamHandler, getLogger

    logger = getLogger(__name__)
    logger.setLevel(INFO)
    handler1 = StreamHandler()
    handler1.setFormatter(Formatter("%(message)s"))
    handler2 = FileHandler(filename=log_file)
    handler2.setFormatter(Formatter("%(message)s"))
    logger.addHandler(handler1)
    logger.addHandler(handler2)
    return logger


LOGGER = init_logger()


def seed_torch(seed=42):
    random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True


seed_torch(seed=Config.seed)

In [10]:
def reverse_ns(y):
    if y == 0:
        return 1
    if y == 1:
        return 0
    return y


def reverse_we(y):
    if y == 2:
        return 3
    if y == 3:
        return 2
    return y


def reverse_nswe(y):
    return reverse_ns(reverse_we(y))

## Observation

In [11]:
def make_input(obses):
    b = np.zeros((17, 7 * 11), dtype=np.float32)
    obs = obses[-1]

    for p, pos_list in enumerate(obs["geese"]):
        pid = (p - obs["index"]) % 4

        # head position
        for pos in pos_list[:1]:
            b[0 + pid, pos] = 1
        # tip position
        for pos in pos_list[-1:]:
            b[4 + pid, pos] = 1
        # whole position
        for pos in pos_list:
            b[8 + pid, pos] = 1

    # previous head position
    if len(obses) > 1:
        obs_prev = obses[-2]
        for p, pos_list in enumerate(obs_prev["geese"]):
            for pos in pos_list[:1]:
                b[12 + (p - obs["index"]) % 4, pos] = 1

    # food
    for pos in obs["food"]:
        b[16, pos] = 1

    return b.reshape(-1, 7, 11)

In [12]:
def observation_num_step(obses):
    b = np.zeros((7, 11), dtype=np.float32)
    obs = obses[-1]

    b[0, 0] = obs["step"]  # 0-198

    return b.reshape(1, 7, 11)

## Data

In [13]:
def create_dataset_from_json(filepath, json_object=None, standing=0):
    if json_object is None:
        json_open = open(path, "r")
        json_load = json.load(json_open)
    else:
        json_load = json_object

    try:
        winner_index = np.argmax(np.argsort(json_load["rewards"]) == 3 - standing)

        obses = []
        X = []
        y = []
        actions = {"NORTH": 0, "SOUTH": 1, "WEST": 2, "EAST": 3}

        for i in range(len(json_load["steps"]) - 1):
            if json_load["steps"][i][winner_index]["status"] == "ACTIVE":
                y_ = json_load["steps"][i + 1][winner_index]["action"]
                if y_ is not None:
                    step = json_load["steps"][i]
                    step[winner_index]["observation"]["geese"] = step[0]["observation"]["geese"]
                    step[winner_index]["observation"]["food"] = step[0]["observation"]["food"]
                    step[winner_index]["observation"]["step"] = step[0]["observation"]["step"]
                    obses.append(step[winner_index]["observation"])
                    y.append(actions[y_])

                    y.append(reverse_ns(actions[y_]))  # 上下反転
                    y.append(reverse_we(actions[y_]))  # 左右反転
                    y.append(reverse_nswe(actions[y_]))  # 上下左右反転

        for j in range(len(obses)):
            # X_ = make_input(obses[: j + 1])

            X_ = []
            X_.append(make_input(obses[: j + 1]))
            # X_.append(observation_num_step(obses[: j + 1]))
            X_ = np.concatenate(X_)

            X.append(X_)

            X.append(X_[:, ::-1, :])  # 上下反転
            X.append(X_[:, :, ::-1])  # 左右反転
            X.append(X_[:, ::-1, ::-1])  # 上下左右反転

        X = np.array(X, dtype=np.uint8)  # [starting_step:]
        y = np.array(y, dtype=np.uint8)  # [starting_step:]

        return X, y
    except:
        return 0, 0

In [14]:
X_train = []
y_train = []

for path in tqdm(paths[: int(len(paths))]):
    X, y = create_dataset_from_json(path, standing=0)  # use only winners' moves
    if X is not 0:
        X_train.append(X)
        y_train.append(y)

X_train = np.concatenate(X_train)
y_train = np.concatenate(y_train)

print(f"Num episode: {len(X_train)}")

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=16521.0), HTML(value='')))


Num episode: 10343040


In [15]:
# %%time

# X_train, unique_index = np.unique(X_train, axis=0, return_index=True)  # remove duplicate
# y_train = y_train[unique_index]

# y_train = np.eye(4, dtype="uint8")[y_train]  # to categorical

# print(f"Num episode: {len(X_train)}")

In [16]:
X_train_sum_obs = X_train.reshape(X_train.shape[0], -1).sum(1)
X_train_group = np.unique(X_train_sum_obs)
X_train_group.shape

(75,)

In [17]:
X_train_unique = []
y_train_unique = []
for group in tqdm(X_train_group):
    group_index = np.where(X_train_sum_obs == group)

    X_train_ = X_train[group_index]
    y_train_ = y_train[group_index]

    X_train_, unique_index = np.unique(X_train_, axis=0, return_index=True)  # remove duplicate
    y_train_ = y_train_[unique_index]

    X_train_unique.append(X_train_)
    y_train_unique.append(y_train_)

X_train = np.concatenate(X_train_unique)
y_train = np.concatenate(y_train_unique)

print(f"Num episode: {len(X_train)}")

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=75.0), HTML(value='')))


Num episode: 10341154


In [18]:
del X_train_sum_obs
del X_train_group
del X_train_unique
del y_train_unique
del X_train_
del y_train_
del group_index
del unique_index

In [19]:
X_train = X_train.astype(np.float32)
X_train.dtype

dtype('float32')

In [20]:
if Config.debug:
    X_train = X_train[:1000]
    y_train = y_train[:1000]

In [21]:
y_df = pd.DataFrame(y_train, dtype=np.uint8)
y_df.columns = ["action"]
y_df

Unnamed: 0,action
0,2
1,3
2,1
3,2
4,0
...,...
10341149,3
10341150,3
10341151,2
10341152,2


## CV Split

In [22]:
folds = y_df.copy()
Fold = StratifiedKFold(n_splits=Config.n_fold, shuffle=True, random_state=Config.seed)
for n, (train_index, val_index) in enumerate(Fold.split(folds, folds["action"])):
    folds.loc[val_index, "fold"] = int(n)
folds["fold"] = folds["fold"].astype(np.uint8)
print(folds.groupby(["fold", "action"]).size())

fold  action
0     0         246342
      1         246342
      2         270716
      3         270716
1     0         246342
      1         246342
      2         270716
      3         270716
2     0         246342
      1         246342
      2         270716
      3         270716
3     0         246342
      1         246342
      2         270716
      3         270716
4     0         246342
      1         246342
      2         270716
      3         270715
5     0         246342
      1         246342
      2         270716
      3         270715
6     0         246342
      1         246342
      2         270716
      3         270715
7     0         246342
      1         246342
      2         270716
      3         270715
8     0         246342
      1         246342
      2         270715
      3         270716
9     0         246342
      1         246342
      2         270715
      3         270716
dtype: int64


## Dataset

In [23]:
class TrainDataset(Dataset):
    def __init__(self, array, label):
        self.array = array
        self.label = label

    def __len__(self):
        return self.array.shape[0]

    def __getitem__(self, idx):
        return self.array[idx], torch.tensor(self.label[idx]).long()


class TestDataset(Dataset):
    def __init__(self, array):
        self.array = array

    def __len__(self):
        return self.array.shape[0]

    def __getitem__(self, idx):
        return self.array[idx]

In [24]:
# Test

if Config.debug or False:
    train_ds = TrainDataset(X_train, y_train)

    for i in range(1):
        obs, action = train_ds[i]
        print(obs.shape, action)

## Model

In [25]:
class TorusConv2d(nn.Module):
    def __init__(self, input_dim, output_dim, kernel_size, bn):
        super().__init__()
        self.edge_size = (kernel_size[0] // 2, kernel_size[1] // 2)
        self.conv = nn.Conv2d(input_dim, output_dim, kernel_size=kernel_size)
        self.bn = nn.BatchNorm2d(output_dim) if bn else None

    def forward(self, x):
        h = torch.cat([x[:, :, :, -self.edge_size[1] :], x, x[:, :, :, : self.edge_size[1]]], dim=3)
        h = torch.cat([h[:, :, -self.edge_size[0] :], h, h[:, :, : self.edge_size[0]]], dim=2)
        h = self.conv(h)
        h = self.bn(h) if self.bn is not None else h
        return h

In [26]:
class GeeseNet(nn.Module):
    def __init__(self):
        super().__init__()
        layers, filters = 12, 32
        self.conv0 = TorusConv2d(17, filters, (3, 3), True)
        self.blocks = nn.ModuleList([TorusConv2d(filters, filters, (3, 3), True) for _ in range(layers)])

        self.conv_p = TorusConv2d(filters, filters, (3, 3), True)
        self.conv_v = TorusConv2d(filters, filters, (3, 3), True)

        self.head_p = nn.Linear(filters, 4, bias=False)
        self.head_v1 = nn.Linear(filters * 2, filters, bias=False)
        self.head_v2 = nn.Linear(filters, 1, bias=False)

    def forward(self, x, _=None):
        h = F.relu_(self.conv0(x))
        for block in self.blocks:
            h = F.relu_(h + block(h))

        h_p = F.relu_(self.conv_p(h))
        h_head_p = (h_p * x[:, :1]).view(h_p.size(0), h_p.size(1), -1).sum(-1)
        p = self.head_p(h_head_p)

        h_v = F.relu_(self.conv_v(h))
        h_head_v = (h_v * x[:, :1]).view(h_v.size(0), h_v.size(1), -1).sum(-1)
        h_avg_v = h_v.view(h_v.size(0), h_v.size(1), -1).mean(-1)

        h_v = F.relu_(self.head_v1(torch.cat([h_head_v, h_avg_v], 1)))
        v = torch.tanh(self.head_v2(h_v))

        return {"policy": p, "value": v}

In [27]:
class GeeseNetAlpha(nn.Module):
    def __init__(self):
        super().__init__()

        layers = Config.geese_net_layers
        filters = Config.geese_net_filters

        self.conv0 = TorusConv2d(17, filters, (3, 3), True)
        self.blocks = nn.ModuleList([TorusConv2d(filters, filters, (3, 3), True) for _ in range(layers)])

        self.conv_p = TorusConv2d(filters, filters, (3, 3), True)
        self.conv_v = TorusConv2d(filters, filters, (3, 3), True)

        self.head_p1 = nn.Linear(filters * 5 + 77, filters * 3, bias=False)
        self.head_p2 = nn.Linear(filters * 3, 4, bias=False)
        self.head_v1 = nn.Linear(filters * 5 + 77, filters * 3, bias=False)
        self.head_v2 = nn.Linear(filters * 3, 1, bias=False)

    def forward(self, x, _=None):
        h = F.relu_(self.conv0(x))
        for block in self.blocks:
            h = F.relu_(h + block(h))

        h_p = F.relu_(self.conv_p(h))
        h_head_p = (h_p * x[:, :1]).view(h_p.size(0), h_p.size(1), -1).sum(-1)
        h_head_p2 = (h_p * x[:, 1:2]).view(h_p.size(0), h_p.size(1), -1).sum(-1)
        h_head_p3 = (h_p * x[:, 2:3]).view(h_p.size(0), h_p.size(1), -1).sum(-1)
        h_head_p4 = (h_p * x[:, 3:4]).view(h_p.size(0), h_p.size(1), -1).sum(-1)
        h_avg_p1 = h_p.view(h_p.size(0), h_p.size(1), -1).mean(-1)
        h_avg_p2 = h_p.view(h_p.size(0), h_p.size(1), -1).mean(1)

        h_p = F.relu_(self.head_p1(torch.cat([h_head_p, h_head_p2, h_head_p3, h_head_p4, h_avg_p1, h_avg_p2], 1)))
        p = self.head_p2(h_p)

        h_v = F.relu_(self.conv_v(h))
        h_head_v = (h_v * x[:, :1]).view(h_v.size(0), h_v.size(1), -1).sum(-1)
        h_head_v2 = (h_v * x[:, 1:2]).view(h_v.size(0), h_v.size(1), -1).sum(-1)
        h_head_v3 = (h_v * x[:, 2:3]).view(h_v.size(0), h_v.size(1), -1).sum(-1)
        h_head_v4 = (h_v * x[:, 3:4]).view(h_v.size(0), h_v.size(1), -1).sum(-1)
        h_avg_v1 = h_v.view(h_v.size(0), h_v.size(1), -1).mean(-1)
        h_avg_v2 = h_v.view(h_v.size(0), h_v.size(1), -1).mean(1)

        h_v = F.relu_(self.head_v1(torch.cat([h_head_v, h_head_v2, h_head_v3, h_head_v4, h_avg_v1, h_avg_v2], 1)))
        v = torch.tanh(self.head_v2(h_v))

        return {"policy": p, "value": v}

In [28]:
# Test

if Config.debug or False:
    model = GeeseNetAlpha()
    # print(model)

    params = sum(p.numel() for p in model.parameters())
    print(f"params: {params:,}")

    train_ds = TrainDataset(X_train, y_train)
    train_loader = DataLoader(train_ds, batch_size=4, shuffle=True, num_workers=4, pin_memory=True, drop_last=True)

    for obs, action in train_loader:
        output = model(obs)
        print(output)
        print(f"{torch.argmax(output['policy'], dim=1)}")
        break

## Loss

## Scoring

In [29]:
def get_score(y_true, y_pred):
    return accuracy_score(y_true, y_pred)

In [30]:
def get_result(result_df):
    preds = result_df["preds"].values
    labels = result_df["action"].values
    score = get_score(labels, preds)
    LOGGER.info(f"Score: {score:<.5f}")
    return score

## Helper functions

In [31]:
class AverageMeter(object):
    """Computes and stores the average and current value"""

    def __init__(self):
        self.reset()

    def reset(self):
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count


def asMinutes(s):
    m = math.floor(s / 60)
    s -= m * 60
    return "%dm %ds" % (m, s)


def timeSince(since, percent):
    now = time.time()
    s = now - since
    es = s / (percent)
    rs = es - s
    return "%s (remain %s)" % (asMinutes(s), asMinutes(rs))

In [32]:
def train_fn(train_loader, model, criterion, optimizer, epoch, scheduler, device):
    losses = AverageMeter()

    # switch to train mode
    model.train()
    start = time.time()

    for step, (obs, action) in enumerate(train_loader):
        obs = obs.to(device)
        action = action.to(device)
        batch_size = action.size(0)

        y_preds = model(obs.float())["policy"]

        loss = criterion(y_preds, action)
        losses.update(loss.item(), batch_size)
        if Config.gradient_accumulation_steps > 1:
            loss = loss / Config.gradient_accumulation_steps
        if Config.apex:
            with amp.scale_loss(loss, optimizer) as scaled_loss:
                scaled_loss.backward()
        else:
            loss.backward()

        grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), Config.max_grad_norm)

        if (step + 1) % Config.gradient_accumulation_steps == 0:
            optimizer.step()
            optimizer.zero_grad()

        if step % Config.print_freq == 0 or step == (len(train_loader) - 1):
            print(
                f"Epoch: [{epoch + 1}][{step}/{len(train_loader)}] "
                f"Elapsed {timeSince(start, float(step + 1) / len(train_loader)):s} "
                f"Loss avg.: {losses.avg:.4f} "
                f"Grad: {grad_norm:.4f} "
                f"LR: {scheduler.get_last_lr()[0]:.5f}  "
            )

    return losses.avg

In [33]:
def valid_fn(valid_loader, model, criterion, device):
    losses = AverageMeter()

    # switch to evaluation mode
    model.eval()
    preds = []
    start = time.time()

    for step, (obs, action) in enumerate(valid_loader):
        obs = obs.to(device)
        action = action.to(device)
        batch_size = action.size(0)

        # compute loss
        with torch.no_grad():
            y_preds = model(obs)["policy"]

        loss = criterion(y_preds, action)
        losses.update(loss.item(), batch_size)

        # record accuracy
        preds.append(y_preds.softmax(1).to("cpu").numpy())
        if Config.gradient_accumulation_steps > 1:
            loss = loss / Config.gradient_accumulation_steps

        if step % Config.print_freq == 0 or step == (len(valid_loader) - 1):
            print(
                f"Eval: [{step}/{len(valid_loader)}] "
                f"Elapsed {timeSince(start, float(step + 1) / len(valid_loader)):s} "
                f"Loss avg.: {losses.avg:.4f} "
            )
    predictions = np.concatenate(preds)
    return losses.avg, predictions

## Train loop

In [34]:
def train_loop(folds, fold):

    LOGGER.info(f"========== fold: {fold} training ==========")

    # ====================================================
    # Data Loader
    # ====================================================
    # X_train_folds = X_train[folds["fold"] != fold]
    # X_valid_folds = X_train[folds["fold"] == fold]

    # y_train_folds = y_train[folds["fold"] != fold]
    y_valid_folds = y_train[folds["fold"] == fold]

    # y_df_train_folds = y_df[folds["fold"] != fold]
    y_df_valid_folds = y_df[folds["fold"] == fold]

    # train_dataset = TrainDataset(X_train[folds["fold"] != fold], y_train[folds["fold"] != fold])
    # valid_dataset = TrainDataset(X_train[folds["fold"] == fold], y_valid_folds)

    train_loader = DataLoader(
        TrainDataset(X_train[folds["fold"] != fold], y_train[folds["fold"] != fold]),
        batch_size=Config.batch_size,
        shuffle=True,
        num_workers=Config.num_workers,
        pin_memory=True,
        drop_last=True,
    )
    valid_loader = DataLoader(
        TrainDataset(X_train[folds["fold"] == fold], y_valid_folds),
        batch_size=Config.batch_size,
        shuffle=False,
        num_workers=Config.num_workers,
        pin_memory=True,
        drop_last=False,
    )

    # ====================================================
    # Scheduler
    # ====================================================
    def get_scheduler(optimizer):
        if Config.scheduler == "ReduceLROnPlateau":
            scheduler = ReduceLROnPlateau(
                optimizer, mode="min", factor=Config.factor, patience=Config.patience, verbose=True, eps=Config.eps
            )
        elif Config.scheduler == "CosineAnnealingLR":
            scheduler = CosineAnnealingLR(optimizer, T_max=Config.T_max, eta_min=Config.min_lr, last_epoch=-1)
        elif Config.scheduler == "CosineAnnealingWarmRestarts":
            scheduler = CosineAnnealingWarmRestarts(
                optimizer, T_0=Config.T_0, T_mult=1, eta_min=Config.min_lr, last_epoch=-1
            )
        return scheduler

    # ====================================================
    # model & optimizer
    # ====================================================
    model = GeeseNetAlpha()

    # Disable training for value network
    for param in model.conv_v.parameters():
        param.requires_grad = False
    for param in model.head_v1.parameters():
        param.requires_grad = False
    for param in model.head_v2.parameters():
        param.requires_grad = False

    model.to(device)

    # Use multi GPU
    if device == torch.device("cuda") and not Config.apex:
        model = torch.nn.DataParallel(model)  # make parallel

    optimizer = Adam(model.parameters(), lr=Config.lr, weight_decay=Config.weight_decay, amsgrad=False)
    scheduler = get_scheduler(optimizer)

    # ====================================================
    # apex
    # ====================================================
    if Config.apex:
        model, optimizer = amp.initialize(model, optimizer, opt_level="O1", verbosity=0)

    # ====================================================
    # Criterion
    # ====================================================
    def get_criterion():
        if Config.criterion == "CrossEntropyLoss":
            criterion = nn.CrossEntropyLoss()
        return criterion

    criterion = get_criterion()

    # ====================================================
    # loop
    # ====================================================
    best_score = 0.0
    best_loss = np.inf
    best_preds = None

    for epoch in range(Config.epochs):

        start_time = time.time()

        # train
        avg_loss = train_fn(train_loader, model, criterion, optimizer, epoch, scheduler, device)

        # eval
        avg_val_loss, preds = valid_fn(valid_loader, model, criterion, device)

        if isinstance(scheduler, ReduceLROnPlateau):
            scheduler.step(avg_val_loss)
        elif isinstance(scheduler, CosineAnnealingLR):
            scheduler.step()
        elif isinstance(scheduler, CosineAnnealingWarmRestarts):
            scheduler.step()

        # scoring
        score = get_score(y_valid_folds, preds.argmax(1))

        elapsed = time.time() - start_time

        LOGGER.info(
            f"Epoch {epoch+1} - avg_train_loss: {avg_loss:.4f}  avg_val_loss: {avg_val_loss:.4f}  time: {elapsed:.0f}s"
        )
        LOGGER.info(f"Epoch {epoch+1} - Accuracy: {score}")

        if score > best_score:
            best_score = score
            LOGGER.info(f"Epoch {epoch+1} - Save Best Score: {best_score:.4f} Model")
            torch.save(model.module.state_dict(), OUTPUT_DIR + f"{Config.model_name}_fold{fold}_best.pth")
            best_preds = preds

        if epoch == Config.epochs - 1:
            LOGGER.info(f"Epoch {epoch+1} - Save final model")
            torch.save(model.module.state_dict(), OUTPUT_DIR + f"{Config.model_name}_fold{fold}_final.pth")

    if Config.train:
        y_df_valid_folds[[str(c) for c in range(Config.n_class)]] = best_preds
        y_df_valid_folds["preds"] = best_preds.argmax(1)

        return y_df_valid_folds

    if Config.tuning:
        score = get_score(y_df_valid_folds["action"].values, best_preds.argmax(1))
        return score

In [35]:
def objective(trial):
    Config.geese_net_layers = trial.suggest_int("layers", 6, 18)
    Config.geese_net_filters = trial.suggest_int("filters", 32, 128)

    score = train_loop(folds, 0)
    return score

## Main


In [36]:
def main():
    if Config.train:
        # train
        oof_df = pd.DataFrame()
        for fold in range(Config.n_fold):
            _oof_df = train_loop(folds, fold)
            oof_df = pd.concat([oof_df, _oof_df])
            LOGGER.info(f"========== fold: {fold} result ==========")
            get_result(_oof_df)
            break  # fold 1つだけ
        # CV result
        # LOGGER.info(f"========== CV ==========")
        # get_result(oof_df)
        # save result
        oof_df.to_csv(OUTPUT_DIR + "oof_df.csv", index=False)

    if Config.tuning:
        study = optuna.create_study(direction="maximize")
        study.optimize(objective, n_trials=10)

        trial = study.best_trial
        print("Best trial:")
        print("  Value: ", trial.value)
        print("  Params: ")
        for key, value in trial.params.items():
            print("    {}: {}".format(key, value))

In [37]:
if __name__ == "__main__":
    main()



Epoch: [1][0/2908] Elapsed 0m 5s (remain 288m 8s) Loss avg.: 1.3911 Grad: 0.4668 LR: 0.00100  
Epoch: [1][100/2908] Elapsed 2m 14s (remain 62m 19s) Loss avg.: 0.6794 Grad: 0.4572 LR: 0.00100  
Epoch: [1][200/2908] Elapsed 4m 23s (remain 59m 3s) Loss avg.: 0.6236 Grad: 0.9811 LR: 0.00100  
Epoch: [1][300/2908] Elapsed 6m 31s (remain 56m 31s) Loss avg.: 0.5945 Grad: 0.5229 LR: 0.00100  
Epoch: [1][400/2908] Elapsed 8m 40s (remain 54m 11s) Loss avg.: 0.5765 Grad: 0.4198 LR: 0.00100  
Epoch: [1][500/2908] Elapsed 10m 48s (remain 51m 57s) Loss avg.: 0.5640 Grad: 0.6569 LR: 0.00100  
Epoch: [1][600/2908] Elapsed 12m 57s (remain 49m 44s) Loss avg.: 0.5544 Grad: 0.2902 LR: 0.00100  
Epoch: [1][700/2908] Elapsed 15m 5s (remain 47m 32s) Loss avg.: 0.5473 Grad: 0.3747 LR: 0.00100  
Epoch: [1][800/2908] Elapsed 17m 14s (remain 45m 21s) Loss avg.: 0.5412 Grad: 0.4690 LR: 0.00100  
Epoch: [1][900/2908] Elapsed 19m 22s (remain 43m 10s) Loss avg.: 0.5360 Grad: 0.4779 LR: 0.00100  
Epoch: [1][1000/2908

Epoch 1 - avg_train_loss: 0.4961  avg_val_loss: 0.4731  time: 3869s
Epoch 1 - Accuracy: 0.7940327777541397
Epoch 1 - Save Best Score: 0.7940 Model


Epoch: [2][0/2908] Elapsed 0m 3s (remain 146m 15s) Loss avg.: 0.4662 Grad: 0.2057 LR: 0.00098  
Epoch: [2][100/2908] Elapsed 2m 11s (remain 60m 58s) Loss avg.: 0.4687 Grad: 0.3817 LR: 0.00098  
Epoch: [2][200/2908] Elapsed 4m 20s (remain 58m 24s) Loss avg.: 0.4692 Grad: 0.2498 LR: 0.00098  
Epoch: [2][300/2908] Elapsed 6m 28s (remain 56m 7s) Loss avg.: 0.4687 Grad: 0.2592 LR: 0.00098  
Epoch: [2][400/2908] Elapsed 8m 37s (remain 53m 55s) Loss avg.: 0.4682 Grad: 0.1994 LR: 0.00098  
Epoch: [2][500/2908] Elapsed 10m 46s (remain 51m 44s) Loss avg.: 0.4680 Grad: 0.2279 LR: 0.00098  
Epoch: [2][600/2908] Elapsed 12m 54s (remain 49m 33s) Loss avg.: 0.4678 Grad: 0.2298 LR: 0.00098  
Epoch: [2][700/2908] Elapsed 15m 3s (remain 47m 23s) Loss avg.: 0.4674 Grad: 0.3040 LR: 0.00098  
Epoch: [2][800/2908] Elapsed 17m 11s (remain 45m 14s) Loss avg.: 0.4675 Grad: 0.2602 LR: 0.00098  
Epoch: [2][900/2908] Elapsed 19m 20s (remain 43m 4s) Loss avg.: 0.4673 Grad: 0.3424 LR: 0.00098  
Epoch: [2][1000/2908

Epoch 2 - avg_train_loss: 0.4650  avg_val_loss: 0.4660  time: 3866s
Epoch 2 - Accuracy: 0.7974095749413025
Epoch 2 - Save Best Score: 0.7974 Model


Epoch: [3][0/2908] Elapsed 0m 2s (remain 144m 40s) Loss avg.: 0.4682 Grad: 0.3653 LR: 0.00091  
Epoch: [3][100/2908] Elapsed 2m 11s (remain 60m 56s) Loss avg.: 0.4598 Grad: 0.3612 LR: 0.00091  
Epoch: [3][200/2908] Elapsed 4m 20s (remain 58m 23s) Loss avg.: 0.4594 Grad: 0.1839 LR: 0.00091  
Epoch: [3][300/2908] Elapsed 6m 28s (remain 56m 6s) Loss avg.: 0.4597 Grad: 0.2543 LR: 0.00091  
Epoch: [3][400/2908] Elapsed 8m 37s (remain 53m 55s) Loss avg.: 0.4598 Grad: 0.2888 LR: 0.00091  
Epoch: [3][500/2908] Elapsed 10m 46s (remain 51m 43s) Loss avg.: 0.4595 Grad: 0.2242 LR: 0.00091  
Epoch: [3][600/2908] Elapsed 12m 54s (remain 49m 32s) Loss avg.: 0.4596 Grad: 0.1603 LR: 0.00091  
Epoch: [3][700/2908] Elapsed 15m 3s (remain 47m 22s) Loss avg.: 0.4591 Grad: 0.1906 LR: 0.00091  
Epoch: [3][800/2908] Elapsed 17m 11s (remain 45m 13s) Loss avg.: 0.4590 Grad: 0.2521 LR: 0.00091  
Epoch: [3][900/2908] Elapsed 19m 19s (remain 43m 3s) Loss avg.: 0.4590 Grad: 0.1749 LR: 0.00091  
Epoch: [3][1000/2908

Epoch 3 - avg_train_loss: 0.4581  avg_val_loss: 0.4611  time: 3865s
Epoch 3 - Accuracy: 0.7997100905507699
Epoch 3 - Save Best Score: 0.7997 Model


Epoch: [4][0/2908] Elapsed 0m 3s (remain 146m 31s) Loss avg.: 0.4661 Grad: 0.1495 LR: 0.00081  
Epoch: [4][100/2908] Elapsed 2m 11s (remain 60m 58s) Loss avg.: 0.4514 Grad: 0.2349 LR: 0.00081  
Epoch: [4][200/2908] Elapsed 4m 20s (remain 58m 25s) Loss avg.: 0.4518 Grad: 0.1463 LR: 0.00081  
Epoch: [4][300/2908] Elapsed 6m 29s (remain 56m 9s) Loss avg.: 0.4525 Grad: 0.2985 LR: 0.00081  
Epoch: [4][400/2908] Elapsed 8m 37s (remain 53m 55s) Loss avg.: 0.4521 Grad: 0.2614 LR: 0.00081  
Epoch: [4][500/2908] Elapsed 10m 46s (remain 51m 44s) Loss avg.: 0.4525 Grad: 0.1292 LR: 0.00081  
Epoch: [4][600/2908] Elapsed 12m 54s (remain 49m 34s) Loss avg.: 0.4529 Grad: 0.1487 LR: 0.00081  
Epoch: [4][700/2908] Elapsed 15m 3s (remain 47m 24s) Loss avg.: 0.4534 Grad: 0.1896 LR: 0.00081  
Epoch: [4][800/2908] Elapsed 17m 11s (remain 45m 14s) Loss avg.: 0.4532 Grad: 0.1678 LR: 0.00081  
Epoch: [4][900/2908] Elapsed 19m 20s (remain 43m 5s) Loss avg.: 0.4533 Grad: 0.2048 LR: 0.00081  
Epoch: [4][1000/2908

Epoch 4 - avg_train_loss: 0.4534  avg_val_loss: 0.4573  time: 3867s
Epoch 4 - Accuracy: 0.8009730049626927
Epoch 4 - Save Best Score: 0.8010 Model


Epoch: [5][0/2908] Elapsed 0m 3s (remain 147m 15s) Loss avg.: 0.4497 Grad: 0.1848 LR: 0.00069  
Epoch: [5][100/2908] Elapsed 2m 11s (remain 60m 59s) Loss avg.: 0.4473 Grad: 0.1738 LR: 0.00069  
Epoch: [5][200/2908] Elapsed 4m 20s (remain 58m 27s) Loss avg.: 0.4484 Grad: 0.1598 LR: 0.00069  
Epoch: [5][300/2908] Elapsed 6m 29s (remain 56m 9s) Loss avg.: 0.4485 Grad: 0.1928 LR: 0.00069  
Epoch: [5][400/2908] Elapsed 8m 37s (remain 53m 55s) Loss avg.: 0.4484 Grad: 0.1966 LR: 0.00069  
Epoch: [5][500/2908] Elapsed 10m 46s (remain 51m 44s) Loss avg.: 0.4484 Grad: 0.1797 LR: 0.00069  
Epoch: [5][600/2908] Elapsed 12m 54s (remain 49m 34s) Loss avg.: 0.4487 Grad: 0.2211 LR: 0.00069  
Epoch: [5][700/2908] Elapsed 15m 3s (remain 47m 24s) Loss avg.: 0.4489 Grad: 0.1573 LR: 0.00069  
Epoch: [5][800/2908] Elapsed 17m 12s (remain 45m 14s) Loss avg.: 0.4489 Grad: 0.2461 LR: 0.00069  
Epoch: [5][900/2908] Elapsed 19m 20s (remain 43m 5s) Loss avg.: 0.4488 Grad: 0.1976 LR: 0.00069  
Epoch: [5][1000/2908

Epoch 5 - avg_train_loss: 0.4494  avg_val_loss: 0.4552  time: 3867s
Epoch 5 - Accuracy: 0.8019767608276054
Epoch 5 - Save Best Score: 0.8020 Model


Epoch: [6][0/2908] Elapsed 0m 3s (remain 153m 10s) Loss avg.: 0.4313 Grad: 0.1827 LR: 0.00055  
Epoch: [6][100/2908] Elapsed 2m 12s (remain 61m 12s) Loss avg.: 0.4439 Grad: 0.1318 LR: 0.00055  
Epoch: [6][200/2908] Elapsed 4m 20s (remain 58m 30s) Loss avg.: 0.4432 Grad: 0.2321 LR: 0.00055  
Epoch: [6][300/2908] Elapsed 6m 29s (remain 56m 11s) Loss avg.: 0.4443 Grad: 0.1820 LR: 0.00055  
Epoch: [6][400/2908] Elapsed 8m 37s (remain 53m 57s) Loss avg.: 0.4441 Grad: 0.2073 LR: 0.00055  
Epoch: [6][500/2908] Elapsed 10m 46s (remain 51m 45s) Loss avg.: 0.4443 Grad: 0.1833 LR: 0.00055  
Epoch: [6][600/2908] Elapsed 12m 54s (remain 49m 34s) Loss avg.: 0.4443 Grad: 0.1472 LR: 0.00055  
Epoch: [6][700/2908] Elapsed 15m 3s (remain 47m 24s) Loss avg.: 0.4444 Grad: 0.2076 LR: 0.00055  
Epoch: [6][800/2908] Elapsed 17m 12s (remain 45m 14s) Loss avg.: 0.4446 Grad: 0.2033 LR: 0.00055  
Epoch: [6][900/2908] Elapsed 19m 20s (remain 43m 6s) Loss avg.: 0.4446 Grad: 0.1537 LR: 0.00055  
Epoch: [6][1000/290

Epoch 6 - avg_train_loss: 0.4452  avg_val_loss: 0.4528  time: 3867s
Epoch 6 - Accuracy: 0.8031603804602192
Epoch 6 - Save Best Score: 0.8032 Model


Epoch: [7][0/2908] Elapsed 0m 3s (remain 152m 27s) Loss avg.: 0.4307 Grad: 0.1642 LR: 0.00041  
Epoch: [7][100/2908] Elapsed 2m 12s (remain 61m 11s) Loss avg.: 0.4394 Grad: 0.1745 LR: 0.00041  
Epoch: [7][200/2908] Elapsed 4m 20s (remain 58m 31s) Loss avg.: 0.4394 Grad: 0.1586 LR: 0.00041  
Epoch: [7][300/2908] Elapsed 6m 29s (remain 56m 11s) Loss avg.: 0.4390 Grad: 0.1552 LR: 0.00041  
Epoch: [7][400/2908] Elapsed 8m 37s (remain 53m 57s) Loss avg.: 0.4394 Grad: 0.1515 LR: 0.00041  
Epoch: [7][500/2908] Elapsed 10m 46s (remain 51m 45s) Loss avg.: 0.4392 Grad: 0.1983 LR: 0.00041  
Epoch: [7][600/2908] Elapsed 12m 55s (remain 49m 35s) Loss avg.: 0.4394 Grad: 0.1864 LR: 0.00041  
Epoch: [7][700/2908] Elapsed 15m 3s (remain 47m 24s) Loss avg.: 0.4394 Grad: 0.1918 LR: 0.00041  
Epoch: [7][800/2908] Elapsed 17m 12s (remain 45m 15s) Loss avg.: 0.4395 Grad: 0.1646 LR: 0.00041  
Epoch: [7][900/2908] Elapsed 19m 20s (remain 43m 6s) Loss avg.: 0.4395 Grad: 0.1946 LR: 0.00041  
Epoch: [7][1000/290

Epoch 7 - avg_train_loss: 0.4407  avg_val_loss: 0.4520  time: 3866s
Epoch 7 - Accuracy: 0.8033005968382657
Epoch 7 - Save Best Score: 0.8033 Model


Epoch: [8][0/2908] Elapsed 0m 3s (remain 146m 27s) Loss avg.: 0.4435 Grad: 0.1564 LR: 0.00029  
Epoch: [8][100/2908] Elapsed 2m 11s (remain 60m 58s) Loss avg.: 0.4353 Grad: 0.1668 LR: 0.00029  
Epoch: [8][200/2908] Elapsed 4m 20s (remain 58m 24s) Loss avg.: 0.4359 Grad: 0.1993 LR: 0.00029  
Epoch: [8][300/2908] Elapsed 6m 28s (remain 56m 6s) Loss avg.: 0.4354 Grad: 0.1518 LR: 0.00029  
Epoch: [8][400/2908] Elapsed 8m 37s (remain 53m 54s) Loss avg.: 0.4354 Grad: 0.1633 LR: 0.00029  
Epoch: [8][500/2908] Elapsed 10m 45s (remain 51m 42s) Loss avg.: 0.4357 Grad: 0.1858 LR: 0.00029  
Epoch: [8][600/2908] Elapsed 12m 54s (remain 49m 32s) Loss avg.: 0.4355 Grad: 0.1781 LR: 0.00029  
Epoch: [8][700/2908] Elapsed 15m 2s (remain 47m 22s) Loss avg.: 0.4352 Grad: 0.1808 LR: 0.00029  
Epoch: [8][800/2908] Elapsed 17m 11s (remain 45m 13s) Loss avg.: 0.4351 Grad: 0.1870 LR: 0.00029  
Epoch: [8][900/2908] Elapsed 19m 20s (remain 43m 4s) Loss avg.: 0.4352 Grad: 0.2459 LR: 0.00029  
Epoch: [8][1000/2908

Epoch 8 - avg_train_loss: 0.4359  avg_val_loss: 0.4500  time: 3867s
Epoch 8 - Accuracy: 0.8045635112501885
Epoch 8 - Save Best Score: 0.8046 Model


Epoch: [9][0/2908] Elapsed 0m 3s (remain 146m 13s) Loss avg.: 0.4390 Grad: 0.1788 LR: 0.00019  
Epoch: [9][100/2908] Elapsed 2m 11s (remain 60m 57s) Loss avg.: 0.4315 Grad: 0.1814 LR: 0.00019  
Epoch: [9][200/2908] Elapsed 4m 20s (remain 58m 23s) Loss avg.: 0.4300 Grad: 0.1725 LR: 0.00019  
Epoch: [9][300/2908] Elapsed 6m 28s (remain 56m 6s) Loss avg.: 0.4296 Grad: 0.1922 LR: 0.00019  
Epoch: [9][400/2908] Elapsed 8m 37s (remain 53m 54s) Loss avg.: 0.4296 Grad: 0.2103 LR: 0.00019  
Epoch: [9][500/2908] Elapsed 10m 45s (remain 51m 43s) Loss avg.: 0.4301 Grad: 0.1985 LR: 0.00019  
Epoch: [9][600/2908] Elapsed 12m 54s (remain 49m 32s) Loss avg.: 0.4302 Grad: 0.2086 LR: 0.00019  
Epoch: [9][700/2908] Elapsed 15m 3s (remain 47m 24s) Loss avg.: 0.4300 Grad: 0.2026 LR: 0.00019  
Epoch: [9][800/2908] Elapsed 17m 11s (remain 45m 14s) Loss avg.: 0.4300 Grad: 0.1838 LR: 0.00019  
Epoch: [9][900/2908] Elapsed 19m 20s (remain 43m 5s) Loss avg.: 0.4300 Grad: 0.2019 LR: 0.00019  
Epoch: [9][1000/2908

Epoch 9 - avg_train_loss: 0.4310  avg_val_loss: 0.4491  time: 3867s
Epoch 9 - Accuracy: 0.8047520781034236
Epoch 9 - Save Best Score: 0.8048 Model


Epoch: [10][0/2908] Elapsed 0m 3s (remain 146m 28s) Loss avg.: 0.4411 Grad: 0.1857 LR: 0.00012  
Epoch: [10][100/2908] Elapsed 2m 11s (remain 61m 1s) Loss avg.: 0.4249 Grad: 0.1809 LR: 0.00012  
Epoch: [10][200/2908] Elapsed 4m 20s (remain 58m 26s) Loss avg.: 0.4250 Grad: 0.2217 LR: 0.00012  
Epoch: [10][300/2908] Elapsed 6m 28s (remain 56m 8s) Loss avg.: 0.4253 Grad: 0.2082 LR: 0.00012  
Epoch: [10][400/2908] Elapsed 8m 37s (remain 53m 55s) Loss avg.: 0.4255 Grad: 0.2100 LR: 0.00012  
Epoch: [10][500/2908] Elapsed 10m 46s (remain 51m 43s) Loss avg.: 0.4260 Grad: 0.2112 LR: 0.00012  
Epoch: [10][600/2908] Elapsed 12m 54s (remain 49m 34s) Loss avg.: 0.4259 Grad: 0.1864 LR: 0.00012  
Epoch: [10][700/2908] Elapsed 15m 3s (remain 47m 24s) Loss avg.: 0.4262 Grad: 0.2249 LR: 0.00012  
Epoch: [10][800/2908] Elapsed 17m 11s (remain 45m 14s) Loss avg.: 0.4264 Grad: 0.2003 LR: 0.00012  
Epoch: [10][900/2908] Elapsed 19m 20s (remain 43m 5s) Loss avg.: 0.4261 Grad: 0.1909 LR: 0.00012  
Epoch: [10]

Epoch 10 - avg_train_loss: 0.4268  avg_val_loss: 0.4495  time: 3866s
Epoch 10 - Accuracy: 0.8048864924244475
Epoch 10 - Save Best Score: 0.8049 Model
Epoch 10 - Save final model
Score: 0.80489
