# About this notebook ...

## Library

In [1]:
import glob
import json
import math
import os
import random
import time
import warnings
from collections import defaultdict
from contextlib import contextmanager

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import wandb
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedKFold
from torch.optim import SGD, Adam
from torch.optim.lr_scheduler import CosineAnnealingLR, CosineAnnealingWarmRestarts, ReduceLROnPlateau
from torch.utils.data import DataLoader, Dataset
from tqdm.notebook import tqdm

In [2]:
warnings.filterwarnings("ignore")

## Config

In [3]:
config_defaults = {
    "seed": 440,
    "data_size": 7_000_000,
    "n_class": 4,
    "n_fold": 10,
    "geese_net_layers": 12,
    "geese_net_filters": 48,
    "gradient_accumulation_steps": 1,
    "max_grad_norm": 1000,
    "num_workers": 4,
    "batch_size": 3200,
    "epochs": 10,
    "scheduler": "CosineAnnealingWarmRestarts",
    "criterion": "CrossEntropyLoss",
    "lr": 1e-3,
    "min_lr": 1e-4,
    "weight_decay": 1e-5,
    "model_name": "geese_net_alpha",
}

In [4]:
if config_defaults["scheduler"] == "CosineAnnealingWarmRestarts":
    config_defaults["T_0"] = config_defaults["epochs"]

elif config_defaults["scheduler"] == "CosineAnnealingLR":
    config_defaults["T_max"] = config_defaults["epochs"]

elif config_defaults["scheduler"] == "ReduceLROnPlateau":
    config_defaults["factor"] = 0.2
    config_defaults["patience"] = 4
    config_defaults["eps"] = 1e-6

In [5]:
class Config:
    pre_train_file = ""
    print_freq = 100
    train = True
    debug = False
    apex = False

In [6]:
if Config.debug:
    wandb.init(project="hungry-geese", config=config_defaults, mode="disabled")
else:
    wandb.init(project="hungry-geese", config=config_defaults)

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mimokuri[0m (use `wandb login --relogin` to force relogin)


In [7]:
config = wandb.config

In [8]:
if Config.debug:
    config.update({"epochs": 1, "data_size": 10_000}, allow_val_change=True)

In [9]:
if Config.apex:
    from apex import amp

In [10]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

## Load Data

In [11]:
BASE_DIR = "../input/hungrygeeseepisode/hungry-geese-episode/"
OUTPUT_DIR = "pre-models/"

if not os.path.exists(OUTPUT_DIR):
    os.makedirs(OUTPUT_DIR)

In [12]:
paths = [path for path in glob.glob(BASE_DIR + "*.json") if "info" not in path]
print(len(paths))

29048


## Utils

In [13]:
@contextmanager
def timer(name):
    t0 = time.time()
    LOGGER.info(f"[{name}] start")
    yield
    LOGGER.info(f"[{name}] done in {time.time() - t0:.0f} s.")


def init_logger(log_file=OUTPUT_DIR + "train.log"):
    from logging import INFO, FileHandler, Formatter, StreamHandler, getLogger

    logger = getLogger(__name__)
    logger.setLevel(INFO)
    handler1 = StreamHandler()
    handler1.setFormatter(Formatter("%(message)s"))
    handler2 = FileHandler(filename=log_file)
    handler2.setFormatter(Formatter("%(message)s"))
    logger.addHandler(handler1)
    logger.addHandler(handler2)
    return logger


LOGGER = init_logger()


def seed_torch(seed=42):
    random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True


seed_torch(seed=config.seed)

In [14]:
def ident(y):
    return y


def reverse_ns(y):
    if y == 0:
        return 1
    if y == 1:
        return 0
    return y


def reverse_we(y):
    if y == 2:
        return 3
    if y == 3:
        return 2
    return y


def reverse_nswe(y):
    return reverse_ns(reverse_we(y))

In [15]:
def no_flip(image):
    return image


def h_flip(image):
    return image[:, :, ::-1]


def v_flip(image):
    return image[:, ::-1, :]


def hv_flip(image):
    return image[:, ::-1, ::-1]

## Observation

In [16]:
next_position_map = {}
for pos in range(77):
    position = []
    position.append((11 * (1 + pos // 11) + pos % 11) % 77)
    position.append((11 * (-1 + pos // 11) + pos % 11) % 77)
    position.append((11 * (pos // 11) + (pos + 1) % 11) % 77)
    position.append((11 * (pos // 11) + (pos - 1) % 11) % 77)
    next_position_map[pos] = set(position)

In [17]:
def make_input(obses):
    b = np.zeros((17, 7 * 11), dtype=np.float32)
    obs = obses[-1]

    for p, pos_list in enumerate(obs["geese"]):
        pid = (p - obs["index"]) % 4

        # head position
        for pos in pos_list[:1]:
            b[0 + pid, pos] = 1
        # tip position
        for pos in pos_list[-1:]:
            b[4 + pid, pos] = 1
        # whole position
        for pos in pos_list:
            b[8 + pid, pos] = 1

    # previous head position
    if len(obses) > 1:
        obs_prev = obses[-2]
        for p, pos_list in enumerate(obs_prev["geese"]):
            for pos in pos_list[:1]:
                b[12 + (p - obs["index"]) % 4, pos] = 1

    # food
    for pos in obs["food"]:
        b[16, pos] = 1

    return b.reshape(-1, 7, 11)

In [18]:
def get_reverse_cube(obses):
    """
    尻尾から順番に 1, 0.9, 0.8, ... という並び
    """
    b = np.zeros((4, 7 * 11), dtype=np.float32)
    obs = obses[-1]

    for p, geese in enumerate(obs["geese"]):
        # whole position reverse
        for num_reverse, pos in enumerate(geese[::-1]):
            b[(p - obs["index"]) % 4, pos] = 1 - num_reverse * 0.1

    return b.reshape(-1, 7, 11)

In [19]:
def get_next_disappear_cube(obses):
    """
    次になくなる場所: 1
    次になくなる可能性のある場所: 0.5
    """
    b = np.zeros((4, 7 * 11), dtype=np.float32)
    obs = obses[-1]
    step = obs["step"]

    # foodを食べる可能性があるか。
    eat_food_possibility = defaultdict(int)
    for p, geese in enumerate(obs["geese"]):
        for pos in geese[:1]:
            if not next_position_map[pos].isdisjoint(obs["food"]):
                eat_food_possibility[p] = 1

    if (step % 40) == 39:  # 1つ短くなる
        for p, geese in enumerate(obs["geese"]):
            if eat_food_possibility[p]:  # 尻尾が1、尻尾の１つ前0.5
                for pos in geese[-1:]:
                    b[(p - obs["index"]) % 4, pos] = 1
                for pos in geese[-2:-1]:
                    b[(p - obs["index"]) % 4, pos] = 0.5
            else:  # 食べる可能性なし -> 尻尾が1, 尻尾の1つ前1
                for pos in geese[-2:]:
                    b[(p - obs["index"]) % 4, pos] = 1
    else:  # 1つ短くならない
        for p, geese in enumerate(obs["geese"]):
            if eat_food_possibility[p]:  # 食べる可能性があり -> 尻尾を0.5
                for pos in geese[-1:]:
                    b[(p - obs["index"]) % 4, pos] = 0.5
            else:  # 食べる可能性なし # 尻尾を1
                for pos in geese[-1:]:
                    b[(p - obs["index"]) % 4, pos] = 1

    return b.reshape(-1, 7, 11)

In [20]:
def get_step_cube_v3(obses):
    b = np.zeros((2, 7, 11), dtype=np.float32)
    obs = obses[-1]
    step = obs["step"]

    b[0, :, :] = (step - 188) / 10 if step > 188 else 0
    b[1, :, :] = (step % 40 - 29) / 10 if step % 40 > 29 else 0

    return b

In [21]:
def get_length_cube_v2(obses):
    b = np.zeros((3, 7, 11), dtype=np.float32)
    obs = obses[-1]

    my_length = len(obs["geese"][obs["index"]])
    o1_length = len(obs["geese"][(obs["index"] + 1) % 4])
    o2_length = len(obs["geese"][(obs["index"] + 2) % 4])
    o3_length = len(obs["geese"][(obs["index"] + 3) % 4])

    b[0, :, :] = max(min((my_length - o1_length) * 0.1 + 0.5, 1.0), -1.0)
    b[1, :, :] = max(min((my_length - o2_length) * 0.1 + 0.5, 1.0), -1.0)
    b[2, :, :] = max(min((my_length - o3_length) * 0.1 + 0.5, 1.0), -1.0)

    return b

## Data

In [22]:
X_train = np.zeros((config.data_size, 30, 7, 11), dtype=np.float32)
y_train = np.zeros((config.data_size,), dtype=np.uint8)

X_count = 0
y_count = 0

In [23]:
def create_dataset_from_json(filepath, json_object=None, standing=0):
    global X_train
    global y_train
    global X_count
    global y_count

    if json_object is None:
        json_open = open(path, "r")
        json_load = json.load(json_open)
    else:
        json_load = json_object

    try:
        winner_index = np.argmax(np.argsort(json_load["rewards"]) == 3 - standing)

        obses = []
        actions = {"NORTH": 0, "SOUTH": 1, "WEST": 2, "EAST": 3}

        for i in range(len(json_load["steps"]) - 1):
            if json_load["steps"][i][winner_index]["status"] == "ACTIVE":
                y_ = json_load["steps"][i + 1][winner_index]["action"]
                if y_ is not None:
                    step = json_load["steps"][i]
                    step[winner_index]["observation"]["geese"] = step[0]["observation"]["geese"]
                    step[winner_index]["observation"]["food"] = step[0]["observation"]["food"]
                    step[winner_index]["observation"]["step"] = step[0]["observation"]["step"]
                    obses.append(step[winner_index]["observation"])

                    for func in [ident, reverse_ns, reverse_we, reverse_nswe]:
                        if y_count >= config.data_size:
                            break

                        y_train[y_count] = func(actions[y_])
                        y_count += 1

                    if y_count >= config.data_size:
                        break

        for j in range(len(obses)):
            # X_ = make_input(obses[: j + 1])

            # 反転可能な特徴量
            X_ = []
            X_.append(make_input(obses[: j + 1]))
            X_.append(get_reverse_cube(obses[: j + 1]))
            X_.append(get_next_disappear_cube(obses[: j + 1]))

            # 反転不可能な特徴量
            X_i = []
            X_i.append(get_step_cube_v3(obses[: j + 1]))
            X_i.append(get_length_cube_v2(obses[: j + 1]))

            X_ = np.concatenate(X_)
            X_i = np.concatenate(X_i)

            for func in [no_flip, v_flip, h_flip, hv_flip]:
                if X_count >= config.data_size:
                    break

                X_train[X_count] = np.concatenate([func(X_), X_i])
                X_count += 1

            if X_count >= config.data_size:
                break

        return
    except Exception as e:
        if Config.debug:
            raise Exception from e
        return

In [24]:
for path in tqdm(paths[::-1]):
    create_dataset_from_json(path, standing=0)  # use only winners' moves
    if X_count >= config.data_size:
        break

print(f"Num episode: {len(X_train):,}")

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=29048.0), HTML(value='')))


Num episode: 7,000,000


In [25]:
if Config.debug:
    X_train = X_train[:1000]
    y_train = y_train[:1000]

In [26]:
y_df = pd.DataFrame(y_train, dtype=np.uint8)
y_df.columns = ["action"]
y_df

Unnamed: 0,action
0,2
1,2
2,3
3,3
4,0
...,...
6999995,1
6999996,0
6999997,1
6999998,0


## CV Split

In [27]:
folds = y_df.copy()
Fold = StratifiedKFold(n_splits=config.n_fold, shuffle=True, random_state=config.seed)
for n, (train_index, val_index) in enumerate(Fold.split(folds, folds["action"])):
    folds.loc[val_index, "fold"] = int(n)
folds["fold"] = folds["fold"].astype(np.uint8)
print(folds.groupby(["fold", "action"]).size())

fold  action
0     0         169034
      1         169034
      2         180966
      3         180966
1     0         169034
      1         169034
      2         180966
      3         180966
2     0         169035
      1         169034
      2         180966
      3         180965
3     0         169035
      1         169034
      2         180966
      3         180965
4     0         169035
      1         169034
      2         180966
      3         180965
5     0         169035
      1         169034
      2         180966
      3         180965
6     0         169034
      1         169035
      2         180965
      3         180966
7     0         169034
      1         169035
      2         180965
      3         180966
8     0         169034
      1         169035
      2         180965
      3         180966
9     0         169034
      1         169035
      2         180965
      3         180966
dtype: int64


## Dataset

In [28]:
class TrainDataset(Dataset):
    def __init__(self, array, label):
        self.array = array
        self.label = label

    def __len__(self):
        return self.array.shape[0]

    def __getitem__(self, idx):
        return self.array[idx], torch.tensor(self.label[idx]).long()


class TestDataset(Dataset):
    def __init__(self, array):
        self.array = array

    def __len__(self):
        return self.array.shape[0]

    def __getitem__(self, idx):
        return self.array[idx]

In [29]:
# Test

if Config.debug or False:
    train_ds = TrainDataset(X_train, y_train)

    for i in range(1):
        obs, action = train_ds[i]
        print(obs.shape, action)

## Model

In [30]:
class TorusConv2d(nn.Module):
    def __init__(self, input_dim, output_dim, kernel_size, bn):
        super().__init__()
        self.edge_size = (kernel_size[0] // 2, kernel_size[1] // 2)
        self.conv = nn.Conv2d(input_dim, output_dim, kernel_size=kernel_size)
        self.bn = nn.BatchNorm2d(output_dim) if bn else None

    def forward(self, x):
        h = torch.cat([x[:, :, :, -self.edge_size[1] :], x, x[:, :, :, : self.edge_size[1]]], dim=3)
        h = torch.cat([h[:, :, -self.edge_size[0] :], h, h[:, :, : self.edge_size[0]]], dim=2)
        h = self.conv(h)
        h = self.bn(h) if self.bn is not None else h
        return h

In [31]:
class GeeseNetAlpha(nn.Module):
    def __init__(self):
        super().__init__()
        layers, filters = 12, 32

        self.conv0 = TorusConv2d(30, filters, (3, 3), True)
        self.blocks = nn.ModuleList([TorusConv2d(filters, filters, (3, 3), True) for _ in range(layers)])
        self.head_p = nn.Linear(filters, 4, bias=False)
        self.head_v = nn.Linear(filters * 2, 1, bias=False)

    def forward(self, x, _=None):
        h = F.relu_(self.conv0(x))
        for block in self.blocks:
            h = F.relu_(h + block(h))
        h_head = (h * x[:, :1]).view(h.size(0), h.size(1), -1).sum(-1)
        h_avg = h.view(h.size(0), h.size(1), -1).mean(-1)
        p = self.head_p(h_head)
        v = torch.tanh(self.head_v(torch.cat([h_head, h_avg], 1)))

        return {"policy": p, "value": v}

In [32]:
# Test

if Config.debug or False:
    model = GeeseNetAlpha()
    # print(model)

    params = sum(p.numel() for p in model.parameters())
    print(f"params: {params:,}")

    train_ds = TrainDataset(X_train, y_train)
    train_loader = DataLoader(train_ds, batch_size=4, shuffle=True, num_workers=4, pin_memory=True, drop_last=True)

    for obs, action in train_loader:
        print(f"input shape: {obs.shape}")
        output = model(obs)
        print(output)
        print(f"{torch.argmax(output['policy'], dim=1)}")
        break

## Loss

## Scoring

In [33]:
def get_score(y_true, y_pred):
    return accuracy_score(y_true, y_pred)

In [34]:
def get_result(result_df):
    preds = result_df["preds"].values
    labels = result_df["action"].values
    score = get_score(labels, preds)
    LOGGER.info(f"Score: {score:<.5f}")
    return score

## Helper functions

In [35]:
class AverageMeter(object):
    """Computes and stores the average and current value"""

    def __init__(self):
        self.reset()

    def reset(self):
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count


def asMinutes(s):
    m = math.floor(s / 60)
    s -= m * 60
    return "%dm %ds" % (m, s)


def timeSince(since, percent):
    now = time.time()
    s = now - since
    es = s / (percent)
    rs = es - s
    return "%s (remain %s)" % (asMinutes(s), asMinutes(rs))

In [36]:
def train_fn(train_loader, model, criterion, optimizer, epoch, scheduler, device):
    losses = AverageMeter()

    # switch to train mode
    model.train()
    start = time.time()

    for step, (obs, action) in enumerate(train_loader):
        obs = obs.to(device)
        action = action.to(device)
        batch_size = action.size(0)

        y_preds = model(obs)["policy"]

        loss = criterion(y_preds, action)
        losses.update(loss.item(), batch_size)
        if config.gradient_accumulation_steps > 1:
            loss = loss / config.gradient_accumulation_steps
        if Config.apex:
            with amp.scale_loss(loss, optimizer) as scaled_loss:
                scaled_loss.backward()
        else:
            loss.backward()

        grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), config.max_grad_norm)

        if (step + 1) % config.gradient_accumulation_steps == 0:
            optimizer.step()
            optimizer.zero_grad()

        if step % Config.print_freq == 0 or step == (len(train_loader) - 1):
            print(
                f"Epoch: [{epoch + 1}][{step}/{len(train_loader)}] "
                f"Elapsed {timeSince(start, float(step + 1) / len(train_loader)):s} "
                f"Loss avg.: {losses.avg:.4f} "
                f"Grad: {grad_norm:.4f} "
                f"LR: {scheduler.get_last_lr()[0]:.5f}  "
            )

    return losses.avg

In [37]:
def valid_fn(valid_loader, model, criterion, device):
    losses = AverageMeter()

    # switch to evaluation mode
    model.eval()
    preds = []
    start = time.time()

    for step, (obs, action) in enumerate(valid_loader):
        obs = obs.to(device)
        action = action.to(device)
        batch_size = action.size(0)

        # compute loss
        with torch.no_grad():
            y_preds = model(obs)["policy"]

        loss = criterion(y_preds, action)
        losses.update(loss.item(), batch_size)

        # record accuracy
        preds.append(y_preds.softmax(1).to("cpu").numpy())
        if config.gradient_accumulation_steps > 1:
            loss = loss / config.gradient_accumulation_steps

        if step % Config.print_freq == 0 or step == (len(valid_loader) - 1):
            print(
                f"Eval: [{step}/{len(valid_loader)}] "
                f"Elapsed {timeSince(start, float(step + 1) / len(valid_loader)):s} "
                f"Loss avg.: {losses.avg:.4f} "
            )
    predictions = np.concatenate(preds)
    return losses.avg, predictions

## Train loop

In [38]:
def train_loop(folds, fold):

    LOGGER.info(f"========== fold: {fold} training ==========")

    # ====================================================
    # Data Loader
    # ====================================================
    # X_train_folds = X_train[folds["fold"] != fold]
    # X_valid_folds = X_train[folds["fold"] == fold]

    # y_train_folds = y_train[folds["fold"] != fold]
    y_valid_folds = y_train[folds["fold"] == fold]

    # y_df_train_folds = y_df[folds["fold"] != fold]
    y_df_valid_folds = y_df[folds["fold"] == fold]

    # train_dataset = TrainDataset(X_train[folds["fold"] != fold], y_train[folds["fold"] != fold])
    # valid_dataset = TrainDataset(X_train[folds["fold"] == fold], y_valid_folds)

    train_loader = DataLoader(
        TrainDataset(X_train[folds["fold"] != fold], y_train[folds["fold"] != fold]),
        batch_size=config.batch_size,
        shuffle=True,
        num_workers=config.num_workers,
        pin_memory=True,
        drop_last=True,
    )
    valid_loader = DataLoader(
        TrainDataset(X_train[folds["fold"] == fold], y_valid_folds),
        batch_size=config.batch_size,
        shuffle=False,
        num_workers=config.num_workers,
        pin_memory=True,
        drop_last=False,
    )

    # ====================================================
    # Scheduler
    # ====================================================
    def get_scheduler(optimizer):
        if config.scheduler == "ReduceLROnPlateau":
            scheduler = ReduceLROnPlateau(
                optimizer, mode="min", factor=config.factor, patience=config.patience, verbose=True, eps=config.eps
            )
        elif config.scheduler == "CosineAnnealingLR":
            scheduler = CosineAnnealingLR(optimizer, T_max=config.T_max, eta_min=config.min_lr, last_epoch=-1)
        elif config.scheduler == "CosineAnnealingWarmRestarts":
            scheduler = CosineAnnealingWarmRestarts(
                optimizer, T_0=config.T_0, T_mult=1, eta_min=config.min_lr, last_epoch=-1
            )
        return scheduler

    # ====================================================
    # model & optimizer
    # ====================================================
    model = GeeseNetAlpha()
    # try:
    #     model.load_state_dict(torch.load(os.path.join(OUTPUT_DIR, Config.pre_train_file)))
    # except:
    #     print(f"Failed to load pre-train weight.")

    # Disable training for value network
    # for param in model.head_v1.parameters():
    #     param.requires_grad = False
    # for param in model.head_v2.parameters():
    #     param.requires_grad = False

    model.to(device)

    # Use multi GPU
    if device == torch.device("cuda") and not Config.apex:
        model = torch.nn.DataParallel(model)  # make parallel

    optimizer = Adam(model.parameters(), lr=config.lr, weight_decay=config.weight_decay, amsgrad=False)
    scheduler = get_scheduler(optimizer)

    # ====================================================
    # apex
    # ====================================================
    if Config.apex:
        model, optimizer = amp.initialize(model, optimizer, opt_level="O1", verbosity=0)

    # ====================================================
    # Criterion
    # ====================================================
    def get_criterion():
        if config.criterion == "CrossEntropyLoss":
            criterion = nn.CrossEntropyLoss()
        return criterion

    criterion = get_criterion()

    # ====================================================
    # loop
    # ====================================================
    best_score = 0.0
    best_loss = np.inf
    best_preds = None

    # wandb.watch(model, log_freq=Config.print_freq)

    for epoch in range(config.epochs):

        start_time = time.time()

        # train
        avg_loss = train_fn(train_loader, model, criterion, optimizer, epoch, scheduler, device)

        # eval
        avg_val_loss, preds = valid_fn(valid_loader, model, criterion, device)

        if isinstance(scheduler, ReduceLROnPlateau):
            scheduler.step(avg_val_loss)
        elif isinstance(scheduler, CosineAnnealingLR):
            scheduler.step()
        elif isinstance(scheduler, CosineAnnealingWarmRestarts):
            scheduler.step()

        # scoring
        score = get_score(y_valid_folds, preds.argmax(1))

        elapsed = time.time() - start_time

        LOGGER.info(
            f"Epoch {epoch+1} - avg_train_loss: {avg_loss:.4f}  avg_val_loss: {avg_val_loss:.4f}  time: {elapsed:.0f}s"
        )
        LOGGER.info(f"Epoch {epoch+1} - Accuracy: {score}")

        wandb.log(
            {
                "epoch": epoch + 1,
                f"loss/train_fold{fold}": avg_loss,
                f"loss/val_fold{fold}": avg_val_loss,
                f"accuracy/fold{fold}": score,
            }
        )

        if score > best_score:
            best_score = score
            LOGGER.info(f"Epoch {epoch+1} - Save Best Score: {best_score:.4f} Model")
            torch.save(model.module.state_dict(), OUTPUT_DIR + f"{config.model_name}_fold{fold}_best.pth")
            best_preds = preds

        if epoch == config.epochs - 1:
            LOGGER.info(f"Epoch {epoch+1} - Save final model")
            torch.save(model.module.state_dict(), OUTPUT_DIR + f"{config.model_name}_fold{fold}_final.pth")

    y_df_valid_folds[[str(c) for c in range(config.n_class)]] = best_preds
    y_df_valid_folds["preds"] = best_preds.argmax(1)

    return y_df_valid_folds

## Main


In [39]:
def main():
    if Config.train:
        # train
        oof_df = pd.DataFrame()
        for fold in range(config.n_fold):
            _oof_df = train_loop(folds, fold)
            oof_df = pd.concat([oof_df, _oof_df])
            LOGGER.info(f"========== fold: {fold} result ==========")
            get_result(_oof_df)
            break  # fold 1つだけ
        # CV result
        # LOGGER.info(f"========== CV ==========")
        # get_result(oof_df)
        # save result
        oof_df.to_csv(OUTPUT_DIR + "oof_df.csv", index=False)

In [40]:
if __name__ == "__main__":
    main()



Epoch: [1][0/1968] Elapsed 0m 15s (remain 505m 49s) Loss avg.: 2.1962 Grad: 9.8397 LR: 0.00100  
Epoch: [1][100/1968] Elapsed 0m 42s (remain 12m 56s) Loss avg.: 0.7710 Grad: 0.9601 LR: 0.00100  
Epoch: [1][200/1968] Elapsed 1m 8s (remain 10m 4s) Loss avg.: 0.6833 Grad: 1.0201 LR: 0.00100  
Epoch: [1][300/1968] Elapsed 1m 35s (remain 8m 47s) Loss avg.: 0.6441 Grad: 0.8660 LR: 0.00100  
Epoch: [1][400/1968] Elapsed 2m 1s (remain 7m 55s) Loss avg.: 0.6195 Grad: 0.6909 LR: 0.00100  
Epoch: [1][500/1968] Elapsed 2m 28s (remain 7m 14s) Loss avg.: 0.6018 Grad: 0.6553 LR: 0.00100  
Epoch: [1][600/1968] Elapsed 2m 54s (remain 6m 37s) Loss avg.: 0.5882 Grad: 0.7761 LR: 0.00100  
Epoch: [1][700/1968] Elapsed 3m 21s (remain 6m 3s) Loss avg.: 0.5773 Grad: 1.1069 LR: 0.00100  
Epoch: [1][800/1968] Elapsed 3m 47s (remain 5m 32s) Loss avg.: 0.5683 Grad: 0.7327 LR: 0.00100  
Epoch: [1][900/1968] Elapsed 4m 14s (remain 5m 1s) Loss avg.: 0.5607 Grad: 1.0381 LR: 0.00100  
Epoch: [1][1000/1968] Elapsed 4m 

Epoch 1 - avg_train_loss: 0.5194  avg_val_loss: 0.4761  time: 561s
Epoch 1 - Accuracy: 0.7922628571428572
Epoch 1 - Save Best Score: 0.7923 Model


Epoch: [2][0/1968] Elapsed 0m 3s (remain 111m 7s) Loss avg.: 0.4675 Grad: 0.6154 LR: 0.00098  
Epoch: [2][100/1968] Elapsed 0m 29s (remain 9m 12s) Loss avg.: 0.4765 Grad: 0.4552 LR: 0.00098  
Epoch: [2][200/1968] Elapsed 0m 56s (remain 8m 15s) Loss avg.: 0.4742 Grad: 0.5196 LR: 0.00098  
Epoch: [2][300/1968] Elapsed 1m 23s (remain 7m 39s) Loss avg.: 0.4725 Grad: 0.8034 LR: 0.00098  
Epoch: [2][400/1968] Elapsed 1m 49s (remain 7m 7s) Loss avg.: 0.4717 Grad: 0.5805 LR: 0.00098  
Epoch: [2][500/1968] Elapsed 2m 15s (remain 6m 38s) Loss avg.: 0.4717 Grad: 0.5163 LR: 0.00098  
Epoch: [2][600/1968] Elapsed 2m 42s (remain 6m 9s) Loss avg.: 0.4714 Grad: 0.6457 LR: 0.00098  
Epoch: [2][700/1968] Elapsed 3m 8s (remain 5m 41s) Loss avg.: 0.4711 Grad: 0.5220 LR: 0.00098  
Epoch: [2][800/1968] Elapsed 3m 35s (remain 5m 13s) Loss avg.: 0.4709 Grad: 0.4448 LR: 0.00098  
Epoch: [2][900/1968] Elapsed 4m 1s (remain 4m 46s) Loss avg.: 0.4703 Grad: 0.5505 LR: 0.00098  
Epoch: [2][1000/1968] Elapsed 4m 28s

Epoch 2 - avg_train_loss: 0.4668  avg_val_loss: 0.4612  time: 545s
Epoch 2 - Accuracy: 0.8005842857142857
Epoch 2 - Save Best Score: 0.8006 Model


Epoch: [3][0/1968] Elapsed 0m 3s (remain 107m 18s) Loss avg.: 0.4526 Grad: 0.4692 LR: 0.00091  
Epoch: [3][100/1968] Elapsed 0m 29s (remain 9m 10s) Loss avg.: 0.4586 Grad: 0.4687 LR: 0.00091  
Epoch: [3][200/1968] Elapsed 0m 56s (remain 8m 14s) Loss avg.: 0.4592 Grad: 0.4032 LR: 0.00091  
Epoch: [3][300/1968] Elapsed 1m 22s (remain 7m 37s) Loss avg.: 0.4589 Grad: 0.4455 LR: 0.00091  
Epoch: [3][400/1968] Elapsed 1m 49s (remain 7m 7s) Loss avg.: 0.4586 Grad: 0.5233 LR: 0.00091  
Epoch: [3][500/1968] Elapsed 2m 15s (remain 6m 37s) Loss avg.: 0.4583 Grad: 0.4416 LR: 0.00091  
Epoch: [3][600/1968] Elapsed 2m 42s (remain 6m 8s) Loss avg.: 0.4579 Grad: 0.3248 LR: 0.00091  
Epoch: [3][700/1968] Elapsed 3m 8s (remain 5m 40s) Loss avg.: 0.4580 Grad: 0.3881 LR: 0.00091  
Epoch: [3][800/1968] Elapsed 3m 34s (remain 5m 13s) Loss avg.: 0.4579 Grad: 0.4588 LR: 0.00091  
Epoch: [3][900/1968] Elapsed 4m 1s (remain 4m 45s) Loss avg.: 0.4576 Grad: 0.3900 LR: 0.00091  
Epoch: [3][1000/1968] Elapsed 4m 27

Epoch 3 - avg_train_loss: 0.4565  avg_val_loss: 0.4545  time: 548s
Epoch 3 - Accuracy: 0.8040342857142857
Epoch 3 - Save Best Score: 0.8040 Model


Epoch: [4][0/1968] Elapsed 0m 3s (remain 107m 2s) Loss avg.: 0.4583 Grad: 0.3894 LR: 0.00081  
Epoch: [4][100/1968] Elapsed 0m 29s (remain 9m 10s) Loss avg.: 0.4484 Grad: 0.4071 LR: 0.00081  
Epoch: [4][200/1968] Elapsed 0m 56s (remain 8m 14s) Loss avg.: 0.4491 Grad: 0.4154 LR: 0.00081  
Epoch: [4][300/1968] Elapsed 1m 22s (remain 7m 37s) Loss avg.: 0.4501 Grad: 0.3466 LR: 0.00081  
Epoch: [4][400/1968] Elapsed 1m 49s (remain 7m 6s) Loss avg.: 0.4505 Grad: 0.4586 LR: 0.00081  
Epoch: [4][500/1968] Elapsed 2m 15s (remain 6m 37s) Loss avg.: 0.4509 Grad: 0.4023 LR: 0.00081  
Epoch: [4][600/1968] Elapsed 2m 42s (remain 6m 9s) Loss avg.: 0.4509 Grad: 0.4791 LR: 0.00081  
Epoch: [4][700/1968] Elapsed 3m 8s (remain 5m 41s) Loss avg.: 0.4508 Grad: 0.3967 LR: 0.00081  
Epoch: [4][800/1968] Elapsed 3m 35s (remain 5m 13s) Loss avg.: 0.4507 Grad: 0.4641 LR: 0.00081  
Epoch: [4][900/1968] Elapsed 4m 1s (remain 4m 46s) Loss avg.: 0.4507 Grad: 0.5235 LR: 0.00081  
Epoch: [4][1000/1968] Elapsed 4m 28s

Epoch 4 - avg_train_loss: 0.4503  avg_val_loss: 0.4497  time: 547s
Epoch 4 - Accuracy: 0.8062457142857142
Epoch 4 - Save Best Score: 0.8062 Model


Epoch: [5][0/1968] Elapsed 0m 3s (remain 105m 49s) Loss avg.: 0.4378 Grad: 0.4155 LR: 0.00069  
Epoch: [5][100/1968] Elapsed 0m 29s (remain 9m 9s) Loss avg.: 0.4465 Grad: 0.4676 LR: 0.00069  
Epoch: [5][200/1968] Elapsed 0m 56s (remain 8m 13s) Loss avg.: 0.4461 Grad: 0.3043 LR: 0.00069  
Epoch: [5][300/1968] Elapsed 1m 22s (remain 7m 37s) Loss avg.: 0.4461 Grad: 0.3991 LR: 0.00069  
Epoch: [5][400/1968] Elapsed 1m 49s (remain 7m 5s) Loss avg.: 0.4462 Grad: 0.3505 LR: 0.00069  
Epoch: [5][500/1968] Elapsed 2m 15s (remain 6m 36s) Loss avg.: 0.4462 Grad: 0.4134 LR: 0.00069  
Epoch: [5][600/1968] Elapsed 2m 41s (remain 6m 8s) Loss avg.: 0.4459 Grad: 0.2863 LR: 0.00069  
Epoch: [5][700/1968] Elapsed 3m 8s (remain 5m 40s) Loss avg.: 0.4461 Grad: 0.3509 LR: 0.00069  
Epoch: [5][800/1968] Elapsed 3m 34s (remain 5m 13s) Loss avg.: 0.4460 Grad: 0.4121 LR: 0.00069  
Epoch: [5][900/1968] Elapsed 4m 1s (remain 4m 45s) Loss avg.: 0.4460 Grad: 0.3599 LR: 0.00069  
Epoch: [5][1000/1968] Elapsed 4m 27s

Epoch 5 - avg_train_loss: 0.4456  avg_val_loss: 0.4463  time: 548s
Epoch 5 - Accuracy: 0.8076942857142857
Epoch 5 - Save Best Score: 0.8077 Model


Epoch: [6][0/1968] Elapsed 0m 3s (remain 112m 39s) Loss avg.: 0.4205 Grad: 0.3866 LR: 0.00055  
Epoch: [6][100/1968] Elapsed 0m 29s (remain 9m 14s) Loss avg.: 0.4423 Grad: 0.2809 LR: 0.00055  
Epoch: [6][200/1968] Elapsed 0m 56s (remain 8m 17s) Loss avg.: 0.4420 Grad: 0.2768 LR: 0.00055  
Epoch: [6][300/1968] Elapsed 1m 22s (remain 7m 39s) Loss avg.: 0.4415 Grad: 0.3033 LR: 0.00055  
Epoch: [6][400/1968] Elapsed 1m 49s (remain 7m 7s) Loss avg.: 0.4421 Grad: 0.3088 LR: 0.00055  
Epoch: [6][500/1968] Elapsed 2m 15s (remain 6m 37s) Loss avg.: 0.4417 Grad: 0.3653 LR: 0.00055  
Epoch: [6][600/1968] Elapsed 2m 42s (remain 6m 9s) Loss avg.: 0.4416 Grad: 0.3650 LR: 0.00055  
Epoch: [6][700/1968] Elapsed 3m 8s (remain 5m 41s) Loss avg.: 0.4415 Grad: 0.3072 LR: 0.00055  
Epoch: [6][800/1968] Elapsed 3m 35s (remain 5m 13s) Loss avg.: 0.4415 Grad: 0.3170 LR: 0.00055  
Epoch: [6][900/1968] Elapsed 4m 1s (remain 4m 46s) Loss avg.: 0.4415 Grad: 0.3175 LR: 0.00055  
Epoch: [6][1000/1968] Elapsed 4m 28

Epoch 6 - avg_train_loss: 0.4416  avg_val_loss: 0.4433  time: 546s
Epoch 6 - Accuracy: 0.8095485714285714
Epoch 6 - Save Best Score: 0.8095 Model


Epoch: [7][0/1968] Elapsed 0m 3s (remain 107m 27s) Loss avg.: 0.4402 Grad: 0.3083 LR: 0.00041  
Epoch: [7][100/1968] Elapsed 0m 29s (remain 9m 10s) Loss avg.: 0.4353 Grad: 0.3575 LR: 0.00041  
Epoch: [7][200/1968] Elapsed 0m 56s (remain 8m 14s) Loss avg.: 0.4377 Grad: 0.2923 LR: 0.00041  
Epoch: [7][300/1968] Elapsed 1m 22s (remain 7m 37s) Loss avg.: 0.4377 Grad: 0.2580 LR: 0.00041  
Epoch: [7][400/1968] Elapsed 1m 49s (remain 7m 7s) Loss avg.: 0.4381 Grad: 0.3440 LR: 0.00041  
Epoch: [7][500/1968] Elapsed 2m 15s (remain 6m 37s) Loss avg.: 0.4384 Grad: 0.3243 LR: 0.00041  
Epoch: [7][600/1968] Elapsed 2m 42s (remain 6m 8s) Loss avg.: 0.4386 Grad: 0.3586 LR: 0.00041  
Epoch: [7][700/1968] Elapsed 3m 8s (remain 5m 40s) Loss avg.: 0.4385 Grad: 0.2560 LR: 0.00041  
Epoch: [7][800/1968] Elapsed 3m 34s (remain 5m 13s) Loss avg.: 0.4384 Grad: 0.2477 LR: 0.00041  
Epoch: [7][900/1968] Elapsed 4m 1s (remain 4m 45s) Loss avg.: 0.4383 Grad: 0.3425 LR: 0.00041  
Epoch: [7][1000/1968] Elapsed 4m 27

Epoch 7 - avg_train_loss: 0.4382  avg_val_loss: 0.4419  time: 547s
Epoch 7 - Accuracy: 0.8100685714285715
Epoch 7 - Save Best Score: 0.8101 Model


Epoch: [8][0/1968] Elapsed 0m 3s (remain 107m 42s) Loss avg.: 0.4275 Grad: 0.4027 LR: 0.00029  
Epoch: [8][100/1968] Elapsed 0m 29s (remain 9m 11s) Loss avg.: 0.4342 Grad: 0.2957 LR: 0.00029  
Epoch: [8][200/1968] Elapsed 0m 56s (remain 8m 15s) Loss avg.: 0.4350 Grad: 0.2952 LR: 0.00029  
Epoch: [8][300/1968] Elapsed 1m 22s (remain 7m 38s) Loss avg.: 0.4350 Grad: 0.3211 LR: 0.00029  
Epoch: [8][400/1968] Elapsed 1m 49s (remain 7m 7s) Loss avg.: 0.4349 Grad: 0.3159 LR: 0.00029  
Epoch: [8][500/1968] Elapsed 2m 16s (remain 6m 38s) Loss avg.: 0.4350 Grad: 0.2572 LR: 0.00029  
Epoch: [8][600/1968] Elapsed 2m 42s (remain 6m 9s) Loss avg.: 0.4352 Grad: 0.2688 LR: 0.00029  
Epoch: [8][700/1968] Elapsed 3m 9s (remain 5m 41s) Loss avg.: 0.4351 Grad: 0.3113 LR: 0.00029  
Epoch: [8][800/1968] Elapsed 3m 35s (remain 5m 13s) Loss avg.: 0.4348 Grad: 0.2620 LR: 0.00029  
Epoch: [8][900/1968] Elapsed 4m 1s (remain 4m 46s) Loss avg.: 0.4350 Grad: 0.2572 LR: 0.00029  
Epoch: [8][1000/1968] Elapsed 4m 28

Epoch 8 - avg_train_loss: 0.4351  avg_val_loss: 0.4397  time: 546s
Epoch 8 - Accuracy: 0.8108528571428572
Epoch 8 - Save Best Score: 0.8109 Model


Epoch: [9][0/1968] Elapsed 0m 3s (remain 106m 20s) Loss avg.: 0.4297 Grad: 0.3226 LR: 0.00019  
Epoch: [9][100/1968] Elapsed 0m 29s (remain 9m 8s) Loss avg.: 0.4326 Grad: 0.3346 LR: 0.00019  
Epoch: [9][200/1968] Elapsed 0m 56s (remain 8m 13s) Loss avg.: 0.4326 Grad: 0.4335 LR: 0.00019  
Epoch: [9][300/1968] Elapsed 1m 22s (remain 7m 37s) Loss avg.: 0.4321 Grad: 0.3389 LR: 0.00019  
Epoch: [9][400/1968] Elapsed 1m 48s (remain 7m 5s) Loss avg.: 0.4320 Grad: 0.2708 LR: 0.00019  
Epoch: [9][500/1968] Elapsed 2m 15s (remain 6m 36s) Loss avg.: 0.4322 Grad: 0.2996 LR: 0.00019  
Epoch: [9][600/1968] Elapsed 2m 41s (remain 6m 8s) Loss avg.: 0.4323 Grad: 0.2491 LR: 0.00019  
Epoch: [9][700/1968] Elapsed 3m 8s (remain 5m 40s) Loss avg.: 0.4325 Grad: 0.3373 LR: 0.00019  
Epoch: [9][800/1968] Elapsed 3m 34s (remain 5m 13s) Loss avg.: 0.4324 Grad: 0.3704 LR: 0.00019  
Epoch: [9][900/1968] Elapsed 4m 1s (remain 4m 45s) Loss avg.: 0.4325 Grad: 0.3216 LR: 0.00019  
Epoch: [9][1000/1968] Elapsed 4m 27s

Epoch 9 - avg_train_loss: 0.4326  avg_val_loss: 0.4386  time: 547s
Epoch 9 - Accuracy: 0.8118614285714286
Epoch 9 - Save Best Score: 0.8119 Model


Epoch: [10][0/1968] Elapsed 0m 3s (remain 111m 41s) Loss avg.: 0.4261 Grad: 0.3115 LR: 0.00012  
Epoch: [10][100/1968] Elapsed 0m 29s (remain 9m 13s) Loss avg.: 0.4294 Grad: 0.3164 LR: 0.00012  
Epoch: [10][200/1968] Elapsed 0m 56s (remain 8m 17s) Loss avg.: 0.4287 Grad: 0.3461 LR: 0.00012  
Epoch: [10][300/1968] Elapsed 1m 22s (remain 7m 39s) Loss avg.: 0.4294 Grad: 0.2783 LR: 0.00012  
Epoch: [10][400/1968] Elapsed 1m 49s (remain 7m 7s) Loss avg.: 0.4296 Grad: 0.3294 LR: 0.00012  
Epoch: [10][500/1968] Elapsed 2m 15s (remain 6m 37s) Loss avg.: 0.4301 Grad: 0.2979 LR: 0.00012  
Epoch: [10][600/1968] Elapsed 2m 42s (remain 6m 8s) Loss avg.: 0.4301 Grad: 0.3032 LR: 0.00012  
Epoch: [10][700/1968] Elapsed 3m 8s (remain 5m 40s) Loss avg.: 0.4300 Grad: 0.3220 LR: 0.00012  
Epoch: [10][800/1968] Elapsed 3m 35s (remain 5m 13s) Loss avg.: 0.4301 Grad: 0.3456 LR: 0.00012  
Epoch: [10][900/1968] Elapsed 4m 1s (remain 4m 46s) Loss avg.: 0.4301 Grad: 0.3345 LR: 0.00012  
Epoch: [10][1000/1968] El

Epoch 10 - avg_train_loss: 0.4308  avg_val_loss: 0.4377  time: 546s
Epoch 10 - Accuracy: 0.8120214285714286
Epoch 10 - Save Best Score: 0.8120 Model
Epoch 10 - Save final model
Score: 0.81202
