# About this notebook ...

## Library

In [1]:
import glob
import json
import math
import os
import random
import time
import warnings
from contextlib import contextmanager

import numpy as np
import optuna
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedKFold
from torch.optim import SGD, Adam
from torch.optim.lr_scheduler import CosineAnnealingLR, CosineAnnealingWarmRestarts, ReduceLROnPlateau
from torch.utils.data import DataLoader, Dataset
from tqdm.notebook import tqdm

In [2]:
warnings.filterwarnings("ignore")

## Load Data

In [3]:
BASE_DIR = "../input/hungrygeeseepisode/hungry-geese-episode/"
OUTPUT_DIR = "pre-models/"

if not os.path.exists(OUTPUT_DIR):
    os.makedirs(OUTPUT_DIR)

In [4]:
paths = [path for path in glob.glob(BASE_DIR + "*.json") if "info" not in path]
print(len(paths))

16521


## Config

In [5]:
class Config:
    seed = 440

    n_class = 4
    n_fold = 10

    geese_net_layers = 12
    geese_net_filters = 128

    gradient_accumulation_steps = 1
    max_grad_norm = 1000

    num_workers = 4
    batch_size = 3200

    scheduler = "CosineAnnealingWarmRestarts"
    # factor = 0.2  # ReduceLROnPlateau
    # patience = 4  # ReduceLROnPlateau
    # eps = 1e-6  # ReduceLROnPlateau
    # T_max = 10  # CosineAnnealingLR
    T_0 = 12  # CosineAnnealingWarmRestarts

    criterion = "CrossEntropyLoss"
    lr = 1e-3
    min_lr = 1e-4
    weight_decay = 0

    epochs = 12
    model_name = "geese_net"

    print_freq = 100

    train = True
    tuning = False
    debug = False
    apex = False

In [6]:
if Config.tuning:
    Config.epochs = 2

if Config.debug:
    Config.epochs = 1

In [7]:
if Config.apex:
    from apex import amp

In [8]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

## Utils

In [9]:
@contextmanager
def timer(name):
    t0 = time.time()
    LOGGER.info(f"[{name}] start")
    yield
    LOGGER.info(f"[{name}] done in {time.time() - t0:.0f} s.")


def init_logger(log_file=OUTPUT_DIR + "train.log"):
    from logging import INFO, FileHandler, Formatter, StreamHandler, getLogger

    logger = getLogger(__name__)
    logger.setLevel(INFO)
    handler1 = StreamHandler()
    handler1.setFormatter(Formatter("%(message)s"))
    handler2 = FileHandler(filename=log_file)
    handler2.setFormatter(Formatter("%(message)s"))
    logger.addHandler(handler1)
    logger.addHandler(handler2)
    return logger


LOGGER = init_logger()


def seed_torch(seed=42):
    random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True


seed_torch(seed=Config.seed)

In [10]:
def reverse_ns(y):
    if y == 0:
        return 1
    if y == 1:
        return 0
    return y


def reverse_we(y):
    if y == 2:
        return 3
    if y == 3:
        return 2
    return y


def reverse_nswe(y):
    return reverse_ns(reverse_we(y))

## Observation

In [11]:
def make_input(obses):
    b = np.zeros((17, 7 * 11), dtype=np.float32)
    obs = obses[-1]

    for p, pos_list in enumerate(obs["geese"]):
        pid = (p - obs["index"]) % 4

        # head position
        for pos in pos_list[:1]:
            b[0 + pid, pos] = 1
        # tip position
        for pos in pos_list[-1:]:
            b[4 + pid, pos] = 1
        # whole position
        for pos in pos_list:
            b[8 + pid, pos] = 1

    # previous head position
    if len(obses) > 1:
        obs_prev = obses[-2]
        for p, pos_list in enumerate(obs_prev["geese"]):
            for pos in pos_list[:1]:
                b[12 + (p - obs["index"]) % 4, pos] = 1

    # food
    for pos in obs["food"]:
        b[16, pos] = 1

    return b.reshape(-1, 7, 11)

In [12]:
def observation_num_step(obses):
    b = np.zeros((7, 11), dtype=np.float32)
    obs = obses[-1]

    b[0, 0] = obs["step"]  # 0-198

    return b.reshape(1, 7, 11)

## Data

In [13]:
def create_dataset_from_json(filepath, json_object=None, standing=0):
    if json_object is None:
        json_open = open(path, "r")
        json_load = json.load(json_open)
    else:
        json_load = json_object

    try:
        winner_index = np.argmax(np.argsort(json_load["rewards"]) == 3 - standing)

        obses = []
        X = []
        y = []
        actions = {"NORTH": 0, "SOUTH": 1, "WEST": 2, "EAST": 3}

        for i in range(len(json_load["steps"]) - 1):
            if json_load["steps"][i][winner_index]["status"] == "ACTIVE":
                y_ = json_load["steps"][i + 1][winner_index]["action"]
                if y_ is not None:
                    step = json_load["steps"][i]
                    step[winner_index]["observation"]["geese"] = step[0]["observation"]["geese"]
                    step[winner_index]["observation"]["food"] = step[0]["observation"]["food"]
                    step[winner_index]["observation"]["step"] = step[0]["observation"]["step"]
                    obses.append(step[winner_index]["observation"])
                    y.append(actions[y_])

                    y.append(reverse_ns(actions[y_]))  # 上下反転
                    y.append(reverse_we(actions[y_]))  # 左右反転
                    y.append(reverse_nswe(actions[y_]))  # 上下左右反転

        for j in range(len(obses)):
            # X_ = make_input(obses[: j + 1])

            X_ = []
            X_.append(make_input(obses[: j + 1]))
            # X_.append(observation_num_step(obses[: j + 1]))
            X_ = np.concatenate(X_)

            X.append(X_)

            X.append(X_[:, ::-1, :])  # 上下反転
            X.append(X_[:, :, ::-1])  # 左右反転
            X.append(X_[:, ::-1, ::-1])  # 上下左右反転

        X = np.array(X, dtype=np.uint8)  # [starting_step:]
        y = np.array(y, dtype=np.uint8)  # [starting_step:]

        return X, y
    except:
        return 0, 0

In [14]:
X_train = []
y_train = []

for path in tqdm(paths[: int(len(paths))]):
    X, y = create_dataset_from_json(path, standing=0)  # use only winners' moves
    if X is not 0:
        X_train.append(X)
        y_train.append(y)

X_train = np.concatenate(X_train)
y_train = np.concatenate(y_train)

print(f"Num episode: {len(X_train)}")

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=16521.0), HTML(value='')))


Num episode: 10343040


In [15]:
# %%time

# X_train, unique_index = np.unique(X_train, axis=0, return_index=True)  # remove duplicate
# y_train = y_train[unique_index]

# y_train = np.eye(4, dtype="uint8")[y_train]  # to categorical

# print(f"Num episode: {len(X_train)}")

In [16]:
X_train_sum_obs = X_train.reshape(X_train.shape[0], -1).sum(1)
X_train_group = np.unique(X_train_sum_obs)
X_train_group.shape

(75,)

In [17]:
X_train_unique = []
y_train_unique = []
for group in tqdm(X_train_group):
    group_index = np.where(X_train_sum_obs == group)

    X_train_ = X_train[group_index]
    y_train_ = y_train[group_index]

    X_train_, unique_index = np.unique(X_train_, axis=0, return_index=True)  # remove duplicate
    y_train_ = y_train_[unique_index]

    X_train_unique.append(X_train_)
    y_train_unique.append(y_train_)

X_train = np.concatenate(X_train_unique)
y_train = np.concatenate(y_train_unique)

print(f"Num episode: {len(X_train)}")

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=75.0), HTML(value='')))


Num episode: 10341154


In [18]:
del X_train_sum_obs
del X_train_group
del X_train_unique
del y_train_unique
del X_train_
del y_train_
del group_index
del unique_index

In [19]:
X_train = X_train.astype(np.float32)
X_train.dtype

dtype('float32')

In [20]:
if Config.debug:
    X_train = X_train[:1000]
    y_train = y_train[:1000]

In [21]:
y_df = pd.DataFrame(y_train, dtype=np.uint8)
y_df.columns = ["action"]
y_df

Unnamed: 0,action
0,2
1,3
2,1
3,2
4,0
...,...
10341149,3
10341150,3
10341151,2
10341152,2


## CV Split

In [22]:
folds = y_df.copy()
Fold = StratifiedKFold(n_splits=Config.n_fold, shuffle=True, random_state=Config.seed)
for n, (train_index, val_index) in enumerate(Fold.split(folds, folds["action"])):
    folds.loc[val_index, "fold"] = int(n)
folds["fold"] = folds["fold"].astype(np.uint8)
print(folds.groupby(["fold", "action"]).size())

fold  action
0     0         246342
      1         246342
      2         270716
      3         270716
1     0         246342
      1         246342
      2         270716
      3         270716
2     0         246342
      1         246342
      2         270716
      3         270716
3     0         246342
      1         246342
      2         270716
      3         270716
4     0         246342
      1         246342
      2         270716
      3         270715
5     0         246342
      1         246342
      2         270716
      3         270715
6     0         246342
      1         246342
      2         270716
      3         270715
7     0         246342
      1         246342
      2         270716
      3         270715
8     0         246342
      1         246342
      2         270715
      3         270716
9     0         246342
      1         246342
      2         270715
      3         270716
dtype: int64


## Dataset

In [23]:
class TrainDataset(Dataset):
    def __init__(self, array, label):
        self.array = array
        self.label = label

    def __len__(self):
        return self.array.shape[0]

    def __getitem__(self, idx):
        return self.array[idx], torch.tensor(self.label[idx]).long()


class TestDataset(Dataset):
    def __init__(self, array):
        self.array = array

    def __len__(self):
        return self.array.shape[0]

    def __getitem__(self, idx):
        return self.array[idx]

In [24]:
# Test

if Config.debug or False:
    train_ds = TrainDataset(X_train, y_train)

    for i in range(1):
        obs, action = train_ds[i]
        print(obs.shape, action)

## Model

In [25]:
class TorusConv2d(nn.Module):
    def __init__(self, input_dim, output_dim, kernel_size, bn):
        super().__init__()
        self.edge_size = (kernel_size[0] // 2, kernel_size[1] // 2)
        self.conv = nn.Conv2d(input_dim, output_dim, kernel_size=kernel_size)
        self.bn = nn.BatchNorm2d(output_dim) if bn else None

    def forward(self, x):
        h = torch.cat([x[:, :, :, -self.edge_size[1] :], x, x[:, :, :, : self.edge_size[1]]], dim=3)
        h = torch.cat([h[:, :, -self.edge_size[0] :], h, h[:, :, : self.edge_size[0]]], dim=2)
        h = self.conv(h)
        h = self.bn(h) if self.bn is not None else h
        return h

In [26]:
class GeeseNet(nn.Module):
    def __init__(self):
        super().__init__()
        layers, filters = 12, 32
        self.conv0 = TorusConv2d(17, filters, (3, 3), True)
        self.blocks = nn.ModuleList([TorusConv2d(filters, filters, (3, 3), True) for _ in range(layers)])

        self.conv_p = TorusConv2d(filters, filters, (3, 3), True)
        self.conv_v = TorusConv2d(filters, filters, (3, 3), True)

        self.head_p = nn.Linear(filters, 4, bias=False)
        self.head_v1 = nn.Linear(filters * 2, filters, bias=False)
        self.head_v2 = nn.Linear(filters, 1, bias=False)

    def forward(self, x, _=None):
        h = F.relu_(self.conv0(x))
        for block in self.blocks:
            h = F.relu_(h + block(h))

        h_p = F.relu_(self.conv_p(h))
        h_head_p = (h_p * x[:, :1]).view(h_p.size(0), h_p.size(1), -1).sum(-1)
        p = self.head_p(h_head_p)

        h_v = F.relu_(self.conv_v(h))
        h_head_v = (h_v * x[:, :1]).view(h_v.size(0), h_v.size(1), -1).sum(-1)
        h_avg_v = h_v.view(h_v.size(0), h_v.size(1), -1).mean(-1)

        h_v = F.relu_(self.head_v1(torch.cat([h_head_v, h_avg_v], 1)))
        v = torch.tanh(self.head_v2(h_v))

        return {"policy": p, "value": v}

In [27]:
class GeeseNetAlpha(nn.Module):
    def __init__(self):
        super().__init__()

        layers = Config.geese_net_layers
        filters = Config.geese_net_filters

        self.conv0 = TorusConv2d(17, filters, (3, 3), True)
        self.blocks = nn.ModuleList([TorusConv2d(filters, filters, (3, 3), True) for _ in range(layers)])

        self.conv_p = TorusConv2d(filters, filters, (3, 3), True)
        self.conv_v = TorusConv2d(filters, filters, (3, 3), True)

        self.head_p1 = nn.Linear(filters * 5 + 77, filters * 3, bias=False)
        self.head_p2 = nn.Linear(filters * 3, 4, bias=False)
        self.head_v1 = nn.Linear(filters * 5 + 77, filters * 3, bias=False)
        self.head_v2 = nn.Linear(filters * 3, 1, bias=False)

    def forward(self, x, _=None):
        h = F.relu_(self.conv0(x))
        for block in self.blocks:
            h = F.relu_(h + block(h))

        h_p = F.relu_(self.conv_p(h))
        h_head_p = (h_p * x[:, :1]).view(h_p.size(0), h_p.size(1), -1).sum(-1)
        h_head_p2 = (h_p * x[:, 1:2]).view(h_p.size(0), h_p.size(1), -1).sum(-1)
        h_head_p3 = (h_p * x[:, 2:3]).view(h_p.size(0), h_p.size(1), -1).sum(-1)
        h_head_p4 = (h_p * x[:, 3:4]).view(h_p.size(0), h_p.size(1), -1).sum(-1)
        h_avg_p1 = h_p.view(h_p.size(0), h_p.size(1), -1).mean(-1)
        h_avg_p2 = h_p.view(h_p.size(0), h_p.size(1), -1).mean(1)

        h_p = F.relu_(self.head_p1(torch.cat([h_head_p, h_head_p2, h_head_p3, h_head_p4, h_avg_p1, h_avg_p2], 1)))
        p = self.head_p2(h_p)

        h_v = F.relu_(self.conv_v(h))
        h_head_v = (h_v * x[:, :1]).view(h_v.size(0), h_v.size(1), -1).sum(-1)
        h_head_v2 = (h_v * x[:, 1:2]).view(h_v.size(0), h_v.size(1), -1).sum(-1)
        h_head_v3 = (h_v * x[:, 2:3]).view(h_v.size(0), h_v.size(1), -1).sum(-1)
        h_head_v4 = (h_v * x[:, 3:4]).view(h_v.size(0), h_v.size(1), -1).sum(-1)
        h_avg_v1 = h_v.view(h_v.size(0), h_v.size(1), -1).mean(-1)
        h_avg_v2 = h_v.view(h_v.size(0), h_v.size(1), -1).mean(1)

        h_v = F.relu_(self.head_v1(torch.cat([h_head_v, h_head_v2, h_head_v3, h_head_v4, h_avg_v1, h_avg_v2], 1)))
        v = torch.tanh(self.head_v2(h_v))

        return {"policy": p, "value": v}

In [28]:
# Test

if Config.debug or False:
    model = GeeseNetAlpha()
    # print(model)

    params = sum(p.numel() for p in model.parameters())
    print(f"params: {params:,}")

    train_ds = TrainDataset(X_train, y_train)
    train_loader = DataLoader(train_ds, batch_size=4, shuffle=True, num_workers=4, pin_memory=True, drop_last=True)

    for obs, action in train_loader:
        output = model(obs)
        print(output)
        print(f"{torch.argmax(output['policy'], dim=1)}")
        break

## Loss

## Scoring

In [29]:
def get_score(y_true, y_pred):
    return accuracy_score(y_true, y_pred)

In [30]:
def get_result(result_df):
    preds = result_df["preds"].values
    labels = result_df["action"].values
    score = get_score(labels, preds)
    LOGGER.info(f"Score: {score:<.5f}")
    return score

## Helper functions

In [31]:
class AverageMeter(object):
    """Computes and stores the average and current value"""

    def __init__(self):
        self.reset()

    def reset(self):
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count


def asMinutes(s):
    m = math.floor(s / 60)
    s -= m * 60
    return "%dm %ds" % (m, s)


def timeSince(since, percent):
    now = time.time()
    s = now - since
    es = s / (percent)
    rs = es - s
    return "%s (remain %s)" % (asMinutes(s), asMinutes(rs))

In [32]:
def train_fn(train_loader, model, criterion, optimizer, epoch, scheduler, device):
    batch_time = AverageMeter()
    data_time = AverageMeter()
    losses = AverageMeter()
    scores = AverageMeter()

    # switch to train mode
    model.train()
    start = end = time.time()
    global_step = 0

    for step, (obs, action) in enumerate(train_loader):
        # measure data loading time
        data_time.update(time.time() - end)
        obs = obs.to(device)
        action = action.to(device)
        batch_size = action.size(0)

        y_preds = model(obs.float())["policy"]

        loss = criterion(y_preds, action)

        # record loss
        losses.update(loss.item(), batch_size)
        if Config.gradient_accumulation_steps > 1:
            loss = loss / Config.gradient_accumulation_steps
        if Config.apex:
            with amp.scale_loss(loss, optimizer) as scaled_loss:
                scaled_loss.backward()
        else:
            loss.backward()

        grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), Config.max_grad_norm)

        if (step + 1) % Config.gradient_accumulation_steps == 0:
            optimizer.step()
            optimizer.zero_grad()
            global_step += 1

        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()

        if step % Config.print_freq == 0 or step == (len(train_loader) - 1):
            print(
                f"Epoch: [{epoch + 1}][{step}/{len(train_loader)}] "
                f"Elapsed {timeSince(start, float(step + 1) / len(train_loader)):s} "
                f"Loss: {losses.val:.4f}({losses.avg:.4f}) "
                f"Grad: {grad_norm:.4f} "
                f"LR: {scheduler.get_last_lr()[0]:.6f}  "
            )

    return losses.avg

In [33]:
def valid_fn(valid_loader, model, criterion, device):
    batch_time = AverageMeter()
    data_time = AverageMeter()
    losses = AverageMeter()
    scores = AverageMeter()

    # switch to evaluation mode
    model.eval()
    preds = []
    start = end = time.time()

    for step, (obs, action) in enumerate(valid_loader):
        # measure data loading time
        data_time.update(time.time() - end)
        obs = obs.to(device)
        action = action.to(device)
        batch_size = action.size(0)

        # compute loss
        with torch.no_grad():
            y_preds = model(obs)["policy"]

        loss = criterion(y_preds, action)
        losses.update(loss.item(), batch_size)

        # record accuracy
        preds.append(y_preds.softmax(1).to("cpu").numpy())
        if Config.gradient_accumulation_steps > 1:
            loss = loss / Config.gradient_accumulation_steps

        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()

        if step % Config.print_freq == 0 or step == (len(valid_loader) - 1):
            print(
                f"EVAL: [{step}/{len(valid_loader)}] "
                f"Elapsed {timeSince(start, float(step + 1) / len(valid_loader)):s} "
                f"Loss: {losses.val:.4f}({losses.avg:.4f}) "
            )
    predictions = np.concatenate(preds)
    return losses.avg, predictions

## Train loop

In [34]:
def train_loop(folds, fold):

    LOGGER.info(f"========== fold: {fold} training ==========")

    # ====================================================
    # Data Loader
    # ====================================================
    # X_train_folds = X_train[folds["fold"] != fold]
    # X_valid_folds = X_train[folds["fold"] == fold]

    # y_train_folds = y_train[folds["fold"] != fold]
    y_valid_folds = y_train[folds["fold"] == fold]

    # y_df_train_folds = y_df[folds["fold"] != fold]
    y_df_valid_folds = y_df[folds["fold"] == fold]

    # train_dataset = TrainDataset(X_train[folds["fold"] != fold], y_train[folds["fold"] != fold])
    # valid_dataset = TrainDataset(X_train[folds["fold"] == fold], y_valid_folds)

    train_loader = DataLoader(
        TrainDataset(X_train[folds["fold"] != fold], y_train[folds["fold"] != fold]),
        batch_size=Config.batch_size,
        shuffle=True,
        num_workers=Config.num_workers,
        pin_memory=True,
        drop_last=True,
    )
    valid_loader = DataLoader(
        TrainDataset(X_train[folds["fold"] == fold], y_valid_folds),
        batch_size=Config.batch_size,
        shuffle=False,
        num_workers=Config.num_workers,
        pin_memory=True,
        drop_last=False,
    )

    # ====================================================
    # Scheduler
    # ====================================================
    def get_scheduler(optimizer):
        if Config.scheduler == "ReduceLROnPlateau":
            scheduler = ReduceLROnPlateau(
                optimizer, mode="min", factor=Config.factor, patience=Config.patience, verbose=True, eps=Config.eps
            )
        elif Config.scheduler == "CosineAnnealingLR":
            scheduler = CosineAnnealingLR(optimizer, T_max=Config.T_max, eta_min=Config.min_lr, last_epoch=-1)
        elif Config.scheduler == "CosineAnnealingWarmRestarts":
            scheduler = CosineAnnealingWarmRestarts(
                optimizer, T_0=Config.T_0, T_mult=1, eta_min=Config.min_lr, last_epoch=-1
            )
        return scheduler

    # ====================================================
    # model & optimizer
    # ====================================================
    model = GeeseNetAlpha()

    # Disable training for value network
    for param in model.conv_v.parameters():
        param.requires_grad = False
    for param in model.head_v1.parameters():
        param.requires_grad = False
    for param in model.head_v2.parameters():
        param.requires_grad = False

    model.to(device)

    # Use multi GPU
    if device == torch.device("cuda") and not Config.apex:
        model = torch.nn.DataParallel(model)  # make parallel
        # torch.backends.cudnn.benchmark=True

    optimizer = Adam(model.parameters(), lr=Config.lr, weight_decay=Config.weight_decay, amsgrad=False)
    scheduler = get_scheduler(optimizer)

    # ====================================================
    # apex
    # ====================================================
    if Config.apex:
        model, optimizer = amp.initialize(model, optimizer, opt_level="O1", verbosity=0)

    # ====================================================
    # Criterion
    # ====================================================
    def get_criterion():
        if Config.criterion == "CrossEntropyLoss":
            criterion = nn.CrossEntropyLoss()
        return criterion

    criterion = get_criterion()

    # ====================================================
    # loop
    # ====================================================
    best_score = 0.0
    best_loss = np.inf
    best_preds = None

    for epoch in range(Config.epochs):

        start_time = time.time()

        # train
        avg_loss = train_fn(train_loader, model, criterion, optimizer, epoch, scheduler, device)

        # eval
        avg_val_loss, preds = valid_fn(valid_loader, model, criterion, device)

        if isinstance(scheduler, ReduceLROnPlateau):
            scheduler.step(avg_val_loss)
        elif isinstance(scheduler, CosineAnnealingLR):
            scheduler.step()
        elif isinstance(scheduler, CosineAnnealingWarmRestarts):
            scheduler.step()

        # scoring
        score = get_score(y_valid_folds, preds.argmax(1))

        elapsed = time.time() - start_time

        LOGGER.info(
            f"Epoch {epoch+1} - avg_train_loss: {avg_loss:.4f}  avg_val_loss: {avg_val_loss:.4f}  time: {elapsed:.0f}s"
        )
        LOGGER.info(f"Epoch {epoch+1} - Accuracy: {score}")

        if score > best_score:
            best_score = score
            LOGGER.info(f"Epoch {epoch+1} - Save Best Score: {best_score:.4f} Model")
            torch.save(model.module.state_dict(), OUTPUT_DIR + f"{Config.model_name}_fold{fold}_best.pth")
            best_preds = preds

        if epoch == Config.epochs - 1:
            LOGGER.info(f"Epoch {epoch+1} - Save final model")
            torch.save(model.module.state_dict(), OUTPUT_DIR + f"{Config.model_name}_fold{fold}_final.pth")

    if Config.train:
        y_df_valid_folds[[str(c) for c in range(Config.n_class)]] = best_preds
        y_df_valid_folds["preds"] = best_preds.argmax(1)

        return y_df_valid_folds

    if Config.tuning:
        score = get_score(y_df_valid_folds["action"].values, best_preds.argmax(1))
        return score

In [35]:
def objective(trial):
    Config.geese_net_layers = trial.suggest_int("layers", 6, 18)
    Config.geese_net_filters = trial.suggest_int("filters", 32, 128)

    score = train_loop(folds, 0)
    return score

## Main


In [36]:
def main():
    if Config.train:
        # train
        oof_df = pd.DataFrame()
        for fold in range(Config.n_fold):
            _oof_df = train_loop(folds, fold)
            oof_df = pd.concat([oof_df, _oof_df])
            LOGGER.info(f"========== fold: {fold} result ==========")
            get_result(_oof_df)
            break  # fold 1つだけ
        # CV result
        # LOGGER.info(f"========== CV ==========")
        # get_result(oof_df)
        # save result
        oof_df.to_csv(OUTPUT_DIR + "oof_df.csv", index=False)

    if Config.tuning:
        study = optuna.create_study(direction="maximize")
        study.optimize(objective, n_trials=10)

        trial = study.best_trial
        print("Best trial:")
        print("  Value: ", trial.value)
        print("  Params: ")
        for key, value in trial.params.items():
            print("    {}: {}".format(key, value))

In [None]:
if __name__ == "__main__":
    main()



Epoch: [1][0/2908] Elapsed 0m 6s (remain 298m 45s) Loss: 1.3930(1.3930) Grad: 0.5802 LR: 0.001000  
Epoch: [1][100/2908] Elapsed 2m 53s (remain 80m 35s) Loss: 0.5810(0.6809) Grad: 0.5089 LR: 0.001000  
Epoch: [1][200/2908] Elapsed 5m 41s (remain 76m 39s) Loss: 0.5348(0.6211) Grad: 0.5582 LR: 0.001000  
Epoch: [1][300/2908] Elapsed 8m 29s (remain 73m 31s) Loss: 0.5341(0.5915) Grad: 0.4694 LR: 0.001000  
Epoch: [1][400/2908] Elapsed 11m 16s (remain 70m 31s) Loss: 0.5097(0.5728) Grad: 0.3969 LR: 0.001000  
Epoch: [1][500/2908] Elapsed 14m 4s (remain 67m 36s) Loss: 0.4956(0.5597) Grad: 0.5190 LR: 0.001000  
Epoch: [1][600/2908] Elapsed 16m 51s (remain 64m 44s) Loss: 0.5061(0.5502) Grad: 0.4199 LR: 0.001000  
Epoch: [1][700/2908] Elapsed 19m 39s (remain 61m 53s) Loss: 0.5155(0.5423) Grad: 0.4302 LR: 0.001000  
Epoch: [1][800/2908] Elapsed 22m 27s (remain 59m 3s) Loss: 0.4812(0.5363) Grad: 0.2754 LR: 0.001000  
Epoch: [1][900/2908] Elapsed 25m 14s (remain 56m 13s) Loss: 0.4938(0.5313) Grad: 

Epoch 1 - avg_train_loss: 0.4930  avg_val_loss: 0.4699  time: 5039s
Epoch 1 - Accuracy: 0.7957443845758116
Epoch 1 - Save Best Score: 0.7957 Model


Epoch: [2][0/2908] Elapsed 0m 3s (remain 176m 48s) Loss: 0.4781(0.4781) Grad: 0.2365 LR: 0.000985  
Epoch: [2][100/2908] Elapsed 2m 51s (remain 79m 19s) Loss: 0.4480(0.4652) Grad: 0.2247 LR: 0.000985  
Epoch: [2][200/2908] Elapsed 5m 39s (remain 76m 5s) Loss: 0.4491(0.4650) Grad: 0.2159 LR: 0.000985  
Epoch: [2][300/2908] Elapsed 8m 26s (remain 73m 7s) Loss: 0.4490(0.4649) Grad: 0.3877 LR: 0.000985  
Epoch: [2][400/2908] Elapsed 11m 14s (remain 70m 14s) Loss: 0.4588(0.4644) Grad: 0.1666 LR: 0.000985  
Epoch: [2][500/2908] Elapsed 14m 1s (remain 67m 23s) Loss: 0.4525(0.4642) Grad: 0.2712 LR: 0.000985  
Epoch: [2][600/2908] Elapsed 16m 49s (remain 64m 33s) Loss: 0.4603(0.4638) Grad: 0.1757 LR: 0.000985  
Epoch: [2][700/2908] Elapsed 19m 36s (remain 61m 44s) Loss: 0.4456(0.4640) Grad: 0.3363 LR: 0.000985  
Epoch: [2][800/2908] Elapsed 22m 24s (remain 58m 55s) Loss: 0.4527(0.4638) Grad: 0.2447 LR: 0.000985  
Epoch: [2][900/2908] Elapsed 25m 11s (remain 56m 7s) Loss: 0.4637(0.4635) Grad: 0.

Epoch 2 - avg_train_loss: 0.4611  avg_val_loss: 0.4590  time: 5035s
Epoch 2 - Accuracy: 0.8006519578074414
Epoch 2 - Save Best Score: 0.8007 Model


Epoch: [3][0/2908] Elapsed 0m 3s (remain 180m 57s) Loss: 0.4742(0.4742) Grad: 0.1856 LR: 0.000940  
Epoch: [3][100/2908] Elapsed 2m 51s (remain 79m 31s) Loss: 0.4521(0.4541) Grad: 0.2946 LR: 0.000940  
Epoch: [3][200/2908] Elapsed 5m 39s (remain 76m 8s) Loss: 0.4646(0.4537) Grad: 0.1640 LR: 0.000940  
Epoch: [3][300/2908] Elapsed 8m 26s (remain 73m 9s) Loss: 0.4371(0.4536) Grad: 0.1947 LR: 0.000940  
Epoch: [3][400/2908] Elapsed 11m 14s (remain 70m 15s) Loss: 0.4567(0.4535) Grad: 0.2017 LR: 0.000940  
Epoch: [3][500/2908] Elapsed 14m 1s (remain 67m 24s) Loss: 0.4401(0.4536) Grad: 0.1618 LR: 0.000940  
Epoch: [3][600/2908] Elapsed 16m 49s (remain 64m 34s) Loss: 0.4565(0.4537) Grad: 0.2678 LR: 0.000940  
Epoch: [3][700/2908] Elapsed 19m 36s (remain 61m 45s) Loss: 0.4492(0.4536) Grad: 0.2726 LR: 0.000940  
Epoch: [3][800/2908] Elapsed 22m 24s (remain 58m 56s) Loss: 0.4713(0.4536) Grad: 0.2350 LR: 0.000940  
Epoch: [3][900/2908] Elapsed 25m 12s (remain 56m 8s) Loss: 0.4210(0.4536) Grad: 0.

Epoch 3 - avg_train_loss: 0.4528  avg_val_loss: 0.4554  time: 5037s
Epoch 3 - Accuracy: 0.8015744848740374
Epoch 3 - Save Best Score: 0.8016 Model


Epoch: [4][0/2908] Elapsed 0m 3s (remain 177m 30s) Loss: 0.4517(0.4517) Grad: 0.1358 LR: 0.000868  
Epoch: [4][100/2908] Elapsed 2m 51s (remain 79m 19s) Loss: 0.4472(0.4442) Grad: 0.1345 LR: 0.000868  
Epoch: [4][200/2908] Elapsed 5m 38s (remain 76m 2s) Loss: 0.4298(0.4468) Grad: 0.1686 LR: 0.000868  
Epoch: [4][300/2908] Elapsed 8m 26s (remain 73m 5s) Loss: 0.4448(0.4472) Grad: 0.1939 LR: 0.000868  
Epoch: [4][400/2908] Elapsed 11m 13s (remain 70m 13s) Loss: 0.4487(0.4470) Grad: 0.1702 LR: 0.000868  
Epoch: [4][500/2908] Elapsed 14m 1s (remain 67m 22s) Loss: 0.4374(0.4468) Grad: 0.3306 LR: 0.000868  
Epoch: [4][600/2908] Elapsed 16m 48s (remain 64m 33s) Loss: 0.4467(0.4467) Grad: 0.2947 LR: 0.000868  
Epoch: [4][700/2908] Elapsed 19m 36s (remain 61m 44s) Loss: 0.4545(0.4466) Grad: 0.1358 LR: 0.000868  
Epoch: [4][800/2908] Elapsed 22m 24s (remain 58m 56s) Loss: 0.4576(0.4466) Grad: 0.1334 LR: 0.000868  
Epoch: [4][900/2908] Elapsed 25m 11s (remain 56m 7s) Loss: 0.4491(0.4470) Grad: 0.

Epoch 4 - avg_train_loss: 0.4469  avg_val_loss: 0.4532  time: 5036s
Epoch 4 - Accuracy: 0.8032377412205207
Epoch 4 - Save Best Score: 0.8032 Model


Epoch: [5][0/2908] Elapsed 0m 3s (remain 175m 45s) Loss: 0.4438(0.4438) Grad: 0.1522 LR: 0.000775  
Epoch: [5][100/2908] Elapsed 2m 51s (remain 79m 19s) Loss: 0.4329(0.4394) Grad: 0.2363 LR: 0.000775  
Epoch: [5][200/2908] Elapsed 5m 38s (remain 76m 3s) Loss: 0.4320(0.4402) Grad: 0.1275 LR: 0.000775  
Epoch: [5][300/2908] Elapsed 8m 26s (remain 73m 6s) Loss: 0.4455(0.4407) Grad: 0.2512 LR: 0.000775  
Epoch: [5][400/2908] Elapsed 11m 14s (remain 70m 13s) Loss: 0.4688(0.4409) Grad: 0.1064 LR: 0.000775  
Epoch: [5][500/2908] Elapsed 14m 1s (remain 67m 23s) Loss: 0.4470(0.4414) Grad: 0.1684 LR: 0.000775  
Epoch: [5][600/2908] Elapsed 16m 49s (remain 64m 33s) Loss: 0.4422(0.4416) Grad: 0.1316 LR: 0.000775  
Epoch: [5][700/2908] Elapsed 19m 36s (remain 61m 45s) Loss: 0.4667(0.4415) Grad: 0.1505 LR: 0.000775  
Epoch: [5][800/2908] Elapsed 22m 24s (remain 58m 56s) Loss: 0.4385(0.4414) Grad: 0.1258 LR: 0.000775  
Epoch: [5][900/2908] Elapsed 25m 11s (remain 56m 7s) Loss: 0.4517(0.4414) Grad: 0.

Epoch 5 - avg_train_loss: 0.4417  avg_val_loss: 0.4511  time: 5036s
Epoch 5 - Accuracy: 0.8044745463758418
Epoch 5 - Save Best Score: 0.8045 Model


Epoch: [6][0/2908] Elapsed 0m 3s (remain 177m 10s) Loss: 0.4252(0.4252) Grad: 0.2099 LR: 0.000666  
Epoch: [6][100/2908] Elapsed 2m 51s (remain 79m 20s) Loss: 0.4247(0.4373) Grad: 0.1841 LR: 0.000666  
Epoch: [6][200/2908] Elapsed 5m 38s (remain 76m 4s) Loss: 0.4326(0.4373) Grad: 0.1102 LR: 0.000666  
Epoch: [6][300/2908] Elapsed 8m 26s (remain 73m 6s) Loss: 0.4287(0.4375) Grad: 0.1602 LR: 0.000666  
Epoch: [6][400/2908] Elapsed 11m 14s (remain 70m 14s) Loss: 0.4405(0.4370) Grad: 0.1265 LR: 0.000666  
Epoch: [6][500/2908] Elapsed 14m 1s (remain 67m 23s) Loss: 0.4375(0.4365) Grad: 0.1981 LR: 0.000666  
Epoch: [6][600/2908] Elapsed 16m 49s (remain 64m 34s) Loss: 0.4368(0.4365) Grad: 0.1952 LR: 0.000666  
Epoch: [6][700/2908] Elapsed 19m 37s (remain 61m 45s) Loss: 0.4227(0.4369) Grad: 0.1789 LR: 0.000666  
Epoch: [6][800/2908] Elapsed 22m 24s (remain 58m 57s) Loss: 0.4282(0.4368) Grad: 0.1428 LR: 0.000666  
Epoch: [6][900/2908] Elapsed 25m 12s (remain 56m 8s) Loss: 0.4413(0.4369) Grad: 0.

Epoch 6 - avg_train_loss: 0.4368  avg_val_loss: 0.4504  time: 5036s
Epoch 6 - Accuracy: 0.8044822824518719
Epoch 6 - Save Best Score: 0.8045 Model


Epoch: [7][0/2908] Elapsed 0m 3s (remain 185m 20s) Loss: 0.4205(0.4205) Grad: 0.1158 LR: 0.000550  
Epoch: [7][100/2908] Elapsed 2m 51s (remain 79m 27s) Loss: 0.4457(0.4317) Grad: 0.1216 LR: 0.000550  
Epoch: [7][200/2908] Elapsed 5m 39s (remain 76m 6s) Loss: 0.4363(0.4310) Grad: 0.1287 LR: 0.000550  
Epoch: [7][300/2908] Elapsed 8m 26s (remain 73m 8s) Loss: 0.4324(0.4303) Grad: 0.1392 LR: 0.000550  
Epoch: [7][400/2908] Elapsed 11m 14s (remain 70m 14s) Loss: 0.4182(0.4300) Grad: 0.1497 LR: 0.000550  
Epoch: [7][500/2908] Elapsed 14m 1s (remain 67m 23s) Loss: 0.4298(0.4300) Grad: 0.1406 LR: 0.000550  
Epoch: [7][600/2908] Elapsed 16m 49s (remain 64m 34s) Loss: 0.4401(0.4302) Grad: 0.1806 LR: 0.000550  
Epoch: [7][700/2908] Elapsed 19m 36s (remain 61m 45s) Loss: 0.4407(0.4302) Grad: 0.1456 LR: 0.000550  
Epoch: [7][800/2908] Elapsed 22m 24s (remain 58m 56s) Loss: 0.4233(0.4303) Grad: 0.1690 LR: 0.000550  
Epoch: [7][900/2908] Elapsed 25m 12s (remain 56m 8s) Loss: 0.4166(0.4302) Grad: 0.

Epoch 7 - avg_train_loss: 0.4318  avg_val_loss: 0.4515  time: 5035s
Epoch 7 - Accuracy: 0.8043710763589385


Epoch: [8][0/2908] Elapsed 0m 3s (remain 177m 30s) Loss: 0.4107(0.4107) Grad: 0.1389 LR: 0.000434  
Epoch: [8][100/2908] Elapsed 2m 51s (remain 79m 21s) Loss: 0.4215(0.4250) Grad: 0.1251 LR: 0.000434  
Epoch: [8][200/2908] Elapsed 5m 38s (remain 76m 2s) Loss: 0.4305(0.4239) Grad: 0.1656 LR: 0.000434  
Epoch: [8][300/2908] Elapsed 8m 26s (remain 73m 5s) Loss: 0.4321(0.4241) Grad: 0.1428 LR: 0.000434  
Epoch: [8][400/2908] Elapsed 11m 13s (remain 70m 12s) Loss: 0.4214(0.4246) Grad: 0.1572 LR: 0.000434  
Epoch: [8][500/2908] Elapsed 14m 1s (remain 67m 23s) Loss: 0.4177(0.4249) Grad: 0.2735 LR: 0.000434  
Epoch: [8][600/2908] Elapsed 16m 49s (remain 64m 33s) Loss: 0.4443(0.4251) Grad: 0.1975 LR: 0.000434  
Epoch: [8][700/2908] Elapsed 19m 36s (remain 61m 44s) Loss: 0.4109(0.4252) Grad: 0.1813 LR: 0.000434  
Epoch: [8][800/2908] Elapsed 22m 24s (remain 58m 56s) Loss: 0.4329(0.4252) Grad: 0.1371 LR: 0.000434  
Epoch: [8][900/2908] Elapsed 25m 11s (remain 56m 7s) Loss: 0.4244(0.4255) Grad: 0.

Epoch 8 - avg_train_loss: 0.4267  avg_val_loss: 0.4504  time: 5035s
Epoch 8 - Accuracy: 0.8047607811889576
Epoch 8 - Save Best Score: 0.8048 Model


Epoch: [9][0/2908] Elapsed 0m 3s (remain 179m 42s) Loss: 0.4220(0.4220) Grad: 0.1183 LR: 0.000325  
Epoch: [9][100/2908] Elapsed 2m 51s (remain 79m 20s) Loss: 0.4129(0.4206) Grad: 0.1548 LR: 0.000325  
Epoch: [9][200/2908] Elapsed 5m 38s (remain 76m 3s) Loss: 0.4434(0.4192) Grad: 0.1990 LR: 0.000325  
Epoch: [9][300/2908] Elapsed 8m 26s (remain 73m 5s) Loss: 0.4229(0.4193) Grad: 0.1592 LR: 0.000325  
Epoch: [9][400/2908] Elapsed 11m 14s (remain 70m 14s) Loss: 0.3963(0.4194) Grad: 0.1371 LR: 0.000325  
Epoch: [9][500/2908] Elapsed 14m 1s (remain 67m 23s) Loss: 0.4304(0.4194) Grad: 0.1580 LR: 0.000325  
Epoch: [9][600/2908] Elapsed 16m 49s (remain 64m 33s) Loss: 0.4208(0.4195) Grad: 0.2049 LR: 0.000325  
Epoch: [9][700/2908] Elapsed 19m 36s (remain 61m 44s) Loss: 0.4201(0.4198) Grad: 0.1196 LR: 0.000325  
Epoch: [9][800/2908] Elapsed 22m 24s (remain 58m 56s) Loss: 0.4256(0.4198) Grad: 0.2152 LR: 0.000325  
Epoch: [9][900/2908] Elapsed 25m 11s (remain 56m 7s) Loss: 0.4139(0.4200) Grad: 0.

Epoch 9 - avg_train_loss: 0.4214  avg_val_loss: 0.4517  time: 5036s
Epoch 9 - Accuracy: 0.8046234658394222


Epoch: [10][0/2908] Elapsed 0m 3s (remain 176m 36s) Loss: 0.4058(0.4058) Grad: 0.1673 LR: 0.000232  
Epoch: [10][100/2908] Elapsed 2m 51s (remain 79m 17s) Loss: 0.4145(0.4165) Grad: 0.1871 LR: 0.000232  
Epoch: [10][200/2908] Elapsed 5m 38s (remain 76m 1s) Loss: 0.4232(0.4162) Grad: 0.1911 LR: 0.000232  
Epoch: [10][300/2908] Elapsed 8m 26s (remain 73m 4s) Loss: 0.4065(0.4155) Grad: 0.1481 LR: 0.000232  
Epoch: [10][400/2908] Elapsed 11m 13s (remain 70m 13s) Loss: 0.4170(0.4155) Grad: 0.1714 LR: 0.000232  
Epoch: [10][500/2908] Elapsed 14m 1s (remain 67m 22s) Loss: 0.4256(0.4156) Grad: 0.2005 LR: 0.000232  
Epoch: [10][600/2908] Elapsed 16m 48s (remain 64m 33s) Loss: 0.4065(0.4155) Grad: 0.1643 LR: 0.000232  
Epoch: [10][700/2908] Elapsed 19m 36s (remain 61m 44s) Loss: 0.4106(0.4154) Grad: 0.1070 LR: 0.000232  
Epoch: [10][800/2908] Elapsed 22m 24s (remain 58m 55s) Loss: 0.4288(0.4155) Grad: 0.1455 LR: 0.000232  
Epoch: [10][900/2908] Elapsed 25m 11s (remain 56m 7s) Loss: 0.3938(0.4154

Epoch 10 - avg_train_loss: 0.4165  avg_val_loss: 0.4533  time: 5035s
Epoch 10 - Accuracy: 0.8040790394888001


Epoch: [11][0/2908] Elapsed 0m 3s (remain 177m 44s) Loss: 0.4025(0.4025) Grad: 0.1382 LR: 0.000160  
Epoch: [11][100/2908] Elapsed 2m 51s (remain 79m 18s) Loss: 0.4450(0.4099) Grad: 0.1577 LR: 0.000160  
Epoch: [11][200/2908] Elapsed 5m 38s (remain 76m 2s) Loss: 0.4104(0.4103) Grad: 0.1339 LR: 0.000160  
Epoch: [11][300/2908] Elapsed 8m 26s (remain 73m 7s) Loss: 0.4130(0.4100) Grad: 0.1592 LR: 0.000160  
Epoch: [11][400/2908] Elapsed 11m 14s (remain 70m 14s) Loss: 0.3907(0.4103) Grad: 0.1857 LR: 0.000160  
Epoch: [11][500/2908] Elapsed 14m 1s (remain 67m 23s) Loss: 0.4076(0.4104) Grad: 0.1405 LR: 0.000160  
Epoch: [11][600/2908] Elapsed 16m 49s (remain 64m 33s) Loss: 0.4148(0.4105) Grad: 0.1446 LR: 0.000160  
Epoch: [11][700/2908] Elapsed 19m 36s (remain 61m 44s) Loss: 0.4196(0.4105) Grad: 0.1503 LR: 0.000160  
Epoch: [11][800/2908] Elapsed 22m 24s (remain 58m 56s) Loss: 0.4081(0.4107) Grad: 0.1743 LR: 0.000160  
Epoch: [11][900/2908] Elapsed 25m 11s (remain 56m 7s) Loss: 0.4099(0.4106

Epoch 11 - avg_train_loss: 0.4122  avg_val_loss: 0.4553  time: 5036s
Epoch 11 - Accuracy: 0.8036255120315322


Epoch: [12][0/2908] Elapsed 0m 3s (remain 182m 40s) Loss: 0.4069(0.4069) Grad: 0.1325 LR: 0.000115  
Epoch: [12][100/2908] Elapsed 2m 51s (remain 79m 23s) Loss: 0.4141(0.4074) Grad: 0.1412 LR: 0.000115  
Epoch: [12][200/2908] Elapsed 5m 39s (remain 76m 8s) Loss: 0.4091(0.4075) Grad: 0.1473 LR: 0.000115  
Epoch: [12][300/2908] Elapsed 8m 26s (remain 73m 9s) Loss: 0.4302(0.4075) Grad: 0.1593 LR: 0.000115  
Epoch: [12][400/2908] Elapsed 11m 14s (remain 70m 15s) Loss: 0.3893(0.4073) Grad: 0.1588 LR: 0.000115  
Epoch: [12][500/2908] Elapsed 14m 1s (remain 67m 24s) Loss: 0.4144(0.4074) Grad: 0.1530 LR: 0.000115  
Epoch: [12][600/2908] Elapsed 16m 49s (remain 64m 34s) Loss: 0.4159(0.4075) Grad: 0.1591 LR: 0.000115  
Epoch: [12][700/2908] Elapsed 19m 37s (remain 61m 45s) Loss: 0.3877(0.4077) Grad: 0.1866 LR: 0.000115  
Epoch: [12][800/2908] Elapsed 22m 24s (remain 58m 56s) Loss: 0.4124(0.4077) Grad: 0.1622 LR: 0.000115  
Epoch: [12][900/2908] Elapsed 25m 12s (remain 56m 8s) Loss: 0.4063(0.4078