# About this notebook ...

## Library

In [1]:
import glob
import json
import math
import os
import random
import time
import warnings
from contextlib import contextmanager

import numpy as np
import optuna
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedKFold
from torch.optim import SGD, Adam
from torch.optim.lr_scheduler import CosineAnnealingLR, CosineAnnealingWarmRestarts, ReduceLROnPlateau
from torch.utils.data import DataLoader, Dataset
from tqdm.notebook import tqdm

In [2]:
warnings.filterwarnings("ignore")

## Load Data

In [3]:
BASE_DIR = "../input/hungrygeeseepisode/hungry-geese-episode/"
OUTPUT_DIR = "pre-models/"

if not os.path.exists(OUTPUT_DIR):
    os.makedirs(OUTPUT_DIR)

In [4]:
paths = [path for path in glob.glob(BASE_DIR + "*.json") if "info" not in path]
print(len(paths))

16521


## Config

In [5]:
class Config:
    seed = 440

    n_class = 4
    n_fold = 5

    geese_net_layers = 12
    geese_net_filters = 64

    gradient_accumulation_steps = 1
    max_grad_norm = 1000

    num_workers = 4
    batch_size = 3200

    scheduler = "CosineAnnealingWarmRestarts"
    # factor = 0.2  # ReduceLROnPlateau
    # patience = 4  # ReduceLROnPlateau
    # eps = 1e-6  # ReduceLROnPlateau
    # T_max = 10  # CosineAnnealingLR
    T_0 = 10  # CosineAnnealingWarmRestarts

    criterion = "CrossEntropyLoss"
    lr = 1e-3
    min_lr = 1e-4
    weight_decay = 0

    epochs = 10
    model_name = "geese_net"

    print_freq = 100

    train = False
    tuning = True
    debug = False
    apex = False

In [6]:
if Config.tuning:
    Config.epochs = 2

if Config.debug:
    Config.epochs = 1

In [7]:
if Config.apex:
    from apex import amp

In [8]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

## Utils

In [9]:
@contextmanager
def timer(name):
    t0 = time.time()
    LOGGER.info(f"[{name}] start")
    yield
    LOGGER.info(f"[{name}] done in {time.time() - t0:.0f} s.")


def init_logger(log_file=OUTPUT_DIR + "train.log"):
    from logging import INFO, FileHandler, Formatter, StreamHandler, getLogger

    logger = getLogger(__name__)
    logger.setLevel(INFO)
    handler1 = StreamHandler()
    handler1.setFormatter(Formatter("%(message)s"))
    handler2 = FileHandler(filename=log_file)
    handler2.setFormatter(Formatter("%(message)s"))
    logger.addHandler(handler1)
    logger.addHandler(handler2)
    return logger


LOGGER = init_logger()


def seed_torch(seed=42):
    random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True


seed_torch(seed=Config.seed)

In [10]:
def reverse_ns(y):
    if y == 0:
        return 1
    if y == 1:
        return 0
    return y


def reverse_we(y):
    if y == 2:
        return 3
    if y == 3:
        return 2
    return y


def reverse_nswe(y):
    return reverse_ns(reverse_we(y))

## Observation

In [11]:
def make_input(obses):
    b = np.zeros((17, 7 * 11), dtype=np.float32)
    obs = obses[-1]

    for p, pos_list in enumerate(obs["geese"]):
        pid = (p - obs["index"]) % 4

        # head position
        for pos in pos_list[:1]:
            b[0 + pid, pos] = 1
        # tip position
        for pos in pos_list[-1:]:
            b[4 + pid, pos] = 1
        # whole position
        for pos in pos_list:
            b[8 + pid, pos] = 1

    # previous head position
    if len(obses) > 1:
        obs_prev = obses[-2]
        for p, pos_list in enumerate(obs_prev["geese"]):
            for pos in pos_list[:1]:
                b[12 + (p - obs["index"]) % 4, pos] = 1

    # food
    for pos in obs["food"]:
        b[16, pos] = 1

    return b.reshape(-1, 7, 11)

In [12]:
def observation_num_step(obses):
    b = np.zeros((7, 11), dtype=np.float32)
    obs = obses[-1]

    b[0, 0] = obs["step"]  # 0-198

    return b.reshape(1, 7, 11)

## Data

In [13]:
def create_dataset_from_json(filepath, json_object=None, standing=0):
    if json_object is None:
        json_open = open(path, "r")
        json_load = json.load(json_open)
    else:
        json_load = json_object

    try:
        winner_index = np.argmax(np.argsort(json_load["rewards"]) == 3 - standing)

        obses = []
        X = []
        y = []
        actions = {"NORTH": 0, "SOUTH": 1, "WEST": 2, "EAST": 3}

        for i in range(len(json_load["steps"]) - 1):
            if json_load["steps"][i][winner_index]["status"] == "ACTIVE":
                y_ = json_load["steps"][i + 1][winner_index]["action"]
                if y_ is not None:
                    step = json_load["steps"][i]
                    step[winner_index]["observation"]["geese"] = step[0]["observation"]["geese"]
                    step[winner_index]["observation"]["food"] = step[0]["observation"]["food"]
                    step[winner_index]["observation"]["step"] = step[0]["observation"]["step"]
                    obses.append(step[winner_index]["observation"])
                    y.append(actions[y_])

                    y.append(reverse_ns(actions[y_]))  # 上下反転
                    y.append(reverse_we(actions[y_]))  # 左右反転
                    y.append(reverse_nswe(actions[y_]))  # 上下左右反転

        for j in range(len(obses)):
            # X_ = make_input(obses[: j + 1])

            X_ = []
            X_.append(make_input(obses[: j + 1]))
            # X_.append(observation_num_step(obses[: j + 1]))
            X_ = np.concatenate(X_)

            X.append(X_)

            X.append(X_[:, ::-1, :])  # 上下反転
            X.append(X_[:, :, ::-1])  # 左右反転
            X.append(X_[:, ::-1, ::-1])  # 上下左右反転

        X = np.array(X, dtype=np.uint8)  # [starting_step:]
        y = np.array(y, dtype=np.uint8)  # [starting_step:]

        return X, y
    except:
        return 0, 0

In [14]:
X_train = []
y_train = []

for path in tqdm(paths[: int(len(paths))]):
    X, y = create_dataset_from_json(path, standing=0)  # use only winners' moves
    if X is not 0:
        X_train.append(X)
        y_train.append(y)

X_train = np.concatenate(X_train)
y_train = np.concatenate(y_train)

print(f"Num episode: {len(X_train)}")

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=16521.0), HTML(value='')))


Num episode: 10343040


In [15]:
# %%time

# X_train, unique_index = np.unique(X_train, axis=0, return_index=True)  # remove duplicate
# y_train = y_train[unique_index]

# y_train = np.eye(4, dtype="uint8")[y_train]  # to categorical

# print(f"Num episode: {len(X_train)}")

In [16]:
X_train_sum_obs = X_train.reshape(X_train.shape[0], -1).sum(1)
X_train_group = np.unique(X_train_sum_obs)
X_train_group.shape

(75,)

In [17]:
X_train_unique = []
y_train_unique = []
for group in tqdm(X_train_group):
    group_index = np.where(X_train_sum_obs == group)

    X_train_ = X_train[group_index]
    y_train_ = y_train[group_index]

    X_train_, unique_index = np.unique(X_train_, axis=0, return_index=True)  # remove duplicate
    y_train_ = y_train_[unique_index]

    X_train_unique.append(X_train_)
    y_train_unique.append(y_train_)

X_train = np.concatenate(X_train_unique)
y_train = np.concatenate(y_train_unique)

print(f"Num episode: {len(X_train)}")

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=75.0), HTML(value='')))


Num episode: 10341154


In [18]:
del X_train_sum_obs
del X_train_group
del X_train_unique
del y_train_unique
del X_train_
del y_train_
del group_index
del unique_index

In [19]:
X_train = X_train.astype(np.float32)
X_train.dtype

dtype('float32')

In [20]:
if Config.debug:
    X_train = X_train[:1000]
    y_train = y_train[:1000]

In [21]:
y_df = pd.DataFrame(y_train, dtype=np.uint8)
y_df.columns = ["action"]
y_df

Unnamed: 0,action
0,2
1,3
2,1
3,2
4,0
...,...
10341149,3
10341150,3
10341151,2
10341152,2


## CV Split

In [22]:
folds = y_df.copy()
Fold = StratifiedKFold(n_splits=Config.n_fold, shuffle=True, random_state=Config.seed)
for n, (train_index, val_index) in enumerate(Fold.split(folds, folds["action"])):
    folds.loc[val_index, "fold"] = int(n)
folds["fold"] = folds["fold"].astype(np.uint8)
print(folds.groupby(["fold", "action"]).size())

fold  action
0     0         492684
      1         492684
      2         541432
      3         541431
1     0         492684
      1         492684
      2         541432
      3         541431
2     0         492684
      1         492684
      2         541432
      3         541431
3     0         492684
      1         492684
      2         541431
      3         541432
4     0         492684
      1         492684
      2         541431
      3         541431
dtype: int64


## Dataset

In [23]:
class TrainDataset(Dataset):
    def __init__(self, array, label):
        self.array = array
        self.label = label

    def __len__(self):
        return self.array.shape[0]

    def __getitem__(self, idx):
        return self.array[idx], torch.tensor(self.label[idx]).long()


class TestDataset(Dataset):
    def __init__(self, array):
        self.array = array

    def __len__(self):
        return self.array.shape[0]

    def __getitem__(self, idx):
        return self.array[idx]

In [24]:
# Test

if Config.debug or False:
    train_ds = TrainDataset(X_train, y_train)

    for i in range(1):
        obs, action = train_ds[i]
        print(obs.shape, action)

## Model

In [25]:
class TorusConv2d(nn.Module):
    def __init__(self, input_dim, output_dim, kernel_size, bn):
        super().__init__()
        self.edge_size = (kernel_size[0] // 2, kernel_size[1] // 2)
        self.conv = nn.Conv2d(input_dim, output_dim, kernel_size=kernel_size)
        self.bn = nn.BatchNorm2d(output_dim) if bn else None

    def forward(self, x):
        h = torch.cat([x[:, :, :, -self.edge_size[1] :], x, x[:, :, :, : self.edge_size[1]]], dim=3)
        h = torch.cat([h[:, :, -self.edge_size[0] :], h, h[:, :, : self.edge_size[0]]], dim=2)
        h = self.conv(h)
        h = self.bn(h) if self.bn is not None else h
        return h

In [26]:
class GeeseNet(nn.Module):
    def __init__(self):
        super().__init__()
        layers, filters = 12, 32
        self.conv0 = TorusConv2d(17, filters, (3, 3), True)
        self.blocks = nn.ModuleList([TorusConv2d(filters, filters, (3, 3), True) for _ in range(layers)])

        self.conv_p = TorusConv2d(filters, filters, (3, 3), True)
        self.conv_v = TorusConv2d(filters, filters, (3, 3), True)

        self.head_p = nn.Linear(filters, 4, bias=False)
        self.head_v1 = nn.Linear(filters * 2, filters, bias=False)
        self.head_v2 = nn.Linear(filters, 1, bias=False)

    def forward(self, x, _=None):
        h = F.relu_(self.conv0(x))
        for block in self.blocks:
            h = F.relu_(h + block(h))

        h_p = F.relu_(self.conv_p(h))
        h_head_p = (h_p * x[:, :1]).view(h_p.size(0), h_p.size(1), -1).sum(-1)
        p = self.head_p(h_head_p)

        h_v = F.relu_(self.conv_v(h))
        h_head_v = (h_v * x[:, :1]).view(h_v.size(0), h_v.size(1), -1).sum(-1)
        h_avg_v = h_v.view(h_v.size(0), h_v.size(1), -1).mean(-1)

        h_v = F.relu_(self.head_v1(torch.cat([h_head_v, h_avg_v], 1)))
        v = torch.tanh(self.head_v2(h_v))

        return {"policy": p, "value": v}

In [27]:
class GeeseNetAlpha(nn.Module):
    def __init__(self):
        super().__init__()

        layers = Config.geese_net_layers
        filters = Config.geese_net_filters

        self.conv0 = TorusConv2d(17, filters, (3, 3), True)
        self.blocks = nn.ModuleList([TorusConv2d(filters, filters, (3, 3), True) for _ in range(layers)])

        self.conv_p = TorusConv2d(filters, filters, (3, 3), True)
        self.conv_v = TorusConv2d(filters, filters, (3, 3), True)

        self.head_p1 = nn.Linear(filters * 5 + 77, filters * 3, bias=False)
        self.head_p2 = nn.Linear(filters * 3, 4, bias=False)
        self.head_v1 = nn.Linear(filters * 5 + 77, filters * 3, bias=False)
        self.head_v2 = nn.Linear(filters * 3, 1, bias=False)

    def forward(self, x, _=None):
        h = F.relu_(self.conv0(x))
        for block in self.blocks:
            h = F.relu_(h + block(h))

        h_p = F.relu_(self.conv_p(h))
        h_head_p = (h_p * x[:, :1]).view(h_p.size(0), h_p.size(1), -1).sum(-1)
        h_head_p2 = (h_p * x[:, 1:2]).view(h_p.size(0), h_p.size(1), -1).sum(-1)
        h_head_p3 = (h_p * x[:, 2:3]).view(h_p.size(0), h_p.size(1), -1).sum(-1)
        h_head_p4 = (h_p * x[:, 3:4]).view(h_p.size(0), h_p.size(1), -1).sum(-1)
        h_avg_p1 = h_p.view(h_p.size(0), h_p.size(1), -1).mean(-1)
        h_avg_p2 = h_p.view(h_p.size(0), h_p.size(1), -1).mean(1)

        h_p = F.relu_(self.head_p1(torch.cat([h_head_p, h_head_p2, h_head_p3, h_head_p4, h_avg_p1, h_avg_p2], 1)))
        p = self.head_p2(h_p)

        h_v = F.relu_(self.conv_v(h))
        h_head_v = (h_v * x[:, :1]).view(h_v.size(0), h_v.size(1), -1).sum(-1)
        h_head_v2 = (h_v * x[:, 1:2]).view(h_v.size(0), h_v.size(1), -1).sum(-1)
        h_head_v3 = (h_v * x[:, 2:3]).view(h_v.size(0), h_v.size(1), -1).sum(-1)
        h_head_v4 = (h_v * x[:, 3:4]).view(h_v.size(0), h_v.size(1), -1).sum(-1)
        h_avg_v1 = h_v.view(h_v.size(0), h_v.size(1), -1).mean(-1)
        h_avg_v2 = h_v.view(h_v.size(0), h_v.size(1), -1).mean(1)

        h_v = F.relu_(self.head_v1(torch.cat([h_head_v, h_head_v2, h_head_v3, h_head_v4, h_avg_v1, h_avg_v2], 1)))
        v = torch.tanh(self.head_v2(h_v))

        return {"policy": p, "value": v}

In [28]:
# Test

if Config.debug or False:
    model = GeeseNetAlpha()
    # print(model)

    params = sum(p.numel() for p in model.parameters())
    print(f"params: {params:,}")

    train_ds = TrainDataset(X_train, y_train)
    train_loader = DataLoader(train_ds, batch_size=4, shuffle=True, num_workers=4, pin_memory=True, drop_last=True)

    for obs, action in train_loader:
        output = model(obs)
        print(output)
        print(f"{torch.argmax(output['policy'], dim=1)}")
        break

## Loss

## Scoring

In [29]:
def get_score(y_true, y_pred):
    return accuracy_score(y_true, y_pred)

In [30]:
def get_result(result_df):
    preds = result_df["preds"].values
    labels = result_df["action"].values
    score = get_score(labels, preds)
    LOGGER.info(f"Score: {score:<.5f}")
    return score

## Helper functions

In [31]:
class AverageMeter(object):
    """Computes and stores the average and current value"""

    def __init__(self):
        self.reset()

    def reset(self):
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count


def asMinutes(s):
    m = math.floor(s / 60)
    s -= m * 60
    return "%dm %ds" % (m, s)


def timeSince(since, percent):
    now = time.time()
    s = now - since
    es = s / (percent)
    rs = es - s
    return "%s (remain %s)" % (asMinutes(s), asMinutes(rs))

In [32]:
def train_fn(train_loader, model, criterion, optimizer, epoch, scheduler, device):
    batch_time = AverageMeter()
    data_time = AverageMeter()
    losses = AverageMeter()
    scores = AverageMeter()

    # switch to train mode
    model.train()
    start = end = time.time()
    global_step = 0

    for step, (obs, action) in enumerate(train_loader):
        # measure data loading time
        data_time.update(time.time() - end)
        obs = obs.to(device)
        action = action.to(device)
        batch_size = action.size(0)

        y_preds = model(obs.float())["policy"]

        loss = criterion(y_preds, action)

        # record loss
        losses.update(loss.item(), batch_size)
        if Config.gradient_accumulation_steps > 1:
            loss = loss / Config.gradient_accumulation_steps
        if Config.apex:
            with amp.scale_loss(loss, optimizer) as scaled_loss:
                scaled_loss.backward()
        else:
            loss.backward()

        grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), Config.max_grad_norm)

        if (step + 1) % Config.gradient_accumulation_steps == 0:
            optimizer.step()
            optimizer.zero_grad()
            global_step += 1

        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()

        if step % Config.print_freq == 0 or step == (len(train_loader) - 1):
            print(
                f"Epoch: [{epoch + 1}][{step}/{len(train_loader)}] "
                f"Elapsed {timeSince(start, float(step + 1) / len(train_loader)):s} "
                f"Loss: {losses.val:.4f}({losses.avg:.4f}) "
                f"Grad: {grad_norm:.4f} "
                f"LR: {scheduler.get_last_lr()[0]:.6f}  "
            )

    return losses.avg

In [33]:
def valid_fn(valid_loader, model, criterion, device):
    batch_time = AverageMeter()
    data_time = AverageMeter()
    losses = AverageMeter()
    scores = AverageMeter()

    # switch to evaluation mode
    model.eval()
    preds = []
    start = end = time.time()

    for step, (obs, action) in enumerate(valid_loader):
        # measure data loading time
        data_time.update(time.time() - end)
        obs = obs.to(device)
        action = action.to(device)
        batch_size = action.size(0)

        # compute loss
        with torch.no_grad():
            y_preds = model(obs)["policy"]

        loss = criterion(y_preds, action)
        losses.update(loss.item(), batch_size)

        # record accuracy
        preds.append(y_preds.softmax(1).to("cpu").numpy())
        if Config.gradient_accumulation_steps > 1:
            loss = loss / Config.gradient_accumulation_steps

        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()

        if step % Config.print_freq == 0 or step == (len(valid_loader) - 1):
            print(
                f"EVAL: [{step}/{len(valid_loader)}] "
                f"Elapsed {timeSince(start, float(step + 1) / len(valid_loader)):s} "
                f"Loss: {losses.val:.4f}({losses.avg:.4f}) "
            )
    predictions = np.concatenate(preds)
    return losses.avg, predictions

## Train loop

In [34]:
def train_loop(folds, fold):

    LOGGER.info(f"========== fold: {fold} training ==========")

    # ====================================================
    # Data Loader
    # ====================================================
    # X_train_folds = X_train[folds["fold"] != fold]
    # X_valid_folds = X_train[folds["fold"] == fold]

    # y_train_folds = y_train[folds["fold"] != fold]
    y_valid_folds = y_train[folds["fold"] == fold]

    # y_df_train_folds = y_df[folds["fold"] != fold]
    y_df_valid_folds = y_df[folds["fold"] == fold]

    # train_dataset = TrainDataset(X_train[folds["fold"] != fold], y_train[folds["fold"] != fold])
    # valid_dataset = TrainDataset(X_train[folds["fold"] == fold], y_valid_folds)

    train_loader = DataLoader(
        TrainDataset(X_train[folds["fold"] != fold], y_train[folds["fold"] != fold]),
        batch_size=Config.batch_size,
        shuffle=True,
        num_workers=Config.num_workers,
        pin_memory=True,
        drop_last=True,
    )
    valid_loader = DataLoader(
        TrainDataset(X_train[folds["fold"] == fold], y_valid_folds),
        batch_size=Config.batch_size,
        shuffle=False,
        num_workers=Config.num_workers,
        pin_memory=True,
        drop_last=False,
    )

    # ====================================================
    # Scheduler
    # ====================================================
    def get_scheduler(optimizer):
        if Config.scheduler == "ReduceLROnPlateau":
            scheduler = ReduceLROnPlateau(
                optimizer, mode="min", factor=Config.factor, patience=Config.patience, verbose=True, eps=Config.eps
            )
        elif Config.scheduler == "CosineAnnealingLR":
            scheduler = CosineAnnealingLR(optimizer, T_max=Config.T_max, eta_min=Config.min_lr, last_epoch=-1)
        elif Config.scheduler == "CosineAnnealingWarmRestarts":
            scheduler = CosineAnnealingWarmRestarts(
                optimizer, T_0=Config.T_0, T_mult=1, eta_min=Config.min_lr, last_epoch=-1
            )
        return scheduler

    # ====================================================
    # model & optimizer
    # ====================================================
    model = GeeseNetAlpha()

    # Disable training for value network
    for param in model.conv_v.parameters():
        param.requires_grad = False
    for param in model.head_v1.parameters():
        param.requires_grad = False
    for param in model.head_v2.parameters():
        param.requires_grad = False

    model.to(device)

    # Use multi GPU
    if device == torch.device("cuda") and not Config.apex:
        model = torch.nn.DataParallel(model)  # make parallel
        # torch.backends.cudnn.benchmark=True

    optimizer = Adam(model.parameters(), lr=Config.lr, weight_decay=Config.weight_decay, amsgrad=False)
    scheduler = get_scheduler(optimizer)

    # ====================================================
    # apex
    # ====================================================
    if Config.apex:
        model, optimizer = amp.initialize(model, optimizer, opt_level="O1", verbosity=0)

    # ====================================================
    # Criterion
    # ====================================================
    def get_criterion():
        if Config.criterion == "CrossEntropyLoss":
            criterion = nn.CrossEntropyLoss()
        return criterion

    criterion = get_criterion()

    # ====================================================
    # loop
    # ====================================================
    best_score = 0.0
    best_loss = np.inf
    best_preds = None

    for epoch in range(Config.epochs):

        start_time = time.time()

        # train
        avg_loss = train_fn(train_loader, model, criterion, optimizer, epoch, scheduler, device)

        # eval
        avg_val_loss, preds = valid_fn(valid_loader, model, criterion, device)

        if isinstance(scheduler, ReduceLROnPlateau):
            scheduler.step(avg_val_loss)
        elif isinstance(scheduler, CosineAnnealingLR):
            scheduler.step()
        elif isinstance(scheduler, CosineAnnealingWarmRestarts):
            scheduler.step()

        # scoring
        score = get_score(y_valid_folds, preds.argmax(1))

        elapsed = time.time() - start_time

        LOGGER.info(
            f"Epoch {epoch+1} - avg_train_loss: {avg_loss:.4f}  avg_val_loss: {avg_val_loss:.4f}  time: {elapsed:.0f}s"
        )
        LOGGER.info(f"Epoch {epoch+1} - Accuracy: {score}")

        if score > best_score:
            best_score = score
            LOGGER.info(f"Epoch {epoch+1} - Save Best Score: {best_score:.4f} Model")
            torch.save(model.module.state_dict(), OUTPUT_DIR + f"{Config.model_name}_fold{fold}_best.pth")
            best_preds = preds

        if epoch == Config.epochs - 1:
            LOGGER.info(f"Epoch {epoch+1} - Save final model")
            torch.save(model.module.state_dict(), OUTPUT_DIR + f"{Config.model_name}_fold{fold}_final.pth")

    if Config.train:
        y_df_valid_folds[[str(c) for c in range(Config.n_class)]] = best_preds
        y_df_valid_folds["preds"] = best_preds.argmax(1)

        return y_df_valid_folds

    if Config.tuning:
        score = get_score(y_df_valid_folds["action"].values, best_preds.argmax(1))
        return score

In [35]:
def objective(trial):
    Config.geese_net_layers = trial.suggest_int("layers", 6, 18)
    Config.geese_net_filters = trial.suggest_int("filters", 32, 128)

    score = train_loop(folds, 0)
    return score

## Main


In [36]:
def main():
    if Config.train:
        # train
        oof_df = pd.DataFrame()
        for fold in range(Config.n_fold):
            _oof_df = train_loop(folds, fold)
            oof_df = pd.concat([oof_df, _oof_df])
            LOGGER.info(f"========== fold: {fold} result ==========")
            get_result(_oof_df)
            break  # fold 1つだけ
        # CV result
        # LOGGER.info(f"========== CV ==========")
        # get_result(oof_df)
        # save result
        oof_df.to_csv(OUTPUT_DIR + "oof_df.csv", index=False)

    if Config.tuning:
        study = optuna.create_study(direction="maximize")
        study.optimize(objective, n_trials=10)

        trial = study.best_trial
        print("Best trial:")
        print("  Value: ", trial.value)
        print("  Params: ")
        for key, value in trial.params.items():
            print("    {}: {}".format(key, value))

In [None]:
if __name__ == "__main__":
    main()

[32m[I 2021-05-17 21:45:36,145][0m A new study created in memory with name: no-name-a2e6656d-2383-40bf-8855-aa11f6fc7d2c[0m


Epoch: [1][0/2585] Elapsed 0m 5s (remain 240m 13s) Loss: 1.4050(1.4050) Grad: 0.6862 LR: 0.001000  
Epoch: [1][100/2585] Elapsed 2m 8s (remain 52m 37s) Loss: 0.5839(0.6841) Grad: 0.6368 LR: 0.001000  
Epoch: [1][200/2585] Elapsed 4m 11s (remain 49m 40s) Loss: 0.5427(0.6258) Grad: 0.5462 LR: 0.001000  
Epoch: [1][300/2585] Elapsed 6m 14s (remain 47m 18s) Loss: 0.5437(0.5959) Grad: 0.7162 LR: 0.001000  
Epoch: [1][400/2585] Elapsed 8m 16s (remain 45m 5s) Loss: 0.5083(0.5765) Grad: 0.4606 LR: 0.001000  
Epoch: [1][500/2585] Elapsed 10m 19s (remain 42m 57s) Loss: 0.5013(0.5633) Grad: 0.3756 LR: 0.001000  
Epoch: [1][600/2585] Elapsed 12m 22s (remain 40m 50s) Loss: 0.4942(0.5535) Grad: 0.6405 LR: 0.001000  
Epoch: [1][700/2585] Elapsed 14m 25s (remain 38m 44s) Loss: 0.4819(0.5461) Grad: 0.3531 LR: 0.001000  
Epoch: [1][800/2585] Elapsed 16m 27s (remain 36m 40s) Loss: 0.5078(0.5398) Grad: 0.5177 LR: 0.001000  
Epoch: [1][900/2585] Elapsed 18m 30s (remain 34m 35s) Loss: 0.4903(0.5346) Grad: 0

Epoch 1 - avg_train_loss: 0.4983  avg_val_loss: 0.4750  time: 3414s
Epoch 1 - Accuracy: 0.7930057135784155
Epoch 1 - Save Best Score: 0.7930 Model


Epoch: [2][0/2585] Elapsed 0m 3s (remain 133m 19s) Loss: 0.4928(0.4928) Grad: 0.5023 LR: 0.000978  
Epoch: [2][100/2585] Elapsed 2m 5s (remain 51m 37s) Loss: 0.4748(0.4670) Grad: 0.5098 LR: 0.000978  
Epoch: [2][200/2585] Elapsed 4m 8s (remain 49m 10s) Loss: 0.4718(0.4673) Grad: 0.1784 LR: 0.000978  
Epoch: [2][300/2585] Elapsed 6m 11s (remain 46m 59s) Loss: 0.4725(0.4678) Grad: 0.2053 LR: 0.000978  
Epoch: [2][400/2585] Elapsed 8m 14s (remain 44m 52s) Loss: 0.4666(0.4668) Grad: 0.2734 LR: 0.000978  
Epoch: [2][500/2585] Elapsed 10m 17s (remain 42m 47s) Loss: 0.4633(0.4668) Grad: 0.3577 LR: 0.000978  
Epoch: [2][600/2585] Elapsed 12m 19s (remain 40m 42s) Loss: 0.4605(0.4665) Grad: 0.2601 LR: 0.000978  
Epoch: [2][700/2585] Elapsed 14m 22s (remain 38m 38s) Loss: 0.4514(0.4666) Grad: 0.2528 LR: 0.000978  
Epoch: [2][800/2585] Elapsed 16m 25s (remain 36m 34s) Loss: 0.4616(0.4664) Grad: 0.3776 LR: 0.000978  
Epoch: [2][900/2585] Elapsed 18m 28s (remain 34m 31s) Loss: 0.4902(0.4661) Grad: 0

Epoch 2 - avg_train_loss: 0.4641  avg_val_loss: 0.4628  time: 3411s
Epoch 2 - Accuracy: 0.7986245250167897
Epoch 2 - Save Best Score: 0.7986 Model
Epoch 2 - Save final model
[32m[I 2021-05-17 23:39:49,041][0m Trial 0 finished with value: 0.7986245250167897 and parameters: {'layers': 9, 'filters': 117}. Best is trial 0 with value: 0.7986245250167897.[0m


Epoch: [1][0/2585] Elapsed 0m 3s (remain 149m 13s) Loss: 1.3931(1.3931) Grad: 0.4987 LR: 0.001000  
Epoch: [1][100/2585] Elapsed 2m 49s (remain 69m 30s) Loss: 0.5742(0.6817) Grad: 0.7576 LR: 0.001000  
Epoch: [1][200/2585] Elapsed 5m 35s (remain 66m 21s) Loss: 0.5523(0.6208) Grad: 0.7124 LR: 0.001000  
Epoch: [1][300/2585] Elapsed 8m 21s (remain 63m 27s) Loss: 0.5320(0.5904) Grad: 0.5868 LR: 0.001000  
Epoch: [1][400/2585] Elapsed 11m 8s (remain 60m 38s) Loss: 0.5222(0.5723) Grad: 0.7441 LR: 0.001000  
Epoch: [1][500/2585] Elapsed 13m 53s (remain 57m 48s) Loss: 0.5075(0.5595) Grad: 0.3945 LR: 0.001000  
Epoch: [1][600/2585] Elapsed 16m 39s (remain 55m 0s) Loss: 0.5195(0.5502) Grad: 0.5779 LR: 0.001000  
Epoch: [1][700/2585] Elapsed 19m 25s (remain 52m 13s) Loss: 0.5030(0.5429) Grad: 0.7195 LR: 0.001000  
Epoch: [1][800/2585] Elapsed 22m 11s (remain 49m 25s) Loss: 0.4929(0.5367) Grad: 0.4778 LR: 0.001000  
Epoch: [1][900/2585] Elapsed 24m 57s (remain 46m 39s) Loss: 0.4785(0.5317) Grad: 

Epoch 1 - avg_train_loss: 0.4966  avg_val_loss: 0.4723  time: 4600s
Epoch 1 - Accuracy: 0.7942961883851465
Epoch 1 - Save Best Score: 0.7943 Model


Epoch: [2][0/2585] Elapsed 0m 3s (remain 156m 20s) Loss: 0.4634(0.4634) Grad: 0.3431 LR: 0.000978  
Epoch: [2][100/2585] Elapsed 2m 49s (remain 69m 34s) Loss: 0.4652(0.4668) Grad: 0.2788 LR: 0.000978  
Epoch: [2][200/2585] Elapsed 5m 35s (remain 66m 23s) Loss: 0.4586(0.4663) Grad: 0.2182 LR: 0.000978  
Epoch: [2][300/2585] Elapsed 8m 21s (remain 63m 27s) Loss: 0.4383(0.4663) Grad: 0.2354 LR: 0.000978  
Epoch: [2][400/2585] Elapsed 11m 7s (remain 60m 36s) Loss: 0.4691(0.4660) Grad: 0.1934 LR: 0.000978  
Epoch: [2][500/2585] Elapsed 13m 53s (remain 57m 47s) Loss: 0.4688(0.4656) Grad: 0.2559 LR: 0.000978  
Epoch: [2][600/2585] Elapsed 16m 39s (remain 54m 59s) Loss: 0.4698(0.4654) Grad: 0.1531 LR: 0.000978  
Epoch: [2][700/2585] Elapsed 19m 25s (remain 52m 12s) Loss: 0.4768(0.4650) Grad: 0.3319 LR: 0.000978  
Epoch: [2][800/2585] Elapsed 22m 11s (remain 49m 25s) Loss: 0.4711(0.4649) Grad: 0.1866 LR: 0.000978  
Epoch: [2][900/2585] Elapsed 24m 57s (remain 46m 38s) Loss: 0.4505(0.4649) Grad:

Epoch 2 - avg_train_loss: 0.4631  avg_val_loss: 0.4607  time: 4599s
Epoch 2 - Accuracy: 0.8000682709039755
Epoch 2 - Save Best Score: 0.8001 Model
Epoch 2 - Save final model
[32m[I 2021-05-18 02:13:30,649][0m Trial 1 finished with value: 0.8000682709039755 and parameters: {'layers': 13, 'filters': 115}. Best is trial 1 with value: 0.8000682709039755.[0m


Epoch: [1][0/2585] Elapsed 0m 3s (remain 162m 46s) Loss: 1.3884(1.3884) Grad: 0.5084 LR: 0.001000  
Epoch: [1][100/2585] Elapsed 3m 25s (remain 84m 19s) Loss: 0.5848(0.6763) Grad: 1.1381 LR: 0.001000  
Epoch: [1][200/2585] Elapsed 6m 47s (remain 80m 34s) Loss: 0.5443(0.6198) Grad: 0.7290 LR: 0.001000  
Epoch: [1][300/2585] Elapsed 10m 9s (remain 77m 5s) Loss: 0.5000(0.5906) Grad: 0.5190 LR: 0.001000  
Epoch: [1][400/2585] Elapsed 13m 31s (remain 73m 39s) Loss: 0.5104(0.5723) Grad: 0.5240 LR: 0.001000  
Epoch: [1][500/2585] Elapsed 16m 53s (remain 70m 15s) Loss: 0.4938(0.5599) Grad: 0.4043 LR: 0.001000  
Epoch: [1][600/2585] Elapsed 20m 15s (remain 66m 52s) Loss: 0.5050(0.5504) Grad: 0.4823 LR: 0.001000  
Epoch: [1][700/2585] Elapsed 23m 37s (remain 63m 30s) Loss: 0.5046(0.5432) Grad: 0.4636 LR: 0.001000  
Epoch: [1][800/2585] Elapsed 26m 59s (remain 60m 7s) Loss: 0.4820(0.5373) Grad: 0.4990 LR: 0.001000  
Epoch: [1][900/2585] Elapsed 30m 21s (remain 56m 44s) Loss: 0.5047(0.5323) Grad: 

Epoch 1 - avg_train_loss: 0.4969  avg_val_loss: 0.4711  time: 5593s
Epoch 1 - Accuracy: 0.7953449106990467
Epoch 1 - Save Best Score: 0.7953 Model


Epoch: [2][0/2585] Elapsed 0m 3s (remain 167m 52s) Loss: 0.4676(0.4676) Grad: 0.2128 LR: 0.000978  
Epoch: [2][100/2585] Elapsed 3m 25s (remain 84m 21s) Loss: 0.4700(0.4652) Grad: 0.3543 LR: 0.000978  
Epoch: [2][200/2585] Elapsed 6m 47s (remain 80m 34s) Loss: 0.4739(0.4664) Grad: 0.2195 LR: 0.000978  
Epoch: [2][300/2585] Elapsed 10m 9s (remain 77m 4s) Loss: 0.4599(0.4669) Grad: 0.1827 LR: 0.000978  
Epoch: [2][400/2585] Elapsed 13m 31s (remain 73m 38s) Loss: 0.4512(0.4667) Grad: 0.1757 LR: 0.000978  
Epoch: [2][500/2585] Elapsed 16m 53s (remain 70m 14s) Loss: 0.4560(0.4664) Grad: 0.1621 LR: 0.000978  
Epoch: [2][600/2585] Elapsed 20m 15s (remain 66m 51s) Loss: 0.4628(0.4669) Grad: 0.3022 LR: 0.000978  
Epoch: [2][700/2585] Elapsed 23m 36s (remain 63m 27s) Loss: 0.4613(0.4667) Grad: 0.1443 LR: 0.000978  
Epoch: [2][800/2585] Elapsed 26m 58s (remain 60m 5s) Loss: 0.4480(0.4662) Grad: 0.1803 LR: 0.000978  
Epoch: [2][900/2585] Elapsed 30m 20s (remain 56m 42s) Loss: 0.4536(0.4660) Grad: 

Epoch 2 - avg_train_loss: 0.4638  avg_val_loss: 0.4610  time: 5592s
Epoch 2 - Accuracy: 0.7998671328299402
Epoch 2 - Save Best Score: 0.7999 Model
Epoch 2 - Save final model
[32m[I 2021-05-18 05:20:20,618][0m Trial 2 finished with value: 0.7998671328299402 and parameters: {'layers': 18, 'filters': 106}. Best is trial 1 with value: 0.8000682709039755.[0m


Epoch: [1][0/2585] Elapsed 0m 2s (remain 118m 31s) Loss: 1.3969(1.3969) Grad: 0.4876 LR: 0.001000  
Epoch: [1][100/2585] Elapsed 1m 33s (remain 38m 13s) Loss: 0.5741(0.6939) Grad: 0.6342 LR: 0.001000  
Epoch: [1][200/2585] Elapsed 3m 3s (remain 36m 21s) Loss: 0.5561(0.6335) Grad: 0.8800 LR: 0.001000  
Epoch: [1][300/2585] Elapsed 4m 34s (remain 34m 44s) Loss: 0.5194(0.6033) Grad: 0.6011 LR: 0.001000  
Epoch: [1][400/2585] Elapsed 6m 5s (remain 33m 9s) Loss: 0.5141(0.5832) Grad: 0.5230 LR: 0.001000  
Epoch: [1][500/2585] Elapsed 7m 35s (remain 31m 36s) Loss: 0.5263(0.5694) Grad: 0.3600 LR: 0.001000  
Epoch: [1][600/2585] Elapsed 9m 6s (remain 30m 4s) Loss: 0.5064(0.5593) Grad: 0.3620 LR: 0.001000  
Epoch: [1][700/2585] Elapsed 10m 37s (remain 28m 32s) Loss: 0.4945(0.5512) Grad: 0.4882 LR: 0.001000  
Epoch: [1][800/2585] Elapsed 12m 7s (remain 27m 1s) Loss: 0.5133(0.5448) Grad: 0.2691 LR: 0.001000  
Epoch: [1][900/2585] Elapsed 13m 38s (remain 25m 30s) Loss: 0.4961(0.5395) Grad: 0.3778 L

Epoch 1 - avg_train_loss: 0.5017  avg_val_loss: 0.4727  time: 2526s
Epoch 1 - Accuracy: 0.7942976389001035
Epoch 1 - Save Best Score: 0.7943 Model


Epoch: [2][0/2585] Elapsed 0m 2s (remain 110m 26s) Loss: 0.4612(0.4612) Grad: 0.3098 LR: 0.000978  
Epoch: [2][100/2585] Elapsed 1m 32s (remain 38m 6s) Loss: 0.4544(0.4705) Grad: 0.3551 LR: 0.000978  
Epoch: [2][200/2585] Elapsed 3m 3s (remain 36m 15s) Loss: 0.4979(0.4701) Grad: 0.2145 LR: 0.000978  
Epoch: [2][300/2585] Elapsed 4m 33s (remain 34m 37s) Loss: 0.4824(0.4698) Grad: 0.2939 LR: 0.000978  
Epoch: [2][400/2585] Elapsed 6m 4s (remain 33m 3s) Loss: 0.4583(0.4694) Grad: 0.2377 LR: 0.000978  
Epoch: [2][500/2585] Elapsed 7m 34s (remain 31m 30s) Loss: 0.4533(0.4695) Grad: 0.4056 LR: 0.000978  
Epoch: [2][600/2585] Elapsed 9m 4s (remain 29m 58s) Loss: 0.4639(0.4691) Grad: 0.2640 LR: 0.000978  
Epoch: [2][700/2585] Elapsed 10m 35s (remain 28m 27s) Loss: 0.4684(0.4690) Grad: 0.3592 LR: 0.000978  
Epoch: [2][800/2585] Elapsed 12m 5s (remain 26m 55s) Loss: 0.4595(0.4688) Grad: 0.2057 LR: 0.000978  
Epoch: [2][900/2585] Elapsed 13m 35s (remain 25m 24s) Loss: 0.4461(0.4686) Grad: 0.1939 

Epoch 2 - avg_train_loss: 0.4666  avg_val_loss: 0.4648  time: 2518s
Epoch 2 - Accuracy: 0.7978663891992722
Epoch 2 - Save Best Score: 0.7979 Model
Epoch 2 - Save final model
[32m[I 2021-05-18 06:44:45,622][0m Trial 3 finished with value: 0.7978663891992722 and parameters: {'layers': 8, 'filters': 94}. Best is trial 1 with value: 0.8000682709039755.[0m


Epoch: [1][0/2585] Elapsed 0m 2s (remain 109m 26s) Loss: 1.3955(1.3955) Grad: 0.4174 LR: 0.001000  
Epoch: [1][100/2585] Elapsed 1m 24s (remain 34m 43s) Loss: 0.5797(0.6848) Grad: 0.9614 LR: 0.001000  
Epoch: [1][200/2585] Elapsed 2m 46s (remain 32m 58s) Loss: 0.5798(0.6287) Grad: 0.6165 LR: 0.001000  
Epoch: [1][300/2585] Elapsed 4m 8s (remain 31m 28s) Loss: 0.5326(0.6003) Grad: 0.4294 LR: 0.001000  
Epoch: [1][400/2585] Elapsed 5m 30s (remain 30m 2s) Loss: 0.5349(0.5817) Grad: 0.6643 LR: 0.001000  
Epoch: [1][500/2585] Elapsed 6m 52s (remain 28m 37s) Loss: 0.5010(0.5686) Grad: 0.4155 LR: 0.001000  
Epoch: [1][600/2585] Elapsed 8m 14s (remain 27m 14s) Loss: 0.5151(0.5587) Grad: 0.5180 LR: 0.001000  
Epoch: [1][700/2585] Elapsed 9m 37s (remain 25m 50s) Loss: 0.4987(0.5512) Grad: 0.3074 LR: 0.001000  
Epoch: [1][800/2585] Elapsed 10m 59s (remain 24m 27s) Loss: 0.4685(0.5450) Grad: 0.3601 LR: 0.001000  
Epoch: [1][900/2585] Elapsed 12m 21s (remain 23m 5s) Loss: 0.4888(0.5398) Grad: 0.251

Epoch 1 - avg_train_loss: 0.5025  avg_val_loss: 0.4758  time: 2287s
Epoch 1 - Accuracy: 0.7923259055685753
Epoch 1 - Save Best Score: 0.7923 Model


Epoch: [2][0/2585] Elapsed 0m 2s (remain 108m 44s) Loss: 0.4576(0.4576) Grad: 0.3434 LR: 0.000978  
Epoch: [2][100/2585] Elapsed 1m 24s (remain 34m 36s) Loss: 0.4660(0.4705) Grad: 0.2676 LR: 0.000978  
Epoch: [2][200/2585] Elapsed 2m 46s (remain 32m 53s) Loss: 0.4635(0.4706) Grad: 0.2155 LR: 0.000978  
Epoch: [2][300/2585] Elapsed 4m 8s (remain 31m 24s) Loss: 0.4745(0.4708) Grad: 0.3035 LR: 0.000978  
Epoch: [2][400/2585] Elapsed 5m 30s (remain 29m 58s) Loss: 0.4874(0.4702) Grad: 0.2955 LR: 0.000978  
Epoch: [2][500/2585] Elapsed 6m 52s (remain 28m 34s) Loss: 0.4681(0.4701) Grad: 0.2946 LR: 0.000978  
Epoch: [2][600/2585] Elapsed 8m 14s (remain 27m 11s) Loss: 0.4692(0.4701) Grad: 0.2870 LR: 0.000978  
Epoch: [2][700/2585] Elapsed 9m 36s (remain 25m 48s) Loss: 0.4713(0.4698) Grad: 0.3658 LR: 0.000978  
Epoch: [2][800/2585] Elapsed 10m 58s (remain 24m 25s) Loss: 0.4474(0.4697) Grad: 0.2197 LR: 0.000978  
Epoch: [2][900/2585] Elapsed 12m 19s (remain 23m 2s) Loss: 0.4492(0.4695) Grad: 0.20

Epoch 2 - avg_train_loss: 0.4674  avg_val_loss: 0.4656  time: 2285s
Epoch 2 - Accuracy: 0.797216558498543
Epoch 2 - Save Best Score: 0.7972 Model
Epoch 2 - Save final model
[32m[I 2021-05-18 08:01:23,609][0m Trial 4 finished with value: 0.797216558498543 and parameters: {'layers': 8, 'filters': 85}. Best is trial 1 with value: 0.8000682709039755.[0m


Epoch: [1][0/2585] Elapsed 0m 3s (remain 156m 6s) Loss: 1.3966(1.3966) Grad: 0.5167 LR: 0.001000  
Epoch: [1][100/2585] Elapsed 2m 59s (remain 73m 31s) Loss: 0.5818(0.6888) Grad: 0.8461 LR: 0.001000  
Epoch: [1][200/2585] Elapsed 5m 55s (remain 70m 13s) Loss: 0.5407(0.6276) Grad: 0.7269 LR: 0.001000  
Epoch: [1][300/2585] Elapsed 8m 50s (remain 67m 7s) Loss: 0.5463(0.5964) Grad: 0.3950 LR: 0.001000  
Epoch: [1][400/2585] Elapsed 11m 46s (remain 64m 6s) Loss: 0.5147(0.5768) Grad: 0.4300 LR: 0.001000  
Epoch: [1][500/2585] Elapsed 14m 41s (remain 61m 8s) Loss: 0.5245(0.5638) Grad: 0.4617 LR: 0.001000  
Epoch: [1][600/2585] Elapsed 17m 37s (remain 58m 10s) Loss: 0.5080(0.5539) Grad: 0.5082 LR: 0.001000  
Epoch: [1][700/2585] Elapsed 20m 32s (remain 55m 13s) Loss: 0.5106(0.5460) Grad: 0.3342 LR: 0.001000  
Epoch: [1][800/2585] Elapsed 23m 28s (remain 52m 16s) Loss: 0.5118(0.5396) Grad: 0.3259 LR: 0.001000  
Epoch: [1][900/2585] Elapsed 26m 24s (remain 49m 20s) Loss: 0.4987(0.5348) Grad: 0.

Epoch 1 - avg_train_loss: 0.4988  avg_val_loss: 0.4722  time: 4868s
Epoch 1 - Accuracy: 0.7943372863089277
Epoch 1 - Save Best Score: 0.7943 Model


Epoch: [2][0/2585] Elapsed 0m 3s (remain 149m 16s) Loss: 0.4737(0.4737) Grad: 0.1835 LR: 0.000978  
Epoch: [2][100/2585] Elapsed 2m 58s (remain 73m 21s) Loss: 0.4905(0.4676) Grad: 0.3920 LR: 0.000978  
Epoch: [2][200/2585] Elapsed 5m 54s (remain 70m 3s) Loss: 0.4732(0.4677) Grad: 0.3793 LR: 0.000978  
Epoch: [2][300/2585] Elapsed 8m 49s (remain 67m 1s) Loss: 0.4751(0.4679) Grad: 0.2732 LR: 0.000978  
Epoch: [2][400/2585] Elapsed 11m 45s (remain 64m 2s) Loss: 0.4689(0.4679) Grad: 0.2878 LR: 0.000978  
Epoch: [2][500/2585] Elapsed 14m 40s (remain 61m 4s) Loss: 0.4579(0.4681) Grad: 0.1950 LR: 0.000978  
Epoch: [2][600/2585] Elapsed 17m 36s (remain 58m 7s) Loss: 0.4672(0.4677) Grad: 0.1972 LR: 0.000978  
Epoch: [2][700/2585] Elapsed 20m 31s (remain 55m 10s) Loss: 0.4532(0.4674) Grad: 0.3145 LR: 0.000978  
Epoch: [2][800/2585] Elapsed 23m 27s (remain 52m 15s) Loss: 0.4612(0.4674) Grad: 0.2188 LR: 0.000978  
Epoch: [2][900/2585] Elapsed 26m 23s (remain 49m 19s) Loss: 0.4635(0.4673) Grad: 0.2

Epoch 2 - avg_train_loss: 0.4650  avg_val_loss: 0.4655  time: 4867s
Epoch 2 - Accuracy: 0.7974027079180227
Epoch 2 - Save Best Score: 0.7974 Model
Epoch 2 - Save final model
[32m[I 2021-05-18 10:43:59,798][0m Trial 5 finished with value: 0.7974027079180227 and parameters: {'layers': 18, 'filters': 90}. Best is trial 1 with value: 0.8000682709039755.[0m


Epoch: [1][0/2585] Elapsed 0m 3s (remain 135m 33s) Loss: 1.3955(1.3955) Grad: 0.5572 LR: 0.001000  
Epoch: [1][100/2585] Elapsed 2m 6s (remain 51m 57s) Loss: 0.5920(0.6793) Grad: 0.6077 LR: 0.001000  
Epoch: [1][200/2585] Elapsed 4m 10s (remain 49m 29s) Loss: 0.5361(0.6220) Grad: 0.6570 LR: 0.001000  
Epoch: [1][300/2585] Elapsed 6m 13s (remain 47m 17s) Loss: 0.5300(0.5920) Grad: 0.3823 LR: 0.001000  
Epoch: [1][400/2585] Elapsed 8m 17s (remain 45m 9s) Loss: 0.5093(0.5736) Grad: 0.4870 LR: 0.001000  
Epoch: [1][500/2585] Elapsed 10m 21s (remain 43m 3s) Loss: 0.5125(0.5611) Grad: 0.4945 LR: 0.001000  
Epoch: [1][600/2585] Elapsed 12m 24s (remain 40m 57s) Loss: 0.5038(0.5517) Grad: 0.5214 LR: 0.001000  
Epoch: [1][700/2585] Elapsed 14m 28s (remain 38m 53s) Loss: 0.4926(0.5442) Grad: 0.4893 LR: 0.001000  
Epoch: [1][800/2585] Elapsed 16m 31s (remain 36m 48s) Loss: 0.4799(0.5379) Grad: 0.3401 LR: 0.001000  
Epoch: [1][900/2585] Elapsed 18m 35s (remain 34m 44s) Loss: 0.4822(0.5328) Grad: 0.

Epoch 1 - avg_train_loss: 0.4976  avg_val_loss: 0.4703  time: 3433s
Epoch 1 - Accuracy: 0.7956263106007018
Epoch 1 - Save Best Score: 0.7956 Model


Epoch: [2][0/2585] Elapsed 0m 3s (remain 129m 53s) Loss: 0.4980(0.4980) Grad: 0.3224 LR: 0.000978  
Epoch: [2][100/2585] Elapsed 2m 6s (remain 51m 53s) Loss: 0.4811(0.4682) Grad: 0.2749 LR: 0.000978  
Epoch: [2][200/2585] Elapsed 4m 10s (remain 49m 26s) Loss: 0.4752(0.4669) Grad: 0.2633 LR: 0.000978  
Epoch: [2][300/2585] Elapsed 6m 13s (remain 47m 14s) Loss: 0.4589(0.4666) Grad: 0.2230 LR: 0.000978  
Epoch: [2][400/2585] Elapsed 8m 17s (remain 45m 7s) Loss: 0.4721(0.4666) Grad: 0.3617 LR: 0.000978  
Epoch: [2][500/2585] Elapsed 10m 20s (remain 43m 1s) Loss: 0.4579(0.4669) Grad: 0.3424 LR: 0.000978  
Epoch: [2][600/2585] Elapsed 12m 24s (remain 40m 56s) Loss: 0.4685(0.4668) Grad: 0.2137 LR: 0.000978  
Epoch: [2][700/2585] Elapsed 14m 27s (remain 38m 51s) Loss: 0.4426(0.4668) Grad: 0.2808 LR: 0.000978  
Epoch: [2][800/2585] Elapsed 16m 31s (remain 36m 47s) Loss: 0.4587(0.4665) Grad: 0.2691 LR: 0.000978  
Epoch: [2][900/2585] Elapsed 18m 34s (remain 34m 43s) Loss: 0.4673(0.4663) Grad: 0.

Epoch 2 - avg_train_loss: 0.4640  avg_val_loss: 0.4627  time: 3432s
Epoch 2 - Accuracy: 0.7991167330921933
Epoch 2 - Save Best Score: 0.7991 Model
Epoch 2 - Save final model
[32m[I 2021-05-18 12:38:48,972][0m Trial 6 finished with value: 0.7991167330921933 and parameters: {'layers': 9, 'filters': 118}. Best is trial 1 with value: 0.8000682709039755.[0m


Epoch: [1][0/2585] Elapsed 0m 3s (remain 130m 40s) Loss: 1.3876(1.3876) Grad: 0.4272 LR: 0.001000  
Epoch: [1][100/2585] Elapsed 2m 3s (remain 50m 26s) Loss: 0.5691(0.6718) Grad: 0.7916 LR: 0.001000  
Epoch: [1][200/2585] Elapsed 4m 3s (remain 48m 3s) Loss: 0.5474(0.6162) Grad: 0.4461 LR: 0.001000  
Epoch: [1][300/2585] Elapsed 6m 2s (remain 45m 54s) Loss: 0.5295(0.5890) Grad: 0.3367 LR: 0.001000  
Epoch: [1][400/2585] Elapsed 8m 2s (remain 43m 49s) Loss: 0.5023(0.5711) Grad: 0.4730 LR: 0.001000  
Epoch: [1][500/2585] Elapsed 10m 2s (remain 41m 46s) Loss: 0.5064(0.5588) Grad: 0.3641 LR: 0.001000  
Epoch: [1][600/2585] Elapsed 12m 2s (remain 39m 44s) Loss: 0.4975(0.5495) Grad: 0.3514 LR: 0.001000  
Epoch: [1][700/2585] Elapsed 14m 2s (remain 37m 43s) Loss: 0.4856(0.5425) Grad: 0.4844 LR: 0.001000  
Epoch: [1][800/2585] Elapsed 16m 2s (remain 35m 42s) Loss: 0.4797(0.5366) Grad: 0.3621 LR: 0.001000  
Epoch: [1][900/2585] Elapsed 18m 2s (remain 33m 42s) Loss: 0.5039(0.5318) Grad: 0.3740 LR

Epoch 1 - avg_train_loss: 0.4976  avg_val_loss: 0.4720  time: 3331s
Epoch 1 - Accuracy: 0.794675739798891
Epoch 1 - Save Best Score: 0.7947 Model


Epoch: [2][0/2585] Elapsed 0m 3s (remain 132m 3s) Loss: 0.4579(0.4579) Grad: 0.2485 LR: 0.000978  
Epoch: [2][100/2585] Elapsed 2m 2s (remain 50m 22s) Loss: 0.4648(0.4668) Grad: 0.2665 LR: 0.000978  
Epoch: [2][200/2585] Elapsed 4m 2s (remain 47m 57s) Loss: 0.4636(0.4678) Grad: 0.3235 LR: 0.000978  
Epoch: [2][300/2585] Elapsed 6m 2s (remain 45m 48s) Loss: 0.4697(0.4671) Grad: 0.2186 LR: 0.000978  
Epoch: [2][400/2585] Elapsed 8m 1s (remain 43m 44s) Loss: 0.4465(0.4669) Grad: 0.3244 LR: 0.000978  
Epoch: [2][500/2585] Elapsed 10m 1s (remain 41m 43s) Loss: 0.4672(0.4668) Grad: 0.2889 LR: 0.000978  
Epoch: [2][600/2585] Elapsed 12m 1s (remain 39m 42s) Loss: 0.4760(0.4666) Grad: 0.1788 LR: 0.000978  
Epoch: [2][700/2585] Elapsed 14m 1s (remain 37m 41s) Loss: 0.4650(0.4665) Grad: 0.2449 LR: 0.000978  
Epoch: [2][800/2585] Elapsed 16m 1s (remain 35m 40s) Loss: 0.4556(0.4664) Grad: 0.2650 LR: 0.000978  
Epoch: [2][900/2585] Elapsed 18m 0s (remain 33m 40s) Loss: 0.4546(0.4663) Grad: 0.2111 LR