# About this notebook ...

## Library

In [1]:
import glob
import json
import math
import os
import random
import time
import warnings
from collections import defaultdict
from contextlib import contextmanager

import numpy as np
import optuna
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedKFold
from torch.optim import SGD, Adam
from torch.optim.lr_scheduler import CosineAnnealingLR, CosineAnnealingWarmRestarts, ReduceLROnPlateau
from torch.utils.data import DataLoader, Dataset
from tqdm.notebook import tqdm

In [2]:
warnings.filterwarnings("ignore")

## Config

In [3]:
class Config:
    seed = 440

    n_class = 4
    n_fold = 5

    geese_net_layers = 12
    geese_net_filters = 32

    gradient_accumulation_steps = 1
    max_grad_norm = 1000

    num_workers = 4
    batch_size = 3200

    scheduler = "CosineAnnealingWarmRestarts"
    # factor = 0.2  # ReduceLROnPlateau
    # patience = 4  # ReduceLROnPlateau
    # eps = 1e-6  # ReduceLROnPlateau
    # T_max = 10  # CosineAnnealingLR
    T_0 = 10  # CosineAnnealingWarmRestarts

    criterion = "CrossEntropyLoss"
    lr = 1e-3
    min_lr = 1e-4
    weight_decay = 1e-5

    epochs = 10
    model_name = "geese_net"
    pre_train_file = ""

    print_freq = 100

    train = True
    tuning = False
    debug = False
    apex = False

In [4]:
if Config.tuning:
    Config.epochs = 2

if Config.debug:
    Config.epochs = 1

In [5]:
if Config.apex:
    from apex import amp

In [6]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

## Load Data

In [7]:
BASE_DIR = "../input/hungrygeeseepisode/hungry-geese-episode/"
OUTPUT_DIR = "pre-models/"

if not os.path.exists(OUTPUT_DIR):
    os.makedirs(OUTPUT_DIR)

In [8]:
paths = [path for path in glob.glob(BASE_DIR + "*.json") if "info" not in path]
print(len(paths))

27362


In [9]:
# fit for memory size...
paths = paths[-11000:]
# paths = paths[:-11000]
print(len(paths))

11000


In [10]:
if Config.debug:
    paths = paths[:10]

## Utils

In [11]:
@contextmanager
def timer(name):
    t0 = time.time()
    LOGGER.info(f"[{name}] start")
    yield
    LOGGER.info(f"[{name}] done in {time.time() - t0:.0f} s.")


def init_logger(log_file=OUTPUT_DIR + "train.log"):
    from logging import INFO, FileHandler, Formatter, StreamHandler, getLogger

    logger = getLogger(__name__)
    logger.setLevel(INFO)
    handler1 = StreamHandler()
    handler1.setFormatter(Formatter("%(message)s"))
    handler2 = FileHandler(filename=log_file)
    handler2.setFormatter(Formatter("%(message)s"))
    logger.addHandler(handler1)
    logger.addHandler(handler2)
    return logger


LOGGER = init_logger()


def seed_torch(seed=42):
    random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True


seed_torch(seed=Config.seed)

In [12]:
def reverse_ns(y):
    if y == 0:
        return 1
    if y == 1:
        return 0
    return y


def reverse_we(y):
    if y == 2:
        return 3
    if y == 3:
        return 2
    return y


def reverse_nswe(y):
    return reverse_ns(reverse_we(y))

## Observation

In [13]:
next_position_map = {}
for pos in range(77):
    position = []
    position.append((11 * (1 + pos // 11) + pos % 11) % 77)
    position.append((11 * (-1 + pos // 11) + pos % 11) % 77)
    position.append((11 * (pos // 11) + (pos + 1) % 11) % 77)
    position.append((11 * (pos // 11) + (pos - 1) % 11) % 77)
    next_position_map[pos] = set(position)

In [14]:
def make_input(obses):
    b = np.zeros((17, 7 * 11), dtype=np.float16)
    obs = obses[-1]

    for p, pos_list in enumerate(obs["geese"]):
        pid = (p - obs["index"]) % 4

        # head position
        for pos in pos_list[:1]:
            b[0 + pid, pos] = 1
        # tip position
        for pos in pos_list[-1:]:
            b[4 + pid, pos] = 1
        # whole position
        for pos in pos_list:
            b[8 + pid, pos] = 1

    # previous head position
    if len(obses) > 1:
        obs_prev = obses[-2]
        for p, pos_list in enumerate(obs_prev["geese"]):
            for pos in pos_list[:1]:
                b[12 + (p - obs["index"]) % 4, pos] = 1

    # food
    for pos in obs["food"]:
        b[16, pos] = 1

    return b.reshape(-1, 7, 11)

In [15]:
def get_reverse_cube(obses):
    """
    尻尾から順番に 1, 0.9, 0.8, ... という並び
    """
    b = np.zeros((4, 7 * 11), dtype=np.float16)
    obs = obses[-1]

    for p, geese in enumerate(obs["geese"]):
        # whole position reverse
        for num_reverse, pos in enumerate(geese[::-1]):
            b[(p - obs["index"]) % 4, pos] = 1 - num_reverse * 0.1

    return b.reshape(-1, 7, 11)

In [16]:
def get_next_disappear_cube(obses):
    """
    次になくなる場所: 1
    次になくなる可能性のある場所: 0.5
    """
    b = np.zeros((4, 7 * 11), dtype=np.float16)
    obs = obses[-1]
    step = obs["step"]

    # foodを食べる可能性があるか。
    eat_food_possibility = defaultdict(int)
    for p, geese in enumerate(obs["geese"]):
        for pos in geese[:1]:
            if not next_position_map[pos].isdisjoint(obs["food"]):
                eat_food_possibility[p] = 1

    if (step % 40) == 39:  # 1つ短くなる
        for p, geese in enumerate(obs["geese"]):
            if eat_food_possibility[p]:  # 尻尾が1、尻尾の１つ前0.5
                for pos in geese[-1:]:
                    b[(p - obs["index"]) % 4, pos] = 1
                for pos in geese[-2:-1]:
                    b[(p - obs["index"]) % 4, pos] = 0.5
            else:  # 食べる可能性なし -> 尻尾が1, 尻尾の1つ前1
                for pos in geese[-2:]:
                    b[(p - obs["index"]) % 4, pos] = 1
    else:  # 1つ短くならない
        for p, geese in enumerate(obs["geese"]):
            if eat_food_possibility[p]:  # 食べる可能性があり -> 尻尾を0.5
                for pos in geese[-1:]:
                    b[(p - obs["index"]) % 4, pos] = 0.5
            else:  # 食べる可能性なし # 尻尾を1
                for pos in geese[-1:]:
                    b[(p - obs["index"]) % 4, pos] = 1

    return b.reshape(-1, 7, 11)

In [17]:
def get_step_cube_v2(obses):
    """
    step0: 0, step199: 1
    step0: 0, step39 + 40n: 1
    """
    b = np.zeros((1, 7, 11), dtype=np.float16)
    obs = obses[-1]
    step = obs["step"]

    b[:, :, :5] = (step % 200) / 199
    b[:, :, 5:] = (step % 40) / 39

    return b

In [18]:
def get_length_cube(obses):
    b = np.zeros((2, 7, 11), dtype=np.float16)
    obs = obses[-1]

    my_length = len(obs["geese"][obs["index"]])
    opposite1_length = len(obs["geese"][(obs["index"] + 1) % 4])
    opposite2_length = len(obs["geese"][(obs["index"] + 2) % 4])
    opposite3_length = len(obs["geese"][(obs["index"] + 3) % 4])

    b[0] = my_length / 10
    max_opposite_length = max(opposite1_length, opposite2_length, opposite3_length)
    b[1, :, 0:2] = (my_length - max_opposite_length) / 10
    b[1, :, 2:5] = (my_length - opposite1_length) / 10
    b[1, :, 5:8] = (my_length - opposite2_length) / 10
    b[1, :, 8:11] = (my_length - opposite3_length) / 10

    return b

In [19]:
def get_features(obses):
    b = np.zeros((7 * 11), dtype=np.float16)
    obs = obses[-1]
    step = obs["step"]

    my_goose = obs["geese"][obs["index"]]
    my_length = len(my_goose)

    # num step
    b[0] = (step - 194) if step >= 195 else 0
    b[1] = (step % 40 - 35) if step % 40 > 35 else 0

    """
    2-4: difference between my_length and opponent length (-3 to 3)
    """
    for p, pos_list in enumerate(obs["geese"]):
        pid = (p - obs["index"]) % 4
        p_length = len(pos_list)

        if pid == 0:
            continue

        b[1 + pid] = max(min(my_length - p_length, 3), -3) + 3

    """
    5-7: difference between my head position and opponent one
    """
    if my_length != 0:

        for p, pos_list in enumerate(obs["geese"]):
            pid = (p - obs["index"]) % 4

            if pid == 0 or len(pos_list) == 0:
                continue

            diff = abs(my_goose[0] - pos_list[0])
            x_ = diff % 11
            x = min(x_, 11 - x_)
            y_ = diff // 11
            y = min(y_, 7 - y_)
            b[4 + pid] = x + y

    return b.reshape(1, 7, 11)

## Data

In [20]:
def create_dataset_from_json(filepath, json_object=None, standing=0):
    if json_object is None:
        json_open = open(path, "r")
        json_load = json.load(json_open)
    else:
        json_load = json_object

    try:
        winner_index = np.argmax(np.argsort(json_load["rewards"]) == 3 - standing)

        obses = []
        X = []
        y = []
        actions = {"NORTH": 0, "SOUTH": 1, "WEST": 2, "EAST": 3}

        for i in range(len(json_load["steps"]) - 1):
            if json_load["steps"][i][winner_index]["status"] == "ACTIVE":
                y_ = json_load["steps"][i + 1][winner_index]["action"]
                if y_ is not None:
                    step = json_load["steps"][i]
                    step[winner_index]["observation"]["geese"] = step[0]["observation"]["geese"]
                    step[winner_index]["observation"]["food"] = step[0]["observation"]["food"]
                    step[winner_index]["observation"]["step"] = step[0]["observation"]["step"]
                    obses.append(step[winner_index]["observation"])
                    y.append(actions[y_])

                    y.append(reverse_ns(actions[y_]))  # 上下反転
                    y.append(reverse_we(actions[y_]))  # 左右反転
                    y.append(reverse_nswe(actions[y_]))  # 上下左右反転

        for j in range(len(obses)):
            # X_ = make_input(obses[: j + 1])

            # 反転可能な特徴量
            X_ = []
            X_.append(make_input(obses[: j + 1]))
            X_.append(get_reverse_cube(obses[: j + 1]))
            X_.append(get_next_disappear_cube(obses[: j + 1]))

            # 反転不可能な特徴量
            X_i = []
            # X_i.append(get_step_cube_v2(obses[: j + 1]))
            # X_i.append(get_length_cube(obses[: j + 1]))
            X_i.append(get_features(obses[: j + 1]))

            X_ = np.concatenate(X_)
            X_i = np.concatenate(X_i)

            X.append(np.concatenate([X_, X_i]))
            X.append(np.concatenate([X_[:, ::-1, :], X_i]))  # 上下反転
            X.append(np.concatenate([X_[:, :, ::-1], X_i]))  # 左右反転
            X.append(np.concatenate([X_[:, ::-1, ::-1], X_i]))  # 上下左右反転

        X = np.array(X, dtype=np.float16)  # [starting_step:]
        y = np.array(y, dtype=np.uint8)  # [starting_step:]

        return X, y
    except Exception as e:
        if Config.debug:
            raise Exception from e
        return 0, 0

In [21]:
X_train = []
y_train = []

for path in tqdm(paths[: int(len(paths))]):
    X, y = create_dataset_from_json(path, standing=0)  # use only winners' moves
    if X is not 0:
        X_train.append(X)
        y_train.append(y)

X_train = np.concatenate(X_train)
y_train = np.concatenate(y_train)

print(f"Num episode: {len(X_train)}")

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=11000.0), HTML(value='')))


Num episode: 6861292


In [22]:
unique_ = False

In [23]:
# %%time

# X_train, unique_index = np.unique(X_train, axis=0, return_index=True)  # remove duplicate
# y_train = y_train[unique_index]

# y_train = np.eye(4, dtype="uint8")[y_train]  # to categorical

# print(f"Num episode: {len(X_train)}")

In [24]:
if unique_:
    X_train_sum_obs = X_train.reshape(X_train.shape[0], -1).sum(1)
    X_train_group = np.unique(X_train_sum_obs)
    X_train_group.shape

In [25]:
if unique_:
    X_train_unique = []
    y_train_unique = []
    for group in tqdm(X_train_group):
        group_index = np.where(X_train_sum_obs == group)

        X_train_ = X_train[group_index]
        y_train_ = y_train[group_index]

        X_train_, unique_index = np.unique(X_train_, axis=0, return_index=True)  # remove duplicate
        y_train_ = y_train_[unique_index]

        X_train_unique.append(X_train_)
        y_train_unique.append(y_train_)

    X_train = np.concatenate(X_train_unique)
    y_train = np.concatenate(y_train_unique)

    print(f"Num episode: {len(X_train)}")

In [26]:
if unique_:
    del X_train_sum_obs
    del X_train_group
    del X_train_unique
    del y_train_unique
    del X_train_
    del y_train_
    del group_index
    del unique_index

In [27]:
X_train = X_train.astype(np.float32)
X_train.dtype

dtype('float32')

In [28]:
if Config.debug:
    X_train = X_train[:1000]
    y_train = y_train[:1000]

In [29]:
y_df = pd.DataFrame(y_train, dtype=np.uint8)
y_df.columns = ["action"]
y_df

Unnamed: 0,action
0,3
1,3
2,2
3,2
4,0
...,...
6861287,3
6861288,1
6861289,0
6861290,1


## CV Split

In [30]:
folds = y_df.copy()
Fold = StratifiedKFold(n_splits=Config.n_fold, shuffle=True, random_state=Config.seed)
for n, (train_index, val_index) in enumerate(Fold.split(folds, folds["action"])):
    folds.loc[val_index, "fold"] = int(n)
folds["fold"] = folds["fold"].astype(np.uint8)
print(folds.groupby(["fold", "action"]).size())

fold  action
0     0         330851
      1         330851
      2         355278
      3         355279
1     0         330851
      1         330851
      2         355278
      3         355279
2     0         330851
      1         330850
      2         355279
      3         355278
3     0         330850
      1         330851
      2         355279
      3         355278
4     0         330851
      1         330851
      2         355278
      3         355278
dtype: int64


## Dataset

In [31]:
class TrainDataset(Dataset):
    def __init__(self, array, label):
        self.array = array
        self.label = label

    def __len__(self):
        return self.array.shape[0]

    def __getitem__(self, idx):
        return self.array[idx], torch.tensor(self.label[idx]).long()


class TestDataset(Dataset):
    def __init__(self, array):
        self.array = array

    def __len__(self):
        return self.array.shape[0]

    def __getitem__(self, idx):
        return self.array[idx]

In [32]:
# Test

if Config.debug or False:
    train_ds = TrainDataset(X_train, y_train)

    for i in range(1):
        obs, action = train_ds[i]
        print(obs.shape, action)

## Model

In [33]:
class TorusConv2d(nn.Module):
    def __init__(self, input_dim, output_dim, kernel_size, do=False, bn=True):
        super().__init__()
        self.edge_size = (kernel_size[0] // 2, kernel_size[1] // 2)
        self.conv = nn.Conv2d(input_dim, output_dim, kernel_size=kernel_size)
        self.do = nn.Dropout2d(p=0.1) if do else None
        self.bn = nn.BatchNorm2d(output_dim) if bn else None

    def forward(self, x):
        h = torch.cat([x[:, :, :, -self.edge_size[1] :], x, x[:, :, :, : self.edge_size[1]]], dim=3)
        h = torch.cat([h[:, :, -self.edge_size[0] :], h, h[:, :, : self.edge_size[0]]], dim=2)
        h = self.conv(h)
        h = self.do(h) if self.do is not None else h
        h = self.bn(h) if self.bn is not None else h
        return h

In [34]:
class GeeseNetAlpha(nn.Module):
    def __init__(self):
        super().__init__()

        layers = Config.geese_net_layers
        filters = Config.geese_net_filters
        dim = filters * 5 + 30

        self.embed_step = nn.Embedding(5, 3)
        self.embed_hunger = nn.Embedding(5, 3)
        self.embed_diff_len = nn.Embedding(7, 4)
        self.embed_diff_head = nn.Embedding(9, 4)

        self.conv0 = TorusConv2d(25, filters, (3, 3))
        self.blocks = nn.ModuleList([TorusConv2d(filters, filters, (3, 3), True) for _ in range(layers)])
        self.conv1 = TorusConv2d(filters, filters, (5, 5))

        self.head_p1 = nn.Linear(dim, dim // 2, bias=True)
        self.head_p2 = nn.Linear(dim // 2, 4, bias=False)
        self.head_v1 = nn.Linear(dim, dim // 2, bias=True)
        self.head_v2 = nn.Linear(dim // 2, 1, bias=False)

    def forward(self, x, _=None):
        x_feats = x[:, -1].view(x.size(0), -1).long()

        # Embedding for features
        e_step = self.embed_step(x_feats[:, 0])
        e_hung = self.embed_hunger(x_feats[:, 1])
        e_diff_l = self.embed_diff_len(x_feats[:, 2:5]).view(x.size(0), -1)
        e_diff_h = self.embed_diff_head(x_feats[:, 5:8]).view(x.size(0), -1)

        x = x[:, :-1].float()

        # CNN for observation
        h = F.relu_(self.conv0(x))

        for block in self.blocks:
            h = F.relu_(h + block(h))

        h = F.relu_(h + self.conv1(h))

        # Extract head position
        h_head = (h * x[:, :1]).view(h.size(0), h.size(1), -1).sum(-1)
        h_head2 = (h * x[:, 1:2]).view(h.size(0), h.size(1), -1).sum(-1)
        h_head3 = (h * x[:, 2:3]).view(h.size(0), h.size(1), -1).sum(-1)
        h_head4 = (h * x[:, 3:4]).view(h.size(0), h.size(1), -1).sum(-1)
        h_avg = h.view(h.size(0), h.size(1), -1).mean(-1)

        # Merge features
        h = torch.cat(
            [
                h_head,
                h_head2,
                h_head3,
                h_head4,
                h_avg,
                e_step,
                e_hung,
                e_diff_l,
                e_diff_h,
            ],
            1,
        ).view(1, h.size(0), -1)

        h_p = F.relu_(self.head_p1(h.view(x.size(0), -1)))
        p = self.head_p2(h_p)

        h_v = F.relu_(self.head_v1(h.view(x.size(0), -1)))
        v = torch.tanh(self.head_v2(h_v))

        return {"policy": p, "value": v}

In [35]:
# Test

if Config.debug or False:
    model = GeeseNetAlpha()
    # print(model)

    params = sum(p.numel() for p in model.parameters())
    print(f"params: {params:,}")

    train_ds = TrainDataset(X_train, y_train)
    train_loader = DataLoader(train_ds, batch_size=4, shuffle=True, num_workers=4, pin_memory=True, drop_last=True)

    for obs, action in train_loader:
        print(f"input shape: {obs.shape}")
        output = model(obs)
        print(output)
        print(f"{torch.argmax(output['policy'], dim=1)}")
        break

## Loss

## Scoring

In [36]:
def get_score(y_true, y_pred):
    return accuracy_score(y_true, y_pred)

In [37]:
def get_result(result_df):
    preds = result_df["preds"].values
    labels = result_df["action"].values
    score = get_score(labels, preds)
    LOGGER.info(f"Score: {score:<.5f}")
    return score

## Helper functions

In [38]:
class AverageMeter(object):
    """Computes and stores the average and current value"""

    def __init__(self):
        self.reset()

    def reset(self):
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count


def asMinutes(s):
    m = math.floor(s / 60)
    s -= m * 60
    return "%dm %ds" % (m, s)


def timeSince(since, percent):
    now = time.time()
    s = now - since
    es = s / (percent)
    rs = es - s
    return "%s (remain %s)" % (asMinutes(s), asMinutes(rs))

In [39]:
def train_fn(train_loader, model, criterion, optimizer, epoch, scheduler, device):
    losses = AverageMeter()

    # switch to train mode
    model.train()
    start = time.time()

    for step, (obs, action) in enumerate(train_loader):
        obs = obs.to(device)
        action = action.to(device)
        batch_size = action.size(0)

        y_preds = model(obs)["policy"]

        loss = criterion(y_preds, action)
        losses.update(loss.item(), batch_size)
        if Config.gradient_accumulation_steps > 1:
            loss = loss / Config.gradient_accumulation_steps
        if Config.apex:
            with amp.scale_loss(loss, optimizer) as scaled_loss:
                scaled_loss.backward()
        else:
            loss.backward()

        grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), Config.max_grad_norm)

        if (step + 1) % Config.gradient_accumulation_steps == 0:
            optimizer.step()
            optimizer.zero_grad()

        if step % Config.print_freq == 0 or step == (len(train_loader) - 1):
            print(
                f"Epoch: [{epoch + 1}][{step}/{len(train_loader)}] "
                f"Elapsed {timeSince(start, float(step + 1) / len(train_loader)):s} "
                f"Loss avg.: {losses.avg:.4f} "
                f"Grad: {grad_norm:.4f} "
                f"LR: {scheduler.get_last_lr()[0]:.5f}  "
            )

    return losses.avg

In [40]:
def valid_fn(valid_loader, model, criterion, device):
    losses = AverageMeter()

    # switch to evaluation mode
    model.eval()
    preds = []
    start = time.time()

    for step, (obs, action) in enumerate(valid_loader):
        obs = obs.to(device)
        action = action.to(device)
        batch_size = action.size(0)

        # compute loss
        with torch.no_grad():
            y_preds = model(obs)["policy"]

        loss = criterion(y_preds, action)
        losses.update(loss.item(), batch_size)

        # record accuracy
        preds.append(y_preds.softmax(1).to("cpu").numpy())
        if Config.gradient_accumulation_steps > 1:
            loss = loss / Config.gradient_accumulation_steps

        if step % Config.print_freq == 0 or step == (len(valid_loader) - 1):
            print(
                f"Eval: [{step}/{len(valid_loader)}] "
                f"Elapsed {timeSince(start, float(step + 1) / len(valid_loader)):s} "
                f"Loss avg.: {losses.avg:.4f} "
            )
    predictions = np.concatenate(preds)
    return losses.avg, predictions

## Train loop

In [41]:
def train_loop(folds, fold):

    LOGGER.info(f"========== fold: {fold} training ==========")

    # ====================================================
    # Data Loader
    # ====================================================
    # X_train_folds = X_train[folds["fold"] != fold]
    # X_valid_folds = X_train[folds["fold"] == fold]

    # y_train_folds = y_train[folds["fold"] != fold]
    y_valid_folds = y_train[folds["fold"] == fold]

    # y_df_train_folds = y_df[folds["fold"] != fold]
    y_df_valid_folds = y_df[folds["fold"] == fold]

    # train_dataset = TrainDataset(X_train[folds["fold"] != fold], y_train[folds["fold"] != fold])
    # valid_dataset = TrainDataset(X_train[folds["fold"] == fold], y_valid_folds)

    train_loader = DataLoader(
        TrainDataset(X_train[folds["fold"] != fold], y_train[folds["fold"] != fold]),
        batch_size=Config.batch_size,
        shuffle=True,
        num_workers=Config.num_workers,
        pin_memory=True,
        drop_last=True,
    )
    valid_loader = DataLoader(
        TrainDataset(X_train[folds["fold"] == fold], y_valid_folds),
        batch_size=Config.batch_size,
        shuffle=False,
        num_workers=Config.num_workers,
        pin_memory=True,
        drop_last=False,
    )

    # ====================================================
    # Scheduler
    # ====================================================
    def get_scheduler(optimizer):
        if Config.scheduler == "ReduceLROnPlateau":
            scheduler = ReduceLROnPlateau(
                optimizer, mode="min", factor=Config.factor, patience=Config.patience, verbose=True, eps=Config.eps
            )
        elif Config.scheduler == "CosineAnnealingLR":
            scheduler = CosineAnnealingLR(optimizer, T_max=Config.T_max, eta_min=Config.min_lr, last_epoch=-1)
        elif Config.scheduler == "CosineAnnealingWarmRestarts":
            scheduler = CosineAnnealingWarmRestarts(
                optimizer, T_0=Config.T_0, T_mult=1, eta_min=Config.min_lr, last_epoch=-1
            )
        return scheduler

    # ====================================================
    # model & optimizer
    # ====================================================
    model = GeeseNetAlpha()
    # try:
    #     model.load_state_dict(torch.load(os.path.join(OUTPUT_DIR, Config.pre_train_file)))
    # except:
    #     print(f"Failed to load pre-train weight.")

    # Disable training for value network
    # for param in model.head_v1.parameters():
    #     param.requires_grad = False
    # for param in model.head_v2.parameters():
    #     param.requires_grad = False

    model.to(device)

    # Use multi GPU
    if device == torch.device("cuda") and not Config.apex:
        model = torch.nn.DataParallel(model)  # make parallel

    optimizer = Adam(model.parameters(), lr=Config.lr, weight_decay=Config.weight_decay, amsgrad=False)
    scheduler = get_scheduler(optimizer)

    # ====================================================
    # apex
    # ====================================================
    if Config.apex:
        model, optimizer = amp.initialize(model, optimizer, opt_level="O1", verbosity=0)

    # ====================================================
    # Criterion
    # ====================================================
    def get_criterion():
        if Config.criterion == "CrossEntropyLoss":
            criterion = nn.CrossEntropyLoss()
        return criterion

    criterion = get_criterion()

    # ====================================================
    # loop
    # ====================================================
    best_score = 0.0
    best_loss = np.inf
    best_preds = None

    for epoch in range(Config.epochs):

        start_time = time.time()

        # train
        avg_loss = train_fn(train_loader, model, criterion, optimizer, epoch, scheduler, device)

        # eval
        avg_val_loss, preds = valid_fn(valid_loader, model, criterion, device)

        if isinstance(scheduler, ReduceLROnPlateau):
            scheduler.step(avg_val_loss)
        elif isinstance(scheduler, CosineAnnealingLR):
            scheduler.step()
        elif isinstance(scheduler, CosineAnnealingWarmRestarts):
            scheduler.step()

        # scoring
        score = get_score(y_valid_folds, preds.argmax(1))

        elapsed = time.time() - start_time

        LOGGER.info(
            f"Epoch {epoch+1} - avg_train_loss: {avg_loss:.4f}  avg_val_loss: {avg_val_loss:.4f}  time: {elapsed:.0f}s"
        )
        LOGGER.info(f"Epoch {epoch+1} - Accuracy: {score}")

        if score > best_score:
            best_score = score
            LOGGER.info(f"Epoch {epoch+1} - Save Best Score: {best_score:.4f} Model")
            torch.save(model.module.state_dict(), OUTPUT_DIR + f"{Config.model_name}_fold{fold}_best.pth")
            best_preds = preds

        if epoch == Config.epochs - 1:
            LOGGER.info(f"Epoch {epoch+1} - Save final model")
            torch.save(model.module.state_dict(), OUTPUT_DIR + f"{Config.model_name}_fold{fold}_final.pth")

    if Config.train:
        y_df_valid_folds[[str(c) for c in range(Config.n_class)]] = best_preds
        y_df_valid_folds["preds"] = best_preds.argmax(1)

        return y_df_valid_folds

    if Config.tuning:
        score = get_score(y_df_valid_folds["action"].values, best_preds.argmax(1))
        return score

In [42]:
def objective(trial):
    Config.geese_net_layers = trial.suggest_int("layers", 6, 18)
    Config.geese_net_filters = trial.suggest_int("filters", 32, 128)

    score = train_loop(folds, 0)
    return score

## Main


In [43]:
def main():
    if Config.train:
        # train
        oof_df = pd.DataFrame()
        for fold in range(Config.n_fold):
            _oof_df = train_loop(folds, fold)
            oof_df = pd.concat([oof_df, _oof_df])
            LOGGER.info(f"========== fold: {fold} result ==========")
            get_result(_oof_df)
            # break  # fold 1つだけ
        # CV result
        LOGGER.info(f"========== CV ==========")
        get_result(oof_df)
        # save result
        oof_df.to_csv(OUTPUT_DIR + "oof_df.csv", index=False)

    if Config.tuning:
        study = optuna.create_study(direction="maximize")
        study.optimize(objective, n_trials=10)

        trial = study.best_trial
        print("Best trial:")
        print("  Value: ", trial.value)
        print("  Params: ")
        for key, value in trial.params.items():
            print("    {}: {}".format(key, value))

In [44]:
if __name__ == "__main__":
    main()



Epoch: [1][0/1715] Elapsed 0m 3s (remain 108m 48s) Loss avg.: 1.6127 Grad: 4.1090 LR: 0.00100  
Epoch: [1][100/1715] Elapsed 0m 35s (remain 9m 34s) Loss avg.: 0.7649 Grad: 0.6198 LR: 0.00100  
Epoch: [1][200/1715] Elapsed 1m 7s (remain 8m 32s) Loss avg.: 0.6801 Grad: 0.8742 LR: 0.00100  
Epoch: [1][300/1715] Elapsed 1m 40s (remain 7m 50s) Loss avg.: 0.6433 Grad: 0.8801 LR: 0.00100  
Epoch: [1][400/1715] Elapsed 2m 12s (remain 7m 13s) Loss avg.: 0.6204 Grad: 0.9802 LR: 0.00100  
Epoch: [1][500/1715] Elapsed 2m 44s (remain 6m 39s) Loss avg.: 0.6043 Grad: 1.0607 LR: 0.00100  
Epoch: [1][600/1715] Elapsed 3m 17s (remain 6m 5s) Loss avg.: 0.5916 Grad: 0.8876 LR: 0.00100  
Epoch: [1][700/1715] Elapsed 3m 49s (remain 5m 32s) Loss avg.: 0.5813 Grad: 0.6955 LR: 0.00100  
Epoch: [1][800/1715] Elapsed 4m 21s (remain 4m 58s) Loss avg.: 0.5727 Grad: 0.7523 LR: 0.00100  
Epoch: [1][900/1715] Elapsed 4m 53s (remain 4m 25s) Loss avg.: 0.5658 Grad: 0.4134 LR: 0.00100  
Epoch: [1][1000/1715] Elapsed 5m 

Epoch 1 - avg_train_loss: 0.5337  avg_val_loss: 0.4850  time: 593s
Epoch 1 - Accuracy: 0.7888569140373647
Epoch 1 - Save Best Score: 0.7889 Model


Epoch: [2][0/1715] Elapsed 0m 1s (remain 46m 43s) Loss avg.: 0.4987 Grad: 0.8099 LR: 0.00098  
Epoch: [2][100/1715] Elapsed 0m 33s (remain 8m 59s) Loss avg.: 0.4876 Grad: 0.4621 LR: 0.00098  
Epoch: [2][200/1715] Elapsed 1m 5s (remain 8m 16s) Loss avg.: 0.4863 Grad: 0.5063 LR: 0.00098  
Epoch: [2][300/1715] Elapsed 1m 38s (remain 7m 40s) Loss avg.: 0.4867 Grad: 0.3671 LR: 0.00098  
Epoch: [2][400/1715] Elapsed 2m 10s (remain 7m 6s) Loss avg.: 0.4865 Grad: 0.6746 LR: 0.00098  
Epoch: [2][500/1715] Elapsed 2m 42s (remain 6m 33s) Loss avg.: 0.4859 Grad: 0.4165 LR: 0.00098  
Epoch: [2][600/1715] Elapsed 3m 14s (remain 6m 0s) Loss avg.: 0.4853 Grad: 0.5333 LR: 0.00098  
Epoch: [2][700/1715] Elapsed 3m 46s (remain 5m 28s) Loss avg.: 0.4851 Grad: 0.5234 LR: 0.00098  
Epoch: [2][800/1715] Elapsed 4m 18s (remain 4m 55s) Loss avg.: 0.4845 Grad: 0.4305 LR: 0.00098  
Epoch: [2][900/1715] Elapsed 4m 51s (remain 4m 22s) Loss avg.: 0.4840 Grad: 0.6597 LR: 0.00098  
Epoch: [2][1000/1715] Elapsed 5m 23

Epoch 2 - avg_train_loss: 0.4811  avg_val_loss: 0.4683  time: 591s
Epoch 2 - Accuracy: 0.7972051923142789
Epoch 2 - Save Best Score: 0.7972 Model


Epoch: [3][0/1715] Elapsed 0m 1s (remain 49m 15s) Loss avg.: 0.4753 Grad: 0.4669 LR: 0.00091  
Epoch: [3][100/1715] Elapsed 0m 34s (remain 9m 5s) Loss avg.: 0.4734 Grad: 0.7199 LR: 0.00091  
Epoch: [3][200/1715] Elapsed 1m 6s (remain 8m 19s) Loss avg.: 0.4723 Grad: 0.4615 LR: 0.00091  
Epoch: [3][300/1715] Elapsed 1m 38s (remain 7m 42s) Loss avg.: 0.4725 Grad: 0.7240 LR: 0.00091  
Epoch: [3][400/1715] Elapsed 2m 10s (remain 7m 7s) Loss avg.: 0.4722 Grad: 0.4891 LR: 0.00091  
Epoch: [3][500/1715] Elapsed 2m 42s (remain 6m 34s) Loss avg.: 0.4724 Grad: 0.3248 LR: 0.00091  
Epoch: [3][600/1715] Elapsed 3m 14s (remain 6m 1s) Loss avg.: 0.4723 Grad: 0.5093 LR: 0.00091  
Epoch: [3][700/1715] Elapsed 3m 46s (remain 5m 28s) Loss avg.: 0.4722 Grad: 0.3855 LR: 0.00091  
Epoch: [3][800/1715] Elapsed 4m 19s (remain 4m 55s) Loss avg.: 0.4719 Grad: 0.5475 LR: 0.00091  
Epoch: [3][900/1715] Elapsed 4m 51s (remain 4m 23s) Loss avg.: 0.4717 Grad: 0.3340 LR: 0.00091  
Epoch: [3][1000/1715] Elapsed 5m 23s

Epoch 3 - avg_train_loss: 0.4701  avg_val_loss: 0.4635  time: 591s
Epoch 3 - Accuracy: 0.7996077999852798
Epoch 3 - Save Best Score: 0.7996 Model


Epoch: [4][0/1715] Elapsed 0m 1s (remain 45m 34s) Loss avg.: 0.4787 Grad: 0.5371 LR: 0.00081  
Epoch: [4][100/1715] Elapsed 0m 33s (remain 9m 0s) Loss avg.: 0.4649 Grad: 0.4244 LR: 0.00081  
Epoch: [4][200/1715] Elapsed 1m 5s (remain 8m 16s) Loss avg.: 0.4646 Grad: 0.5265 LR: 0.00081  
Epoch: [4][300/1715] Elapsed 1m 38s (remain 7m 41s) Loss avg.: 0.4649 Grad: 0.3671 LR: 0.00081  
Epoch: [4][400/1715] Elapsed 2m 10s (remain 7m 7s) Loss avg.: 0.4649 Grad: 0.3856 LR: 0.00081  
Epoch: [4][500/1715] Elapsed 2m 42s (remain 6m 34s) Loss avg.: 0.4652 Grad: 0.3960 LR: 0.00081  
Epoch: [4][600/1715] Elapsed 3m 14s (remain 6m 1s) Loss avg.: 0.4650 Grad: 0.3362 LR: 0.00081  
Epoch: [4][700/1715] Elapsed 3m 46s (remain 5m 28s) Loss avg.: 0.4652 Grad: 0.3435 LR: 0.00081  
Epoch: [4][800/1715] Elapsed 4m 19s (remain 4m 55s) Loss avg.: 0.4649 Grad: 0.3697 LR: 0.00081  
Epoch: [4][900/1715] Elapsed 4m 51s (remain 4m 23s) Loss avg.: 0.4646 Grad: 0.5776 LR: 0.00081  
Epoch: [4][1000/1715] Elapsed 5m 23s

Epoch 4 - avg_train_loss: 0.4638  avg_val_loss: 0.4592  time: 591s
Epoch 4 - Accuracy: 0.8011570702032197
Epoch 4 - Save Best Score: 0.8012 Model


Epoch: [5][0/1715] Elapsed 0m 1s (remain 45m 22s) Loss avg.: 0.4457 Grad: 0.3980 LR: 0.00069  
Epoch: [5][100/1715] Elapsed 0m 33s (remain 9m 0s) Loss avg.: 0.4574 Grad: 0.3010 LR: 0.00069  
Epoch: [5][200/1715] Elapsed 1m 6s (remain 8m 17s) Loss avg.: 0.4588 Grad: 0.4408 LR: 0.00069  
Epoch: [5][300/1715] Elapsed 1m 38s (remain 7m 41s) Loss avg.: 0.4593 Grad: 0.3781 LR: 0.00069  
Epoch: [5][400/1715] Elapsed 2m 10s (remain 7m 7s) Loss avg.: 0.4596 Grad: 0.3521 LR: 0.00069  
Epoch: [5][500/1715] Elapsed 2m 42s (remain 6m 34s) Loss avg.: 0.4593 Grad: 0.4654 LR: 0.00069  
Epoch: [5][600/1715] Elapsed 3m 14s (remain 6m 1s) Loss avg.: 0.4593 Grad: 0.3393 LR: 0.00069  
Epoch: [5][700/1715] Elapsed 3m 47s (remain 5m 28s) Loss avg.: 0.4592 Grad: 0.3172 LR: 0.00069  
Epoch: [5][800/1715] Elapsed 4m 19s (remain 4m 55s) Loss avg.: 0.4591 Grad: 0.2664 LR: 0.00069  
Epoch: [5][900/1715] Elapsed 4m 51s (remain 4m 23s) Loss avg.: 0.4592 Grad: 0.3947 LR: 0.00069  
Epoch: [5][1000/1715] Elapsed 5m 23s

Epoch 5 - avg_train_loss: 0.4591  avg_val_loss: 0.4549  time: 591s
Epoch 5 - Accuracy: 0.8033242995673557
Epoch 5 - Save Best Score: 0.8033 Model


Epoch: [6][0/1715] Elapsed 0m 1s (remain 45m 1s) Loss avg.: 0.4609 Grad: 0.4261 LR: 0.00055  
Epoch: [6][100/1715] Elapsed 0m 33s (remain 8m 59s) Loss avg.: 0.4541 Grad: 0.3783 LR: 0.00055  
Epoch: [6][200/1715] Elapsed 1m 5s (remain 8m 16s) Loss avg.: 0.4533 Grad: 0.3096 LR: 0.00055  
Epoch: [6][300/1715] Elapsed 1m 38s (remain 7m 40s) Loss avg.: 0.4545 Grad: 0.3228 LR: 0.00055  
Epoch: [6][400/1715] Elapsed 2m 10s (remain 7m 6s) Loss avg.: 0.4544 Grad: 0.3347 LR: 0.00055  
Epoch: [6][500/1715] Elapsed 2m 42s (remain 6m 33s) Loss avg.: 0.4551 Grad: 0.2894 LR: 0.00055  
Epoch: [6][600/1715] Elapsed 3m 14s (remain 6m 0s) Loss avg.: 0.4554 Grad: 0.3327 LR: 0.00055  
Epoch: [6][700/1715] Elapsed 3m 46s (remain 5m 28s) Loss avg.: 0.4554 Grad: 0.4054 LR: 0.00055  
Epoch: [6][800/1715] Elapsed 4m 19s (remain 4m 55s) Loss avg.: 0.4554 Grad: 0.2553 LR: 0.00055  
Epoch: [6][900/1715] Elapsed 4m 51s (remain 4m 23s) Loss avg.: 0.4553 Grad: 0.2561 LR: 0.00055  
Epoch: [6][1000/1715] Elapsed 5m 23s

Epoch 6 - avg_train_loss: 0.4551  avg_val_loss: 0.4522  time: 591s
Epoch 6 - Accuracy: 0.8050477351578674
Epoch 6 - Save Best Score: 0.8050 Model


Epoch: [7][0/1715] Elapsed 0m 1s (remain 46m 10s) Loss avg.: 0.4397 Grad: 0.3027 LR: 0.00041  
Epoch: [7][100/1715] Elapsed 0m 33s (remain 9m 1s) Loss avg.: 0.4505 Grad: 0.3802 LR: 0.00041  
Epoch: [7][200/1715] Elapsed 1m 6s (remain 8m 19s) Loss avg.: 0.4517 Grad: 0.4321 LR: 0.00041  
Epoch: [7][300/1715] Elapsed 1m 38s (remain 7m 42s) Loss avg.: 0.4516 Grad: 0.3277 LR: 0.00041  
Epoch: [7][400/1715] Elapsed 2m 10s (remain 7m 7s) Loss avg.: 0.4517 Grad: 0.3247 LR: 0.00041  
Epoch: [7][500/1715] Elapsed 2m 42s (remain 6m 34s) Loss avg.: 0.4519 Grad: 0.3657 LR: 0.00041  
Epoch: [7][600/1715] Elapsed 3m 14s (remain 6m 1s) Loss avg.: 0.4519 Grad: 0.2476 LR: 0.00041  
Epoch: [7][700/1715] Elapsed 3m 47s (remain 5m 28s) Loss avg.: 0.4518 Grad: 0.3206 LR: 0.00041  
Epoch: [7][800/1715] Elapsed 4m 19s (remain 4m 55s) Loss avg.: 0.4520 Grad: 0.2994 LR: 0.00041  
Epoch: [7][900/1715] Elapsed 4m 51s (remain 4m 23s) Loss avg.: 0.4520 Grad: 0.2550 LR: 0.00041  
Epoch: [7][1000/1715] Elapsed 5m 23s

Epoch 7 - avg_train_loss: 0.4517  avg_val_loss: 0.4501  time: 592s
Epoch 7 - Accuracy: 0.8058697374183736
Epoch 7 - Save Best Score: 0.8059 Model


Epoch: [8][0/1715] Elapsed 0m 1s (remain 45m 33s) Loss avg.: 0.4678 Grad: 0.2555 LR: 0.00029  
Epoch: [8][100/1715] Elapsed 0m 33s (remain 9m 0s) Loss avg.: 0.4490 Grad: 0.2485 LR: 0.00029  
Epoch: [8][200/1715] Elapsed 1m 5s (remain 8m 16s) Loss avg.: 0.4486 Grad: 0.4748 LR: 0.00029  
Epoch: [8][300/1715] Elapsed 1m 38s (remain 7m 40s) Loss avg.: 0.4486 Grad: 0.2763 LR: 0.00029  
Epoch: [8][400/1715] Elapsed 2m 10s (remain 7m 7s) Loss avg.: 0.4487 Grad: 0.3144 LR: 0.00029  
Epoch: [8][500/1715] Elapsed 2m 42s (remain 6m 34s) Loss avg.: 0.4484 Grad: 0.2647 LR: 0.00029  
Epoch: [8][600/1715] Elapsed 3m 14s (remain 6m 1s) Loss avg.: 0.4488 Grad: 0.2950 LR: 0.00029  
Epoch: [8][700/1715] Elapsed 3m 46s (remain 5m 28s) Loss avg.: 0.4492 Grad: 0.2986 LR: 0.00029  
Epoch: [8][800/1715] Elapsed 4m 19s (remain 4m 55s) Loss avg.: 0.4490 Grad: 0.2606 LR: 0.00029  
Epoch: [8][900/1715] Elapsed 4m 51s (remain 4m 23s) Loss avg.: 0.4490 Grad: 0.4051 LR: 0.00029  
Epoch: [8][1000/1715] Elapsed 5m 23s

Epoch 8 - avg_train_loss: 0.4488  avg_val_loss: 0.4478  time: 591s
Epoch 8 - Accuracy: 0.80718727295649
Epoch 8 - Save Best Score: 0.8072 Model


Epoch: [9][0/1715] Elapsed 0m 1s (remain 45m 29s) Loss avg.: 0.4532 Grad: 0.3904 LR: 0.00019  
Epoch: [9][100/1715] Elapsed 0m 33s (remain 8m 59s) Loss avg.: 0.4454 Grad: 0.3377 LR: 0.00019  
Epoch: [9][200/1715] Elapsed 1m 5s (remain 8m 16s) Loss avg.: 0.4455 Grad: 0.3203 LR: 0.00019  
Epoch: [9][300/1715] Elapsed 1m 38s (remain 7m 41s) Loss avg.: 0.4452 Grad: 0.2606 LR: 0.00019  
Epoch: [9][400/1715] Elapsed 2m 10s (remain 7m 6s) Loss avg.: 0.4460 Grad: 0.2982 LR: 0.00019  
Epoch: [9][500/1715] Elapsed 2m 42s (remain 6m 33s) Loss avg.: 0.4459 Grad: 0.3297 LR: 0.00019  
Epoch: [9][600/1715] Elapsed 3m 14s (remain 6m 1s) Loss avg.: 0.4457 Grad: 0.3564 LR: 0.00019  
Epoch: [9][700/1715] Elapsed 3m 46s (remain 5m 28s) Loss avg.: 0.4458 Grad: 0.3089 LR: 0.00019  
Epoch: [9][800/1715] Elapsed 4m 19s (remain 4m 55s) Loss avg.: 0.4462 Grad: 0.3679 LR: 0.00019  
Epoch: [9][900/1715] Elapsed 4m 51s (remain 4m 23s) Loss avg.: 0.4463 Grad: 0.3435 LR: 0.00019  
Epoch: [9][1000/1715] Elapsed 5m 23

Epoch 9 - avg_train_loss: 0.4466  avg_val_loss: 0.4466  time: 591s
Epoch 9 - Accuracy: 0.8078649875861627
Epoch 9 - Save Best Score: 0.8079 Model


Epoch: [10][0/1715] Elapsed 0m 1s (remain 48m 57s) Loss avg.: 0.4235 Grad: 0.3018 LR: 0.00012  
Epoch: [10][100/1715] Elapsed 0m 34s (remain 9m 5s) Loss avg.: 0.4457 Grad: 0.3242 LR: 0.00012  
Epoch: [10][200/1715] Elapsed 1m 6s (remain 8m 19s) Loss avg.: 0.4442 Grad: 0.3615 LR: 0.00012  
Epoch: [10][300/1715] Elapsed 1m 38s (remain 7m 42s) Loss avg.: 0.4451 Grad: 0.2925 LR: 0.00012  
Epoch: [10][400/1715] Elapsed 2m 10s (remain 7m 8s) Loss avg.: 0.4450 Grad: 0.3037 LR: 0.00012  
Epoch: [10][500/1715] Elapsed 2m 42s (remain 6m 34s) Loss avg.: 0.4451 Grad: 0.2704 LR: 0.00012  
Epoch: [10][600/1715] Elapsed 3m 15s (remain 6m 1s) Loss avg.: 0.4452 Grad: 0.2976 LR: 0.00012  
Epoch: [10][700/1715] Elapsed 3m 47s (remain 5m 28s) Loss avg.: 0.4452 Grad: 0.2760 LR: 0.00012  
Epoch: [10][800/1715] Elapsed 4m 19s (remain 4m 56s) Loss avg.: 0.4451 Grad: 0.2828 LR: 0.00012  
Epoch: [10][900/1715] Elapsed 4m 51s (remain 4m 23s) Loss avg.: 0.4450 Grad: 0.2699 LR: 0.00012  
Epoch: [10][1000/1715] Ela

Epoch 10 - avg_train_loss: 0.4448  avg_val_loss: 0.4453  time: 591s
Epoch 10 - Accuracy: 0.808387483703878
Epoch 10 - Save Best Score: 0.8084 Model
Epoch 10 - Save final model
Score: 0.80839


Epoch: [1][0/1715] Elapsed 0m 1s (remain 45m 7s) Loss avg.: 1.6029 Grad: 4.9055 LR: 0.00100  
Epoch: [1][100/1715] Elapsed 0m 33s (remain 8m 59s) Loss avg.: 0.7981 Grad: 1.0461 LR: 0.00100  
Epoch: [1][200/1715] Elapsed 1m 6s (remain 8m 18s) Loss avg.: 0.7007 Grad: 0.6821 LR: 0.00100  
Epoch: [1][300/1715] Elapsed 1m 38s (remain 7m 41s) Loss avg.: 0.6589 Grad: 0.7903 LR: 0.00100  
Epoch: [1][400/1715] Elapsed 2m 10s (remain 7m 7s) Loss avg.: 0.6338 Grad: 0.6130 LR: 0.00100  
Epoch: [1][500/1715] Elapsed 2m 42s (remain 6m 34s) Loss avg.: 0.6150 Grad: 0.8799 LR: 0.00100  
Epoch: [1][600/1715] Elapsed 3m 14s (remain 6m 1s) Loss avg.: 0.6007 Grad: 0.7068 LR: 0.00100  
Epoch: [1][700/1715] Elapsed 3m 47s (remain 5m 28s) Loss avg.: 0.5893 Grad: 0.7431 LR: 0.00100  
Epoch: [1][800/1715] Elapsed 4m 19s (remain 4m 55s) Loss avg.: 0.5798 Grad: 0.4313 LR: 0.00100  
Epoch: [1][900/1715] Elapsed 4m 51s (remain 4m 23s) Loss avg.: 0.5721 Grad: 0.5498 LR: 0.00100  
Epoch: [1][1000/1715] Elapsed 5m 23s

Epoch 1 - avg_train_loss: 0.5369  avg_val_loss: 0.4842  time: 591s
Epoch 1 - Accuracy: 0.790159875067316
Epoch 1 - Save Best Score: 0.7902 Model


Epoch: [2][0/1715] Elapsed 0m 1s (remain 45m 13s) Loss avg.: 0.5152 Grad: 0.7249 LR: 0.00098  
Epoch: [2][100/1715] Elapsed 0m 33s (remain 9m 0s) Loss avg.: 0.4867 Grad: 0.4415 LR: 0.00098  
Epoch: [2][200/1715] Elapsed 1m 5s (remain 8m 17s) Loss avg.: 0.4866 Grad: 0.3610 LR: 0.00098  
Epoch: [2][300/1715] Elapsed 1m 38s (remain 7m 42s) Loss avg.: 0.4858 Grad: 0.3897 LR: 0.00098  
Epoch: [2][400/1715] Elapsed 2m 10s (remain 7m 7s) Loss avg.: 0.4857 Grad: 0.5717 LR: 0.00098  
Epoch: [2][500/1715] Elapsed 2m 42s (remain 6m 34s) Loss avg.: 0.4850 Grad: 0.9540 LR: 0.00098  
Epoch: [2][600/1715] Elapsed 3m 14s (remain 6m 1s) Loss avg.: 0.4848 Grad: 0.7987 LR: 0.00098  
Epoch: [2][700/1715] Elapsed 3m 47s (remain 5m 28s) Loss avg.: 0.4846 Grad: 0.7001 LR: 0.00098  
Epoch: [2][800/1715] Elapsed 4m 19s (remain 4m 55s) Loss avg.: 0.4843 Grad: 0.3958 LR: 0.00098  
Epoch: [2][900/1715] Elapsed 4m 51s (remain 4m 23s) Loss avg.: 0.4837 Grad: 0.7767 LR: 0.00098  
Epoch: [2][1000/1715] Elapsed 5m 23s

Epoch 2 - avg_train_loss: 0.4805  avg_val_loss: 0.4686  time: 591s
Epoch 2 - Accuracy: 0.7978450132227225
Epoch 2 - Save Best Score: 0.7978 Model


Epoch: [3][0/1715] Elapsed 0m 1s (remain 44m 35s) Loss avg.: 0.4697 Grad: 0.3023 LR: 0.00091  
Epoch: [3][100/1715] Elapsed 0m 33s (remain 9m 0s) Loss avg.: 0.4718 Grad: 0.3956 LR: 0.00091  
Epoch: [3][200/1715] Elapsed 1m 6s (remain 8m 17s) Loss avg.: 0.4716 Grad: 0.7783 LR: 0.00091  
Epoch: [3][300/1715] Elapsed 1m 38s (remain 7m 41s) Loss avg.: 0.4718 Grad: 0.6161 LR: 0.00091  
Epoch: [3][400/1715] Elapsed 2m 10s (remain 7m 7s) Loss avg.: 0.4714 Grad: 0.5272 LR: 0.00091  
Epoch: [3][500/1715] Elapsed 2m 42s (remain 6m 34s) Loss avg.: 0.4711 Grad: 0.4138 LR: 0.00091  
Epoch: [3][600/1715] Elapsed 3m 14s (remain 6m 1s) Loss avg.: 0.4717 Grad: 0.3414 LR: 0.00091  
Epoch: [3][700/1715] Elapsed 3m 47s (remain 5m 28s) Loss avg.: 0.4712 Grad: 0.4292 LR: 0.00091  
Epoch: [3][800/1715] Elapsed 4m 19s (remain 4m 55s) Loss avg.: 0.4710 Grad: 0.3353 LR: 0.00091  
Epoch: [3][900/1715] Elapsed 4m 51s (remain 4m 23s) Loss avg.: 0.4708 Grad: 0.3461 LR: 0.00091  
Epoch: [3][1000/1715] Elapsed 5m 23s

Epoch 3 - avg_train_loss: 0.4696  avg_val_loss: 0.4624  time: 591s
Epoch 3 - Accuracy: 0.8007686595606223
Epoch 3 - Save Best Score: 0.8008 Model


Epoch: [4][0/1715] Elapsed 0m 1s (remain 45m 23s) Loss avg.: 0.4689 Grad: 0.4414 LR: 0.00081  
Epoch: [4][100/1715] Elapsed 0m 33s (remain 9m 0s) Loss avg.: 0.4661 Grad: 0.3386 LR: 0.00081  
Epoch: [4][200/1715] Elapsed 1m 6s (remain 8m 18s) Loss avg.: 0.4653 Grad: 0.5551 LR: 0.00081  
Epoch: [4][300/1715] Elapsed 1m 38s (remain 7m 42s) Loss avg.: 0.4646 Grad: 0.4123 LR: 0.00081  
Epoch: [4][400/1715] Elapsed 2m 10s (remain 7m 7s) Loss avg.: 0.4643 Grad: 0.6691 LR: 0.00081  
Epoch: [4][500/1715] Elapsed 2m 42s (remain 6m 34s) Loss avg.: 0.4640 Grad: 0.3079 LR: 0.00081  
Epoch: [4][600/1715] Elapsed 3m 15s (remain 6m 1s) Loss avg.: 0.4640 Grad: 0.5137 LR: 0.00081  
Epoch: [4][700/1715] Elapsed 3m 47s (remain 5m 28s) Loss avg.: 0.4641 Grad: 0.3959 LR: 0.00081  
Epoch: [4][800/1715] Elapsed 4m 19s (remain 4m 56s) Loss avg.: 0.4640 Grad: 0.3309 LR: 0.00081  
Epoch: [4][900/1715] Elapsed 4m 51s (remain 4m 23s) Loss avg.: 0.4638 Grad: 0.3196 LR: 0.00081  
Epoch: [4][1000/1715] Elapsed 5m 23s

Epoch 4 - avg_train_loss: 0.4631  avg_val_loss: 0.4590  time: 591s
Epoch 4 - Accuracy: 0.8021481367584399
Epoch 4 - Save Best Score: 0.8021 Model


Epoch: [5][0/1715] Elapsed 0m 1s (remain 45m 7s) Loss avg.: 0.4678 Grad: 0.4900 LR: 0.00069  
Epoch: [5][100/1715] Elapsed 0m 33s (remain 9m 1s) Loss avg.: 0.4573 Grad: 0.3179 LR: 0.00069  
Epoch: [5][200/1715] Elapsed 1m 6s (remain 8m 17s) Loss avg.: 0.4584 Grad: 0.3996 LR: 0.00069  
Epoch: [5][300/1715] Elapsed 1m 38s (remain 7m 41s) Loss avg.: 0.4585 Grad: 0.2909 LR: 0.00069  
Epoch: [5][400/1715] Elapsed 2m 10s (remain 7m 7s) Loss avg.: 0.4585 Grad: 0.3696 LR: 0.00069  
Epoch: [5][500/1715] Elapsed 2m 42s (remain 6m 34s) Loss avg.: 0.4583 Grad: 0.4567 LR: 0.00069  
Epoch: [5][600/1715] Elapsed 3m 14s (remain 6m 1s) Loss avg.: 0.4581 Grad: 0.3933 LR: 0.00069  
Epoch: [5][700/1715] Elapsed 3m 47s (remain 5m 28s) Loss avg.: 0.4582 Grad: 0.4533 LR: 0.00069  
Epoch: [5][800/1715] Elapsed 4m 19s (remain 4m 55s) Loss avg.: 0.4584 Grad: 0.3306 LR: 0.00069  
Epoch: [5][900/1715] Elapsed 4m 51s (remain 4m 23s) Loss avg.: 0.4585 Grad: 0.3196 LR: 0.00069  
Epoch: [5][1000/1715] Elapsed 5m 23s 

Epoch 5 - avg_train_loss: 0.4581  avg_val_loss: 0.4531  time: 591s
Epoch 5 - Accuracy: 0.8053603583580068
Epoch 5 - Save Best Score: 0.8054 Model


Epoch: [6][0/1715] Elapsed 0m 1s (remain 45m 5s) Loss avg.: 0.4808 Grad: 0.2773 LR: 0.00055  
Epoch: [6][100/1715] Elapsed 0m 33s (remain 9m 1s) Loss avg.: 0.4537 Grad: 0.3307 LR: 0.00055  
Epoch: [6][200/1715] Elapsed 1m 6s (remain 8m 17s) Loss avg.: 0.4540 Grad: 0.2872 LR: 0.00055  
Epoch: [6][300/1715] Elapsed 1m 38s (remain 7m 42s) Loss avg.: 0.4544 Grad: 0.3936 LR: 0.00055  
Epoch: [6][400/1715] Elapsed 2m 10s (remain 7m 7s) Loss avg.: 0.4543 Grad: 0.2992 LR: 0.00055  
Epoch: [6][500/1715] Elapsed 2m 42s (remain 6m 34s) Loss avg.: 0.4542 Grad: 0.3528 LR: 0.00055  
Epoch: [6][600/1715] Elapsed 3m 14s (remain 6m 1s) Loss avg.: 0.4545 Grad: 0.4231 LR: 0.00055  
Epoch: [6][700/1715] Elapsed 3m 47s (remain 5m 28s) Loss avg.: 0.4545 Grad: 0.4329 LR: 0.00055  
Epoch: [6][800/1715] Elapsed 4m 19s (remain 4m 56s) Loss avg.: 0.4545 Grad: 0.4346 LR: 0.00055  
Epoch: [6][900/1715] Elapsed 4m 51s (remain 4m 23s) Loss avg.: 0.4547 Grad: 0.4375 LR: 0.00055  
Epoch: [6][1000/1715] Elapsed 5m 23s 

Epoch 6 - avg_train_loss: 0.4543  avg_val_loss: 0.4510  time: 591s
Epoch 6 - Accuracy: 0.806313531191998
Epoch 6 - Save Best Score: 0.8063 Model


Epoch: [7][0/1715] Elapsed 0m 1s (remain 44m 51s) Loss avg.: 0.4501 Grad: 0.2991 LR: 0.00041  
Epoch: [7][100/1715] Elapsed 0m 33s (remain 9m 3s) Loss avg.: 0.4497 Grad: 0.2792 LR: 0.00041  
Epoch: [7][200/1715] Elapsed 1m 6s (remain 8m 18s) Loss avg.: 0.4500 Grad: 0.3077 LR: 0.00041  
Epoch: [7][300/1715] Elapsed 1m 38s (remain 7m 41s) Loss avg.: 0.4498 Grad: 0.3590 LR: 0.00041  
Epoch: [7][400/1715] Elapsed 2m 10s (remain 7m 7s) Loss avg.: 0.4501 Grad: 0.3969 LR: 0.00041  
Epoch: [7][500/1715] Elapsed 2m 42s (remain 6m 34s) Loss avg.: 0.4501 Grad: 0.3455 LR: 0.00041  
Epoch: [7][600/1715] Elapsed 3m 14s (remain 6m 1s) Loss avg.: 0.4504 Grad: 0.4271 LR: 0.00041  
Epoch: [7][700/1715] Elapsed 3m 47s (remain 5m 28s) Loss avg.: 0.4504 Grad: 0.3160 LR: 0.00041  
Epoch: [7][800/1715] Elapsed 4m 19s (remain 4m 55s) Loss avg.: 0.4505 Grad: 0.3292 LR: 0.00041  
Epoch: [7][900/1715] Elapsed 4m 51s (remain 4m 23s) Loss avg.: 0.4504 Grad: 0.3268 LR: 0.00041  
Epoch: [7][1000/1715] Elapsed 5m 23s

Epoch 7 - avg_train_loss: 0.4509  avg_val_loss: 0.4496  time: 591s
Epoch 7 - Accuracy: 0.8071253312967888
Epoch 7 - Save Best Score: 0.8071 Model


Epoch: [8][0/1715] Elapsed 0m 1s (remain 45m 22s) Loss avg.: 0.4525 Grad: 0.4021 LR: 0.00029  
Epoch: [8][100/1715] Elapsed 0m 33s (remain 9m 1s) Loss avg.: 0.4467 Grad: 0.3413 LR: 0.00029  
Epoch: [8][200/1715] Elapsed 1m 6s (remain 8m 19s) Loss avg.: 0.4471 Grad: 0.2771 LR: 0.00029  
Epoch: [8][300/1715] Elapsed 1m 38s (remain 7m 42s) Loss avg.: 0.4476 Grad: 0.2954 LR: 0.00029  
Epoch: [8][400/1715] Elapsed 2m 10s (remain 7m 7s) Loss avg.: 0.4474 Grad: 0.3301 LR: 0.00029  
Epoch: [8][500/1715] Elapsed 2m 42s (remain 6m 34s) Loss avg.: 0.4472 Grad: 0.2421 LR: 0.00029  
Epoch: [8][600/1715] Elapsed 3m 14s (remain 6m 1s) Loss avg.: 0.4476 Grad: 0.3500 LR: 0.00029  
Epoch: [8][700/1715] Elapsed 3m 47s (remain 5m 28s) Loss avg.: 0.4476 Grad: 0.2506 LR: 0.00029  
Epoch: [8][800/1715] Elapsed 4m 19s (remain 4m 55s) Loss avg.: 0.4478 Grad: 0.2663 LR: 0.00029  
Epoch: [8][900/1715] Elapsed 4m 51s (remain 4m 23s) Loss avg.: 0.4479 Grad: 0.3478 LR: 0.00029  
Epoch: [8][1000/1715] Elapsed 5m 23s

Epoch 8 - avg_train_loss: 0.4481  avg_val_loss: 0.4481  time: 591s
Epoch 8 - Accuracy: 0.8077994022994202
Epoch 8 - Save Best Score: 0.8078 Model


Epoch: [9][0/1715] Elapsed 0m 1s (remain 44m 43s) Loss avg.: 0.4403 Grad: 0.3774 LR: 0.00019  
Epoch: [9][100/1715] Elapsed 0m 33s (remain 9m 1s) Loss avg.: 0.4456 Grad: 0.3093 LR: 0.00019  
Epoch: [9][200/1715] Elapsed 1m 6s (remain 8m 17s) Loss avg.: 0.4458 Grad: 0.2535 LR: 0.00019  
Epoch: [9][300/1715] Elapsed 1m 38s (remain 7m 42s) Loss avg.: 0.4452 Grad: 0.3048 LR: 0.00019  
Epoch: [9][400/1715] Elapsed 2m 10s (remain 7m 8s) Loss avg.: 0.4456 Grad: 0.3388 LR: 0.00019  
Epoch: [9][500/1715] Elapsed 2m 42s (remain 6m 34s) Loss avg.: 0.4457 Grad: 0.3334 LR: 0.00019  
Epoch: [9][600/1715] Elapsed 3m 15s (remain 6m 1s) Loss avg.: 0.4458 Grad: 0.3180 LR: 0.00019  
Epoch: [9][700/1715] Elapsed 3m 47s (remain 5m 28s) Loss avg.: 0.4460 Grad: 0.2800 LR: 0.00019  
Epoch: [9][800/1715] Elapsed 4m 19s (remain 4m 55s) Loss avg.: 0.4459 Grad: 0.3477 LR: 0.00019  
Epoch: [9][900/1715] Elapsed 4m 51s (remain 4m 23s) Loss avg.: 0.4459 Grad: 0.2665 LR: 0.00019  
Epoch: [9][1000/1715] Elapsed 5m 23s

Epoch 9 - avg_train_loss: 0.4457  avg_val_loss: 0.4460  time: 591s
Epoch 9 - Accuracy: 0.8088524105143416
Epoch 9 - Save Best Score: 0.8089 Model


Epoch: [10][0/1715] Elapsed 0m 1s (remain 44m 56s) Loss avg.: 0.4474 Grad: 0.2960 LR: 0.00012  
Epoch: [10][100/1715] Elapsed 0m 33s (remain 9m 1s) Loss avg.: 0.4433 Grad: 0.2349 LR: 0.00012  
Epoch: [10][200/1715] Elapsed 1m 6s (remain 8m 18s) Loss avg.: 0.4439 Grad: 0.2358 LR: 0.00012  
Epoch: [10][300/1715] Elapsed 1m 38s (remain 7m 41s) Loss avg.: 0.4445 Grad: 0.2642 LR: 0.00012  
Epoch: [10][400/1715] Elapsed 2m 10s (remain 7m 8s) Loss avg.: 0.4445 Grad: 0.2726 LR: 0.00012  
Epoch: [10][500/1715] Elapsed 2m 42s (remain 6m 34s) Loss avg.: 0.4445 Grad: 0.2843 LR: 0.00012  
Epoch: [10][600/1715] Elapsed 3m 15s (remain 6m 1s) Loss avg.: 0.4444 Grad: 0.3038 LR: 0.00012  
Epoch: [10][700/1715] Elapsed 3m 47s (remain 5m 28s) Loss avg.: 0.4440 Grad: 0.3209 LR: 0.00012  
Epoch: [10][800/1715] Elapsed 4m 19s (remain 4m 55s) Loss avg.: 0.4441 Grad: 0.2824 LR: 0.00012  
Epoch: [10][900/1715] Elapsed 4m 51s (remain 4m 23s) Loss avg.: 0.4439 Grad: 0.3097 LR: 0.00012  
Epoch: [10][1000/1715] Ela

Epoch 10 - avg_train_loss: 0.4442  avg_val_loss: 0.4452  time: 591s
Epoch 10 - Accuracy: 0.8094055130992036
Epoch 10 - Save Best Score: 0.8094 Model
Epoch 10 - Save final model
Score: 0.80941


Epoch: [1][0/1715] Elapsed 0m 1s (remain 45m 58s) Loss avg.: 1.5621 Grad: 3.9111 LR: 0.00100  
Epoch: [1][100/1715] Elapsed 0m 33s (remain 9m 0s) Loss avg.: 0.7719 Grad: 0.8537 LR: 0.00100  
Epoch: [1][200/1715] Elapsed 1m 6s (remain 8m 18s) Loss avg.: 0.6872 Grad: 0.5487 LR: 0.00100  
Epoch: [1][300/1715] Elapsed 1m 38s (remain 7m 42s) Loss avg.: 0.6497 Grad: 0.7980 LR: 0.00100  
Epoch: [1][400/1715] Elapsed 2m 10s (remain 7m 7s) Loss avg.: 0.6264 Grad: 0.7373 LR: 0.00100  
Epoch: [1][500/1715] Elapsed 2m 42s (remain 6m 34s) Loss avg.: 0.6095 Grad: 0.5225 LR: 0.00100  
Epoch: [1][600/1715] Elapsed 3m 15s (remain 6m 1s) Loss avg.: 0.5957 Grad: 0.7740 LR: 0.00100  
Epoch: [1][700/1715] Elapsed 3m 47s (remain 5m 28s) Loss avg.: 0.5850 Grad: 0.5902 LR: 0.00100  
Epoch: [1][800/1715] Elapsed 4m 19s (remain 4m 56s) Loss avg.: 0.5763 Grad: 0.6309 LR: 0.00100  
Epoch: [1][900/1715] Elapsed 4m 51s (remain 4m 23s) Loss avg.: 0.5693 Grad: 0.7522 LR: 0.00100  
Epoch: [1][1000/1715] Elapsed 5m 23s

Epoch 1 - avg_train_loss: 0.5357  avg_val_loss: 0.4910  time: 591s
Epoch 1 - Accuracy: 0.785026576635006
Epoch 1 - Save Best Score: 0.7850 Model


Epoch: [2][0/1715] Elapsed 0m 1s (remain 43m 38s) Loss avg.: 0.4928 Grad: 1.1988 LR: 0.00098  
Epoch: [2][100/1715] Elapsed 0m 33s (remain 9m 0s) Loss avg.: 0.4887 Grad: 0.7444 LR: 0.00098  
Epoch: [2][200/1715] Elapsed 1m 6s (remain 8m 17s) Loss avg.: 0.4881 Grad: 0.4684 LR: 0.00098  
Epoch: [2][300/1715] Elapsed 1m 38s (remain 7m 41s) Loss avg.: 0.4882 Grad: 0.8433 LR: 0.00098  
Epoch: [2][400/1715] Elapsed 2m 10s (remain 7m 7s) Loss avg.: 0.4868 Grad: 0.6456 LR: 0.00098  
Epoch: [2][500/1715] Elapsed 2m 42s (remain 6m 34s) Loss avg.: 0.4862 Grad: 0.5739 LR: 0.00098  
Epoch: [2][600/1715] Elapsed 3m 15s (remain 6m 2s) Loss avg.: 0.4857 Grad: 0.6341 LR: 0.00098  
Epoch: [2][700/1715] Elapsed 3m 47s (remain 5m 29s) Loss avg.: 0.4851 Grad: 0.6279 LR: 0.00098  
Epoch: [2][800/1715] Elapsed 4m 20s (remain 4m 57s) Loss avg.: 0.4850 Grad: 0.4625 LR: 0.00098  
Epoch: [2][900/1715] Elapsed 4m 52s (remain 4m 24s) Loss avg.: 0.4846 Grad: 0.6389 LR: 0.00098  
Epoch: [2][1000/1715] Elapsed 5m 25s

Epoch 2 - avg_train_loss: 0.4816  avg_val_loss: 0.4696  time: 595s
Epoch 2 - Accuracy: 0.7967459471906886
Epoch 2 - Save Best Score: 0.7967 Model


Epoch: [3][0/1715] Elapsed 0m 1s (remain 44m 38s) Loss avg.: 0.4916 Grad: 0.6848 LR: 0.00091  
Epoch: [3][100/1715] Elapsed 0m 34s (remain 9m 4s) Loss avg.: 0.4729 Grad: 0.6692 LR: 0.00091  
Epoch: [3][200/1715] Elapsed 1m 6s (remain 8m 21s) Loss avg.: 0.4721 Grad: 0.5793 LR: 0.00091  
Epoch: [3][300/1715] Elapsed 1m 39s (remain 7m 45s) Loss avg.: 0.4727 Grad: 0.6204 LR: 0.00091  
Epoch: [3][400/1715] Elapsed 2m 11s (remain 7m 10s) Loss avg.: 0.4730 Grad: 0.3458 LR: 0.00091  
Epoch: [3][500/1715] Elapsed 2m 44s (remain 6m 37s) Loss avg.: 0.4729 Grad: 0.4434 LR: 0.00091  
Epoch: [3][600/1715] Elapsed 3m 16s (remain 6m 4s) Loss avg.: 0.4728 Grad: 0.4074 LR: 0.00091  
Epoch: [3][700/1715] Elapsed 3m 49s (remain 5m 31s) Loss avg.: 0.4726 Grad: 0.6561 LR: 0.00091  
Epoch: [3][800/1715] Elapsed 4m 21s (remain 4m 58s) Loss avg.: 0.4726 Grad: 0.4566 LR: 0.00091  
Epoch: [3][900/1715] Elapsed 4m 54s (remain 4m 25s) Loss avg.: 0.4723 Grad: 0.5638 LR: 0.00091  
Epoch: [3][1000/1715] Elapsed 5m 26

Epoch 3 - avg_train_loss: 0.4707  avg_val_loss: 0.4623  time: 596s
Epoch 3 - Accuracy: 0.7999683732942348
Epoch 3 - Save Best Score: 0.8000 Model


Epoch: [4][0/1715] Elapsed 0m 1s (remain 43m 23s) Loss avg.: 0.4730 Grad: 0.4088 LR: 0.00081  
Epoch: [4][100/1715] Elapsed 0m 34s (remain 9m 3s) Loss avg.: 0.4649 Grad: 0.4100 LR: 0.00081  
Epoch: [4][200/1715] Elapsed 1m 6s (remain 8m 21s) Loss avg.: 0.4649 Grad: 0.3777 LR: 0.00081  
Epoch: [4][300/1715] Elapsed 1m 39s (remain 7m 45s) Loss avg.: 0.4649 Grad: 0.3907 LR: 0.00081  
Epoch: [4][400/1715] Elapsed 2m 11s (remain 7m 11s) Loss avg.: 0.4649 Grad: 0.3108 LR: 0.00081  
Epoch: [4][500/1715] Elapsed 2m 44s (remain 6m 37s) Loss avg.: 0.4650 Grad: 0.4156 LR: 0.00081  
Epoch: [4][600/1715] Elapsed 3m 16s (remain 6m 4s) Loss avg.: 0.4652 Grad: 0.4344 LR: 0.00081  
Epoch: [4][700/1715] Elapsed 3m 49s (remain 5m 31s) Loss avg.: 0.4652 Grad: 0.3060 LR: 0.00081  
Epoch: [4][800/1715] Elapsed 4m 21s (remain 4m 58s) Loss avg.: 0.4652 Grad: 0.3307 LR: 0.00081  
Epoch: [4][900/1715] Elapsed 4m 54s (remain 4m 25s) Loss avg.: 0.4652 Grad: 0.3135 LR: 0.00081  
Epoch: [4][1000/1715] Elapsed 5m 26

Epoch 4 - avg_train_loss: 0.4640  avg_val_loss: 0.4595  time: 597s
Epoch 4 - Accuracy: 0.8013092290225308
Epoch 4 - Save Best Score: 0.8013 Model


Epoch: [5][0/1715] Elapsed 0m 1s (remain 47m 10s) Loss avg.: 0.4733 Grad: 0.3503 LR: 0.00069  
Epoch: [5][100/1715] Elapsed 0m 34s (remain 9m 9s) Loss avg.: 0.4575 Grad: 0.3604 LR: 0.00069  
Epoch: [5][200/1715] Elapsed 1m 6s (remain 8m 24s) Loss avg.: 0.4581 Grad: 0.2816 LR: 0.00069  
Epoch: [5][300/1715] Elapsed 1m 39s (remain 7m 47s) Loss avg.: 0.4587 Grad: 0.3570 LR: 0.00069  
Epoch: [5][400/1715] Elapsed 2m 11s (remain 7m 12s) Loss avg.: 0.4590 Grad: 0.3373 LR: 0.00069  
Epoch: [5][500/1715] Elapsed 2m 44s (remain 6m 38s) Loss avg.: 0.4592 Grad: 0.3289 LR: 0.00069  
Epoch: [5][600/1715] Elapsed 3m 16s (remain 6m 5s) Loss avg.: 0.4594 Grad: 0.2800 LR: 0.00069  
Epoch: [5][700/1715] Elapsed 3m 49s (remain 5m 32s) Loss avg.: 0.4594 Grad: 0.3417 LR: 0.00069  
Epoch: [5][800/1715] Elapsed 4m 22s (remain 4m 58s) Loss avg.: 0.4596 Grad: 0.4597 LR: 0.00069  
Epoch: [5][900/1715] Elapsed 4m 54s (remain 4m 26s) Loss avg.: 0.4596 Grad: 0.3993 LR: 0.00069  
Epoch: [5][1000/1715] Elapsed 5m 27

Epoch 5 - avg_train_loss: 0.4592  avg_val_loss: 0.4543  time: 597s
Epoch 5 - Accuracy: 0.803916610433315
Epoch 5 - Save Best Score: 0.8039 Model


Epoch: [6][0/1715] Elapsed 0m 1s (remain 42m 30s) Loss avg.: 0.4672 Grad: 0.4172 LR: 0.00055  
Epoch: [6][100/1715] Elapsed 0m 34s (remain 9m 3s) Loss avg.: 0.4567 Grad: 0.5206 LR: 0.00055  
Epoch: [6][200/1715] Elapsed 1m 6s (remain 8m 23s) Loss avg.: 0.4560 Grad: 0.3487 LR: 0.00055  
Epoch: [6][300/1715] Elapsed 1m 39s (remain 7m 46s) Loss avg.: 0.4555 Grad: 0.2861 LR: 0.00055  
Epoch: [6][400/1715] Elapsed 2m 11s (remain 7m 11s) Loss avg.: 0.4550 Grad: 0.2286 LR: 0.00055  
Epoch: [6][500/1715] Elapsed 2m 44s (remain 6m 37s) Loss avg.: 0.4551 Grad: 0.3710 LR: 0.00055  
Epoch: [6][600/1715] Elapsed 3m 16s (remain 6m 4s) Loss avg.: 0.4550 Grad: 0.4115 LR: 0.00055  
Epoch: [6][700/1715] Elapsed 3m 49s (remain 5m 31s) Loss avg.: 0.4555 Grad: 0.3122 LR: 0.00055  
Epoch: [6][800/1715] Elapsed 4m 21s (remain 4m 58s) Loss avg.: 0.4554 Grad: 0.4526 LR: 0.00055  
Epoch: [6][900/1715] Elapsed 4m 54s (remain 4m 25s) Loss avg.: 0.4552 Grad: 0.3873 LR: 0.00055  
Epoch: [6][1000/1715] Elapsed 5m 26

Epoch 6 - avg_train_loss: 0.4553  avg_val_loss: 0.4513  time: 597s
Epoch 6 - Accuracy: 0.8056240153090746
Epoch 6 - Save Best Score: 0.8056 Model


Epoch: [7][0/1715] Elapsed 0m 1s (remain 43m 27s) Loss avg.: 0.4674 Grad: 0.3502 LR: 0.00041  
Epoch: [7][100/1715] Elapsed 0m 34s (remain 9m 4s) Loss avg.: 0.4530 Grad: 0.3392 LR: 0.00041  
Epoch: [7][200/1715] Elapsed 1m 6s (remain 8m 21s) Loss avg.: 0.4526 Grad: 0.2605 LR: 0.00041  
Epoch: [7][300/1715] Elapsed 1m 39s (remain 7m 45s) Loss avg.: 0.4529 Grad: 0.2640 LR: 0.00041  
Epoch: [7][400/1715] Elapsed 2m 11s (remain 7m 11s) Loss avg.: 0.4521 Grad: 0.2883 LR: 0.00041  
Epoch: [7][500/1715] Elapsed 2m 44s (remain 6m 37s) Loss avg.: 0.4519 Grad: 0.4879 LR: 0.00041  
Epoch: [7][600/1715] Elapsed 3m 16s (remain 6m 4s) Loss avg.: 0.4519 Grad: 0.3951 LR: 0.00041  
Epoch: [7][700/1715] Elapsed 3m 48s (remain 5m 31s) Loss avg.: 0.4520 Grad: 0.3739 LR: 0.00041  
Epoch: [7][800/1715] Elapsed 4m 21s (remain 4m 58s) Loss avg.: 0.4522 Grad: 0.3248 LR: 0.00041  
Epoch: [7][900/1715] Elapsed 4m 53s (remain 4m 25s) Loss avg.: 0.4521 Grad: 0.3316 LR: 0.00041  
Epoch: [7][1000/1715] Elapsed 5m 26

Epoch 7 - avg_train_loss: 0.4518  avg_val_loss: 0.4491  time: 596s
Epoch 7 - Accuracy: 0.8066617210466254
Epoch 7 - Save Best Score: 0.8067 Model


Epoch: [8][0/1715] Elapsed 0m 1s (remain 43m 14s) Loss avg.: 0.4463 Grad: 0.2731 LR: 0.00029  
Epoch: [8][100/1715] Elapsed 0m 34s (remain 9m 4s) Loss avg.: 0.4477 Grad: 0.2698 LR: 0.00029  
Epoch: [8][200/1715] Elapsed 1m 6s (remain 8m 21s) Loss avg.: 0.4488 Grad: 0.3719 LR: 0.00029  
Epoch: [8][300/1715] Elapsed 1m 39s (remain 7m 45s) Loss avg.: 0.4483 Grad: 0.3015 LR: 0.00029  
Epoch: [8][400/1715] Elapsed 2m 11s (remain 7m 11s) Loss avg.: 0.4481 Grad: 0.2833 LR: 0.00029  
Epoch: [8][500/1715] Elapsed 2m 44s (remain 6m 38s) Loss avg.: 0.4483 Grad: 0.2628 LR: 0.00029  
Epoch: [8][600/1715] Elapsed 3m 16s (remain 6m 4s) Loss avg.: 0.4485 Grad: 0.2648 LR: 0.00029  
Epoch: [8][700/1715] Elapsed 3m 49s (remain 5m 31s) Loss avg.: 0.4483 Grad: 0.3017 LR: 0.00029  
Epoch: [8][800/1715] Elapsed 4m 21s (remain 4m 58s) Loss avg.: 0.4486 Grad: 0.3987 LR: 0.00029  
Epoch: [8][900/1715] Elapsed 4m 54s (remain 4m 25s) Loss avg.: 0.4485 Grad: 0.4083 LR: 0.00029  
Epoch: [8][1000/1715] Elapsed 5m 26

Epoch 8 - avg_train_loss: 0.4490  avg_val_loss: 0.4472  time: 597s
Epoch 8 - Accuracy: 0.8075434794331678
Epoch 8 - Save Best Score: 0.8075 Model


Epoch: [9][0/1715] Elapsed 0m 1s (remain 44m 3s) Loss avg.: 0.4450 Grad: 0.4141 LR: 0.00019  
Epoch: [9][100/1715] Elapsed 0m 34s (remain 9m 4s) Loss avg.: 0.4426 Grad: 0.4572 LR: 0.00019  
Epoch: [9][200/1715] Elapsed 1m 6s (remain 8m 21s) Loss avg.: 0.4441 Grad: 0.3382 LR: 0.00019  
Epoch: [9][300/1715] Elapsed 1m 39s (remain 7m 45s) Loss avg.: 0.4447 Grad: 0.2310 LR: 0.00019  
Epoch: [9][400/1715] Elapsed 2m 11s (remain 7m 10s) Loss avg.: 0.4450 Grad: 0.2363 LR: 0.00019  
Epoch: [9][500/1715] Elapsed 2m 44s (remain 6m 37s) Loss avg.: 0.4455 Grad: 0.2839 LR: 0.00019  
Epoch: [9][600/1715] Elapsed 3m 16s (remain 6m 4s) Loss avg.: 0.4457 Grad: 0.2521 LR: 0.00019  
Epoch: [9][700/1715] Elapsed 3m 49s (remain 5m 31s) Loss avg.: 0.4459 Grad: 0.2912 LR: 0.00019  
Epoch: [9][800/1715] Elapsed 4m 21s (remain 4m 58s) Loss avg.: 0.4457 Grad: 0.3325 LR: 0.00019  
Epoch: [9][900/1715] Elapsed 4m 54s (remain 4m 25s) Loss avg.: 0.4460 Grad: 0.2813 LR: 0.00019  
Epoch: [9][1000/1715] Elapsed 5m 26s

Epoch 9 - avg_train_loss: 0.4466  avg_val_loss: 0.4461  time: 596s
Epoch 9 - Accuracy: 0.8082605457574304
Epoch 9 - Save Best Score: 0.8083 Model


Epoch: [10][0/1715] Elapsed 0m 1s (remain 43m 20s) Loss avg.: 0.4352 Grad: 0.4309 LR: 0.00012  
Epoch: [10][100/1715] Elapsed 0m 34s (remain 9m 3s) Loss avg.: 0.4449 Grad: 0.3128 LR: 0.00012  
Epoch: [10][200/1715] Elapsed 1m 6s (remain 8m 21s) Loss avg.: 0.4453 Grad: 0.2908 LR: 0.00012  
Epoch: [10][300/1715] Elapsed 1m 38s (remain 7m 44s) Loss avg.: 0.4448 Grad: 0.3134 LR: 0.00012  
Epoch: [10][400/1715] Elapsed 2m 11s (remain 7m 10s) Loss avg.: 0.4445 Grad: 0.2806 LR: 0.00012  
Epoch: [10][500/1715] Elapsed 2m 44s (remain 6m 37s) Loss avg.: 0.4447 Grad: 0.2473 LR: 0.00012  
Epoch: [10][600/1715] Elapsed 3m 16s (remain 6m 4s) Loss avg.: 0.4450 Grad: 0.2836 LR: 0.00012  
Epoch: [10][700/1715] Elapsed 3m 49s (remain 5m 31s) Loss avg.: 0.4447 Grad: 0.2674 LR: 0.00012  
Epoch: [10][800/1715] Elapsed 4m 21s (remain 4m 58s) Loss avg.: 0.4445 Grad: 0.2945 LR: 0.00012  
Epoch: [10][900/1715] Elapsed 4m 54s (remain 4m 25s) Loss avg.: 0.4445 Grad: 0.2671 LR: 0.00012  
Epoch: [10][1000/1715] El

Epoch 10 - avg_train_loss: 0.4450  avg_val_loss: 0.4451  time: 597s
Epoch 10 - Accuracy: 0.8087130845657303
Epoch 10 - Save Best Score: 0.8087 Model
Epoch 10 - Save final model
Score: 0.80871


Epoch: [1][0/1715] Elapsed 0m 1s (remain 44m 12s) Loss avg.: 1.4756 Grad: 1.8506 LR: 0.00100  
Epoch: [1][100/1715] Elapsed 0m 33s (remain 9m 1s) Loss avg.: 0.7768 Grad: 1.5510 LR: 0.00100  
Epoch: [1][200/1715] Elapsed 1m 6s (remain 8m 19s) Loss avg.: 0.6902 Grad: 1.1886 LR: 0.00100  
Epoch: [1][300/1715] Elapsed 1m 38s (remain 7m 42s) Loss avg.: 0.6513 Grad: 0.5428 LR: 0.00100  
Epoch: [1][400/1715] Elapsed 2m 10s (remain 7m 8s) Loss avg.: 0.6277 Grad: 0.9004 LR: 0.00100  
Epoch: [1][500/1715] Elapsed 2m 42s (remain 6m 34s) Loss avg.: 0.6114 Grad: 0.8936 LR: 0.00100  
Epoch: [1][600/1715] Elapsed 3m 15s (remain 6m 1s) Loss avg.: 0.5982 Grad: 0.6037 LR: 0.00100  
Epoch: [1][700/1715] Elapsed 3m 47s (remain 5m 28s) Loss avg.: 0.5878 Grad: 0.9722 LR: 0.00100  
Epoch: [1][800/1715] Elapsed 4m 19s (remain 4m 56s) Loss avg.: 0.5792 Grad: 0.7416 LR: 0.00100  
Epoch: [1][900/1715] Elapsed 4m 52s (remain 4m 23s) Loss avg.: 0.5718 Grad: 0.4655 LR: 0.00100  
Epoch: [1][1000/1715] Elapsed 5m 24s

Epoch 1 - avg_train_loss: 0.5369  avg_val_loss: 0.4834  time: 593s
Epoch 1 - Accuracy: 0.7900285514823014
Epoch 1 - Save Best Score: 0.7900 Model


Epoch: [2][0/1715] Elapsed 0m 1s (remain 43m 37s) Loss avg.: 0.4924 Grad: 0.5606 LR: 0.00098  
Epoch: [2][100/1715] Elapsed 0m 34s (remain 9m 5s) Loss avg.: 0.4882 Grad: 0.7105 LR: 0.00098  
Epoch: [2][200/1715] Elapsed 1m 6s (remain 8m 22s) Loss avg.: 0.4877 Grad: 0.4696 LR: 0.00098  
Epoch: [2][300/1715] Elapsed 1m 39s (remain 7m 45s) Loss avg.: 0.4885 Grad: 0.7298 LR: 0.00098  
Epoch: [2][400/1715] Elapsed 2m 11s (remain 7m 11s) Loss avg.: 0.4882 Grad: 0.4480 LR: 0.00098  
Epoch: [2][500/1715] Elapsed 2m 44s (remain 6m 37s) Loss avg.: 0.4870 Grad: 0.4994 LR: 0.00098  
Epoch: [2][600/1715] Elapsed 3m 16s (remain 6m 4s) Loss avg.: 0.4863 Grad: 0.4533 LR: 0.00098  
Epoch: [2][700/1715] Elapsed 3m 49s (remain 5m 31s) Loss avg.: 0.4861 Grad: 0.7528 LR: 0.00098  
Epoch: [2][800/1715] Elapsed 4m 21s (remain 4m 58s) Loss avg.: 0.4858 Grad: 0.3672 LR: 0.00098  
Epoch: [2][900/1715] Elapsed 4m 54s (remain 4m 25s) Loss avg.: 0.4854 Grad: 0.4606 LR: 0.00098  
Epoch: [2][1000/1715] Elapsed 5m 26

Epoch 2 - avg_train_loss: 0.4814  avg_val_loss: 0.4703  time: 596s
Epoch 2 - Accuracy: 0.7958321248628173
Epoch 2 - Save Best Score: 0.7958 Model


Epoch: [3][0/1715] Elapsed 0m 1s (remain 45m 28s) Loss avg.: 0.4685 Grad: 0.5280 LR: 0.00091  
Epoch: [3][100/1715] Elapsed 0m 34s (remain 9m 3s) Loss avg.: 0.4736 Grad: 0.4035 LR: 0.00091  
Epoch: [3][200/1715] Elapsed 1m 6s (remain 8m 20s) Loss avg.: 0.4737 Grad: 0.5832 LR: 0.00091  
Epoch: [3][300/1715] Elapsed 1m 38s (remain 7m 44s) Loss avg.: 0.4731 Grad: 0.4001 LR: 0.00091  
Epoch: [3][400/1715] Elapsed 2m 11s (remain 7m 9s) Loss avg.: 0.4724 Grad: 0.5228 LR: 0.00091  
Epoch: [3][500/1715] Elapsed 2m 43s (remain 6m 36s) Loss avg.: 0.4725 Grad: 0.5455 LR: 0.00091  
Epoch: [3][600/1715] Elapsed 3m 16s (remain 6m 3s) Loss avg.: 0.4720 Grad: 0.4908 LR: 0.00091  
Epoch: [3][700/1715] Elapsed 3m 48s (remain 5m 30s) Loss avg.: 0.4716 Grad: 0.6016 LR: 0.00091  
Epoch: [3][800/1715] Elapsed 4m 20s (remain 4m 57s) Loss avg.: 0.4713 Grad: 0.4693 LR: 0.00091  
Epoch: [3][900/1715] Elapsed 4m 53s (remain 4m 25s) Loss avg.: 0.4712 Grad: 0.3369 LR: 0.00091  
Epoch: [3][1000/1715] Elapsed 5m 26s

Epoch 3 - avg_train_loss: 0.4701  avg_val_loss: 0.4622  time: 595s
Epoch 3 - Accuracy: 0.7996630371256717
Epoch 3 - Save Best Score: 0.7997 Model


Epoch: [4][0/1715] Elapsed 0m 1s (remain 43m 51s) Loss avg.: 0.4762 Grad: 0.3802 LR: 0.00081  
Epoch: [4][100/1715] Elapsed 0m 33s (remain 9m 3s) Loss avg.: 0.4644 Grad: 0.3878 LR: 0.00081  
Epoch: [4][200/1715] Elapsed 1m 6s (remain 8m 19s) Loss avg.: 0.4648 Grad: 0.4771 LR: 0.00081  
Epoch: [4][300/1715] Elapsed 1m 38s (remain 7m 44s) Loss avg.: 0.4644 Grad: 0.4941 LR: 0.00081  
Epoch: [4][400/1715] Elapsed 2m 11s (remain 7m 10s) Loss avg.: 0.4650 Grad: 0.3712 LR: 0.00081  
Epoch: [4][500/1715] Elapsed 2m 43s (remain 6m 36s) Loss avg.: 0.4648 Grad: 0.3463 LR: 0.00081  
Epoch: [4][600/1715] Elapsed 3m 15s (remain 6m 3s) Loss avg.: 0.4649 Grad: 0.3556 LR: 0.00081  
Epoch: [4][700/1715] Elapsed 3m 48s (remain 5m 30s) Loss avg.: 0.4646 Grad: 0.3304 LR: 0.00081  
Epoch: [4][800/1715] Elapsed 4m 21s (remain 4m 57s) Loss avg.: 0.4644 Grad: 0.4184 LR: 0.00081  
Epoch: [4][900/1715] Elapsed 4m 53s (remain 4m 25s) Loss avg.: 0.4641 Grad: 0.3262 LR: 0.00081  
Epoch: [4][1000/1715] Elapsed 5m 25

Epoch 4 - avg_train_loss: 0.4637  avg_val_loss: 0.4576  time: 596s
Epoch 4 - Accuracy: 0.8026573720102197
Epoch 4 - Save Best Score: 0.8027 Model


Epoch: [5][0/1715] Elapsed 0m 1s (remain 45m 28s) Loss avg.: 0.4725 Grad: 0.3984 LR: 0.00069  
Epoch: [5][100/1715] Elapsed 0m 34s (remain 9m 3s) Loss avg.: 0.4575 Grad: 0.3238 LR: 0.00069  
Epoch: [5][200/1715] Elapsed 1m 6s (remain 8m 20s) Loss avg.: 0.4582 Grad: 0.3669 LR: 0.00069  
Epoch: [5][300/1715] Elapsed 1m 38s (remain 7m 44s) Loss avg.: 0.4584 Grad: 0.4785 LR: 0.00069  
Epoch: [5][400/1715] Elapsed 2m 11s (remain 7m 10s) Loss avg.: 0.4589 Grad: 0.3396 LR: 0.00069  
Epoch: [5][500/1715] Elapsed 2m 43s (remain 6m 36s) Loss avg.: 0.4592 Grad: 0.3005 LR: 0.00069  
Epoch: [5][600/1715] Elapsed 3m 16s (remain 6m 3s) Loss avg.: 0.4591 Grad: 0.3650 LR: 0.00069  
Epoch: [5][700/1715] Elapsed 3m 48s (remain 5m 30s) Loss avg.: 0.4589 Grad: 0.5263 LR: 0.00069  
Epoch: [5][800/1715] Elapsed 4m 20s (remain 4m 57s) Loss avg.: 0.4588 Grad: 0.3580 LR: 0.00069  
Epoch: [5][900/1715] Elapsed 4m 53s (remain 4m 24s) Loss avg.: 0.4588 Grad: 0.3754 LR: 0.00069  
Epoch: [5][1000/1715] Elapsed 5m 25

Epoch 5 - avg_train_loss: 0.4588  avg_val_loss: 0.4539  time: 594s
Epoch 5 - Accuracy: 0.8043232395074396
Epoch 5 - Save Best Score: 0.8043 Model


Epoch: [6][0/1715] Elapsed 0m 1s (remain 43m 13s) Loss avg.: 0.4390 Grad: 0.2841 LR: 0.00055  
Epoch: [6][100/1715] Elapsed 0m 33s (remain 9m 2s) Loss avg.: 0.4553 Grad: 0.3116 LR: 0.00055  
Epoch: [6][200/1715] Elapsed 1m 6s (remain 8m 20s) Loss avg.: 0.4552 Grad: 0.4291 LR: 0.00055  
Epoch: [6][300/1715] Elapsed 1m 38s (remain 7m 44s) Loss avg.: 0.4551 Grad: 0.3496 LR: 0.00055  
Epoch: [6][400/1715] Elapsed 2m 11s (remain 7m 10s) Loss avg.: 0.4549 Grad: 0.3282 LR: 0.00055  
Epoch: [6][500/1715] Elapsed 2m 44s (remain 6m 37s) Loss avg.: 0.4550 Grad: 0.2664 LR: 0.00055  
Epoch: [6][600/1715] Elapsed 3m 16s (remain 6m 4s) Loss avg.: 0.4554 Grad: 0.2590 LR: 0.00055  
Epoch: [6][700/1715] Elapsed 3m 48s (remain 5m 31s) Loss avg.: 0.4553 Grad: 0.4432 LR: 0.00055  
Epoch: [6][800/1715] Elapsed 4m 21s (remain 4m 58s) Loss avg.: 0.4552 Grad: 0.5230 LR: 0.00055  
Epoch: [6][900/1715] Elapsed 4m 53s (remain 4m 25s) Loss avg.: 0.4554 Grad: 0.2672 LR: 0.00055  
Epoch: [6][1000/1715] Elapsed 5m 26

Epoch 6 - avg_train_loss: 0.4549  avg_val_loss: 0.4523  time: 596s
Epoch 6 - Accuracy: 0.805142327463203
Epoch 6 - Save Best Score: 0.8051 Model


Epoch: [7][0/1715] Elapsed 0m 1s (remain 46m 28s) Loss avg.: 0.4385 Grad: 0.2949 LR: 0.00041  
Epoch: [7][100/1715] Elapsed 0m 34s (remain 9m 4s) Loss avg.: 0.4514 Grad: 0.3445 LR: 0.00041  
Epoch: [7][200/1715] Elapsed 1m 6s (remain 8m 20s) Loss avg.: 0.4510 Grad: 0.3558 LR: 0.00041  
Epoch: [7][300/1715] Elapsed 1m 39s (remain 7m 45s) Loss avg.: 0.4503 Grad: 0.3917 LR: 0.00041  
Epoch: [7][400/1715] Elapsed 2m 11s (remain 7m 10s) Loss avg.: 0.4508 Grad: 0.3489 LR: 0.00041  
Epoch: [7][500/1715] Elapsed 2m 43s (remain 6m 37s) Loss avg.: 0.4509 Grad: 0.4114 LR: 0.00041  
Epoch: [7][600/1715] Elapsed 3m 16s (remain 6m 4s) Loss avg.: 0.4507 Grad: 0.4077 LR: 0.00041  
Epoch: [7][700/1715] Elapsed 3m 49s (remain 5m 31s) Loss avg.: 0.4509 Grad: 0.3160 LR: 0.00041  
Epoch: [7][800/1715] Elapsed 4m 21s (remain 4m 58s) Loss avg.: 0.4508 Grad: 0.2658 LR: 0.00041  
Epoch: [7][900/1715] Elapsed 4m 53s (remain 4m 25s) Loss avg.: 0.4509 Grad: 0.2840 LR: 0.00041  
Epoch: [7][1000/1715] Elapsed 5m 26

Epoch 7 - avg_train_loss: 0.4515  avg_val_loss: 0.4494  time: 595s
Epoch 7 - Accuracy: 0.8065028587918598
Epoch 7 - Save Best Score: 0.8065 Model


Epoch: [8][0/1715] Elapsed 0m 1s (remain 42m 51s) Loss avg.: 0.4484 Grad: 0.2638 LR: 0.00029  
Epoch: [8][100/1715] Elapsed 0m 33s (remain 9m 1s) Loss avg.: 0.4493 Grad: 0.3171 LR: 0.00029  
Epoch: [8][200/1715] Elapsed 1m 6s (remain 8m 18s) Loss avg.: 0.4484 Grad: 0.3824 LR: 0.00029  
Epoch: [8][300/1715] Elapsed 1m 38s (remain 7m 42s) Loss avg.: 0.4481 Grad: 0.2607 LR: 0.00029  
Epoch: [8][400/1715] Elapsed 2m 10s (remain 7m 8s) Loss avg.: 0.4483 Grad: 0.4003 LR: 0.00029  
Epoch: [8][500/1715] Elapsed 2m 42s (remain 6m 34s) Loss avg.: 0.4485 Grad: 0.3850 LR: 0.00029  
Epoch: [8][600/1715] Elapsed 3m 15s (remain 6m 1s) Loss avg.: 0.4485 Grad: 0.3693 LR: 0.00029  
Epoch: [8][700/1715] Elapsed 3m 47s (remain 5m 29s) Loss avg.: 0.4486 Grad: 0.3398 LR: 0.00029  
Epoch: [8][800/1715] Elapsed 4m 20s (remain 4m 56s) Loss avg.: 0.4487 Grad: 0.4650 LR: 0.00029  
Epoch: [8][900/1715] Elapsed 4m 52s (remain 4m 24s) Loss avg.: 0.4487 Grad: 0.2704 LR: 0.00029  
Epoch: [8][1000/1715] Elapsed 5m 24s

Epoch 8 - avg_train_loss: 0.4487  avg_val_loss: 0.4478  time: 595s
Epoch 8 - Accuracy: 0.8073547394148913
Epoch 8 - Save Best Score: 0.8074 Model


Epoch: [9][0/1715] Elapsed 0m 1s (remain 49m 29s) Loss avg.: 0.4494 Grad: 0.3360 LR: 0.00019  
Epoch: [9][100/1715] Elapsed 0m 34s (remain 9m 10s) Loss avg.: 0.4466 Grad: 0.3354 LR: 0.00019  
Epoch: [9][200/1715] Elapsed 1m 6s (remain 8m 24s) Loss avg.: 0.4466 Grad: 0.3115 LR: 0.00019  
Epoch: [9][300/1715] Elapsed 1m 39s (remain 7m 47s) Loss avg.: 0.4464 Grad: 0.3432 LR: 0.00019  
Epoch: [9][400/1715] Elapsed 2m 11s (remain 7m 12s) Loss avg.: 0.4465 Grad: 0.2729 LR: 0.00019  
Epoch: [9][500/1715] Elapsed 2m 44s (remain 6m 38s) Loss avg.: 0.4466 Grad: 0.2819 LR: 0.00019  
Epoch: [9][600/1715] Elapsed 3m 16s (remain 6m 5s) Loss avg.: 0.4461 Grad: 0.3310 LR: 0.00019  
Epoch: [9][700/1715] Elapsed 3m 49s (remain 5m 31s) Loss avg.: 0.4463 Grad: 0.3818 LR: 0.00019  
Epoch: [9][800/1715] Elapsed 4m 22s (remain 4m 59s) Loss avg.: 0.4463 Grad: 0.3010 LR: 0.00019  
Epoch: [9][900/1715] Elapsed 4m 54s (remain 4m 26s) Loss avg.: 0.4463 Grad: 0.3387 LR: 0.00019  
Epoch: [9][1000/1715] Elapsed 5m 2

Epoch 9 - avg_train_loss: 0.4463  avg_val_loss: 0.4463  time: 597s
Epoch 9 - Accuracy: 0.8079675979298353
Epoch 9 - Save Best Score: 0.8080 Model


Epoch: [10][0/1715] Elapsed 0m 1s (remain 42m 16s) Loss avg.: 0.4521 Grad: 0.4037 LR: 0.00012  
Epoch: [10][100/1715] Elapsed 0m 33s (remain 9m 3s) Loss avg.: 0.4458 Grad: 0.3194 LR: 0.00012  
Epoch: [10][200/1715] Elapsed 1m 6s (remain 8m 21s) Loss avg.: 0.4452 Grad: 0.3525 LR: 0.00012  
Epoch: [10][300/1715] Elapsed 1m 38s (remain 7m 44s) Loss avg.: 0.4452 Grad: 0.2429 LR: 0.00012  
Epoch: [10][400/1715] Elapsed 2m 11s (remain 7m 10s) Loss avg.: 0.4455 Grad: 0.3351 LR: 0.00012  
Epoch: [10][500/1715] Elapsed 2m 43s (remain 6m 36s) Loss avg.: 0.4456 Grad: 0.2851 LR: 0.00012  
Epoch: [10][600/1715] Elapsed 3m 16s (remain 6m 3s) Loss avg.: 0.4454 Grad: 0.3434 LR: 0.00012  
Epoch: [10][700/1715] Elapsed 3m 48s (remain 5m 31s) Loss avg.: 0.4453 Grad: 0.2687 LR: 0.00012  
Epoch: [10][800/1715] Elapsed 4m 21s (remain 4m 58s) Loss avg.: 0.4454 Grad: 0.3530 LR: 0.00012  
Epoch: [10][900/1715] Elapsed 4m 54s (remain 4m 25s) Loss avg.: 0.4455 Grad: 0.4915 LR: 0.00012  
Epoch: [10][1000/1715] El

Epoch 10 - avg_train_loss: 0.4448  avg_val_loss: 0.4455  time: 597s
Epoch 10 - Accuracy: 0.8084653177463713
Epoch 10 - Save Best Score: 0.8085 Model
Epoch 10 - Save final model
Score: 0.80847


Epoch: [1][0/1715] Elapsed 0m 1s (remain 43m 58s) Loss avg.: 1.4471 Grad: 1.0173 LR: 0.00100  
Epoch: [1][100/1715] Elapsed 0m 34s (remain 9m 3s) Loss avg.: 0.7622 Grad: 1.1758 LR: 0.00100  
Epoch: [1][200/1715] Elapsed 1m 6s (remain 8m 20s) Loss avg.: 0.6819 Grad: 1.0880 LR: 0.00100  
Epoch: [1][300/1715] Elapsed 1m 38s (remain 7m 44s) Loss avg.: 0.6451 Grad: 0.7273 LR: 0.00100  
Epoch: [1][400/1715] Elapsed 2m 11s (remain 7m 9s) Loss avg.: 0.6232 Grad: 0.7319 LR: 0.00100  
Epoch: [1][500/1715] Elapsed 2m 43s (remain 6m 35s) Loss avg.: 0.6071 Grad: 1.1024 LR: 0.00100  
Epoch: [1][600/1715] Elapsed 3m 15s (remain 6m 2s) Loss avg.: 0.5945 Grad: 0.7784 LR: 0.00100  
Epoch: [1][700/1715] Elapsed 3m 47s (remain 5m 29s) Loss avg.: 0.5846 Grad: 1.0238 LR: 0.00100  
Epoch: [1][800/1715] Elapsed 4m 20s (remain 4m 56s) Loss avg.: 0.5760 Grad: 0.5395 LR: 0.00100  
Epoch: [1][900/1715] Elapsed 4m 52s (remain 4m 24s) Loss avg.: 0.5691 Grad: 0.5657 LR: 0.00100  
Epoch: [1][1000/1715] Elapsed 5m 24s

Epoch 1 - avg_train_loss: 0.5362  avg_val_loss: 0.4822  time: 593s
Epoch 1 - Accuracy: 0.7901305731138022
Epoch 1 - Save Best Score: 0.7901 Model


Epoch: [2][0/1715] Elapsed 0m 1s (remain 48m 21s) Loss avg.: 0.4623 Grad: 0.4323 LR: 0.00098  
Epoch: [2][100/1715] Elapsed 0m 33s (remain 9m 2s) Loss avg.: 0.4889 Grad: 0.7319 LR: 0.00098  
Epoch: [2][200/1715] Elapsed 1m 6s (remain 8m 18s) Loss avg.: 0.4888 Grad: 1.2436 LR: 0.00098  
Epoch: [2][300/1715] Elapsed 1m 38s (remain 7m 42s) Loss avg.: 0.4881 Grad: 0.4333 LR: 0.00098  
Epoch: [2][400/1715] Elapsed 2m 10s (remain 7m 9s) Loss avg.: 0.4872 Grad: 0.6838 LR: 0.00098  
Epoch: [2][500/1715] Elapsed 2m 43s (remain 6m 35s) Loss avg.: 0.4868 Grad: 0.5375 LR: 0.00098  
Epoch: [2][600/1715] Elapsed 3m 15s (remain 6m 2s) Loss avg.: 0.4862 Grad: 0.5643 LR: 0.00098  
Epoch: [2][700/1715] Elapsed 3m 47s (remain 5m 29s) Loss avg.: 0.4856 Grad: 0.3724 LR: 0.00098  
Epoch: [2][800/1715] Elapsed 4m 19s (remain 4m 56s) Loss avg.: 0.4853 Grad: 0.6151 LR: 0.00098  
Epoch: [2][900/1715] Elapsed 4m 52s (remain 4m 23s) Loss avg.: 0.4851 Grad: 0.6176 LR: 0.00098  
Epoch: [2][1000/1715] Elapsed 5m 24s

Epoch 2 - avg_train_loss: 0.4823  avg_val_loss: 0.4723  time: 594s
Epoch 2 - Accuracy: 0.794855632104167
Epoch 2 - Save Best Score: 0.7949 Model


Epoch: [3][0/1715] Elapsed 0m 1s (remain 45m 4s) Loss avg.: 0.4664 Grad: 0.6974 LR: 0.00091  
Epoch: [3][100/1715] Elapsed 0m 33s (remain 9m 0s) Loss avg.: 0.4705 Grad: 0.5062 LR: 0.00091  
Epoch: [3][200/1715] Elapsed 1m 6s (remain 8m 17s) Loss avg.: 0.4727 Grad: 0.5236 LR: 0.00091  
Epoch: [3][300/1715] Elapsed 1m 38s (remain 7m 41s) Loss avg.: 0.4736 Grad: 0.5635 LR: 0.00091  
Epoch: [3][400/1715] Elapsed 2m 10s (remain 7m 7s) Loss avg.: 0.4731 Grad: 0.3483 LR: 0.00091  
Epoch: [3][500/1715] Elapsed 2m 43s (remain 6m 35s) Loss avg.: 0.4730 Grad: 0.3622 LR: 0.00091  
Epoch: [3][600/1715] Elapsed 3m 15s (remain 6m 1s) Loss avg.: 0.4731 Grad: 0.5233 LR: 0.00091  
Epoch: [3][700/1715] Elapsed 3m 47s (remain 5m 29s) Loss avg.: 0.4729 Grad: 0.5052 LR: 0.00091  
Epoch: [3][800/1715] Elapsed 4m 19s (remain 4m 56s) Loss avg.: 0.4726 Grad: 0.4011 LR: 0.00091  
Epoch: [3][900/1715] Elapsed 4m 51s (remain 4m 23s) Loss avg.: 0.4724 Grad: 0.4968 LR: 0.00091  
Epoch: [3][1000/1715] Elapsed 5m 24s 

Epoch 3 - avg_train_loss: 0.4712  avg_val_loss: 0.4642  time: 592s
Epoch 3 - Accuracy: 0.7992301739177327
Epoch 3 - Save Best Score: 0.7992 Model


Epoch: [4][0/1715] Elapsed 0m 1s (remain 46m 43s) Loss avg.: 0.4664 Grad: 0.5229 LR: 0.00081  
Epoch: [4][100/1715] Elapsed 0m 33s (remain 9m 0s) Loss avg.: 0.4648 Grad: 0.4871 LR: 0.00081  
Epoch: [4][200/1715] Elapsed 1m 6s (remain 8m 17s) Loss avg.: 0.4647 Grad: 0.3585 LR: 0.00081  
Epoch: [4][300/1715] Elapsed 1m 38s (remain 7m 41s) Loss avg.: 0.4648 Grad: 0.2957 LR: 0.00081  
Epoch: [4][400/1715] Elapsed 2m 10s (remain 7m 7s) Loss avg.: 0.4651 Grad: 0.3690 LR: 0.00081  
Epoch: [4][500/1715] Elapsed 2m 42s (remain 6m 33s) Loss avg.: 0.4646 Grad: 0.4197 LR: 0.00081  
Epoch: [4][600/1715] Elapsed 3m 14s (remain 6m 1s) Loss avg.: 0.4648 Grad: 0.2904 LR: 0.00081  
Epoch: [4][700/1715] Elapsed 3m 47s (remain 5m 28s) Loss avg.: 0.4650 Grad: 0.2982 LR: 0.00081  
Epoch: [4][800/1715] Elapsed 4m 19s (remain 4m 55s) Loss avg.: 0.4649 Grad: 0.4076 LR: 0.00081  
Epoch: [4][900/1715] Elapsed 4m 51s (remain 4m 23s) Loss avg.: 0.4649 Grad: 0.4937 LR: 0.00081  
Epoch: [4][1000/1715] Elapsed 5m 23s

Epoch 4 - avg_train_loss: 0.4642  avg_val_loss: 0.4588  time: 592s
Epoch 4 - Accuracy: 0.8020875083256939
Epoch 4 - Save Best Score: 0.8021 Model


Epoch: [5][0/1715] Elapsed 0m 1s (remain 46m 5s) Loss avg.: 0.4649 Grad: 0.3255 LR: 0.00069  
Epoch: [5][100/1715] Elapsed 0m 33s (remain 9m 1s) Loss avg.: 0.4594 Grad: 0.3826 LR: 0.00069  
Epoch: [5][200/1715] Elapsed 1m 6s (remain 8m 18s) Loss avg.: 0.4594 Grad: 0.5741 LR: 0.00069  
Epoch: [5][300/1715] Elapsed 1m 38s (remain 7m 42s) Loss avg.: 0.4592 Grad: 0.3727 LR: 0.00069  
Epoch: [5][400/1715] Elapsed 2m 10s (remain 7m 8s) Loss avg.: 0.4592 Grad: 0.4479 LR: 0.00069  
Epoch: [5][500/1715] Elapsed 2m 42s (remain 6m 34s) Loss avg.: 0.4597 Grad: 0.4634 LR: 0.00069  
Epoch: [5][600/1715] Elapsed 3m 15s (remain 6m 1s) Loss avg.: 0.4596 Grad: 0.3930 LR: 0.00069  
Epoch: [5][700/1715] Elapsed 3m 47s (remain 5m 29s) Loss avg.: 0.4595 Grad: 0.3246 LR: 0.00069  
Epoch: [5][800/1715] Elapsed 4m 19s (remain 4m 56s) Loss avg.: 0.4596 Grad: 0.3893 LR: 0.00069  
Epoch: [5][900/1715] Elapsed 4m 52s (remain 4m 23s) Loss avg.: 0.4598 Grad: 0.3300 LR: 0.00069  
Epoch: [5][1000/1715] Elapsed 5m 24s 

Epoch 5 - avg_train_loss: 0.4595  avg_val_loss: 0.4558  time: 592s
Epoch 5 - Accuracy: 0.803073474521555
Epoch 5 - Save Best Score: 0.8031 Model


Epoch: [6][0/1715] Elapsed 0m 1s (remain 45m 13s) Loss avg.: 0.4659 Grad: 0.2759 LR: 0.00055  
Epoch: [6][100/1715] Elapsed 0m 33s (remain 9m 0s) Loss avg.: 0.4566 Grad: 0.3840 LR: 0.00055  
Epoch: [6][200/1715] Elapsed 1m 6s (remain 8m 17s) Loss avg.: 0.4560 Grad: 0.2652 LR: 0.00055  
Epoch: [6][300/1715] Elapsed 1m 38s (remain 7m 42s) Loss avg.: 0.4564 Grad: 0.3022 LR: 0.00055  
Epoch: [6][400/1715] Elapsed 2m 10s (remain 7m 7s) Loss avg.: 0.4567 Grad: 0.3586 LR: 0.00055  
Epoch: [6][500/1715] Elapsed 2m 42s (remain 6m 34s) Loss avg.: 0.4564 Grad: 0.3323 LR: 0.00055  
Epoch: [6][600/1715] Elapsed 3m 15s (remain 6m 1s) Loss avg.: 0.4562 Grad: 0.3699 LR: 0.00055  
Epoch: [6][700/1715] Elapsed 3m 47s (remain 5m 28s) Loss avg.: 0.4559 Grad: 0.2767 LR: 0.00055  
Epoch: [6][800/1715] Elapsed 4m 19s (remain 4m 56s) Loss avg.: 0.4558 Grad: 0.4390 LR: 0.00055  
Epoch: [6][900/1715] Elapsed 4m 52s (remain 4m 23s) Loss avg.: 0.4555 Grad: 0.3590 LR: 0.00055  
Epoch: [6][1000/1715] Elapsed 5m 24s

Epoch 6 - avg_train_loss: 0.4553  avg_val_loss: 0.4532  time: 593s
Epoch 6 - Accuracy: 0.8045506020004984
Epoch 6 - Save Best Score: 0.8046 Model


Epoch: [7][0/1715] Elapsed 0m 1s (remain 49m 16s) Loss avg.: 0.4670 Grad: 0.3898 LR: 0.00041  
Epoch: [7][100/1715] Elapsed 0m 34s (remain 9m 7s) Loss avg.: 0.4511 Grad: 0.3564 LR: 0.00041  
Epoch: [7][200/1715] Elapsed 1m 6s (remain 8m 20s) Loss avg.: 0.4520 Grad: 0.3892 LR: 0.00041  
Epoch: [7][300/1715] Elapsed 1m 38s (remain 7m 43s) Loss avg.: 0.4517 Grad: 0.3053 LR: 0.00041  
Epoch: [7][400/1715] Elapsed 2m 11s (remain 7m 9s) Loss avg.: 0.4516 Grad: 0.2754 LR: 0.00041  
Epoch: [7][500/1715] Elapsed 2m 43s (remain 6m 35s) Loss avg.: 0.4516 Grad: 0.3108 LR: 0.00041  
Epoch: [7][600/1715] Elapsed 3m 15s (remain 6m 2s) Loss avg.: 0.4519 Grad: 0.2914 LR: 0.00041  
Epoch: [7][700/1715] Elapsed 3m 47s (remain 5m 29s) Loss avg.: 0.4519 Grad: 0.2626 LR: 0.00041  
Epoch: [7][800/1715] Elapsed 4m 20s (remain 4m 56s) Loss avg.: 0.4519 Grad: 0.4450 LR: 0.00041  
Epoch: [7][900/1715] Elapsed 4m 52s (remain 4m 24s) Loss avg.: 0.4520 Grad: 0.3471 LR: 0.00041  
Epoch: [7][1000/1715] Elapsed 5m 24s

Epoch 7 - avg_train_loss: 0.4519  avg_val_loss: 0.4506  time: 593s
Epoch 7 - Accuracy: 0.8060299156572598
Epoch 7 - Save Best Score: 0.8060 Model


Epoch: [8][0/1715] Elapsed 0m 1s (remain 46m 7s) Loss avg.: 0.4375 Grad: 0.2789 LR: 0.00029  
Epoch: [8][100/1715] Elapsed 0m 33s (remain 9m 1s) Loss avg.: 0.4480 Grad: 0.3005 LR: 0.00029  
Epoch: [8][200/1715] Elapsed 1m 6s (remain 8m 19s) Loss avg.: 0.4483 Grad: 0.3659 LR: 0.00029  
Epoch: [8][300/1715] Elapsed 1m 38s (remain 7m 42s) Loss avg.: 0.4483 Grad: 0.3299 LR: 0.00029  
Epoch: [8][400/1715] Elapsed 2m 10s (remain 7m 8s) Loss avg.: 0.4487 Grad: 0.3553 LR: 0.00029  
Epoch: [8][500/1715] Elapsed 2m 42s (remain 6m 34s) Loss avg.: 0.4487 Grad: 0.2989 LR: 0.00029  
Epoch: [8][600/1715] Elapsed 3m 15s (remain 6m 1s) Loss avg.: 0.4485 Grad: 0.2754 LR: 0.00029  
Epoch: [8][700/1715] Elapsed 3m 47s (remain 5m 28s) Loss avg.: 0.4487 Grad: 0.3756 LR: 0.00029  
Epoch: [8][800/1715] Elapsed 4m 19s (remain 4m 56s) Loss avg.: 0.4488 Grad: 0.3180 LR: 0.00029  
Epoch: [8][900/1715] Elapsed 4m 51s (remain 4m 23s) Loss avg.: 0.4489 Grad: 0.3027 LR: 0.00029  
Epoch: [8][1000/1715] Elapsed 5m 23s 

Epoch 8 - avg_train_loss: 0.4490  avg_val_loss: 0.4481  time: 592s
Epoch 8 - Accuracy: 0.8072702072059336
Epoch 8 - Save Best Score: 0.8073 Model


Epoch: [9][0/1715] Elapsed 0m 1s (remain 46m 30s) Loss avg.: 0.4481 Grad: 0.3981 LR: 0.00019  
Epoch: [9][100/1715] Elapsed 0m 33s (remain 9m 1s) Loss avg.: 0.4483 Grad: 0.2456 LR: 0.00019  
Epoch: [9][200/1715] Elapsed 1m 6s (remain 8m 18s) Loss avg.: 0.4477 Grad: 0.3969 LR: 0.00019  
Epoch: [9][300/1715] Elapsed 1m 38s (remain 7m 43s) Loss avg.: 0.4468 Grad: 0.3155 LR: 0.00019  
Epoch: [9][400/1715] Elapsed 2m 10s (remain 7m 8s) Loss avg.: 0.4467 Grad: 0.2555 LR: 0.00019  
Epoch: [9][500/1715] Elapsed 2m 43s (remain 6m 35s) Loss avg.: 0.4466 Grad: 0.3003 LR: 0.00019  
Epoch: [9][600/1715] Elapsed 3m 15s (remain 6m 2s) Loss avg.: 0.4465 Grad: 0.3288 LR: 0.00019  
Epoch: [9][700/1715] Elapsed 3m 47s (remain 5m 29s) Loss avg.: 0.4464 Grad: 0.2612 LR: 0.00019  
Epoch: [9][800/1715] Elapsed 4m 19s (remain 4m 56s) Loss avg.: 0.4464 Grad: 0.2936 LR: 0.00019  
Epoch: [9][900/1715] Elapsed 4m 52s (remain 4m 23s) Loss avg.: 0.4466 Grad: 0.3700 LR: 0.00019  
Epoch: [9][1000/1715] Elapsed 5m 24s

Epoch 9 - avg_train_loss: 0.4466  avg_val_loss: 0.4463  time: 593s
Epoch 9 - Accuracy: 0.8080127789380714
Epoch 9 - Save Best Score: 0.8080 Model


Epoch: [10][0/1715] Elapsed 0m 1s (remain 46m 20s) Loss avg.: 0.4505 Grad: 0.3325 LR: 0.00012  
Epoch: [10][100/1715] Elapsed 0m 33s (remain 9m 1s) Loss avg.: 0.4430 Grad: 0.2860 LR: 0.00012  
Epoch: [10][200/1715] Elapsed 1m 6s (remain 8m 18s) Loss avg.: 0.4439 Grad: 0.2639 LR: 0.00012  
Epoch: [10][300/1715] Elapsed 1m 38s (remain 7m 42s) Loss avg.: 0.4437 Grad: 0.2943 LR: 0.00012  
Epoch: [10][400/1715] Elapsed 2m 11s (remain 7m 9s) Loss avg.: 0.4444 Grad: 0.4345 LR: 0.00012  
Epoch: [10][500/1715] Elapsed 2m 43s (remain 6m 35s) Loss avg.: 0.4441 Grad: 0.2548 LR: 0.00012  
Epoch: [10][600/1715] Elapsed 3m 15s (remain 6m 2s) Loss avg.: 0.4440 Grad: 0.2932 LR: 0.00012  
Epoch: [10][700/1715] Elapsed 3m 47s (remain 5m 29s) Loss avg.: 0.4444 Grad: 0.2719 LR: 0.00012  
Epoch: [10][800/1715] Elapsed 4m 20s (remain 4m 56s) Loss avg.: 0.4448 Grad: 0.2551 LR: 0.00012  
Epoch: [10][900/1715] Elapsed 4m 52s (remain 4m 24s) Loss avg.: 0.4448 Grad: 0.3462 LR: 0.00012  
Epoch: [10][1000/1715] Ela

Epoch 10 - avg_train_loss: 0.4450  avg_val_loss: 0.4456  time: 593s
Epoch 10 - Accuracy: 0.8084456421460104
Epoch 10 - Save Best Score: 0.8084 Model
Epoch 10 - Save final model
Score: 0.80845
Score: 0.80868
