# About this notebook ...

## Library

In [1]:
import glob
import json
import math
import os
import random
import time
import warnings
from collections import defaultdict
from contextlib import contextmanager

import numpy as np
import optuna
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedKFold
from torch.optim import SGD, Adam
from torch.optim.lr_scheduler import CosineAnnealingLR, CosineAnnealingWarmRestarts, ReduceLROnPlateau
from torch.utils.data import DataLoader, Dataset
from tqdm.notebook import tqdm

In [2]:
warnings.filterwarnings("ignore")

## Config

In [3]:
class Config:
    seed = 440

    n_class = 4
    n_fold = 10

    geese_net_layers = 12
    geese_net_filters = 32

    gradient_accumulation_steps = 1
    max_grad_norm = 1000

    num_workers = 4
    batch_size = 3200

    scheduler = "CosineAnnealingWarmRestarts"
    # factor = 0.2  # ReduceLROnPlateau
    # patience = 4  # ReduceLROnPlateau
    # eps = 1e-6  # ReduceLROnPlateau
    # T_max = 10  # CosineAnnealingLR
    T_0 = 10  # CosineAnnealingWarmRestarts

    criterion = "CrossEntropyLoss"
    lr = 1e-3
    min_lr = 1e-4
    weight_decay = 1e-5

    epochs = 10
    model_name = "geese_net"
    pre_train_file = ""

    print_freq = 100

    train = True
    tuning = False
    debug = False
    apex = False

In [4]:
if Config.tuning:
    Config.epochs = 2

if Config.debug:
    Config.epochs = 1

In [5]:
if Config.apex:
    from apex import amp

In [6]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

## Load Data

In [7]:
BASE_DIR = "../input/hungrygeeseepisode/hungry-geese-episode/"
OUTPUT_DIR = "pre-models/"

if not os.path.exists(OUTPUT_DIR):
    os.makedirs(OUTPUT_DIR)

In [8]:
paths = [path for path in glob.glob(BASE_DIR + "*.json") if "info" not in path]
print(len(paths))

27362


In [9]:
# fit for memory size...
paths = paths[-11000:]
# paths = paths[:-11000]
print(len(paths))

11000


In [10]:
if Config.debug:
    paths = paths[:10]

## Utils

In [11]:
@contextmanager
def timer(name):
    t0 = time.time()
    LOGGER.info(f"[{name}] start")
    yield
    LOGGER.info(f"[{name}] done in {time.time() - t0:.0f} s.")


def init_logger(log_file=OUTPUT_DIR + "train.log"):
    from logging import INFO, FileHandler, Formatter, StreamHandler, getLogger

    logger = getLogger(__name__)
    logger.setLevel(INFO)
    handler1 = StreamHandler()
    handler1.setFormatter(Formatter("%(message)s"))
    handler2 = FileHandler(filename=log_file)
    handler2.setFormatter(Formatter("%(message)s"))
    logger.addHandler(handler1)
    logger.addHandler(handler2)
    return logger


LOGGER = init_logger()


def seed_torch(seed=42):
    random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True


seed_torch(seed=Config.seed)

In [12]:
def reverse_ns(y):
    if y == 0:
        return 1
    if y == 1:
        return 0
    return y


def reverse_we(y):
    if y == 2:
        return 3
    if y == 3:
        return 2
    return y


def reverse_nswe(y):
    return reverse_ns(reverse_we(y))

## Observation

In [13]:
next_position_map = {}
for pos in range(77):
    position = []
    position.append((11 * (1 + pos // 11) + pos % 11) % 77)
    position.append((11 * (-1 + pos // 11) + pos % 11) % 77)
    position.append((11 * (pos // 11) + (pos + 1) % 11) % 77)
    position.append((11 * (pos // 11) + (pos - 1) % 11) % 77)
    next_position_map[pos] = set(position)

In [14]:
def make_input(obses):
    b = np.zeros((17, 7 * 11), dtype=np.float16)
    obs = obses[-1]

    for p, pos_list in enumerate(obs["geese"]):
        pid = (p - obs["index"]) % 4

        # head position
        for pos in pos_list[:1]:
            b[0 + pid, pos] = 1
        # tip position
        for pos in pos_list[-1:]:
            b[4 + pid, pos] = 1
        # whole position
        for pos in pos_list:
            b[8 + pid, pos] = 1

    # previous head position
    if len(obses) > 1:
        obs_prev = obses[-2]
        for p, pos_list in enumerate(obs_prev["geese"]):
            for pos in pos_list[:1]:
                b[12 + (p - obs["index"]) % 4, pos] = 1

    # food
    for pos in obs["food"]:
        b[16, pos] = 1

    return b.reshape(-1, 7, 11)

In [15]:
def get_reverse_cube(obses):
    """
    尻尾から順番に 1, 0.9, 0.8, ... という並び
    """
    b = np.zeros((4, 7 * 11), dtype=np.float16)
    obs = obses[-1]

    for p, geese in enumerate(obs["geese"]):
        # whole position reverse
        for num_reverse, pos in enumerate(geese[::-1]):
            b[(p - obs["index"]) % 4, pos] = 1 - num_reverse * 0.1

    return b.reshape(-1, 7, 11)

In [16]:
def get_next_disappear_cube(obses):
    """
    次になくなる場所: 1
    次になくなる可能性のある場所: 0.5
    """
    b = np.zeros((4, 7 * 11), dtype=np.float16)
    obs = obses[-1]
    step = obs["step"]

    # foodを食べる可能性があるか。
    eat_food_possibility = defaultdict(int)
    for p, geese in enumerate(obs["geese"]):
        for pos in geese[:1]:
            if not next_position_map[pos].isdisjoint(obs["food"]):
                eat_food_possibility[p] = 1

    if (step % 40) == 39:  # 1つ短くなる
        for p, geese in enumerate(obs["geese"]):
            if eat_food_possibility[p]:  # 尻尾が1、尻尾の１つ前0.5
                for pos in geese[-1:]:
                    b[(p - obs["index"]) % 4, pos] = 1
                for pos in geese[-2:-1]:
                    b[(p - obs["index"]) % 4, pos] = 0.5
            else:  # 食べる可能性なし -> 尻尾が1, 尻尾の1つ前1
                for pos in geese[-2:]:
                    b[(p - obs["index"]) % 4, pos] = 1
    else:  # 1つ短くならない
        for p, geese in enumerate(obs["geese"]):
            if eat_food_possibility[p]:  # 食べる可能性があり -> 尻尾を0.5
                for pos in geese[-1:]:
                    b[(p - obs["index"]) % 4, pos] = 0.5
            else:  # 食べる可能性なし # 尻尾を1
                for pos in geese[-1:]:
                    b[(p - obs["index"]) % 4, pos] = 1

    return b.reshape(-1, 7, 11)

In [17]:
def get_step_cube_v2(obses):
    """
    step0: 0, step199: 1
    step0: 0, step39 + 40n: 1
    """
    b = np.zeros((1, 7, 11), dtype=np.float16)
    obs = obses[-1]
    step = obs["step"]

    b[:, :, :5] = (step % 200) / 199
    b[:, :, 5:] = (step % 40) / 39

    return b

In [18]:
def get_length_cube(obses):
    b = np.zeros((2, 7, 11), dtype=np.float16)
    obs = obses[-1]

    my_length = len(obs["geese"][obs["index"]])
    opposite1_length = len(obs["geese"][(obs["index"] + 1) % 4])
    opposite2_length = len(obs["geese"][(obs["index"] + 2) % 4])
    opposite3_length = len(obs["geese"][(obs["index"] + 3) % 4])

    b[0] = my_length / 10
    max_opposite_length = max(opposite1_length, opposite2_length, opposite3_length)
    b[1, :, 0:2] = (my_length - max_opposite_length) / 10
    b[1, :, 2:5] = (my_length - opposite1_length) / 10
    b[1, :, 5:8] = (my_length - opposite2_length) / 10
    b[1, :, 8:11] = (my_length - opposite3_length) / 10

    return b

In [19]:
def get_features(obses):
    b = np.zeros((7 * 11), dtype=np.float16)
    obs = obses[-1]
    step = obs["step"]

    my_goose = obs["geese"][obs["index"]]
    my_length = len(my_goose)

    # num step
    b[0] = (step - 194) if step >= 195 else 0
    b[1] = (step % 40 - 35) if step % 40 > 35 else 0

    """
    2-4: difference between my_length and opponent length (-3 to 3)
    """
    for p, pos_list in enumerate(obs["geese"]):
        pid = (p - obs["index"]) % 4
        p_length = len(pos_list)

        if pid == 0:
            continue

        b[1 + pid] = max(min(my_length - p_length, 3), -3) + 3

    """
    5-7: difference between my head position and opponent one
    """
    if my_length != 0:

        for p, pos_list in enumerate(obs["geese"]):
            pid = (p - obs["index"]) % 4

            if pid == 0 or len(pos_list) == 0:
                continue

            diff = abs(my_goose[0] - pos_list[0])
            x_ = diff % 11
            x = min(x_, 11 - x_)
            y_ = diff // 11
            y = min(y_, 7 - y_)
            b[4 + pid] = x + y

    return b.reshape(1, 7, 11)

## Data

In [20]:
def create_dataset_from_json(filepath, json_object=None, standing=0):
    if json_object is None:
        json_open = open(path, "r")
        json_load = json.load(json_open)
    else:
        json_load = json_object

    try:
        winner_index = np.argmax(np.argsort(json_load["rewards"]) == 3 - standing)

        obses = []
        X = []
        y = []
        actions = {"NORTH": 0, "SOUTH": 1, "WEST": 2, "EAST": 3}

        for i in range(len(json_load["steps"]) - 1):
            if json_load["steps"][i][winner_index]["status"] == "ACTIVE":
                y_ = json_load["steps"][i + 1][winner_index]["action"]
                if y_ is not None:
                    step = json_load["steps"][i]
                    step[winner_index]["observation"]["geese"] = step[0]["observation"]["geese"]
                    step[winner_index]["observation"]["food"] = step[0]["observation"]["food"]
                    step[winner_index]["observation"]["step"] = step[0]["observation"]["step"]
                    obses.append(step[winner_index]["observation"])
                    y.append(actions[y_])

                    y.append(reverse_ns(actions[y_]))  # 上下反転
                    y.append(reverse_we(actions[y_]))  # 左右反転
                    y.append(reverse_nswe(actions[y_]))  # 上下左右反転

        for j in range(len(obses)):
            # X_ = make_input(obses[: j + 1])

            # 反転可能な特徴量
            X_ = []
            X_.append(make_input(obses[: j + 1]))
            X_.append(get_reverse_cube(obses[: j + 1]))
            X_.append(get_next_disappear_cube(obses[: j + 1]))

            # 反転不可能な特徴量
            X_i = []
            # X_i.append(get_step_cube_v2(obses[: j + 1]))
            # X_i.append(get_length_cube(obses[: j + 1]))
            X_i.append(get_features(obses[: j + 1]))

            X_ = np.concatenate(X_)
            X_i = np.concatenate(X_i)

            X.append(np.concatenate([X_, X_i]))
            X.append(np.concatenate([X_[:, ::-1, :], X_i]))  # 上下反転
            X.append(np.concatenate([X_[:, :, ::-1], X_i]))  # 左右反転
            X.append(np.concatenate([X_[:, ::-1, ::-1], X_i]))  # 上下左右反転

        X = np.array(X, dtype=np.float16)  # [starting_step:]
        y = np.array(y, dtype=np.uint8)  # [starting_step:]

        return X, y
    except Exception as e:
        if Config.debug:
            raise Exception from e
        return 0, 0

In [21]:
X_train = []
y_train = []

for path in tqdm(paths[: int(len(paths))]):
    X, y = create_dataset_from_json(path, standing=0)  # use only winners' moves
    if X is not 0:
        X_train.append(X)
        y_train.append(y)

X_train = np.concatenate(X_train)
y_train = np.concatenate(y_train)

print(f"Num episode: {len(X_train)}")

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=11000.0), HTML(value='')))


Num episode: 6861292


In [22]:
unique_ = False

In [23]:
# %%time

# X_train, unique_index = np.unique(X_train, axis=0, return_index=True)  # remove duplicate
# y_train = y_train[unique_index]

# y_train = np.eye(4, dtype="uint8")[y_train]  # to categorical

# print(f"Num episode: {len(X_train)}")

In [24]:
if unique_:
    X_train_sum_obs = X_train.reshape(X_train.shape[0], -1).sum(1)
    X_train_group = np.unique(X_train_sum_obs)
    X_train_group.shape

In [25]:
if unique_:
    X_train_unique = []
    y_train_unique = []
    for group in tqdm(X_train_group):
        group_index = np.where(X_train_sum_obs == group)

        X_train_ = X_train[group_index]
        y_train_ = y_train[group_index]

        X_train_, unique_index = np.unique(X_train_, axis=0, return_index=True)  # remove duplicate
        y_train_ = y_train_[unique_index]

        X_train_unique.append(X_train_)
        y_train_unique.append(y_train_)

    X_train = np.concatenate(X_train_unique)
    y_train = np.concatenate(y_train_unique)

    print(f"Num episode: {len(X_train)}")

In [26]:
if unique_:
    del X_train_sum_obs
    del X_train_group
    del X_train_unique
    del y_train_unique
    del X_train_
    del y_train_
    del group_index
    del unique_index

In [27]:
X_train = X_train.astype(np.float32)
X_train.dtype

dtype('float32')

In [28]:
if Config.debug:
    X_train = X_train[:1000]
    y_train = y_train[:1000]

In [29]:
y_df = pd.DataFrame(y_train, dtype=np.uint8)
y_df.columns = ["action"]
y_df

Unnamed: 0,action
0,3
1,3
2,2
3,2
4,0
...,...
6861287,3
6861288,1
6861289,0
6861290,1


## CV Split

In [30]:
folds = y_df.copy()
Fold = StratifiedKFold(n_splits=Config.n_fold, shuffle=True, random_state=Config.seed)
for n, (train_index, val_index) in enumerate(Fold.split(folds, folds["action"])):
    folds.loc[val_index, "fold"] = int(n)
folds["fold"] = folds["fold"].astype(np.uint8)
print(folds.groupby(["fold", "action"]).size())

fold  action
0     0         165425
      1         165426
      2         177639
      3         177640
1     0         165425
      1         165426
      2         177639
      3         177640
2     0         165425
      1         165425
      2         177640
      3         177639
3     0         165425
      1         165425
      2         177640
      3         177639
4     0         165426
      1         165425
      2         177639
      3         177639
5     0         165426
      1         165425
      2         177639
      3         177639
6     0         165426
      1         165425
      2         177639
      3         177639
7     0         165426
      1         165425
      2         177639
      3         177639
8     0         165425
      1         165426
      2         177639
      3         177639
9     0         165425
      1         165426
      2         177639
      3         177639
dtype: int64


## Dataset

In [31]:
class TrainDataset(Dataset):
    def __init__(self, array, label):
        self.array = array
        self.label = label

    def __len__(self):
        return self.array.shape[0]

    def __getitem__(self, idx):
        return self.array[idx], torch.tensor(self.label[idx]).long()


class TestDataset(Dataset):
    def __init__(self, array):
        self.array = array

    def __len__(self):
        return self.array.shape[0]

    def __getitem__(self, idx):
        return self.array[idx]

In [32]:
# Test

if Config.debug or False:
    train_ds = TrainDataset(X_train, y_train)

    for i in range(1):
        obs, action = train_ds[i]
        print(obs.shape, action)

## Model

In [33]:
class TorusConv2d(nn.Module):
    def __init__(self, input_dim, output_dim, kernel_size, do=False, bn=True):
        super().__init__()
        self.edge_size = (kernel_size[0] // 2, kernel_size[1] // 2)
        self.conv = nn.Conv2d(input_dim, output_dim, kernel_size=kernel_size)
        self.do = nn.Dropout2d(p=0.1) if do else None
        self.bn = nn.BatchNorm2d(output_dim) if bn else None

    def forward(self, x):
        h = torch.cat([x[:, :, :, -self.edge_size[1] :], x, x[:, :, :, : self.edge_size[1]]], dim=3)
        h = torch.cat([h[:, :, -self.edge_size[0] :], h, h[:, :, : self.edge_size[0]]], dim=2)
        h = self.conv(h)
        h = self.do(h) if self.do is not None else h
        h = self.bn(h) if self.bn is not None else h
        return h

In [34]:
class GeeseNetAlpha(nn.Module):
    def __init__(self):
        super().__init__()

        layers = Config.geese_net_layers
        filters = Config.geese_net_filters
        dim = filters * 5 + 30

        self.embed_step = nn.Embedding(5, 3)
        self.embed_hunger = nn.Embedding(5, 3)
        self.embed_diff_len = nn.Embedding(7, 4)
        self.embed_diff_head = nn.Embedding(9, 4)

        self.conv0 = TorusConv2d(25, filters, (3, 3))
        self.blocks = nn.ModuleList([TorusConv2d(filters, filters, (3, 3), True) for _ in range(layers)])
        self.conv1 = TorusConv2d(filters, filters, (5, 5))

        self.head_p1 = nn.Linear(dim, dim // 2, bias=True)
        self.head_p2 = nn.Linear(dim // 2, 4, bias=False)
        self.head_v1 = nn.Linear(dim, dim // 2, bias=True)
        self.head_v2 = nn.Linear(dim // 2, 1, bias=False)

    def forward(self, x, _=None):
        x_feats = x[:, -1].view(x.size(0), -1).long()

        # Embedding for features
        e_step = self.embed_step(x_feats[:, 0])
        e_hung = self.embed_hunger(x_feats[:, 1])
        e_diff_l = self.embed_diff_len(x_feats[:, 2:5]).view(x.size(0), -1)
        e_diff_h = self.embed_diff_head(x_feats[:, 5:8]).view(x.size(0), -1)

        x = x[:, :-1].float()

        # CNN for observation
        h = F.relu_(self.conv0(x))

        for block in self.blocks:
            h = F.relu_(h + block(h))

        h = F.relu_(h + self.conv1(h))

        # Extract head position
        h_head = (h * x[:, :1]).view(h.size(0), h.size(1), -1).sum(-1)
        h_head2 = (h * x[:, 1:2]).view(h.size(0), h.size(1), -1).sum(-1)
        h_head3 = (h * x[:, 2:3]).view(h.size(0), h.size(1), -1).sum(-1)
        h_head4 = (h * x[:, 3:4]).view(h.size(0), h.size(1), -1).sum(-1)
        h_avg = h.view(h.size(0), h.size(1), -1).mean(-1)

        # Merge features
        h = torch.cat(
            [
                h_head,
                h_head2,
                h_head3,
                h_head4,
                h_avg,
                e_step,
                e_hung,
                e_diff_l,
                e_diff_h,
            ],
            1,
        ).view(1, h.size(0), -1)

        h_p = F.relu_(self.head_p1(h.view(x.size(0), -1)))
        p = self.head_p2(h_p)

        h_v = F.relu_(self.head_v1(h.view(x.size(0), -1)))
        v = torch.tanh(self.head_v2(h_v))

        return {"policy": p, "value": v}

In [35]:
# Test

if Config.debug or False:
    model = GeeseNetAlpha()
    # print(model)

    params = sum(p.numel() for p in model.parameters())
    print(f"params: {params:,}")

    train_ds = TrainDataset(X_train, y_train)
    train_loader = DataLoader(train_ds, batch_size=4, shuffle=True, num_workers=4, pin_memory=True, drop_last=True)

    for obs, action in train_loader:
        print(f"input shape: {obs.shape}")
        output = model(obs)
        print(output)
        print(f"{torch.argmax(output['policy'], dim=1)}")
        break

## Loss

## Scoring

In [36]:
def get_score(y_true, y_pred):
    return accuracy_score(y_true, y_pred)

In [37]:
def get_result(result_df):
    preds = result_df["preds"].values
    labels = result_df["action"].values
    score = get_score(labels, preds)
    LOGGER.info(f"Score: {score:<.5f}")
    return score

## Helper functions

In [38]:
class AverageMeter(object):
    """Computes and stores the average and current value"""

    def __init__(self):
        self.reset()

    def reset(self):
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count


def asMinutes(s):
    m = math.floor(s / 60)
    s -= m * 60
    return "%dm %ds" % (m, s)


def timeSince(since, percent):
    now = time.time()
    s = now - since
    es = s / (percent)
    rs = es - s
    return "%s (remain %s)" % (asMinutes(s), asMinutes(rs))

In [39]:
def train_fn(train_loader, model, criterion, optimizer, epoch, scheduler, device):
    losses = AverageMeter()

    # switch to train mode
    model.train()
    start = time.time()

    for step, (obs, action) in enumerate(train_loader):
        obs = obs.to(device)
        action = action.to(device)
        batch_size = action.size(0)

        y_preds = model(obs)["policy"]

        loss = criterion(y_preds, action)
        losses.update(loss.item(), batch_size)
        if Config.gradient_accumulation_steps > 1:
            loss = loss / Config.gradient_accumulation_steps
        if Config.apex:
            with amp.scale_loss(loss, optimizer) as scaled_loss:
                scaled_loss.backward()
        else:
            loss.backward()

        grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), Config.max_grad_norm)

        if (step + 1) % Config.gradient_accumulation_steps == 0:
            optimizer.step()
            optimizer.zero_grad()

        if step % Config.print_freq == 0 or step == (len(train_loader) - 1):
            print(
                f"Epoch: [{epoch + 1}][{step}/{len(train_loader)}] "
                f"Elapsed {timeSince(start, float(step + 1) / len(train_loader)):s} "
                f"Loss avg.: {losses.avg:.4f} "
                f"Grad: {grad_norm:.4f} "
                f"LR: {scheduler.get_last_lr()[0]:.5f}  "
            )

    return losses.avg

In [40]:
def valid_fn(valid_loader, model, criterion, device):
    losses = AverageMeter()

    # switch to evaluation mode
    model.eval()
    preds = []
    start = time.time()

    for step, (obs, action) in enumerate(valid_loader):
        obs = obs.to(device)
        action = action.to(device)
        batch_size = action.size(0)

        # compute loss
        with torch.no_grad():
            y_preds = model(obs)["policy"]

        loss = criterion(y_preds, action)
        losses.update(loss.item(), batch_size)

        # record accuracy
        preds.append(y_preds.softmax(1).to("cpu").numpy())
        if Config.gradient_accumulation_steps > 1:
            loss = loss / Config.gradient_accumulation_steps

        if step % Config.print_freq == 0 or step == (len(valid_loader) - 1):
            print(
                f"Eval: [{step}/{len(valid_loader)}] "
                f"Elapsed {timeSince(start, float(step + 1) / len(valid_loader)):s} "
                f"Loss avg.: {losses.avg:.4f} "
            )
    predictions = np.concatenate(preds)
    return losses.avg, predictions

## Train loop

In [41]:
def train_loop(folds, fold):

    LOGGER.info(f"========== fold: {fold} training ==========")

    # ====================================================
    # Data Loader
    # ====================================================
    # X_train_folds = X_train[folds["fold"] != fold]
    # X_valid_folds = X_train[folds["fold"] == fold]

    # y_train_folds = y_train[folds["fold"] != fold]
    y_valid_folds = y_train[folds["fold"] == fold]

    # y_df_train_folds = y_df[folds["fold"] != fold]
    y_df_valid_folds = y_df[folds["fold"] == fold]

    # train_dataset = TrainDataset(X_train[folds["fold"] != fold], y_train[folds["fold"] != fold])
    # valid_dataset = TrainDataset(X_train[folds["fold"] == fold], y_valid_folds)

    train_loader = DataLoader(
        TrainDataset(X_train[folds["fold"] != fold], y_train[folds["fold"] != fold]),
        batch_size=Config.batch_size,
        shuffle=True,
        num_workers=Config.num_workers,
        pin_memory=True,
        drop_last=True,
    )
    valid_loader = DataLoader(
        TrainDataset(X_train[folds["fold"] == fold], y_valid_folds),
        batch_size=Config.batch_size,
        shuffle=False,
        num_workers=Config.num_workers,
        pin_memory=True,
        drop_last=False,
    )

    # ====================================================
    # Scheduler
    # ====================================================
    def get_scheduler(optimizer):
        if Config.scheduler == "ReduceLROnPlateau":
            scheduler = ReduceLROnPlateau(
                optimizer, mode="min", factor=Config.factor, patience=Config.patience, verbose=True, eps=Config.eps
            )
        elif Config.scheduler == "CosineAnnealingLR":
            scheduler = CosineAnnealingLR(optimizer, T_max=Config.T_max, eta_min=Config.min_lr, last_epoch=-1)
        elif Config.scheduler == "CosineAnnealingWarmRestarts":
            scheduler = CosineAnnealingWarmRestarts(
                optimizer, T_0=Config.T_0, T_mult=1, eta_min=Config.min_lr, last_epoch=-1
            )
        return scheduler

    # ====================================================
    # model & optimizer
    # ====================================================
    model = GeeseNetAlpha()
    # try:
    #     model.load_state_dict(torch.load(os.path.join(OUTPUT_DIR, Config.pre_train_file)))
    # except:
    #     print(f"Failed to load pre-train weight.")

    # Disable training for value network
    # for param in model.head_v1.parameters():
    #     param.requires_grad = False
    # for param in model.head_v2.parameters():
    #     param.requires_grad = False

    model.to(device)

    # Use multi GPU
    if device == torch.device("cuda") and not Config.apex:
        model = torch.nn.DataParallel(model)  # make parallel

    optimizer = Adam(model.parameters(), lr=Config.lr, weight_decay=Config.weight_decay, amsgrad=False)
    scheduler = get_scheduler(optimizer)

    # ====================================================
    # apex
    # ====================================================
    if Config.apex:
        model, optimizer = amp.initialize(model, optimizer, opt_level="O1", verbosity=0)

    # ====================================================
    # Criterion
    # ====================================================
    def get_criterion():
        if Config.criterion == "CrossEntropyLoss":
            criterion = nn.CrossEntropyLoss()
        return criterion

    criterion = get_criterion()

    # ====================================================
    # loop
    # ====================================================
    best_score = 0.0
    best_loss = np.inf
    best_preds = None

    for epoch in range(Config.epochs):

        start_time = time.time()

        # train
        avg_loss = train_fn(train_loader, model, criterion, optimizer, epoch, scheduler, device)

        # eval
        avg_val_loss, preds = valid_fn(valid_loader, model, criterion, device)

        if isinstance(scheduler, ReduceLROnPlateau):
            scheduler.step(avg_val_loss)
        elif isinstance(scheduler, CosineAnnealingLR):
            scheduler.step()
        elif isinstance(scheduler, CosineAnnealingWarmRestarts):
            scheduler.step()

        # scoring
        score = get_score(y_valid_folds, preds.argmax(1))

        elapsed = time.time() - start_time

        LOGGER.info(
            f"Epoch {epoch+1} - avg_train_loss: {avg_loss:.4f}  avg_val_loss: {avg_val_loss:.4f}  time: {elapsed:.0f}s"
        )
        LOGGER.info(f"Epoch {epoch+1} - Accuracy: {score}")

        if score > best_score:
            best_score = score
            LOGGER.info(f"Epoch {epoch+1} - Save Best Score: {best_score:.4f} Model")
            torch.save(model.module.state_dict(), OUTPUT_DIR + f"{Config.model_name}_fold{fold}_best.pth")
            best_preds = preds

        if epoch == Config.epochs - 1:
            LOGGER.info(f"Epoch {epoch+1} - Save final model")
            torch.save(model.module.state_dict(), OUTPUT_DIR + f"{Config.model_name}_fold{fold}_final.pth")

    if Config.train:
        y_df_valid_folds[[str(c) for c in range(Config.n_class)]] = best_preds
        y_df_valid_folds["preds"] = best_preds.argmax(1)

        return y_df_valid_folds

    if Config.tuning:
        score = get_score(y_df_valid_folds["action"].values, best_preds.argmax(1))
        return score

In [42]:
def objective(trial):
    Config.geese_net_layers = trial.suggest_int("layers", 6, 18)
    Config.geese_net_filters = trial.suggest_int("filters", 32, 128)

    score = train_loop(folds, 0)
    return score

## Main


In [43]:
def main():
    if Config.train:
        # train
        oof_df = pd.DataFrame()
        for fold in range(Config.n_fold):
            _oof_df = train_loop(folds, fold)
            oof_df = pd.concat([oof_df, _oof_df])
            LOGGER.info(f"========== fold: {fold} result ==========")
            get_result(_oof_df)
            # break  # fold 1つだけ
        # CV result
        LOGGER.info(f"========== CV ==========")
        get_result(oof_df)
        # save result
        oof_df.to_csv(OUTPUT_DIR + "oof_df.csv", index=False)

    if Config.tuning:
        study = optuna.create_study(direction="maximize")
        study.optimize(objective, n_trials=10)

        trial = study.best_trial
        print("Best trial:")
        print("  Value: ", trial.value)
        print("  Params: ")
        for key, value in trial.params.items():
            print("    {}: {}".format(key, value))

In [44]:
if __name__ == "__main__":
    main()



Failed to load pre-train weight.
Epoch: [1][0/1929] Elapsed 0m 4s (remain 140m 19s) Loss avg.: 1.3964 Grad: 0.3497 LR: 0.00100  
Epoch: [1][100/1929] Elapsed 1m 4s (remain 19m 18s) Loss avg.: 0.6997 Grad: 0.4585 LR: 0.00100  
Epoch: [1][200/1929] Elapsed 2m 3s (remain 17m 43s) Loss avg.: 0.6335 Grad: 0.9590 LR: 0.00100  
Epoch: [1][300/1929] Elapsed 3m 3s (remain 16m 31s) Loss avg.: 0.6019 Grad: 0.7024 LR: 0.00100  
Epoch: [1][400/1929] Elapsed 4m 2s (remain 15m 25s) Loss avg.: 0.5811 Grad: 0.5451 LR: 0.00100  
Epoch: [1][500/1929] Elapsed 5m 2s (remain 14m 23s) Loss avg.: 0.5668 Grad: 0.6449 LR: 0.00100  
Epoch: [1][600/1929] Elapsed 6m 2s (remain 13m 20s) Loss avg.: 0.5557 Grad: 0.5673 LR: 0.00100  
Epoch: [1][700/1929] Elapsed 7m 2s (remain 12m 19s) Loss avg.: 0.5473 Grad: 0.4344 LR: 0.00100  
Epoch: [1][800/1929] Elapsed 8m 1s (remain 11m 18s) Loss avg.: 0.5399 Grad: 0.4112 LR: 0.00100  
Epoch: [1][900/1929] Elapsed 9m 1s (remain 10m 17s) Loss avg.: 0.5341 Grad: 0.3787 LR: 0.00100 

Epoch 1 - avg_train_loss: 0.5029  avg_val_loss: 0.4717  time: 1191s
Epoch 1 - Accuracy: 0.7951131709734307
Epoch 1 - Save Best Score: 0.7951 Model


Epoch: [2][0/1929] Elapsed 0m 1s (remain 62m 4s) Loss avg.: 0.4927 Grad: 0.3201 LR: 0.00098  
Epoch: [2][100/1929] Elapsed 1m 1s (remain 18m 36s) Loss avg.: 0.4629 Grad: 0.6122 LR: 0.00098  
Epoch: [2][200/1929] Elapsed 2m 1s (remain 17m 23s) Loss avg.: 0.4634 Grad: 0.3572 LR: 0.00098  
Epoch: [2][300/1929] Elapsed 3m 1s (remain 16m 19s) Loss avg.: 0.4637 Grad: 0.2942 LR: 0.00098  
Epoch: [2][400/1929] Elapsed 4m 0s (remain 15m 17s) Loss avg.: 0.4632 Grad: 0.3097 LR: 0.00098  
Epoch: [2][500/1929] Elapsed 5m 0s (remain 14m 16s) Loss avg.: 0.4631 Grad: 0.3064 LR: 0.00098  
Epoch: [2][600/1929] Elapsed 6m 0s (remain 13m 16s) Loss avg.: 0.4628 Grad: 0.3038 LR: 0.00098  
Epoch: [2][700/1929] Elapsed 7m 0s (remain 12m 16s) Loss avg.: 0.4627 Grad: 0.2515 LR: 0.00098  
Epoch: [2][800/1929] Elapsed 8m 0s (remain 11m 16s) Loss avg.: 0.4624 Grad: 0.3119 LR: 0.00098  
Epoch: [2][900/1929] Elapsed 8m 59s (remain 10m 15s) Loss avg.: 0.4623 Grad: 0.3561 LR: 0.00098  
Epoch: [2][1000/1929] Elapsed 9m

Epoch 2 - avg_train_loss: 0.4601  avg_val_loss: 0.4565  time: 1190s
Epoch 2 - Accuracy: 0.8025053561278475
Epoch 2 - Save Best Score: 0.8025 Model


Epoch: [3][0/1929] Elapsed 0m 2s (remain 65m 52s) Loss avg.: 0.4413 Grad: 0.2800 LR: 0.00091  
Epoch: [3][100/1929] Elapsed 1m 1s (remain 18m 38s) Loss avg.: 0.4525 Grad: 0.3244 LR: 0.00091  
Epoch: [3][200/1929] Elapsed 2m 1s (remain 17m 25s) Loss avg.: 0.4519 Grad: 0.3165 LR: 0.00091  
Epoch: [3][300/1929] Elapsed 3m 1s (remain 16m 20s) Loss avg.: 0.4516 Grad: 0.2890 LR: 0.00091  
Epoch: [3][400/1929] Elapsed 4m 1s (remain 15m 18s) Loss avg.: 0.4515 Grad: 0.2794 LR: 0.00091  
Epoch: [3][500/1929] Elapsed 5m 0s (remain 14m 17s) Loss avg.: 0.4517 Grad: 0.3532 LR: 0.00091  
Epoch: [3][600/1929] Elapsed 6m 0s (remain 13m 16s) Loss avg.: 0.4514 Grad: 0.2316 LR: 0.00091  
Epoch: [3][700/1929] Elapsed 7m 0s (remain 12m 16s) Loss avg.: 0.4513 Grad: 0.2906 LR: 0.00091  
Epoch: [3][800/1929] Elapsed 8m 0s (remain 11m 16s) Loss avg.: 0.4512 Grad: 0.2520 LR: 0.00091  
Epoch: [3][900/1929] Elapsed 8m 59s (remain 10m 16s) Loss avg.: 0.4511 Grad: 0.2716 LR: 0.00091  
Epoch: [3][1000/1929] Elapsed 9

Epoch 3 - avg_train_loss: 0.4507  avg_val_loss: 0.4519  time: 1190s
Epoch 3 - Accuracy: 0.8048503927827089
Epoch 3 - Save Best Score: 0.8049 Model


Epoch: [4][0/1929] Elapsed 0m 1s (remain 60m 48s) Loss avg.: 0.4413 Grad: 0.2877 LR: 0.00081  
Epoch: [4][100/1929] Elapsed 1m 1s (remain 18m 34s) Loss avg.: 0.4432 Grad: 0.3173 LR: 0.00081  
Epoch: [4][200/1929] Elapsed 2m 1s (remain 17m 22s) Loss avg.: 0.4438 Grad: 0.2349 LR: 0.00081  
Epoch: [4][300/1929] Elapsed 3m 0s (remain 16m 18s) Loss avg.: 0.4435 Grad: 0.3095 LR: 0.00081  
Epoch: [4][400/1929] Elapsed 4m 0s (remain 15m 17s) Loss avg.: 0.4440 Grad: 0.2790 LR: 0.00081  
Epoch: [4][500/1929] Elapsed 5m 0s (remain 14m 16s) Loss avg.: 0.4441 Grad: 0.2568 LR: 0.00081  
Epoch: [4][600/1929] Elapsed 6m 0s (remain 13m 15s) Loss avg.: 0.4441 Grad: 0.2841 LR: 0.00081  
Epoch: [4][700/1929] Elapsed 6m 59s (remain 12m 15s) Loss avg.: 0.4440 Grad: 0.2244 LR: 0.00081  
Epoch: [4][800/1929] Elapsed 7m 59s (remain 11m 15s) Loss avg.: 0.4440 Grad: 0.2448 LR: 0.00081  
Epoch: [4][900/1929] Elapsed 8m 59s (remain 10m 15s) Loss avg.: 0.4442 Grad: 0.2559 LR: 0.00081  
Epoch: [4][1000/1929] Elapsed

Epoch 4 - avg_train_loss: 0.4446  avg_val_loss: 0.4486  time: 1189s
Epoch 4 - Accuracy: 0.8065818430909595
Epoch 4 - Save Best Score: 0.8066 Model


Epoch: [5][0/1929] Elapsed 0m 1s (remain 61m 54s) Loss avg.: 0.4210 Grad: 0.2014 LR: 0.00069  
Epoch: [5][100/1929] Elapsed 1m 1s (remain 18m 38s) Loss avg.: 0.4393 Grad: 0.2184 LR: 0.00069  
Epoch: [5][200/1929] Elapsed 2m 1s (remain 17m 23s) Loss avg.: 0.4392 Grad: 0.2662 LR: 0.00069  
Epoch: [5][300/1929] Elapsed 3m 1s (remain 16m 19s) Loss avg.: 0.4388 Grad: 0.2086 LR: 0.00069  
Epoch: [5][400/1929] Elapsed 4m 0s (remain 15m 17s) Loss avg.: 0.4385 Grad: 0.2093 LR: 0.00069  
Epoch: [5][500/1929] Elapsed 5m 0s (remain 14m 16s) Loss avg.: 0.4384 Grad: 0.2382 LR: 0.00069  
Epoch: [5][600/1929] Elapsed 6m 0s (remain 13m 16s) Loss avg.: 0.4384 Grad: 0.2461 LR: 0.00069  
Epoch: [5][700/1929] Elapsed 7m 0s (remain 12m 16s) Loss avg.: 0.4387 Grad: 0.1902 LR: 0.00069  
Epoch: [5][800/1929] Elapsed 7m 59s (remain 11m 15s) Loss avg.: 0.4392 Grad: 0.2353 LR: 0.00069  
Epoch: [5][900/1929] Elapsed 8m 59s (remain 10m 15s) Loss avg.: 0.4392 Grad: 0.2084 LR: 0.00069  
Epoch: [5][1000/1929] Elapsed 

Epoch 5 - avg_train_loss: 0.4398  avg_val_loss: 0.4454  time: 1189s
Epoch 5 - Accuracy: 0.8080713567399764
Epoch 5 - Save Best Score: 0.8081 Model


Epoch: [6][0/1929] Elapsed 0m 1s (remain 61m 24s) Loss avg.: 0.4383 Grad: 0.2092 LR: 0.00055  
Epoch: [6][100/1929] Elapsed 1m 1s (remain 18m 34s) Loss avg.: 0.4367 Grad: 0.1976 LR: 0.00055  
Epoch: [6][200/1929] Elapsed 2m 1s (remain 17m 22s) Loss avg.: 0.4358 Grad: 0.2336 LR: 0.00055  
Epoch: [6][300/1929] Elapsed 3m 0s (remain 16m 17s) Loss avg.: 0.4352 Grad: 0.2374 LR: 0.00055  
Epoch: [6][400/1929] Elapsed 4m 0s (remain 15m 16s) Loss avg.: 0.4352 Grad: 0.2375 LR: 0.00055  
Epoch: [6][500/1929] Elapsed 5m 0s (remain 14m 15s) Loss avg.: 0.4349 Grad: 0.2950 LR: 0.00055  
Epoch: [6][600/1929] Elapsed 5m 59s (remain 13m 14s) Loss avg.: 0.4349 Grad: 0.2350 LR: 0.00055  
Epoch: [6][700/1929] Elapsed 6m 59s (remain 12m 14s) Loss avg.: 0.4349 Grad: 0.2424 LR: 0.00055  
Epoch: [6][800/1929] Elapsed 7m 59s (remain 11m 14s) Loss avg.: 0.4349 Grad: 0.1953 LR: 0.00055  
Epoch: [6][900/1929] Elapsed 8m 58s (remain 10m 14s) Loss avg.: 0.4350 Grad: 0.2189 LR: 0.00055  
Epoch: [6][1000/1929] Elapse

Epoch 6 - avg_train_loss: 0.4355  avg_val_loss: 0.4430  time: 1188s
Epoch 6 - Accuracy: 0.8093233060790229
Epoch 6 - Save Best Score: 0.8093 Model


Epoch: [7][0/1929] Elapsed 0m 1s (remain 61m 11s) Loss avg.: 0.4234 Grad: 0.2873 LR: 0.00041  
Epoch: [7][100/1929] Elapsed 1m 1s (remain 18m 36s) Loss avg.: 0.4300 Grad: 0.2432 LR: 0.00041  
Epoch: [7][200/1929] Elapsed 2m 1s (remain 17m 23s) Loss avg.: 0.4298 Grad: 0.1963 LR: 0.00041  
Epoch: [7][300/1929] Elapsed 3m 1s (remain 16m 20s) Loss avg.: 0.4297 Grad: 0.2611 LR: 0.00041  
Epoch: [7][400/1929] Elapsed 4m 0s (remain 15m 17s) Loss avg.: 0.4303 Grad: 0.1887 LR: 0.00041  
Epoch: [7][500/1929] Elapsed 5m 0s (remain 14m 16s) Loss avg.: 0.4301 Grad: 0.1754 LR: 0.00041  
Epoch: [7][600/1929] Elapsed 6m 0s (remain 13m 15s) Loss avg.: 0.4302 Grad: 0.1958 LR: 0.00041  
Epoch: [7][700/1929] Elapsed 6m 59s (remain 12m 15s) Loss avg.: 0.4303 Grad: 0.2239 LR: 0.00041  
Epoch: [7][800/1929] Elapsed 7m 59s (remain 11m 15s) Loss avg.: 0.4305 Grad: 0.2598 LR: 0.00041  
Epoch: [7][900/1929] Elapsed 8m 59s (remain 10m 15s) Loss avg.: 0.4306 Grad: 0.2092 LR: 0.00041  
Epoch: [7][1000/1929] Elapsed

Epoch 7 - avg_train_loss: 0.4313  avg_val_loss: 0.4418  time: 1188s
Epoch 7 - Accuracy: 0.8095069447480798
Epoch 7 - Save Best Score: 0.8095 Model


Epoch: [8][0/1929] Elapsed 0m 1s (remain 62m 7s) Loss avg.: 0.4219 Grad: 0.2655 LR: 0.00029  
Epoch: [8][100/1929] Elapsed 1m 1s (remain 18m 37s) Loss avg.: 0.4278 Grad: 0.2337 LR: 0.00029  
Epoch: [8][200/1929] Elapsed 2m 1s (remain 17m 23s) Loss avg.: 0.4276 Grad: 0.2238 LR: 0.00029  
Epoch: [8][300/1929] Elapsed 3m 1s (remain 16m 19s) Loss avg.: 0.4266 Grad: 0.2057 LR: 0.00029  
Epoch: [8][400/1929] Elapsed 4m 0s (remain 15m 17s) Loss avg.: 0.4262 Grad: 0.2234 LR: 0.00029  
Epoch: [8][500/1929] Elapsed 5m 0s (remain 14m 17s) Loss avg.: 0.4261 Grad: 0.1954 LR: 0.00029  
Epoch: [8][600/1929] Elapsed 6m 0s (remain 13m 16s) Loss avg.: 0.4264 Grad: 0.2146 LR: 0.00029  
Epoch: [8][700/1929] Elapsed 7m 0s (remain 12m 16s) Loss avg.: 0.4264 Grad: 0.3155 LR: 0.00029  
Epoch: [8][800/1929] Elapsed 7m 59s (remain 11m 15s) Loss avg.: 0.4266 Grad: 0.2954 LR: 0.00029  
Epoch: [8][900/1929] Elapsed 8m 59s (remain 10m 15s) Loss avg.: 0.4266 Grad: 0.2034 LR: 0.00029  
Epoch: [8][1000/1929] Elapsed 9

Epoch 8 - avg_train_loss: 0.4274  avg_val_loss: 0.4401  time: 1189s
Epoch 8 - Accuracy: 0.8104965531313308
Epoch 8 - Save Best Score: 0.8105 Model


Epoch: [9][0/1929] Elapsed 0m 1s (remain 62m 4s) Loss avg.: 0.4427 Grad: 0.2789 LR: 0.00019  
Epoch: [9][100/1929] Elapsed 1m 1s (remain 18m 37s) Loss avg.: 0.4231 Grad: 0.2113 LR: 0.00019  
Epoch: [9][200/1929] Elapsed 2m 1s (remain 17m 24s) Loss avg.: 0.4229 Grad: 0.2125 LR: 0.00019  
Epoch: [9][300/1929] Elapsed 3m 1s (remain 16m 20s) Loss avg.: 0.4225 Grad: 0.2365 LR: 0.00019  
Epoch: [9][400/1929] Elapsed 4m 0s (remain 15m 17s) Loss avg.: 0.4226 Grad: 0.2511 LR: 0.00019  
Epoch: [9][500/1929] Elapsed 5m 0s (remain 14m 16s) Loss avg.: 0.4229 Grad: 0.2613 LR: 0.00019  
Epoch: [9][600/1929] Elapsed 6m 0s (remain 13m 15s) Loss avg.: 0.4233 Grad: 0.3050 LR: 0.00019  
Epoch: [9][700/1929] Elapsed 7m 0s (remain 12m 15s) Loss avg.: 0.4236 Grad: 0.1965 LR: 0.00019  
Epoch: [9][800/1929] Elapsed 7m 59s (remain 11m 15s) Loss avg.: 0.4236 Grad: 0.2367 LR: 0.00019  
Epoch: [9][900/1929] Elapsed 8m 59s (remain 10m 15s) Loss avg.: 0.4237 Grad: 0.2290 LR: 0.00019  
Epoch: [9][1000/1929] Elapsed 9

Epoch 9 - avg_train_loss: 0.4240  avg_val_loss: 0.4381  time: 1188s
Epoch 9 - Accuracy: 0.8113447888884031
Epoch 9 - Save Best Score: 0.8113 Model


Epoch: [10][0/1929] Elapsed 0m 2s (remain 65m 27s) Loss avg.: 0.4409 Grad: 0.2264 LR: 0.00012  
Epoch: [10][100/1929] Elapsed 1m 1s (remain 18m 42s) Loss avg.: 0.4211 Grad: 0.2031 LR: 0.00012  
Epoch: [10][200/1929] Elapsed 2m 1s (remain 17m 25s) Loss avg.: 0.4205 Grad: 0.2336 LR: 0.00012  
Epoch: [10][300/1929] Elapsed 3m 1s (remain 16m 19s) Loss avg.: 0.4210 Grad: 0.2344 LR: 0.00012  
Epoch: [10][400/1929] Elapsed 4m 0s (remain 15m 17s) Loss avg.: 0.4211 Grad: 0.2431 LR: 0.00012  
Epoch: [10][500/1929] Elapsed 5m 0s (remain 14m 16s) Loss avg.: 0.4209 Grad: 0.2242 LR: 0.00012  
Epoch: [10][600/1929] Elapsed 6m 0s (remain 13m 15s) Loss avg.: 0.4211 Grad: 0.2572 LR: 0.00012  
Epoch: [10][700/1929] Elapsed 6m 59s (remain 12m 15s) Loss avg.: 0.4213 Grad: 0.2452 LR: 0.00012  
Epoch: [10][800/1929] Elapsed 7m 59s (remain 11m 15s) Loss avg.: 0.4214 Grad: 0.2292 LR: 0.00012  
Epoch: [10][900/1929] Elapsed 8m 59s (remain 10m 15s) Loss avg.: 0.4214 Grad: 0.1952 LR: 0.00012  
Epoch: [10][1000/19

Epoch 10 - avg_train_loss: 0.4214  avg_val_loss: 0.4378  time: 1191s
Epoch 10 - Accuracy: 0.8115940127964089
Epoch 10 - Save Best Score: 0.8116 Model
Epoch 10 - Save final model
Score: 0.81159


Failed to load pre-train weight.
Epoch: [1][0/1929] Elapsed 0m 2s (remain 65m 51s) Loss avg.: 1.3869 Grad: 0.3003 LR: 0.00100  
Epoch: [1][100/1929] Elapsed 1m 1s (remain 18m 39s) Loss avg.: 0.6980 Grad: 0.8860 LR: 0.00100  
Epoch: [1][200/1929] Elapsed 2m 1s (remain 17m 26s) Loss avg.: 0.6359 Grad: 0.4106 LR: 0.00100  
Epoch: [1][300/1929] Elapsed 3m 1s (remain 16m 23s) Loss avg.: 0.6057 Grad: 0.7039 LR: 0.00100  
Epoch: [1][400/1929] Elapsed 4m 1s (remain 15m 20s) Loss avg.: 0.5859 Grad: 0.8494 LR: 0.00100  
Epoch: [1][500/1929] Elapsed 5m 1s (remain 14m 18s) Loss avg.: 0.5717 Grad: 0.5171 LR: 0.00100  
Epoch: [1][600/1929] Elapsed 6m 1s (remain 13m 18s) Loss avg.: 0.5604 Grad: 0.4090 LR: 0.00100  
Epoch: [1][700/1929] Elapsed 7m 1s (remain 12m 17s) Loss avg.: 0.5512 Grad: 0.4151 LR: 0.00100  
Epoch: [1][800/1929] Elapsed 8m 0s (remain 11m 17s) Loss avg.: 0.5442 Grad: 0.5265 LR: 0.00100  
Epoch: [1][900/1929] Elapsed 9m 0s (remain 10m 16s) Loss avg.: 0.5381 Grad: 0.6787 LR: 0.00100  

Epoch 1 - avg_train_loss: 0.5062  avg_val_loss: 0.4723  time: 1192s
Epoch 1 - Accuracy: 0.7949193301560928
Epoch 1 - Save Best Score: 0.7949 Model


Epoch: [2][0/1929] Elapsed 0m 1s (remain 61m 23s) Loss avg.: 0.4560 Grad: 0.3711 LR: 0.00098  
Epoch: [2][100/1929] Elapsed 1m 1s (remain 18m 36s) Loss avg.: 0.4678 Grad: 0.3413 LR: 0.00098  
Epoch: [2][200/1929] Elapsed 2m 1s (remain 17m 23s) Loss avg.: 0.4672 Grad: 0.3252 LR: 0.00098  
Epoch: [2][300/1929] Elapsed 3m 1s (remain 16m 20s) Loss avg.: 0.4666 Grad: 0.4059 LR: 0.00098  
Epoch: [2][400/1929] Elapsed 4m 0s (remain 15m 18s) Loss avg.: 0.4663 Grad: 0.3320 LR: 0.00098  
Epoch: [2][500/1929] Elapsed 5m 0s (remain 14m 17s) Loss avg.: 0.4658 Grad: 0.3177 LR: 0.00098  
Epoch: [2][600/1929] Elapsed 6m 0s (remain 13m 16s) Loss avg.: 0.4658 Grad: 0.4430 LR: 0.00098  
Epoch: [2][700/1929] Elapsed 7m 0s (remain 12m 16s) Loss avg.: 0.4655 Grad: 0.4419 LR: 0.00098  
Epoch: [2][800/1929] Elapsed 8m 0s (remain 11m 16s) Loss avg.: 0.4653 Grad: 0.4068 LR: 0.00098  
Epoch: [2][900/1929] Elapsed 8m 59s (remain 10m 15s) Loss avg.: 0.4649 Grad: 0.3262 LR: 0.00098  
Epoch: [2][1000/1929] Elapsed 9

Epoch 2 - avg_train_loss: 0.4620  avg_val_loss: 0.4563  time: 1191s
Epoch 2 - Accuracy: 0.8037573054668941
Epoch 2 - Save Best Score: 0.8038 Model


Epoch: [3][0/1929] Elapsed 0m 1s (remain 60m 59s) Loss avg.: 0.4599 Grad: 0.2636 LR: 0.00091  
Epoch: [3][100/1929] Elapsed 1m 1s (remain 18m 38s) Loss avg.: 0.4525 Grad: 0.2602 LR: 0.00091  
Epoch: [3][200/1929] Elapsed 2m 1s (remain 17m 26s) Loss avg.: 0.4529 Grad: 0.2537 LR: 0.00091  
Epoch: [3][300/1929] Elapsed 3m 1s (remain 16m 22s) Loss avg.: 0.4534 Grad: 0.2466 LR: 0.00091  
Epoch: [3][400/1929] Elapsed 4m 1s (remain 15m 19s) Loss avg.: 0.4533 Grad: 0.4052 LR: 0.00091  
Epoch: [3][500/1929] Elapsed 5m 1s (remain 14m 18s) Loss avg.: 0.4533 Grad: 0.2602 LR: 0.00091  
Epoch: [3][600/1929] Elapsed 6m 1s (remain 13m 18s) Loss avg.: 0.4533 Grad: 0.2338 LR: 0.00091  
Epoch: [3][700/1929] Elapsed 7m 1s (remain 12m 17s) Loss avg.: 0.4532 Grad: 0.2247 LR: 0.00091  
Epoch: [3][800/1929] Elapsed 8m 0s (remain 11m 17s) Loss avg.: 0.4530 Grad: 0.3154 LR: 0.00091  
Epoch: [3][900/1929] Elapsed 9m 0s (remain 10m 16s) Loss avg.: 0.4530 Grad: 0.4163 LR: 0.00091  
Epoch: [3][1000/1929] Elapsed 10

Epoch 3 - avg_train_loss: 0.4521  avg_val_loss: 0.4543  time: 1192s
Epoch 3 - Accuracy: 0.8041726786469037
Epoch 3 - Save Best Score: 0.8042 Model


Epoch: [4][0/1929] Elapsed 0m 1s (remain 60m 23s) Loss avg.: 0.4429 Grad: 0.2705 LR: 0.00081  
Epoch: [4][100/1929] Elapsed 1m 1s (remain 18m 36s) Loss avg.: 0.4456 Grad: 0.2432 LR: 0.00081  
Epoch: [4][200/1929] Elapsed 2m 1s (remain 17m 23s) Loss avg.: 0.4460 Grad: 0.2578 LR: 0.00081  
Epoch: [4][300/1929] Elapsed 3m 1s (remain 16m 20s) Loss avg.: 0.4461 Grad: 0.2875 LR: 0.00081  
Epoch: [4][400/1929] Elapsed 4m 1s (remain 15m 18s) Loss avg.: 0.4459 Grad: 0.2552 LR: 0.00081  
Epoch: [4][500/1929] Elapsed 5m 0s (remain 14m 17s) Loss avg.: 0.4465 Grad: 0.3055 LR: 0.00081  
Epoch: [4][600/1929] Elapsed 6m 0s (remain 13m 16s) Loss avg.: 0.4464 Grad: 0.2168 LR: 0.00081  
Epoch: [4][700/1929] Elapsed 7m 0s (remain 12m 17s) Loss avg.: 0.4461 Grad: 0.2015 LR: 0.00081  
Epoch: [4][800/1929] Elapsed 8m 0s (remain 11m 16s) Loss avg.: 0.4462 Grad: 0.2521 LR: 0.00081  
Epoch: [4][900/1929] Elapsed 9m 0s (remain 10m 16s) Loss avg.: 0.4460 Grad: 0.2570 LR: 0.00081  
Epoch: [4][1000/1929] Elapsed 10

Epoch 4 - avg_train_loss: 0.4458  avg_val_loss: 0.4495  time: 1191s
Epoch 4 - Accuracy: 0.806899567137423
Epoch 4 - Save Best Score: 0.8069 Model


Epoch: [5][0/1929] Elapsed 0m 2s (remain 64m 51s) Loss avg.: 0.4253 Grad: 0.3395 LR: 0.00069  
Epoch: [5][100/1929] Elapsed 1m 2s (remain 18m 43s) Loss avg.: 0.4394 Grad: 0.2847 LR: 0.00069  
Epoch: [5][200/1929] Elapsed 2m 1s (remain 17m 28s) Loss avg.: 0.4400 Grad: 0.2470 LR: 0.00069  
Epoch: [5][300/1929] Elapsed 3m 1s (remain 16m 22s) Loss avg.: 0.4397 Grad: 0.1891 LR: 0.00069  
Epoch: [5][400/1929] Elapsed 4m 1s (remain 15m 19s) Loss avg.: 0.4406 Grad: 0.2821 LR: 0.00069  
Epoch: [5][500/1929] Elapsed 5m 1s (remain 14m 18s) Loss avg.: 0.4406 Grad: 0.2125 LR: 0.00069  
Epoch: [5][600/1929] Elapsed 6m 0s (remain 13m 17s) Loss avg.: 0.4405 Grad: 0.1997 LR: 0.00069  
Epoch: [5][700/1929] Elapsed 7m 0s (remain 12m 17s) Loss avg.: 0.4405 Grad: 0.3826 LR: 0.00069  
Epoch: [5][800/1929] Elapsed 8m 0s (remain 11m 16s) Loss avg.: 0.4407 Grad: 0.2437 LR: 0.00069  
Epoch: [5][900/1929] Elapsed 9m 0s (remain 10m 16s) Loss avg.: 0.4408 Grad: 0.2982 LR: 0.00069  
Epoch: [5][1000/1929] Elapsed 10

Epoch 5 - avg_train_loss: 0.4409  avg_val_loss: 0.4449  time: 1191s
Epoch 5 - Accuracy: 0.8088219433635025
Epoch 5 - Save Best Score: 0.8088 Model


Epoch: [6][0/1929] Elapsed 0m 1s (remain 60m 48s) Loss avg.: 0.4311 Grad: 0.2242 LR: 0.00055  
Epoch: [6][100/1929] Elapsed 1m 1s (remain 18m 37s) Loss avg.: 0.4366 Grad: 0.2381 LR: 0.00055  
Epoch: [6][200/1929] Elapsed 2m 1s (remain 17m 25s) Loss avg.: 0.4362 Grad: 0.2251 LR: 0.00055  
Epoch: [6][300/1929] Elapsed 3m 1s (remain 16m 22s) Loss avg.: 0.4356 Grad: 0.2337 LR: 0.00055  
Epoch: [6][400/1929] Elapsed 4m 1s (remain 15m 20s) Loss avg.: 0.4354 Grad: 0.2343 LR: 0.00055  
Epoch: [6][500/1929] Elapsed 5m 1s (remain 14m 18s) Loss avg.: 0.4352 Grad: 0.2299 LR: 0.00055  
Epoch: [6][600/1929] Elapsed 6m 1s (remain 13m 17s) Loss avg.: 0.4353 Grad: 0.1862 LR: 0.00055  
Epoch: [6][700/1929] Elapsed 7m 0s (remain 12m 17s) Loss avg.: 0.4356 Grad: 0.1967 LR: 0.00055  
Epoch: [6][800/1929] Elapsed 8m 0s (remain 11m 16s) Loss avg.: 0.4358 Grad: 0.1828 LR: 0.00055  
Epoch: [6][900/1929] Elapsed 9m 0s (remain 10m 16s) Loss avg.: 0.4360 Grad: 0.2600 LR: 0.00055  
Epoch: [6][1000/1929] Elapsed 10

Epoch 6 - avg_train_loss: 0.4365  avg_val_loss: 0.4421  time: 1192s
Epoch 6 - Accuracy: 0.80988296678472
Epoch 6 - Save Best Score: 0.8099 Model


Epoch: [7][0/1929] Elapsed 0m 1s (remain 60m 49s) Loss avg.: 0.4459 Grad: 0.2019 LR: 0.00041  
Epoch: [7][100/1929] Elapsed 1m 1s (remain 18m 37s) Loss avg.: 0.4334 Grad: 0.2152 LR: 0.00041  
Epoch: [7][200/1929] Elapsed 2m 1s (remain 17m 24s) Loss avg.: 0.4317 Grad: 0.2140 LR: 0.00041  
Epoch: [7][300/1929] Elapsed 3m 1s (remain 16m 20s) Loss avg.: 0.4316 Grad: 0.2248 LR: 0.00041  
Epoch: [7][400/1929] Elapsed 4m 1s (remain 15m 19s) Loss avg.: 0.4314 Grad: 0.2858 LR: 0.00041  
Epoch: [7][500/1929] Elapsed 5m 1s (remain 14m 18s) Loss avg.: 0.4318 Grad: 0.2242 LR: 0.00041  
Epoch: [7][600/1929] Elapsed 6m 0s (remain 13m 17s) Loss avg.: 0.4315 Grad: 0.1895 LR: 0.00041  
Epoch: [7][700/1929] Elapsed 7m 0s (remain 12m 17s) Loss avg.: 0.4315 Grad: 0.2187 LR: 0.00041  
Epoch: [7][800/1929] Elapsed 8m 0s (remain 11m 16s) Loss avg.: 0.4316 Grad: 0.2254 LR: 0.00041  
Epoch: [7][900/1929] Elapsed 9m 0s (remain 10m 16s) Loss avg.: 0.4318 Grad: 0.2874 LR: 0.00041  
Epoch: [7][1000/1929] Elapsed 10

Epoch 7 - avg_train_loss: 0.4323  avg_val_loss: 0.4417  time: 1192s
Epoch 7 - Accuracy: 0.8101278183434626
Epoch 7 - Save Best Score: 0.8101 Model


Epoch: [8][0/1929] Elapsed 0m 1s (remain 61m 13s) Loss avg.: 0.4325 Grad: 0.2310 LR: 0.00029  
Epoch: [8][100/1929] Elapsed 1m 1s (remain 18m 37s) Loss avg.: 0.4272 Grad: 0.2521 LR: 0.00029  
Epoch: [8][200/1929] Elapsed 2m 1s (remain 17m 24s) Loss avg.: 0.4277 Grad: 0.2314 LR: 0.00029  
Epoch: [8][300/1929] Elapsed 3m 1s (remain 16m 20s) Loss avg.: 0.4280 Grad: 0.2190 LR: 0.00029  
Epoch: [8][400/1929] Elapsed 4m 1s (remain 15m 18s) Loss avg.: 0.4280 Grad: 0.2500 LR: 0.00029  
Epoch: [8][500/1929] Elapsed 5m 1s (remain 14m 18s) Loss avg.: 0.4278 Grad: 0.2763 LR: 0.00029  
Epoch: [8][600/1929] Elapsed 6m 0s (remain 13m 17s) Loss avg.: 0.4278 Grad: 0.2039 LR: 0.00029  
Epoch: [8][700/1929] Elapsed 7m 0s (remain 12m 17s) Loss avg.: 0.4277 Grad: 0.2580 LR: 0.00029  
Epoch: [8][800/1929] Elapsed 8m 0s (remain 11m 16s) Loss avg.: 0.4279 Grad: 0.2383 LR: 0.00029  
Epoch: [8][900/1929] Elapsed 9m 0s (remain 10m 16s) Loss avg.: 0.4280 Grad: 0.2198 LR: 0.00029  
Epoch: [8][1000/1929] Elapsed 10

Epoch 8 - avg_train_loss: 0.4283  avg_val_loss: 0.4398  time: 1192s
Epoch 8 - Accuracy: 0.811101394779415
Epoch 8 - Save Best Score: 0.8111 Model


Epoch: [9][0/1929] Elapsed 0m 1s (remain 61m 11s) Loss avg.: 0.4329 Grad: 0.2296 LR: 0.00019  
Epoch: [9][100/1929] Elapsed 1m 1s (remain 18m 38s) Loss avg.: 0.4232 Grad: 0.2926 LR: 0.00019  
Epoch: [9][200/1929] Elapsed 2m 1s (remain 17m 25s) Loss avg.: 0.4237 Grad: 0.2409 LR: 0.00019  
Epoch: [9][300/1929] Elapsed 3m 1s (remain 16m 21s) Loss avg.: 0.4235 Grad: 0.2441 LR: 0.00019  
Epoch: [9][400/1929] Elapsed 4m 1s (remain 15m 19s) Loss avg.: 0.4238 Grad: 0.2270 LR: 0.00019  
Epoch: [9][500/1929] Elapsed 5m 1s (remain 14m 18s) Loss avg.: 0.4242 Grad: 0.2186 LR: 0.00019  
Epoch: [9][600/1929] Elapsed 6m 0s (remain 13m 17s) Loss avg.: 0.4241 Grad: 0.2221 LR: 0.00019  
Epoch: [9][700/1929] Elapsed 7m 0s (remain 12m 17s) Loss avg.: 0.4242 Grad: 0.2704 LR: 0.00019  
Epoch: [9][800/1929] Elapsed 8m 0s (remain 11m 16s) Loss avg.: 0.4243 Grad: 0.2119 LR: 0.00019  
Epoch: [9][900/1929] Elapsed 9m 0s (remain 10m 16s) Loss avg.: 0.4245 Grad: 0.2131 LR: 0.00019  
Epoch: [9][1000/1929] Elapsed 10

Epoch 9 - avg_train_loss: 0.4249  avg_val_loss: 0.4386  time: 1191s
Epoch 9 - Accuracy: 0.8118199175083439
Epoch 9 - Save Best Score: 0.8118 Model


Epoch: [10][0/1929] Elapsed 0m 2s (remain 65m 23s) Loss avg.: 0.4222 Grad: 0.2594 LR: 0.00012  
Epoch: [10][100/1929] Elapsed 1m 2s (remain 18m 44s) Loss avg.: 0.4217 Grad: 0.2038 LR: 0.00012  
Epoch: [10][200/1929] Elapsed 2m 1s (remain 17m 28s) Loss avg.: 0.4221 Grad: 0.2503 LR: 0.00012  
Epoch: [10][300/1929] Elapsed 3m 1s (remain 16m 23s) Loss avg.: 0.4221 Grad: 0.2734 LR: 0.00012  
Epoch: [10][400/1929] Elapsed 4m 1s (remain 15m 20s) Loss avg.: 0.4224 Grad: 0.2295 LR: 0.00012  
Epoch: [10][500/1929] Elapsed 5m 1s (remain 14m 19s) Loss avg.: 0.4224 Grad: 0.2464 LR: 0.00012  
Epoch: [10][600/1929] Elapsed 6m 1s (remain 13m 18s) Loss avg.: 0.4221 Grad: 0.2373 LR: 0.00012  
Epoch: [10][700/1929] Elapsed 7m 1s (remain 12m 17s) Loss avg.: 0.4219 Grad: 0.2260 LR: 0.00012  
Epoch: [10][800/1929] Elapsed 8m 1s (remain 11m 17s) Loss avg.: 0.4220 Grad: 0.2484 LR: 0.00012  
Epoch: [10][900/1929] Elapsed 9m 0s (remain 10m 17s) Loss avg.: 0.4221 Grad: 0.2325 LR: 0.00012  
Epoch: [10][1000/1929]

Epoch 10 - avg_train_loss: 0.4222  avg_val_loss: 0.4378  time: 1192s
Epoch 10 - Accuracy: 0.8122178012913005
Epoch 10 - Save Best Score: 0.8122 Model
Epoch 10 - Save final model
Score: 0.81222


Failed to load pre-train weight.
Epoch: [1][0/1929] Elapsed 0m 2s (remain 64m 54s) Loss avg.: 1.3959 Grad: 0.3511 LR: 0.00100  
Epoch: [1][100/1929] Elapsed 1m 1s (remain 18m 38s) Loss avg.: 0.6997 Grad: 0.5567 LR: 0.00100  
Epoch: [1][200/1929] Elapsed 2m 1s (remain 17m 27s) Loss avg.: 0.6347 Grad: 0.4003 LR: 0.00100  
Epoch: [1][300/1929] Elapsed 3m 1s (remain 16m 22s) Loss avg.: 0.6027 Grad: 0.5110 LR: 0.00100  
Epoch: [1][400/1929] Elapsed 4m 1s (remain 15m 20s) Loss avg.: 0.5820 Grad: 0.5164 LR: 0.00100  
Epoch: [1][500/1929] Elapsed 5m 1s (remain 14m 18s) Loss avg.: 0.5677 Grad: 0.7972 LR: 0.00100  
Epoch: [1][600/1929] Elapsed 6m 1s (remain 13m 17s) Loss avg.: 0.5568 Grad: 0.5677 LR: 0.00100  
Epoch: [1][700/1929] Elapsed 7m 0s (remain 12m 17s) Loss avg.: 0.5482 Grad: 0.5420 LR: 0.00100  
Epoch: [1][800/1929] Elapsed 8m 0s (remain 11m 16s) Loss avg.: 0.5411 Grad: 0.4672 LR: 0.00100  
Epoch: [1][900/1929] Elapsed 9m 0s (remain 10m 16s) Loss avg.: 0.5351 Grad: 0.4059 LR: 0.00100  

Epoch 1 - avg_train_loss: 0.5032  avg_val_loss: 0.4754  time: 1192s
Epoch 1 - Accuracy: 0.7932210998223366
Epoch 1 - Save Best Score: 0.7932 Model


Epoch: [2][0/1929] Elapsed 0m 1s (remain 60m 56s) Loss avg.: 0.4606 Grad: 0.4414 LR: 0.00098  
Epoch: [2][100/1929] Elapsed 1m 1s (remain 18m 38s) Loss avg.: 0.4664 Grad: 0.3832 LR: 0.00098  
Epoch: [2][200/1929] Elapsed 2m 1s (remain 17m 24s) Loss avg.: 0.4647 Grad: 0.2859 LR: 0.00098  
Epoch: [2][300/1929] Elapsed 3m 1s (remain 16m 22s) Loss avg.: 0.4636 Grad: 0.3775 LR: 0.00098  
Epoch: [2][400/1929] Elapsed 4m 1s (remain 15m 19s) Loss avg.: 0.4637 Grad: 0.3639 LR: 0.00098  
Epoch: [2][500/1929] Elapsed 5m 1s (remain 14m 18s) Loss avg.: 0.4634 Grad: 0.3593 LR: 0.00098  
Epoch: [2][600/1929] Elapsed 6m 0s (remain 13m 17s) Loss avg.: 0.4635 Grad: 0.2995 LR: 0.00098  
Epoch: [2][700/1929] Elapsed 7m 0s (remain 12m 16s) Loss avg.: 0.4635 Grad: 0.2810 LR: 0.00098  
Epoch: [2][800/1929] Elapsed 8m 0s (remain 11m 16s) Loss avg.: 0.4633 Grad: 0.2489 LR: 0.00098  
Epoch: [2][900/1929] Elapsed 9m 0s (remain 10m 16s) Loss avg.: 0.4629 Grad: 0.3020 LR: 0.00098  
Epoch: [2][1000/1929] Elapsed 9m

Epoch 2 - avg_train_loss: 0.4604  avg_val_loss: 0.4614  time: 1191s
Epoch 2 - Accuracy: 0.800737179160187
Epoch 2 - Save Best Score: 0.8007 Model


Epoch: [3][0/1929] Elapsed 0m 1s (remain 61m 33s) Loss avg.: 0.4587 Grad: 0.3018 LR: 0.00091  
Epoch: [3][100/1929] Elapsed 1m 1s (remain 18m 37s) Loss avg.: 0.4520 Grad: 0.2264 LR: 0.00091  
Epoch: [3][200/1929] Elapsed 2m 1s (remain 17m 24s) Loss avg.: 0.4516 Grad: 0.2366 LR: 0.00091  
Epoch: [3][300/1929] Elapsed 3m 1s (remain 16m 20s) Loss avg.: 0.4517 Grad: 0.2877 LR: 0.00091  
Epoch: [3][400/1929] Elapsed 4m 1s (remain 15m 18s) Loss avg.: 0.4519 Grad: 0.2606 LR: 0.00091  
Epoch: [3][500/1929] Elapsed 5m 1s (remain 14m 18s) Loss avg.: 0.4518 Grad: 0.2236 LR: 0.00091  
Epoch: [3][600/1929] Elapsed 6m 0s (remain 13m 17s) Loss avg.: 0.4521 Grad: 0.2623 LR: 0.00091  
Epoch: [3][700/1929] Elapsed 7m 0s (remain 12m 16s) Loss avg.: 0.4519 Grad: 0.2457 LR: 0.00091  
Epoch: [3][800/1929] Elapsed 8m 0s (remain 11m 16s) Loss avg.: 0.4517 Grad: 0.3242 LR: 0.00091  
Epoch: [3][900/1929] Elapsed 9m 0s (remain 10m 16s) Loss avg.: 0.4517 Grad: 0.2220 LR: 0.00091  
Epoch: [3][1000/1929] Elapsed 9m

Epoch 3 - avg_train_loss: 0.4509  avg_val_loss: 0.4525  time: 1191s
Epoch 3 - Accuracy: 0.8053092057033007
Epoch 3 - Save Best Score: 0.8053 Model


Epoch: [4][0/1929] Elapsed 0m 1s (remain 60m 42s) Loss avg.: 0.4341 Grad: 0.2787 LR: 0.00081  
Epoch: [4][100/1929] Elapsed 1m 1s (remain 18m 37s) Loss avg.: 0.4445 Grad: 0.2130 LR: 0.00081  
Epoch: [4][200/1929] Elapsed 2m 1s (remain 17m 24s) Loss avg.: 0.4449 Grad: 0.2815 LR: 0.00081  
Epoch: [4][300/1929] Elapsed 3m 1s (remain 16m 20s) Loss avg.: 0.4455 Grad: 0.2803 LR: 0.00081  
Epoch: [4][400/1929] Elapsed 4m 1s (remain 15m 18s) Loss avg.: 0.4447 Grad: 0.2228 LR: 0.00081  
Epoch: [4][500/1929] Elapsed 5m 0s (remain 14m 17s) Loss avg.: 0.4447 Grad: 0.1862 LR: 0.00081  
Epoch: [4][600/1929] Elapsed 6m 0s (remain 13m 17s) Loss avg.: 0.4448 Grad: 0.2231 LR: 0.00081  
Epoch: [4][700/1929] Elapsed 7m 0s (remain 12m 17s) Loss avg.: 0.4451 Grad: 0.3238 LR: 0.00081  
Epoch: [4][800/1929] Elapsed 8m 0s (remain 11m 16s) Loss avg.: 0.4453 Grad: 0.3071 LR: 0.00081  
Epoch: [4][900/1929] Elapsed 9m 0s (remain 10m 16s) Loss avg.: 0.4452 Grad: 0.3765 LR: 0.00081  
Epoch: [4][1000/1929] Elapsed 10

Epoch 4 - avg_train_loss: 0.4449  avg_val_loss: 0.4486  time: 1191s
Epoch 4 - Accuracy: 0.8068016364269692
Epoch 4 - Save Best Score: 0.8068 Model


Epoch: [5][0/1929] Elapsed 0m 1s (remain 61m 18s) Loss avg.: 0.4438 Grad: 0.2460 LR: 0.00069  
Epoch: [5][100/1929] Elapsed 1m 1s (remain 18m 37s) Loss avg.: 0.4398 Grad: 0.2175 LR: 0.00069  
Epoch: [5][200/1929] Elapsed 2m 1s (remain 17m 25s) Loss avg.: 0.4400 Grad: 0.1983 LR: 0.00069  
Epoch: [5][300/1929] Elapsed 3m 1s (remain 16m 21s) Loss avg.: 0.4393 Grad: 0.2246 LR: 0.00069  
Epoch: [5][400/1929] Elapsed 4m 1s (remain 15m 19s) Loss avg.: 0.4394 Grad: 0.2327 LR: 0.00069  
Epoch: [5][500/1929] Elapsed 5m 1s (remain 14m 18s) Loss avg.: 0.4398 Grad: 0.2030 LR: 0.00069  
Epoch: [5][600/1929] Elapsed 6m 0s (remain 13m 17s) Loss avg.: 0.4396 Grad: 0.1965 LR: 0.00069  
Epoch: [5][700/1929] Elapsed 7m 0s (remain 12m 17s) Loss avg.: 0.4397 Grad: 0.2407 LR: 0.00069  
Epoch: [5][800/1929] Elapsed 8m 0s (remain 11m 17s) Loss avg.: 0.4398 Grad: 0.2513 LR: 0.00069  
Epoch: [5][900/1929] Elapsed 9m 0s (remain 10m 16s) Loss avg.: 0.4398 Grad: 0.2228 LR: 0.00069  
Epoch: [5][1000/1929] Elapsed 10

Epoch 5 - avg_train_loss: 0.4400  avg_val_loss: 0.4456  time: 1192s
Epoch 5 - Accuracy: 0.8084645890204321
Epoch 5 - Save Best Score: 0.8085 Model


Epoch: [6][0/1929] Elapsed 0m 2s (remain 64m 53s) Loss avg.: 0.4356 Grad: 0.1790 LR: 0.00055  
Epoch: [6][100/1929] Elapsed 1m 2s (remain 18m 42s) Loss avg.: 0.4341 Grad: 0.2654 LR: 0.00055  
Epoch: [6][200/1929] Elapsed 2m 1s (remain 17m 27s) Loss avg.: 0.4345 Grad: 0.2178 LR: 0.00055  
Epoch: [6][300/1929] Elapsed 3m 1s (remain 16m 22s) Loss avg.: 0.4340 Grad: 0.2113 LR: 0.00055  
Epoch: [6][400/1929] Elapsed 4m 1s (remain 15m 20s) Loss avg.: 0.4344 Grad: 0.2443 LR: 0.00055  
Epoch: [6][500/1929] Elapsed 5m 1s (remain 14m 18s) Loss avg.: 0.4350 Grad: 0.2371 LR: 0.00055  
Epoch: [6][600/1929] Elapsed 6m 1s (remain 13m 18s) Loss avg.: 0.4348 Grad: 0.2869 LR: 0.00055  
Epoch: [6][700/1929] Elapsed 7m 1s (remain 12m 17s) Loss avg.: 0.4351 Grad: 0.2057 LR: 0.00055  
Epoch: [6][800/1929] Elapsed 8m 0s (remain 11m 17s) Loss avg.: 0.4350 Grad: 0.2003 LR: 0.00055  
Epoch: [6][900/1929] Elapsed 9m 1s (remain 10m 17s) Loss avg.: 0.4351 Grad: 0.2062 LR: 0.00055  
Epoch: [6][1000/1929] Elapsed 10

Epoch 6 - avg_train_loss: 0.4356  avg_val_loss: 0.4430  time: 1191s
Epoch 6 - Accuracy: 0.8105443728511694
Epoch 6 - Save Best Score: 0.8105 Model


Epoch: [7][0/1929] Elapsed 0m 1s (remain 61m 36s) Loss avg.: 0.4351 Grad: 0.2338 LR: 0.00041  
Epoch: [7][100/1929] Elapsed 1m 1s (remain 18m 36s) Loss avg.: 0.4295 Grad: 0.2385 LR: 0.00041  
Epoch: [7][200/1929] Elapsed 2m 1s (remain 17m 23s) Loss avg.: 0.4298 Grad: 0.2288 LR: 0.00041  
Epoch: [7][300/1929] Elapsed 3m 1s (remain 16m 20s) Loss avg.: 0.4310 Grad: 0.1928 LR: 0.00041  
Epoch: [7][400/1929] Elapsed 4m 0s (remain 15m 17s) Loss avg.: 0.4309 Grad: 0.2654 LR: 0.00041  
Epoch: [7][500/1929] Elapsed 5m 0s (remain 14m 16s) Loss avg.: 0.4308 Grad: 0.2118 LR: 0.00041  
Epoch: [7][600/1929] Elapsed 6m 0s (remain 13m 15s) Loss avg.: 0.4308 Grad: 0.2518 LR: 0.00041  
Epoch: [7][700/1929] Elapsed 6m 59s (remain 12m 15s) Loss avg.: 0.4311 Grad: 0.2177 LR: 0.00041  
Epoch: [7][800/1929] Elapsed 7m 59s (remain 11m 15s) Loss avg.: 0.4312 Grad: 0.2702 LR: 0.00041  
Epoch: [7][900/1929] Elapsed 8m 59s (remain 10m 15s) Loss avg.: 0.4311 Grad: 0.2472 LR: 0.00041  
Epoch: [7][1000/1929] Elapsed

Epoch 7 - avg_train_loss: 0.4315  avg_val_loss: 0.4422  time: 1190s
Epoch 7 - Accuracy: 0.8105458303030479
Epoch 7 - Save Best Score: 0.8105 Model


Epoch: [8][0/1929] Elapsed 0m 1s (remain 61m 21s) Loss avg.: 0.4370 Grad: 0.2517 LR: 0.00029  
Epoch: [8][100/1929] Elapsed 1m 1s (remain 18m 35s) Loss avg.: 0.4272 Grad: 0.2091 LR: 0.00029  
Epoch: [8][200/1929] Elapsed 2m 1s (remain 17m 22s) Loss avg.: 0.4269 Grad: 0.1805 LR: 0.00029  
Epoch: [8][300/1929] Elapsed 3m 0s (remain 16m 18s) Loss avg.: 0.4262 Grad: 0.2747 LR: 0.00029  
Epoch: [8][400/1929] Elapsed 4m 0s (remain 15m 17s) Loss avg.: 0.4264 Grad: 0.2060 LR: 0.00029  
Epoch: [8][500/1929] Elapsed 5m 0s (remain 14m 16s) Loss avg.: 0.4266 Grad: 0.2099 LR: 0.00029  
Epoch: [8][600/1929] Elapsed 5m 59s (remain 13m 15s) Loss avg.: 0.4268 Grad: 0.2259 LR: 0.00029  
Epoch: [8][700/1929] Elapsed 6m 59s (remain 12m 14s) Loss avg.: 0.4266 Grad: 0.2132 LR: 0.00029  
Epoch: [8][800/1929] Elapsed 7m 59s (remain 11m 14s) Loss avg.: 0.4265 Grad: 0.2207 LR: 0.00029  
Epoch: [8][900/1929] Elapsed 8m 58s (remain 10m 14s) Loss avg.: 0.4266 Grad: 0.1976 LR: 0.00029  
Epoch: [8][1000/1929] Elapse

Epoch 8 - avg_train_loss: 0.4276  avg_val_loss: 0.4396  time: 1188s
Epoch 8 - Accuracy: 0.8115310677729698
Epoch 8 - Save Best Score: 0.8115 Model


Epoch: [9][0/1929] Elapsed 0m 1s (remain 61m 27s) Loss avg.: 0.4172 Grad: 0.2328 LR: 0.00019  
Epoch: [9][100/1929] Elapsed 1m 1s (remain 18m 36s) Loss avg.: 0.4229 Grad: 0.2064 LR: 0.00019  
Epoch: [9][200/1929] Elapsed 2m 1s (remain 17m 23s) Loss avg.: 0.4231 Grad: 0.2771 LR: 0.00019  
Epoch: [9][300/1929] Elapsed 3m 1s (remain 16m 19s) Loss avg.: 0.4230 Grad: 0.2412 LR: 0.00019  
Epoch: [9][400/1929] Elapsed 4m 0s (remain 15m 16s) Loss avg.: 0.4226 Grad: 0.2516 LR: 0.00019  
Epoch: [9][500/1929] Elapsed 5m 0s (remain 14m 16s) Loss avg.: 0.4234 Grad: 0.2205 LR: 0.00019  
Epoch: [9][600/1929] Elapsed 6m 0s (remain 13m 15s) Loss avg.: 0.4234 Grad: 0.2738 LR: 0.00019  
Epoch: [9][700/1929] Elapsed 6m 59s (remain 12m 15s) Loss avg.: 0.4233 Grad: 0.2089 LR: 0.00019  
Epoch: [9][800/1929] Elapsed 7m 59s (remain 11m 15s) Loss avg.: 0.4237 Grad: 0.2277 LR: 0.00019  
Epoch: [9][900/1929] Elapsed 8m 59s (remain 10m 15s) Loss avg.: 0.4235 Grad: 0.2247 LR: 0.00019  
Epoch: [9][1000/1929] Elapsed

Epoch 9 - avg_train_loss: 0.4242  avg_val_loss: 0.4387  time: 1189s
Epoch 9 - Accuracy: 0.8117365684878499
Epoch 9 - Save Best Score: 0.8117 Model


Epoch: [10][0/1929] Elapsed 0m 1s (remain 61m 27s) Loss avg.: 0.4188 Grad: 0.2291 LR: 0.00012  
Epoch: [10][100/1929] Elapsed 1m 1s (remain 18m 34s) Loss avg.: 0.4201 Grad: 0.2280 LR: 0.00012  
Epoch: [10][200/1929] Elapsed 2m 1s (remain 17m 22s) Loss avg.: 0.4205 Grad: 0.2326 LR: 0.00012  
Epoch: [10][300/1929] Elapsed 3m 0s (remain 16m 18s) Loss avg.: 0.4205 Grad: 0.2819 LR: 0.00012  
Epoch: [10][400/1929] Elapsed 4m 0s (remain 15m 17s) Loss avg.: 0.4207 Grad: 0.2818 LR: 0.00012  
Epoch: [10][500/1929] Elapsed 5m 0s (remain 14m 16s) Loss avg.: 0.4205 Grad: 0.2141 LR: 0.00012  
Epoch: [10][600/1929] Elapsed 6m 0s (remain 13m 15s) Loss avg.: 0.4206 Grad: 0.2364 LR: 0.00012  
Epoch: [10][700/1929] Elapsed 7m 0s (remain 12m 15s) Loss avg.: 0.4208 Grad: 0.2736 LR: 0.00012  
Epoch: [10][800/1929] Elapsed 7m 59s (remain 11m 15s) Loss avg.: 0.4209 Grad: 0.2223 LR: 0.00012  
Epoch: [10][900/1929] Elapsed 8m 59s (remain 10m 15s) Loss avg.: 0.4210 Grad: 0.2419 LR: 0.00012  
Epoch: [10][1000/192

Epoch 10 - avg_train_loss: 0.4216  avg_val_loss: 0.4381  time: 1189s
Epoch 10 - Accuracy: 0.81231226197989
Epoch 10 - Save Best Score: 0.8123 Model
Epoch 10 - Save final model
Score: 0.81231


Failed to load pre-train weight.
Epoch: [1][0/1929] Elapsed 0m 2s (remain 64m 58s) Loss avg.: 1.4009 Grad: 0.3739 LR: 0.00100  
Epoch: [1][100/1929] Elapsed 1m 1s (remain 18m 40s) Loss avg.: 0.7056 Grad: 0.9432 LR: 0.00100  
Epoch: [1][200/1929] Elapsed 2m 1s (remain 17m 26s) Loss avg.: 0.6360 Grad: 0.4932 LR: 0.00100  
Epoch: [1][300/1929] Elapsed 3m 1s (remain 16m 21s) Loss avg.: 0.6045 Grad: 0.6635 LR: 0.00100  
Epoch: [1][400/1929] Elapsed 4m 1s (remain 15m 18s) Loss avg.: 0.5853 Grad: 0.8099 LR: 0.00100  
Epoch: [1][500/1929] Elapsed 5m 0s (remain 14m 17s) Loss avg.: 0.5702 Grad: 0.5129 LR: 0.00100  
Epoch: [1][600/1929] Elapsed 6m 0s (remain 13m 16s) Loss avg.: 0.5594 Grad: 0.4330 LR: 0.00100  
Epoch: [1][700/1929] Elapsed 7m 0s (remain 12m 15s) Loss avg.: 0.5507 Grad: 0.5694 LR: 0.00100  
Epoch: [1][800/1929] Elapsed 7m 59s (remain 11m 15s) Loss avg.: 0.5434 Grad: 0.4265 LR: 0.00100  
Epoch: [1][900/1929] Elapsed 8m 59s (remain 10m 15s) Loss avg.: 0.5374 Grad: 0.4118 LR: 0.00100

Epoch 1 - avg_train_loss: 0.5047  avg_val_loss: 0.4684  time: 1188s
Epoch 1 - Accuracy: 0.7974797742115549
Epoch 1 - Save Best Score: 0.7975 Model


Epoch: [2][0/1929] Elapsed 0m 1s (remain 60m 16s) Loss avg.: 0.4428 Grad: 0.3787 LR: 0.00098  
Epoch: [2][100/1929] Elapsed 1m 1s (remain 18m 34s) Loss avg.: 0.4660 Grad: 0.3156 LR: 0.00098  
Epoch: [2][200/1929] Elapsed 2m 1s (remain 17m 23s) Loss avg.: 0.4661 Grad: 0.4256 LR: 0.00098  
Epoch: [2][300/1929] Elapsed 3m 1s (remain 16m 19s) Loss avg.: 0.4648 Grad: 0.3935 LR: 0.00098  
Epoch: [2][400/1929] Elapsed 4m 0s (remain 15m 16s) Loss avg.: 0.4643 Grad: 0.6451 LR: 0.00098  
Epoch: [2][500/1929] Elapsed 5m 0s (remain 14m 15s) Loss avg.: 0.4641 Grad: 0.4330 LR: 0.00098  
Epoch: [2][600/1929] Elapsed 5m 59s (remain 13m 15s) Loss avg.: 0.4641 Grad: 0.4125 LR: 0.00098  
Epoch: [2][700/1929] Elapsed 6m 59s (remain 12m 14s) Loss avg.: 0.4639 Grad: 0.2842 LR: 0.00098  
Epoch: [2][800/1929] Elapsed 7m 59s (remain 11m 14s) Loss avg.: 0.4636 Grad: 0.3451 LR: 0.00098  
Epoch: [2][900/1929] Elapsed 8m 58s (remain 10m 14s) Loss avg.: 0.4631 Grad: 0.3302 LR: 0.00098  
Epoch: [2][1000/1929] Elapse

Epoch 2 - avg_train_loss: 0.4607  avg_val_loss: 0.4594  time: 1188s
Epoch 2 - Accuracy: 0.801617480094851
Epoch 2 - Save Best Score: 0.8016 Model


Epoch: [3][0/1929] Elapsed 0m 1s (remain 60m 34s) Loss avg.: 0.4404 Grad: 0.2650 LR: 0.00091  
Epoch: [3][100/1929] Elapsed 1m 1s (remain 18m 35s) Loss avg.: 0.4526 Grad: 0.2550 LR: 0.00091  
Epoch: [3][200/1929] Elapsed 2m 1s (remain 17m 22s) Loss avg.: 0.4525 Grad: 0.2511 LR: 0.00091  
Epoch: [3][300/1929] Elapsed 3m 1s (remain 16m 19s) Loss avg.: 0.4527 Grad: 0.2983 LR: 0.00091  
Epoch: [3][400/1929] Elapsed 4m 0s (remain 15m 17s) Loss avg.: 0.4527 Grad: 0.3858 LR: 0.00091  
Epoch: [3][500/1929] Elapsed 5m 0s (remain 14m 16s) Loss avg.: 0.4523 Grad: 0.3153 LR: 0.00091  
Epoch: [3][600/1929] Elapsed 5m 59s (remain 13m 15s) Loss avg.: 0.4521 Grad: 0.3056 LR: 0.00091  
Epoch: [3][700/1929] Elapsed 6m 59s (remain 12m 14s) Loss avg.: 0.4522 Grad: 0.2339 LR: 0.00091  
Epoch: [3][800/1929] Elapsed 7m 59s (remain 11m 14s) Loss avg.: 0.4524 Grad: 0.3439 LR: 0.00091  
Epoch: [3][900/1929] Elapsed 8m 58s (remain 10m 14s) Loss avg.: 0.4524 Grad: 0.3067 LR: 0.00091  
Epoch: [3][1000/1929] Elapse

Epoch 3 - avg_train_loss: 0.4511  avg_val_loss: 0.4521  time: 1188s
Epoch 3 - Accuracy: 0.8057930797269901
Epoch 3 - Save Best Score: 0.8058 Model


Epoch: [4][0/1929] Elapsed 0m 1s (remain 61m 47s) Loss avg.: 0.4355 Grad: 0.2323 LR: 0.00081  
Epoch: [4][100/1929] Elapsed 1m 1s (remain 18m 35s) Loss avg.: 0.4450 Grad: 0.2376 LR: 0.00081  
Epoch: [4][200/1929] Elapsed 2m 1s (remain 17m 23s) Loss avg.: 0.4443 Grad: 0.2911 LR: 0.00081  
Epoch: [4][300/1929] Elapsed 3m 1s (remain 16m 18s) Loss avg.: 0.4438 Grad: 0.2260 LR: 0.00081  
Epoch: [4][400/1929] Elapsed 4m 0s (remain 15m 17s) Loss avg.: 0.4441 Grad: 0.2038 LR: 0.00081  
Epoch: [4][500/1929] Elapsed 5m 0s (remain 14m 16s) Loss avg.: 0.4442 Grad: 0.3091 LR: 0.00081  
Epoch: [4][600/1929] Elapsed 6m 0s (remain 13m 15s) Loss avg.: 0.4440 Grad: 0.2336 LR: 0.00081  
Epoch: [4][700/1929] Elapsed 6m 59s (remain 12m 15s) Loss avg.: 0.4440 Grad: 0.2498 LR: 0.00081  
Epoch: [4][800/1929] Elapsed 7m 59s (remain 11m 15s) Loss avg.: 0.4440 Grad: 0.2586 LR: 0.00081  
Epoch: [4][900/1929] Elapsed 8m 58s (remain 10m 14s) Loss avg.: 0.4441 Grad: 0.2239 LR: 0.00081  
Epoch: [4][1000/1929] Elapsed

Epoch 4 - avg_train_loss: 0.4450  avg_val_loss: 0.4501  time: 1188s
Epoch 4 - Accuracy: 0.8063585710558802
Epoch 4 - Save Best Score: 0.8064 Model


Epoch: [5][0/1929] Elapsed 0m 1s (remain 60m 44s) Loss avg.: 0.4455 Grad: 0.2001 LR: 0.00069  
Epoch: [5][100/1929] Elapsed 1m 1s (remain 18m 35s) Loss avg.: 0.4379 Grad: 0.2569 LR: 0.00069  
Epoch: [5][200/1929] Elapsed 2m 1s (remain 17m 22s) Loss avg.: 0.4383 Grad: 0.1970 LR: 0.00069  
Epoch: [5][300/1929] Elapsed 3m 0s (remain 16m 18s) Loss avg.: 0.4384 Grad: 0.2507 LR: 0.00069  
Epoch: [5][400/1929] Elapsed 4m 0s (remain 15m 16s) Loss avg.: 0.4390 Grad: 0.2437 LR: 0.00069  
Epoch: [5][500/1929] Elapsed 5m 0s (remain 14m 15s) Loss avg.: 0.4394 Grad: 0.2272 LR: 0.00069  
Epoch: [5][600/1929] Elapsed 6m 0s (remain 13m 15s) Loss avg.: 0.4395 Grad: 0.2168 LR: 0.00069  
Epoch: [5][700/1929] Elapsed 6m 59s (remain 12m 15s) Loss avg.: 0.4397 Grad: 0.2626 LR: 0.00069  
Epoch: [5][800/1929] Elapsed 7m 59s (remain 11m 14s) Loss avg.: 0.4397 Grad: 0.2396 LR: 0.00069  
Epoch: [5][900/1929] Elapsed 8m 58s (remain 10m 14s) Loss avg.: 0.4396 Grad: 0.2756 LR: 0.00069  
Epoch: [5][1000/1929] Elapsed

Epoch 5 - avg_train_loss: 0.4401  avg_val_loss: 0.4460  time: 1188s
Epoch 5 - Accuracy: 0.8085564084887827
Epoch 5 - Save Best Score: 0.8086 Model


Epoch: [6][0/1929] Elapsed 0m 1s (remain 60m 47s) Loss avg.: 0.4231 Grad: 0.2198 LR: 0.00055  
Epoch: [6][100/1929] Elapsed 1m 1s (remain 18m 34s) Loss avg.: 0.4341 Grad: 0.2108 LR: 0.00055  
Epoch: [6][200/1929] Elapsed 2m 1s (remain 17m 21s) Loss avg.: 0.4342 Grad: 0.2545 LR: 0.00055  
Epoch: [6][300/1929] Elapsed 3m 0s (remain 16m 17s) Loss avg.: 0.4343 Grad: 0.2151 LR: 0.00055  
Epoch: [6][400/1929] Elapsed 4m 0s (remain 15m 15s) Loss avg.: 0.4345 Grad: 0.2271 LR: 0.00055  
Epoch: [6][500/1929] Elapsed 4m 59s (remain 14m 14s) Loss avg.: 0.4350 Grad: 0.2288 LR: 0.00055  
Epoch: [6][600/1929] Elapsed 5m 59s (remain 13m 14s) Loss avg.: 0.4349 Grad: 0.2698 LR: 0.00055  
Epoch: [6][700/1929] Elapsed 6m 59s (remain 12m 14s) Loss avg.: 0.4351 Grad: 0.2775 LR: 0.00055  
Epoch: [6][800/1929] Elapsed 7m 59s (remain 11m 14s) Loss avg.: 0.4352 Grad: 0.2283 LR: 0.00055  
Epoch: [6][900/1929] Elapsed 8m 58s (remain 10m 14s) Loss avg.: 0.4354 Grad: 0.2313 LR: 0.00055  
Epoch: [6][1000/1929] Elaps

Epoch 6 - avg_train_loss: 0.4357  avg_val_loss: 0.4439  time: 1188s
Epoch 6 - Accuracy: 0.8093988156746035
Epoch 6 - Save Best Score: 0.8094 Model


Epoch: [7][0/1929] Elapsed 0m 2s (remain 65m 43s) Loss avg.: 0.4512 Grad: 0.2707 LR: 0.00041  
Epoch: [7][100/1929] Elapsed 1m 2s (remain 18m 42s) Loss avg.: 0.4306 Grad: 0.2044 LR: 0.00041  
Epoch: [7][200/1929] Elapsed 2m 1s (remain 17m 25s) Loss avg.: 0.4304 Grad: 0.2201 LR: 0.00041  
Epoch: [7][300/1929] Elapsed 3m 1s (remain 16m 20s) Loss avg.: 0.4306 Grad: 0.3027 LR: 0.00041  
Epoch: [7][400/1929] Elapsed 4m 0s (remain 15m 17s) Loss avg.: 0.4303 Grad: 0.1988 LR: 0.00041  
Epoch: [7][500/1929] Elapsed 5m 0s (remain 14m 16s) Loss avg.: 0.4305 Grad: 0.2061 LR: 0.00041  
Epoch: [7][600/1929] Elapsed 5m 59s (remain 13m 15s) Loss avg.: 0.4309 Grad: 0.2328 LR: 0.00041  
Epoch: [7][700/1929] Elapsed 6m 59s (remain 12m 14s) Loss avg.: 0.4309 Grad: 0.2313 LR: 0.00041  
Epoch: [7][800/1929] Elapsed 7m 59s (remain 11m 14s) Loss avg.: 0.4311 Grad: 0.2631 LR: 0.00041  
Epoch: [7][900/1929] Elapsed 8m 58s (remain 10m 14s) Loss avg.: 0.4310 Grad: 0.2277 LR: 0.00041  
Epoch: [7][1000/1929] Elapse

Epoch 7 - avg_train_loss: 0.4316  avg_val_loss: 0.4415  time: 1189s
Epoch 7 - Accuracy: 0.810353446655075
Epoch 7 - Save Best Score: 0.8104 Model


Epoch: [8][0/1929] Elapsed 0m 1s (remain 60m 24s) Loss avg.: 0.4456 Grad: 0.2099 LR: 0.00029  
Epoch: [8][100/1929] Elapsed 1m 1s (remain 18m 34s) Loss avg.: 0.4274 Grad: 0.2295 LR: 0.00029  
Epoch: [8][200/1929] Elapsed 2m 1s (remain 17m 22s) Loss avg.: 0.4269 Grad: 0.2344 LR: 0.00029  
Epoch: [8][300/1929] Elapsed 3m 1s (remain 16m 18s) Loss avg.: 0.4278 Grad: 0.2275 LR: 0.00029  
Epoch: [8][400/1929] Elapsed 4m 0s (remain 15m 16s) Loss avg.: 0.4278 Grad: 0.2416 LR: 0.00029  
Epoch: [8][500/1929] Elapsed 5m 0s (remain 14m 15s) Loss avg.: 0.4276 Grad: 0.2167 LR: 0.00029  
Epoch: [8][600/1929] Elapsed 5m 59s (remain 13m 15s) Loss avg.: 0.4272 Grad: 0.2170 LR: 0.00029  
Epoch: [8][700/1929] Elapsed 6m 59s (remain 12m 14s) Loss avg.: 0.4272 Grad: 0.2267 LR: 0.00029  
Epoch: [8][800/1929] Elapsed 7m 59s (remain 11m 14s) Loss avg.: 0.4276 Grad: 0.2030 LR: 0.00029  
Epoch: [8][900/1929] Elapsed 8m 58s (remain 10m 14s) Loss avg.: 0.4274 Grad: 0.2152 LR: 0.00029  
Epoch: [8][1000/1929] Elapse

Epoch 8 - avg_train_loss: 0.4276  avg_val_loss: 0.4400  time: 1188s
Epoch 8 - Accuracy: 0.8111375557657525
Epoch 8 - Save Best Score: 0.8111 Model


Epoch: [9][0/1929] Elapsed 0m 1s (remain 60m 46s) Loss avg.: 0.4226 Grad: 0.2420 LR: 0.00019  
Epoch: [9][100/1929] Elapsed 1m 1s (remain 18m 35s) Loss avg.: 0.4226 Grad: 0.2258 LR: 0.00019  
Epoch: [9][200/1929] Elapsed 2m 1s (remain 17m 23s) Loss avg.: 0.4231 Grad: 0.2330 LR: 0.00019  
Epoch: [9][300/1929] Elapsed 3m 1s (remain 16m 19s) Loss avg.: 0.4230 Grad: 0.2549 LR: 0.00019  
Epoch: [9][400/1929] Elapsed 4m 0s (remain 15m 18s) Loss avg.: 0.4235 Grad: 0.2498 LR: 0.00019  
Epoch: [9][500/1929] Elapsed 5m 0s (remain 14m 16s) Loss avg.: 0.4235 Grad: 0.2499 LR: 0.00019  
Epoch: [9][600/1929] Elapsed 6m 0s (remain 13m 15s) Loss avg.: 0.4232 Grad: 0.2391 LR: 0.00019  
Epoch: [9][700/1929] Elapsed 6m 59s (remain 12m 15s) Loss avg.: 0.4233 Grad: 0.2436 LR: 0.00019  
Epoch: [9][800/1929] Elapsed 7m 59s (remain 11m 15s) Loss avg.: 0.4235 Grad: 0.2848 LR: 0.00019  
Epoch: [9][900/1929] Elapsed 8m 59s (remain 10m 15s) Loss avg.: 0.4236 Grad: 0.2408 LR: 0.00019  
Epoch: [9][1000/1929] Elapsed

Epoch 9 - avg_train_loss: 0.4241  avg_val_loss: 0.4394  time: 1188s
Epoch 9 - Accuracy: 0.8116520362788922
Epoch 9 - Save Best Score: 0.8117 Model


Epoch: [10][0/1929] Elapsed 0m 1s (remain 60m 32s) Loss avg.: 0.4258 Grad: 0.2147 LR: 0.00012  
Epoch: [10][100/1929] Elapsed 1m 1s (remain 18m 35s) Loss avg.: 0.4222 Grad: 0.2346 LR: 0.00012  
Epoch: [10][200/1929] Elapsed 2m 1s (remain 17m 23s) Loss avg.: 0.4218 Grad: 0.2136 LR: 0.00012  
Epoch: [10][300/1929] Elapsed 3m 1s (remain 16m 19s) Loss avg.: 0.4215 Grad: 0.2213 LR: 0.00012  
Epoch: [10][400/1929] Elapsed 4m 0s (remain 15m 17s) Loss avg.: 0.4213 Grad: 0.2131 LR: 0.00012  
Epoch: [10][500/1929] Elapsed 5m 0s (remain 14m 16s) Loss avg.: 0.4211 Grad: 0.2051 LR: 0.00012  
Epoch: [10][600/1929] Elapsed 6m 0s (remain 13m 15s) Loss avg.: 0.4214 Grad: 0.2119 LR: 0.00012  
Epoch: [10][700/1929] Elapsed 6m 59s (remain 12m 15s) Loss avg.: 0.4211 Grad: 0.2811 LR: 0.00012  
Epoch: [10][800/1929] Elapsed 7m 59s (remain 11m 15s) Loss avg.: 0.4207 Grad: 0.2224 LR: 0.00012  
Epoch: [10][900/1929] Elapsed 8m 59s (remain 10m 14s) Loss avg.: 0.4208 Grad: 0.2527 LR: 0.00012  
Epoch: [10][1000/19

Epoch 10 - avg_train_loss: 0.4215  avg_val_loss: 0.4390  time: 1188s
Epoch 10 - Accuracy: 0.811799238918629
Epoch 10 - Save Best Score: 0.8118 Model
Epoch 10 - Save final model
Score: 0.81180


Failed to load pre-train weight.
Epoch: [1][0/1929] Elapsed 0m 1s (remain 61m 10s) Loss avg.: 1.3963 Grad: 0.3499 LR: 0.00100  
Epoch: [1][100/1929] Elapsed 1m 1s (remain 18m 34s) Loss avg.: 0.6962 Grad: 1.0620 LR: 0.00100  
Epoch: [1][200/1929] Elapsed 2m 1s (remain 17m 23s) Loss avg.: 0.6328 Grad: 0.9389 LR: 0.00100  
Epoch: [1][300/1929] Elapsed 3m 1s (remain 16m 19s) Loss avg.: 0.6015 Grad: 0.6947 LR: 0.00100  
Epoch: [1][400/1929] Elapsed 4m 0s (remain 15m 17s) Loss avg.: 0.5819 Grad: 0.5886 LR: 0.00100  
Epoch: [1][500/1929] Elapsed 5m 0s (remain 14m 16s) Loss avg.: 0.5669 Grad: 0.6919 LR: 0.00100  
Epoch: [1][600/1929] Elapsed 6m 0s (remain 13m 16s) Loss avg.: 0.5563 Grad: 0.4764 LR: 0.00100  
Epoch: [1][700/1929] Elapsed 7m 0s (remain 12m 16s) Loss avg.: 0.5476 Grad: 0.4018 LR: 0.00100  
Epoch: [1][800/1929] Elapsed 7m 59s (remain 11m 15s) Loss avg.: 0.5403 Grad: 0.4436 LR: 0.00100  
Epoch: [1][900/1929] Elapsed 8m 59s (remain 10m 15s) Loss avg.: 0.5344 Grad: 0.6708 LR: 0.00100

Epoch 1 - avg_train_loss: 0.5025  avg_val_loss: 0.4696  time: 1189s
Epoch 1 - Accuracy: 0.7958357684925138
Epoch 1 - Save Best Score: 0.7958 Model


Epoch: [2][0/1929] Elapsed 0m 2s (remain 64m 49s) Loss avg.: 0.4815 Grad: 0.4135 LR: 0.00098  
Epoch: [2][100/1929] Elapsed 1m 1s (remain 18m 40s) Loss avg.: 0.4643 Grad: 0.3736 LR: 0.00098  
Epoch: [2][200/1929] Elapsed 2m 1s (remain 17m 24s) Loss avg.: 0.4633 Grad: 0.3228 LR: 0.00098  
Epoch: [2][300/1929] Elapsed 3m 1s (remain 16m 19s) Loss avg.: 0.4638 Grad: 0.2983 LR: 0.00098  
Epoch: [2][400/1929] Elapsed 4m 0s (remain 15m 17s) Loss avg.: 0.4636 Grad: 0.2739 LR: 0.00098  
Epoch: [2][500/1929] Elapsed 5m 0s (remain 14m 16s) Loss avg.: 0.4631 Grad: 0.2615 LR: 0.00098  
Epoch: [2][600/1929] Elapsed 6m 0s (remain 13m 15s) Loss avg.: 0.4628 Grad: 0.3200 LR: 0.00098  
Epoch: [2][700/1929] Elapsed 6m 59s (remain 12m 15s) Loss avg.: 0.4625 Grad: 0.2869 LR: 0.00098  
Epoch: [2][800/1929] Elapsed 7m 59s (remain 11m 15s) Loss avg.: 0.4623 Grad: 0.2936 LR: 0.00098  
Epoch: [2][900/1929] Elapsed 8m 59s (remain 10m 15s) Loss avg.: 0.4620 Grad: 0.3145 LR: 0.00098  
Epoch: [2][1000/1929] Elapsed

Epoch 2 - avg_train_loss: 0.4594  avg_val_loss: 0.4588  time: 1188s
Epoch 2 - Accuracy: 0.8013901176017921
Epoch 2 - Save Best Score: 0.8014 Model


Epoch: [3][0/1929] Elapsed 0m 2s (remain 64m 49s) Loss avg.: 0.4615 Grad: 0.3924 LR: 0.00091  
Epoch: [3][100/1929] Elapsed 1m 1s (remain 18m 37s) Loss avg.: 0.4501 Grad: 0.3660 LR: 0.00091  
Epoch: [3][200/1929] Elapsed 2m 1s (remain 17m 25s) Loss avg.: 0.4501 Grad: 0.3260 LR: 0.00091  
Epoch: [3][300/1929] Elapsed 3m 1s (remain 16m 19s) Loss avg.: 0.4507 Grad: 0.3999 LR: 0.00091  
Epoch: [3][400/1929] Elapsed 4m 0s (remain 15m 17s) Loss avg.: 0.4504 Grad: 0.2263 LR: 0.00091  
Epoch: [3][500/1929] Elapsed 5m 0s (remain 14m 16s) Loss avg.: 0.4506 Grad: 0.3525 LR: 0.00091  
Epoch: [3][600/1929] Elapsed 5m 59s (remain 13m 15s) Loss avg.: 0.4507 Grad: 0.3002 LR: 0.00091  
Epoch: [3][700/1929] Elapsed 6m 59s (remain 12m 15s) Loss avg.: 0.4503 Grad: 0.3823 LR: 0.00091  
Epoch: [3][800/1929] Elapsed 7m 59s (remain 11m 14s) Loss avg.: 0.4504 Grad: 0.3240 LR: 0.00091  
Epoch: [3][900/1929] Elapsed 8m 59s (remain 10m 15s) Loss avg.: 0.4505 Grad: 0.3315 LR: 0.00091  
Epoch: [3][1000/1929] Elapse

Epoch 3 - avg_train_loss: 0.4501  avg_val_loss: 0.4492  time: 1188s
Epoch 3 - Accuracy: 0.8066136251346321
Epoch 3 - Save Best Score: 0.8066 Model


Epoch: [4][0/1929] Elapsed 0m 1s (remain 60m 39s) Loss avg.: 0.4390 Grad: 0.2219 LR: 0.00081  
Epoch: [4][100/1929] Elapsed 1m 1s (remain 18m 35s) Loss avg.: 0.4444 Grad: 0.2344 LR: 0.00081  
Epoch: [4][200/1929] Elapsed 2m 1s (remain 17m 22s) Loss avg.: 0.4438 Grad: 0.2427 LR: 0.00081  
Epoch: [4][300/1929] Elapsed 3m 1s (remain 16m 20s) Loss avg.: 0.4434 Grad: 0.2355 LR: 0.00081  
Epoch: [4][400/1929] Elapsed 4m 0s (remain 15m 18s) Loss avg.: 0.4435 Grad: 0.3947 LR: 0.00081  
Epoch: [4][500/1929] Elapsed 5m 0s (remain 14m 16s) Loss avg.: 0.4438 Grad: 0.2200 LR: 0.00081  
Epoch: [4][600/1929] Elapsed 6m 0s (remain 13m 15s) Loss avg.: 0.4443 Grad: 0.2350 LR: 0.00081  
Epoch: [4][700/1929] Elapsed 6m 59s (remain 12m 15s) Loss avg.: 0.4442 Grad: 0.2131 LR: 0.00081  
Epoch: [4][800/1929] Elapsed 7m 59s (remain 11m 15s) Loss avg.: 0.4444 Grad: 0.1816 LR: 0.00081  
Epoch: [4][900/1929] Elapsed 8m 59s (remain 10m 15s) Loss avg.: 0.4442 Grad: 0.2061 LR: 0.00081  
Epoch: [4][1000/1929] Elapsed

Epoch 4 - avg_train_loss: 0.4443  avg_val_loss: 0.4456  time: 1188s
Epoch 4 - Accuracy: 0.8078888955283919
Epoch 4 - Save Best Score: 0.8079 Model


Epoch: [5][0/1929] Elapsed 0m 1s (remain 61m 37s) Loss avg.: 0.4508 Grad: 0.2274 LR: 0.00069  
Epoch: [5][100/1929] Elapsed 1m 1s (remain 18m 34s) Loss avg.: 0.4377 Grad: 0.2136 LR: 0.00069  
Epoch: [5][200/1929] Elapsed 2m 1s (remain 17m 22s) Loss avg.: 0.4388 Grad: 0.2361 LR: 0.00069  
Epoch: [5][300/1929] Elapsed 3m 0s (remain 16m 18s) Loss avg.: 0.4396 Grad: 0.2405 LR: 0.00069  
Epoch: [5][400/1929] Elapsed 4m 0s (remain 15m 16s) Loss avg.: 0.4398 Grad: 0.2370 LR: 0.00069  
Epoch: [5][500/1929] Elapsed 5m 0s (remain 14m 16s) Loss avg.: 0.4397 Grad: 0.2331 LR: 0.00069  
Epoch: [5][600/1929] Elapsed 5m 59s (remain 13m 15s) Loss avg.: 0.4398 Grad: 0.2431 LR: 0.00069  
Epoch: [5][700/1929] Elapsed 6m 59s (remain 12m 15s) Loss avg.: 0.4399 Grad: 0.2421 LR: 0.00069  
Epoch: [5][800/1929] Elapsed 7m 59s (remain 11m 14s) Loss avg.: 0.4401 Grad: 0.2150 LR: 0.00069  
Epoch: [5][900/1929] Elapsed 8m 58s (remain 10m 14s) Loss avg.: 0.4401 Grad: 0.3198 LR: 0.00069  
Epoch: [5][1000/1929] Elapse

Epoch 5 - avg_train_loss: 0.4395  avg_val_loss: 0.4424  time: 1189s
Epoch 5 - Accuracy: 0.809557677929369
Epoch 5 - Save Best Score: 0.8096 Model


Epoch: [6][0/1929] Elapsed 0m 1s (remain 60m 23s) Loss avg.: 0.4317 Grad: 0.2566 LR: 0.00055  
Epoch: [6][100/1929] Elapsed 1m 1s (remain 18m 36s) Loss avg.: 0.4322 Grad: 0.1928 LR: 0.00055  
Epoch: [6][200/1929] Elapsed 2m 1s (remain 17m 23s) Loss avg.: 0.4329 Grad: 0.2434 LR: 0.00055  
Epoch: [6][300/1929] Elapsed 3m 1s (remain 16m 19s) Loss avg.: 0.4335 Grad: 0.2419 LR: 0.00055  
Epoch: [6][400/1929] Elapsed 4m 0s (remain 15m 17s) Loss avg.: 0.4340 Grad: 0.2310 LR: 0.00055  
Epoch: [6][500/1929] Elapsed 5m 0s (remain 14m 16s) Loss avg.: 0.4345 Grad: 0.2883 LR: 0.00055  
Epoch: [6][600/1929] Elapsed 6m 0s (remain 13m 16s) Loss avg.: 0.4345 Grad: 0.2166 LR: 0.00055  
Epoch: [6][700/1929] Elapsed 6m 59s (remain 12m 15s) Loss avg.: 0.4344 Grad: 0.2033 LR: 0.00055  
Epoch: [6][800/1929] Elapsed 7m 59s (remain 11m 15s) Loss avg.: 0.4348 Grad: 0.2539 LR: 0.00055  
Epoch: [6][900/1929] Elapsed 8m 59s (remain 10m 15s) Loss avg.: 0.4349 Grad: 0.2206 LR: 0.00055  
Epoch: [6][1000/1929] Elapsed

Epoch 6 - avg_train_loss: 0.4352  avg_val_loss: 0.4417  time: 1188s
Epoch 6 - Accuracy: 0.8095256139880401


Epoch: [7][0/1929] Elapsed 0m 1s (remain 61m 6s) Loss avg.: 0.4211 Grad: 0.1874 LR: 0.00041  
Epoch: [7][100/1929] Elapsed 1m 1s (remain 18m 35s) Loss avg.: 0.4290 Grad: 0.1870 LR: 0.00041  
Epoch: [7][200/1929] Elapsed 2m 1s (remain 17m 22s) Loss avg.: 0.4294 Grad: 0.2337 LR: 0.00041  
Epoch: [7][300/1929] Elapsed 3m 0s (remain 16m 18s) Loss avg.: 0.4294 Grad: 0.2215 LR: 0.00041  
Epoch: [7][400/1929] Elapsed 4m 0s (remain 15m 16s) Loss avg.: 0.4293 Grad: 0.2146 LR: 0.00041  
Epoch: [7][500/1929] Elapsed 5m 0s (remain 14m 15s) Loss avg.: 0.4294 Grad: 0.2143 LR: 0.00041  
Epoch: [7][600/1929] Elapsed 5m 59s (remain 13m 15s) Loss avg.: 0.4297 Grad: 0.2578 LR: 0.00041  
Epoch: [7][700/1929] Elapsed 6m 59s (remain 12m 15s) Loss avg.: 0.4301 Grad: 0.2138 LR: 0.00041  
Epoch: [7][800/1929] Elapsed 7m 59s (remain 11m 15s) Loss avg.: 0.4303 Grad: 0.2903 LR: 0.00041  
Epoch: [7][900/1929] Elapsed 8m 59s (remain 10m 15s) Loss avg.: 0.4302 Grad: 0.2215 LR: 0.00041  
Epoch: [7][1000/1929] Elapsed

Epoch 7 - avg_train_loss: 0.4310  avg_val_loss: 0.4385  time: 1189s
Epoch 7 - Accuracy: 0.8114756846015837
Epoch 7 - Save Best Score: 0.8115 Model


Epoch: [8][0/1929] Elapsed 0m 2s (remain 65m 6s) Loss avg.: 0.4115 Grad: 0.2018 LR: 0.00029  
Epoch: [8][100/1929] Elapsed 1m 1s (remain 18m 41s) Loss avg.: 0.4264 Grad: 0.2600 LR: 0.00029  
Epoch: [8][200/1929] Elapsed 2m 1s (remain 17m 25s) Loss avg.: 0.4271 Grad: 0.2193 LR: 0.00029  
Epoch: [8][300/1929] Elapsed 3m 1s (remain 16m 20s) Loss avg.: 0.4270 Grad: 0.2156 LR: 0.00029  
Epoch: [8][400/1929] Elapsed 4m 1s (remain 15m 18s) Loss avg.: 0.4267 Grad: 0.2433 LR: 0.00029  
Epoch: [8][500/1929] Elapsed 5m 0s (remain 14m 17s) Loss avg.: 0.4268 Grad: 0.2146 LR: 0.00029  
Epoch: [8][600/1929] Elapsed 6m 0s (remain 13m 16s) Loss avg.: 0.4269 Grad: 0.2054 LR: 0.00029  
Epoch: [8][700/1929] Elapsed 7m 0s (remain 12m 15s) Loss avg.: 0.4269 Grad: 0.2042 LR: 0.00029  
Epoch: [8][800/1929] Elapsed 7m 59s (remain 11m 15s) Loss avg.: 0.4267 Grad: 0.2174 LR: 0.00029  
Epoch: [8][900/1929] Elapsed 8m 59s (remain 10m 15s) Loss avg.: 0.4265 Grad: 0.2429 LR: 0.00029  
Epoch: [8][1000/1929] Elapsed 9

Epoch 8 - avg_train_loss: 0.4271  avg_val_loss: 0.4384  time: 1189s
Epoch 8 - Accuracy: 0.8114232163339546


Epoch: [9][0/1929] Elapsed 0m 1s (remain 61m 15s) Loss avg.: 0.4235 Grad: 0.2589 LR: 0.00019  
Epoch: [9][100/1929] Elapsed 1m 1s (remain 18m 37s) Loss avg.: 0.4231 Grad: 0.2319 LR: 0.00019  
Epoch: [9][200/1929] Elapsed 2m 1s (remain 17m 23s) Loss avg.: 0.4235 Grad: 0.2463 LR: 0.00019  
Epoch: [9][300/1929] Elapsed 3m 1s (remain 16m 20s) Loss avg.: 0.4234 Grad: 0.2038 LR: 0.00019  
Epoch: [9][400/1929] Elapsed 4m 0s (remain 15m 17s) Loss avg.: 0.4230 Grad: 0.1953 LR: 0.00019  
Epoch: [9][500/1929] Elapsed 5m 0s (remain 14m 16s) Loss avg.: 0.4231 Grad: 0.2124 LR: 0.00019  
Epoch: [9][600/1929] Elapsed 6m 0s (remain 13m 15s) Loss avg.: 0.4231 Grad: 0.2267 LR: 0.00019  
Epoch: [9][700/1929] Elapsed 6m 59s (remain 12m 15s) Loss avg.: 0.4232 Grad: 0.2021 LR: 0.00019  
Epoch: [9][800/1929] Elapsed 7m 59s (remain 11m 15s) Loss avg.: 0.4232 Grad: 0.2237 LR: 0.00019  
Epoch: [9][900/1929] Elapsed 8m 59s (remain 10m 15s) Loss avg.: 0.4233 Grad: 0.2191 LR: 0.00019  
Epoch: [9][1000/1929] Elapsed

Epoch 9 - avg_train_loss: 0.4237  avg_val_loss: 0.4361  time: 1189s
Epoch 9 - Accuracy: 0.8127640720622507
Epoch 9 - Save Best Score: 0.8128 Model


Epoch: [10][0/1929] Elapsed 0m 1s (remain 60m 19s) Loss avg.: 0.4455 Grad: 0.2237 LR: 0.00012  
Epoch: [10][100/1929] Elapsed 1m 1s (remain 18m 36s) Loss avg.: 0.4194 Grad: 0.2195 LR: 0.00012  
Epoch: [10][200/1929] Elapsed 2m 1s (remain 17m 23s) Loss avg.: 0.4199 Grad: 0.2395 LR: 0.00012  
Epoch: [10][300/1929] Elapsed 3m 0s (remain 16m 18s) Loss avg.: 0.4203 Grad: 0.2335 LR: 0.00012  
Epoch: [10][400/1929] Elapsed 4m 0s (remain 15m 17s) Loss avg.: 0.4203 Grad: 0.2117 LR: 0.00012  
Epoch: [10][500/1929] Elapsed 5m 0s (remain 14m 16s) Loss avg.: 0.4201 Grad: 0.2437 LR: 0.00012  
Epoch: [10][600/1929] Elapsed 6m 0s (remain 13m 15s) Loss avg.: 0.4203 Grad: 0.2111 LR: 0.00012  
Epoch: [10][700/1929] Elapsed 6m 59s (remain 12m 15s) Loss avg.: 0.4203 Grad: 0.2255 LR: 0.00012  
Epoch: [10][800/1929] Elapsed 7m 59s (remain 11m 15s) Loss avg.: 0.4204 Grad: 0.2237 LR: 0.00012  
Epoch: [10][900/1929] Elapsed 8m 59s (remain 10m 15s) Loss avg.: 0.4206 Grad: 0.2804 LR: 0.00012  
Epoch: [10][1000/19

Epoch 10 - avg_train_loss: 0.4211  avg_val_loss: 0.4356  time: 1189s
Epoch 10 - Accuracy: 0.8130045516222169
Epoch 10 - Save Best Score: 0.8130 Model
Epoch 10 - Save final model
Score: 0.81300


Failed to load pre-train weight.
Epoch: [1][0/1929] Elapsed 0m 1s (remain 60m 35s) Loss avg.: 1.3904 Grad: 0.3300 LR: 0.00100  
Epoch: [1][100/1929] Elapsed 1m 1s (remain 18m 35s) Loss avg.: 0.6834 Grad: 0.5733 LR: 0.00100  
Epoch: [1][200/1929] Elapsed 2m 1s (remain 17m 23s) Loss avg.: 0.6236 Grad: 0.6722 LR: 0.00100  
Epoch: [1][300/1929] Elapsed 3m 1s (remain 16m 19s) Loss avg.: 0.5961 Grad: 0.6129 LR: 0.00100  
Epoch: [1][400/1929] Elapsed 4m 0s (remain 15m 17s) Loss avg.: 0.5776 Grad: 0.4426 LR: 0.00100  
Epoch: [1][500/1929] Elapsed 5m 0s (remain 14m 16s) Loss avg.: 0.5637 Grad: 0.6764 LR: 0.00100  
Epoch: [1][600/1929] Elapsed 6m 0s (remain 13m 16s) Loss avg.: 0.5540 Grad: 0.6376 LR: 0.00100  
Epoch: [1][700/1929] Elapsed 7m 0s (remain 12m 16s) Loss avg.: 0.5458 Grad: 0.4757 LR: 0.00100  
Epoch: [1][800/1929] Elapsed 7m 59s (remain 11m 15s) Loss avg.: 0.5389 Grad: 0.4543 LR: 0.00100  
Epoch: [1][900/1929] Elapsed 8m 59s (remain 10m 15s) Loss avg.: 0.5331 Grad: 0.5135 LR: 0.00100

Epoch 1 - avg_train_loss: 0.5024  avg_val_loss: 0.4702  time: 1189s
Epoch 1 - Accuracy: 0.7959305028646216
Epoch 1 - Save Best Score: 0.7959 Model


Epoch: [2][0/1929] Elapsed 0m 1s (remain 61m 10s) Loss avg.: 0.4619 Grad: 0.4235 LR: 0.00098  
Epoch: [2][100/1929] Elapsed 1m 1s (remain 18m 35s) Loss avg.: 0.4631 Grad: 0.3799 LR: 0.00098  
Epoch: [2][200/1929] Elapsed 2m 1s (remain 17m 22s) Loss avg.: 0.4623 Grad: 0.5932 LR: 0.00098  
Epoch: [2][300/1929] Elapsed 3m 0s (remain 16m 18s) Loss avg.: 0.4628 Grad: 0.4341 LR: 0.00098  
Epoch: [2][400/1929] Elapsed 4m 0s (remain 15m 16s) Loss avg.: 0.4626 Grad: 0.3073 LR: 0.00098  
Epoch: [2][500/1929] Elapsed 5m 0s (remain 14m 15s) Loss avg.: 0.4623 Grad: 0.3800 LR: 0.00098  
Epoch: [2][600/1929] Elapsed 5m 59s (remain 13m 15s) Loss avg.: 0.4622 Grad: 0.4458 LR: 0.00098  
Epoch: [2][700/1929] Elapsed 6m 59s (remain 12m 15s) Loss avg.: 0.4622 Grad: 0.3974 LR: 0.00098  
Epoch: [2][800/1929] Elapsed 7m 59s (remain 11m 15s) Loss avg.: 0.4621 Grad: 0.3847 LR: 0.00098  
Epoch: [2][900/1929] Elapsed 8m 58s (remain 10m 14s) Loss avg.: 0.4619 Grad: 0.3212 LR: 0.00098  
Epoch: [2][1000/1929] Elapse

Epoch 2 - avg_train_loss: 0.4597  avg_val_loss: 0.4587  time: 1188s
Epoch 2 - Accuracy: 0.8018958534036602
Epoch 2 - Save Best Score: 0.8019 Model


Epoch: [3][0/1929] Elapsed 0m 2s (remain 77m 1s) Loss avg.: 0.4471 Grad: 0.4031 LR: 0.00091  
Epoch: [3][100/1929] Elapsed 1m 2s (remain 18m 47s) Loss avg.: 0.4507 Grad: 0.2335 LR: 0.00091  
Epoch: [3][200/1929] Elapsed 2m 1s (remain 17m 28s) Loss avg.: 0.4509 Grad: 0.4523 LR: 0.00091  
Epoch: [3][300/1929] Elapsed 3m 1s (remain 16m 22s) Loss avg.: 0.4511 Grad: 0.2625 LR: 0.00091  
Epoch: [3][400/1929] Elapsed 4m 1s (remain 15m 19s) Loss avg.: 0.4508 Grad: 0.2949 LR: 0.00091  
Epoch: [3][500/1929] Elapsed 5m 0s (remain 14m 17s) Loss avg.: 0.4509 Grad: 0.2531 LR: 0.00091  
Epoch: [3][600/1929] Elapsed 6m 0s (remain 13m 16s) Loss avg.: 0.4510 Grad: 0.3782 LR: 0.00091  
Epoch: [3][700/1929] Elapsed 7m 0s (remain 12m 16s) Loss avg.: 0.4510 Grad: 0.2216 LR: 0.00091  
Epoch: [3][800/1929] Elapsed 8m 0s (remain 11m 16s) Loss avg.: 0.4511 Grad: 0.2810 LR: 0.00091  
Epoch: [3][900/1929] Elapsed 8m 59s (remain 10m 15s) Loss avg.: 0.4512 Grad: 0.3624 LR: 0.00091  
Epoch: [3][1000/1929] Elapsed 9m

Epoch 3 - avg_train_loss: 0.4505  avg_val_loss: 0.4540  time: 1189s
Epoch 3 - Accuracy: 0.8042496381875711
Epoch 3 - Save Best Score: 0.8042 Model


Epoch: [4][0/1929] Elapsed 0m 1s (remain 60m 25s) Loss avg.: 0.4481 Grad: 0.2076 LR: 0.00081  
Epoch: [4][100/1929] Elapsed 1m 1s (remain 18m 35s) Loss avg.: 0.4459 Grad: 0.2170 LR: 0.00081  
Epoch: [4][200/1929] Elapsed 2m 1s (remain 17m 25s) Loss avg.: 0.4449 Grad: 0.2428 LR: 0.00081  
Epoch: [4][300/1929] Elapsed 3m 1s (remain 16m 20s) Loss avg.: 0.4447 Grad: 0.2275 LR: 0.00081  
Epoch: [4][400/1929] Elapsed 4m 0s (remain 15m 18s) Loss avg.: 0.4440 Grad: 0.2573 LR: 0.00081  
Epoch: [4][500/1929] Elapsed 5m 0s (remain 14m 17s) Loss avg.: 0.4444 Grad: 0.3092 LR: 0.00081  
Epoch: [4][600/1929] Elapsed 6m 0s (remain 13m 16s) Loss avg.: 0.4443 Grad: 0.2704 LR: 0.00081  
Epoch: [4][700/1929] Elapsed 6m 59s (remain 12m 15s) Loss avg.: 0.4441 Grad: 0.2779 LR: 0.00081  
Epoch: [4][800/1929] Elapsed 7m 59s (remain 11m 15s) Loss avg.: 0.4441 Grad: 0.2954 LR: 0.00081  
Epoch: [4][900/1929] Elapsed 8m 59s (remain 10m 15s) Loss avg.: 0.4442 Grad: 0.2536 LR: 0.00081  
Epoch: [4][1000/1929] Elapsed

Epoch 4 - avg_train_loss: 0.4444  avg_val_loss: 0.4488  time: 1188s
Epoch 4 - Accuracy: 0.8065611568670031
Epoch 4 - Save Best Score: 0.8066 Model


Epoch: [5][0/1929] Elapsed 0m 1s (remain 60m 58s) Loss avg.: 0.4648 Grad: 0.2256 LR: 0.00069  
Epoch: [5][100/1929] Elapsed 1m 1s (remain 18m 36s) Loss avg.: 0.4391 Grad: 0.2409 LR: 0.00069  
Epoch: [5][200/1929] Elapsed 2m 1s (remain 17m 24s) Loss avg.: 0.4394 Grad: 0.3421 LR: 0.00069  
Epoch: [5][300/1929] Elapsed 3m 1s (remain 16m 20s) Loss avg.: 0.4395 Grad: 0.2366 LR: 0.00069  
Epoch: [5][400/1929] Elapsed 4m 1s (remain 15m 18s) Loss avg.: 0.4393 Grad: 0.2141 LR: 0.00069  
Epoch: [5][500/1929] Elapsed 5m 0s (remain 14m 17s) Loss avg.: 0.4388 Grad: 0.2131 LR: 0.00069  
Epoch: [5][600/1929] Elapsed 6m 0s (remain 13m 16s) Loss avg.: 0.4390 Grad: 0.2168 LR: 0.00069  
Epoch: [5][700/1929] Elapsed 7m 0s (remain 12m 15s) Loss avg.: 0.4389 Grad: 0.3210 LR: 0.00069  
Epoch: [5][800/1929] Elapsed 7m 59s (remain 11m 15s) Loss avg.: 0.4390 Grad: 0.2067 LR: 0.00069  
Epoch: [5][900/1929] Elapsed 8m 59s (remain 10m 15s) Loss avg.: 0.4392 Grad: 0.2234 LR: 0.00069  
Epoch: [5][1000/1929] Elapsed 

Epoch 5 - avg_train_loss: 0.4396  avg_val_loss: 0.4461  time: 1189s
Epoch 5 - Accuracy: 0.807766469570591
Epoch 5 - Save Best Score: 0.8078 Model


Epoch: [6][0/1929] Elapsed 0m 1s (remain 60m 16s) Loss avg.: 0.4547 Grad: 0.2012 LR: 0.00055  
Epoch: [6][100/1929] Elapsed 1m 1s (remain 18m 37s) Loss avg.: 0.4351 Grad: 0.2474 LR: 0.00055  
Epoch: [6][200/1929] Elapsed 2m 1s (remain 17m 24s) Loss avg.: 0.4343 Grad: 0.2819 LR: 0.00055  
Epoch: [6][300/1929] Elapsed 3m 1s (remain 16m 20s) Loss avg.: 0.4345 Grad: 0.2559 LR: 0.00055  
Epoch: [6][400/1929] Elapsed 4m 0s (remain 15m 18s) Loss avg.: 0.4348 Grad: 0.2253 LR: 0.00055  
Epoch: [6][500/1929] Elapsed 5m 0s (remain 14m 17s) Loss avg.: 0.4349 Grad: 0.2316 LR: 0.00055  
Epoch: [6][600/1929] Elapsed 6m 0s (remain 13m 16s) Loss avg.: 0.4348 Grad: 0.2267 LR: 0.00055  
Epoch: [6][700/1929] Elapsed 7m 0s (remain 12m 16s) Loss avg.: 0.4348 Grad: 0.2302 LR: 0.00055  
Epoch: [6][800/1929] Elapsed 7m 59s (remain 11m 15s) Loss avg.: 0.4347 Grad: 0.2297 LR: 0.00055  
Epoch: [6][900/1929] Elapsed 8m 59s (remain 10m 15s) Loss avg.: 0.4348 Grad: 0.2625 LR: 0.00055  
Epoch: [6][1000/1929] Elapsed 

Epoch 6 - avg_train_loss: 0.4353  avg_val_loss: 0.4438  time: 1189s
Epoch 6 - Accuracy: 0.8085068551249109
Epoch 6 - Save Best Score: 0.8085 Model


Epoch: [7][0/1929] Elapsed 0m 1s (remain 60m 45s) Loss avg.: 0.4400 Grad: 0.2822 LR: 0.00041  
Epoch: [7][100/1929] Elapsed 1m 1s (remain 18m 35s) Loss avg.: 0.4301 Grad: 0.2551 LR: 0.00041  
Epoch: [7][200/1929] Elapsed 2m 1s (remain 17m 23s) Loss avg.: 0.4304 Grad: 0.2047 LR: 0.00041  
Epoch: [7][300/1929] Elapsed 3m 1s (remain 16m 19s) Loss avg.: 0.4304 Grad: 0.1942 LR: 0.00041  
Epoch: [7][400/1929] Elapsed 4m 0s (remain 15m 17s) Loss avg.: 0.4307 Grad: 0.2324 LR: 0.00041  
Epoch: [7][500/1929] Elapsed 5m 0s (remain 14m 16s) Loss avg.: 0.4305 Grad: 0.2250 LR: 0.00041  
Epoch: [7][600/1929] Elapsed 6m 0s (remain 13m 16s) Loss avg.: 0.4306 Grad: 0.2464 LR: 0.00041  
Epoch: [7][700/1929] Elapsed 7m 0s (remain 12m 15s) Loss avg.: 0.4306 Grad: 0.2211 LR: 0.00041  
Epoch: [7][800/1929] Elapsed 7m 59s (remain 11m 15s) Loss avg.: 0.4307 Grad: 0.1770 LR: 0.00041  
Epoch: [7][900/1929] Elapsed 8m 59s (remain 10m 15s) Loss avg.: 0.4309 Grad: 0.1984 LR: 0.00041  
Epoch: [7][1000/1929] Elapsed 

Epoch 7 - avg_train_loss: 0.4311  avg_val_loss: 0.4414  time: 1189s
Epoch 7 - Accuracy: 0.8101085947394732
Epoch 7 - Save Best Score: 0.8101 Model


Epoch: [8][0/1929] Elapsed 0m 1s (remain 60m 44s) Loss avg.: 0.4346 Grad: 0.2584 LR: 0.00029  
Epoch: [8][100/1929] Elapsed 1m 1s (remain 18m 35s) Loss avg.: 0.4254 Grad: 0.2066 LR: 0.00029  
Epoch: [8][200/1929] Elapsed 2m 1s (remain 17m 22s) Loss avg.: 0.4261 Grad: 0.2115 LR: 0.00029  
Epoch: [8][300/1929] Elapsed 3m 0s (remain 16m 18s) Loss avg.: 0.4259 Grad: 0.2576 LR: 0.00029  
Epoch: [8][400/1929] Elapsed 4m 0s (remain 15m 17s) Loss avg.: 0.4260 Grad: 0.2388 LR: 0.00029  
Epoch: [8][500/1929] Elapsed 5m 0s (remain 14m 15s) Loss avg.: 0.4265 Grad: 0.2329 LR: 0.00029  
Epoch: [8][600/1929] Elapsed 5m 59s (remain 13m 15s) Loss avg.: 0.4265 Grad: 0.2116 LR: 0.00029  
Epoch: [8][700/1929] Elapsed 6m 59s (remain 12m 15s) Loss avg.: 0.4266 Grad: 0.2165 LR: 0.00029  
Epoch: [8][800/1929] Elapsed 7m 59s (remain 11m 15s) Loss avg.: 0.4266 Grad: 0.2075 LR: 0.00029  
Epoch: [8][900/1929] Elapsed 8m 59s (remain 10m 15s) Loss avg.: 0.4266 Grad: 0.1996 LR: 0.00029  
Epoch: [8][1000/1929] Elapse

Epoch 8 - avg_train_loss: 0.4272  avg_val_loss: 0.4389  time: 1189s
Epoch 8 - Accuracy: 0.8111973112927744
Epoch 8 - Save Best Score: 0.8112 Model


Epoch: [9][0/1929] Elapsed 0m 2s (remain 64m 59s) Loss avg.: 0.4342 Grad: 0.2103 LR: 0.00019  
Epoch: [9][100/1929] Elapsed 1m 2s (remain 18m 43s) Loss avg.: 0.4257 Grad: 0.2041 LR: 0.00019  
Epoch: [9][200/1929] Elapsed 2m 1s (remain 17m 26s) Loss avg.: 0.4244 Grad: 0.3266 LR: 0.00019  
Epoch: [9][300/1929] Elapsed 3m 1s (remain 16m 21s) Loss avg.: 0.4236 Grad: 0.2089 LR: 0.00019  
Epoch: [9][400/1929] Elapsed 4m 1s (remain 15m 19s) Loss avg.: 0.4238 Grad: 0.2342 LR: 0.00019  
Epoch: [9][500/1929] Elapsed 5m 0s (remain 14m 17s) Loss avg.: 0.4236 Grad: 0.2345 LR: 0.00019  
Epoch: [9][600/1929] Elapsed 6m 0s (remain 13m 16s) Loss avg.: 0.4237 Grad: 0.2727 LR: 0.00019  
Epoch: [9][700/1929] Elapsed 7m 0s (remain 12m 16s) Loss avg.: 0.4237 Grad: 0.2085 LR: 0.00019  
Epoch: [9][800/1929] Elapsed 8m 0s (remain 11m 16s) Loss avg.: 0.4238 Grad: 0.2324 LR: 0.00019  
Epoch: [9][900/1929] Elapsed 9m 0s (remain 10m 16s) Loss avg.: 0.4236 Grad: 0.2024 LR: 0.00019  
Epoch: [9][1000/1929] Elapsed 9m

Epoch 9 - avg_train_loss: 0.4237  avg_val_loss: 0.4390  time: 1189s
Epoch 9 - Accuracy: 0.811251237012282
Epoch 9 - Save Best Score: 0.8113 Model


Epoch: [10][0/1929] Elapsed 0m 1s (remain 60m 23s) Loss avg.: 0.4027 Grad: 0.2430 LR: 0.00012  
Epoch: [10][100/1929] Elapsed 1m 1s (remain 18m 37s) Loss avg.: 0.4201 Grad: 0.2177 LR: 0.00012  
Epoch: [10][200/1929] Elapsed 2m 1s (remain 17m 24s) Loss avg.: 0.4210 Grad: 0.2139 LR: 0.00012  
Epoch: [10][300/1929] Elapsed 3m 1s (remain 16m 20s) Loss avg.: 0.4206 Grad: 0.2453 LR: 0.00012  
Epoch: [10][400/1929] Elapsed 4m 0s (remain 15m 18s) Loss avg.: 0.4205 Grad: 0.2324 LR: 0.00012  
Epoch: [10][500/1929] Elapsed 5m 0s (remain 14m 16s) Loss avg.: 0.4200 Grad: 0.2299 LR: 0.00012  
Epoch: [10][600/1929] Elapsed 6m 0s (remain 13m 16s) Loss avg.: 0.4199 Grad: 0.2059 LR: 0.00012  
Epoch: [10][700/1929] Elapsed 6m 59s (remain 12m 15s) Loss avg.: 0.4199 Grad: 0.2288 LR: 0.00012  
Epoch: [10][800/1929] Elapsed 7m 59s (remain 11m 15s) Loss avg.: 0.4200 Grad: 0.2417 LR: 0.00012  
Epoch: [10][900/1929] Elapsed 8m 59s (remain 10m 15s) Loss avg.: 0.4201 Grad: 0.2176 LR: 0.00012  
Epoch: [10][1000/19

Epoch 10 - avg_train_loss: 0.4211  avg_val_loss: 0.4382  time: 1189s
Epoch 10 - Accuracy: 0.8116258021450776
Epoch 10 - Save Best Score: 0.8116 Model
Epoch 10 - Save final model
Score: 0.81163


Failed to load pre-train weight.
Epoch: [1][0/1929] Elapsed 0m 1s (remain 61m 14s) Loss avg.: 1.3973 Grad: 0.3345 LR: 0.00100  
Epoch: [1][100/1929] Elapsed 1m 1s (remain 18m 35s) Loss avg.: 0.6985 Grad: 1.0737 LR: 0.00100  
Epoch: [1][200/1929] Elapsed 2m 1s (remain 17m 24s) Loss avg.: 0.6358 Grad: 0.7248 LR: 0.00100  
Epoch: [1][300/1929] Elapsed 3m 1s (remain 16m 20s) Loss avg.: 0.6054 Grad: 0.6079 LR: 0.00100  
Epoch: [1][400/1929] Elapsed 4m 1s (remain 15m 19s) Loss avg.: 0.5850 Grad: 0.7866 LR: 0.00100  
Epoch: [1][500/1929] Elapsed 5m 1s (remain 14m 18s) Loss avg.: 0.5703 Grad: 0.5010 LR: 0.00100  
Epoch: [1][600/1929] Elapsed 6m 0s (remain 13m 17s) Loss avg.: 0.5594 Grad: 0.5601 LR: 0.00100  
Epoch: [1][700/1929] Elapsed 7m 0s (remain 12m 16s) Loss avg.: 0.5509 Grad: 0.6258 LR: 0.00100  
Epoch: [1][800/1929] Elapsed 8m 0s (remain 11m 16s) Loss avg.: 0.5436 Grad: 0.5897 LR: 0.00100  
Epoch: [1][900/1929] Elapsed 9m 0s (remain 10m 16s) Loss avg.: 0.5377 Grad: 0.3825 LR: 0.00100  

Epoch 1 - avg_train_loss: 0.5054  avg_val_loss: 0.4679  time: 1190s
Epoch 1 - Accuracy: 0.7975613915167556
Epoch 1 - Save Best Score: 0.7976 Model


Epoch: [2][0/1929] Elapsed 0m 1s (remain 60m 19s) Loss avg.: 0.4538 Grad: 0.3311 LR: 0.00098  
Epoch: [2][100/1929] Elapsed 1m 1s (remain 18m 35s) Loss avg.: 0.4649 Grad: 0.3948 LR: 0.00098  
Epoch: [2][200/1929] Elapsed 2m 1s (remain 17m 23s) Loss avg.: 0.4645 Grad: 0.4322 LR: 0.00098  
Epoch: [2][300/1929] Elapsed 3m 0s (remain 16m 18s) Loss avg.: 0.4650 Grad: 0.3059 LR: 0.00098  
Epoch: [2][400/1929] Elapsed 4m 0s (remain 15m 16s) Loss avg.: 0.4650 Grad: 0.3489 LR: 0.00098  
Epoch: [2][500/1929] Elapsed 5m 0s (remain 14m 16s) Loss avg.: 0.4645 Grad: 0.3967 LR: 0.00098  
Epoch: [2][600/1929] Elapsed 6m 0s (remain 13m 15s) Loss avg.: 0.4644 Grad: 0.2779 LR: 0.00098  
Epoch: [2][700/1929] Elapsed 6m 59s (remain 12m 15s) Loss avg.: 0.4640 Grad: 0.4626 LR: 0.00098  
Epoch: [2][800/1929] Elapsed 7m 59s (remain 11m 15s) Loss avg.: 0.4639 Grad: 0.4000 LR: 0.00098  
Epoch: [2][900/1929] Elapsed 8m 58s (remain 10m 14s) Loss avg.: 0.4635 Grad: 0.3625 LR: 0.00098  
Epoch: [2][1000/1929] Elapsed

Epoch 2 - avg_train_loss: 0.4611  avg_val_loss: 0.4588  time: 1188s
Epoch 2 - Accuracy: 0.801662661103087
Epoch 2 - Save Best Score: 0.8017 Model


Epoch: [3][0/1929] Elapsed 0m 1s (remain 60m 21s) Loss avg.: 0.4574 Grad: 0.3125 LR: 0.00091  
Epoch: [3][100/1929] Elapsed 1m 1s (remain 18m 36s) Loss avg.: 0.4516 Grad: 0.2953 LR: 0.00091  
Epoch: [3][200/1929] Elapsed 2m 1s (remain 17m 23s) Loss avg.: 0.4518 Grad: 0.2726 LR: 0.00091  
Epoch: [3][300/1929] Elapsed 3m 1s (remain 16m 19s) Loss avg.: 0.4518 Grad: 0.3260 LR: 0.00091  
Epoch: [3][400/1929] Elapsed 4m 0s (remain 15m 17s) Loss avg.: 0.4518 Grad: 0.3425 LR: 0.00091  
Epoch: [3][500/1929] Elapsed 5m 0s (remain 14m 16s) Loss avg.: 0.4523 Grad: 0.2360 LR: 0.00091  
Epoch: [3][600/1929] Elapsed 6m 0s (remain 13m 15s) Loss avg.: 0.4518 Grad: 0.2335 LR: 0.00091  
Epoch: [3][700/1929] Elapsed 7m 0s (remain 12m 16s) Loss avg.: 0.4519 Grad: 0.2405 LR: 0.00091  
Epoch: [3][800/1929] Elapsed 7m 59s (remain 11m 15s) Loss avg.: 0.4521 Grad: 0.4089 LR: 0.00091  
Epoch: [3][900/1929] Elapsed 8m 59s (remain 10m 15s) Loss avg.: 0.4522 Grad: 0.2475 LR: 0.00091  
Epoch: [3][1000/1929] Elapsed 

Epoch 3 - avg_train_loss: 0.4517  avg_val_loss: 0.4558  time: 1189s
Epoch 3 - Accuracy: 0.8032746028807994
Epoch 3 - Save Best Score: 0.8033 Model


Epoch: [4][0/1929] Elapsed 0m 2s (remain 65m 7s) Loss avg.: 0.4491 Grad: 0.3237 LR: 0.00081  
Epoch: [4][100/1929] Elapsed 1m 1s (remain 18m 40s) Loss avg.: 0.4464 Grad: 0.2737 LR: 0.00081  
Epoch: [4][200/1929] Elapsed 2m 1s (remain 17m 24s) Loss avg.: 0.4454 Grad: 0.2534 LR: 0.00081  
Epoch: [4][300/1929] Elapsed 3m 1s (remain 16m 19s) Loss avg.: 0.4453 Grad: 0.2216 LR: 0.00081  
Epoch: [4][400/1929] Elapsed 4m 0s (remain 15m 17s) Loss avg.: 0.4451 Grad: 0.2212 LR: 0.00081  
Epoch: [4][500/1929] Elapsed 5m 0s (remain 14m 16s) Loss avg.: 0.4454 Grad: 0.2657 LR: 0.00081  
Epoch: [4][600/1929] Elapsed 6m 0s (remain 13m 15s) Loss avg.: 0.4453 Grad: 0.2119 LR: 0.00081  
Epoch: [4][700/1929] Elapsed 6m 59s (remain 12m 15s) Loss avg.: 0.4451 Grad: 0.2626 LR: 0.00081  
Epoch: [4][800/1929] Elapsed 7m 59s (remain 11m 15s) Loss avg.: 0.4454 Grad: 0.2221 LR: 0.00081  
Epoch: [4][900/1929] Elapsed 8m 59s (remain 10m 15s) Loss avg.: 0.4455 Grad: 0.2816 LR: 0.00081  
Epoch: [4][1000/1929] Elapsed 

Epoch 4 - avg_train_loss: 0.4455  avg_val_loss: 0.4501  time: 1189s
Epoch 4 - Accuracy: 0.8059242503960625
Epoch 4 - Save Best Score: 0.8059 Model


Epoch: [5][0/1929] Elapsed 0m 1s (remain 62m 28s) Loss avg.: 0.4297 Grad: 0.2314 LR: 0.00069  
Epoch: [5][100/1929] Elapsed 1m 1s (remain 18m 36s) Loss avg.: 0.4409 Grad: 0.2679 LR: 0.00069  
Epoch: [5][200/1929] Elapsed 2m 1s (remain 17m 24s) Loss avg.: 0.4416 Grad: 0.1970 LR: 0.00069  
Epoch: [5][300/1929] Elapsed 3m 1s (remain 16m 19s) Loss avg.: 0.4409 Grad: 0.1985 LR: 0.00069  
Epoch: [5][400/1929] Elapsed 4m 0s (remain 15m 17s) Loss avg.: 0.4405 Grad: 0.2858 LR: 0.00069  
Epoch: [5][500/1929] Elapsed 5m 0s (remain 14m 16s) Loss avg.: 0.4406 Grad: 0.2124 LR: 0.00069  
Epoch: [5][600/1929] Elapsed 6m 0s (remain 13m 15s) Loss avg.: 0.4406 Grad: 0.2113 LR: 0.00069  
Epoch: [5][700/1929] Elapsed 6m 59s (remain 12m 15s) Loss avg.: 0.4404 Grad: 0.2562 LR: 0.00069  
Epoch: [5][800/1929] Elapsed 7m 59s (remain 11m 15s) Loss avg.: 0.4405 Grad: 0.3216 LR: 0.00069  
Epoch: [5][900/1929] Elapsed 8m 59s (remain 10m 15s) Loss avg.: 0.4404 Grad: 0.2524 LR: 0.00069  
Epoch: [5][1000/1929] Elapsed

Epoch 5 - avg_train_loss: 0.4407  avg_val_loss: 0.4463  time: 1189s
Epoch 5 - Accuracy: 0.8074720642911173
Epoch 5 - Save Best Score: 0.8075 Model


Epoch: [6][0/1929] Elapsed 0m 1s (remain 60m 25s) Loss avg.: 0.4424 Grad: 0.2609 LR: 0.00055  
Epoch: [6][100/1929] Elapsed 1m 1s (remain 18m 36s) Loss avg.: 0.4354 Grad: 0.2466 LR: 0.00055  
Epoch: [6][200/1929] Elapsed 2m 1s (remain 17m 24s) Loss avg.: 0.4349 Grad: 0.2534 LR: 0.00055  
Epoch: [6][300/1929] Elapsed 3m 1s (remain 16m 21s) Loss avg.: 0.4356 Grad: 0.3243 LR: 0.00055  
Epoch: [6][400/1929] Elapsed 4m 1s (remain 15m 18s) Loss avg.: 0.4356 Grad: 0.2214 LR: 0.00055  
Epoch: [6][500/1929] Elapsed 5m 0s (remain 14m 17s) Loss avg.: 0.4359 Grad: 0.2396 LR: 0.00055  
Epoch: [6][600/1929] Elapsed 6m 0s (remain 13m 16s) Loss avg.: 0.4358 Grad: 0.2260 LR: 0.00055  
Epoch: [6][700/1929] Elapsed 7m 0s (remain 12m 16s) Loss avg.: 0.4360 Grad: 0.2406 LR: 0.00055  
Epoch: [6][800/1929] Elapsed 8m 0s (remain 11m 15s) Loss avg.: 0.4362 Grad: 0.2306 LR: 0.00055  
Epoch: [6][900/1929] Elapsed 8m 59s (remain 10m 15s) Loss avg.: 0.4361 Grad: 0.1933 LR: 0.00055  
Epoch: [6][1000/1929] Elapsed 9

Epoch 6 - avg_train_loss: 0.4363  avg_val_loss: 0.4424  time: 1190s
Epoch 6 - Accuracy: 0.8094337945196894
Epoch 6 - Save Best Score: 0.8094 Model


Epoch: [7][0/1929] Elapsed 0m 1s (remain 61m 1s) Loss avg.: 0.4204 Grad: 0.2405 LR: 0.00041  
Epoch: [7][100/1929] Elapsed 1m 1s (remain 18m 35s) Loss avg.: 0.4309 Grad: 0.2161 LR: 0.00041  
Epoch: [7][200/1929] Elapsed 2m 1s (remain 17m 23s) Loss avg.: 0.4303 Grad: 0.1806 LR: 0.00041  
Epoch: [7][300/1929] Elapsed 3m 1s (remain 16m 18s) Loss avg.: 0.4306 Grad: 0.2300 LR: 0.00041  
Epoch: [7][400/1929] Elapsed 4m 0s (remain 15m 16s) Loss avg.: 0.4307 Grad: 0.1942 LR: 0.00041  
Epoch: [7][500/1929] Elapsed 5m 0s (remain 14m 16s) Loss avg.: 0.4310 Grad: 0.2108 LR: 0.00041  
Epoch: [7][600/1929] Elapsed 6m 0s (remain 13m 15s) Loss avg.: 0.4312 Grad: 0.2072 LR: 0.00041  
Epoch: [7][700/1929] Elapsed 6m 59s (remain 12m 15s) Loss avg.: 0.4315 Grad: 0.2470 LR: 0.00041  
Epoch: [7][800/1929] Elapsed 7m 59s (remain 11m 15s) Loss avg.: 0.4316 Grad: 0.1982 LR: 0.00041  
Epoch: [7][900/1929] Elapsed 8m 59s (remain 10m 14s) Loss avg.: 0.4316 Grad: 0.2337 LR: 0.00041  
Epoch: [7][1000/1929] Elapsed 

Epoch 7 - avg_train_loss: 0.4321  avg_val_loss: 0.4408  time: 1188s
Epoch 7 - Accuracy: 0.8099366154178005
Epoch 7 - Save Best Score: 0.8099 Model


Epoch: [8][0/1929] Elapsed 0m 1s (remain 60m 39s) Loss avg.: 0.4262 Grad: 0.2546 LR: 0.00029  
Epoch: [8][100/1929] Elapsed 1m 1s (remain 18m 34s) Loss avg.: 0.4289 Grad: 0.2551 LR: 0.00029  
Epoch: [8][200/1929] Elapsed 2m 1s (remain 17m 22s) Loss avg.: 0.4274 Grad: 0.2109 LR: 0.00029  
Epoch: [8][300/1929] Elapsed 3m 0s (remain 16m 18s) Loss avg.: 0.4269 Grad: 0.2294 LR: 0.00029  
Epoch: [8][400/1929] Elapsed 4m 0s (remain 15m 16s) Loss avg.: 0.4274 Grad: 0.2514 LR: 0.00029  
Epoch: [8][500/1929] Elapsed 5m 0s (remain 14m 15s) Loss avg.: 0.4274 Grad: 0.2195 LR: 0.00029  
Epoch: [8][600/1929] Elapsed 6m 0s (remain 13m 15s) Loss avg.: 0.4273 Grad: 0.2422 LR: 0.00029  
Epoch: [8][700/1929] Elapsed 6m 59s (remain 12m 15s) Loss avg.: 0.4274 Grad: 0.2258 LR: 0.00029  
Epoch: [8][800/1929] Elapsed 7m 59s (remain 11m 15s) Loss avg.: 0.4276 Grad: 0.2225 LR: 0.00029  
Epoch: [8][900/1929] Elapsed 8m 59s (remain 10m 15s) Loss avg.: 0.4277 Grad: 0.1979 LR: 0.00029  
Epoch: [8][1000/1929] Elapsed

Epoch 8 - avg_train_loss: 0.4280  avg_val_loss: 0.4397  time: 1188s
Epoch 8 - Accuracy: 0.810758618277321
Epoch 8 - Save Best Score: 0.8108 Model


Epoch: [9][0/1929] Elapsed 0m 1s (remain 61m 17s) Loss avg.: 0.4466 Grad: 0.2117 LR: 0.00019  
Epoch: [9][100/1929] Elapsed 1m 1s (remain 18m 36s) Loss avg.: 0.4264 Grad: 0.2260 LR: 0.00019  
Epoch: [9][200/1929] Elapsed 2m 1s (remain 17m 23s) Loss avg.: 0.4249 Grad: 0.2090 LR: 0.00019  
Epoch: [9][300/1929] Elapsed 3m 1s (remain 16m 19s) Loss avg.: 0.4253 Grad: 0.2145 LR: 0.00019  
Epoch: [9][400/1929] Elapsed 4m 0s (remain 15m 17s) Loss avg.: 0.4247 Grad: 0.2265 LR: 0.00019  
Epoch: [9][500/1929] Elapsed 5m 0s (remain 14m 16s) Loss avg.: 0.4242 Grad: 0.2232 LR: 0.00019  
Epoch: [9][600/1929] Elapsed 6m 0s (remain 13m 16s) Loss avg.: 0.4241 Grad: 0.2589 LR: 0.00019  
Epoch: [9][700/1929] Elapsed 7m 0s (remain 12m 15s) Loss avg.: 0.4244 Grad: 0.2284 LR: 0.00019  
Epoch: [9][800/1929] Elapsed 8m 0s (remain 11m 15s) Loss avg.: 0.4245 Grad: 0.2532 LR: 0.00019  
Epoch: [9][900/1929] Elapsed 8m 59s (remain 10m 15s) Loss avg.: 0.4244 Grad: 0.2246 LR: 0.00019  
Epoch: [9][1000/1929] Elapsed 9

Epoch 9 - avg_train_loss: 0.4245  avg_val_loss: 0.4388  time: 1190s
Epoch 9 - Accuracy: 0.8112395773972533
Epoch 9 - Save Best Score: 0.8112 Model


Epoch: [10][0/1929] Elapsed 0m 2s (remain 64m 49s) Loss avg.: 0.3968 Grad: 0.2432 LR: 0.00012  
Epoch: [10][100/1929] Elapsed 1m 1s (remain 18m 42s) Loss avg.: 0.4208 Grad: 0.2499 LR: 0.00012  
Epoch: [10][200/1929] Elapsed 2m 1s (remain 17m 26s) Loss avg.: 0.4212 Grad: 0.2439 LR: 0.00012  
Epoch: [10][300/1929] Elapsed 3m 1s (remain 16m 20s) Loss avg.: 0.4213 Grad: 0.2510 LR: 0.00012  
Epoch: [10][400/1929] Elapsed 4m 0s (remain 15m 18s) Loss avg.: 0.4211 Grad: 0.2770 LR: 0.00012  
Epoch: [10][500/1929] Elapsed 5m 0s (remain 14m 16s) Loss avg.: 0.4209 Grad: 0.2277 LR: 0.00012  
Epoch: [10][600/1929] Elapsed 6m 0s (remain 13m 16s) Loss avg.: 0.4216 Grad: 0.2281 LR: 0.00012  
Epoch: [10][700/1929] Elapsed 6m 59s (remain 12m 15s) Loss avg.: 0.4214 Grad: 0.2260 LR: 0.00012  
Epoch: [10][800/1929] Elapsed 7m 59s (remain 11m 15s) Loss avg.: 0.4215 Grad: 0.2127 LR: 0.00012  
Epoch: [10][900/1929] Elapsed 8m 59s (remain 10m 15s) Loss avg.: 0.4216 Grad: 0.2130 LR: 0.00012  
Epoch: [10][1000/19

Epoch 10 - avg_train_loss: 0.4219  avg_val_loss: 0.4380  time: 1189s
Epoch 10 - Accuracy: 0.8115048336391553
Epoch 10 - Save Best Score: 0.8115 Model
Epoch 10 - Save final model
Score: 0.81150


Failed to load pre-train weight.
Epoch: [1][0/1929] Elapsed 0m 1s (remain 60m 52s) Loss avg.: 1.4034 Grad: 0.3895 LR: 0.00100  
Epoch: [1][100/1929] Elapsed 1m 1s (remain 18m 36s) Loss avg.: 0.7100 Grad: 0.8094 LR: 0.00100  
Epoch: [1][200/1929] Elapsed 2m 1s (remain 17m 24s) Loss avg.: 0.6402 Grad: 0.5947 LR: 0.00100  
Epoch: [1][300/1929] Elapsed 3m 1s (remain 16m 21s) Loss avg.: 0.6067 Grad: 0.9274 LR: 0.00100  
Epoch: [1][400/1929] Elapsed 4m 1s (remain 15m 19s) Loss avg.: 0.5852 Grad: 0.5514 LR: 0.00100  
Epoch: [1][500/1929] Elapsed 5m 1s (remain 14m 18s) Loss avg.: 0.5703 Grad: 0.5996 LR: 0.00100  
Epoch: [1][600/1929] Elapsed 6m 1s (remain 13m 17s) Loss avg.: 0.5587 Grad: 0.5042 LR: 0.00100  
Epoch: [1][700/1929] Elapsed 7m 0s (remain 12m 17s) Loss avg.: 0.5500 Grad: 0.5380 LR: 0.00100  
Epoch: [1][800/1929] Elapsed 8m 0s (remain 11m 16s) Loss avg.: 0.5430 Grad: 0.4001 LR: 0.00100  
Epoch: [1][900/1929] Elapsed 9m 0s (remain 10m 16s) Loss avg.: 0.5371 Grad: 0.4752 LR: 0.00100  

Epoch 1 - avg_train_loss: 0.5050  avg_val_loss: 0.4696  time: 1190s
Epoch 1 - Accuracy: 0.7963706533319536
Epoch 1 - Save Best Score: 0.7964 Model


Epoch: [2][0/1929] Elapsed 0m 1s (remain 60m 22s) Loss avg.: 0.4726 Grad: 0.5146 LR: 0.00098  
Epoch: [2][100/1929] Elapsed 1m 1s (remain 18m 35s) Loss avg.: 0.4667 Grad: 0.3344 LR: 0.00098  
Epoch: [2][200/1929] Elapsed 2m 1s (remain 17m 22s) Loss avg.: 0.4661 Grad: 0.3882 LR: 0.00098  
Epoch: [2][300/1929] Elapsed 3m 0s (remain 16m 18s) Loss avg.: 0.4657 Grad: 0.3008 LR: 0.00098  
Epoch: [2][400/1929] Elapsed 4m 0s (remain 15m 17s) Loss avg.: 0.4654 Grad: 0.3216 LR: 0.00098  
Epoch: [2][500/1929] Elapsed 5m 0s (remain 14m 16s) Loss avg.: 0.4650 Grad: 0.3459 LR: 0.00098  
Epoch: [2][600/1929] Elapsed 6m 0s (remain 13m 15s) Loss avg.: 0.4648 Grad: 0.4052 LR: 0.00098  
Epoch: [2][700/1929] Elapsed 6m 59s (remain 12m 15s) Loss avg.: 0.4644 Grad: 0.2784 LR: 0.00098  
Epoch: [2][800/1929] Elapsed 7m 59s (remain 11m 15s) Loss avg.: 0.4640 Grad: 0.3913 LR: 0.00098  
Epoch: [2][900/1929] Elapsed 8m 59s (remain 10m 15s) Loss avg.: 0.4638 Grad: 0.2798 LR: 0.00098  
Epoch: [2][1000/1929] Elapsed

Epoch 2 - avg_train_loss: 0.4611  avg_val_loss: 0.4571  time: 1189s
Epoch 2 - Accuracy: 0.8024788341550932
Epoch 2 - Save Best Score: 0.8025 Model


Epoch: [3][0/1929] Elapsed 0m 1s (remain 61m 17s) Loss avg.: 0.4795 Grad: 0.2706 LR: 0.00091  
Epoch: [3][100/1929] Elapsed 1m 1s (remain 18m 36s) Loss avg.: 0.4516 Grad: 0.2896 LR: 0.00091  
Epoch: [3][200/1929] Elapsed 2m 1s (remain 17m 23s) Loss avg.: 0.4511 Grad: 0.2766 LR: 0.00091  
Epoch: [3][300/1929] Elapsed 3m 1s (remain 16m 19s) Loss avg.: 0.4521 Grad: 0.3156 LR: 0.00091  
Epoch: [3][400/1929] Elapsed 4m 0s (remain 15m 17s) Loss avg.: 0.4522 Grad: 0.2646 LR: 0.00091  
Epoch: [3][500/1929] Elapsed 5m 0s (remain 14m 16s) Loss avg.: 0.4523 Grad: 0.3159 LR: 0.00091  
Epoch: [3][600/1929] Elapsed 6m 0s (remain 13m 16s) Loss avg.: 0.4520 Grad: 0.2514 LR: 0.00091  
Epoch: [3][700/1929] Elapsed 6m 59s (remain 12m 15s) Loss avg.: 0.4522 Grad: 0.3744 LR: 0.00091  
Epoch: [3][800/1929] Elapsed 7m 59s (remain 11m 15s) Loss avg.: 0.4520 Grad: 0.2104 LR: 0.00091  
Epoch: [3][900/1929] Elapsed 8m 59s (remain 10m 15s) Loss avg.: 0.4521 Grad: 0.2904 LR: 0.00091  
Epoch: [3][1000/1929] Elapsed

Epoch 3 - avg_train_loss: 0.4515  avg_val_loss: 0.4518  time: 1189s
Epoch 3 - Accuracy: 0.8047568314413179
Epoch 3 - Save Best Score: 0.8048 Model


Epoch: [4][0/1929] Elapsed 0m 1s (remain 60m 35s) Loss avg.: 0.4342 Grad: 0.2852 LR: 0.00081  
Epoch: [4][100/1929] Elapsed 1m 1s (remain 18m 34s) Loss avg.: 0.4460 Grad: 0.2107 LR: 0.00081  
Epoch: [4][200/1929] Elapsed 2m 1s (remain 17m 22s) Loss avg.: 0.4451 Grad: 0.2341 LR: 0.00081  
Epoch: [4][300/1929] Elapsed 3m 0s (remain 16m 18s) Loss avg.: 0.4449 Grad: 0.2469 LR: 0.00081  
Epoch: [4][400/1929] Elapsed 4m 0s (remain 15m 16s) Loss avg.: 0.4450 Grad: 0.3040 LR: 0.00081  
Epoch: [4][500/1929] Elapsed 5m 0s (remain 14m 15s) Loss avg.: 0.4447 Grad: 0.2256 LR: 0.00081  
Epoch: [4][600/1929] Elapsed 5m 59s (remain 13m 15s) Loss avg.: 0.4451 Grad: 0.2727 LR: 0.00081  
Epoch: [4][700/1929] Elapsed 6m 59s (remain 12m 15s) Loss avg.: 0.4451 Grad: 0.2202 LR: 0.00081  
Epoch: [4][800/1929] Elapsed 7m 59s (remain 11m 15s) Loss avg.: 0.4452 Grad: 0.2705 LR: 0.00081  
Epoch: [4][900/1929] Elapsed 8m 59s (remain 10m 15s) Loss avg.: 0.4452 Grad: 0.2863 LR: 0.00081  
Epoch: [4][1000/1929] Elapse

Epoch 4 - avg_train_loss: 0.4455  avg_val_loss: 0.4504  time: 1188s
Epoch 4 - Accuracy: 0.8057712179488113
Epoch 4 - Save Best Score: 0.8058 Model


Epoch: [5][0/1929] Elapsed 0m 2s (remain 64m 47s) Loss avg.: 0.4419 Grad: 0.3433 LR: 0.00069  
Epoch: [5][100/1929] Elapsed 1m 2s (remain 18m 42s) Loss avg.: 0.4392 Grad: 0.2085 LR: 0.00069  
Epoch: [5][200/1929] Elapsed 2m 1s (remain 17m 25s) Loss avg.: 0.4393 Grad: 0.2472 LR: 0.00069  
Epoch: [5][300/1929] Elapsed 3m 1s (remain 16m 20s) Loss avg.: 0.4394 Grad: 0.2100 LR: 0.00069  
Epoch: [5][400/1929] Elapsed 4m 0s (remain 15m 18s) Loss avg.: 0.4393 Grad: 0.2735 LR: 0.00069  
Epoch: [5][500/1929] Elapsed 5m 0s (remain 14m 16s) Loss avg.: 0.4399 Grad: 0.2208 LR: 0.00069  
Epoch: [5][600/1929] Elapsed 6m 0s (remain 13m 16s) Loss avg.: 0.4398 Grad: 0.3591 LR: 0.00069  
Epoch: [5][700/1929] Elapsed 6m 59s (remain 12m 15s) Loss avg.: 0.4397 Grad: 0.1805 LR: 0.00069  
Epoch: [5][800/1929] Elapsed 7m 59s (remain 11m 15s) Loss avg.: 0.4397 Grad: 0.2726 LR: 0.00069  
Epoch: [5][900/1929] Elapsed 8m 59s (remain 10m 15s) Loss avg.: 0.4398 Grad: 0.2436 LR: 0.00069  
Epoch: [5][1000/1929] Elapsed

Epoch 5 - avg_train_loss: 0.4406  avg_val_loss: 0.4456  time: 1189s
Epoch 5 - Accuracy: 0.8077154587548405
Epoch 5 - Save Best Score: 0.8077 Model


Epoch: [6][0/1929] Elapsed 0m 1s (remain 60m 34s) Loss avg.: 0.4140 Grad: 0.1858 LR: 0.00055  
Epoch: [6][100/1929] Elapsed 1m 1s (remain 18m 36s) Loss avg.: 0.4361 Grad: 0.3307 LR: 0.00055  
Epoch: [6][200/1929] Elapsed 2m 1s (remain 17m 25s) Loss avg.: 0.4351 Grad: 0.2311 LR: 0.00055  
Epoch: [6][300/1929] Elapsed 3m 1s (remain 16m 20s) Loss avg.: 0.4348 Grad: 0.2047 LR: 0.00055  
Epoch: [6][400/1929] Elapsed 4m 0s (remain 15m 18s) Loss avg.: 0.4344 Grad: 0.1955 LR: 0.00055  
Epoch: [6][500/1929] Elapsed 5m 0s (remain 14m 16s) Loss avg.: 0.4347 Grad: 0.2918 LR: 0.00055  
Epoch: [6][600/1929] Elapsed 6m 0s (remain 13m 16s) Loss avg.: 0.4352 Grad: 0.2382 LR: 0.00055  
Epoch: [6][700/1929] Elapsed 6m 59s (remain 12m 15s) Loss avg.: 0.4354 Grad: 0.2667 LR: 0.00055  
Epoch: [6][800/1929] Elapsed 7m 59s (remain 11m 15s) Loss avg.: 0.4357 Grad: 0.2156 LR: 0.00055  
Epoch: [6][900/1929] Elapsed 8m 59s (remain 10m 15s) Loss avg.: 0.4357 Grad: 0.2764 LR: 0.00055  
Epoch: [6][1000/1929] Elapsed

Epoch 6 - avg_train_loss: 0.4362  avg_val_loss: 0.4455  time: 1189s
Epoch 6 - Accuracy: 0.8081045984064221
Epoch 6 - Save Best Score: 0.8081 Model


Epoch: [7][0/1929] Elapsed 0m 1s (remain 61m 47s) Loss avg.: 0.4310 Grad: 0.3713 LR: 0.00041  
Epoch: [7][100/1929] Elapsed 1m 1s (remain 18m 38s) Loss avg.: 0.4304 Grad: 0.2931 LR: 0.00041  
Epoch: [7][200/1929] Elapsed 2m 1s (remain 17m 25s) Loss avg.: 0.4307 Grad: 0.2507 LR: 0.00041  
Epoch: [7][300/1929] Elapsed 3m 1s (remain 16m 21s) Loss avg.: 0.4306 Grad: 0.1908 LR: 0.00041  
Epoch: [7][400/1929] Elapsed 4m 1s (remain 15m 19s) Loss avg.: 0.4311 Grad: 0.2249 LR: 0.00041  
Epoch: [7][500/1929] Elapsed 5m 0s (remain 14m 17s) Loss avg.: 0.4316 Grad: 0.2853 LR: 0.00041  
Epoch: [7][600/1929] Elapsed 6m 0s (remain 13m 16s) Loss avg.: 0.4316 Grad: 0.2553 LR: 0.00041  
Epoch: [7][700/1929] Elapsed 7m 0s (remain 12m 16s) Loss avg.: 0.4315 Grad: 0.2123 LR: 0.00041  
Epoch: [7][800/1929] Elapsed 7m 59s (remain 11m 15s) Loss avg.: 0.4315 Grad: 0.2656 LR: 0.00041  
Epoch: [7][900/1929] Elapsed 8m 59s (remain 10m 15s) Loss avg.: 0.4313 Grad: 0.2584 LR: 0.00041  
Epoch: [7][1000/1929] Elapsed 

Epoch 7 - avg_train_loss: 0.4320  avg_val_loss: 0.4416  time: 1189s
Epoch 7 - Accuracy: 0.8098389661419354
Epoch 7 - Save Best Score: 0.8098 Model


Epoch: [8][0/1929] Elapsed 0m 1s (remain 60m 43s) Loss avg.: 0.4285 Grad: 0.2467 LR: 0.00029  
Epoch: [8][100/1929] Elapsed 1m 1s (remain 18m 36s) Loss avg.: 0.4286 Grad: 0.2324 LR: 0.00029  
Epoch: [8][200/1929] Elapsed 2m 1s (remain 17m 23s) Loss avg.: 0.4284 Grad: 0.2289 LR: 0.00029  
Epoch: [8][300/1929] Elapsed 3m 1s (remain 16m 19s) Loss avg.: 0.4280 Grad: 0.2524 LR: 0.00029  
Epoch: [8][400/1929] Elapsed 4m 0s (remain 15m 17s) Loss avg.: 0.4281 Grad: 0.2228 LR: 0.00029  
Epoch: [8][500/1929] Elapsed 5m 0s (remain 14m 17s) Loss avg.: 0.4278 Grad: 0.2130 LR: 0.00029  
Epoch: [8][600/1929] Elapsed 6m 0s (remain 13m 16s) Loss avg.: 0.4281 Grad: 0.2173 LR: 0.00029  
Epoch: [8][700/1929] Elapsed 6m 59s (remain 12m 15s) Loss avg.: 0.4280 Grad: 0.2224 LR: 0.00029  
Epoch: [8][800/1929] Elapsed 7m 59s (remain 11m 15s) Loss avg.: 0.4279 Grad: 0.2812 LR: 0.00029  
Epoch: [8][900/1929] Elapsed 8m 59s (remain 10m 15s) Loss avg.: 0.4278 Grad: 0.2539 LR: 0.00029  
Epoch: [8][1000/1929] Elapsed

Epoch 8 - avg_train_loss: 0.4281  avg_val_loss: 0.4390  time: 1189s
Epoch 8 - Accuracy: 0.8113620033550543
Epoch 8 - Save Best Score: 0.8114 Model


Epoch: [9][0/1929] Elapsed 0m 1s (remain 60m 58s) Loss avg.: 0.4171 Grad: 0.2892 LR: 0.00019  
Epoch: [9][100/1929] Elapsed 1m 1s (remain 18m 33s) Loss avg.: 0.4241 Grad: 0.2028 LR: 0.00019  
Epoch: [9][200/1929] Elapsed 2m 1s (remain 17m 21s) Loss avg.: 0.4239 Grad: 0.2792 LR: 0.00019  
Epoch: [9][300/1929] Elapsed 3m 0s (remain 16m 18s) Loss avg.: 0.4240 Grad: 0.2153 LR: 0.00019  
Epoch: [9][400/1929] Elapsed 4m 0s (remain 15m 16s) Loss avg.: 0.4241 Grad: 0.2573 LR: 0.00019  
Epoch: [9][500/1929] Elapsed 5m 0s (remain 14m 15s) Loss avg.: 0.4242 Grad: 0.2443 LR: 0.00019  
Epoch: [9][600/1929] Elapsed 6m 0s (remain 13m 15s) Loss avg.: 0.4241 Grad: 0.2160 LR: 0.00019  
Epoch: [9][700/1929] Elapsed 6m 59s (remain 12m 15s) Loss avg.: 0.4240 Grad: 0.2429 LR: 0.00019  
Epoch: [9][800/1929] Elapsed 7m 59s (remain 11m 14s) Loss avg.: 0.4241 Grad: 0.2742 LR: 0.00019  
Epoch: [9][900/1929] Elapsed 8m 58s (remain 10m 14s) Loss avg.: 0.4240 Grad: 0.2156 LR: 0.00019  
Epoch: [9][1000/1929] Elapsed

Epoch 9 - avg_train_loss: 0.4246  avg_val_loss: 0.4384  time: 1188s
Epoch 9 - Accuracy: 0.8116053978187775
Epoch 9 - Save Best Score: 0.8116 Model


Epoch: [10][0/1929] Elapsed 0m 1s (remain 60m 28s) Loss avg.: 0.4150 Grad: 0.1967 LR: 0.00012  
Epoch: [10][100/1929] Elapsed 1m 1s (remain 18m 35s) Loss avg.: 0.4214 Grad: 0.2108 LR: 0.00012  
Epoch: [10][200/1929] Elapsed 2m 1s (remain 17m 23s) Loss avg.: 0.4206 Grad: 0.2401 LR: 0.00012  
Epoch: [10][300/1929] Elapsed 3m 1s (remain 16m 19s) Loss avg.: 0.4212 Grad: 0.2132 LR: 0.00012  
Epoch: [10][400/1929] Elapsed 4m 0s (remain 15m 17s) Loss avg.: 0.4215 Grad: 0.2303 LR: 0.00012  
Epoch: [10][500/1929] Elapsed 5m 0s (remain 14m 16s) Loss avg.: 0.4217 Grad: 0.2490 LR: 0.00012  
Epoch: [10][600/1929] Elapsed 6m 0s (remain 13m 15s) Loss avg.: 0.4219 Grad: 0.2089 LR: 0.00012  
Epoch: [10][700/1929] Elapsed 6m 59s (remain 12m 15s) Loss avg.: 0.4221 Grad: 0.2310 LR: 0.00012  
Epoch: [10][800/1929] Elapsed 7m 59s (remain 11m 15s) Loss avg.: 0.4219 Grad: 0.2014 LR: 0.00012  
Epoch: [10][900/1929] Elapsed 8m 59s (remain 10m 15s) Loss avg.: 0.4219 Grad: 0.2686 LR: 0.00012  
Epoch: [10][1000/19

Epoch 10 - avg_train_loss: 0.4220  avg_val_loss: 0.4381  time: 1189s
Epoch 10 - Accuracy: 0.8116564086345279
Epoch 10 - Save Best Score: 0.8117 Model
Epoch 10 - Save final model
Score: 0.81166


Failed to load pre-train weight.
Epoch: [1][0/1929] Elapsed 0m 2s (remain 65m 16s) Loss avg.: 1.3965 Grad: 0.3519 LR: 0.00100  
Epoch: [1][100/1929] Elapsed 1m 2s (remain 18m 42s) Loss avg.: 0.7022 Grad: 0.7898 LR: 0.00100  
Epoch: [1][200/1929] Elapsed 2m 1s (remain 17m 27s) Loss avg.: 0.6341 Grad: 0.6242 LR: 0.00100  
Epoch: [1][300/1929] Elapsed 3m 1s (remain 16m 21s) Loss avg.: 0.6010 Grad: 0.5542 LR: 0.00100  
Epoch: [1][400/1929] Elapsed 4m 1s (remain 15m 19s) Loss avg.: 0.5801 Grad: 0.6988 LR: 0.00100  
Epoch: [1][500/1929] Elapsed 5m 1s (remain 14m 17s) Loss avg.: 0.5648 Grad: 0.5629 LR: 0.00100  
Epoch: [1][600/1929] Elapsed 6m 0s (remain 13m 17s) Loss avg.: 0.5539 Grad: 0.4368 LR: 0.00100  
Epoch: [1][700/1929] Elapsed 7m 0s (remain 12m 16s) Loss avg.: 0.5452 Grad: 0.6940 LR: 0.00100  
Epoch: [1][800/1929] Elapsed 8m 0s (remain 11m 16s) Loss avg.: 0.5381 Grad: 0.5456 LR: 0.00100  
Epoch: [1][900/1929] Elapsed 9m 0s (remain 10m 16s) Loss avg.: 0.5324 Grad: 0.3052 LR: 0.00100  

Epoch 1 - avg_train_loss: 0.5019  avg_val_loss: 0.4740  time: 1190s
Epoch 1 - Accuracy: 0.7934178558259453
Epoch 1 - Save Best Score: 0.7934 Model


Epoch: [2][0/1929] Elapsed 0m 1s (remain 60m 32s) Loss avg.: 0.4440 Grad: 0.4162 LR: 0.00098  
Epoch: [2][100/1929] Elapsed 1m 1s (remain 18m 36s) Loss avg.: 0.4638 Grad: 0.3140 LR: 0.00098  
Epoch: [2][200/1929] Elapsed 2m 1s (remain 17m 23s) Loss avg.: 0.4641 Grad: 0.3084 LR: 0.00098  
Epoch: [2][300/1929] Elapsed 3m 1s (remain 16m 20s) Loss avg.: 0.4634 Grad: 0.3332 LR: 0.00098  
Epoch: [2][400/1929] Elapsed 4m 0s (remain 15m 18s) Loss avg.: 0.4636 Grad: 0.2944 LR: 0.00098  
Epoch: [2][500/1929] Elapsed 5m 0s (remain 14m 16s) Loss avg.: 0.4635 Grad: 0.2809 LR: 0.00098  
Epoch: [2][600/1929] Elapsed 6m 0s (remain 13m 16s) Loss avg.: 0.4635 Grad: 0.3532 LR: 0.00098  
Epoch: [2][700/1929] Elapsed 6m 59s (remain 12m 15s) Loss avg.: 0.4627 Grad: 0.4812 LR: 0.00098  
Epoch: [2][800/1929] Elapsed 7m 59s (remain 11m 15s) Loss avg.: 0.4623 Grad: 0.2701 LR: 0.00098  
Epoch: [2][900/1929] Elapsed 8m 59s (remain 10m 15s) Loss avg.: 0.4621 Grad: 0.3597 LR: 0.00098  
Epoch: [2][1000/1929] Elapsed

Epoch 2 - avg_train_loss: 0.4596  avg_val_loss: 0.4575  time: 1189s
Epoch 2 - Accuracy: 0.8025327598746008
Epoch 2 - Save Best Score: 0.8025 Model


Epoch: [3][0/1929] Elapsed 0m 1s (remain 61m 29s) Loss avg.: 0.4484 Grad: 0.2712 LR: 0.00091  
Epoch: [3][100/1929] Elapsed 1m 1s (remain 18m 37s) Loss avg.: 0.4492 Grad: 0.3404 LR: 0.00091  
Epoch: [3][200/1929] Elapsed 2m 1s (remain 17m 24s) Loss avg.: 0.4499 Grad: 0.3410 LR: 0.00091  
Epoch: [3][300/1929] Elapsed 3m 1s (remain 16m 20s) Loss avg.: 0.4507 Grad: 0.3869 LR: 0.00091  
Epoch: [3][400/1929] Elapsed 4m 1s (remain 15m 19s) Loss avg.: 0.4506 Grad: 0.3723 LR: 0.00091  
Epoch: [3][500/1929] Elapsed 5m 0s (remain 14m 17s) Loss avg.: 0.4511 Grad: 0.3303 LR: 0.00091  
Epoch: [3][600/1929] Elapsed 6m 0s (remain 13m 16s) Loss avg.: 0.4510 Grad: 0.3026 LR: 0.00091  
Epoch: [3][700/1929] Elapsed 7m 0s (remain 12m 16s) Loss avg.: 0.4509 Grad: 0.3271 LR: 0.00091  
Epoch: [3][800/1929] Elapsed 7m 59s (remain 11m 15s) Loss avg.: 0.4508 Grad: 0.2402 LR: 0.00091  
Epoch: [3][900/1929] Elapsed 8m 59s (remain 10m 15s) Loss avg.: 0.4507 Grad: 0.2172 LR: 0.00091  
Epoch: [3][1000/1929] Elapsed 

Epoch 3 - avg_train_loss: 0.4502  avg_val_loss: 0.4534  time: 1190s
Epoch 3 - Accuracy: 0.8037949132014534
Epoch 3 - Save Best Score: 0.8038 Model


Epoch: [4][0/1929] Elapsed 0m 1s (remain 61m 0s) Loss avg.: 0.4531 Grad: 0.3007 LR: 0.00081  
Epoch: [4][100/1929] Elapsed 1m 1s (remain 18m 38s) Loss avg.: 0.4442 Grad: 0.2262 LR: 0.00081  
Epoch: [4][200/1929] Elapsed 2m 1s (remain 17m 24s) Loss avg.: 0.4442 Grad: 0.2712 LR: 0.00081  
Epoch: [4][300/1929] Elapsed 3m 1s (remain 16m 19s) Loss avg.: 0.4441 Grad: 0.2335 LR: 0.00081  
Epoch: [4][400/1929] Elapsed 4m 0s (remain 15m 17s) Loss avg.: 0.4438 Grad: 0.2189 LR: 0.00081  
Epoch: [4][500/1929] Elapsed 5m 0s (remain 14m 16s) Loss avg.: 0.4441 Grad: 0.2088 LR: 0.00081  
Epoch: [4][600/1929] Elapsed 6m 0s (remain 13m 16s) Loss avg.: 0.4443 Grad: 0.2212 LR: 0.00081  
Epoch: [4][700/1929] Elapsed 7m 0s (remain 12m 15s) Loss avg.: 0.4444 Grad: 0.2191 LR: 0.00081  
Epoch: [4][800/1929] Elapsed 7m 59s (remain 11m 15s) Loss avg.: 0.4444 Grad: 0.2191 LR: 0.00081  
Epoch: [4][900/1929] Elapsed 8m 59s (remain 10m 15s) Loss avg.: 0.4446 Grad: 0.2491 LR: 0.00081  
Epoch: [4][1000/1929] Elapsed 9

Epoch 4 - avg_train_loss: 0.4443  avg_val_loss: 0.4489  time: 1190s
Epoch 4 - Accuracy: 0.806135580918457
Epoch 4 - Save Best Score: 0.8061 Model


Epoch: [5][0/1929] Elapsed 0m 1s (remain 61m 49s) Loss avg.: 0.4415 Grad: 0.2395 LR: 0.00069  
Epoch: [5][100/1929] Elapsed 1m 1s (remain 18m 37s) Loss avg.: 0.4402 Grad: 0.1980 LR: 0.00069  
Epoch: [5][200/1929] Elapsed 2m 1s (remain 17m 25s) Loss avg.: 0.4397 Grad: 0.2381 LR: 0.00069  
Epoch: [5][300/1929] Elapsed 3m 1s (remain 16m 20s) Loss avg.: 0.4389 Grad: 0.2324 LR: 0.00069  
Epoch: [5][400/1929] Elapsed 4m 1s (remain 15m 18s) Loss avg.: 0.4385 Grad: 0.3042 LR: 0.00069  
Epoch: [5][500/1929] Elapsed 5m 0s (remain 14m 17s) Loss avg.: 0.4386 Grad: 0.2605 LR: 0.00069  
Epoch: [5][600/1929] Elapsed 6m 0s (remain 13m 16s) Loss avg.: 0.4390 Grad: 0.1978 LR: 0.00069  
Epoch: [5][700/1929] Elapsed 7m 0s (remain 12m 16s) Loss avg.: 0.4389 Grad: 0.2299 LR: 0.00069  
Epoch: [5][800/1929] Elapsed 8m 0s (remain 11m 16s) Loss avg.: 0.4390 Grad: 0.2499 LR: 0.00069  
Epoch: [5][900/1929] Elapsed 8m 59s (remain 10m 15s) Loss avg.: 0.4392 Grad: 0.2755 LR: 0.00069  
Epoch: [5][1000/1929] Elapsed 9

Epoch 5 - avg_train_loss: 0.4395  avg_val_loss: 0.4465  time: 1190s
Epoch 5 - Accuracy: 0.8073685852077379
Epoch 5 - Save Best Score: 0.8074 Model


Epoch: [6][0/1929] Elapsed 0m 2s (remain 65m 6s) Loss avg.: 0.4379 Grad: 0.2554 LR: 0.00055  
Epoch: [6][100/1929] Elapsed 1m 2s (remain 18m 42s) Loss avg.: 0.4334 Grad: 0.2308 LR: 0.00055  
Epoch: [6][200/1929] Elapsed 2m 1s (remain 17m 25s) Loss avg.: 0.4335 Grad: 0.2832 LR: 0.00055  
Epoch: [6][300/1929] Elapsed 3m 1s (remain 16m 20s) Loss avg.: 0.4342 Grad: 0.1802 LR: 0.00055  
Epoch: [6][400/1929] Elapsed 4m 1s (remain 15m 18s) Loss avg.: 0.4346 Grad: 0.2301 LR: 0.00055  
Epoch: [6][500/1929] Elapsed 5m 0s (remain 14m 17s) Loss avg.: 0.4352 Grad: 0.3057 LR: 0.00055  
Epoch: [6][600/1929] Elapsed 6m 0s (remain 13m 16s) Loss avg.: 0.4351 Grad: 0.2436 LR: 0.00055  
Epoch: [6][700/1929] Elapsed 7m 0s (remain 12m 15s) Loss avg.: 0.4349 Grad: 0.1965 LR: 0.00055  
Epoch: [6][800/1929] Elapsed 7m 59s (remain 11m 15s) Loss avg.: 0.4351 Grad: 0.2265 LR: 0.00055  
Epoch: [6][900/1929] Elapsed 8m 59s (remain 10m 15s) Loss avg.: 0.4350 Grad: 0.2089 LR: 0.00055  
Epoch: [6][1000/1929] Elapsed 9

Epoch 6 - avg_train_loss: 0.4352  avg_val_loss: 0.4433  time: 1189s
Epoch 6 - Accuracy: 0.8093463474069745
Epoch 6 - Save Best Score: 0.8093 Model


Epoch: [7][0/1929] Elapsed 0m 1s (remain 60m 49s) Loss avg.: 0.4310 Grad: 0.1967 LR: 0.00041  
Epoch: [7][100/1929] Elapsed 1m 1s (remain 18m 37s) Loss avg.: 0.4303 Grad: 0.2402 LR: 0.00041  
Epoch: [7][200/1929] Elapsed 2m 1s (remain 17m 26s) Loss avg.: 0.4294 Grad: 0.1781 LR: 0.00041  
Epoch: [7][300/1929] Elapsed 3m 1s (remain 16m 21s) Loss avg.: 0.4302 Grad: 0.2110 LR: 0.00041  
Epoch: [7][400/1929] Elapsed 4m 1s (remain 15m 19s) Loss avg.: 0.4303 Grad: 0.2520 LR: 0.00041  
Epoch: [7][500/1929] Elapsed 5m 0s (remain 14m 17s) Loss avg.: 0.4305 Grad: 0.2065 LR: 0.00041  
Epoch: [7][600/1929] Elapsed 6m 0s (remain 13m 16s) Loss avg.: 0.4303 Grad: 0.2036 LR: 0.00041  
Epoch: [7][700/1929] Elapsed 7m 0s (remain 12m 16s) Loss avg.: 0.4302 Grad: 0.2469 LR: 0.00041  
Epoch: [7][800/1929] Elapsed 8m 0s (remain 11m 15s) Loss avg.: 0.4307 Grad: 0.2064 LR: 0.00041  
Epoch: [7][900/1929] Elapsed 8m 59s (remain 10m 15s) Loss avg.: 0.4308 Grad: 0.2039 LR: 0.00041  
Epoch: [7][1000/1929] Elapsed 9

Epoch 7 - avg_train_loss: 0.4311  avg_val_loss: 0.4423  time: 1190s
Epoch 7 - Accuracy: 0.8097690084517635
Epoch 7 - Save Best Score: 0.8098 Model


Epoch: [8][0/1929] Elapsed 0m 1s (remain 61m 11s) Loss avg.: 0.4050 Grad: 0.2269 LR: 0.00029  
Epoch: [8][100/1929] Elapsed 1m 1s (remain 18m 36s) Loss avg.: 0.4267 Grad: 0.2039 LR: 0.00029  
Epoch: [8][200/1929] Elapsed 2m 1s (remain 17m 23s) Loss avg.: 0.4261 Grad: 0.1924 LR: 0.00029  
Epoch: [8][300/1929] Elapsed 3m 1s (remain 16m 19s) Loss avg.: 0.4261 Grad: 0.2097 LR: 0.00029  
Epoch: [8][400/1929] Elapsed 4m 1s (remain 15m 18s) Loss avg.: 0.4258 Grad: 0.2098 LR: 0.00029  
Epoch: [8][500/1929] Elapsed 5m 0s (remain 14m 17s) Loss avg.: 0.4257 Grad: 0.2148 LR: 0.00029  
Epoch: [8][600/1929] Elapsed 6m 0s (remain 13m 16s) Loss avg.: 0.4261 Grad: 0.2194 LR: 0.00029  
Epoch: [8][700/1929] Elapsed 7m 0s (remain 12m 15s) Loss avg.: 0.4262 Grad: 0.2430 LR: 0.00029  
Epoch: [8][800/1929] Elapsed 7m 59s (remain 11m 15s) Loss avg.: 0.4263 Grad: 0.2686 LR: 0.00029  
Epoch: [8][900/1929] Elapsed 8m 59s (remain 10m 15s) Loss avg.: 0.4263 Grad: 0.2253 LR: 0.00029  
Epoch: [8][1000/1929] Elapsed 

Epoch 8 - avg_train_loss: 0.4272  avg_val_loss: 0.4399  time: 1190s
Epoch 8 - Accuracy: 0.8107440437585351
Epoch 8 - Save Best Score: 0.8107 Model


Epoch: [9][0/1929] Elapsed 0m 1s (remain 61m 49s) Loss avg.: 0.4257 Grad: 0.2384 LR: 0.00019  
Epoch: [9][100/1929] Elapsed 1m 1s (remain 18m 37s) Loss avg.: 0.4243 Grad: 0.2112 LR: 0.00019  
Epoch: [9][200/1929] Elapsed 2m 1s (remain 17m 24s) Loss avg.: 0.4230 Grad: 0.2035 LR: 0.00019  
Epoch: [9][300/1929] Elapsed 3m 1s (remain 16m 19s) Loss avg.: 0.4229 Grad: 0.2374 LR: 0.00019  
Epoch: [9][400/1929] Elapsed 4m 0s (remain 15m 17s) Loss avg.: 0.4228 Grad: 0.2027 LR: 0.00019  
Epoch: [9][500/1929] Elapsed 5m 0s (remain 14m 17s) Loss avg.: 0.4231 Grad: 0.1999 LR: 0.00019  
Epoch: [9][600/1929] Elapsed 6m 0s (remain 13m 16s) Loss avg.: 0.4230 Grad: 0.2709 LR: 0.00019  
Epoch: [9][700/1929] Elapsed 7m 0s (remain 12m 16s) Loss avg.: 0.4231 Grad: 0.2392 LR: 0.00019  
Epoch: [9][800/1929] Elapsed 8m 0s (remain 11m 16s) Loss avg.: 0.4234 Grad: 0.2200 LR: 0.00019  
Epoch: [9][900/1929] Elapsed 8m 59s (remain 10m 15s) Loss avg.: 0.4233 Grad: 0.2330 LR: 0.00019  
Epoch: [9][1000/1929] Elapsed 9

Epoch 9 - avg_train_loss: 0.4238  avg_val_loss: 0.4390  time: 1190s
Epoch 9 - Accuracy: 0.8109407997621438
Epoch 9 - Save Best Score: 0.8109 Model


Epoch: [10][0/1929] Elapsed 0m 1s (remain 61m 19s) Loss avg.: 0.4229 Grad: 0.2180 LR: 0.00012  
Epoch: [10][100/1929] Elapsed 1m 1s (remain 18m 38s) Loss avg.: 0.4209 Grad: 0.2097 LR: 0.00012  
Epoch: [10][200/1929] Elapsed 2m 1s (remain 17m 25s) Loss avg.: 0.4207 Grad: 0.2042 LR: 0.00012  
Epoch: [10][300/1929] Elapsed 3m 1s (remain 16m 20s) Loss avg.: 0.4205 Grad: 0.2166 LR: 0.00012  
Epoch: [10][400/1929] Elapsed 4m 1s (remain 15m 18s) Loss avg.: 0.4208 Grad: 0.2199 LR: 0.00012  
Epoch: [10][500/1929] Elapsed 5m 0s (remain 14m 17s) Loss avg.: 0.4206 Grad: 0.2477 LR: 0.00012  
Epoch: [10][600/1929] Elapsed 6m 0s (remain 13m 16s) Loss avg.: 0.4207 Grad: 0.2219 LR: 0.00012  
Epoch: [10][700/1929] Elapsed 7m 0s (remain 12m 16s) Loss avg.: 0.4205 Grad: 0.2185 LR: 0.00012  
Epoch: [10][800/1929] Elapsed 7m 59s (remain 11m 15s) Loss avg.: 0.4206 Grad: 0.2460 LR: 0.00012  
Epoch: [10][900/1929] Elapsed 8m 59s (remain 10m 15s) Loss avg.: 0.4206 Grad: 0.2248 LR: 0.00012  
Epoch: [10][1000/192

Epoch 10 - avg_train_loss: 0.4212  avg_val_loss: 0.4381  time: 1189s
Epoch 10 - Accuracy: 0.811429046141469
Epoch 10 - Save Best Score: 0.8114 Model
Epoch 10 - Save final model
Score: 0.81143


Failed to load pre-train weight.
Epoch: [1][0/1929] Elapsed 0m 1s (remain 62m 56s) Loss avg.: 1.3970 Grad: 0.3706 LR: 0.00100  
Epoch: [1][100/1929] Elapsed 1m 1s (remain 18m 38s) Loss avg.: 0.6945 Grad: 0.5884 LR: 0.00100  
Epoch: [1][200/1929] Elapsed 2m 1s (remain 17m 26s) Loss avg.: 0.6309 Grad: 0.5877 LR: 0.00100  
Epoch: [1][300/1929] Elapsed 3m 1s (remain 16m 22s) Loss avg.: 0.6005 Grad: 0.8615 LR: 0.00100  
Epoch: [1][400/1929] Elapsed 4m 1s (remain 15m 20s) Loss avg.: 0.5815 Grad: 0.8059 LR: 0.00100  
Epoch: [1][500/1929] Elapsed 5m 1s (remain 14m 18s) Loss avg.: 0.5676 Grad: 0.4923 LR: 0.00100  
Epoch: [1][600/1929] Elapsed 6m 1s (remain 13m 18s) Loss avg.: 0.5569 Grad: 0.4533 LR: 0.00100  
Epoch: [1][700/1929] Elapsed 7m 0s (remain 12m 17s) Loss avg.: 0.5485 Grad: 0.5088 LR: 0.00100  
Epoch: [1][800/1929] Elapsed 8m 0s (remain 11m 17s) Loss avg.: 0.5412 Grad: 0.6085 LR: 0.00100  
Epoch: [1][900/1929] Elapsed 9m 0s (remain 10m 16s) Loss avg.: 0.5353 Grad: 0.4452 LR: 0.00100  

Epoch 1 - avg_train_loss: 0.5040  avg_val_loss: 0.4720  time: 1190s
Epoch 1 - Accuracy: 0.7948796800601636
Epoch 1 - Save Best Score: 0.7949 Model


Epoch: [2][0/1929] Elapsed 0m 2s (remain 66m 24s) Loss avg.: 0.4699 Grad: 0.5035 LR: 0.00098  
Epoch: [2][100/1929] Elapsed 1m 2s (remain 18m 44s) Loss avg.: 0.4654 Grad: 0.4325 LR: 0.00098  
Epoch: [2][200/1929] Elapsed 2m 1s (remain 17m 27s) Loss avg.: 0.4645 Grad: 0.3415 LR: 0.00098  
Epoch: [2][300/1929] Elapsed 3m 1s (remain 16m 21s) Loss avg.: 0.4653 Grad: 0.5049 LR: 0.00098  
Epoch: [2][400/1929] Elapsed 4m 1s (remain 15m 19s) Loss avg.: 0.4648 Grad: 0.4221 LR: 0.00098  
Epoch: [2][500/1929] Elapsed 5m 0s (remain 14m 17s) Loss avg.: 0.4642 Grad: 0.2910 LR: 0.00098  
Epoch: [2][600/1929] Elapsed 6m 0s (remain 13m 16s) Loss avg.: 0.4640 Grad: 0.4179 LR: 0.00098  
Epoch: [2][700/1929] Elapsed 7m 0s (remain 12m 16s) Loss avg.: 0.4636 Grad: 0.4707 LR: 0.00098  
Epoch: [2][800/1929] Elapsed 7m 59s (remain 11m 15s) Loss avg.: 0.4634 Grad: 0.2861 LR: 0.00098  
Epoch: [2][900/1929] Elapsed 8m 59s (remain 10m 15s) Loss avg.: 0.4631 Grad: 0.3326 LR: 0.00098  
Epoch: [2][1000/1929] Elapsed 

Epoch 2 - avg_train_loss: 0.4603  avg_val_loss: 0.4589  time: 1189s
Epoch 2 - Accuracy: 0.8012720639996269
Epoch 2 - Save Best Score: 0.8013 Model


Epoch: [3][0/1929] Elapsed 0m 1s (remain 62m 14s) Loss avg.: 0.4304 Grad: 0.3626 LR: 0.00091  
Epoch: [3][100/1929] Elapsed 1m 1s (remain 18m 37s) Loss avg.: 0.4518 Grad: 0.3118 LR: 0.00091  
Epoch: [3][200/1929] Elapsed 2m 1s (remain 17m 24s) Loss avg.: 0.4518 Grad: 0.2955 LR: 0.00091  
Epoch: [3][300/1929] Elapsed 3m 1s (remain 16m 21s) Loss avg.: 0.4517 Grad: 0.2851 LR: 0.00091  
Epoch: [3][400/1929] Elapsed 4m 1s (remain 15m 19s) Loss avg.: 0.4515 Grad: 0.2735 LR: 0.00091  
Epoch: [3][500/1929] Elapsed 5m 1s (remain 14m 17s) Loss avg.: 0.4513 Grad: 0.3510 LR: 0.00091  
Epoch: [3][600/1929] Elapsed 6m 0s (remain 13m 17s) Loss avg.: 0.4514 Grad: 0.3197 LR: 0.00091  
Epoch: [3][700/1929] Elapsed 7m 0s (remain 12m 16s) Loss avg.: 0.4515 Grad: 0.2680 LR: 0.00091  
Epoch: [3][800/1929] Elapsed 8m 0s (remain 11m 16s) Loss avg.: 0.4513 Grad: 0.2497 LR: 0.00091  
Epoch: [3][900/1929] Elapsed 8m 59s (remain 10m 16s) Loss avg.: 0.4512 Grad: 0.3352 LR: 0.00091  
Epoch: [3][1000/1929] Elapsed 9

Epoch 3 - avg_train_loss: 0.4506  avg_val_loss: 0.4534  time: 1190s
Epoch 3 - Accuracy: 0.8043312554927717
Epoch 3 - Save Best Score: 0.8043 Model


Epoch: [4][0/1929] Elapsed 0m 1s (remain 62m 4s) Loss avg.: 0.4551 Grad: 0.2459 LR: 0.00081  
Epoch: [4][100/1929] Elapsed 1m 1s (remain 18m 37s) Loss avg.: 0.4446 Grad: 0.2601 LR: 0.00081  
Epoch: [4][200/1929] Elapsed 2m 1s (remain 17m 24s) Loss avg.: 0.4443 Grad: 0.2634 LR: 0.00081  
Epoch: [4][300/1929] Elapsed 3m 1s (remain 16m 19s) Loss avg.: 0.4451 Grad: 0.2668 LR: 0.00081  
Epoch: [4][400/1929] Elapsed 4m 1s (remain 15m 18s) Loss avg.: 0.4453 Grad: 0.2722 LR: 0.00081  
Epoch: [4][500/1929] Elapsed 5m 0s (remain 14m 17s) Loss avg.: 0.4447 Grad: 0.3394 LR: 0.00081  
Epoch: [4][600/1929] Elapsed 6m 0s (remain 13m 16s) Loss avg.: 0.4446 Grad: 0.2945 LR: 0.00081  
Epoch: [4][700/1929] Elapsed 7m 0s (remain 12m 15s) Loss avg.: 0.4445 Grad: 0.2578 LR: 0.00081  
Epoch: [4][800/1929] Elapsed 7m 59s (remain 11m 15s) Loss avg.: 0.4446 Grad: 0.3440 LR: 0.00081  
Epoch: [4][900/1929] Elapsed 8m 59s (remain 10m 15s) Loss avg.: 0.4446 Grad: 0.2444 LR: 0.00081  
Epoch: [4][1000/1929] Elapsed 9

Epoch 4 - avg_train_loss: 0.4448  avg_val_loss: 0.4478  time: 1189s
Epoch 4 - Accuracy: 0.8072957126138087
Epoch 4 - Save Best Score: 0.8073 Model


Epoch: [5][0/1929] Elapsed 0m 1s (remain 62m 2s) Loss avg.: 0.4598 Grad: 0.2275 LR: 0.00069  
Epoch: [5][100/1929] Elapsed 1m 1s (remain 18m 37s) Loss avg.: 0.4386 Grad: 0.2775 LR: 0.00069  
Epoch: [5][200/1929] Elapsed 2m 1s (remain 17m 24s) Loss avg.: 0.4385 Grad: 0.2959 LR: 0.00069  
Epoch: [5][300/1929] Elapsed 3m 1s (remain 16m 20s) Loss avg.: 0.4386 Grad: 0.2849 LR: 0.00069  
Epoch: [5][400/1929] Elapsed 4m 1s (remain 15m 18s) Loss avg.: 0.4390 Grad: 0.3178 LR: 0.00069  
Epoch: [5][500/1929] Elapsed 5m 0s (remain 14m 17s) Loss avg.: 0.4388 Grad: 0.2269 LR: 0.00069  
Epoch: [5][600/1929] Elapsed 6m 0s (remain 13m 17s) Loss avg.: 0.4387 Grad: 0.2315 LR: 0.00069  
Epoch: [5][700/1929] Elapsed 7m 0s (remain 12m 16s) Loss avg.: 0.4392 Grad: 0.1966 LR: 0.00069  
Epoch: [5][800/1929] Elapsed 8m 0s (remain 11m 16s) Loss avg.: 0.4395 Grad: 0.2810 LR: 0.00069  
Epoch: [5][900/1929] Elapsed 8m 59s (remain 10m 15s) Loss avg.: 0.4395 Grad: 0.2298 LR: 0.00069  
Epoch: [5][1000/1929] Elapsed 9m

Epoch 5 - avg_train_loss: 0.4400  avg_val_loss: 0.4454  time: 1190s
Epoch 5 - Accuracy: 0.8076396712571543
Epoch 5 - Save Best Score: 0.8076 Model


Epoch: [6][0/1929] Elapsed 0m 1s (remain 61m 53s) Loss avg.: 0.4380 Grad: 0.2390 LR: 0.00055  
Epoch: [6][100/1929] Elapsed 1m 1s (remain 18m 38s) Loss avg.: 0.4342 Grad: 0.2345 LR: 0.00055  
Epoch: [6][200/1929] Elapsed 2m 1s (remain 17m 24s) Loss avg.: 0.4348 Grad: 0.2385 LR: 0.00055  
Epoch: [6][300/1929] Elapsed 3m 1s (remain 16m 19s) Loss avg.: 0.4346 Grad: 0.2202 LR: 0.00055  
Epoch: [6][400/1929] Elapsed 4m 0s (remain 15m 17s) Loss avg.: 0.4345 Grad: 0.2177 LR: 0.00055  
Epoch: [6][500/1929] Elapsed 5m 0s (remain 14m 16s) Loss avg.: 0.4347 Grad: 0.2254 LR: 0.00055  
Epoch: [6][600/1929] Elapsed 6m 0s (remain 13m 16s) Loss avg.: 0.4349 Grad: 0.2375 LR: 0.00055  
Epoch: [6][700/1929] Elapsed 7m 0s (remain 12m 16s) Loss avg.: 0.4348 Grad: 0.2294 LR: 0.00055  
Epoch: [6][800/1929] Elapsed 7m 59s (remain 11m 15s) Loss avg.: 0.4352 Grad: 0.2185 LR: 0.00055  
Epoch: [6][900/1929] Elapsed 8m 59s (remain 10m 15s) Loss avg.: 0.4354 Grad: 0.2188 LR: 0.00055  
Epoch: [6][1000/1929] Elapsed 

Epoch 6 - avg_train_loss: 0.4355  avg_val_loss: 0.4421  time: 1189s
Epoch 6 - Accuracy: 0.8101217118063805
Epoch 6 - Save Best Score: 0.8101 Model


Epoch: [7][0/1929] Elapsed 0m 2s (remain 65m 28s) Loss avg.: 0.4400 Grad: 0.2844 LR: 0.00041  
Epoch: [7][100/1929] Elapsed 1m 2s (remain 18m 42s) Loss avg.: 0.4310 Grad: 0.2361 LR: 0.00041  
Epoch: [7][200/1929] Elapsed 2m 1s (remain 17m 25s) Loss avg.: 0.4316 Grad: 0.2206 LR: 0.00041  
Epoch: [7][300/1929] Elapsed 3m 1s (remain 16m 20s) Loss avg.: 0.4311 Grad: 0.2043 LR: 0.00041  
Epoch: [7][400/1929] Elapsed 4m 0s (remain 15m 18s) Loss avg.: 0.4314 Grad: 0.2162 LR: 0.00041  
Epoch: [7][500/1929] Elapsed 5m 0s (remain 14m 17s) Loss avg.: 0.4310 Grad: 0.2140 LR: 0.00041  
Epoch: [7][600/1929] Elapsed 6m 0s (remain 13m 16s) Loss avg.: 0.4312 Grad: 0.2277 LR: 0.00041  
Epoch: [7][700/1929] Elapsed 7m 0s (remain 12m 15s) Loss avg.: 0.4309 Grad: 0.2155 LR: 0.00041  
Epoch: [7][800/1929] Elapsed 7m 59s (remain 11m 15s) Loss avg.: 0.4312 Grad: 0.2439 LR: 0.00041  
Epoch: [7][900/1929] Elapsed 8m 59s (remain 10m 15s) Loss avg.: 0.4314 Grad: 0.2089 LR: 0.00041  
Epoch: [7][1000/1929] Elapsed 

Epoch 7 - avg_train_loss: 0.4315  avg_val_loss: 0.4401  time: 1189s
Epoch 7 - Accuracy: 0.8107863098630141
Epoch 7 - Save Best Score: 0.8108 Model


Epoch: [8][0/1929] Elapsed 0m 1s (remain 61m 48s) Loss avg.: 0.4359 Grad: 0.2060 LR: 0.00029  
Epoch: [8][100/1929] Elapsed 1m 1s (remain 18m 38s) Loss avg.: 0.4293 Grad: 0.2200 LR: 0.00029  
Epoch: [8][200/1929] Elapsed 2m 1s (remain 17m 26s) Loss avg.: 0.4286 Grad: 0.2302 LR: 0.00029  
Epoch: [8][300/1929] Elapsed 3m 1s (remain 16m 21s) Loss avg.: 0.4281 Grad: 0.2375 LR: 0.00029  
Epoch: [8][400/1929] Elapsed 4m 1s (remain 15m 19s) Loss avg.: 0.4279 Grad: 0.2304 LR: 0.00029  
Epoch: [8][500/1929] Elapsed 5m 1s (remain 14m 17s) Loss avg.: 0.4282 Grad: 0.2218 LR: 0.00029  
Epoch: [8][600/1929] Elapsed 6m 0s (remain 13m 17s) Loss avg.: 0.4280 Grad: 0.3283 LR: 0.00029  
Epoch: [8][700/1929] Elapsed 7m 0s (remain 12m 16s) Loss avg.: 0.4277 Grad: 0.2503 LR: 0.00029  
Epoch: [8][800/1929] Elapsed 8m 0s (remain 11m 16s) Loss avg.: 0.4276 Grad: 0.2435 LR: 0.00029  
Epoch: [8][900/1929] Elapsed 8m 59s (remain 10m 15s) Loss avg.: 0.4277 Grad: 0.2106 LR: 0.00029  
Epoch: [8][1000/1929] Elapsed 9

Epoch 8 - avg_train_loss: 0.4277  avg_val_loss: 0.4394  time: 1190s
Epoch 8 - Accuracy: 0.8112264603303461
Epoch 8 - Save Best Score: 0.8112 Model


Epoch: [9][0/1929] Elapsed 0m 1s (remain 61m 49s) Loss avg.: 0.4266 Grad: 0.2636 LR: 0.00019  
Epoch: [9][100/1929] Elapsed 1m 1s (remain 18m 38s) Loss avg.: 0.4226 Grad: 0.2008 LR: 0.00019  
Epoch: [9][200/1929] Elapsed 2m 1s (remain 17m 25s) Loss avg.: 0.4230 Grad: 0.2196 LR: 0.00019  
Epoch: [9][300/1929] Elapsed 3m 1s (remain 16m 22s) Loss avg.: 0.4228 Grad: 0.2003 LR: 0.00019  
Epoch: [9][400/1929] Elapsed 4m 1s (remain 15m 19s) Loss avg.: 0.4231 Grad: 0.2059 LR: 0.00019  
Epoch: [9][500/1929] Elapsed 5m 1s (remain 14m 18s) Loss avg.: 0.4230 Grad: 0.2790 LR: 0.00019  
Epoch: [9][600/1929] Elapsed 6m 0s (remain 13m 17s) Loss avg.: 0.4232 Grad: 0.2675 LR: 0.00019  
Epoch: [9][700/1929] Elapsed 7m 0s (remain 12m 16s) Loss avg.: 0.4235 Grad: 0.2529 LR: 0.00019  
Epoch: [9][800/1929] Elapsed 8m 0s (remain 11m 16s) Loss avg.: 0.4236 Grad: 0.2103 LR: 0.00019  
Epoch: [9][900/1929] Elapsed 9m 0s (remain 10m 16s) Loss avg.: 0.4237 Grad: 0.2048 LR: 0.00019  
Epoch: [9][1000/1929] Elapsed 9m

Epoch 9 - avg_train_loss: 0.4242  avg_val_loss: 0.4380  time: 1191s
Epoch 9 - Accuracy: 0.8116928449314925
Epoch 9 - Save Best Score: 0.8117 Model


Epoch: [10][0/1929] Elapsed 0m 1s (remain 60m 17s) Loss avg.: 0.4178 Grad: 0.2188 LR: 0.00012  
Epoch: [10][100/1929] Elapsed 1m 1s (remain 18m 37s) Loss avg.: 0.4195 Grad: 0.2329 LR: 0.00012  
Epoch: [10][200/1929] Elapsed 2m 1s (remain 17m 24s) Loss avg.: 0.4206 Grad: 0.2480 LR: 0.00012  
Epoch: [10][300/1929] Elapsed 3m 1s (remain 16m 20s) Loss avg.: 0.4206 Grad: 0.2623 LR: 0.00012  
Epoch: [10][400/1929] Elapsed 4m 1s (remain 15m 18s) Loss avg.: 0.4207 Grad: 0.2363 LR: 0.00012  
Epoch: [10][500/1929] Elapsed 5m 1s (remain 14m 18s) Loss avg.: 0.4208 Grad: 0.2788 LR: 0.00012  
Epoch: [10][600/1929] Elapsed 6m 0s (remain 13m 17s) Loss avg.: 0.4207 Grad: 0.2270 LR: 0.00012  
Epoch: [10][700/1929] Elapsed 7m 0s (remain 12m 16s) Loss avg.: 0.4207 Grad: 0.2362 LR: 0.00012  
Epoch: [10][800/1929] Elapsed 8m 0s (remain 11m 16s) Loss avg.: 0.4209 Grad: 0.2117 LR: 0.00012  
Epoch: [10][900/1929] Elapsed 8m 59s (remain 10m 15s) Loss avg.: 0.4209 Grad: 0.2276 LR: 0.00012  
Epoch: [10][1000/1929

Epoch 10 - avg_train_loss: 0.4216  avg_val_loss: 0.4377  time: 1190s
Epoch 10 - Accuracy: 0.811914377617037
Epoch 10 - Save Best Score: 0.8119 Model
Epoch 10 - Save final model
Score: 0.81191
Score: 0.81191
