# About this notebook ...

## Library

In [1]:
import glob
import json
import math
import os
import random
import time
import warnings
from collections import defaultdict
from contextlib import contextmanager

import numpy as np
import optuna
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedKFold
from torch.optim import SGD, Adam
from torch.optim.lr_scheduler import CosineAnnealingLR, CosineAnnealingWarmRestarts, ReduceLROnPlateau
from torch.utils.data import DataLoader, Dataset
from tqdm.notebook import tqdm

In [2]:
warnings.filterwarnings("ignore")

## Config

In [3]:
class Config:
    seed = 440

    n_class = 4
    n_fold = 10

    gradient_accumulation_steps = 3
    max_grad_norm = 1000

    num_workers = 4
    batch_size = 1000

    scheduler = "CosineAnnealingWarmRestarts"
    # factor = 0.2  # ReduceLROnPlateau
    # patience = 4  # ReduceLROnPlateau
    # eps = 1e-6  # ReduceLROnPlateau
    # T_max = 10  # CosineAnnealingLR
    T_0 = 10  # CosineAnnealingWarmRestarts

    criterion = "CrossEntropyLoss"
    lr = 1e-3
    min_lr = 1e-4
    weight_decay = 1e-5

    epochs = 20
    model_name = "geese_net"
    pre_train_file = ""

    print_freq = 100

    train = True
    tuning = False
    debug = False
    apex = False

In [4]:
if Config.tuning:
    Config.epochs = 2

if Config.debug:
    Config.epochs = 1

In [5]:
if Config.apex:
    from apex import amp

In [6]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

## Load Data

In [7]:
BASE_DIR = "../input/hungrygeeseepisode/hungry-geese-episode/"
OUTPUT_DIR = "pre-models/"

if not os.path.exists(OUTPUT_DIR):
    os.makedirs(OUTPUT_DIR)

In [8]:
paths = [path for path in glob.glob(BASE_DIR + "*.json") if "info" not in path]
print(len(paths))

19976


In [9]:
# fit for memory size...
paths = paths[:-10000]  # first stage
# paths = paths[-10000:]  # second stage
print(len(paths))

9976


In [10]:
if Config.debug:
    paths = paths[:10]

## Utils

In [11]:
@contextmanager
def timer(name):
    t0 = time.time()
    LOGGER.info(f"[{name}] start")
    yield
    LOGGER.info(f"[{name}] done in {time.time() - t0:.0f} s.")


def init_logger(log_file=OUTPUT_DIR + "train.log"):
    from logging import INFO, FileHandler, Formatter, StreamHandler, getLogger

    logger = getLogger(__name__)
    logger.setLevel(INFO)
    handler1 = StreamHandler()
    handler1.setFormatter(Formatter("%(message)s"))
    handler2 = FileHandler(filename=log_file)
    handler2.setFormatter(Formatter("%(message)s"))
    logger.addHandler(handler1)
    logger.addHandler(handler2)
    return logger


LOGGER = init_logger()


def seed_torch(seed=42):
    random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True


seed_torch(seed=Config.seed)

In [12]:
def reverse_ns(y):
    if y == 0:
        return 1
    if y == 1:
        return 0
    return y


def reverse_we(y):
    if y == 2:
        return 3
    if y == 3:
        return 2
    return y


def reverse_nswe(y):
    return reverse_ns(reverse_we(y))

## Observation

In [13]:
def make_sentence(obses):
    b = np.zeros((7 * 11), dtype=np.uint8)
    obs = obses[-1]

    for p, pos_list in enumerate(obs["geese"]):
        pid = (p - obs["index"]) % 4

        # whole position
        for pos in pos_list[1:-1]:
            b[pos] = 5 + pid
        # tip position
        for pos in pos_list[-1:]:
            b[pos] = 9 + pid
        # head position
        for pos in pos_list[:1]:
            b[pos] = 1 + pid

    # previous head position
    if len(obses) > 1:
        obs_prev = obses[-2]
        for p, pos_list in enumerate(obs_prev["geese"]):
            pid = (p - obs["index"]) % 4
            if pid == 0:
                for pos in pos_list[:1]:
                    b[pos] = 13

    # food
    for pos in obs["food"]:
        b[pos] = 14

    return b.reshape(1, 7, 11)

## Data

In [14]:
def create_dataset_from_json(filepath, json_object=None, standing=0):
    if json_object is None:
        json_open = open(path, "r")
        json_load = json.load(json_open)
    else:
        json_load = json_object

    try:
        winner_index = np.argmax(np.argsort(json_load["rewards"]) == 3 - standing)

        obses = []
        X = []
        y = []
        actions = {"NORTH": 0, "SOUTH": 1, "WEST": 2, "EAST": 3}

        for i in range(len(json_load["steps"]) - 1):
            if json_load["steps"][i][winner_index]["status"] == "ACTIVE":
                y_ = json_load["steps"][i + 1][winner_index]["action"]
                if y_ is not None:
                    step = json_load["steps"][i]
                    step[winner_index]["observation"]["geese"] = step[0]["observation"]["geese"]
                    step[winner_index]["observation"]["food"] = step[0]["observation"]["food"]
                    step[winner_index]["observation"]["step"] = step[0]["observation"]["step"]
                    obses.append(step[winner_index]["observation"])
                    y.append(actions[y_])

                    y.append(reverse_ns(actions[y_]))  # 上下反転
                    y.append(reverse_we(actions[y_]))  # 左右反転
                    y.append(reverse_nswe(actions[y_]))  # 上下左右反転

        for j in range(len(obses)):
            # X_ = make_input(obses[: j + 1])

            X_ = []
            X_.append(make_sentence(obses[: j + 1]))
            X_ = np.concatenate(X_)

            X.append(X_)

            X.append(X_[:, ::-1, :])  # 上下反転
            X.append(X_[:, :, ::-1])  # 左右反転
            X.append(X_[:, ::-1, ::-1])  # 上下左右反転

        X = np.array(X, dtype=np.uint8)  # [starting_step:]
        y = np.array(y, dtype=np.uint8)  # [starting_step:]

        return X, y
    except Exception as e:
        if Config.debug:
            raise Exception from e
        else:
            return 0, 0

In [15]:
X_train = []
y_train = []

for path in tqdm(paths[: int(len(paths))]):
    X, y = create_dataset_from_json(path, standing=0)  # use only winners' moves
    if X is not 0:
        X_train.append(X)
        y_train.append(y)

X_train = np.concatenate(X_train)
y_train = np.concatenate(y_train)

print(f"Num episode: {len(X_train)}")

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=9976.0), HTML(value='')))


Num episode: 6217420


In [16]:
unique_ = False

In [17]:
# %%time

# X_train, unique_index = np.unique(X_train, axis=0, return_index=True)  # remove duplicate
# y_train = y_train[unique_index]

# y_train = np.eye(4, dtype="uint8")[y_train]  # to categorical

# print(f"Num episode: {len(X_train)}")

In [18]:
if unique_:
    X_train_sum_obs = X_train.reshape(X_train.shape[0], -1).sum(1)
    X_train_group = np.unique(X_train_sum_obs)
    X_train_group.shape

In [19]:
if unique_:
    X_train_unique = []
    y_train_unique = []
    for group in tqdm(X_train_group):
        group_index = np.where(X_train_sum_obs == group)

        X_train_ = X_train[group_index]
        y_train_ = y_train[group_index]

        X_train_, unique_index = np.unique(X_train_, axis=0, return_index=True)  # remove duplicate
        y_train_ = y_train_[unique_index]

        X_train_unique.append(X_train_)
        y_train_unique.append(y_train_)

    X_train = np.concatenate(X_train_unique)
    y_train = np.concatenate(y_train_unique)

    print(f"Num episode: {len(X_train)}")

In [20]:
if unique_:
    del X_train_sum_obs
    del X_train_group
    del X_train_unique
    del y_train_unique
    del X_train_
    del y_train_
    del group_index
    del unique_index

In [21]:
X_train = X_train.astype(np.long)
X_train.dtype

dtype('int64')

In [22]:
if Config.debug:
    X_train = X_train[:1000]
    y_train = y_train[:1000]

In [23]:
y_df = pd.DataFrame(y_train, dtype=np.uint8)
y_df.columns = ["action"]
y_df

Unnamed: 0,action
0,3
1,3
2,2
3,2
4,3
...,...
6217415,1
6217416,2
6217417,2
6217418,3


## CV Split

In [24]:
folds = y_df.copy()
Fold = StratifiedKFold(n_splits=Config.n_fold, shuffle=True, random_state=Config.seed)
for n, (train_index, val_index) in enumerate(Fold.split(folds, folds["action"])):
    folds.loc[val_index, "fold"] = int(n)
folds["fold"] = folds["fold"].astype(np.uint8)
print(folds.groupby(["fold", "action"]).size())

fold  action
0     0         148646
      1         148647
      2         162224
      3         162225
1     0         148646
      1         148647
      2         162224
      3         162225
2     0         148646
      1         148647
      2         162224
      3         162225
3     0         148646
      1         148647
      2         162224
      3         162225
4     0         148647
      1         148646
      2         162225
      3         162224
5     0         148647
      1         148646
      2         162225
      3         162224
6     0         148647
      1         148646
      2         162225
      3         162224
7     0         148647
      1         148646
      2         162225
      3         162224
8     0         148647
      1         148647
      2         162224
      3         162224
9     0         148647
      1         148647
      2         162224
      3         162224
dtype: int64


## Dataset

In [25]:
class TrainDataset(Dataset):
    def __init__(self, array, label):
        self.array = array
        self.label = label

    def __len__(self):
        return self.array.shape[0]

    def __getitem__(self, idx):
        return self.array[idx], torch.tensor(self.label[idx]).long()


class TestDataset(Dataset):
    def __init__(self, array):
        self.array = array

    def __len__(self):
        return self.array.shape[0]

    def __getitem__(self, idx):
        return self.array[idx]

In [26]:
# Test

if Config.debug or False:
    train_ds = TrainDataset(X_train, y_train)

    for i in range(1):
        obs, action = train_ds[i]
        print(obs.shape, action)

## Model

In [27]:
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, dropout=0.1, max_len=5000):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)

        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0).transpose(0, 1)
        self.register_buffer("pe", pe)

    def forward(self, x):
        x = x + self.pe[: x.size(0), :]
        return self.dropout(x)

In [28]:
class GeeseNetImoKuri(nn.Module):
    class GeeseNetEncoder(nn.Module):
        def __init__(self, dim):
            super().__init__()
            self.dim = dim
            self.embed = nn.Embedding(16, dim)
            self.pe = PositionalEncoding(dim, 0)

        def forward(self, x):
            x = self.embed(x) * math.sqrt(self.dim)
            x = self.pe(x)
            return x

    class GeeseNetBlock(nn.Module):
        def __init__(self, dim, n_heads):
            super().__init__()
            self.attention = nn.MultiheadAttention(dim, n_heads)

        def forward(self, x):
            h, _ = self.attention(x, x, x)
            return h

    class GeeseNetHead(nn.Module):
        def __init__(self, dim):
            super().__init__()

            self.head_p1 = nn.Linear(dim, dim // 2, bias=False)
            self.head_p2 = nn.Linear(dim // 2, 4, bias=False)
            self.head_v1 = nn.Linear(dim + 77, dim, bias=False)
            self.head_v2 = nn.Linear(dim, 1, bias=False)

        def forward(self, x, h):
            zeros = torch.zeros(x.size(), dtype=torch.long, device=x.device)
            head = torch.where(x == 1, x, zeros).view(x.size(0), x.size(1), 1)

            h_head = (h * head).sum(1)
            h_avg = h.mean(-1)

            h_p = F.relu_(self.head_p1(torch.cat([h_head], 1)))
            p = self.head_p2(h_p)

            h_v = F.relu_(self.head_v1(torch.cat([h_head, h_avg], 1)))
            v = torch.tanh(self.head_v2(h_v))

            return p, v

    def __init__(self):
        super().__init__()
        dim = 64
        blocks = 5

        self.encoder = self.GeeseNetEncoder(dim)
        self.blocks = nn.ModuleList([self.GeeseNetBlock(dim, 4) for _ in range(blocks)])
        self.head = self.GeeseNetHead(dim)

    def forward(self, x, _=None):
        x = x.view(x.size(0), -1)

        h = self.encoder(x)

        for block in self.blocks:
            h = block(h)

        p, v = self.head(x, h)

        return {"policy": p, "value": v}

In [29]:
# Test

if Config.debug or False:
    model = GeeseNetImoKuri()
    # print(model)

    params = sum(p.numel() for p in model.parameters())
    print(f"params: {params:,}")

    train_ds = TrainDataset(X_train, y_train)
    train_loader = DataLoader(train_ds, batch_size=4, shuffle=True, num_workers=4, pin_memory=True, drop_last=True)

    for obs, action in train_loader:
        output = model(obs)
        print(output)
        print(f"{torch.argmax(output['policy'], dim=1)}")
        break

## Loss

## Scoring

In [30]:
def get_score(y_true, y_pred):
    return accuracy_score(y_true, y_pred)

In [31]:
def get_result(result_df):
    preds = result_df["preds"].values
    labels = result_df["action"].values
    score = get_score(labels, preds)
    LOGGER.info(f"Score: {score:<.5f}")
    return score

## Helper functions

In [32]:
class AverageMeter(object):
    """Computes and stores the average and current value"""

    def __init__(self):
        self.reset()

    def reset(self):
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count


def asMinutes(s):
    m = math.floor(s / 60)
    s -= m * 60
    return "%dm %ds" % (m, s)


def timeSince(since, percent):
    now = time.time()
    s = now - since
    es = s / (percent)
    rs = es - s
    return "%s (remain %s)" % (asMinutes(s), asMinutes(rs))

In [33]:
def train_fn(train_loader, model, criterion, optimizer, epoch, scheduler, device):
    losses = AverageMeter()

    # switch to train mode
    model.train()
    start = time.time()

    for step, (obs, action) in enumerate(train_loader):
        obs = obs.to(device)
        action = action.to(device)
        batch_size = action.size(0)

        y_preds = model(obs)["policy"]

        loss = criterion(y_preds, action)
        losses.update(loss.item(), batch_size)
        if Config.gradient_accumulation_steps > 1:
            loss = loss / Config.gradient_accumulation_steps
        if Config.apex:
            with amp.scale_loss(loss, optimizer) as scaled_loss:
                scaled_loss.backward()
        else:
            loss.backward()

        grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), Config.max_grad_norm)

        if (step + 1) % Config.gradient_accumulation_steps == 0:
            optimizer.step()
            optimizer.zero_grad()

        if step % Config.print_freq == 0 or step == (len(train_loader) - 1):
            print(
                f"Epoch: [{epoch + 1}][{step}/{len(train_loader)}] "
                f"Elapsed {timeSince(start, float(step + 1) / len(train_loader)):s} "
                f"Loss avg.: {losses.avg:.4f} "
                f"Grad: {grad_norm:.4f} "
                f"LR: {scheduler.get_last_lr()[0]:.5f}  "
            )

    return losses.avg

In [34]:
def valid_fn(valid_loader, model, criterion, device):
    losses = AverageMeter()

    # switch to evaluation mode
    model.eval()
    preds = []
    start = time.time()

    for step, (obs, action) in enumerate(valid_loader):
        obs = obs.to(device)
        action = action.to(device)
        batch_size = action.size(0)

        # compute loss
        with torch.no_grad():
            y_preds = model(obs)["policy"]

        loss = criterion(y_preds, action)
        losses.update(loss.item(), batch_size)

        # record accuracy
        preds.append(y_preds.softmax(1).to("cpu").numpy())
        if Config.gradient_accumulation_steps > 1:
            loss = loss / Config.gradient_accumulation_steps

        if step % Config.print_freq == 0 or step == (len(valid_loader) - 1):
            print(
                f"Eval: [{step}/{len(valid_loader)}] "
                f"Elapsed {timeSince(start, float(step + 1) / len(valid_loader)):s} "
                f"Loss avg.: {losses.avg:.4f} "
            )
    predictions = np.concatenate(preds)
    return losses.avg, predictions

## Train loop

In [35]:
def train_loop(folds, fold):

    LOGGER.info(f"========== fold: {fold} training ==========")

    # ====================================================
    # Data Loader
    # ====================================================
    # X_train_folds = X_train[folds["fold"] != fold]
    # X_valid_folds = X_train[folds["fold"] == fold]

    # y_train_folds = y_train[folds["fold"] != fold]
    y_valid_folds = y_train[folds["fold"] == fold]

    # y_df_train_folds = y_df[folds["fold"] != fold]
    y_df_valid_folds = y_df[folds["fold"] == fold]

    # train_dataset = TrainDataset(X_train[folds["fold"] != fold], y_train[folds["fold"] != fold])
    # valid_dataset = TrainDataset(X_train[folds["fold"] == fold], y_valid_folds)

    train_loader = DataLoader(
        TrainDataset(X_train[folds["fold"] != fold], y_train[folds["fold"] != fold]),
        batch_size=Config.batch_size,
        shuffle=True,
        num_workers=Config.num_workers,
        pin_memory=True,
        drop_last=True,
    )
    valid_loader = DataLoader(
        TrainDataset(X_train[folds["fold"] == fold], y_valid_folds),
        batch_size=Config.batch_size,
        shuffle=False,
        num_workers=Config.num_workers,
        pin_memory=True,
        drop_last=False,
    )

    # ====================================================
    # Scheduler
    # ====================================================
    def get_scheduler(optimizer):
        if Config.scheduler == "ReduceLROnPlateau":
            scheduler = ReduceLROnPlateau(
                optimizer, mode="min", factor=Config.factor, patience=Config.patience, verbose=True, eps=Config.eps
            )
        elif Config.scheduler == "CosineAnnealingLR":
            scheduler = CosineAnnealingLR(optimizer, T_max=Config.T_max, eta_min=Config.min_lr, last_epoch=-1)
        elif Config.scheduler == "CosineAnnealingWarmRestarts":
            scheduler = CosineAnnealingWarmRestarts(
                optimizer, T_0=Config.T_0, T_mult=1, eta_min=Config.min_lr, last_epoch=-1
            )
        return scheduler

    # ====================================================
    # model & optimizer
    # ====================================================
    model = GeeseNetImoKuri()
    try:
        model.load_state_dict(torch.load(os.path.join(OUTPUT_DIR, Config.pre_train_file)))
    except:
        print(f"Skip to load pre-train weight.")

    # Disable training for value network
    for param in model.head.head_v1.parameters():
        param.requires_grad = False
    for param in model.head.head_v2.parameters():
        param.requires_grad = False

    model.to(device)

    # Use multi GPU
    if device == torch.device("cuda") and not Config.apex:
        model = torch.nn.DataParallel(model)  # make parallel

    optimizer = Adam(model.parameters(), lr=Config.lr, weight_decay=Config.weight_decay, amsgrad=False)
    scheduler = get_scheduler(optimizer)

    # ====================================================
    # apex
    # ====================================================
    if Config.apex:
        model, optimizer = amp.initialize(model, optimizer, opt_level="O1", verbosity=0)

    # ====================================================
    # Criterion
    # ====================================================
    def get_criterion():
        if Config.criterion == "CrossEntropyLoss":
            criterion = nn.CrossEntropyLoss()
        return criterion

    criterion = get_criterion()

    # ====================================================
    # loop
    # ====================================================
    best_score = 0.0
    best_loss = np.inf
    best_preds = None

    for epoch in range(Config.epochs):

        start_time = time.time()

        # train
        avg_loss = train_fn(train_loader, model, criterion, optimizer, epoch, scheduler, device)

        # eval
        avg_val_loss, preds = valid_fn(valid_loader, model, criterion, device)

        if isinstance(scheduler, ReduceLROnPlateau):
            scheduler.step(avg_val_loss)
        elif isinstance(scheduler, CosineAnnealingLR):
            scheduler.step()
        elif isinstance(scheduler, CosineAnnealingWarmRestarts):
            scheduler.step()

        # scoring
        score = get_score(y_valid_folds, preds.argmax(1))

        elapsed = time.time() - start_time

        LOGGER.info(
            f"Epoch {epoch+1} - avg_train_loss: {avg_loss:.4f}  avg_val_loss: {avg_val_loss:.4f}  time: {elapsed:.0f}s"
        )
        LOGGER.info(f"Epoch {epoch+1} - Accuracy: {score}")

        if score > best_score:
            best_score = score
            LOGGER.info(f"Epoch {epoch+1} - Save Best Score: {best_score:.4f} Model")
            torch.save(model.module.state_dict(), OUTPUT_DIR + f"{Config.model_name}_fold{fold}_best.pth")
            best_preds = preds

        if epoch == Config.epochs - 1:
            LOGGER.info(f"Epoch {epoch+1} - Save final model")
            torch.save(model.module.state_dict(), OUTPUT_DIR + f"{Config.model_name}_fold{fold}_final.pth")

    if Config.train:
        y_df_valid_folds[[str(c) for c in range(Config.n_class)]] = best_preds
        y_df_valid_folds["preds"] = best_preds.argmax(1)

        return y_df_valid_folds

    if Config.tuning:
        score = get_score(y_df_valid_folds["action"].values, best_preds.argmax(1))
        return score

In [36]:
def objective(trial):
    Config.geese_net_layers = trial.suggest_int("layers", 6, 18)
    Config.geese_net_filters = trial.suggest_int("filters", 32, 128)

    score = train_loop(folds, 0)
    return score

## Main


In [37]:
def main():
    if Config.train:
        # train
        oof_df = pd.DataFrame()
        for fold in range(Config.n_fold):
            _oof_df = train_loop(folds, fold)
            oof_df = pd.concat([oof_df, _oof_df])
            LOGGER.info(f"========== fold: {fold} result ==========")
            get_result(_oof_df)
            break  # fold 1つだけ
        # CV result
        # LOGGER.info(f"========== CV ==========")
        # get_result(oof_df)
        # save result
        oof_df.to_csv(OUTPUT_DIR + "oof_df.csv", index=False)

    if Config.tuning:
        study = optuna.create_study(direction="maximize")
        study.optimize(objective, n_trials=10)

        trial = study.best_trial
        print("Best trial:")
        print("  Value: ", trial.value)
        print("  Params: ")
        for key, value in trial.params.items():
            print("    {}: {}".format(key, value))

In [38]:
if __name__ == "__main__":
    main()



Skip to load pre-train weight.
Epoch: [1][0/5595] Elapsed 0m 2s (remain 200m 27s) Loss avg.: 1.3863 Grad: 0.0066 LR: 0.00100  
Epoch: [1][100/5595] Elapsed 0m 35s (remain 32m 18s) Loss avg.: 1.3858 Grad: 0.0175 LR: 0.00100  
Epoch: [1][200/5595] Elapsed 1m 9s (remain 30m 59s) Loss avg.: 1.3856 Grad: 0.0163 LR: 0.00100  
Epoch: [1][300/5595] Elapsed 1m 42s (remain 30m 9s) Loss avg.: 1.3856 Grad: 0.0046 LR: 0.00100  
Epoch: [1][400/5595] Elapsed 2m 16s (remain 29m 28s) Loss avg.: 1.3855 Grad: 0.0054 LR: 0.00100  
Epoch: [1][500/5595] Elapsed 2m 50s (remain 28m 51s) Loss avg.: 1.3855 Grad: 0.0082 LR: 0.00100  
Epoch: [1][600/5595] Elapsed 3m 23s (remain 28m 14s) Loss avg.: 1.3855 Grad: 0.0015 LR: 0.00100  
Epoch: [1][700/5595] Elapsed 3m 57s (remain 27m 38s) Loss avg.: 1.3855 Grad: 0.0021 LR: 0.00100  
Epoch: [1][800/5595] Elapsed 4m 31s (remain 27m 2s) Loss avg.: 1.3855 Grad: 0.0024 LR: 0.00100  
Epoch: [1][900/5595] Elapsed 5m 4s (remain 26m 27s) Loss avg.: 1.3854 Grad: 0.0032 LR: 0.001

Epoch 1 - avg_train_loss: 1.3854  avg_val_loss: 1.3854  time: 1941s
Epoch 1 - Accuracy: 0.26092012442460055
Epoch 1 - Save Best Score: 0.2609 Model


Eval: [621/622] Elapsed 1m 15s (remain 0m 0s) Loss avg.: 1.3854 
Epoch: [2][0/5595] Elapsed 0m 1s (remain 94m 14s) Loss avg.: 1.3858 Grad: 0.0019 LR: 0.00098  
Epoch: [2][100/5595] Elapsed 0m 34s (remain 30m 59s) Loss avg.: 1.3856 Grad: 0.0014 LR: 0.00098  
Epoch: [2][200/5595] Elapsed 1m 7s (remain 30m 6s) Loss avg.: 1.3854 Grad: 0.0023 LR: 0.00098  
Epoch: [2][300/5595] Elapsed 1m 40s (remain 29m 27s) Loss avg.: 1.3854 Grad: 0.0021 LR: 0.00098  
Epoch: [2][400/5595] Elapsed 2m 13s (remain 28m 52s) Loss avg.: 1.3853 Grad: 0.0019 LR: 0.00098  
Epoch: [2][500/5595] Elapsed 2m 47s (remain 28m 18s) Loss avg.: 1.3853 Grad: 0.0046 LR: 0.00098  
Epoch: [2][600/5595] Elapsed 3m 20s (remain 27m 44s) Loss avg.: 1.3853 Grad: 0.0034 LR: 0.00098  
Epoch: [2][700/5595] Elapsed 3m 53s (remain 27m 9s) Loss avg.: 1.3853 Grad: 0.0026 LR: 0.00098  
Epoch: [2][800/5595] Elapsed 4m 26s (remain 26m 36s) Loss avg.: 1.3853 Grad: 0.0058 LR: 0.00098  
Epoch: [2][900/5595] Elapsed 4m 59s (remain 26m 2s) Loss av

Epoch 2 - avg_train_loss: 1.3854  avg_val_loss: 1.3854  time: 1936s
Epoch 2 - Accuracy: 0.26092012442460055


Eval: [621/622] Elapsed 1m 14s (remain 0m 0s) Loss avg.: 1.3854 
Epoch: [3][0/5595] Elapsed 0m 1s (remain 94m 6s) Loss avg.: 1.3858 Grad: 0.0019 LR: 0.00091  
Epoch: [3][100/5595] Elapsed 0m 34s (remain 30m 59s) Loss avg.: 1.3852 Grad: 0.0032 LR: 0.00091  
Epoch: [3][200/5595] Elapsed 1m 7s (remain 30m 5s) Loss avg.: 1.3852 Grad: 0.0045 LR: 0.00091  
Epoch: [3][300/5595] Elapsed 1m 40s (remain 29m 25s) Loss avg.: 1.3853 Grad: 0.0043 LR: 0.00091  
Epoch: [3][400/5595] Elapsed 2m 13s (remain 28m 49s) Loss avg.: 1.3853 Grad: 0.0013 LR: 0.00091  
Epoch: [3][500/5595] Elapsed 2m 46s (remain 28m 16s) Loss avg.: 1.3853 Grad: 0.0028 LR: 0.00091  
Epoch: [3][600/5595] Elapsed 3m 20s (remain 27m 42s) Loss avg.: 1.3853 Grad: 0.0031 LR: 0.00091  
Epoch: [3][700/5595] Elapsed 3m 53s (remain 27m 8s) Loss avg.: 1.3853 Grad: 0.0055 LR: 0.00091  
Epoch: [3][800/5595] Elapsed 4m 26s (remain 26m 34s) Loss avg.: 1.3854 Grad: 0.0047 LR: 0.00091  
Epoch: [3][900/5595] Elapsed 4m 59s (remain 26m 1s) Loss avg

Epoch 3 - avg_train_loss: 1.3854  avg_val_loss: 1.3854  time: 1936s
Epoch 3 - Accuracy: 0.26092012442460055


Eval: [621/622] Elapsed 1m 14s (remain 0m 0s) Loss avg.: 1.3854 
Epoch: [4][0/5595] Elapsed 0m 1s (remain 93m 55s) Loss avg.: 1.3869 Grad: 0.0025 LR: 0.00081  
Epoch: [4][100/5595] Elapsed 0m 34s (remain 30m 57s) Loss avg.: 1.3856 Grad: 0.0027 LR: 0.00081  
Epoch: [4][200/5595] Elapsed 1m 7s (remain 30m 7s) Loss avg.: 1.3855 Grad: 0.0054 LR: 0.00081  
Epoch: [4][300/5595] Elapsed 1m 40s (remain 29m 29s) Loss avg.: 1.3855 Grad: 0.0015 LR: 0.00081  
Epoch: [4][400/5595] Elapsed 2m 13s (remain 28m 54s) Loss avg.: 1.3854 Grad: 0.0068 LR: 0.00081  
Epoch: [4][500/5595] Elapsed 2m 47s (remain 28m 20s) Loss avg.: 1.3854 Grad: 0.0049 LR: 0.00081  
Epoch: [4][600/5595] Elapsed 3m 20s (remain 27m 45s) Loss avg.: 1.3854 Grad: 0.0012 LR: 0.00081  
Epoch: [4][700/5595] Elapsed 3m 53s (remain 27m 11s) Loss avg.: 1.3854 Grad: 0.0029 LR: 0.00081  
Epoch: [4][800/5595] Elapsed 4m 26s (remain 26m 37s) Loss avg.: 1.3854 Grad: 0.0015 LR: 0.00081  
Epoch: [4][900/5595] Elapsed 5m 0s (remain 26m 4s) Loss av

Epoch 4 - avg_train_loss: 1.3854  avg_val_loss: 1.3854  time: 1937s
Epoch 4 - Accuracy: 0.2609185160404155


Eval: [621/622] Elapsed 1m 14s (remain 0m 0s) Loss avg.: 1.3854 
Epoch: [5][0/5595] Elapsed 0m 0s (remain 92m 16s) Loss avg.: 1.3873 Grad: 0.0033 LR: 0.00069  
Epoch: [5][100/5595] Elapsed 0m 34s (remain 30m 53s) Loss avg.: 1.3853 Grad: 0.0039 LR: 0.00069  
Epoch: [5][200/5595] Elapsed 1m 7s (remain 30m 7s) Loss avg.: 1.3851 Grad: 0.0072 LR: 0.00069  
Epoch: [5][300/5595] Elapsed 1m 40s (remain 29m 27s) Loss avg.: 1.3852 Grad: 0.0018 LR: 0.00069  
Epoch: [5][400/5595] Elapsed 2m 13s (remain 28m 50s) Loss avg.: 1.3852 Grad: 0.0055 LR: 0.00069  
Epoch: [5][500/5595] Elapsed 2m 46s (remain 28m 14s) Loss avg.: 1.3852 Grad: 0.0009 LR: 0.00069  
Epoch: [5][600/5595] Elapsed 3m 19s (remain 27m 40s) Loss avg.: 1.3852 Grad: 0.0014 LR: 0.00069  
Epoch: [5][700/5595] Elapsed 3m 53s (remain 27m 7s) Loss avg.: 1.3852 Grad: 0.0016 LR: 0.00069  
Epoch: [5][800/5595] Elapsed 4m 26s (remain 26m 34s) Loss avg.: 1.3853 Grad: 0.0047 LR: 0.00069  
Epoch: [5][900/5595] Elapsed 4m 59s (remain 26m 1s) Loss av

Epoch 5 - avg_train_loss: 1.3853  avg_val_loss: 1.3854  time: 1937s
Epoch 5 - Accuracy: 0.26092012442460055


Eval: [621/622] Elapsed 1m 14s (remain 0m 0s) Loss avg.: 1.3854 
Epoch: [6][0/5595] Elapsed 0m 0s (remain 92m 46s) Loss avg.: 1.3846 Grad: 0.0009 LR: 0.00055  
Epoch: [6][100/5595] Elapsed 0m 34s (remain 31m 3s) Loss avg.: 1.3851 Grad: 0.0036 LR: 0.00055  
Epoch: [6][200/5595] Elapsed 1m 7s (remain 30m 6s) Loss avg.: 1.3853 Grad: 0.0026 LR: 0.00055  
Epoch: [6][300/5595] Elapsed 1m 40s (remain 29m 28s) Loss avg.: 1.3853 Grad: 0.0008 LR: 0.00055  
Epoch: [6][400/5595] Elapsed 2m 13s (remain 28m 53s) Loss avg.: 1.3853 Grad: 0.0022 LR: 0.00055  
Epoch: [6][500/5595] Elapsed 2m 47s (remain 28m 19s) Loss avg.: 1.3853 Grad: 0.0016 LR: 0.00055  
Epoch: [6][600/5595] Elapsed 3m 20s (remain 27m 44s) Loss avg.: 1.3853 Grad: 0.0026 LR: 0.00055  
Epoch: [6][700/5595] Elapsed 3m 53s (remain 27m 10s) Loss avg.: 1.3853 Grad: 0.0021 LR: 0.00055  
Epoch: [6][800/5595] Elapsed 4m 26s (remain 26m 37s) Loss avg.: 1.3853 Grad: 0.0021 LR: 0.00055  
Epoch: [6][900/5595] Elapsed 5m 0s (remain 26m 3s) Loss avg

Epoch 6 - avg_train_loss: 1.3853  avg_val_loss: 1.3853  time: 1938s
Epoch 6 - Accuracy: 0.2609185160404155


Eval: [621/622] Elapsed 1m 14s (remain 0m 0s) Loss avg.: 1.3853 
Epoch: [7][0/5595] Elapsed 0m 0s (remain 92m 14s) Loss avg.: 1.3859 Grad: 0.0013 LR: 0.00041  
Epoch: [7][100/5595] Elapsed 0m 34s (remain 30m 55s) Loss avg.: 1.3853 Grad: 0.0017 LR: 0.00041  
Epoch: [7][200/5595] Elapsed 1m 7s (remain 30m 5s) Loss avg.: 1.3854 Grad: 0.0047 LR: 0.00041  
Epoch: [7][300/5595] Elapsed 1m 40s (remain 29m 27s) Loss avg.: 1.3854 Grad: 0.0008 LR: 0.00041  
Epoch: [7][400/5595] Elapsed 2m 13s (remain 28m 51s) Loss avg.: 1.3854 Grad: 0.0050 LR: 0.00041  
Epoch: [7][500/5595] Elapsed 2m 46s (remain 28m 16s) Loss avg.: 1.3854 Grad: 0.0024 LR: 0.00041  
Epoch: [7][600/5595] Elapsed 3m 20s (remain 27m 41s) Loss avg.: 1.3854 Grad: 0.0026 LR: 0.00041  
Epoch: [7][700/5595] Elapsed 3m 53s (remain 27m 8s) Loss avg.: 1.3853 Grad: 0.0028 LR: 0.00041  
Epoch: [7][800/5595] Elapsed 4m 26s (remain 26m 34s) Loss avg.: 1.3854 Grad: 0.0015 LR: 0.00041  
Epoch: [7][900/5595] Elapsed 4m 59s (remain 26m 1s) Loss av

Epoch 7 - avg_train_loss: 1.3853  avg_val_loss: 1.3853  time: 1937s
Epoch 7 - Accuracy: 0.26092012442460055


Eval: [621/622] Elapsed 1m 14s (remain 0m 0s) Loss avg.: 1.3853 
Epoch: [8][0/5595] Elapsed 0m 1s (remain 94m 49s) Loss avg.: 1.3829 Grad: 0.0037 LR: 0.00029  
Epoch: [8][100/5595] Elapsed 0m 34s (remain 31m 2s) Loss avg.: 1.3853 Grad: 0.0022 LR: 0.00029  
Epoch: [8][200/5595] Elapsed 1m 7s (remain 30m 11s) Loss avg.: 1.3854 Grad: 0.0031 LR: 0.00029  
Epoch: [8][300/5595] Elapsed 1m 40s (remain 29m 32s) Loss avg.: 1.3854 Grad: 0.0011 LR: 0.00029  
Epoch: [8][400/5595] Elapsed 2m 14s (remain 28m 56s) Loss avg.: 1.3854 Grad: 0.0030 LR: 0.00029  
Epoch: [8][500/5595] Elapsed 2m 47s (remain 28m 21s) Loss avg.: 1.3853 Grad: 0.0032 LR: 0.00029  
Epoch: [8][600/5595] Elapsed 3m 20s (remain 27m 47s) Loss avg.: 1.3853 Grad: 0.0035 LR: 0.00029  
Epoch: [8][700/5595] Elapsed 3m 53s (remain 27m 12s) Loss avg.: 1.3853 Grad: 0.0034 LR: 0.00029  
Epoch: [8][800/5595] Elapsed 4m 27s (remain 26m 38s) Loss avg.: 1.3853 Grad: 0.0058 LR: 0.00029  
Epoch: [8][900/5595] Elapsed 5m 0s (remain 26m 5s) Loss av

Epoch 8 - avg_train_loss: 1.3853  avg_val_loss: 1.3853  time: 1938s
Epoch 8 - Accuracy: 0.2609185160404155


Eval: [621/622] Elapsed 1m 15s (remain 0m 0s) Loss avg.: 1.3853 
Epoch: [9][0/5595] Elapsed 0m 0s (remain 90m 34s) Loss avg.: 1.3910 Grad: 0.0085 LR: 0.00019  
Epoch: [9][100/5595] Elapsed 0m 34s (remain 30m 56s) Loss avg.: 1.3855 Grad: 0.0022 LR: 0.00019  
Epoch: [9][200/5595] Elapsed 1m 7s (remain 30m 6s) Loss avg.: 1.3855 Grad: 0.0051 LR: 0.00019  
Epoch: [9][300/5595] Elapsed 1m 40s (remain 29m 28s) Loss avg.: 1.3854 Grad: 0.0025 LR: 0.00019  
Epoch: [9][400/5595] Elapsed 2m 13s (remain 28m 52s) Loss avg.: 1.3854 Grad: 0.0022 LR: 0.00019  
Epoch: [9][500/5595] Elapsed 2m 46s (remain 28m 17s) Loss avg.: 1.3854 Grad: 0.0051 LR: 0.00019  
Epoch: [9][600/5595] Elapsed 3m 20s (remain 27m 43s) Loss avg.: 1.3854 Grad: 0.0011 LR: 0.00019  
Epoch: [9][700/5595] Elapsed 3m 53s (remain 27m 10s) Loss avg.: 1.3853 Grad: 0.0031 LR: 0.00019  
Epoch: [9][800/5595] Elapsed 4m 26s (remain 26m 36s) Loss avg.: 1.3854 Grad: 0.0040 LR: 0.00019  
Epoch: [9][900/5595] Elapsed 5m 0s (remain 26m 3s) Loss av

Epoch 9 - avg_train_loss: 1.3853  avg_val_loss: 1.3853  time: 1937s
Epoch 9 - Accuracy: 0.26092012442460055


Eval: [621/622] Elapsed 1m 14s (remain 0m 0s) Loss avg.: 1.3853 
Epoch: [10][0/5595] Elapsed 0m 0s (remain 90m 59s) Loss avg.: 1.3854 Grad: 0.0008 LR: 0.00012  
Epoch: [10][100/5595] Elapsed 0m 34s (remain 30m 57s) Loss avg.: 1.3852 Grad: 0.0020 LR: 0.00012  
Epoch: [10][200/5595] Elapsed 1m 7s (remain 30m 7s) Loss avg.: 1.3851 Grad: 0.0035 LR: 0.00012  
Epoch: [10][300/5595] Elapsed 1m 40s (remain 29m 26s) Loss avg.: 1.3852 Grad: 0.0015 LR: 0.00012  
Epoch: [10][400/5595] Elapsed 2m 13s (remain 28m 50s) Loss avg.: 1.3852 Grad: 0.0020 LR: 0.00012  
Epoch: [10][500/5595] Elapsed 2m 46s (remain 28m 16s) Loss avg.: 1.3852 Grad: 0.0080 LR: 0.00012  
Epoch: [10][600/5595] Elapsed 3m 20s (remain 27m 42s) Loss avg.: 1.3852 Grad: 0.0041 LR: 0.00012  
Epoch: [10][700/5595] Elapsed 3m 53s (remain 27m 9s) Loss avg.: 1.3853 Grad: 0.0014 LR: 0.00012  
Epoch: [10][800/5595] Elapsed 4m 26s (remain 26m 36s) Loss avg.: 1.3853 Grad: 0.0037 LR: 0.00012  
Epoch: [10][900/5595] Elapsed 4m 59s (remain 26m 2

Epoch 10 - avg_train_loss: 1.3853  avg_val_loss: 1.3853  time: 1936s
Epoch 10 - Accuracy: 0.2609185160404155


Eval: [621/622] Elapsed 1m 14s (remain 0m 0s) Loss avg.: 1.3853 
Epoch: [11][0/5595] Elapsed 0m 0s (remain 91m 0s) Loss avg.: 1.3874 Grad: 0.0031 LR: 0.00100  
Epoch: [11][100/5595] Elapsed 0m 34s (remain 30m 50s) Loss avg.: 1.3855 Grad: 0.0038 LR: 0.00100  
Epoch: [11][200/5595] Elapsed 1m 7s (remain 30m 5s) Loss avg.: 1.3854 Grad: 0.0019 LR: 0.00100  
Epoch: [11][300/5595] Elapsed 1m 40s (remain 29m 28s) Loss avg.: 1.3854 Grad: 0.0022 LR: 0.00100  
Epoch: [11][400/5595] Elapsed 2m 13s (remain 28m 53s) Loss avg.: 1.3853 Grad: 0.0037 LR: 0.00100  
Epoch: [11][500/5595] Elapsed 2m 47s (remain 28m 18s) Loss avg.: 1.3853 Grad: 0.0039 LR: 0.00100  
Epoch: [11][600/5595] Elapsed 3m 20s (remain 27m 44s) Loss avg.: 1.3853 Grad: 0.0030 LR: 0.00100  
Epoch: [11][700/5595] Elapsed 3m 53s (remain 27m 10s) Loss avg.: 1.3853 Grad: 0.0031 LR: 0.00100  
Epoch: [11][800/5595] Elapsed 4m 26s (remain 26m 36s) Loss avg.: 1.3853 Grad: 0.0056 LR: 0.00100  
Epoch: [11][900/5595] Elapsed 5m 0s (remain 26m 3s

Epoch 11 - avg_train_loss: 1.3853  avg_val_loss: 1.3853  time: 1936s
Epoch 11 - Accuracy: 0.26092012442460055


Eval: [621/622] Elapsed 1m 15s (remain 0m 0s) Loss avg.: 1.3853 
Epoch: [12][0/5595] Elapsed 0m 0s (remain 91m 55s) Loss avg.: 1.3834 Grad: 0.0039 LR: 0.00098  
Epoch: [12][100/5595] Elapsed 0m 34s (remain 30m 59s) Loss avg.: 1.3854 Grad: 0.0026 LR: 0.00098  
Epoch: [12][200/5595] Elapsed 1m 7s (remain 30m 10s) Loss avg.: 1.3854 Grad: 0.0016 LR: 0.00098  
Epoch: [12][300/5595] Elapsed 1m 40s (remain 29m 31s) Loss avg.: 1.3853 Grad: 0.0014 LR: 0.00098  
Epoch: [12][400/5595] Elapsed 2m 13s (remain 28m 55s) Loss avg.: 1.3853 Grad: 0.0040 LR: 0.00098  
Epoch: [12][500/5595] Elapsed 2m 47s (remain 28m 20s) Loss avg.: 1.3853 Grad: 0.0026 LR: 0.00098  
Epoch: [12][600/5595] Elapsed 3m 20s (remain 27m 46s) Loss avg.: 1.3853 Grad: 0.0019 LR: 0.00098  
Epoch: [12][700/5595] Elapsed 3m 53s (remain 27m 12s) Loss avg.: 1.3854 Grad: 0.0043 LR: 0.00098  
Epoch: [12][800/5595] Elapsed 4m 27s (remain 26m 38s) Loss avg.: 1.3853 Grad: 0.0021 LR: 0.00098  
Epoch: [12][900/5595] Elapsed 5m 0s (remain 26m 

Epoch 12 - avg_train_loss: 1.3853  avg_val_loss: 1.3854  time: 1938s
Epoch 12 - Accuracy: 0.26092012442460055


Eval: [621/622] Elapsed 1m 15s (remain 0m 0s) Loss avg.: 1.3854 
Epoch: [13][0/5595] Elapsed 0m 1s (remain 96m 52s) Loss avg.: 1.3846 Grad: 0.0012 LR: 0.00091  
Epoch: [13][100/5595] Elapsed 0m 34s (remain 31m 2s) Loss avg.: 1.3853 Grad: 0.0032 LR: 0.00091  
Epoch: [13][200/5595] Elapsed 1m 7s (remain 30m 10s) Loss avg.: 1.3853 Grad: 0.0017 LR: 0.00091  
Epoch: [13][300/5595] Elapsed 1m 40s (remain 29m 30s) Loss avg.: 1.3853 Grad: 0.0008 LR: 0.00091  
Epoch: [13][400/5595] Elapsed 2m 13s (remain 28m 53s) Loss avg.: 1.3853 Grad: 0.0032 LR: 0.00091  
Epoch: [13][500/5595] Elapsed 2m 47s (remain 28m 19s) Loss avg.: 1.3853 Grad: 0.0031 LR: 0.00091  
Epoch: [13][600/5595] Elapsed 3m 20s (remain 27m 45s) Loss avg.: 1.3853 Grad: 0.0025 LR: 0.00091  
Epoch: [13][700/5595] Elapsed 3m 53s (remain 27m 11s) Loss avg.: 1.3853 Grad: 0.0027 LR: 0.00091  
Epoch: [13][800/5595] Elapsed 4m 26s (remain 26m 37s) Loss avg.: 1.3853 Grad: 0.0014 LR: 0.00091  
Epoch: [13][900/5595] Elapsed 5m 0s (remain 26m 4

Epoch 13 - avg_train_loss: 1.3853  avg_val_loss: 1.3853  time: 1932s
Epoch 13 - Accuracy: 0.2609185160404155


Eval: [621/622] Elapsed 1m 15s (remain 0m 0s) Loss avg.: 1.3853 
Epoch: [14][0/5595] Elapsed 0m 0s (remain 91m 22s) Loss avg.: 1.3827 Grad: 0.0037 LR: 0.00081  
Epoch: [14][100/5595] Elapsed 0m 34s (remain 30m 56s) Loss avg.: 1.3854 Grad: 0.0026 LR: 0.00081  
Epoch: [14][200/5595] Elapsed 1m 7s (remain 30m 4s) Loss avg.: 1.3853 Grad: 0.0056 LR: 0.00081  
Epoch: [14][300/5595] Elapsed 1m 40s (remain 29m 24s) Loss avg.: 1.3853 Grad: 0.0020 LR: 0.00081  
Epoch: [14][400/5595] Elapsed 2m 13s (remain 28m 47s) Loss avg.: 1.3853 Grad: 0.0040 LR: 0.00081  
Epoch: [14][500/5595] Elapsed 2m 46s (remain 28m 12s) Loss avg.: 1.3853 Grad: 0.0036 LR: 0.00081  
Epoch: [14][600/5595] Elapsed 3m 19s (remain 27m 37s) Loss avg.: 1.3853 Grad: 0.0027 LR: 0.00081  
Epoch: [14][700/5595] Elapsed 3m 52s (remain 27m 3s) Loss avg.: 1.3854 Grad: 0.0015 LR: 0.00081  
Epoch: [14][800/5595] Elapsed 4m 25s (remain 26m 29s) Loss avg.: 1.3853 Grad: 0.0030 LR: 0.00081  
Epoch: [14][900/5595] Elapsed 4m 58s (remain 25m 5

Epoch 14 - avg_train_loss: 1.3853  avg_val_loss: 1.3853  time: 1928s
Epoch 14 - Accuracy: 0.26092012442460055


Eval: [621/622] Elapsed 1m 14s (remain 0m 0s) Loss avg.: 1.3853 
Epoch: [15][0/5595] Elapsed 0m 0s (remain 92m 30s) Loss avg.: 1.3865 Grad: 0.0021 LR: 0.00069  
Epoch: [15][100/5595] Elapsed 0m 34s (remain 30m 58s) Loss avg.: 1.3854 Grad: 0.0033 LR: 0.00069  
Epoch: [15][200/5595] Elapsed 1m 7s (remain 30m 3s) Loss avg.: 1.3853 Grad: 0.0023 LR: 0.00069  
Epoch: [15][300/5595] Elapsed 1m 40s (remain 29m 22s) Loss avg.: 1.3853 Grad: 0.0022 LR: 0.00069  
Epoch: [15][400/5595] Elapsed 2m 13s (remain 28m 46s) Loss avg.: 1.3854 Grad: 0.0028 LR: 0.00069  
Epoch: [15][500/5595] Elapsed 2m 46s (remain 28m 12s) Loss avg.: 1.3854 Grad: 0.0034 LR: 0.00069  
Epoch: [15][600/5595] Elapsed 3m 19s (remain 27m 36s) Loss avg.: 1.3853 Grad: 0.0017 LR: 0.00069  
Epoch: [15][700/5595] Elapsed 3m 52s (remain 27m 2s) Loss avg.: 1.3853 Grad: 0.0014 LR: 0.00069  
Epoch: [15][800/5595] Elapsed 4m 25s (remain 26m 30s) Loss avg.: 1.3853 Grad: 0.0030 LR: 0.00069  
Epoch: [15][900/5595] Elapsed 4m 58s (remain 25m 5

Epoch 15 - avg_train_loss: 1.3853  avg_val_loss: 1.3853  time: 1925s
Epoch 15 - Accuracy: 0.2609185160404155


Eval: [621/622] Elapsed 1m 14s (remain 0m 0s) Loss avg.: 1.3853 
Epoch: [16][0/5595] Elapsed 0m 0s (remain 91m 27s) Loss avg.: 1.3852 Grad: 0.0013 LR: 0.00055  
Epoch: [16][100/5595] Elapsed 0m 34s (remain 30m 53s) Loss avg.: 1.3856 Grad: 0.0024 LR: 0.00055  
Epoch: [16][200/5595] Elapsed 1m 7s (remain 30m 6s) Loss avg.: 1.3854 Grad: 0.0050 LR: 0.00055  
Epoch: [16][300/5595] Elapsed 1m 40s (remain 29m 27s) Loss avg.: 1.3854 Grad: 0.0027 LR: 0.00055  
Epoch: [16][400/5595] Elapsed 2m 13s (remain 28m 49s) Loss avg.: 1.3854 Grad: 0.0029 LR: 0.00055  
Epoch: [16][500/5595] Elapsed 2m 46s (remain 28m 13s) Loss avg.: 1.3854 Grad: 0.0031 LR: 0.00055  
Epoch: [16][600/5595] Elapsed 3m 19s (remain 27m 39s) Loss avg.: 1.3854 Grad: 0.0014 LR: 0.00055  
Epoch: [16][700/5595] Elapsed 3m 52s (remain 27m 4s) Loss avg.: 1.3853 Grad: 0.0039 LR: 0.00055  
Epoch: [16][800/5595] Elapsed 4m 25s (remain 26m 30s) Loss avg.: 1.3853 Grad: 0.0036 LR: 0.00055  
Epoch: [16][900/5595] Elapsed 4m 58s (remain 25m 5

Epoch 16 - avg_train_loss: 1.3853  avg_val_loss: 1.3853  time: 1930s
Epoch 16 - Accuracy: 0.26092012442460055


Eval: [621/622] Elapsed 1m 14s (remain 0m 0s) Loss avg.: 1.3853 
Epoch: [17][0/5595] Elapsed 0m 0s (remain 91m 42s) Loss avg.: 1.3867 Grad: 0.0023 LR: 0.00041  
Epoch: [17][100/5595] Elapsed 0m 34s (remain 30m 51s) Loss avg.: 1.3854 Grad: 0.0027 LR: 0.00041  
Epoch: [17][200/5595] Elapsed 1m 7s (remain 30m 1s) Loss avg.: 1.3854 Grad: 0.0014 LR: 0.00041  
Epoch: [17][300/5595] Elapsed 1m 40s (remain 29m 24s) Loss avg.: 1.3855 Grad: 0.0015 LR: 0.00041  
Epoch: [17][400/5595] Elapsed 2m 13s (remain 28m 48s) Loss avg.: 1.3855 Grad: 0.0025 LR: 0.00041  
Epoch: [17][500/5595] Elapsed 2m 46s (remain 28m 14s) Loss avg.: 1.3854 Grad: 0.0035 LR: 0.00041  
Epoch: [17][600/5595] Elapsed 3m 19s (remain 27m 40s) Loss avg.: 1.3854 Grad: 0.0037 LR: 0.00041  
Epoch: [17][700/5595] Elapsed 3m 53s (remain 27m 6s) Loss avg.: 1.3854 Grad: 0.0027 LR: 0.00041  
Epoch: [17][800/5595] Elapsed 4m 26s (remain 26m 33s) Loss avg.: 1.3854 Grad: 0.0033 LR: 0.00041  
Epoch: [17][900/5595] Elapsed 4m 59s (remain 26m 0

KeyboardInterrupt: 