# About this notebook ...

## Library

In [1]:
import glob
import json
import math
import os
import random
import time
import warnings
from contextlib import contextmanager

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedKFold
from torch.optim import SGD, Adam
from torch.optim.lr_scheduler import CosineAnnealingLR, CosineAnnealingWarmRestarts, ReduceLROnPlateau
from torch.utils.data import DataLoader, Dataset
from tqdm.notebook import tqdm

In [2]:
warnings.filterwarnings("ignore")

## Load Data

In [3]:
BASE_DIR = "../input/hungrygeeseepisode/hungry-geese-episode/"
OUTPUT_DIR = "pre-models/"

if not os.path.exists(OUTPUT_DIR):
    os.makedirs(OUTPUT_DIR)

In [4]:
paths = [path for path in glob.glob(BASE_DIR + "*.json") if "info" not in path]
print(len(paths))

9512


## Config

In [5]:
class Config:
    seed = 440

    n_class = 4
    n_fold = 10

    gradient_accumulation_steps = 1
    max_grad_norm = 1000

    num_workers = 4
    batch_size = 3200

    scheduler = "CosineAnnealingWarmRestarts"
    # factor = 0.2  # ReduceLROnPlateau
    # patience = 4  # ReduceLROnPlateau
    # eps = 1e-6  # ReduceLROnPlateau
    # T_max = 10  # CosineAnnealingLR
    T_0 = 10  # CosineAnnealingWarmRestarts

    criterion = "CrossEntropyLoss"
    lr = 1e-3
    min_lr = 1e-4
    weight_decay = 0

    epochs = 10
    model_name = "geese_net"

    print_freq = 100

    train = True
    debug = False
    apex = False

In [6]:
if Config.debug:
    Config.epochs = 1

In [7]:
if Config.apex:
    from apex import amp

In [8]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

## Utils

In [9]:
@contextmanager
def timer(name):
    t0 = time.time()
    LOGGER.info(f"[{name}] start")
    yield
    LOGGER.info(f"[{name}] done in {time.time() - t0:.0f} s.")


def init_logger(log_file=OUTPUT_DIR + "train.log"):
    from logging import INFO, FileHandler, Formatter, StreamHandler, getLogger

    logger = getLogger(__name__)
    logger.setLevel(INFO)
    handler1 = StreamHandler()
    handler1.setFormatter(Formatter("%(message)s"))
    handler2 = FileHandler(filename=log_file)
    handler2.setFormatter(Formatter("%(message)s"))
    logger.addHandler(handler1)
    logger.addHandler(handler2)
    return logger


LOGGER = init_logger()


def seed_torch(seed=42):
    random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True


seed_torch(seed=Config.seed)

In [10]:
def reverse_ns(y):
    if y == 0:
        return 1
    if y == 1:
        return 0
    return y


def reverse_we(y):
    if y == 2:
        return 3
    if y == 3:
        return 2
    return y


def reverse_nswe(y):
    return reverse_ns(reverse_we(y))

## Observation

In [11]:
def make_input(obses):
    b = np.zeros((17, 7 * 11), dtype=np.float32)
    obs = obses[-1]

    for p, pos_list in enumerate(obs["geese"]):
        pid = (p - obs["index"]) % 4

        # head position
        for pos in pos_list[:1]:
            b[0 + pid, pos] = 1
        # tip position
        for pos in pos_list[-1:]:
            b[4 + pid, pos] = 1
        # whole position
        for pos in pos_list:
            b[8 + pid, pos] = 1

    # previous head position
    if len(obses) > 1:
        obs_prev = obses[-2]
        for p, pos_list in enumerate(obs_prev["geese"]):
            for pos in pos_list[:1]:
                b[12 + (p - obs["index"]) % 4, pos] = 1

    # food
    for pos in obs["food"]:
        b[16, pos] = 1

    return b.reshape(-1, 7, 11)

In [12]:
def observation_num_step(obses):
    b = np.zeros((7, 11), dtype=np.float32)
    obs = obses[-1]

    b[0, 0] = obs["step"]  # 0-198

    return b.reshape(1, 7, 11)

## Data

In [13]:
def create_dataset_from_json(filepath, json_object=None, standing=0):
    if json_object is None:
        json_open = open(path, "r")
        json_load = json.load(json_open)
    else:
        json_load = json_object

    try:
        winner_index = np.argmax(np.argsort(json_load["rewards"]) == 3 - standing)

        obses = []
        X = []
        y = []
        actions = {"NORTH": 0, "SOUTH": 1, "WEST": 2, "EAST": 3}

        for i in range(len(json_load["steps"]) - 1):
            if json_load["steps"][i][winner_index]["status"] == "ACTIVE":
                y_ = json_load["steps"][i + 1][winner_index]["action"]
                if y_ is not None:
                    step = json_load["steps"][i]
                    step[winner_index]["observation"]["geese"] = step[0]["observation"]["geese"]
                    step[winner_index]["observation"]["food"] = step[0]["observation"]["food"]
                    step[winner_index]["observation"]["step"] = step[0]["observation"]["step"]
                    obses.append(step[winner_index]["observation"])
                    y.append(actions[y_])

                    y.append(reverse_ns(actions[y_]))  # 上下反転
                    y.append(reverse_we(actions[y_]))  # 左右反転
                    y.append(reverse_nswe(actions[y_]))  # 上下左右反転

        for j in range(len(obses)):
            # X_ = make_input(obses[: j + 1])

            X_ = []
            X_.append(make_input(obses[: j + 1]))
            # X_.append(observation_num_step(obses[: j + 1]))
            X_ = np.concatenate(X_)

            X.append(X_)

            X.append(X_[:, ::-1, :])  # 上下反転
            X.append(X_[:, :, ::-1])  # 左右反転
            X.append(X_[:, ::-1, ::-1])  # 上下左右反転

        X = np.array(X, dtype=np.float32)  # [starting_step:]
        y = np.array(y, dtype=np.uint8)  # [starting_step:]

        return X, y
    except:
        return 0, 0

In [14]:
X_train = []
y_train = []

for path in tqdm(paths[: int(len(paths))]):
    X, y = create_dataset_from_json(path, standing=0)  # use only winners' moves
    if X is not 0:
        X_train.append(X)
        y_train.append(y)

X_train = np.concatenate(X_train)
y_train = np.concatenate(y_train)

print(f"Num episode: {len(X_train)}")

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=9512.0), HTML(value='')))


Num episode: 5996532


In [15]:
# %%time

# X_train, unique_index = np.unique(X_train, axis=0, return_index=True)  # remove duplicate
# y_train = y_train[unique_index]

# y_train = np.eye(4, dtype="uint8")[y_train]  # to categorical

# print(f"Num episode: {len(X_train)}")

In [16]:
X_train_sum_obs = X_train.reshape(X_train.shape[0], -1).sum(1)
X_train_group = np.unique(X_train_sum_obs)
X_train_group.shape

(75,)

In [17]:
X_train_unique = []
y_train_unique = []
for group in tqdm(X_train_group):
    group_index = np.where(X_train_sum_obs == group)

    X_train_ = X_train[group_index]
    y_train_ = y_train[group_index]

    X_train_, unique_index = np.unique(X_train_, axis=0, return_index=True)  # remove duplicate
    y_train_ = y_train_[unique_index]

    X_train_unique.append(X_train_)
    y_train_unique.append(y_train_)

X_train = np.concatenate(X_train_unique)
y_train = np.concatenate(y_train_unique)

print(f"Num episode: {len(X_train)}")

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=75.0), HTML(value='')))


Num episode: 5995760


In [18]:
del X_train_sum_obs
del X_train_group
del X_train_unique
del y_train_unique
del X_train_
del y_train_
del group_index
del unique_index

In [19]:
if Config.debug:
    X_train = X_train[:1000]
    y_train = y_train[:1000]

In [20]:
y_df = pd.DataFrame(y_train)
y_df.columns = ["action"]
y_df

Unnamed: 0,action
0,2
1,2
2,0
3,2
4,3
...,...
5995755,1
5995756,2
5995757,3
5995758,2


## CV Split

In [21]:
folds = y_df.copy()
Fold = StratifiedKFold(n_splits=Config.n_fold, shuffle=True, random_state=Config.seed)
for n, (train_index, val_index) in enumerate(Fold.split(folds, folds["action"])):
    folds.loc[val_index, "fold"] = int(n)
folds["fold"] = folds["fold"].astype(int)
print(folds.groupby(["fold", "action"]).size())

fold  action
0     0         142836
      1         142836
      2         156952
      3         156952
1     0         142836
      1         142836
      2         156952
      3         156952
2     0         142836
      1         142836
      2         156952
      3         156952
3     0         142836
      1         142836
      2         156952
      3         156952
4     0         142836
      1         142836
      2         156952
      3         156952
5     0         142836
      1         142836
      2         156952
      3         156952
6     0         142836
      1         142836
      2         156952
      3         156952
7     0         142836
      1         142836
      2         156952
      3         156952
8     0         142837
      1         142837
      2         156951
      3         156951
9     0         142837
      1         142837
      2         156951
      3         156951
dtype: int64


## Dataset

In [22]:
class TrainDataset(Dataset):
    def __init__(self, array, label):
        self.array = array
        self.label = label

    def __len__(self):
        return self.array.shape[0]

    def __getitem__(self, idx):
        return self.array[idx], torch.tensor(self.label[idx]).long()


class TestDataset(Dataset):
    def __init__(self, array):
        self.array = array

    def __len__(self):
        return self.array.shape[0]

    def __getitem__(self, idx):
        return self.array[idx]

In [23]:
# Test

train_ds = TrainDataset(X_train, y_train)

for i in range(1):
    obs, action = train_ds[i]
    print(obs.shape, action)

(17, 7, 11) tensor(2)


## Model

In [24]:
class TorusConv2d(nn.Module):
    def __init__(self, input_dim, output_dim, kernel_size, bn):
        super().__init__()
        self.edge_size = (kernel_size[0] // 2, kernel_size[1] // 2)
        self.conv = nn.Conv2d(input_dim, output_dim, kernel_size=kernel_size)
        self.bn = nn.BatchNorm2d(output_dim) if bn else None

    def forward(self, x):
        h = torch.cat([x[:, :, :, -self.edge_size[1] :], x, x[:, :, :, : self.edge_size[1]]], dim=3)
        h = torch.cat([h[:, :, -self.edge_size[0] :], h, h[:, :, : self.edge_size[0]]], dim=2)
        h = self.conv(h)
        h = self.bn(h) if self.bn is not None else h
        return h

In [25]:
class GeeseNet(nn.Module):
    def __init__(self):
        super().__init__()
        layers, filters = 12, 32
        self.conv0 = TorusConv2d(17, filters, (3, 3), True)
        self.blocks = nn.ModuleList([TorusConv2d(filters, filters, (3, 3), True) for _ in range(layers)])

        self.conv_p = TorusConv2d(filters, filters, (3, 3), True)
        self.conv_v = TorusConv2d(filters, filters, (3, 3), True)

        self.head_p = nn.Linear(filters, 4, bias=False)
        self.head_v1 = nn.Linear(filters * 2, filters, bias=False)
        self.head_v2 = nn.Linear(filters, 1, bias=False)

    def forward(self, x, _=None):
        h = F.relu_(self.conv0(x))
        for block in self.blocks:
            h = F.relu_(h + block(h))

        h_p = F.relu_(self.conv_p(h))
        h_head_p = (h_p * x[:, :1]).view(h_p.size(0), h_p.size(1), -1).sum(-1)
        p = self.head_p(h_head_p)

        h_v = F.relu_(self.conv_v(h))
        h_head_v = (h_v * x[:, :1]).view(h_v.size(0), h_v.size(1), -1).sum(-1)
        h_avg_v = h_v.view(h_v.size(0), h_v.size(1), -1).mean(-1)

        h_v = F.relu_(self.head_v1(torch.cat([h_head_v, h_avg_v], 1)))
        v = torch.tanh(self.head_v2(h_v))

        return {"policy": p, "value": v}

In [28]:
class GeeseNetAlpha(nn.Module):
    def __init__(self):
        super().__init__()
        layers, filters = 12, 64

        self.conv0 = TorusConv2d(17, filters, (3, 3), True)
        self.blocks = nn.ModuleList([TorusConv2d(filters, filters, (3, 3), True) for _ in range(layers)])

        self.conv_p = TorusConv2d(filters, filters, (3, 3), True)
        # self.conv_v = TorusConv2d(filters, filters, (3, 3), True)

        self.head_p1 = nn.Linear(filters * 5 + 77, filters * 3, bias=False)
        self.head_p2 = nn.Linear(filters * 3, 4, bias=False)
        # self.head_v1 = nn.Linear(filters * 2, filters, bias=False)
        # self.head_v2 = nn.Linear(filters, 1, bias=False)

    def forward(self, x, _=None):
        h = F.relu_(self.conv0(x))
        for block in self.blocks:
            h = F.relu_(h + block(h))

        h_p = F.relu_(self.conv_p(h))
        h_head_p = (h_p * x[:, :1]).view(h_p.size(0), h_p.size(1), -1).sum(-1)
        h_head_p2 = (h_p * x[:, 1:2]).view(h_p.size(0), h_p.size(1), -1).sum(-1)
        h_head_p3 = (h_p * x[:, 2:3]).view(h_p.size(0), h_p.size(1), -1).sum(-1)
        h_head_p4 = (h_p * x[:, 3:4]).view(h_p.size(0), h_p.size(1), -1).sum(-1)
        h_avg_p1 = h_p.view(h_p.size(0), h_p.size(1), -1).mean(-1)
        h_avg_p2 = h_p.view(h_p.size(0), h_p.size(1), -1).mean(1)

        h_p = F.relu_(self.head_p1(torch.cat([h_head_p, h_head_p2, h_head_p3, h_head_p4, h_avg_p1, h_avg_p2], 1)))
        p = self.head_p2(h_p)

        # h_v = F.relu_(self.conv_v(h))
        # h_head_v = (h_v * x[:, :1]).view(h_v.size(0), h_v.size(1), -1).sum(-1)
        # h_avg_v = h_v.view(h_v.size(0), h_v.size(1), -1).mean(-1)

        # h_v = F.relu_(self.head_v1(torch.cat([h_head_v, h_avg_v], 1)))
        # v = torch.tanh(self.head_v2(h_v))

        return {"policy": p}  # "value": v

In [29]:
# Test

model = GeeseNetAlpha()
# print(model)

params = sum(p.numel() for p in model.parameters())
print(f"params: {params:,}")

train_ds = TrainDataset(X_train, y_train)
train_loader = DataLoader(train_ds, batch_size=4, shuffle=True, num_workers=4, pin_memory=True, drop_last=True)

for obs, action in train_loader:
    output = model(obs)
    print(output)
    print(f"{torch.argmax(output['policy'], dim=1)}")
    break

params: 568,704
{'policy': tensor([[ 0.1287,  0.2129,  0.0457, -0.1798],
        [-0.1429,  0.1576,  0.0892, -0.2146],
        [-0.0177,  0.1524,  0.0746, -0.1201],
        [-0.1002,  0.2029,  0.0028, -0.0931]], grad_fn=<MmBackward>)}
tensor([1, 1, 1, 1])


## Loss

## Scoring

In [30]:
def get_score(y_true, y_pred):
    return accuracy_score(y_true, y_pred)

## Helper functions

In [31]:
class AverageMeter(object):
    """Computes and stores the average and current value"""

    def __init__(self):
        self.reset()

    def reset(self):
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count


def asMinutes(s):
    m = math.floor(s / 60)
    s -= m * 60
    return "%dm %ds" % (m, s)


def timeSince(since, percent):
    now = time.time()
    s = now - since
    es = s / (percent)
    rs = es - s
    return "%s (remain %s)" % (asMinutes(s), asMinutes(rs))

In [32]:
def train_fn(train_loader, model, criterion, optimizer, epoch, scheduler, device):
    batch_time = AverageMeter()
    data_time = AverageMeter()
    losses = AverageMeter()
    scores = AverageMeter()

    # switch to train mode
    model.train()
    start = end = time.time()
    global_step = 0

    for step, (obs, action) in enumerate(train_loader):
        # measure data loading time
        data_time.update(time.time() - end)
        obs = obs.to(device)
        action = action.to(device)
        batch_size = action.size(0)

        y_preds = model(obs.float())["policy"]

        loss = criterion(y_preds, action)

        # record loss
        losses.update(loss.item(), batch_size)
        if Config.gradient_accumulation_steps > 1:
            loss = loss / Config.gradient_accumulation_steps
        if Config.apex:
            with amp.scale_loss(loss, optimizer) as scaled_loss:
                scaled_loss.backward()
        else:
            loss.backward()

        grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), Config.max_grad_norm)

        if (step + 1) % Config.gradient_accumulation_steps == 0:
            optimizer.step()
            optimizer.zero_grad()
            global_step += 1

        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()

        if step % Config.print_freq == 0 or step == (len(train_loader) - 1):
            print(
                f"Epoch: [{epoch + 1}][{step}/{len(train_loader)}] "
                f"Elapsed {timeSince(start, float(step + 1) / len(train_loader)):s} "
                f"Loss: {losses.val:.4f}({losses.avg:.4f}) "
                f"Grad: {grad_norm:.4f} "
                f"LR: {scheduler.get_last_lr()[0]:.6f}  "
            )

    return losses.avg

In [33]:
def valid_fn(valid_loader, model, criterion, device):
    batch_time = AverageMeter()
    data_time = AverageMeter()
    losses = AverageMeter()
    scores = AverageMeter()

    # switch to evaluation mode
    model.eval()
    preds = []
    start = end = time.time()

    for step, (obs, action) in enumerate(valid_loader):
        # measure data loading time
        data_time.update(time.time() - end)
        obs = obs.to(device)
        action = action.to(device)
        batch_size = action.size(0)

        # compute loss
        with torch.no_grad():
            y_preds = model(obs)["policy"]

        loss = criterion(y_preds, action)
        losses.update(loss.item(), batch_size)

        # record accuracy
        preds.append(y_preds.softmax(1).to("cpu").numpy())
        if Config.gradient_accumulation_steps > 1:
            loss = loss / Config.gradient_accumulation_steps

        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()

        if step % Config.print_freq == 0 or step == (len(valid_loader) - 1):
            print(
                f"EVAL: [{step}/{len(valid_loader)}] "
                f"Elapsed {timeSince(start, float(step + 1) / len(valid_loader)):s} "
                f"Loss: {losses.val:.4f}({losses.avg:.4f}) "
            )
    predictions = np.concatenate(preds)
    return losses.avg, predictions

## Train loop

In [34]:
def train_loop(folds, fold):

    LOGGER.info(f"========== fold: {fold} training ==========")

    # ====================================================
    # Data Loader
    # ====================================================
    X_train_folds = X_train[folds["fold"] != fold]
    X_valid_folds = X_train[folds["fold"] == fold]

    y_train_folds = y_train[folds["fold"] != fold]
    y_valid_folds = y_train[folds["fold"] == fold]

    y_df_train_folds = y_df[folds["fold"] != fold]
    y_df_valid_folds = y_df[folds["fold"] == fold]

    train_dataset = TrainDataset(X_train_folds, y_train_folds)
    valid_dataset = TrainDataset(X_valid_folds, y_valid_folds)

    train_loader = DataLoader(
        train_dataset,
        batch_size=Config.batch_size,
        shuffle=True,
        num_workers=Config.num_workers,
        pin_memory=True,
        drop_last=True,
    )
    valid_loader = DataLoader(
        valid_dataset,
        batch_size=Config.batch_size,
        shuffle=False,
        num_workers=Config.num_workers,
        pin_memory=True,
        drop_last=False,
    )

    # ====================================================
    # Scheduler
    # ====================================================
    def get_scheduler(optimizer):
        if Config.scheduler == "ReduceLROnPlateau":
            scheduler = ReduceLROnPlateau(
                optimizer, mode="min", factor=Config.factor, patience=Config.patience, verbose=True, eps=Config.eps
            )
        elif Config.scheduler == "CosineAnnealingLR":
            scheduler = CosineAnnealingLR(optimizer, T_max=Config.T_max, eta_min=Config.min_lr, last_epoch=-1)
        elif Config.scheduler == "CosineAnnealingWarmRestarts":
            scheduler = CosineAnnealingWarmRestarts(
                optimizer, T_0=Config.T_0, T_mult=1, eta_min=Config.min_lr, last_epoch=-1
            )
        return scheduler

    # ====================================================
    # model & optimizer
    # ====================================================
    model = GeeseNetAlpha()
    model.to(device)

    # Use multi GPU
    if device == torch.device("cuda") and not Config.apex:
        model = torch.nn.DataParallel(model)  # make parallel
        # torch.backends.cudnn.benchmark=True

    optimizer = Adam(model.parameters(), lr=Config.lr, weight_decay=Config.weight_decay, amsgrad=False)
    scheduler = get_scheduler(optimizer)

    # ====================================================
    # apex
    # ====================================================
    if Config.apex:
        model, optimizer = amp.initialize(model, optimizer, opt_level="O1", verbosity=0)

    # ====================================================
    # Criterion
    # ====================================================
    def get_criterion():
        if Config.criterion == "CrossEntropyLoss":
            criterion = nn.CrossEntropyLoss()
        return criterion

    criterion = get_criterion()

    # ====================================================
    # loop
    # ====================================================
    best_score = 0.0
    best_loss = np.inf
    best_preds = None

    for epoch in range(Config.epochs):

        start_time = time.time()

        # train
        avg_loss = train_fn(train_loader, model, criterion, optimizer, epoch, scheduler, device)

        # eval
        avg_val_loss, preds = valid_fn(valid_loader, model, criterion, device)

        if isinstance(scheduler, ReduceLROnPlateau):
            scheduler.step(avg_val_loss)
        elif isinstance(scheduler, CosineAnnealingLR):
            scheduler.step()
        elif isinstance(scheduler, CosineAnnealingWarmRestarts):
            scheduler.step()

        # scoring
        score = get_score(y_valid_folds, preds.argmax(1))

        elapsed = time.time() - start_time

        LOGGER.info(
            f"Epoch {epoch+1} - avg_train_loss: {avg_loss:.4f}  avg_val_loss: {avg_val_loss:.4f}  time: {elapsed:.0f}s"
        )
        LOGGER.info(f"Epoch {epoch+1} - Accuracy: {score}")

        if score > best_score:
            best_score = score
            LOGGER.info(f"Epoch {epoch+1} - Save Best Score: {best_score:.4f} Model")
            torch.save(model.module.state_dict(), OUTPUT_DIR + f"{Config.model_name}_fold{fold}_best.pth")
            best_preds = preds

        if epoch == Config.epochs - 1:
            LOGGER.info(f"Epoch {epoch+1} - Save final model")
            torch.save(model.module.state_dict(), OUTPUT_DIR + f"{Config.model_name}_fold{fold}_final.pth")

    y_df_valid_folds[[str(c) for c in range(Config.n_class)]] = best_preds
    y_df_valid_folds["preds"] = best_preds.argmax(1)

    return y_df_valid_folds

## Main


In [35]:
def main():
    def get_result(result_df):
        preds = result_df["preds"].values
        labels = result_df["action"].values
        score = get_score(labels, preds)
        LOGGER.info(f"Score: {score:<.5f}")

    if Config.train:
        # train
        oof_df = pd.DataFrame()
        for fold in range(Config.n_fold):
            _oof_df = train_loop(folds, fold)
            oof_df = pd.concat([oof_df, _oof_df])
            LOGGER.info(f"========== fold: {fold} result ==========")
            get_result(_oof_df)
            break  # fold 1つだけ
        # CV result
        # LOGGER.info(f"========== CV ==========")
        # get_result(oof_df)
        # save result
        oof_df.to_csv(OUTPUT_DIR + "oof_df.csv", index=False)

In [36]:
if __name__ == "__main__":
    main()



Epoch: [1][0/1686] Elapsed 0m 3s (remain 104m 7s) Loss: 1.3871(1.3871) Grad: 0.3429 LR: 0.001000  
Epoch: [1][100/1686] Elapsed 1m 12s (remain 19m 4s) Loss: 0.5778(0.6889) Grad: 1.1594 LR: 0.001000  
Epoch: [1][200/1686] Elapsed 2m 22s (remain 17m 29s) Loss: 0.5561(0.6303) Grad: 0.9499 LR: 0.001000  
Epoch: [1][300/1686] Elapsed 3m 31s (remain 16m 12s) Loss: 0.5334(0.6005) Grad: 0.6855 LR: 0.001000  
Epoch: [1][400/1686] Elapsed 4m 40s (remain 14m 58s) Loss: 0.5356(0.5813) Grad: 0.3950 LR: 0.001000  
Epoch: [1][500/1686] Elapsed 5m 49s (remain 13m 46s) Loss: 0.5090(0.5679) Grad: 0.5891 LR: 0.001000  
Epoch: [1][600/1686] Elapsed 6m 58s (remain 12m 36s) Loss: 0.5245(0.5580) Grad: 0.6371 LR: 0.001000  
Epoch: [1][700/1686] Elapsed 8m 8s (remain 11m 25s) Loss: 0.4832(0.5500) Grad: 0.4434 LR: 0.001000  
Epoch: [1][800/1686] Elapsed 9m 17s (remain 10m 15s) Loss: 0.4974(0.5438) Grad: 0.4388 LR: 0.001000  
Epoch: [1][900/1686] Elapsed 10m 26s (remain 9m 5s) Loss: 0.4908(0.5385) Grad: 0.3652 L

Epoch 1 - avg_train_loss: 0.5144  avg_val_loss: 0.4819  time: 1206s
Epoch 1 - Accuracy: 0.7895729648951926
Epoch 1 - Save Best Score: 0.7896 Model


EVAL: [187/188] Elapsed 0m 35s (remain 0m 0s) Loss: 0.2644(0.4819) 
Epoch: [2][0/1686] Elapsed 0m 1s (remain 42m 36s) Loss: 0.4892(0.4892) Grad: 0.2867 LR: 0.000978  
Epoch: [2][100/1686] Elapsed 1m 10s (remain 18m 31s) Loss: 0.4788(0.4752) Grad: 0.3878 LR: 0.000978  
Epoch: [2][200/1686] Elapsed 2m 20s (remain 17m 14s) Loss: 0.4809(0.4768) Grad: 0.2241 LR: 0.000978  
Epoch: [2][300/1686] Elapsed 3m 29s (remain 16m 2s) Loss: 0.4811(0.4765) Grad: 0.3738 LR: 0.000978  
Epoch: [2][400/1686] Elapsed 4m 38s (remain 14m 52s) Loss: 0.4807(0.4764) Grad: 0.4123 LR: 0.000978  
Epoch: [2][500/1686] Elapsed 5m 47s (remain 13m 42s) Loss: 0.4819(0.4756) Grad: 0.4420 LR: 0.000978  
Epoch: [2][600/1686] Elapsed 6m 57s (remain 12m 32s) Loss: 0.4826(0.4758) Grad: 0.3586 LR: 0.000978  
Epoch: [2][700/1686] Elapsed 8m 6s (remain 11m 23s) Loss: 0.4797(0.4756) Grad: 0.2953 LR: 0.000978  
Epoch: [2][800/1686] Elapsed 9m 15s (remain 10m 13s) Loss: 0.4684(0.4752) Grad: 0.2696 LR: 0.000978  
Epoch: [2][900/1686

Epoch 2 - avg_train_loss: 0.4730  avg_val_loss: 0.4728  time: 1204s
Epoch 2 - Accuracy: 0.7928502808651448
Epoch 2 - Save Best Score: 0.7929 Model


EVAL: [187/188] Elapsed 0m 35s (remain 0m 0s) Loss: 0.2596(0.4728) 
Epoch: [3][0/1686] Elapsed 0m 1s (remain 41m 46s) Loss: 0.4517(0.4517) Grad: 0.3365 LR: 0.000914  
Epoch: [3][100/1686] Elapsed 1m 10s (remain 18m 30s) Loss: 0.4573(0.4628) Grad: 0.2359 LR: 0.000914  
Epoch: [3][200/1686] Elapsed 2m 20s (remain 17m 14s) Loss: 0.4601(0.4627) Grad: 0.4136 LR: 0.000914  
Epoch: [3][300/1686] Elapsed 3m 29s (remain 16m 2s) Loss: 0.4610(0.4634) Grad: 0.2994 LR: 0.000914  
Epoch: [3][400/1686] Elapsed 4m 38s (remain 14m 52s) Loss: 0.4626(0.4639) Grad: 0.2723 LR: 0.000914  
Epoch: [3][500/1686] Elapsed 5m 47s (remain 13m 42s) Loss: 0.4766(0.4642) Grad: 0.3755 LR: 0.000914  
Epoch: [3][600/1686] Elapsed 6m 56s (remain 12m 32s) Loss: 0.4673(0.4641) Grad: 0.2952 LR: 0.000914  
Epoch: [3][700/1686] Elapsed 8m 6s (remain 11m 23s) Loss: 0.4495(0.4638) Grad: 0.1874 LR: 0.000914  
Epoch: [3][800/1686] Elapsed 9m 15s (remain 10m 13s) Loss: 0.4477(0.4638) Grad: 0.2413 LR: 0.000914  
Epoch: [3][900/1686

Epoch 3 - avg_train_loss: 0.4634  avg_val_loss: 0.4679  time: 1204s
Epoch 3 - Accuracy: 0.795915780484876
Epoch 3 - Save Best Score: 0.7959 Model


EVAL: [187/188] Elapsed 0m 35s (remain 0m 0s) Loss: 0.2573(0.4679) 
Epoch: [4][0/1686] Elapsed 0m 1s (remain 42m 19s) Loss: 0.4379(0.4379) Grad: 0.2698 LR: 0.000815  
Epoch: [4][100/1686] Elapsed 1m 10s (remain 18m 30s) Loss: 0.4549(0.4550) Grad: 0.2785 LR: 0.000815  
Epoch: [4][200/1686] Elapsed 2m 19s (remain 17m 13s) Loss: 0.4438(0.4552) Grad: 0.2142 LR: 0.000815  
Epoch: [4][300/1686] Elapsed 3m 29s (remain 16m 2s) Loss: 0.4664(0.4563) Grad: 0.2151 LR: 0.000815  
Epoch: [4][400/1686] Elapsed 4m 38s (remain 14m 51s) Loss: 0.4691(0.4569) Grad: 0.3396 LR: 0.000815  
Epoch: [4][500/1686] Elapsed 5m 47s (remain 13m 42s) Loss: 0.4562(0.4575) Grad: 0.2340 LR: 0.000815  
Epoch: [4][600/1686] Elapsed 6m 56s (remain 12m 32s) Loss: 0.4499(0.4574) Grad: 0.1861 LR: 0.000815  
Epoch: [4][700/1686] Elapsed 8m 6s (remain 11m 22s) Loss: 0.4579(0.4573) Grad: 0.2818 LR: 0.000815  
Epoch: [4][800/1686] Elapsed 9m 15s (remain 10m 13s) Loss: 0.4620(0.4572) Grad: 0.2194 LR: 0.000815  
Epoch: [4][900/1686

Epoch 4 - avg_train_loss: 0.4570  avg_val_loss: 0.4611  time: 1204s
Epoch 4 - Accuracy: 0.7988995556860181
Epoch 4 - Save Best Score: 0.7989 Model


EVAL: [187/188] Elapsed 0m 35s (remain 0m 0s) Loss: 0.2510(0.4611) 
Epoch: [5][0/1686] Elapsed 0m 1s (remain 43m 4s) Loss: 0.4335(0.4335) Grad: 0.1960 LR: 0.000689  
Epoch: [5][100/1686] Elapsed 1m 10s (remain 18m 29s) Loss: 0.4623(0.4524) Grad: 0.1945 LR: 0.000689  
Epoch: [5][200/1686] Elapsed 2m 19s (remain 17m 13s) Loss: 0.4648(0.4515) Grad: 0.2367 LR: 0.000689  
Epoch: [5][300/1686] Elapsed 3m 29s (remain 16m 2s) Loss: 0.4581(0.4511) Grad: 0.1925 LR: 0.000689  
Epoch: [5][400/1686] Elapsed 4m 38s (remain 14m 52s) Loss: 0.4608(0.4514) Grad: 0.2115 LR: 0.000689  
Epoch: [5][500/1686] Elapsed 5m 47s (remain 13m 42s) Loss: 0.4570(0.4514) Grad: 0.2772 LR: 0.000689  
Epoch: [5][600/1686] Elapsed 6m 56s (remain 12m 32s) Loss: 0.4690(0.4516) Grad: 0.2106 LR: 0.000689  
Epoch: [5][700/1686] Elapsed 8m 5s (remain 11m 22s) Loss: 0.4446(0.4516) Grad: 0.1742 LR: 0.000689  
Epoch: [5][800/1686] Elapsed 9m 15s (remain 10m 13s) Loss: 0.4520(0.4517) Grad: 0.1884 LR: 0.000689  
Epoch: [5][900/1686]

Epoch 5 - avg_train_loss: 0.4517  avg_val_loss: 0.4586  time: 1204s
Epoch 5 - Accuracy: 0.8000753866065353
Epoch 5 - Save Best Score: 0.8001 Model


EVAL: [187/188] Elapsed 0m 35s (remain 0m 0s) Loss: 0.2490(0.4586) 
Epoch: [6][0/1686] Elapsed 0m 1s (remain 45m 2s) Loss: 0.4517(0.4517) Grad: 0.1634 LR: 0.000550  
Epoch: [6][100/1686] Elapsed 1m 11s (remain 18m 35s) Loss: 0.4374(0.4457) Grad: 0.2440 LR: 0.000550  
Epoch: [6][200/1686] Elapsed 2m 20s (remain 17m 16s) Loss: 0.4455(0.4466) Grad: 0.1852 LR: 0.000550  
Epoch: [6][300/1686] Elapsed 3m 29s (remain 16m 3s) Loss: 0.4531(0.4464) Grad: 0.2040 LR: 0.000550  
Epoch: [6][400/1686] Elapsed 4m 38s (remain 14m 53s) Loss: 0.4397(0.4464) Grad: 0.1840 LR: 0.000550  
Epoch: [6][500/1686] Elapsed 5m 47s (remain 13m 42s) Loss: 0.4517(0.4466) Grad: 0.3054 LR: 0.000550  
Epoch: [6][600/1686] Elapsed 6m 57s (remain 12m 33s) Loss: 0.4518(0.4465) Grad: 0.1516 LR: 0.000550  
Epoch: [6][700/1686] Elapsed 8m 6s (remain 11m 23s) Loss: 0.4304(0.4468) Grad: 0.2008 LR: 0.000550  
Epoch: [6][800/1686] Elapsed 9m 15s (remain 10m 13s) Loss: 0.4375(0.4467) Grad: 0.2033 LR: 0.000550  
Epoch: [6][900/1686]

Epoch 6 - avg_train_loss: 0.4467  avg_val_loss: 0.4563  time: 1204s
Epoch 6 - Accuracy: 0.8015347512241985
Epoch 6 - Save Best Score: 0.8015 Model


EVAL: [187/188] Elapsed 0m 35s (remain 0m 0s) Loss: 0.2506(0.4563) 
Epoch: [7][0/1686] Elapsed 0m 1s (remain 43m 12s) Loss: 0.4512(0.4512) Grad: 0.1726 LR: 0.000411  
Epoch: [7][100/1686] Elapsed 1m 10s (remain 18m 31s) Loss: 0.4490(0.4400) Grad: 0.2539 LR: 0.000411  
Epoch: [7][200/1686] Elapsed 2m 20s (remain 17m 14s) Loss: 0.4458(0.4398) Grad: 0.2155 LR: 0.000411  
Epoch: [7][300/1686] Elapsed 3m 29s (remain 16m 2s) Loss: 0.4618(0.4402) Grad: 0.2120 LR: 0.000411  
Epoch: [7][400/1686] Elapsed 4m 38s (remain 14m 52s) Loss: 0.4419(0.4404) Grad: 0.2054 LR: 0.000411  
Epoch: [7][500/1686] Elapsed 5m 47s (remain 13m 42s) Loss: 0.4211(0.4408) Grad: 0.1867 LR: 0.000411  
Epoch: [7][600/1686] Elapsed 6m 57s (remain 12m 32s) Loss: 0.4585(0.4409) Grad: 0.1892 LR: 0.000411  
Epoch: [7][700/1686] Elapsed 8m 6s (remain 11m 23s) Loss: 0.4526(0.4412) Grad: 0.2767 LR: 0.000411  
Epoch: [7][800/1686] Elapsed 9m 15s (remain 10m 13s) Loss: 0.4386(0.4413) Grad: 0.2275 LR: 0.000411  
Epoch: [7][900/1686

Epoch 7 - avg_train_loss: 0.4421  avg_val_loss: 0.4560  time: 1204s
Epoch 7 - Accuracy: 0.8013679666964655


EVAL: [187/188] Elapsed 0m 35s (remain 0m 0s) Loss: 0.2451(0.4560) 
Epoch: [8][0/1686] Elapsed 0m 1s (remain 45m 42s) Loss: 0.4270(0.4270) Grad: 0.1887 LR: 0.000285  
Epoch: [8][100/1686] Elapsed 1m 10s (remain 18m 32s) Loss: 0.4578(0.4358) Grad: 0.2064 LR: 0.000285  
Epoch: [8][200/1686] Elapsed 2m 20s (remain 17m 16s) Loss: 0.4414(0.4366) Grad: 0.2362 LR: 0.000285  
Epoch: [8][300/1686] Elapsed 3m 29s (remain 16m 3s) Loss: 0.4395(0.4368) Grad: 0.1692 LR: 0.000285  
Epoch: [8][400/1686] Elapsed 4m 38s (remain 14m 53s) Loss: 0.4278(0.4370) Grad: 0.1966 LR: 0.000285  
Epoch: [8][500/1686] Elapsed 5m 47s (remain 13m 42s) Loss: 0.4129(0.4368) Grad: 0.1906 LR: 0.000285  
Epoch: [8][600/1686] Elapsed 6m 57s (remain 12m 33s) Loss: 0.4335(0.4368) Grad: 0.1839 LR: 0.000285  
Epoch: [8][700/1686] Elapsed 8m 6s (remain 11m 23s) Loss: 0.4438(0.4369) Grad: 0.1699 LR: 0.000285  
Epoch: [8][800/1686] Elapsed 9m 15s (remain 10m 14s) Loss: 0.4295(0.4372) Grad: 0.1956 LR: 0.000285  
Epoch: [8][900/1686

Epoch 8 - avg_train_loss: 0.4377  avg_val_loss: 0.4546  time: 1205s
Epoch 8 - Accuracy: 0.8020968150826584
Epoch 8 - Save Best Score: 0.8021 Model


EVAL: [187/188] Elapsed 0m 35s (remain 0m 0s) Loss: 0.2481(0.4546) 
Epoch: [9][0/1686] Elapsed 0m 1s (remain 42m 16s) Loss: 0.4471(0.4471) Grad: 0.1772 LR: 0.000186  
Epoch: [9][100/1686] Elapsed 1m 10s (remain 18m 30s) Loss: 0.4300(0.4313) Grad: 0.2155 LR: 0.000186  
Epoch: [9][200/1686] Elapsed 2m 19s (remain 17m 14s) Loss: 0.4456(0.4323) Grad: 0.1874 LR: 0.000186  
Epoch: [9][300/1686] Elapsed 3m 29s (remain 16m 2s) Loss: 0.4400(0.4328) Grad: 0.2173 LR: 0.000186  
Epoch: [9][400/1686] Elapsed 4m 38s (remain 14m 52s) Loss: 0.4207(0.4330) Grad: 0.1741 LR: 0.000186  
Epoch: [9][500/1686] Elapsed 5m 47s (remain 13m 42s) Loss: 0.4122(0.4328) Grad: 0.1992 LR: 0.000186  
Epoch: [9][600/1686] Elapsed 6m 56s (remain 12m 32s) Loss: 0.4425(0.4331) Grad: 0.2698 LR: 0.000186  
Epoch: [9][700/1686] Elapsed 8m 6s (remain 11m 23s) Loss: 0.4285(0.4334) Grad: 0.2544 LR: 0.000186  
Epoch: [9][800/1686] Elapsed 9m 15s (remain 10m 13s) Loss: 0.4400(0.4334) Grad: 0.1866 LR: 0.000186  
Epoch: [9][900/1686

Epoch 9 - avg_train_loss: 0.4340  avg_val_loss: 0.4546  time: 1204s
Epoch 9 - Accuracy: 0.8023586667911992
Epoch 9 - Save Best Score: 0.8024 Model


EVAL: [187/188] Elapsed 0m 35s (remain 0m 0s) Loss: 0.2438(0.4546) 
Epoch: [10][0/1686] Elapsed 0m 1s (remain 42m 30s) Loss: 0.4202(0.4202) Grad: 0.1925 LR: 0.000122  
Epoch: [10][100/1686] Elapsed 1m 10s (remain 18m 30s) Loss: 0.4272(0.4294) Grad: 0.1878 LR: 0.000122  
Epoch: [10][200/1686] Elapsed 2m 19s (remain 17m 14s) Loss: 0.4237(0.4311) Grad: 0.2359 LR: 0.000122  
Epoch: [10][300/1686] Elapsed 3m 29s (remain 16m 3s) Loss: 0.4286(0.4313) Grad: 0.2118 LR: 0.000122  
Epoch: [10][400/1686] Elapsed 4m 38s (remain 14m 52s) Loss: 0.4317(0.4307) Grad: 0.1689 LR: 0.000122  
Epoch: [10][500/1686] Elapsed 5m 47s (remain 13m 42s) Loss: 0.4250(0.4311) Grad: 0.2153 LR: 0.000122  
Epoch: [10][600/1686] Elapsed 6m 56s (remain 12m 32s) Loss: 0.4275(0.4312) Grad: 0.1726 LR: 0.000122  
Epoch: [10][700/1686] Elapsed 8m 6s (remain 11m 23s) Loss: 0.4408(0.4312) Grad: 0.2153 LR: 0.000122  
Epoch: [10][800/1686] Elapsed 9m 15s (remain 10m 13s) Loss: 0.4472(0.4312) Grad: 0.2043 LR: 0.000122  
Epoch: [10

Epoch 10 - avg_train_loss: 0.4312  avg_val_loss: 0.4544  time: 1204s
Epoch 10 - Accuracy: 0.8022252391690128
Epoch 10 - Save final model


EVAL: [187/188] Elapsed 0m 35s (remain 0m 0s) Loss: 0.2464(0.4544) 


Score: 0.80236
