# About this notebook ...

## Library

In [1]:
import glob
import json
import math
import os
import random
import time
import warnings
from contextlib import contextmanager

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedKFold
from torch.optim import SGD, Adam
from torch.optim.lr_scheduler import CosineAnnealingLR, CosineAnnealingWarmRestarts, ReduceLROnPlateau
from torch.utils.data import DataLoader, Dataset
from tqdm.notebook import tqdm

In [2]:
warnings.filterwarnings("ignore")

## Load Data

In [3]:
BASE_DIR = "../input/hungrygeeseepisode/hungry-geese-episode/"
OUTPUT_DIR = "pre-models/"

if not os.path.exists(OUTPUT_DIR):
    os.makedirs(OUTPUT_DIR)

In [4]:
paths = [path for path in glob.glob(BASE_DIR + "*.json") if "info" not in path]
print(len(paths))

6011


## Config

In [5]:
class Config:
    seed = 440

    n_class = 4
    n_fold = 10

    gradient_accumulation_steps = 1
    max_grad_norm = 1000

    num_workers = 4
    batch_size = 3200

    scheduler = "CosineAnnealingWarmRestarts"
    # factor = 0.2  # ReduceLROnPlateau
    # patience = 4  # ReduceLROnPlateau
    # eps = 1e-6  # ReduceLROnPlateau
    # T_max = 10  # CosineAnnealingLR
    T_0 = 10  # CosineAnnealingWarmRestarts

    criterion = "CrossEntropyLoss"
    lr = 1e-3
    min_lr = 1e-4
    weight_decay = 0

    epochs = 10
    model_name = "geese_net"

    print_freq = 100

    train = True
    debug = False
    apex = False

In [6]:
if Config.debug:
    Config.epochs = 1

In [7]:
if Config.apex:
    from apex import amp

In [8]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

## Utils

In [9]:
@contextmanager
def timer(name):
    t0 = time.time()
    LOGGER.info(f"[{name}] start")
    yield
    LOGGER.info(f"[{name}] done in {time.time() - t0:.0f} s.")


def init_logger(log_file=OUTPUT_DIR + "train.log"):
    from logging import INFO, FileHandler, Formatter, StreamHandler, getLogger

    logger = getLogger(__name__)
    logger.setLevel(INFO)
    handler1 = StreamHandler()
    handler1.setFormatter(Formatter("%(message)s"))
    handler2 = FileHandler(filename=log_file)
    handler2.setFormatter(Formatter("%(message)s"))
    logger.addHandler(handler1)
    logger.addHandler(handler2)
    return logger


LOGGER = init_logger()


def seed_torch(seed=42):
    random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True


seed_torch(seed=Config.seed)

In [10]:
def reverse_ns(y):
    if y == 0:
        return 1
    if y == 1:
        return 0
    return y


def reverse_we(y):
    if y == 2:
        return 3
    if y == 3:
        return 2
    return y


def reverse_nswe(y):
    return reverse_ns(reverse_we(y))

## Observation

In [11]:
def make_input(obses):
    b = np.zeros((17, 7 * 11), dtype=np.float32)
    obs = obses[-1]

    for p, pos_list in enumerate(obs["geese"]):
        pid = (p - obs["index"]) % 4

        # head position
        for pos in pos_list[:1]:
            b[0 + pid, pos] = 1
        # tip position
        for pos in pos_list[-1:]:
            b[4 + pid, pos] = 1
        # whole position
        for pos in pos_list:
            b[8 + pid, pos] = 1

    # previous head position
    if len(obses) > 1:
        obs_prev = obses[-2]
        for p, pos_list in enumerate(obs_prev["geese"]):
            for pos in pos_list[:1]:
                b[12 + (p - obs["index"]) % 4, pos] = 1

    # food
    for pos in obs["food"]:
        b[16, pos] = 1

    return b.reshape(-1, 7, 11)

In [12]:
def observation_num_step(obses):
    b = np.zeros((7, 11), dtype=np.float32)
    obs = obses[-1]

    num_step = obs["step"]  # 0-198
    b[0, 0] = num_step / 198

    return b.reshape(1, 7, 11)

## Data

In [13]:
def create_dataset_from_json(filepath, json_object=None, standing=0):
    if json_object is None:
        json_open = open(path, "r")
        json_load = json.load(json_open)
    else:
        json_load = json_object

    try:
        winner_index = np.argmax(np.argsort(json_load["rewards"]) == 3 - standing)

        obses = []
        X = []
        y = []
        actions = {"NORTH": 0, "SOUTH": 1, "WEST": 2, "EAST": 3}

        for i in range(len(json_load["steps"]) - 1):
            if json_load["steps"][i][winner_index]["status"] == "ACTIVE":
                y_ = json_load["steps"][i + 1][winner_index]["action"]
                if y_ is not None:
                    step = json_load["steps"][i]
                    step[winner_index]["observation"]["geese"] = step[0]["observation"]["geese"]
                    step[winner_index]["observation"]["food"] = step[0]["observation"]["food"]
                    step[winner_index]["observation"]["step"] = step[0]["observation"]["step"]
                    obses.append(step[winner_index]["observation"])
                    y.append(actions[y_])

                    y.append(reverse_ns(actions[y_]))  # 上下反転
                    y.append(reverse_we(actions[y_]))  # 左右反転
                    y.append(reverse_nswe(actions[y_]))  # 上下左右反転

        for j in range(len(obses)):
            # X_ = make_input(obses[: j + 1])

            X_ = []
            X_.append(make_input(obses[: j + 1]))
            # X_.append(observation_num_step(obses[: j + 1]))
            X_ = np.concatenate(X_)

            X.append(X_)

            X.append(X_[:, ::-1, :])  # 上下反転
            X.append(X_[:, :, ::-1])  # 左右反転
            X.append(X_[:, ::-1, ::-1])  # 上下左右反転

        X = np.array(X, dtype=np.float32)  # [starting_step:]
        y = np.array(y, dtype=np.uint8)  # [starting_step:]

        return X, y
    except:
        return 0, 0

In [14]:
X_train = []
y_train = []

for path in tqdm(paths[: int(len(paths))]):
    X, y = create_dataset_from_json(path, standing=0)  # use only winners' moves
    if X is not 0:
        X_train.append(X)
        y_train.append(y)

X_train = np.concatenate(X_train)
y_train = np.concatenate(y_train)

print(f"Num episode: {len(X_train)}")

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=6011.0), HTML(value='')))


Num episode: 3800040


In [15]:
# TODO: データをuniqueにしたいがメモリエラーになってしまう。

# X_train, unique_index = np.unique(X_train, axis=0, return_index=True)  # remove duplicate
# y_train = y_train[unique_index]

# y_train = np.eye(4, dtype="uint8")[y_train]  # to categorical

# print(f"Num episode: {len(X_train)}")

In [16]:
# まとめてuniqueかけるとメモリ量が足りないので、いくつかのグループに分けてuniqueにする

X_train_sum_obs = X_train.reshape(X_train.shape[0], -1).sum(1)
X_train_group = np.unique(X_train_sum_obs)
X_train_group.shape

(75,)

In [17]:
X_train_unique = []
y_train_unique = []
for group in tqdm(X_train_group):
    group_index = np.where(X_train_sum_obs == group)

    X_train_ = X_train[group_index]
    y_train_ = y_train[group_index]

    X_train_, unique_index = np.unique(X_train_, axis=0, return_index=True)  # remove duplicate
    y_train_ = y_train_[unique_index]

    X_train_unique.append(X_train_)
    y_train_unique.append(y_train_)

X_train = np.concatenate(X_train_unique)
y_train = np.concatenate(y_train_unique)

print(f"Num episode: {len(X_train)}")

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=75.0), HTML(value='')))


Num episode: 3799476


In [18]:
if Config.debug:
    X_train = X_train[:1000]
    y_train = y_train[:1000]

In [19]:
y_df = pd.DataFrame(y_train)
y_df.columns = ["action"]
y_df

Unnamed: 0,action
0,2
1,0
2,2
3,3
4,2
...,...
3799471,1
3799472,2
3799473,3
3799474,2


## CV Split

In [20]:
folds = y_df.copy()
Fold = StratifiedKFold(n_splits=Config.n_fold, shuffle=True, random_state=Config.seed)
for n, (train_index, val_index) in enumerate(Fold.split(folds, folds["action"])):
    folds.loc[val_index, "fold"] = int(n)
folds["fold"] = folds["fold"].astype(int)
print(folds.groupby(["fold", "action"]).size())

fold  action
0     0         90530
      1         90531
      2         99444
      3         99443
1     0         90530
      1         90531
      2         99444
      3         99443
2     0         90531
      1         90531
      2         99443
      3         99443
3     0         90531
      1         90531
      2         99443
      3         99443
4     0         90531
      1         90531
      2         99443
      3         99443
5     0         90531
      1         90531
      2         99443
      3         99443
6     0         90531
      1         90530
      2         99443
      3         99443
7     0         90531
      1         90530
      2         99443
      3         99443
8     0         90530
      1         90530
      2         99443
      3         99444
9     0         90530
      1         90530
      2         99443
      3         99444
dtype: int64


## Dataset

In [21]:
class TrainDataset(Dataset):
    def __init__(self, array, label):
        self.array = array
        self.label = label

    def __len__(self):
        return self.array.shape[0]

    def __getitem__(self, idx):
        return self.array[idx], torch.tensor(self.label[idx]).long()


class TestDataset(Dataset):
    def __init__(self, array):
        self.array = array

    def __len__(self):
        return self.array.shape[0]

    def __getitem__(self, idx):
        return self.array[idx]

In [22]:
# Test

train_ds = TrainDataset(X_train, y_train)

for i in range(1):
    obs, action = train_ds[i]
    print(obs.shape, action)

(17, 7, 11) tensor(2)


## Model

In [23]:
class TorusConv2d(nn.Module):
    def __init__(self, input_dim, output_dim, kernel_size, bn):
        super().__init__()
        self.edge_size = (kernel_size[0] // 2, kernel_size[1] // 2)
        self.conv = nn.Conv2d(input_dim, output_dim, kernel_size=kernel_size)
        self.bn = nn.BatchNorm2d(output_dim) if bn else None

    def forward(self, x):
        h = torch.cat([x[:, :, :, -self.edge_size[1] :], x, x[:, :, :, : self.edge_size[1]]], dim=3)
        h = torch.cat([h[:, :, -self.edge_size[0] :], h, h[:, :, : self.edge_size[0]]], dim=2)
        h = self.conv(h)
        h = self.bn(h) if self.bn is not None else h
        return h

In [24]:
class GeeseNet(nn.Module):
    def __init__(self):
        super().__init__()
        layers, filters = 12, 32
        self.conv0 = TorusConv2d(17, filters, (3, 3), True)
        self.blocks = nn.ModuleList([TorusConv2d(filters, filters, (3, 3), True) for _ in range(layers)])

        self.conv_p = TorusConv2d(filters, filters, (3, 3), True)
        self.conv_v = TorusConv2d(filters, filters, (3, 3), True)

        self.head_p = nn.Linear(filters, 4, bias=False)
        self.head_v1 = nn.Linear(filters * 2, filters, bias=False)
        self.head_v2 = nn.Linear(filters, 1, bias=False)

    def forward(self, x, _=None):
        h = F.relu_(self.conv0(x))
        for block in self.blocks:
            h = F.relu_(h + block(h))

        h_p = F.relu_(self.conv_p(h))
        h_head_p = (h_p * x[:, :1]).view(h_p.size(0), h_p.size(1), -1).sum(-1)
        p = self.head_p(h_head_p)

        h_v = F.relu_(self.conv_v(h))
        h_head_v = (h_v * x[:, :1]).view(h_v.size(0), h_v.size(1), -1).sum(-1)
        h_avg_v = h_v.view(h_v.size(0), h_v.size(1), -1).mean(-1)

        h_v = F.relu_(self.head_v1(torch.cat([h_head_v, h_avg_v], 1)))
        v = torch.tanh(self.head_v2(h_v))

        return {"policy": p, "value": v}

In [40]:
class GeeseNetAlpha(nn.Module):
    def __init__(self):
        super().__init__()
        layers, filters = 12, 64
        self.conv0 = TorusConv2d(17, filters, (3, 3), True)
        self.blocks = nn.ModuleList([TorusConv2d(filters, filters, (3, 3), True) for _ in range(layers)])

        self.conv_p = TorusConv2d(filters, filters, (3, 3), True)
        # self.conv_v = TorusConv2d(filters, filters, (3, 3), True)

        self.head_p1 = nn.Linear(filters * 7 + 77, filters * 4, bias=False)
        self.head_p2 = nn.Linear(filters * 4, 4, bias=False)
        # self.head_v1 = nn.Linear(filters * 2, filters, bias=False)
        # self.head_v2 = nn.Linear(filters, 1, bias=False)

    def forward(self, x, _=None):
        h = F.relu_(self.conv0(x))
        for block in self.blocks:
            h = F.relu_(h + block(h))

        h_p = F.relu_(self.conv_p(h))
        h_head_p = (h_p * x[:, :1]).view(h_p.size(0), h_p.size(1), -1).sum(-1)
        h_head_p2 = (h_p * x[:, 1:2]).view(h_p.size(0), h_p.size(1), -1).sum(-1)
        h_head_p3 = (h_p * x[:, 2:3]).view(h_p.size(0), h_p.size(1), -1).sum(-1)
        h_head_p4 = (h_p * x[:, 3:4]).view(h_p.size(0), h_p.size(1), -1).sum(-1)
        h_avg_p1 = h_p.view(h_p.size(0), h_p.size(1), -1).mean(-1)
        h_avg_p2 = h_p.view(h_p.size(0), h_p.size(1), -1).mean(1)
        
        food_index = torch.where(x[:, 16] == 1)
        h_food = h_p[food_index[0], :, food_index[1], food_index[2]].view(h_p.size(0), -1)

        h_p = F.relu_(self.head_p1(torch.cat([h_head_p, h_head_p2, h_head_p3, h_head_p4, h_food, h_avg_p1, h_avg_p2], 1)))
        p = self.head_p2(h_p)

        # h_v = F.relu_(self.conv_v(h))
        # h_head_v = (h_v * x[:, :1]).view(h_v.size(0), h_v.size(1), -1).sum(-1)
        # h_avg_v = h_v.view(h_v.size(0), h_v.size(1), -1).mean(-1)

        # h_v = F.relu_(self.head_v1(torch.cat([h_head_v, h_avg_v], 1)))
        # v = torch.tanh(self.head_v2(h_v))

        return {"policy": p}  # "value": v

In [41]:
# Test

model = GeeseNetAlpha()
# print(model)

params = sum(p.numel() for p in model.parameters())
print(f"params: {params:,}")

train_ds = TrainDataset(X_train, y_train)
train_loader = DataLoader(train_ds, batch_size=4, shuffle=True, num_workers=4, pin_memory=True, drop_last=True)

for obs, action in train_loader:
    output = model(obs)
    print(output)
    print(f"{torch.argmax(output['policy'], dim=1)}")
    break

params: 627,136
{'policy': tensor([[ 0.0276, -0.1905, -0.2060,  0.1315],
        [-0.2258,  0.0550, -0.0403, -0.0946],
        [ 0.0086, -0.2949,  0.1063,  0.0499],
        [ 0.2187,  0.0999,  0.0164, -0.3738]], grad_fn=<MmBackward>)}
tensor([3, 1, 2, 0])


## Loss

## Scoring

In [27]:
def get_score(y_true, y_pred):
    return accuracy_score(y_true, y_pred)

## Helper functions

In [28]:
class AverageMeter(object):
    """Computes and stores the average and current value"""

    def __init__(self):
        self.reset()

    def reset(self):
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count


def asMinutes(s):
    m = math.floor(s / 60)
    s -= m * 60
    return "%dm %ds" % (m, s)


def timeSince(since, percent):
    now = time.time()
    s = now - since
    es = s / (percent)
    rs = es - s
    return "%s (remain %s)" % (asMinutes(s), asMinutes(rs))

In [29]:
def train_fn(train_loader, model, criterion, optimizer, epoch, scheduler, device):
    batch_time = AverageMeter()
    data_time = AverageMeter()
    losses = AverageMeter()
    scores = AverageMeter()

    # switch to train mode
    model.train()
    start = end = time.time()
    global_step = 0

    for step, (obs, action) in enumerate(train_loader):
        # measure data loading time
        data_time.update(time.time() - end)
        obs = obs.to(device)
        action = action.to(device)
        batch_size = action.size(0)

        y_preds = model(obs.float())["policy"]

        loss = criterion(y_preds, action)

        # record loss
        losses.update(loss.item(), batch_size)
        if Config.gradient_accumulation_steps > 1:
            loss = loss / Config.gradient_accumulation_steps
        if Config.apex:
            with amp.scale_loss(loss, optimizer) as scaled_loss:
                scaled_loss.backward()
        else:
            loss.backward()

        grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), Config.max_grad_norm)

        if (step + 1) % Config.gradient_accumulation_steps == 0:
            optimizer.step()
            optimizer.zero_grad()
            global_step += 1

        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()

        if step % Config.print_freq == 0 or step == (len(train_loader) - 1):
            print(
                f"Epoch: [{epoch + 1}][{step}/{len(train_loader)}] "
                f"Elapsed {timeSince(start, float(step + 1) / len(train_loader)):s} "
                f"Loss: {losses.val:.4f}({losses.avg:.4f}) "
                f"Grad: {grad_norm:.4f} "
                f"LR: {scheduler.get_last_lr()[0]:.6f}  "
            )

    return losses.avg

In [30]:
def valid_fn(valid_loader, model, criterion, device):
    batch_time = AverageMeter()
    data_time = AverageMeter()
    losses = AverageMeter()
    scores = AverageMeter()

    # switch to evaluation mode
    model.eval()
    preds = []
    start = end = time.time()

    for step, (obs, action) in enumerate(valid_loader):
        # measure data loading time
        data_time.update(time.time() - end)
        obs = obs.to(device)
        action = action.to(device)
        batch_size = action.size(0)

        # compute loss
        with torch.no_grad():
            y_preds = model(obs)["policy"]

        loss = criterion(y_preds, action)
        losses.update(loss.item(), batch_size)

        # record accuracy
        preds.append(y_preds.softmax(1).to("cpu").numpy())
        if Config.gradient_accumulation_steps > 1:
            loss = loss / Config.gradient_accumulation_steps

        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()

        if step % Config.print_freq == 0 or step == (len(valid_loader) - 1):
            print(
                f"EVAL: [{step}/{len(valid_loader)}] "
                f"Elapsed {timeSince(start, float(step + 1) / len(valid_loader)):s} "
                f"Loss: {losses.val:.4f}({losses.avg:.4f}) "
            )
    predictions = np.concatenate(preds)
    return losses.avg, predictions

## Train loop

In [31]:
def train_loop(folds, fold):

    LOGGER.info(f"========== fold: {fold} training ==========")

    # ====================================================
    # Data Loader
    # ====================================================
    X_train_folds = X_train[folds["fold"] != fold]
    X_valid_folds = X_train[folds["fold"] == fold]

    y_train_folds = y_train[folds["fold"] != fold]
    y_valid_folds = y_train[folds["fold"] == fold]

    y_df_train_folds = y_df[folds["fold"] != fold]
    y_df_valid_folds = y_df[folds["fold"] == fold]

    train_dataset = TrainDataset(X_train_folds, y_train_folds)
    valid_dataset = TrainDataset(X_valid_folds, y_valid_folds)

    train_loader = DataLoader(
        train_dataset,
        batch_size=Config.batch_size,
        shuffle=True,
        num_workers=Config.num_workers,
        pin_memory=True,
        drop_last=True,
    )
    valid_loader = DataLoader(
        valid_dataset,
        batch_size=Config.batch_size,
        shuffle=False,
        num_workers=Config.num_workers,
        pin_memory=True,
        drop_last=False,
    )

    # ====================================================
    # Scheduler
    # ====================================================
    def get_scheduler(optimizer):
        if Config.scheduler == "ReduceLROnPlateau":
            scheduler = ReduceLROnPlateau(
                optimizer, mode="min", factor=Config.factor, patience=Config.patience, verbose=True, eps=Config.eps
            )
        elif Config.scheduler == "CosineAnnealingLR":
            scheduler = CosineAnnealingLR(optimizer, T_max=Config.T_max, eta_min=Config.min_lr, last_epoch=-1)
        elif Config.scheduler == "CosineAnnealingWarmRestarts":
            scheduler = CosineAnnealingWarmRestarts(
                optimizer, T_0=Config.T_0, T_mult=1, eta_min=Config.min_lr, last_epoch=-1
            )
        return scheduler

    # ====================================================
    # model & optimizer
    # ====================================================
    model = GeeseNetAlpha()
    model.to(device)

    # Use multi GPU
    if device == torch.device("cuda") and not Config.apex:
        model = torch.nn.DataParallel(model)  # make parallel
        # torch.backends.cudnn.benchmark=True

    optimizer = Adam(model.parameters(), lr=Config.lr, weight_decay=Config.weight_decay, amsgrad=False)
    scheduler = get_scheduler(optimizer)

    # ====================================================
    # apex
    # ====================================================
    if Config.apex:
        model, optimizer = amp.initialize(model, optimizer, opt_level="O1", verbosity=0)

    # ====================================================
    # Criterion
    # ====================================================
    def get_criterion():
        if Config.criterion == "CrossEntropyLoss":
            criterion = nn.CrossEntropyLoss()
        return criterion

    criterion = get_criterion()

    # ====================================================
    # loop
    # ====================================================
    best_score = 0.0
    best_loss = np.inf
    best_preds = None

    for epoch in range(Config.epochs):

        start_time = time.time()

        # train
        avg_loss = train_fn(train_loader, model, criterion, optimizer, epoch, scheduler, device)

        # eval
        avg_val_loss, preds = valid_fn(valid_loader, model, criterion, device)

        if isinstance(scheduler, ReduceLROnPlateau):
            scheduler.step(avg_val_loss)
        elif isinstance(scheduler, CosineAnnealingLR):
            scheduler.step()
        elif isinstance(scheduler, CosineAnnealingWarmRestarts):
            scheduler.step()

        # scoring
        score = get_score(y_valid_folds, preds.argmax(1))

        elapsed = time.time() - start_time

        LOGGER.info(
            f"Epoch {epoch+1} - avg_train_loss: {avg_loss:.4f}  avg_val_loss: {avg_val_loss:.4f}  time: {elapsed:.0f}s"
        )
        LOGGER.info(f"Epoch {epoch+1} - Accuracy: {score}")

        if score > best_score:
            best_score = score
            LOGGER.info(f"Epoch {epoch+1} - Save Best Score: {best_score:.4f} Model")
            torch.save(model.module.state_dict(), OUTPUT_DIR + f"{Config.model_name}_fold{fold}_best.pth")
            best_preds = preds

        if epoch == Config.epochs - 1:
            LOGGER.info(f"Epoch {epoch+1} - Save final model")
            torch.save(model.module.state_dict(), OUTPUT_DIR + f"{Config.model_name}_fold{fold}_final.pth")

    y_df_valid_folds[[str(c) for c in range(Config.n_class)]] = best_preds
    y_df_valid_folds["preds"] = best_preds.argmax(1)

    return y_df_valid_folds

## Main


In [32]:
def main():
    def get_result(result_df):
        preds = result_df["preds"].values
        labels = result_df["action"].values
        score = get_score(labels, preds)
        LOGGER.info(f"Score: {score:<.5f}")

    if Config.train:
        # train
        oof_df = pd.DataFrame()
        for fold in range(Config.n_fold):
            _oof_df = train_loop(folds, fold)
            oof_df = pd.concat([oof_df, _oof_df])
            LOGGER.info(f"========== fold: {fold} result ==========")
            get_result(_oof_df)
            break  # fold 1つだけ実行する
        # CV result
        # LOGGER.info(f"========== CV ==========")
        # get_result(oof_df)
        # save result
        oof_df.to_csv(OUTPUT_DIR + "oof_df.csv", index=False)

In [33]:
if __name__ == "__main__":
    main()



Epoch: [1][0/1068] Elapsed 0m 3s (remain 62m 42s) Loss: 1.3914(1.3914) Grad: 0.3771 LR: 0.001000  
Epoch: [1][100/1068] Elapsed 1m 12s (remain 11m 35s) Loss: 0.5934(0.6831) Grad: 0.8712 LR: 0.001000  
Epoch: [1][200/1068] Elapsed 2m 21s (remain 10m 11s) Loss: 0.5736(0.6275) Grad: 0.4336 LR: 0.001000  
Epoch: [1][300/1068] Elapsed 3m 31s (remain 8m 57s) Loss: 0.5413(0.5987) Grad: 0.6496 LR: 0.001000  
Epoch: [1][400/1068] Elapsed 4m 40s (remain 7m 46s) Loss: 0.5014(0.5803) Grad: 0.4925 LR: 0.001000  
Epoch: [1][500/1068] Elapsed 5m 49s (remain 6m 35s) Loss: 0.5154(0.5675) Grad: 0.6113 LR: 0.001000  
Epoch: [1][600/1068] Elapsed 6m 58s (remain 5m 25s) Loss: 0.5228(0.5578) Grad: 0.5445 LR: 0.001000  
Epoch: [1][700/1068] Elapsed 8m 7s (remain 4m 15s) Loss: 0.4782(0.5498) Grad: 0.5062 LR: 0.001000  
Epoch: [1][800/1068] Elapsed 9m 17s (remain 3m 5s) Loss: 0.4928(0.5434) Grad: 0.5547 LR: 0.001000  
Epoch: [1][900/1068] Elapsed 10m 26s (remain 1m 56s) Loss: 0.4866(0.5383) Grad: 0.3345 LR: 0.

Epoch 1 - avg_train_loss: 0.5310  avg_val_loss: 0.4970  time: 765s
Epoch 1 - Accuracy: 0.7801409666585953
Epoch 1 - Save Best Score: 0.7801 Model


EVAL: [118/119] Elapsed 0m 22s (remain 0m 0s) Loss: 0.3240(0.4970) 
Epoch: [2][0/1068] Elapsed 0m 1s (remain 26m 11s) Loss: 0.4962(0.4962) Grad: 0.7315 LR: 0.000978  
Epoch: [2][100/1068] Elapsed 1m 10s (remain 11m 17s) Loss: 0.4683(0.4859) Grad: 0.3915 LR: 0.000978  
Epoch: [2][200/1068] Elapsed 2m 20s (remain 10m 4s) Loss: 0.4859(0.4846) Grad: 0.3386 LR: 0.000978  
Epoch: [2][300/1068] Elapsed 3m 29s (remain 8m 53s) Loss: 0.4822(0.4837) Grad: 0.4190 LR: 0.000978  
Epoch: [2][400/1068] Elapsed 4m 38s (remain 7m 43s) Loss: 0.4895(0.4829) Grad: 0.4703 LR: 0.000978  
Epoch: [2][500/1068] Elapsed 5m 47s (remain 6m 33s) Loss: 0.5032(0.4821) Grad: 0.3068 LR: 0.000978  
Epoch: [2][600/1068] Elapsed 6m 57s (remain 5m 24s) Loss: 0.4838(0.4820) Grad: 0.4929 LR: 0.000978  
Epoch: [2][700/1068] Elapsed 8m 6s (remain 4m 14s) Loss: 0.4863(0.4817) Grad: 0.3699 LR: 0.000978  
Epoch: [2][800/1068] Elapsed 9m 15s (remain 3m 5s) Loss: 0.4746(0.4811) Grad: 0.3433 LR: 0.000978  
Epoch: [2][900/1068] Elaps

Epoch 2 - avg_train_loss: 0.4799  avg_val_loss: 0.4778  time: 764s
Epoch 2 - Accuracy: 0.7911240485540126
Epoch 2 - Save Best Score: 0.7911 Model


EVAL: [118/119] Elapsed 0m 22s (remain 0m 0s) Loss: 0.3147(0.4778) 
Epoch: [3][0/1068] Elapsed 0m 1s (remain 26m 52s) Loss: 0.4651(0.4651) Grad: 0.3268 LR: 0.000914  
Epoch: [3][100/1068] Elapsed 1m 10s (remain 11m 18s) Loss: 0.4754(0.4690) Grad: 0.2585 LR: 0.000914  
Epoch: [3][200/1068] Elapsed 2m 20s (remain 10m 4s) Loss: 0.4718(0.4682) Grad: 0.3254 LR: 0.000914  
Epoch: [3][300/1068] Elapsed 3m 29s (remain 8m 53s) Loss: 0.4521(0.4683) Grad: 0.2547 LR: 0.000914  
Epoch: [3][400/1068] Elapsed 4m 38s (remain 7m 43s) Loss: 0.4782(0.4685) Grad: 0.3929 LR: 0.000914  
Epoch: [3][500/1068] Elapsed 5m 47s (remain 6m 33s) Loss: 0.4700(0.4683) Grad: 0.2575 LR: 0.000914  
Epoch: [3][600/1068] Elapsed 6m 57s (remain 5m 24s) Loss: 0.4578(0.4678) Grad: 0.3057 LR: 0.000914  
Epoch: [3][700/1068] Elapsed 8m 6s (remain 4m 14s) Loss: 0.4715(0.4678) Grad: 0.3762 LR: 0.000914  
Epoch: [3][800/1068] Elapsed 9m 15s (remain 3m 5s) Loss: 0.4737(0.4682) Grad: 0.3401 LR: 0.000914  
Epoch: [3][900/1068] Elaps

Epoch 3 - avg_train_loss: 0.4680  avg_val_loss: 0.4720  time: 764s
Epoch 3 - Accuracy: 0.7938928484950573
Epoch 3 - Save Best Score: 0.7939 Model


EVAL: [118/119] Elapsed 0m 22s (remain 0m 0s) Loss: 0.3122(0.4720) 
Epoch: [4][0/1068] Elapsed 0m 1s (remain 25m 9s) Loss: 0.4612(0.4612) Grad: 0.2996 LR: 0.000815  
Epoch: [4][100/1068] Elapsed 1m 10s (remain 11m 17s) Loss: 0.4488(0.4600) Grad: 0.2168 LR: 0.000815  
Epoch: [4][200/1068] Elapsed 2m 19s (remain 10m 3s) Loss: 0.4576(0.4602) Grad: 0.3860 LR: 0.000815  
Epoch: [4][300/1068] Elapsed 3m 29s (remain 8m 53s) Loss: 0.4592(0.4598) Grad: 0.2715 LR: 0.000815  
Epoch: [4][400/1068] Elapsed 4m 38s (remain 7m 43s) Loss: 0.4429(0.4595) Grad: 0.2230 LR: 0.000815  
Epoch: [4][500/1068] Elapsed 5m 47s (remain 6m 33s) Loss: 0.4616(0.4599) Grad: 0.3173 LR: 0.000815  
Epoch: [4][600/1068] Elapsed 6m 56s (remain 5m 23s) Loss: 0.4613(0.4600) Grad: 0.2801 LR: 0.000815  
Epoch: [4][700/1068] Elapsed 8m 6s (remain 4m 14s) Loss: 0.4610(0.4602) Grad: 0.2744 LR: 0.000815  
Epoch: [4][800/1068] Elapsed 9m 15s (remain 3m 5s) Loss: 0.4547(0.4603) Grad: 0.2670 LR: 0.000815  
Epoch: [4][900/1068] Elapse

Epoch 4 - avg_train_loss: 0.4603  avg_val_loss: 0.4687  time: 763s
Epoch 4 - Accuracy: 0.7953746302125554
Epoch 4 - Save Best Score: 0.7954 Model


EVAL: [118/119] Elapsed 0m 22s (remain 0m 0s) Loss: 0.3079(0.4687) 
Epoch: [5][0/1068] Elapsed 0m 1s (remain 25m 11s) Loss: 0.4663(0.4663) Grad: 0.3168 LR: 0.000689  
Epoch: [5][100/1068] Elapsed 1m 10s (remain 11m 16s) Loss: 0.4649(0.4514) Grad: 0.2605 LR: 0.000689  
Epoch: [5][200/1068] Elapsed 2m 19s (remain 10m 3s) Loss: 0.4541(0.4519) Grad: 0.3138 LR: 0.000689  
Epoch: [5][300/1068] Elapsed 3m 29s (remain 8m 52s) Loss: 0.4343(0.4524) Grad: 0.1967 LR: 0.000689  
Epoch: [5][400/1068] Elapsed 4m 38s (remain 7m 42s) Loss: 0.4429(0.4529) Grad: 0.3257 LR: 0.000689  
Epoch: [5][500/1068] Elapsed 5m 47s (remain 6m 33s) Loss: 0.4699(0.4531) Grad: 0.2393 LR: 0.000689  
Epoch: [5][600/1068] Elapsed 6m 56s (remain 5m 23s) Loss: 0.4495(0.4535) Grad: 0.3692 LR: 0.000689  
Epoch: [5][700/1068] Elapsed 8m 6s (remain 4m 14s) Loss: 0.4551(0.4538) Grad: 0.2314 LR: 0.000689  
Epoch: [5][800/1068] Elapsed 9m 15s (remain 3m 5s) Loss: 0.4509(0.4540) Grad: 0.2196 LR: 0.000689  
Epoch: [5][900/1068] Elaps

Epoch 5 - avg_train_loss: 0.4541  avg_val_loss: 0.4644  time: 763s
Epoch 5 - Accuracy: 0.7966958636444987
Epoch 5 - Save Best Score: 0.7967 Model


EVAL: [118/119] Elapsed 0m 22s (remain 0m 0s) Loss: 0.3048(0.4644) 
Epoch: [6][0/1068] Elapsed 0m 1s (remain 25m 18s) Loss: 0.4527(0.4527) Grad: 0.2053 LR: 0.000550  
Epoch: [6][100/1068] Elapsed 1m 10s (remain 11m 17s) Loss: 0.4513(0.4465) Grad: 0.2864 LR: 0.000550  
Epoch: [6][200/1068] Elapsed 2m 19s (remain 10m 3s) Loss: 0.4625(0.4461) Grad: 0.2275 LR: 0.000550  
Epoch: [6][300/1068] Elapsed 3m 29s (remain 8m 52s) Loss: 0.4507(0.4473) Grad: 0.2387 LR: 0.000550  
Epoch: [6][400/1068] Elapsed 4m 38s (remain 7m 43s) Loss: 0.4580(0.4478) Grad: 0.2851 LR: 0.000550  
Epoch: [6][500/1068] Elapsed 5m 47s (remain 6m 33s) Loss: 0.4611(0.4480) Grad: 0.2439 LR: 0.000550  
Epoch: [6][600/1068] Elapsed 6m 56s (remain 5m 23s) Loss: 0.4376(0.4479) Grad: 0.2227 LR: 0.000550  
Epoch: [6][700/1068] Elapsed 8m 6s (remain 4m 14s) Loss: 0.4488(0.4479) Grad: 0.2973 LR: 0.000550  
Epoch: [6][800/1068] Elapsed 9m 15s (remain 3m 5s) Loss: 0.4397(0.4479) Grad: 0.2963 LR: 0.000550  
Epoch: [6][900/1068] Elaps

Epoch 6 - avg_train_loss: 0.4481  avg_val_loss: 0.4635  time: 763s
Epoch 6 - Accuracy: 0.7971801404402707
Epoch 6 - Save Best Score: 0.7972 Model


EVAL: [118/119] Elapsed 0m 22s (remain 0m 0s) Loss: 0.3042(0.4635) 
Epoch: [7][0/1068] Elapsed 0m 1s (remain 25m 15s) Loss: 0.4419(0.4419) Grad: 0.2756 LR: 0.000411  
Epoch: [7][100/1068] Elapsed 1m 10s (remain 11m 16s) Loss: 0.4288(0.4417) Grad: 0.2642 LR: 0.000411  
Epoch: [7][200/1068] Elapsed 2m 19s (remain 10m 3s) Loss: 0.4600(0.4421) Grad: 0.2396 LR: 0.000411  
Epoch: [7][300/1068] Elapsed 3m 29s (remain 8m 52s) Loss: 0.4515(0.4421) Grad: 0.2743 LR: 0.000411  
Epoch: [7][400/1068] Elapsed 4m 38s (remain 7m 43s) Loss: 0.4464(0.4418) Grad: 0.2052 LR: 0.000411  
Epoch: [7][500/1068] Elapsed 5m 47s (remain 6m 33s) Loss: 0.4665(0.4417) Grad: 0.3302 LR: 0.000411  
Epoch: [7][600/1068] Elapsed 6m 56s (remain 5m 23s) Loss: 0.4502(0.4418) Grad: 0.3276 LR: 0.000411  
Epoch: [7][700/1068] Elapsed 8m 5s (remain 4m 14s) Loss: 0.4330(0.4419) Grad: 0.2047 LR: 0.000411  
Epoch: [7][800/1068] Elapsed 9m 15s (remain 3m 5s) Loss: 0.4500(0.4420) Grad: 0.2630 LR: 0.000411  
Epoch: [7][900/1068] Elaps

Epoch 7 - avg_train_loss: 0.4423  avg_val_loss: 0.4621  time: 763s
Epoch 7 - Accuracy: 0.7984250476380979
Epoch 7 - Save Best Score: 0.7984 Model


EVAL: [118/119] Elapsed 0m 22s (remain 0m 0s) Loss: 0.3088(0.4621) 
Epoch: [8][0/1068] Elapsed 0m 1s (remain 25m 39s) Loss: 0.4383(0.4383) Grad: 0.2989 LR: 0.000285  
Epoch: [8][100/1068] Elapsed 1m 10s (remain 11m 16s) Loss: 0.4479(0.4345) Grad: 0.2199 LR: 0.000285  
Epoch: [8][200/1068] Elapsed 2m 20s (remain 10m 3s) Loss: 0.4496(0.4342) Grad: 0.2339 LR: 0.000285  
Epoch: [8][300/1068] Elapsed 3m 29s (remain 8m 53s) Loss: 0.4379(0.4350) Grad: 0.2652 LR: 0.000285  
Epoch: [8][400/1068] Elapsed 4m 38s (remain 7m 43s) Loss: 0.4177(0.4353) Grad: 0.2249 LR: 0.000285  
Epoch: [8][500/1068] Elapsed 5m 47s (remain 6m 33s) Loss: 0.4296(0.4355) Grad: 0.2182 LR: 0.000285  
Epoch: [8][600/1068] Elapsed 6m 56s (remain 5m 23s) Loss: 0.4407(0.4358) Grad: 0.2477 LR: 0.000285  
Epoch: [8][700/1068] Elapsed 8m 6s (remain 4m 14s) Loss: 0.4352(0.4361) Grad: 0.2282 LR: 0.000285  
Epoch: [8][800/1068] Elapsed 9m 15s (remain 3m 5s) Loss: 0.4577(0.4364) Grad: 0.2582 LR: 0.000285  
Epoch: [8][900/1068] Elaps

Epoch 8 - avg_train_loss: 0.4367  avg_val_loss: 0.4600  time: 763s
Epoch 8 - Accuracy: 0.7991777822228305
Epoch 8 - Save Best Score: 0.7992 Model


EVAL: [118/119] Elapsed 0m 22s (remain 0m 0s) Loss: 0.3103(0.4600) 
Epoch: [9][0/1068] Elapsed 0m 1s (remain 27m 15s) Loss: 0.4275(0.4275) Grad: 0.2092 LR: 0.000186  
Epoch: [9][100/1068] Elapsed 1m 10s (remain 11m 19s) Loss: 0.4452(0.4284) Grad: 0.2333 LR: 0.000186  
Epoch: [9][200/1068] Elapsed 2m 20s (remain 10m 4s) Loss: 0.4534(0.4297) Grad: 0.2945 LR: 0.000186  
Epoch: [9][300/1068] Elapsed 3m 29s (remain 8m 53s) Loss: 0.4220(0.4302) Grad: 0.2296 LR: 0.000186  
Epoch: [9][400/1068] Elapsed 4m 38s (remain 7m 43s) Loss: 0.4328(0.4307) Grad: 0.2460 LR: 0.000186  
Epoch: [9][500/1068] Elapsed 5m 47s (remain 6m 33s) Loss: 0.4486(0.4312) Grad: 0.2201 LR: 0.000186  
Epoch: [9][600/1068] Elapsed 6m 57s (remain 5m 24s) Loss: 0.4169(0.4313) Grad: 0.2540 LR: 0.000186  
Epoch: [9][700/1068] Elapsed 8m 6s (remain 4m 14s) Loss: 0.4290(0.4314) Grad: 0.2758 LR: 0.000186  
Epoch: [9][800/1068] Elapsed 9m 15s (remain 3m 5s) Loss: 0.4292(0.4314) Grad: 0.3277 LR: 0.000186  
Epoch: [9][900/1068] Elaps

Epoch 9 - avg_train_loss: 0.4318  avg_val_loss: 0.4597  time: 764s
Epoch 9 - Accuracy: 0.7993356985692779
Epoch 9 - Save Best Score: 0.7993 Model


EVAL: [118/119] Elapsed 0m 22s (remain 0m 0s) Loss: 0.3114(0.4597) 
Epoch: [10][0/1068] Elapsed 0m 1s (remain 25m 43s) Loss: 0.4273(0.4273) Grad: 0.2559 LR: 0.000122  
Epoch: [10][100/1068] Elapsed 1m 10s (remain 11m 17s) Loss: 0.4386(0.4265) Grad: 0.3406 LR: 0.000122  
Epoch: [10][200/1068] Elapsed 2m 19s (remain 10m 3s) Loss: 0.4105(0.4266) Grad: 0.2253 LR: 0.000122  
Epoch: [10][300/1068] Elapsed 3m 29s (remain 8m 53s) Loss: 0.4299(0.4269) Grad: 0.2561 LR: 0.000122  
Epoch: [10][400/1068] Elapsed 4m 38s (remain 7m 43s) Loss: 0.4226(0.4271) Grad: 0.2655 LR: 0.000122  
Epoch: [10][500/1068] Elapsed 5m 47s (remain 6m 33s) Loss: 0.4435(0.4274) Grad: 0.2678 LR: 0.000122  
Epoch: [10][600/1068] Elapsed 6m 56s (remain 5m 23s) Loss: 0.4280(0.4275) Grad: 0.2420 LR: 0.000122  
Epoch: [10][700/1068] Elapsed 8m 6s (remain 4m 14s) Loss: 0.4245(0.4275) Grad: 0.2868 LR: 0.000122  
Epoch: [10][800/1068] Elapsed 9m 15s (remain 3m 5s) Loss: 0.4357(0.4275) Grad: 0.2921 LR: 0.000122  
Epoch: [10][900/1

Epoch 10 - avg_train_loss: 0.4280  avg_val_loss: 0.4600  time: 764s
Epoch 10 - Accuracy: 0.800154231631697
Epoch 10 - Save Best Score: 0.8002 Model
Epoch 10 - Save final model


EVAL: [118/119] Elapsed 0m 22s (remain 0m 0s) Loss: 0.3094(0.4600) 


Score: 0.80015
