# About this notebook ...

## Prepare for Colab

In [1]:
import sys

IN_COLAB = "google.colab" in sys.modules

In [2]:
COMPETE = "hungry-geese"
DATASETS = [
    "imokuri/hungrygeeseepisode",
]
KERNEL_OUTPUTS = [
    # "imokuri/notebook8d5846e909",
    # "imokuri/notebook42cdc46ffc",
]
PACKAGES = []

In [3]:
if IN_COLAB:
    # Work around for python2 exception.
    !python2 -m pip uninstall kaggle -y
    !python3 -m pip uninstall kaggle -y
    !python3 -m pip install -U -q kaggle

In [4]:
if IN_COLAB:
    !pip install -q -U git+https://github.com/IMOKURI/kaggle_on_google_colab.git

    from kaggle_on_google_colab import setup
    kaggle = setup.Setup()
    kaggle.dirs(COMPETE)

    # !kaggle competitions download -p /content/zip {COMPETE}
    # !unzip -q -n /content/zip/{COMPETE}.zip -d /content/{COMPETE}/input/{COMPETE}

    for dataset in DATASETS:
        dataset_name = dataset.split("/")[-1]
        !kaggle datasets download -p /content/zip {dataset}
        !unzip -q -n /content/zip/{dataset_name}.zip -d /content/{COMPETE}/input/{dataset_name}

    for kernel in KERNEL_OUTPUTS:
        kernel_name = kernel.split("/")[-1]
        !kaggle kernels output -p /content/{COMPETE}/input/{kernel_name} {kernel}

    for package in PACKAGES:
        !pip install -q {package}

    %cd /content/{COMPETE}/output

## Library

In [5]:
import glob
import json
import math
import os
import random
import time
import warnings
from contextlib import contextmanager

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedKFold
from torch.optim import SGD, Adam
from torch.optim.lr_scheduler import CosineAnnealingLR, CosineAnnealingWarmRestarts, ReduceLROnPlateau
from torch.utils.data import DataLoader, Dataset
from tqdm.notebook import tqdm

In [6]:
warnings.filterwarnings("ignore")

## Load Data

In [7]:
BASE_DIR = "../input/hungrygeeseepisode/hungry-geese-episode/"
OUTPUT_DIR = "pre-models/"

if not os.path.exists(OUTPUT_DIR):
    os.makedirs(OUTPUT_DIR)

In [8]:
paths = [path for path in glob.glob(BASE_DIR + "*.json") if "info" not in path]
print(len(paths))

4508


## Config

In [9]:
class Config:
    seed = 440

    n_class = 4
    n_fold = 10  # 今回は fold 1つしかまわさない

    gradient_accumulation_steps = 1
    max_grad_norm = 1000

    num_workers = 4
    batch_size = 3200

    scheduler = "CosineAnnealingWarmRestarts"
    # factor = 0.2  # ReduceLROnPlateau
    # patience = 4  # ReduceLROnPlateau
    # eps = 1e-6  # ReduceLROnPlateau
    # T_max = 10  # CosineAnnealingLR
    T_0 = 10  # CosineAnnealingWarmRestarts

    criterion = "CrossEntropyLoss"
    lr = 1e-3
    min_lr = 1e-4
    weight_decay = 0

    epochs = 10
    model_name = "geese_net"

    print_freq = 100

    train = True
    debug = False
    apex = False

In [10]:
if Config.debug:
    Config.epochs = 1

In [11]:
if Config.apex:
    from apex import amp

In [12]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

## Utils

In [13]:
@contextmanager
def timer(name):
    t0 = time.time()
    LOGGER.info(f"[{name}] start")
    yield
    LOGGER.info(f"[{name}] done in {time.time() - t0:.0f} s.")


def init_logger(log_file=OUTPUT_DIR + "train.log"):
    from logging import INFO, FileHandler, Formatter, StreamHandler, getLogger

    logger = getLogger(__name__)
    logger.setLevel(INFO)
    handler1 = StreamHandler()
    handler1.setFormatter(Formatter("%(message)s"))
    handler2 = FileHandler(filename=log_file)
    handler2.setFormatter(Formatter("%(message)s"))
    logger.addHandler(handler1)
    logger.addHandler(handler2)
    return logger


LOGGER = init_logger()


def seed_torch(seed=42):
    random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True


seed_torch(seed=Config.seed)

In [14]:
def reverse_ns(y):
    if y == 0:
        return 1
    if y == 1:
        return 0
    return y


def reverse_we(y):
    if y == 2:
        return 3
    if y == 3:
        return 2
    return y


def reverse_nswe(y):
    return reverse_ns(reverse_we(y))

## Observation

In [15]:
def make_input(obses):
    b = np.zeros((17, 7 * 11), dtype=np.float32)
    obs = obses[-1]

    for p, pos_list in enumerate(obs["geese"]):
        pid = (p - obs["index"]) % 4

        # head position
        for pos in pos_list[:1]:
            b[0 + pid, pos] = 1
        # tip position
        for pos in pos_list[-1:]:
            b[4 + pid, pos] = 1
        # whole position
        for pos in pos_list:
            b[8 + pid, pos] = 1

    # previous head position
    if len(obses) > 1:
        obs_prev = obses[-2]
        for p, pos_list in enumerate(obs_prev["geese"]):
            for pos in pos_list[:1]:
                b[12 + (p - obs["index"]) % 4, pos] = 1

    # food
    for pos in obs["food"]:
        b[16, pos] = 1

    return b.reshape(-1, 7, 11)

In [16]:
def observation_num_step(obses):
    b = np.zeros((7, 11), dtype=np.float32)
    obs = obses[-1]

    num_step = obs["step"]  # 0-198
    b[0, 0] = num_step / 198

    return b.reshape(1, 7, 11)

## Data

In [17]:
def create_dataset_from_json(filepath, json_object=None, standing=0):
    if json_object is None:
        json_open = open(path, "r")
        json_load = json.load(json_open)
    else:
        json_load = json_object

    try:
        winner_index = np.argmax(np.argsort(json_load["rewards"]) == 3 - standing)

        obses = []
        X = []
        y = []
        actions = {"NORTH": 0, "SOUTH": 1, "WEST": 2, "EAST": 3}

        for i in range(len(json_load["steps"]) - 1):
            if json_load["steps"][i][winner_index]["status"] == "ACTIVE":
                y_ = json_load["steps"][i + 1][winner_index]["action"]
                if y_ is not None:
                    step = json_load["steps"][i]
                    step[winner_index]["observation"]["geese"] = step[0]["observation"]["geese"]
                    step[winner_index]["observation"]["food"] = step[0]["observation"]["food"]
                    step[winner_index]["observation"]["step"] = step[0]["observation"]["step"]
                    obses.append(step[winner_index]["observation"])
                    y.append(actions[y_])

                    y.append(reverse_ns(actions[y_]))  # 上下反転
                    y.append(reverse_we(actions[y_]))  # 左右反転
                    y.append(reverse_nswe(actions[y_]))  # 上下左右反転

        for j in range(len(obses)):
            # X_ = make_input(obses[: j + 1])

            X_ = []
            X_.append(make_input(obses[: j + 1]))
            # X_.append(observation_num_step(obses[: j + 1]))
            X_ = np.concatenate(X_)

            X.append(X_)

            X.append(X_[:, ::-1, :])  # 上下反転
            X.append(X_[:, :, ::-1])  # 左右反転
            X.append(X_[:, ::-1, ::-1])  # 上下左右反転

        X = np.array(X, dtype=np.float32)  # [starting_step:]
        y = np.array(y, dtype=np.uint8)  # [starting_step:]

        return X, y
    except:
        return 0, 0

In [18]:
X_train = []
y_train = []

for path in tqdm(paths[: int(len(paths))]):
    X, y = create_dataset_from_json(path, standing=0)  # use only winners' moves
    if X is not 0:
        X_train.append(X)
        y_train.append(y)

X_train = np.concatenate(X_train)
y_train = np.concatenate(y_train)

print(f"Num episode: {len(X_train)}")

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=4508.0), HTML(value='')))


Num episode: 2863164


In [19]:
# TODO: データをuniqueにしたいがメモリエラーになってしまう。

# X_train, unique_index = np.unique(X_train, axis=0, return_index=True)  # remove duplicate
# y_train = y_train[unique_index]

# y_train = np.eye(4, dtype="uint8")[y_train]  # to categorical

# print(f"Num episode: {len(X_train)}")

In [20]:
# まとめてuniqueかけるとメモリ量が足りないので、いくつかのグループに分けてuniqueにする

X_train_sum_obs = X_train.reshape(X_train.shape[0], -1).sum(1)
X_train_group = np.unique(X_train_sum_obs)
X_train_group.shape

(75,)

In [21]:
X_train_unique = []
y_train_unique = []
for group in tqdm(X_train_group):
    group_index = np.where(X_train_sum_obs == group)

    X_train_ = X_train[group_index]
    y_train_ = y_train[group_index]

    X_train_, unique_index = np.unique(X_train_, axis=0, return_index=True)  # remove duplicate
    y_train_ = y_train_[unique_index]

    X_train_unique.append(X_train_)
    y_train_unique.append(y_train_)

X_train = np.concatenate(X_train_unique)
y_train = np.concatenate(y_train_unique)

print(f"Num episode: {len(X_train)}")

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=75.0), HTML(value='')))


Num episode: 2862924


In [22]:
if Config.debug:
    X_train = X_train[:1000]
    y_train = y_train[:1000]

In [23]:
y_df = pd.DataFrame(y_train)
y_df.columns = ["action"]
y_df

Unnamed: 0,action
0,1
1,1
2,3
3,0
4,2
...,...
2862919,1
2862920,2
2862921,3
2862922,2


## CV Split

In [24]:
folds = y_df.copy()
Fold = StratifiedKFold(n_splits=Config.n_fold, shuffle=True, random_state=Config.seed)
for n, (train_index, val_index) in enumerate(Fold.split(folds, folds["action"])):
    folds.loc[val_index, "fold"] = int(n)
folds["fold"] = folds["fold"].astype(int)
print(folds.groupby(["fold", "action"]).size())

fold  action
0     0         68407
      1         68407
      2         74739
      3         74740
1     0         68407
      1         68407
      2         74739
      3         74740
2     0         68407
      1         68407
      2         74740
      3         74739
3     0         68407
      1         68407
      2         74740
      3         74739
4     0         68407
      1         68407
      2         74739
      3         74739
5     0         68407
      1         68407
      2         74739
      3         74739
6     0         68407
      1         68407
      2         74739
      3         74739
7     0         68407
      1         68407
      2         74739
      3         74739
8     0         68407
      1         68407
      2         74739
      3         74739
9     0         68407
      1         68407
      2         74739
      3         74739
dtype: int64


## Dataset

In [25]:
class TrainDataset(Dataset):
    def __init__(self, array, label):
        self.array = array
        self.label = label

    def __len__(self):
        return self.array.shape[0]

    def __getitem__(self, idx):
        return self.array[idx], torch.tensor(self.label[idx]).long()


class TestDataset(Dataset):
    def __init__(self, array):
        self.array = array

    def __len__(self):
        return self.array.shape[0]

    def __getitem__(self, idx):
        return self.array[idx]

In [26]:
# Test

train_ds = TrainDataset(X_train, y_train)

for i in range(1):
    obs, action = train_ds[i]
    print(obs.shape, action)

(17, 7, 11) tensor(1)


## Model

In [27]:
class TorusConv2d(nn.Module):
    def __init__(self, input_dim, output_dim, kernel_size, bn):
        super().__init__()
        self.edge_size = (kernel_size[0] // 2, kernel_size[1] // 2)
        self.conv = nn.Conv2d(input_dim, output_dim, kernel_size=kernel_size)
        self.bn = nn.BatchNorm2d(output_dim) if bn else None

    def forward(self, x):
        h = torch.cat([x[:, :, :, -self.edge_size[1] :], x, x[:, :, :, : self.edge_size[1]]], dim=3)
        h = torch.cat([h[:, :, -self.edge_size[0] :], h, h[:, :, : self.edge_size[0]]], dim=2)
        h = self.conv(h)
        h = self.bn(h) if self.bn is not None else h
        return h

In [28]:
class GeeseNet(nn.Module):
    def __init__(self):
        super().__init__()
        layers, filters = 12, 32
        self.conv0 = TorusConv2d(17, filters, (3, 3), True)
        self.blocks = nn.ModuleList([TorusConv2d(filters, filters, (3, 3), True) for _ in range(layers)])

        self.conv_p = TorusConv2d(filters, filters, (3, 3), True)
        self.conv_v = TorusConv2d(filters, filters, (3, 3), True)

        self.head_p = nn.Linear(filters, 4, bias=False)
        self.head_v1 = nn.Linear(filters * 2, filters, bias=False)
        self.head_v2 = nn.Linear(filters, 1, bias=False)

    def forward(self, x, _=None):
        h = F.relu_(self.conv0(x))
        for block in self.blocks:
            h = F.relu_(h + block(h))

        h_p = F.relu_(self.conv_p(h))
        h_head_p = (h_p * x[:, :1]).view(h_p.size(0), h_p.size(1), -1).sum(-1)
        p = self.head_p(h_head_p)

        h_v = F.relu_(self.conv_v(h))
        h_head_v = (h_v * x[:, :1]).view(h_v.size(0), h_v.size(1), -1).sum(-1)
        h_avg_v = h_v.view(h_v.size(0), h_v.size(1), -1).mean(-1)

        h_v = F.relu_(self.head_v1(torch.cat([h_head_v, h_avg_v], 1)))
        v = torch.tanh(self.head_v2(h_v))

        return {"policy": p, "value": v}

In [29]:
class GeeseNetAlpha(nn.Module):
    def __init__(self):
        super().__init__()
        layers, filters = 12, 64
        self.conv0 = TorusConv2d(17, filters, (3, 3), True)
        self.blocks = nn.ModuleList([TorusConv2d(filters, filters, (3, 3), True) for _ in range(layers)])

        self.conv_p = TorusConv2d(filters, filters, (3, 3), True)
        # self.conv_v = TorusConv2d(filters, filters, (3, 3), True)

        self.head_p1 = nn.Linear(filters * 2 + 77, filters, bias=False)
        self.head_p2 = nn.Linear(filters, 4, bias=False)
        # self.head_v1 = nn.Linear(filters * 2, filters, bias=False)
        # self.head_v2 = nn.Linear(filters, 1, bias=False)

    def forward(self, x, _=None):
        # x = x[:, :-1]
        # num_step = x[:, -1, 0, 0].view(x.size(0), 1)

        h = F.relu_(self.conv0(x))
        for block in self.blocks:
            h = F.relu_(h + block(h))

        h_p = F.relu_(self.conv_p(h))
        h_head_p = (h_p * x[:, :1]).view(h_p.size(0), h_p.size(1), -1).sum(-1)
        h_avg_p1 = h_p.view(h_p.size(0), h_p.size(1), -1).mean(-1)
        h_avg_p2 = h_p.view(h_p.size(0), h_p.size(1), -1).mean(1)

        h_p = F.relu_(self.head_p1(torch.cat([h_head_p, h_avg_p1, h_avg_p2], 1)))
        p = self.head_p2(h_p)

        # h_v = F.relu_(self.conv_v(h))
        # h_head_v = (h_v * x[:, :1]).view(h_v.size(0), h_v.size(1), -1).sum(-1)
        # h_avg_v = h_v.view(h_v.size(0), h_v.size(1), -1).mean(-1)

        # h_v = F.relu_(self.head_v1(torch.cat([h_head_v, h_avg_v], 1)))
        # v = torch.tanh(self.head_v2(h_v))

        return {"policy": p}  # "value": v

In [30]:
# Test

model = GeeseNetAlpha()
# print(model)

params = sum(p.numel() for p in model.parameters())
print(f"params: {params:,}")

train_ds = TrainDataset(X_train, y_train)
train_loader = DataLoader(train_ds, batch_size=4, shuffle=True, num_workers=4, pin_memory=True, drop_last=True)

for obs, action in train_loader:
    output = model(obs)
    print(output)
    print(f"{torch.argmax(output['policy'], dim=1)}")
    break

params: 505,088
{'policy': tensor([[-0.0146,  0.0525,  0.0750,  0.0517],
        [-0.0780, -0.0301,  0.1190,  0.0674],
        [-0.1203,  0.0389,  0.0797,  0.0230],
        [-0.0968, -0.1358, -0.0216,  0.0691]], grad_fn=<MmBackward>)}
tensor([2, 2, 2, 3])


## Loss

## Scoring

In [31]:
def get_score(y_true, y_pred):
    return accuracy_score(y_true, y_pred)

## Helper functions

In [32]:
class AverageMeter(object):
    """Computes and stores the average and current value"""

    def __init__(self):
        self.reset()

    def reset(self):
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count


def asMinutes(s):
    m = math.floor(s / 60)
    s -= m * 60
    return "%dm %ds" % (m, s)


def timeSince(since, percent):
    now = time.time()
    s = now - since
    es = s / (percent)
    rs = es - s
    return "%s (remain %s)" % (asMinutes(s), asMinutes(rs))

In [33]:
def train_fn(train_loader, model, criterion, optimizer, epoch, scheduler, device):
    batch_time = AverageMeter()
    data_time = AverageMeter()
    losses = AverageMeter()
    scores = AverageMeter()

    # switch to train mode
    model.train()
    start = end = time.time()
    global_step = 0

    for step, (obs, action) in enumerate(train_loader):
        # measure data loading time
        data_time.update(time.time() - end)
        obs = obs.to(device)
        action = action.to(device)
        batch_size = action.size(0)

        y_preds = model(obs.float())["policy"]

        loss = criterion(y_preds, action)

        # record loss
        losses.update(loss.item(), batch_size)
        if Config.gradient_accumulation_steps > 1:
            loss = loss / Config.gradient_accumulation_steps
        if Config.apex:
            with amp.scale_loss(loss, optimizer) as scaled_loss:
                scaled_loss.backward()
        else:
            loss.backward()

        grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), Config.max_grad_norm)

        if (step + 1) % Config.gradient_accumulation_steps == 0:
            optimizer.step()
            optimizer.zero_grad()
            global_step += 1

        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()

        if step % Config.print_freq == 0 or step == (len(train_loader) - 1):
            print(
                f"Epoch: [{epoch + 1}][{step}/{len(train_loader)}] "
                f"Elapsed {timeSince(start, float(step + 1) / len(train_loader)):s} "
                f"Loss: {losses.val:.4f}({losses.avg:.4f}) "
                f"Grad: {grad_norm:.4f} "
                f"LR: {scheduler.get_last_lr()[0]:.6f}  "
            )

    return losses.avg

In [34]:
def valid_fn(valid_loader, model, criterion, device):
    batch_time = AverageMeter()
    data_time = AverageMeter()
    losses = AverageMeter()
    scores = AverageMeter()

    # switch to evaluation mode
    model.eval()
    preds = []
    start = end = time.time()

    for step, (obs, action) in enumerate(valid_loader):
        # measure data loading time
        data_time.update(time.time() - end)
        obs = obs.to(device)
        action = action.to(device)
        batch_size = action.size(0)

        # compute loss
        with torch.no_grad():
            y_preds = model(obs)["policy"]

        loss = criterion(y_preds, action)
        losses.update(loss.item(), batch_size)

        # record accuracy
        preds.append(y_preds.softmax(1).to("cpu").numpy())
        if Config.gradient_accumulation_steps > 1:
            loss = loss / Config.gradient_accumulation_steps

        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()

        if step % Config.print_freq == 0 or step == (len(valid_loader) - 1):
            print(
                f"EVAL: [{step}/{len(valid_loader)}] "
                f"Elapsed {timeSince(start, float(step + 1) / len(valid_loader)):s} "
                f"Loss: {losses.val:.4f}({losses.avg:.4f}) "
            )
    predictions = np.concatenate(preds)
    return losses.avg, predictions

## Train loop

In [35]:
def train_loop(folds, fold):

    LOGGER.info(f"========== fold: {fold} training ==========")

    # ====================================================
    # Data Loader
    # ====================================================
    X_train_folds = X_train[folds["fold"] != fold]
    X_valid_folds = X_train[folds["fold"] == fold]

    y_train_folds = y_train[folds["fold"] != fold]
    y_valid_folds = y_train[folds["fold"] == fold]

    y_df_train_folds = y_df[folds["fold"] != fold]
    y_df_valid_folds = y_df[folds["fold"] == fold]

    train_dataset = TrainDataset(X_train_folds, y_train_folds)
    valid_dataset = TrainDataset(X_valid_folds, y_valid_folds)

    train_loader = DataLoader(
        train_dataset,
        batch_size=Config.batch_size,
        shuffle=True,
        num_workers=Config.num_workers,
        pin_memory=True,
        drop_last=True,
    )
    valid_loader = DataLoader(
        valid_dataset,
        batch_size=Config.batch_size,
        shuffle=False,
        num_workers=Config.num_workers,
        pin_memory=True,
        drop_last=False,
    )

    # ====================================================
    # Scheduler
    # ====================================================
    def get_scheduler(optimizer):
        if Config.scheduler == "ReduceLROnPlateau":
            scheduler = ReduceLROnPlateau(
                optimizer, mode="min", factor=Config.factor, patience=Config.patience, verbose=True, eps=Config.eps
            )
        elif Config.scheduler == "CosineAnnealingLR":
            scheduler = CosineAnnealingLR(optimizer, T_max=Config.T_max, eta_min=Config.min_lr, last_epoch=-1)
        elif Config.scheduler == "CosineAnnealingWarmRestarts":
            scheduler = CosineAnnealingWarmRestarts(
                optimizer, T_0=Config.T_0, T_mult=1, eta_min=Config.min_lr, last_epoch=-1
            )
        return scheduler

    # ====================================================
    # model & optimizer
    # ====================================================
    model = GeeseNetAlpha()
    model.to(device)

    # Use multi GPU
    if device == torch.device("cuda") and not Config.apex:
        model = torch.nn.DataParallel(model)  # make parallel
        # torch.backends.cudnn.benchmark=True

    optimizer = Adam(model.parameters(), lr=Config.lr, weight_decay=Config.weight_decay, amsgrad=False)
    scheduler = get_scheduler(optimizer)

    # ====================================================
    # apex
    # ====================================================
    if Config.apex:
        model, optimizer = amp.initialize(model, optimizer, opt_level="O1", verbosity=0)

    # ====================================================
    # Criterion
    # ====================================================
    def get_criterion():
        if Config.criterion == "CrossEntropyLoss":
            criterion = nn.CrossEntropyLoss()
        return criterion

    criterion = get_criterion()

    # ====================================================
    # loop
    # ====================================================
    best_score = 0.0
    best_loss = np.inf
    best_preds = None

    for epoch in range(Config.epochs):

        start_time = time.time()

        # train
        avg_loss = train_fn(train_loader, model, criterion, optimizer, epoch, scheduler, device)

        # eval
        avg_val_loss, preds = valid_fn(valid_loader, model, criterion, device)

        if isinstance(scheduler, ReduceLROnPlateau):
            scheduler.step(avg_val_loss)
        elif isinstance(scheduler, CosineAnnealingLR):
            scheduler.step()
        elif isinstance(scheduler, CosineAnnealingWarmRestarts):
            scheduler.step()

        # scoring
        score = get_score(y_valid_folds, preds.argmax(1))

        elapsed = time.time() - start_time

        LOGGER.info(
            f"Epoch {epoch+1} - avg_train_loss: {avg_loss:.4f}  avg_val_loss: {avg_val_loss:.4f}  time: {elapsed:.0f}s"
        )
        LOGGER.info(f"Epoch {epoch+1} - Accuracy: {score}")

        if score > best_score:
            best_score = score
            LOGGER.info(f"Epoch {epoch+1} - Save Best Score: {best_score:.4f} Model")
            torch.save(
                # {"model": model.state_dict(), "preds": preds}, OUTPUT_DIR + f"{Config.model_name}_fold{fold}_best.pth"
                model.module.state_dict(), OUTPUT_DIR + f"{Config.model_name}_fold{fold}_best.pth"
            )
            best_preds = preds

        if epoch == Config.epochs - 1:
            LOGGER.info(f"Epoch {epoch+1} - Save final model")
            torch.save(
                # {"model": model.state_dict(), "preds": preds}, OUTPUT_DIR + f"{Config.model_name}_fold{fold}_final.pth"
                model.module.state_dict(), OUTPUT_DIR + f"{Config.model_name}_fold{fold}_final.pth"
            )

    # check_point = torch.load(OUTPUT_DIR + f"{Config.model_name}_fold{fold}_best.pth")

    y_df_valid_folds[[str(c) for c in range(Config.n_class)]] = best_preds  # check_point["preds"]
    y_df_valid_folds["preds"] = best_preds.argmax(1)  # check_point["preds"].argmax(1)

    return y_df_valid_folds

## Main


In [36]:
def main():
    def get_result(result_df):
        preds = result_df["preds"].values
        labels = result_df["action"].values
        score = get_score(labels, preds)
        LOGGER.info(f"Score: {score:<.5f}")

    if Config.train:
        # train
        oof_df = pd.DataFrame()
        for fold in range(Config.n_fold):
            _oof_df = train_loop(folds, fold)
            oof_df = pd.concat([oof_df, _oof_df])
            LOGGER.info(f"========== fold: {fold} result ==========")
            get_result(_oof_df)
            break  # 今回は fold 1つしかまわさない
        # CV result
        # LOGGER.info(f"========== CV ==========")
        # get_result(oof_df)
        # save result
        oof_df.to_csv(OUTPUT_DIR + "oof_df.csv", index=False)

In [37]:
if __name__ == "__main__":
    main()



Epoch: [1][0/805] Elapsed 0m 3s (remain 43m 49s) Loss: 1.3881(1.3881) Grad: 0.3445 LR: 0.001000  
Epoch: [1][100/805] Elapsed 1m 11s (remain 8m 18s) Loss: 0.6174(0.6839) Grad: 1.1552 LR: 0.001000  
Epoch: [1][200/805] Elapsed 2m 19s (remain 6m 59s) Loss: 0.5495(0.6304) Grad: 0.6035 LR: 0.001000  
Epoch: [1][300/805] Elapsed 3m 27s (remain 5m 48s) Loss: 0.5227(0.6028) Grad: 0.9458 LR: 0.001000  
Epoch: [1][400/805] Elapsed 4m 36s (remain 4m 38s) Loss: 0.4957(0.5840) Grad: 0.4088 LR: 0.001000  
Epoch: [1][500/805] Elapsed 5m 44s (remain 3m 28s) Loss: 0.5237(0.5714) Grad: 0.5722 LR: 0.001000  
Epoch: [1][600/805] Elapsed 6m 52s (remain 2m 20s) Loss: 0.5289(0.5613) Grad: 0.4071 LR: 0.001000  
Epoch: [1][700/805] Elapsed 8m 0s (remain 1m 11s) Loss: 0.5178(0.5537) Grad: 0.3928 LR: 0.001000  
Epoch: [1][800/805] Elapsed 9m 9s (remain 0m 2s) Loss: 0.5150(0.5475) Grad: 0.3996 LR: 0.001000  
Epoch: [1][804/805] Elapsed 9m 11s (remain 0m 0s) Loss: 0.4934(0.5473) Grad: 0.4207 LR: 0.001000  
EVAL: 

Epoch 1 - avg_train_loss: 0.5473  avg_val_loss: 0.5028  time: 569s
Epoch 1 - Accuracy: 0.777025634577164
Epoch 1 - Save Best Score: 0.7770 Model


EVAL: [89/90] Elapsed 0m 16s (remain 0m 0s) Loss: 0.3072(0.5028) 
Epoch: [2][0/805] Elapsed 0m 1s (remain 18m 21s) Loss: 0.4945(0.4945) Grad: 0.3693 LR: 0.000978  
Epoch: [2][100/805] Elapsed 1m 9s (remain 8m 5s) Loss: 0.4764(0.4936) Grad: 0.5142 LR: 0.000978  
Epoch: [2][200/805] Elapsed 2m 17s (remain 6m 54s) Loss: 0.5062(0.4932) Grad: 0.6635 LR: 0.000978  
Epoch: [2][300/805] Elapsed 3m 26s (remain 5m 45s) Loss: 0.4858(0.4924) Grad: 0.2970 LR: 0.000978  
Epoch: [2][400/805] Elapsed 4m 34s (remain 4m 36s) Loss: 0.4932(0.4912) Grad: 0.6144 LR: 0.000978  
Epoch: [2][500/805] Elapsed 5m 42s (remain 3m 27s) Loss: 0.4927(0.4908) Grad: 0.5036 LR: 0.000978  
Epoch: [2][600/805] Elapsed 6m 50s (remain 2m 19s) Loss: 0.5040(0.4897) Grad: 0.4105 LR: 0.000978  
Epoch: [2][700/805] Elapsed 7m 59s (remain 1m 11s) Loss: 0.4797(0.4887) Grad: 0.4742 LR: 0.000978  
Epoch: [2][800/805] Elapsed 9m 7s (remain 0m 2s) Loss: 0.4630(0.4878) Grad: 0.3651 LR: 0.000978  
Epoch: [2][804/805] Elapsed 9m 10s (rema

Epoch 2 - avg_train_loss: 0.4878  avg_val_loss: 0.4878  time: 567s
Epoch 2 - Accuracy: 0.7840010059624231
Epoch 2 - Save Best Score: 0.7840 Model


EVAL: [89/90] Elapsed 0m 16s (remain 0m 0s) Loss: 0.2953(0.4878) 
Epoch: [3][0/805] Elapsed 0m 1s (remain 16m 55s) Loss: 0.4709(0.4709) Grad: 0.3525 LR: 0.000914  
Epoch: [3][100/805] Elapsed 1m 9s (remain 8m 4s) Loss: 0.4679(0.4754) Grad: 0.3862 LR: 0.000914  
Epoch: [3][200/805] Elapsed 2m 17s (remain 6m 54s) Loss: 0.4640(0.4743) Grad: 0.3633 LR: 0.000914  
Epoch: [3][300/805] Elapsed 3m 26s (remain 5m 45s) Loss: 0.4838(0.4741) Grad: 0.4831 LR: 0.000914  
Epoch: [3][400/805] Elapsed 4m 34s (remain 4m 36s) Loss: 0.4678(0.4744) Grad: 0.3511 LR: 0.000914  
Epoch: [3][500/805] Elapsed 5m 42s (remain 3m 28s) Loss: 0.4793(0.4743) Grad: 0.4359 LR: 0.000914  
Epoch: [3][600/805] Elapsed 6m 51s (remain 2m 19s) Loss: 0.4837(0.4741) Grad: 0.2932 LR: 0.000914  
Epoch: [3][700/805] Elapsed 7m 59s (remain 1m 11s) Loss: 0.4775(0.4736) Grad: 0.3003 LR: 0.000914  
Epoch: [3][800/805] Elapsed 9m 7s (remain 0m 2s) Loss: 0.4716(0.4735) Grad: 0.2590 LR: 0.000914  
Epoch: [3][804/805] Elapsed 9m 10s (rema

Epoch 3 - avg_train_loss: 0.4735  avg_val_loss: 0.4753  time: 568s
Epoch 3 - Accuracy: 0.7902917640319532
Epoch 3 - Save Best Score: 0.7903 Model


EVAL: [89/90] Elapsed 0m 16s (remain 0m 0s) Loss: 0.2933(0.4753) 
Epoch: [4][0/805] Elapsed 0m 1s (remain 17m 6s) Loss: 0.4787(0.4787) Grad: 0.3925 LR: 0.000815  
Epoch: [4][100/805] Elapsed 1m 9s (remain 8m 4s) Loss: 0.4446(0.4655) Grad: 0.2700 LR: 0.000815  
Epoch: [4][200/805] Elapsed 2m 17s (remain 6m 53s) Loss: 0.4900(0.4648) Grad: 0.3623 LR: 0.000815  
Epoch: [4][300/805] Elapsed 3m 25s (remain 5m 44s) Loss: 0.4673(0.4649) Grad: 0.3450 LR: 0.000815  
Epoch: [4][400/805] Elapsed 4m 34s (remain 4m 36s) Loss: 0.4636(0.4651) Grad: 0.4003 LR: 0.000815  
Epoch: [4][500/805] Elapsed 5m 42s (remain 3m 27s) Loss: 0.4784(0.4653) Grad: 0.4368 LR: 0.000815  
Epoch: [4][600/805] Elapsed 6m 50s (remain 2m 19s) Loss: 0.4609(0.4653) Grad: 0.4037 LR: 0.000815  
Epoch: [4][700/805] Elapsed 7m 59s (remain 1m 11s) Loss: 0.4510(0.4651) Grad: 0.3115 LR: 0.000815  
Epoch: [4][800/805] Elapsed 9m 7s (remain 0m 2s) Loss: 0.4523(0.4648) Grad: 0.3132 LR: 0.000815  
Epoch: [4][804/805] Elapsed 9m 10s (remai

Epoch 4 - avg_train_loss: 0.4648  avg_val_loss: 0.4742  time: 567s
Epoch 4 - Accuracy: 0.790875082520355
Epoch 4 - Save Best Score: 0.7909 Model


EVAL: [89/90] Elapsed 0m 16s (remain 0m 0s) Loss: 0.2949(0.4742) 
Epoch: [5][0/805] Elapsed 0m 1s (remain 17m 14s) Loss: 0.4624(0.4624) Grad: 0.2779 LR: 0.000689  
Epoch: [5][100/805] Elapsed 1m 9s (remain 8m 4s) Loss: 0.4603(0.4565) Grad: 0.3781 LR: 0.000689  
Epoch: [5][200/805] Elapsed 2m 17s (remain 6m 54s) Loss: 0.4537(0.4561) Grad: 0.3008 LR: 0.000689  
Epoch: [5][300/805] Elapsed 3m 26s (remain 5m 45s) Loss: 0.4381(0.4562) Grad: 0.3195 LR: 0.000689  
Epoch: [5][400/805] Elapsed 4m 34s (remain 4m 36s) Loss: 0.4428(0.4566) Grad: 0.2803 LR: 0.000689  
Epoch: [5][500/805] Elapsed 5m 42s (remain 3m 27s) Loss: 0.4395(0.4570) Grad: 0.2527 LR: 0.000689  
Epoch: [5][600/805] Elapsed 6m 50s (remain 2m 19s) Loss: 0.4465(0.4573) Grad: 0.3193 LR: 0.000689  
Epoch: [5][700/805] Elapsed 7m 59s (remain 1m 11s) Loss: 0.4466(0.4575) Grad: 0.4570 LR: 0.000689  
Epoch: [5][800/805] Elapsed 9m 7s (remain 0m 2s) Loss: 0.4669(0.4577) Grad: 0.3471 LR: 0.000689  
Epoch: [5][804/805] Elapsed 9m 10s (rema

Epoch 5 - avg_train_loss: 0.4578  avg_val_loss: 0.4727  time: 567s
Epoch 5 - Accuracy: 0.7911510236016948
Epoch 5 - Save Best Score: 0.7912 Model


EVAL: [89/90] Elapsed 0m 16s (remain 0m 0s) Loss: 0.2840(0.4727) 
Epoch: [6][0/805] Elapsed 0m 1s (remain 17m 21s) Loss: 0.4473(0.4473) Grad: 0.3251 LR: 0.000550  
Epoch: [6][100/805] Elapsed 1m 9s (remain 8m 5s) Loss: 0.4477(0.4488) Grad: 0.4414 LR: 0.000550  
Epoch: [6][200/805] Elapsed 2m 17s (remain 6m 54s) Loss: 0.4412(0.4494) Grad: 0.3024 LR: 0.000550  
Epoch: [6][300/805] Elapsed 3m 26s (remain 5m 44s) Loss: 0.4406(0.4502) Grad: 0.2307 LR: 0.000550  
Epoch: [6][400/805] Elapsed 4m 34s (remain 4m 36s) Loss: 0.4711(0.4505) Grad: 0.3267 LR: 0.000550  
Epoch: [6][500/805] Elapsed 5m 42s (remain 3m 27s) Loss: 0.4407(0.4508) Grad: 0.3069 LR: 0.000550  
Epoch: [6][600/805] Elapsed 6m 50s (remain 2m 19s) Loss: 0.4273(0.4507) Grad: 0.3648 LR: 0.000550  
Epoch: [6][700/805] Elapsed 7m 59s (remain 1m 11s) Loss: 0.4491(0.4511) Grad: 0.3046 LR: 0.000550  
Epoch: [6][800/805] Elapsed 9m 7s (remain 0m 2s) Loss: 0.4392(0.4511) Grad: 0.2575 LR: 0.000550  
Epoch: [6][804/805] Elapsed 9m 10s (rema

Epoch 6 - avg_train_loss: 0.4512  avg_val_loss: 0.4672  time: 567s
Epoch 6 - Accuracy: 0.7941619250208702
Epoch 6 - Save Best Score: 0.7942 Model


EVAL: [89/90] Elapsed 0m 16s (remain 0m 0s) Loss: 0.2832(0.4672) 
Epoch: [7][0/805] Elapsed 0m 1s (remain 17m 14s) Loss: 0.4393(0.4393) Grad: 0.2739 LR: 0.000411  
Epoch: [7][100/805] Elapsed 1m 9s (remain 8m 4s) Loss: 0.4510(0.4434) Grad: 0.3307 LR: 0.000411  
Epoch: [7][200/805] Elapsed 2m 17s (remain 6m 54s) Loss: 0.4275(0.4426) Grad: 0.2370 LR: 0.000411  
Epoch: [7][300/805] Elapsed 3m 25s (remain 5m 44s) Loss: 0.4511(0.4430) Grad: 0.2744 LR: 0.000411  
Epoch: [7][400/805] Elapsed 4m 34s (remain 4m 36s) Loss: 0.4666(0.4438) Grad: 0.3290 LR: 0.000411  
Epoch: [7][500/805] Elapsed 5m 42s (remain 3m 27s) Loss: 0.4596(0.4439) Grad: 0.3414 LR: 0.000411  
Epoch: [7][600/805] Elapsed 6m 50s (remain 2m 19s) Loss: 0.4596(0.4441) Grad: 0.3949 LR: 0.000411  
Epoch: [7][700/805] Elapsed 7m 59s (remain 1m 11s) Loss: 0.4724(0.4446) Grad: 0.3638 LR: 0.000411  
Epoch: [7][800/805] Elapsed 9m 7s (remain 0m 2s) Loss: 0.4493(0.4448) Grad: 0.3192 LR: 0.000411  
Epoch: [7][804/805] Elapsed 9m 10s (rema

Epoch 7 - avg_train_loss: 0.4449  avg_val_loss: 0.4644  time: 567s
Epoch 7 - Accuracy: 0.7955835455285319
Epoch 7 - Save Best Score: 0.7956 Model


EVAL: [89/90] Elapsed 0m 16s (remain 0m 0s) Loss: 0.2820(0.4644) 
Epoch: [8][0/805] Elapsed 0m 1s (remain 17m 15s) Loss: 0.4309(0.4309) Grad: 0.2764 LR: 0.000285  
Epoch: [8][100/805] Elapsed 1m 9s (remain 8m 4s) Loss: 0.4329(0.4359) Grad: 0.2673 LR: 0.000285  
Epoch: [8][200/805] Elapsed 2m 17s (remain 6m 54s) Loss: 0.4386(0.4369) Grad: 0.2889 LR: 0.000285  
Epoch: [8][300/805] Elapsed 3m 26s (remain 5m 45s) Loss: 0.4406(0.4372) Grad: 0.2616 LR: 0.000285  
Epoch: [8][400/805] Elapsed 4m 34s (remain 4m 36s) Loss: 0.4517(0.4374) Grad: 0.3849 LR: 0.000285  
Epoch: [8][500/805] Elapsed 5m 42s (remain 3m 27s) Loss: 0.4315(0.4377) Grad: 0.3417 LR: 0.000285  
Epoch: [8][600/805] Elapsed 6m 50s (remain 2m 19s) Loss: 0.4235(0.4378) Grad: 0.4205 LR: 0.000285  
Epoch: [8][700/805] Elapsed 7m 59s (remain 1m 11s) Loss: 0.4580(0.4384) Grad: 0.3266 LR: 0.000285  
Epoch: [8][800/805] Elapsed 9m 7s (remain 0m 2s) Loss: 0.4146(0.4384) Grad: 0.2937 LR: 0.000285  
Epoch: [8][804/805] Elapsed 9m 10s (rema

Epoch 8 - avg_train_loss: 0.4385  avg_val_loss: 0.4642  time: 567s
Epoch 8 - Accuracy: 0.7957651776327049
Epoch 8 - Save Best Score: 0.7958 Model


EVAL: [89/90] Elapsed 0m 16s (remain 0m 0s) Loss: 0.2848(0.4642) 
Epoch: [9][0/805] Elapsed 0m 1s (remain 17m 15s) Loss: 0.4318(0.4318) Grad: 0.2694 LR: 0.000186  
Epoch: [9][100/805] Elapsed 1m 9s (remain 8m 5s) Loss: 0.4105(0.4301) Grad: 0.3077 LR: 0.000186  
Epoch: [9][200/805] Elapsed 2m 17s (remain 6m 54s) Loss: 0.4286(0.4323) Grad: 0.3014 LR: 0.000186  
Epoch: [9][300/805] Elapsed 3m 26s (remain 5m 45s) Loss: 0.4279(0.4320) Grad: 0.2779 LR: 0.000186  
Epoch: [9][400/805] Elapsed 4m 34s (remain 4m 36s) Loss: 0.4436(0.4318) Grad: 0.4073 LR: 0.000186  
Epoch: [9][500/805] Elapsed 5m 42s (remain 3m 28s) Loss: 0.4411(0.4320) Grad: 0.2936 LR: 0.000186  
Epoch: [9][600/805] Elapsed 6m 51s (remain 2m 19s) Loss: 0.4181(0.4322) Grad: 0.4184 LR: 0.000186  
Epoch: [9][700/805] Elapsed 7m 59s (remain 1m 11s) Loss: 0.4240(0.4325) Grad: 0.2943 LR: 0.000186  
Epoch: [9][800/805] Elapsed 9m 7s (remain 0m 2s) Loss: 0.4417(0.4326) Grad: 0.3289 LR: 0.000186  
Epoch: [9][804/805] Elapsed 9m 10s (rema

Epoch 9 - avg_train_loss: 0.4327  avg_val_loss: 0.4639  time: 567s
Epoch 9 - Accuracy: 0.7960550904143657
Epoch 9 - Save Best Score: 0.7961 Model


EVAL: [89/90] Elapsed 0m 16s (remain 0m 0s) Loss: 0.2846(0.4639) 
Epoch: [10][0/805] Elapsed 0m 1s (remain 18m 21s) Loss: 0.4371(0.4371) Grad: 0.3335 LR: 0.000122  
Epoch: [10][100/805] Elapsed 1m 9s (remain 8m 6s) Loss: 0.4198(0.4249) Grad: 0.2861 LR: 0.000122  
Epoch: [10][200/805] Elapsed 2m 18s (remain 6m 54s) Loss: 0.4292(0.4263) Grad: 0.3284 LR: 0.000122  
Epoch: [10][300/805] Elapsed 3m 26s (remain 5m 45s) Loss: 0.4192(0.4269) Grad: 0.3282 LR: 0.000122  
Epoch: [10][400/805] Elapsed 4m 34s (remain 4m 36s) Loss: 0.4375(0.4273) Grad: 0.3146 LR: 0.000122  
Epoch: [10][500/805] Elapsed 5m 42s (remain 3m 28s) Loss: 0.4224(0.4275) Grad: 0.2937 LR: 0.000122  
Epoch: [10][600/805] Elapsed 6m 51s (remain 2m 19s) Loss: 0.4194(0.4278) Grad: 0.3225 LR: 0.000122  
Epoch: [10][700/805] Elapsed 7m 59s (remain 1m 11s) Loss: 0.4387(0.4278) Grad: 0.3909 LR: 0.000122  
Epoch: [10][800/805] Elapsed 9m 7s (remain 0m 2s) Loss: 0.4389(0.4283) Grad: 0.3634 LR: 0.000122  
Epoch: [10][804/805] Elapsed 9m

Epoch 10 - avg_train_loss: 0.4282  avg_val_loss: 0.4647  time: 568s
Epoch 10 - Accuracy: 0.7958979087857545
Epoch 10 - Save final model


EVAL: [89/90] Elapsed 0m 16s (remain 0m 0s) Loss: 0.2861(0.4647) 


Score: 0.79606
