transformer_020_4

- saintv6_2
    - saint_v6からtask_container_id, prior_had_expを除いたもの
- user_step_ids (step=150)
- batch_zise: 512

## files
- ../data/input/train.csv
- ../data/input/questions.csv
- ../data/team/train_folds_vlatest_ALL_2p5M_v2_20201209.feather
<!-- - ../exp/000_tran/compe.yml -->
- ../data/team/transformer_020_4.yml
- ../data/team/seq10/row_{}.pkl

In [1]:
import sys
import numpy as np
import pandas as pd
import os
import gc
import torch
from torch.autograd import detect_anomaly
import time
from fastprogress import master_bar, progress_bar
import datetime
from sklearn import metrics

import random

import torch
import torch.nn as nn
import torch.nn.functional as F
import copy
from torch.utils.data import Dataset
from torch.utils.data import DataLoader

sys.path.append('../src')
from utils import (DataHandler, Timer, seed_everything)

import warnings
warnings.filterwarnings('ignore')

In [2]:
def auc(y_true, y_pred):
    return metrics.roc_auc_score(y_true, y_pred)

In [3]:
INPUT_DIR = '../data/input'
FOLD_DIR = '../data/team'
VALID_SEQ_DIR = '../data/team/seq10'
# SAVE_DIR = '../save'

FOLD_NAME = 'vlatest_ALL_2p5M'
RANDOM_STATE = 20201209

DTYPE = {
    'row_id': 'int64',
    'timestamp': 'int64',
    'user_id': 'int32',
    'content_id': 'int16',
    'content_type_id': 'int8',
    'task_container_id': 'int16',
    'user_answer': 'int8',
    'answered_correctly': 'int8',
    'prior_question_elapsed_time': 'float32',
    'prior_question_had_explanation': 'boolean'
}

TARGET_COLS = ['answered_correctly']

model_name = 'transformer_020_4'

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
debug = False

In [4]:
now = datetime.datetime.now()
run_id = f'{model_name}_{now:%Y%m%d%H%M%S}'
EXP_NAME = f'{FOLD_NAME}__Tran'

dh = DataHandler()
cfg = dh.load('../configs/common/compe.yml')
cfg.update(dh.load(f'../data/team/{model_name}.yml'))

# cfg.data.train.params.step_len = 75
# cfg.data.train.params.max_seq = 51
# cfg.model.params.seq_len = 51
# cfg.data.valid.params.max_seq = 51
# cfg.data.test.params.max_seq = 51
# cfg.data.train.loader.batch_size=1024

# if not os.path.exists(f'{SAVE_DIR}/{EXP_NAME}_{run_id}/'):
#     os.mkdir(f'{SAVE_DIR}/{EXP_NAME}_{run_id}/')
#     os.mkdir(f'{SAVE_DIR}/{EXP_NAME}_{run_id}/seq_model')
#     os.mkdir(f'{SAVE_DIR}/{EXP_NAME}_{run_id}/seq_model_train')

In [5]:
t = Timer()
seed_everything(cfg.common.seed)

STEP_LENGTH = cfg.data.train.params.step_len

In [6]:
if debug:
    train_df = pd.read_csv(f'{INPUT_DIR}/train.csv', dtype=DTYPE, nrows=10**6)
else:
    train_df = pd.read_csv(f'{INPUT_DIR}/train.csv', dtype=DTYPE)
    
###
te_content = pd.read_feather('../features/te_content_id_by_answered_correctly_train.feather')

if debug:
    te_content  = te_content.iloc[:10**6]

train_df['te_content_id_by_answered_correctly'] = te_content['te_content_id_by_answered_correctly'].values
###

folds = pd.read_feather(f'{FOLD_DIR}/train_folds_{FOLD_NAME}_v2_{RANDOM_STATE}.feather')
valid_idx = folds[folds.val == 1]['index'].values
if debug:
    valid_idx = valid_idx[np.where(valid_idx < len(train_df))]

fold_df = pd.DataFrame(index=range(len(train_df)))
fold_df['fold_0'] = 0
fold_df.loc[valid_idx, 'fold_0'] += 1

drop_idx = train_df[train_df.content_type_id != 0].index
train_df = train_df.drop(drop_idx, axis=0).reset_index(drop=True)
fold_df = fold_df.drop(drop_idx, axis=0).reset_index(drop=True)

def make_content_map_dict():
    questions_df = pd.read_csv(f'{INPUT_DIR}/questions.csv')
    q2p = dict(questions_df[['question_id', 'part']].values)
    q2p = np.array(list(q2p.values()))   # ここを追加

    questions_df['tags'] = questions_df['tags'].fillna(0)
    questions_df['tag_list'] = questions_df['tags'].apply(lambda tags: [int(tag) for tag in str(tags).split(' ')])
    questions_df['tag_list'] = questions_df['tag_list'].apply(lambda x: [0] * (6 - len(x)) + x)
    q2tg = dict(questions_df[['question_id', 'tag_list']].values)
    q2tg = np.array(list(q2tg.values()))   # ここを追加

    te_dict = dh.load('../data/processed/te_content_id_by_answered_correctly.pkl')
    te_df = pd.DataFrame.from_dict(te_dict).sort_index().iloc[:13523]
    q2te = np.mean(te_df.values, axis=1)
    
    tsne0_dict = dh.load('../data/processed/tsne_encoder_0.pkl')
    q2ts0 = np.array(list(tsne0_dict.values()))[:13523]
    tsne1_dict = dh.load('../data/processed/tsne_encoder_1.pkl')
    q2ts1 = np.array(list(tsne1_dict.values()))[:13523]

    return q2p, q2tg, q2te, q2ts0, q2ts1

q2p, q2tg, q2te, q2ts0, q2ts1 = make_content_map_dict()

target_df = train_df[TARGET_COLS[0]]
n_splits = len(fold_df.columns)

In [7]:
# =================================================================================
# SAINT v6_2
# mod based on CustomTrainDataset9

class CustomTrainDataset7_2_(Dataset):
    def __init__(self, samples, df, q2p, q2tg, q2te, cfg=None):
        super(CustomTrainDataset7_2_, self).__init__()
        self.max_seq = cfg.params.max_seq
        self.n_content = cfg.params.n_skill
        self.n_tag = cfg.params.total_tg
        self.step_length = STEP_LENGTH
        self.seq_randomness = cfg.params.seq_randomness
        self.samples = samples
        self.q2p = q2p
        self.q2tg = q2tg
        self.q2te = q2te

        user_ids = []
        for user_id in samples.index:
            q = samples[user_id][0]
            if len(q) < 2:
                continue
            user_ids.append(user_id)
        self.user_step_ids = df[df['user_id'].isin(user_ids)]['user_step_id'].unique()

    def __len__(self):
        return len(self.user_step_ids)

    def __getitem__(self, index):
        user_step_id = self.user_step_ids[index]
        user_id, step_id = list(map(int, user_step_id.split('__')))
        
        q_, qa_, qt_, qe_, qte_ = self.samples[user_id]

        step_start, step_end = step_id * self.step_length, (step_id + 1) * self.step_length
        seq_len = len(q_[step_start: step_end])
        if step_id > 0 and seq_len < self.step_length:
            step_start = (step_id - 1) * self.step_length
        elif step_id == 0 and seq_len < self.step_length:
            step_end = random.randint(2, seq_len)

        q_ = q_[step_start: step_end]
        qa_ = qa_[step_start: step_end]
        qt_ = qt_[step_start: step_end]
        qe_ = qe_[step_start: step_end]
        qte_ = qte_[step_start: step_end]

        qt_ = qt_ / 60_000.   # ms -> m
        qe_ = qe_ / 1_000.   # ms -> s
        seq_len = len(q_)

        q = np.zeros(self.max_seq, dtype=int)
        qa = np.zeros(self.max_seq, dtype=int)
        qt = np.zeros(self.max_seq, dtype=int)
        qe = np.zeros(self.max_seq, dtype=int)
        qte = np.zeros(self.max_seq, dtype=float)
        qtg = np.zeros((self.max_seq - 1, 6), dtype=int) + self.n_tag
        if seq_len >= self.max_seq:
            start = random.randint(0, (seq_len - self.max_seq))
            end = start + self.max_seq
            q[:] = q_[start: end]
            qa[:] = qa_[start: end]
            qt[:] = qt_[start: end]
            qe[:] = qe_[start: end]
            qte[:] = qte_[start: end]
        else:
            start = 0
            end = random.randint(2, seq_len)
            seq_len = end - start
            q[-seq_len:] = q_[0: seq_len]
            qa[-seq_len:] = qa_[0: seq_len]
            qt[-seq_len:] = qt_[0: seq_len]
            qe[-seq_len:] = qe_[0: seq_len]
            qte[-seq_len:] = qte_[0: seq_len]

        target_id = np.array(q[1:].copy())
        label = qa[1:].copy()
        ac = np.array(qa[:-1].copy())
        ###
        te_content_id = qte[1:].copy()
        ###
        
        learn_start_idx = np.where(target_id > 0)[0][0]   # 変更した

        part = np.zeros(self.max_seq - 1)
        part[learn_start_idx:] = self.q2p[target_id[learn_start_idx:]]   # 変更した

        difftime = np.diff(qt.copy())
        difftime = np.where(difftime < 0, 300, difftime)
        difftime = np.log1p(difftime)

        prior_elapsed = qe[1:].copy()
        prior_elapsed = np.log1p(prior_elapsed)
        prior_elapsed = np.where(np.isnan(prior_elapsed), np.log1p(21), prior_elapsed)

        qtg[learn_start_idx:, :] = self.q2tg[target_id[learn_start_idx:]]   # 変更した
        
        ###
        te_content_id = np.where(np.isnan(te_content_id), 0.625164097637492, te_content_id)   # nanmean

        avg_u_target = np.zeros(self.max_seq - 1, dtype=float)
        ac_latest = ac[learn_start_idx:]
        avg_u_target[learn_start_idx:] = ac_latest.cumsum() / (np.arange(len(ac_latest)) + 1)
        avg_u_target = np.where(np.isnan(avg_u_target), 0, avg_u_target)
        
        num_feat = np.vstack([te_content_id, avg_u_target]).T
        ###

        feat = {
            'in_ex': torch.LongTensor(target_id),
            'in_dt': torch.FloatTensor(difftime),
            'in_el': torch.FloatTensor(prior_elapsed),
            'in_tag': torch.LongTensor(qtg),
            'in_cat': torch.LongTensor(part),
            'in_de': torch.LongTensor(ac),
            ###
            'num_feat': torch.FloatTensor(num_feat),
            ###
        }

        label = torch.FloatTensor(label)

        return feat, label


class CustomValidDataset7_2_(Dataset):
    def __init__(self, samples, df, q2p, q2tg, q2te, cfg=None):
        super(CustomValidDataset7_2_, self).__init__()
        self.max_seq = cfg.params.max_seq
        self.n_skill = cfg.params.n_skill
        self.n_tag = cfg.params.total_tg
        self.samples = samples
        self.df = df
        self.q2p = q2p
        self.q2tg = q2tg
        ###
        self.q2te = q2te
        ###

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]

        row_id = row['row_id']

        seq_list = dh.load(f'{VALID_SEQ_DIR}/row_{int(row_id)}.pkl')

        difftime = np.array(seq_list[1]) / 60_000.   # ms -> m
        difftime = np.where(difftime < 0, 300, difftime)
        difftime = np.log1p(difftime)

        prior_elapsed = np.array(seq_list[2]) / 1_000.
        prior_elapsed = np.log1p(prior_elapsed)
        prior_elapsed = np.where(np.isnan(prior_elapsed), np.log1p(21), prior_elapsed)

        content_id = np.array(seq_list[0])
        learn_start_idx = np.where(content_id > 0)[0][0]   # 変更した
        
        part = np.zeros(self.max_seq - 1)
        part[learn_start_idx:] = self.q2p[content_id[learn_start_idx:]]   # 変更した
        
        target = np.array(seq_list[3])

        qtg = np.zeros((self.max_seq - 1, 6)) + self.n_tag
        qtg[learn_start_idx:, :] = self.q2tg[content_id[learn_start_idx:]]   # 変更した
        
        ###
        avg_u_target = np.zeros(self.max_seq - 1, dtype=float)
        ac_latest = target[learn_start_idx:]
        avg_u_target[learn_start_idx:] = ac_latest.cumsum() / (np.arange(len(ac_latest)) + 1)
        avg_u_target = np.where(np.isnan(avg_u_target), 0, avg_u_target)
        
        te_content_id = np.zeros(self.max_seq - 1)
        te_content_id[learn_start_idx:] = self.q2te[content_id[learn_start_idx:]]
        te_content_id = np.where(np.isnan(te_content_id), 0.625164097637492, te_content_id)   # nanmean
        
        num_feat = np.vstack([te_content_id, avg_u_target]).T
        ###
        
        feat = {
            'in_ex': torch.LongTensor(content_id),
            'in_dt': torch.FloatTensor(difftime),
            'in_el': torch.FloatTensor(prior_elapsed),
            'in_tag': torch.LongTensor(qtg),
            'in_cat': torch.LongTensor(part),
            'in_de': torch.LongTensor(target),
            ###
            'num_feat': torch.FloatTensor(num_feat),
            ###
        }

        if TARGET_COLS[0] in self.df.columns:
            label = np.append(target[1:], [row[TARGET_COLS[0]]])
            label = torch.FloatTensor(label)
            return feat, label
        else:
            return feat

In [8]:
# https://github.com/arshadshk/SAINT-pytorch/blob/main/saint.py
class Feed_Forward_block(nn.Module):
    """
    out =  Relu( M_out*w1 + b1) *w2 + b2
    """
    def __init__(self, dim_ff):
        super().__init__()
        self.layer1 = nn.Linear(in_features=dim_ff, out_features=dim_ff)
        self.layer2 = nn.Linear(in_features=dim_ff, out_features=dim_ff)

    def forward(self, ffn_in):
        return self.layer2(F.relu(self.layer1(ffn_in)))


class Encoder_block(nn.Module):
    """
    M = SkipConct(Multihead(LayerNorm(Qin;Kin;Vin)))
    O = SkipConct(FFN(LayerNorm(M)))
    """

    def __init__(self, dim_model, heads_en, total_ex, total_cat, total_tg, seq_len):
        super().__init__()
        self.seq_len = seq_len - 1
        self.embd_ex = nn.Embedding(total_ex, embedding_dim=dim_model)
        self.embd_cat = nn.Embedding(total_cat + 1, embedding_dim=dim_model)
        self.embd_tg = nn.Embedding(total_tg + 1, embedding_dim=dim_model)
        self.embd_pos = nn.Embedding(seq_len, embedding_dim=dim_model)
        self.dt_fc = nn.Linear(1, dim_model, bias=False)
        # self.task_fc = nn.Linear(1, dim_model, bias=False)

        self.multi_en = nn.MultiheadAttention(embed_dim=dim_model, num_heads=heads_en)
        self.ffn_en = Feed_Forward_block(dim_model)
        self.layer_norm1 = nn.LayerNorm(dim_model)
        self.layer_norm2 = nn.LayerNorm(dim_model)

    def forward(self, in_ex, in_cat, in_tg, in_dt, first_block=True):
        device = in_ex.device

        if first_block:
            in_ex = self.embd_ex(in_ex)
            in_cat = self.embd_cat(in_cat)

            in_dt = in_dt.unsqueeze(-1)
            in_dt = self.dt_fc(in_dt)

            in_tg = self.embd_tg(in_tg)
            avg_in_tg_embed = in_tg.mean(dim=2)
            max_in_tg_embed = in_tg.max(dim=2).values

            # in_task = in_task.unsqueeze(-1)
            # in_task = self.task_fc(in_task)

            # combining the embedings
            # out = in_ex + in_cat + in_dt + (avg_in_tg_embed + max_in_tg_embed) + in_task
            out = in_ex + in_cat + in_dt + (avg_in_tg_embed + max_in_tg_embed)
        else:
            out = in_ex

        in_pos = get_pos(self.seq_len, device)
        in_pos = self.embd_pos(in_pos)
        out = out + in_pos

        out = out.permute(1, 0, 2)

        # Multihead attention
        n, _, _ = out.shape
        out = self.layer_norm1(out)
        skip_out = out
        out, attn_wt = self.multi_en(out, out, out,
                                     attn_mask=get_mask(seq_len=n, device=device))
        out = out + skip_out

        # feed forward
        out = out.permute(1, 0, 2)
        out = self.layer_norm2(out)
        skip_out = out
        out = self.ffn_en(out)
        out = out + skip_out

        return out


class Decoder_block(nn.Module):
    """
    M1 = SkipConct(Multihead(LayerNorm(Qin;Kin;Vin)))
    M2 = SkipConct(Multihead(LayerNorm(M1;O;O)))
    L = SkipConct(FFN(LayerNorm(M2)))
    """
    def __init__(self, dim_model, total_in, total_exp, heads_de, seq_len):
        super().__init__()
        self.seq_len = seq_len - 1
        self.embd_in = nn.Embedding(total_in, embedding_dim=dim_model)
        # self.embd_exp = nn.Embedding(total_exp, embedding_dim=dim_model)
        self.embd_pos = nn.Embedding(self.seq_len, embedding_dim=dim_model)
        self.multi_de1 = nn.MultiheadAttention(embed_dim=dim_model, num_heads=heads_de)
        self.multi_de2 = nn.MultiheadAttention(embed_dim=dim_model, num_heads=heads_de)
        self.ffn_en = Feed_Forward_block(dim_model)
        self.el_fc = nn.Linear(1, dim_model, bias=False)

        self.layer_norm1 = nn.LayerNorm(dim_model)
        self.layer_norm2 = nn.LayerNorm(dim_model)
        self.layer_norm3 = nn.LayerNorm(dim_model)

    def forward(self, in_in, in_el, en_out, first_block=True):
        device = in_in.device

        if first_block:
            in_in = self.embd_in(in_in)

            in_el = in_el.unsqueeze(-1)
            in_el = self.el_fc(in_el)
            # in_exp = self.embd_exp(in_exp)

            # out = in_in + in_el + in_exp
            out = in_in + in_el
        else:
            out = in_in

        in_pos = get_pos(self.seq_len, device)
        in_pos = self.embd_pos(in_pos)
        out = out + in_pos

        out = out.permute(1, 0, 2)
        n, _, _ = out.shape

        out = self.layer_norm1(out)
        skip_out = out
        out, attn_wt = self.multi_de1(out, out, out,
                                      attn_mask=get_mask(seq_len=n, device=device))
        out = skip_out + out

        en_out = en_out.permute(1, 0, 2)
        en_out = self.layer_norm2(en_out)
        skip_out = out
        out, attn_wt = self.multi_de2(out, en_out, en_out,
                                      attn_mask=get_mask(seq_len=n, device=device))
        out = out + skip_out

        out = out.permute(1, 0, 2)
        out = self.layer_norm3(out)
        skip_out = out
        out = self.ffn_en(out)
        out = out + skip_out

        return out


def get_clones(module, N):
    return nn.ModuleList([copy.deepcopy(module) for i in range(N)])


def get_mask(seq_len, device):
    mask = torch.from_numpy(np.triu(np.ones((seq_len, seq_len)), k=1).astype(bool)).to(device)
    return mask


def get_pos(seq_len, device):
    # use sine positional embeddinds
    return torch.arange(seq_len, device=device).unsqueeze(0)


class SAINT(nn.Module):
    def __init__(self, dim_model, num_en, num_de, heads_en, total_ex, total_cat, total_tg, total_in, total_exp,
                      heads_de, seq_len, num_fc_in_dim=2, num_fc_out_dim=32):
        super().__init__()

        self.num_en = num_en
        self.num_de = num_de

        self.encoder = get_clones(Encoder_block(dim_model, heads_en, total_ex, total_cat, total_tg, seq_len), num_en)
        self.decoder = get_clones(Decoder_block(dim_model, total_in, total_exp, heads_de, seq_len), num_de)

#         self.out = nn.Linear(in_features=dim_model, out_features=1)
        
        self.num_fc = nn.Linear(in_features=num_fc_in_dim, out_features=num_fc_out_dim)
        self.out_fc1 = nn.Linear(in_features=dim_model, out_features=num_fc_out_dim)
        self.out_fc2 = nn.Linear(in_features=num_fc_out_dim * 2, out_features=1)

    def forward(self, feat):
        in_ex = feat['in_ex']
        in_dt = feat['in_dt']
        in_el = feat['in_el']
        in_tg = feat['in_tag']
        in_cat = feat['in_cat']
        in_in = feat['in_de']
        ###
        num_feat = feat['num_feat']
        ###

        first_block = True
        for x in range(self.num_en):
            if x >= 1:
                first_block = False
            in_ex = self.encoder[x](in_ex, in_cat, in_tg, in_dt, first_block=first_block)
            in_cat = in_ex

        first_block = True
        for x in range(self.num_de):
            if x >= 1:
                first_block = False
            in_in = self.decoder[x](in_in, in_el, en_out=in_ex, first_block=first_block)

#         in_in = self.out(in_in)
        num_feat = self.num_fc(num_feat)
        in_in = self.out_fc1(in_in)
        in_in = torch.cat([in_in, num_feat], dim=2)
        in_in = self.out_fc2(in_in)
    
        return in_in.squeeze(-1)

def replace_fc(model, cfg):
    return model

class CustomModel(nn.Module):
    def __init__(self, cfg):
        super(CustomModel, self).__init__()
        self.cfg = cfg
        self.base_model = SAINT(**cfg['model']['params'])
        self.model = replace_fc(self.base_model, cfg)

    def forward(self, x):
        x = self.model(x)
        return x

In [9]:
def _train_epoch(model, train_loader, criterion, optimizer, mb):
    model.train()
    avg_loss = 0.

    for feats, targets in progress_bar(train_loader, parent=mb):
        if type(feats) == dict:
            for k, v in feats.items():
                feats[k] = v.to(device)
        else:
            feats = feats.to(device)
        targets = targets.to(device)

        preds = model(feats)

        loss = criterion(preds, targets)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        avg_loss += loss.item() / len(train_loader)
    del feats, targets; gc.collect()
    return model, avg_loss

def _val_epoch(model, valid_loader, criterion):
    model.eval()
    valid_preds = np.zeros((len(valid_loader.dataset), cfg.model.n_classes))

    avg_val_loss = 0.
    valid_batch_size = valid_loader.batch_size

    with torch.no_grad():
        for i, (feats, targets) in enumerate(valid_loader):
            if type(feats) == dict:
                for k, v in feats.items():
                    feats[k] = v.to(device)
            else:
                feats = feats.to(device)

            targets = targets.to(device)

            preds = model(feats)

            loss = criterion(preds, targets)

            preds = preds[:, -1]
            valid_preds[i * valid_batch_size: (i + 1) * valid_batch_size, :] = preds.sigmoid().cpu().detach().numpy().reshape(-1, 1)
            avg_val_loss += loss.item() / len(valid_loader)
    return valid_preds, avg_val_loss

In [10]:
def exp(cfg):
    
    train_df['step'] = train_df.groupby('user_id').cumcount() // STEP_LENGTH
    train_df['user_step_id'] = train_df['user_id'].astype(str) + '__' + train_df['step'].astype(str)

    oof = np.zeros((len(train_df), cfg.model.n_classes))
    cv = 0
    col = 'fold_0'

    trn_x, val_x = train_df[fold_df[col] == 0], train_df[fold_df[col] > 0]
    val_y = target_df[fold_df[col] > 0].values

    usecols = ['user_id', 'content_id', 'timestamp', 'prior_question_elapsed_time',
                    'answered_correctly', 'te_content_id_by_answered_correctly']
    group = (trn_x[usecols]
             .groupby('user_id')
             .apply(lambda r: (r['content_id'].values,
                                        r['answered_correctly'].values,
                                        r['timestamp'].values,
                                        r['prior_question_elapsed_time'].values,
                                        r['te_content_id_by_answered_correctly'].values)))

    dataset = CustomTrainDataset7_2_(samples=group, df=trn_x, q2p=q2p, q2tg=q2tg, q2te=q2te, cfg=cfg.data.train)
    train_loader = DataLoader(dataset, **cfg.data.train.loader)

    dataset = CustomValidDataset7_2_(samples=group, df=val_x, q2p=q2p, q2tg=q2tg, q2te=q2te, cfg=cfg.data.valid)
    valid_loader = DataLoader(dataset, **cfg.data.valid.loader)

    is_train = True
    model = CustomModel(cfg)
    model = nn.DataParallel(model)
    model = model.to(device)

    loss_func = getattr(nn, cfg.loss.name)(**cfg.loss.params)
    metric_func = auc
    optimizer = getattr(torch.optim, cfg.optimizer.name)(params=model.parameters(), **cfg.optimizer.params)
    scheduler = getattr(torch.optim.lr_scheduler, cfg.scheduler.name)(
        optimizer,
        **cfg.scheduler.params,
    )

    best_epoch = -1
    best_val_score = -np.inf
    mb = master_bar(range(cfg.model.epochs))

    train_loss_list = []
    val_loss_list = []
    val_score_list = []

    for epoch in mb:
        start_time = time.time()

        with detect_anomaly():
            model, avg_loss = _train_epoch(model, train_loader, loss_func, optimizer, mb)

        valid_preds, avg_val_loss = _val_epoch(model, valid_loader, loss_func)

        val_score = metric_func(val_y, valid_preds)

        train_loss_list.append(avg_loss)
        val_loss_list.append(avg_val_loss)
        val_score_list.append(val_score)
        scheduler.step(avg_val_loss)

        elapsed = time.time() - start_time
        mb.write(f'Epoch {epoch+1} - avg_train_loss: {avg_loss:.6f}  avg_val_loss: {avg_val_loss:.6f} val_score: {val_score:.6f} time: {elapsed:.0f}s')

        if val_score > best_val_score:
            best_epoch = epoch + 1
            best_val_score = val_score
            best_valid_preds = valid_preds
            if cfg.model.multi_gpu:
                best_model = model.module.state_dict()
            else:
                best_model = model.state_dict()
#             torch.save(best_model, './seq50_step75_model.pt')

    oof[val_x.index, :] = best_valid_preds
    cv += best_val_score * fold_df[col].max()
    
#     np.save('./seq50_step75_oof.npy', oof)

#     torch.save(best_model, f'{SAVE_DIR}/{EXP_NAME}_{run_id}/{run_id}_weight_best.pt')
#     shutil.copy('pg_transformer20_3_on_note_1.ipynb', f'{SAVE_DIR}/{EXP_NAME}_{run_id}/pg_transformer20_3_on_note_1.ipynb')
#     shutil.copy(f'../exp/000_tran/{model_name}.yml', f'{SAVE_DIR}/{EXP_NAME}_{run_id}/config.yml')

    print(f'\nEpoch {best_epoch} - val_score: {best_val_score:.6f}')

    print('\n\n===================================\n')
    print(f'CV: {cv:.6f}')
    print('\n===================================\n\n')
    return cv

In [None]:
cv = exp(cfg)

Epoch 1 - avg_train_loss: 0.393206  avg_val_loss: 0.419488 val_score: 0.773363 time: 1020s
Epoch 2 - avg_train_loss: 0.383440  avg_val_loss: 0.415616 val_score: 0.779901 time: 1017s
█

- デフォルト<br>
Epoch 1 - avg_train_loss: 0.418306  avg_val_loss: 0.420112 val_score: 0.772268 time: 1200s <br>
Epoch 2 - avg_train_loss: 0.405379  avg_val_loss: 0.416274 val_score: 0.778598 time: 1099s <br>
Epoch 3 - avg_train_loss: 0.402293  avg_val_loss: 0.415704 val_score: 0.780514 time: 1100s <br>
Epoch 4 - avg_train_loss: 0.400122  avg_val_loss: 0.411587 val_score: 0.784737 time: 1100s <br>
Epoch 5 - avg_train_loss: 0.397836  avg_val_loss: 0.410092 val_score: 0.787489 time: 1101s <br>


- tag高速化 & part修正 <br>
Epoch 1 - avg_train_loss: 0.415494  avg_val_loss: 0.420086 val_score: 0.772824 time: 976s <br>
Epoch 2 - avg_train_loss: 0.402696  avg_val_loss: 0.416425 val_score: 0.778567 time: 972s <br>
Epoch 3 - avg_train_loss: 0.399676  avg_val_loss: 0.414455 val_score: 0.782042 time: 973s <br>
Epoch 4 - avg_train_loss: 0.397366  avg_val_loss: 0.411210 val_score: 0.785679 time: 974s <br>
Epoch 5 - avg_train_loss: 0.395128  avg_val_loss: 0.409938 val_score: 0.787973 time: 972s <br>


- seq_len=50 <br>
Epoch 1 - avg_train_loss: 0.502999  avg_val_loss: 0.471466 val_score: 0.769687 time: 859s <br>
Epoch 2 - avg_train_loss: 0.489230  avg_val_loss: 0.468640 val_score: 0.775218 time: 858s <br>
Epoch 3 - avg_train_loss: 0.486275  avg_val_loss: 0.466067 val_score: 0.777586 time: 858s <br>
Epoch 4 - avg_train_loss: 0.484485  avg_val_loss: 0.464312 val_score: 0.780000 time: 856s <br>
Epoch 5 - avg_train_loss: 0.483002  avg_val_loss: 0.463606 val_score: 0.781718 time: 859s <br>


- add te, avg_u_target <br>
Epoch 1 - avg_train_loss: 0.411664  avg_val_loss: 0.419125 val_score: 0.774234 time: 1006s <br>
Epoch 2 - avg_train_loss: 0.401464  avg_val_loss: 0.414941 val_score: 0.780295 time: 1006s <br>
Epoch 3 - avg_train_loss: 0.398836  avg_val_loss: 0.413717 val_score: 0.782442 time: 1007s <br>
Epoch 4 - avg_train_loss: 0.397053  avg_val_loss: 0.413128 val_score: 0.785359 time: 1007s <br>
Epoch 5 - avg_train_loss: 0.395297  avg_val_loss: 0.409641 val_score: 0.788182 time: 1007s <br>
Epoch 6 - avg_train_loss: 0.393423  avg_val_loss: 0.408399 val_score: 0.790023 time: 1006s <br>
Epoch 7 - avg_train_loss: 0.392346  avg_val_loss: 0.407184 val_score: 0.791622 time: 1007s <br>
Epoch 8 - avg_train_loss: 0.391387  avg_val_loss: 0.407105 val_score: 0.792346 time: 1006s <br>
Epoch 9 - avg_train_loss: 0.390765  avg_val_loss: 0.405870 val_score: 0.793542 time: 1013s <br>
Epoch 10 - avg_train_loss: 0.390244  avg_val_loss: 0.406136 val_score: 0.793739 time: 1007s <br>
Epoch 11 - avg_train_loss: 0.389793  avg_val_loss: 0.405646 val_score: 0.794341 time: 1007s <br>
Epoch 12 - avg_train_loss: 0.389439  avg_val_loss: 0.405179 val_score: 0.793943 time: 1007s <br>
Epoch 13 - avg_train_loss: 0.389116  avg_val_loss: 0.404685 val_score: 0.794862 time: 1007s <br>
Epoch 14 - avg_train_loss: 0.388748  avg_val_loss: 0.404421 val_score: 0.795003 time: 1007s <br>
Epoch 15 - avg_train_loss: 0.388514  avg_val_loss: 0.404539 val_score: 0.794840 time: 1008s <br>

In [None]:
# if debug:
#     train_df = pd.read_csv(f'{INPUT_DIR}/train.csv', dtype=DTYPE, nrows=10**6)
# else:
#     train_df = pd.read_csv(f'{INPUT_DIR}/train.csv', dtype=DTYPE)

# folds = pd.read_feather(f'{FOLD_DIR}/train_folds_{FOLD_NAME}_v2_{RANDOM_STATE}.feather')
# valid_idx = folds[folds.val == 1]['index'].values
# if debug:
#     valid_idx = valid_idx[np.where(valid_idx < len(train_df))]

# fold_df = pd.DataFrame(index=range(len(train_df)))
# fold_df['fold_0'] = 0
# fold_df.loc[valid_idx, 'fold_0'] += 1

# drop_idx = train_df[train_df.content_type_id != 0].index
# train_df = train_df.drop(drop_idx, axis=0).reset_index(drop=True)
# fold_df = fold_df.drop(drop_idx, axis=0).reset_index(drop=True)

# oof = np.load('./seq50_step75_oof.npy')
# val_idx = fold_df[fold_df['fold_0'] == 1].index
# oof = oof[val_idx
          
# preds_df = pd.DataFrame(oof, columns=['preds'])
# preds_df.to_csv('./preds_val_based_seq50_step75.csv', index=False)