In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the mounted input directory tree and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/learning-agency-lab-automated-essay-scoring-2/sample_submission.csv
/kaggle/input/learning-agency-lab-automated-essay-scoring-2/train.csv
/kaggle/input/learning-agency-lab-automated-essay-scoring-2/test.csv


In [2]:
import ast
import copy
import gc
import itertools
import joblib
import json
import math
import matplotlib.pyplot as plt
import multiprocessing
import numpy as np
import os
import pandas as pd
import pickle
import random
import re
import scipy as sp
import string
import sys
import time
import warnings
import wandb
from sklearn.metrics import cohen_kappa_score
from sklearn.model_selection import StratifiedKFold, GroupKFold, KFold
import torch
import torch.nn as nn
from torch.nn import Parameter
import torch.nn.functional as F
from torch.optim import Adam, SGD, AdamW
from torch.utils.data import DataLoader, Dataset
from tqdm.auto import tqdm
# ======= OPTIONS =========
# Widen the pandas display limits so wide/long frames are not elided in cell output.
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
# Use the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Current device is: {device}")
warnings.filterwarnings("ignore")
# Create ./output idempotently: unlike `!mkdir output`, this neither prints an
# error when the cell is re-run nor depends on a shell being available.
os.makedirs("output", exist_ok=True)

Current device is: cuda


In [3]:
import tokenizers
import transformers
from transformers import AutoTokenizer, AutoModel, AutoConfig
from tokenizers import AddedToken
from transformers import get_linear_schedule_with_warmup, get_cosine_schedule_with_warmup
# Set the TOKENIZERS_PARALLELISM environment variable for this kernel process.
%env TOKENIZERS_PARALLELISM=true
# Optional version printout, kept disabled.
#print(f"tokenizers.__version__: {tokenizers.__version__}")
#print(f"transformers.__version__: {transformers.__version__}")

env: TOKENIZERS_PARALLELISM=true


In [4]:
class config:
    """Run-wide hyper-parameters and switches, read as class attributes (never instantiated here)."""

    # ---- run control ----
    DEBUG = False                  # when True, the statements after the class definitions shrink the run
    TRAIN = True                   # not read in the visible part of the notebook
    SEED = 20                      # forwarded to seed_everything()
    FOLDS = 5                      # number of StratifiedKFold splits
    TRAIN_FOLDS = [0, 1, 2, 3, 4]  # presumably the folds to train on; consumer not visible here
    PRINT_FREQ = 20                # log every PRINT_FREQ batches in the train/valid loops
    NUM_WORKERS = 0 #multiprocessing.cpu_count()

    # ---- model / tokenisation ----
    #MODEL = "google/bigbird-roberta-base"
    MODEL= "Microsoft/deberta-v3-small"
    #num_hidden_layers=13
    MAX_LEN = 512                  # tokenizer max_length (pad/truncate target)
    NUM_CLASSES = 6                # output width of the linear head
    GRADIENT_CHECKPOINTING = True  # calls gradient_checkpointing_enable() on the backbone

    # ---- batching / epochs ----
    BATCH_SIZE_TRAIN = 4
    BATCH_SIZE_VALID = 4
    EPOCHS = 2
    GRADIENT_ACCUMULATION_STEPS = 1

    # ---- optimiser ----
    APEX = True                    # enables torch.cuda.amp autocast + GradScaler in train_epoch
    BETAS = (0.9, 0.999)
    EPS = 1e-6
    WEIGHT_DECAY = 0.01
    MAX_GRAD_NORM=1000             # threshold passed to clip_grad_norm_
    MIN_LR = 2e-5                  # used as the base learning rate when the optimiser is built in train_loop
    ENCODER_LR = 2e-5              # only referenced by the disabled get_optimizer_params variant
    DECODER_LR = 2e-5              # only referenced by the disabled get_optimizer_params variant
    layerwise_learning_rate_decay= 0.9  # per-layer LR multiplier for the grouped optimiser parameters

    # ---- LR schedule ----
    BATCH_SCHEDULER = True         # step the scheduler after every optimiser step
    SCHEDULER = 'cosine' # ['linear', 'cosine']
    NUM_CYCLES = 0.5               # num_cycles for the cosine schedule
    NUM_WARMUP_STEPS = 0
class paths:
    """File-system locations used by the notebook (Kaggle layout)."""

    # Competition inputs
    TRAIN_CSV = "/kaggle/input/learning-agency-lab-automated-essay-scoring-2/train.csv"
    TEST_CSV = "/kaggle/input/learning-agency-lab-automated-essay-scoring-2/test.csv"
    # Where logs, the tokenizer copy and the saved model config are written
    OUTPUT_DIR = "/kaggle/working/output"
# Debug mode: restrict the run to fold 0. The EPOCHS assignment repeats the
# value already set on the class (2), so it currently changes nothing.
if config.DEBUG:
    config.EPOCHS = 2
    config.TRAIN_FOLDS = [0]

In [5]:
def get_optimizer_grouped_parameters(
    model,
    learning_rate, weight_decay,
    layerwise_learning_rate_decay
):
    """
    Build optimizer parameter groups with layer-wise learning-rate decay.

    Every parameter of ``model`` ends up in exactly one group:

    * group 0 (head): every parameter that lives outside the ``model.model``
      backbone (e.g. the ``fc`` layer), plus any parameter whose name contains
      "classifier" or "pooler"; ``lr = learning_rate``, no weight decay.
    * two groups per backbone layer, walking from the last encoder layer down
      to the word embeddings; the learning rate is multiplied by
      ``layerwise_learning_rate_decay`` at each step. The first group of each
      pair gets ``weight_decay``, the second (bias / LayerNorm.weight) gets 0.
    * any backbone parameter not reached above (whatever is not part of
      ``embeddings.word_embeddings`` or ``encoder.layer``) is appended at the
      lowest learning rate, again split by weight decay, so nothing is
      silently left out of the optimizer.

    :param model: module exposing ``model.model.embeddings.word_embeddings``
        and ``model.model.encoder.layer``.
    :param learning_rate: learning rate of the head group.
    :param weight_decay: weight decay for the decayed groups.
    :param layerwise_learning_rate_decay: multiplicative LR factor per layer.
    :return: list of parameter-group dicts (``params``, ``weight_decay``, ``lr``).
    """
    no_decay = ["bias", "LayerNorm.weight"]
    assigned = set()  # ids of parameters already placed in a group

    def is_no_decay(name):
        return any(nd in name for nd in no_decay)

    def is_head(name):
        # The original filter only matched "classifier"/"pooler"; a head that
        # is named differently (such as ``fc``) was never optimized.
        return (not name.startswith("model.")) or "classifier" in name or "pooler" in name

    def claim(named_params, keep):
        """Return the not-yet-assigned parameters whose name satisfies ``keep``."""
        picked = []
        for name, param in named_params:
            if id(param) in assigned or not keep(name):
                continue
            assigned.add(id(param))
            picked.append(param)
        return picked

    # initialize lr for task specific layer
    optimizer_grouped_parameters = [
        {
            "params": claim(model.named_parameters(), is_head),
            "weight_decay": 0.0,
            "lr": learning_rate,
        },
    ]
    # initialize lrs for every layer, last encoder layer first
    layers = [model.model.embeddings.word_embeddings] + list(model.model.encoder.layer)
    layers.reverse()
    lr = learning_rate
    for layer in layers:
        lr *= layerwise_learning_rate_decay
        optimizer_grouped_parameters += [
            {
                "params": claim(layer.named_parameters(), lambda n: not is_no_decay(n)),
                "weight_decay": weight_decay,
                "lr": lr,
            },
            {
                "params": claim(layer.named_parameters(), is_no_decay),
                "weight_decay": 0.0,
                "lr": lr,
            },
        ]
    # Whatever is left in the backbone trains at the lowest (embedding) rate.
    for keep, decay in ((lambda n: not is_no_decay(n), weight_decay), (is_no_decay, 0.0)):
        leftover = claim(model.named_parameters(), keep)
        if leftover:
            optimizer_grouped_parameters.append(
                {"params": leftover, "weight_decay": decay, "lr": lr}
            )
    return optimizer_grouped_parameters
"""
def get_optimizer_params(model, encoder_lr, decoder_lr, weight_decay=0.0):
    param_optimizer = list(model.named_parameters())
    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
    optimizer_parameters = [
        {'params': [p for n, p in model.model.named_parameters() if not any(nd in n for nd in no_decay)],
         'lr': encoder_lr, 'weight_decay': weight_decay},
        {'params': [p for n, p in model.model.named_parameters() if any(nd in n for nd in no_decay)],
         'lr': encoder_lr, 'weight_decay': 0.0},
        {'params': [p for n, p in model.named_parameters() if "model" not in n],
         'lr': decoder_lr, 'weight_decay': 0.0}
    ]
    return optimizer_parameters
    """
def get_logger(filename=paths.OUTPUT_DIR):
    """
    Return the module logger, writing bare messages to the stream handler's
    default stream and to ``{filename}.log``.

    Calling this more than once (e.g. re-running the notebook cell) is safe:
    handlers installed by a previous call are removed and closed first, so
    each message is emitted once instead of once per call.

    :param filename: path prefix of the log file; ".log" is appended.
    :return: the configured ``logging.Logger``.
    """
    from logging import getLogger, INFO, StreamHandler, FileHandler, Formatter
    logger = getLogger(__name__)
    logger.setLevel(INFO)
    # getLogger(__name__) always returns the same object, so handlers pile up
    # across calls unless the old ones are dropped.
    for stale in list(logger.handlers):
        logger.removeHandler(stale)
        stale.close()
    handler1 = StreamHandler()
    handler1.setFormatter(Formatter("%(message)s"))
    handler2 = FileHandler(filename=f"{filename}.log")
    handler2.setFormatter(Formatter("%(message)s"))
    logger.addHandler(handler1)
    logger.addHandler(handler2)
    return logger
def get_scheduler(cfg, optimizer, num_train_steps):
    """
    Create the learning-rate scheduler selected by ``cfg.SCHEDULER``.

    :param cfg: configuration object; reads SCHEDULER, NUM_WARMUP_STEPS and
        (for the cosine schedule) NUM_CYCLES.
    :param optimizer: optimizer whose learning rates are scheduled.
    :param num_train_steps: total number of scheduler steps.
    :return: the scheduler instance.
    :raises ValueError: if ``cfg.SCHEDULER`` is neither 'linear' nor 'cosine'
        (previously this fell through to an UnboundLocalError).
    """
    if cfg.SCHEDULER == 'linear':
        return get_linear_schedule_with_warmup(
            optimizer, num_warmup_steps=cfg.NUM_WARMUP_STEPS,
            num_training_steps=num_train_steps
        )
    if cfg.SCHEDULER == 'cosine':
        return get_cosine_schedule_with_warmup(
            optimizer, num_warmup_steps=cfg.NUM_WARMUP_STEPS,
            num_training_steps=num_train_steps, num_cycles=cfg.NUM_CYCLES
        )
    raise ValueError(
        f"Unsupported scheduler {cfg.SCHEDULER!r}; expected 'linear' or 'cosine'"
    )
def get_score(y_true, y_pred):
    """Return Cohen's kappa between ``y_true`` and ``y_pred`` with quadratic weights."""
    return cohen_kappa_score(y_true, y_pred, weights='quadratic')
def seed_everything(seed=20):
    """Seed the Python, NumPy and PyTorch (CPU and CUDA) RNGs and request deterministic cuDNN."""
    os.environ['PYTHONHASHSEED'] = str(seed)
    # The generators are independent, so the order of seeding does not matter.
    for seeder in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
        seeder(seed)
    torch.backends.cudnn.deterministic = True
def sep():
    """Print a 100-character horizontal rule."""
    divider = "-" * 100
    print(divider)
# Module-wide logger (stream + "<OUTPUT_DIR>.log" file) used by the training code below.
LOGGER = get_logger()
# Seed all RNGs once, before any data splitting or model construction.
seed_everything(seed=config.SEED)

In [6]:
# Load the competition CSVs.
train_df = pd.read_csv(paths.TRAIN_CSV, sep=',')
# Shift every score down by one (presumably to get zero-based class labels - confirm).
train_df["score"] = train_df["score"].sub(1)
test_df = pd.read_csv(paths.TEST_CSV, sep=',')

print(f"Train dataframe has shape: {train_df.shape}")
sep()
print(f"Test dataframe has shape: {test_df.shape}")
sep()

display(train_df.head())
display(test_df.head())

Train dataframe has shape: (17307, 3)
----------------------------------------------------------------------------------------------------
Test dataframe has shape: (3, 2)
----------------------------------------------------------------------------------------------------


Unnamed: 0,essay_id,full_text,score
0,000d118,Many people have car where they live. The thin...,2
1,000fe60,I am a scientist at NASA that is discussing th...,2
2,001ab80,People always wish they had the same technolog...,3
3,001bdc0,"We all heard about Venus, the planet without a...",3
4,002ba53,"Dear, State Senator\n\nThis is a letter to arg...",2


Unnamed: 0,essay_id,full_text
0,000d118,Many people have car where they live. The thin...
1,000fe60,I am a scientist at NASA that is discussing th...
2,001ab80,People always wish they had the same technolog...


In [7]:
import numpy as np
from sklearn.model_selection import StratifiedKFold
# Tag every training row with the index of the fold in which it is held out.
train_df["fold"] = -1  # placeholder, overwritten for every row by the loop below
X = train_df["full_text"]
y = train_df["score"]
# Stratify on the score; no shuffle is requested, so splits follow the file order.
skf = StratifiedKFold(n_splits=config.FOLDS)
for i, (train_index, val_index) in enumerate(skf.split(X, y)):
    # Only the validation indices are needed: they define the fold membership.
    train_df.loc[val_index, "fold"] = i
print(train_df["fold"].value_counts())

fold
0    3462
1    3462
2    3461
3    3461
4    3461
Name: count, dtype: int64


In [8]:
# Load the tokenizer that matches config.MODEL and store a copy under
# OUTPUT_DIR/tokenizer/.
tokenizer = AutoTokenizer.from_pretrained(config.MODEL)
tokenizer.save_pretrained(paths.OUTPUT_DIR + '/tokenizer/')
# Disabled experiment: register newline and double-space as extra tokens.
#tokenizer.add_tokens([AddedToken("\n", normalized=False)])
#tokenizer.add_tokens([AddedToken(" "*2, normalized=False)])
print(tokenizer)

tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/578 [00:00<?, ?B/s]

spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]

DebertaV2TokenizerFast(name_or_path='Microsoft/deberta-v3-small', vocab_size=128000, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '[CLS]', 'eos_token': '[SEP]', 'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	1: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	2: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	3: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
	128000: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}


In [9]:
def prepare_input(cfg, text, tokenizer):
    """
    Tokenize ``text`` padded/truncated to ``cfg.MAX_LEN`` and turn every field of the
    resulting encoding into a ``torch.long`` tensor.
    :param cfg: configuration object; only its MAX_LEN attribute is read here.
    :param text: the text passed to ``tokenizer.encode_plus``.
    :param tokenizer: tokenizer object exposing ``encode_plus``.
    :return inputs: the encoding returned by the tokenizer, with each value replaced
        by a torch tensor.
    """
    encode_kwargs = dict(
        return_tensors=None,
        add_special_tokens=True,
        max_length=cfg.MAX_LEN,
        padding='max_length', # TODO: check padding to max sequence in batch
        truncation=True,
    )
    inputs = tokenizer.encode_plus(text, **encode_kwargs)
    # Replace values in place so the tokenizer's own container type is returned.
    for field in list(inputs.keys()):
        inputs[field] = torch.tensor(inputs[field], dtype=torch.long) # TODO: check dtypes
    return inputs
def collate(inputs):
    """
    Cut every tensor in ``inputs`` down (along dim 1) to the largest attention-mask
    sum found in the batch. The dictionary is modified in place and returned.
    """
    longest = int(inputs["attention_mask"].sum(axis=1).max()) # batch's max sequence length
    for field in inputs:
        inputs[field] = inputs[field][:, :longest]
    return inputs
class CustomDataset(Dataset):
    """Dataset over a dataframe with 'full_text', 'score' and 'essay_id' columns."""
    def __init__(self, cfg, df, tokenizer):
        self.cfg = cfg
        self.tokenizer = tokenizer
        # Keep the raw column values so items can be looked up by position.
        self.essay_ids = df['essay_id'].values
        self.texts = df['full_text'].values
        self.labels = df['score'].values
    def __len__(self):
        return len(self.texts)
    def __getitem__(self, item):
        """Return the tokenized text, its label tensor and the essay id for position ``item``."""
        return {
            "inputs": prepare_input(self.cfg, self.texts[item], self.tokenizer),
            "labels": torch.tensor(self.labels[item], dtype=torch.long), # TODO: check dtypes
            "essay_ids": self.essay_ids[item],
        }

In [10]:
"""
class MeanPooling(nn.Module):
    def __init__(self):
        super(MeanPooling, self).__init__()
    def forward(self, last_hidden_state, attention_mask):
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
        sum_embeddings = torch.sum(last_hidden_state * input_mask_expanded, 1)
        sum_mask = input_mask_expanded.sum(1)
        sum_mask = torch.clamp(sum_mask, min=1e-9)
        mean_embeddings = sum_embeddings / sum_mask
        return mean_embeddings
class CustomModel(nn.Module):
    def __init__(self, cfg, config_path=None, pretrained=False):
        super().__init__()
        self.cfg = cfg
        # Load config by inferencing it from the model name.
        if config_path is None:
            self.config = AutoConfig.from_pretrained(cfg.MODEL, output_hidden_states=True)
            self.config.hidden_dropout = 0.
            self.config.hidden_dropout_prob = 0.
            self.config.attention_dropout = 0.
            self.config.attention_probs_dropout_prob = 0.
        # Load config from a file.
        else:
            self.config = torch.load(config_path)
        if pretrained:
            self.model = AutoModel.from_pretrained(cfg.MODEL, config=self.config)
        else:
            self.model = AutoModel(self.config)
        if self.cfg.GRADIENT_CHECKPOINTING:
            self.model.gradient_checkpointing_enable()
        # Add MeanPooling and Linear head at the end to transform the Model into a RegressionModel
        self.pool = MeanPooling()
        self.fc = nn.Linear(self.config.hidden_size, config.NUM_CLASSES)
        self._init_weights(self.fc)
    def _init_weights(self, module):
        
        #This method initializes weights for different types of layers. The type of layers
        #supported are nn.Linear, nn.Embedding and nn.LayerNorm.
        
        
  
     
        from transformers import logging
        logging.set_verbosity_warning()
        logging.set_verbosity_error()
        from transformers import AutoConfig
        from transformers import AutoModelForSequenceClassification
        
        #_model_type = 'deberta'
        _pretrained_model = 'microsoft/deberta-v3-small'
        config = AutoConfig.from_pretrained(_pretrained_model)
        #model = AutoModelForSequenceClassification.from_pretrained
        reinit_layers = 2
        deberta_model = transformers.TFAutoModel.from_pretrained(_pretrained_model, config=self.config)
        #_model_type = 'google/bigbird-roberta-base'
        if reinit_layers > 0:
            print(f'Reinitializing Last {reinit_layers} Layers ...')
            #encoder_temp = getattr(self.model, _model_type)
            for encoder_block in deberta_model.deberta.encoder.layer[-reinit_layers:]:
            #         print(f'{encoder_block}')
                for layer in encoder_block.submodules:
            #for layer in model.deberta.encoder.layer[-reinit_layers:]:
                #for module in layer.modules():
                    if isinstance(module, nn.Linear):
                        module.weight.data.normal_(mean=0.0, std=config.initializer_range)
                        if module.bias is not None:
                            module.bias.data.zero_()
                    elif isinstance(module, nn.Embedding):
                        module.weight.data.normal_(mean=0.0, std=config.initializer_range)
                        if module.padding_idx is not None:
                            module.weight.data[module.padding_idx].zero_()
                    elif isinstance(module, nn.LayerNorm):
                        module.bias.data.zero_()
                        module.weight.data.fill_(1.0)
    def feature(self, inputs):
        
        
        #This method makes a forward pass through the model, get the last hidden state (embedding)
        #and pass it through the MeanPooling layer.
        
        
        
        outputs = self.model(**inputs)
        last_hidden_states = outputs[0]
        feature = self.pool(last_hidden_states, inputs['attention_mask'])
        return feature
    def forward(self, inputs):
        
        
        #This method makes a forward pass through the model, the MeanPooling layer and finally
        #then through the Linear layer to get a regression value.
        
        
        feature = self.feature(inputs)
        output = self.fc(feature)
        return output
        """



In [11]:
class MeanPooling(nn.Module):
    """Average ``last_hidden_state`` over dim 1, weighting each position by ``attention_mask``."""
    def __init__(self):
        super().__init__()
    def forward(self, last_hidden_state, attention_mask):
        # Broadcast the mask to the shape of the hidden states.
        weights = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
        masked_total = (last_hidden_state * weights).sum(dim=1)
        # Clamp the denominator so a fully masked row divides by 1e-9 instead of 0.
        mask_total = weights.sum(dim=1).clamp(min=1e-9)
        return masked_total / mask_total
class CustomModel(nn.Module):
    """
    Transformer backbone followed by masked mean pooling and a linear head that
    outputs ``cfg.NUM_CLASSES`` values per input sequence.
    """
    def __init__(self, cfg, config_path=None, pretrained=False):
        """
        :param cfg: run configuration; reads MODEL, GRADIENT_CHECKPOINTING and NUM_CLASSES.
        :param config_path: optional path to a backbone config saved with ``torch.save``.
            When None, the config is fetched by model name and all dropout
            probabilities are set to 0.
        :param pretrained: when True load the pretrained backbone weights, otherwise
            build the architecture from the config only.
        """
        super().__init__()
        self.cfg = cfg
        # Load config by inferring it from the model name.
        if config_path is None:
            self.config = AutoConfig.from_pretrained(cfg.MODEL, output_hidden_states=True)
            self.config.hidden_dropout = 0.
            self.config.hidden_dropout_prob = 0.
            self.config.attention_dropout = 0.
            self.config.attention_probs_dropout_prob = 0.
        # Load config from a file.
        else:
            # NOTE(review): torch.load unpickles arbitrary objects; only point this
            # at config files produced by this notebook.
            self.config = torch.load(config_path)
        if pretrained:
            self.model = AutoModel.from_pretrained(cfg.MODEL, config=self.config)
        else:
            # AutoModel cannot be instantiated directly (its constructor raises);
            # from_config builds the architecture without loading weights.
            self.model = AutoModel.from_config(self.config)
        if self.cfg.GRADIENT_CHECKPOINTING:
            self.model.gradient_checkpointing_enable()
        # Mean pooling + linear head on top of the backbone. The head size comes
        # from the cfg argument rather than a global, so the class honours the
        # configuration it is given.
        self.pool = MeanPooling()
        self.fc = nn.Linear(self.config.hidden_size, cfg.NUM_CLASSES)
        self._init_weights(self.fc)
    def _init_weights(self, module):
        """
        Initialize ``module`` in place. Supported types are nn.Linear, nn.Embedding
        (normal weights with std ``config.initializer_range``) and nn.LayerNorm
        (weight 1, bias 0); other module types are left untouched.
        """
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
    def feature(self, inputs):
        """
        Run the backbone on ``inputs`` (unpacked as keyword arguments), take its first
        output (the last hidden state) and mean-pool it with inputs['attention_mask'].
        """
        outputs = self.model(**inputs)
        last_hidden_states = outputs[0]
        feature = self.pool(last_hidden_states, inputs['attention_mask'])
        return feature
    def forward(self, inputs):
        """
        Return the linear head's output for the pooled features: one row of
        ``cfg.NUM_CLASSES`` values per input sequence.
        """
        feature = self.feature(inputs)
        output = self.fc(feature)
        return output

In [12]:
class AverageMeter(object):
    """Tracks the latest value together with a running weighted sum, count and mean."""
    def __init__(self):
        self.reset()
    def reset(self):
        """Zero all statistics."""
        self.val = self.avg = self.sum = self.count = 0
    def update(self, val, n=1):
        """Record ``val`` as observed ``n`` times and refresh the running mean."""
        self.val = val
        self.count += n
        self.sum += val * n
        self.avg = self.sum / self.count
def asMinutes(s):
    """Format a duration given in seconds as '<minutes>m <seconds>s' (both truncated to integers)."""
    whole_minutes = math.floor(s / 60)
    leftover_seconds = s - whole_minutes * 60
    return '%dm %ds' % (whole_minutes, leftover_seconds)
def timeSince(since, percent):
    """
    Return '<elapsed> (remain <estimate>)', where elapsed is the time since ``since``
    and the estimate extrapolates the total from ``percent`` (fraction of work done).
    """
    elapsed = time.time() - since
    remaining = elapsed / (percent) - elapsed
    return '%s (remain %s)' % (asMinutes(elapsed), asMinutes(remaining))
def train_epoch(fold, train_loader, model, criterion, optimizer, epoch, scheduler, device):
    """
    One epoch training pass.

    :param fold: index of the current fold (not used inside this function).
    :param train_loader: iterable of batches; each batch is a dict with "inputs"
        (dict of tensors) and "labels".
    :param model: module called as ``model(inputs)``.
    :param criterion: callable ``criterion(predictions, labels)`` returning the loss.
    :param optimizer: optimizer stepped every GRADIENT_ACCUMULATION_STEPS batches.
    :param epoch: zero-based epoch index, used only for logging.
    :param scheduler: LR scheduler, stepped after each optimizer step when
        ``config.BATCH_SCHEDULER`` is set.
    :param device: device the batch tensors are moved to.
    :return: average per-batch loss over the epoch (weighted by batch size).
    """
    model.train() # set model in train mode
    scaler = torch.cuda.amp.GradScaler(enabled=config.APEX) # loss scaling for mixed precision
    losses = AverageMeter() # initiate AverageMeter to track the loss.
    start = time.time() # track the execution time.
    accumulation_steps = config.GRADIENT_ACCUMULATION_STEPS
    grad_norm = float("nan") # no gradient norm is known before the first optimizer step
    # ========== ITERATE OVER TRAIN BATCHES ============
    with tqdm(train_loader, unit="train_batch", desc='Train') as tqdm_train_loader:
        for step, batch in enumerate(tqdm_train_loader):
            inputs = batch.pop("inputs")
            labels = batch.pop("labels")
            inputs = collate(inputs) # collate inputs
            for k, v in inputs.items(): # send each tensor value to `device`
                inputs[k] = v.to(device)
            labels = labels.to(device) # send labels to `device`
            batch_size = labels.size(0)
            with torch.cuda.amp.autocast(enabled=config.APEX):
                y_preds = model(inputs) # forward propagation pass
                loss = criterion(y_preds, labels) # get loss
            # Track the true batch loss; only the value that is back-propagated
            # is divided by the accumulation factor.
            losses.update(loss.item(), batch_size)
            if accumulation_steps > 1:
                loss = loss / accumulation_steps
            scaler.scale(loss).backward() # backward propagation pass
            if (step + 1) % accumulation_steps == 0:
                # Gradients still carry the loss-scale factor here; unscale them
                # first so the clipping threshold and the logged norm refer to
                # the real gradients, and clip once per optimizer step.
                scaler.unscale_(optimizer)
                grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), config.MAX_GRAD_NORM)
                scaler.step(optimizer) # update optimizer parameters
                scaler.update()
                optimizer.zero_grad() # zero out the gradients
                if config.BATCH_SCHEDULER:
                    scheduler.step() # update learning rate
            # ========== LOG INFO ==========
            if step % config.PRINT_FREQ == 0 or step == (len(train_loader)-1):
                print('Epoch: [{0}][{1}/{2}] '
                      'Elapsed {remain:s} '
                      'Loss: {loss.avg:.4f} '
                      'Grad: {grad_norm:.4f}  '
                      'LR: {lr:.8f}  '
                      .format(epoch+1, step, len(train_loader),
                              remain=timeSince(start, float(step+1)/len(train_loader)),
                              loss=losses,
                              grad_norm=grad_norm,
                              lr=scheduler.get_last_lr()[0])) # last LR actually applied
    return losses.avg
def valid_epoch(valid_loader, model, criterion, device):
    """
    Run the model over ``valid_loader`` without gradients.

    :param valid_loader: iterable of batches; each batch is a dict with "inputs"
        (dict of tensors), "labels" and "essay_ids".
    :param model: module called as ``model(inputs)``.
    :param criterion: callable ``criterion(predictions, labels)`` returning the loss.
    :param device: device the batch tensors are moved to.
    :return: ``(average_loss, prediction_dict)``, where ``prediction_dict`` holds
        "predictions" (all batch outputs concatenated along axis 0) and
        "essay_ids" (the ids of every batch, in the same order as the predictions).
    """
    model.eval() # set model in evaluation mode
    losses = AverageMeter() # initiate AverageMeter for tracking the loss.
    preds = []
    essay_ids = []
    start = time.time() # track the execution time.
    with tqdm(valid_loader, unit="valid_batch", desc='Validation') as tqdm_valid_loader:
        for step, batch in enumerate(tqdm_valid_loader):
            inputs = batch.pop("inputs")
            labels = batch.pop("labels")
            # Collect the ids of every batch; keeping only the last batch's ids
            # would leave them misaligned with the concatenated predictions.
            essay_ids.extend(batch.pop("essay_ids"))
            inputs = collate(inputs) # collate inputs
            for k, v in inputs.items():
                inputs[k] = v.to(device) # send inputs to device
            labels = labels.to(device)
            batch_size = labels.size(0)
            with torch.no_grad():
                y_preds = model(inputs) # forward propagation pass
                loss = criterion(y_preds, labels) # get loss
            # Gradient accumulation is a training-only concern, so the
            # validation loss is tracked undivided.
            losses.update(loss.item(), batch_size) # update loss function tracking
            preds.append(y_preds.to('cpu').numpy()) # save predictions
            # ========== LOG INFO ==========
            if step % config.PRINT_FREQ == 0 or step == (len(valid_loader)-1):
                print('EVAL: [{0}/{1}] '
                      'Elapsed {remain:s} '
                      'Loss: {loss.avg:.4f} '
                      .format(step, len(valid_loader),
                              loss=losses,
                              remain=timeSince(start, float(step+1)/len(valid_loader))))
    prediction_dict = {
        # np.concatenate rejects an empty list, so an empty loader yields an empty array.
        "predictions": np.concatenate(preds) if preds else np.empty((0,)),
        "essay_ids": essay_ids,
    }
    return losses.avg, prediction_dict

In [13]:
def train_loop(folds, fold):
    """Train and validate a single cross-validation fold.

    Rows of ``folds`` whose ``fold`` column differs from ``fold`` are used for
    training, the remaining rows for validation. After every epoch the model is
    scored on the validation rows; whenever the score improves, the model state
    and the predicted class indices are written to
    ``<OUTPUT_DIR>/<MODEL>_fold_<fold>_best.pth``.

    Args:
        folds: DataFrame providing at least the columns ``fold`` and ``score``.
        fold: identifier of the fold held out for validation.

    Returns:
        The validation rows (index reset) with an added ``pred_score`` column
        holding the predicted class indices of the best-scoring epoch.
    """
    LOGGER.info(f"========== Fold: {fold} training ==========")
    # ======== SPLIT ==========
    train_folds = folds[folds['fold'] != fold].reset_index(drop=True)
    valid_folds = folds[folds['fold'] == fold].reset_index(drop=True)
    valid_labels = valid_folds["score"].values
    # ======== DATASETS ==========
    train_dataset = CustomDataset(config, train_folds, tokenizer)
    valid_dataset = CustomDataset(config, valid_folds, tokenizer)
    # ======== DATALOADERS ==========
    train_loader = DataLoader(train_dataset,
                              batch_size=config.BATCH_SIZE_TRAIN,
                              shuffle=True,
                              num_workers=config.NUM_WORKERS, pin_memory=True, drop_last=True)
    valid_loader = DataLoader(valid_dataset,
                              batch_size=config.BATCH_SIZE_VALID,
                              shuffle=False,
                              num_workers=config.NUM_WORKERS, pin_memory=True, drop_last=False)
    # ======== MODEL ==========
    model = CustomModel(config, config_path=None, pretrained=True)
    torch.save(model.config, paths.OUTPUT_DIR + '/config.pth')
    model.to(device)
    optimizer_parameters = get_optimizer_grouped_parameters(model,
                                                learning_rate=config.MIN_LR,
                                               weight_decay=config.WEIGHT_DECAY,
                                                      layerwise_learning_rate_decay= config.layerwise_learning_rate_decay)
    # The lr passed here only applies to parameter groups that do not carry their
    # own "lr" entry (standard torch.optim behaviour).
    optimizer = AdamW(
        optimizer_parameters,
        lr=(config.MIN_LR)*(config.layerwise_learning_rate_decay),
        eps=config.EPS,
        betas=config.BETAS
    )
    # NOTE(review): this count ignores drop_last and gradient accumulation --
    # confirm it matches how often train_epoch steps the scheduler.
    num_train_steps = int(len(train_folds) / config.BATCH_SIZE_TRAIN * config.EPOCHS)
    scheduler = get_scheduler(config, optimizer, num_train_steps)
    # ======= LOSS ==========
    # criterion = RMSELoss(reduction="mean") # nn.SmoothL1Loss(reduction='mean')
    criterion = nn.CrossEntropyLoss()
    softmax = nn.Softmax(dim=1)
    checkpoint_path = paths.OUTPUT_DIR + f"/{config.MODEL.replace('/', '_')}_fold_{fold}_best.pth"
    best_score = -np.inf
    best_predictions = None # class indices of the best epoch, kept in memory
    # ====== ITERATE EPOCHS ========
    for epoch in range(config.EPOCHS):
        start_time = time.time()
        # ======= TRAIN ==========
        avg_loss = train_epoch(fold, train_loader, model, criterion, optimizer, epoch, scheduler, device)
        # ======= EVALUATION ==========
        avg_val_loss, prediction_dict = valid_epoch(valid_loader, model, criterion, device)
        predictions = prediction_dict["predictions"]
        # Most probable class index per validation row.
        _, predictions = torch.max(softmax(torch.tensor(predictions)), dim=1)
        # ======= SCORING ==========
        score = get_score(valid_labels, predictions)
        elapsed = time.time() - start_time
        LOGGER.info(f'Epoch {epoch+1} - avg_train_loss: {avg_loss:.4f}  avg_val_loss: {avg_val_loss:.4f}  time: {elapsed:.0f}s')
        LOGGER.info(f'Epoch {epoch+1} - Score: {score:.4f}')
        if score > best_score:
            best_score = score
            best_predictions = predictions.cpu().numpy()
            LOGGER.info(f'Epoch {epoch+1} - Save Best Score: {best_score:.4f} Model')
            torch.save({'model': model.state_dict(),
                        'predictions': predictions},
                        checkpoint_path)
    if best_predictions is None:
        # No epoch saved a checkpoint (e.g. EPOCHS == 0 or every score was NaN):
        # fall back to whatever checkpoint already exists on disk.
        best_predictions = np.asarray(
            torch.load(checkpoint_path, map_location=torch.device('cpu'))['predictions'])
    valid_folds["pred_score"] = best_predictions
    # Drop the references first so the collector and the CUDA caching allocator
    # can actually release the memory held by this fold.
    del model, optimizer, scheduler
    gc.collect()
    torch.cuda.empty_cache()
    return valid_folds

In [14]:
if __name__ == '__main__':
    def get_result(oof_df):
        """Log the score of ``pred_score`` against ``score`` and return it."""
        labels = oof_df["score"].values
        preds = oof_df["pred_score"].values
        score = get_score(labels, preds)
        LOGGER.info(f'Score: {score:<.4f}')
        return score
    if config.TRAIN:
        # Collect the per-fold out-of-fold frames and concatenate once at the end
        # instead of growing a DataFrame that starts out empty.
        fold_frames = []
        for fold in range(config.FOLDS):
            if fold not in config.TRAIN_FOLDS:
                continue
            _oof_df = train_loop(train_df, fold)
            fold_frames.append(_oof_df)
            LOGGER.info(f"========== Fold: {fold} result ==========")
            get_result(_oof_df)
        if fold_frames:
            oof_df = pd.concat(fold_frames).reset_index(drop=True)
            LOGGER.info("========== CV ==========")
            get_result(oof_df)
            oof_df.to_csv(paths.OUTPUT_DIR + '/oof_df.csv', index=False)
        else:
            # Nothing was trained, so there is no "score"/"pred_score" column to
            # evaluate and no out-of-fold file to write.
            oof_df = pd.DataFrame()
            LOGGER.info("No folds were trained; skipping CV score and oof_df.csv")



pytorch_model.bin:   0%|          | 0.00/286M [00:00<?, ?B/s]

Train:   0%|          | 0/3461 [00:00<?, ?train_batch/s]

Epoch: [1][0/3461] Elapsed 0m 1s (remain 81m 40s) Loss: 1.8853 Grad: inf  LR: 0.00002000  
Epoch: [1][20/3461] Elapsed 0m 5s (remain 15m 58s) Loss: 1.8996 Grad: nan  LR: 0.00002000  
Epoch: [1][40/3461] Elapsed 0m 10s (remain 14m 3s) Loss: 1.8664 Grad: nan  LR: 0.00002000  
Epoch: [1][60/3461] Elapsed 0m 14s (remain 13m 22s) Loss: 1.8736 Grad: nan  LR: 0.00002000  
Epoch: [1][80/3461] Elapsed 0m 18s (remain 13m 2s) Loss: 1.8665 Grad: nan  LR: 0.00001999  
Epoch: [1][100/3461] Elapsed 0m 23s (remain 12m 46s) Loss: 1.8701 Grad: nan  LR: 0.00001999  
Epoch: [1][120/3461] Elapsed 0m 27s (remain 12m 35s) Loss: 1.8687 Grad: nan  LR: 0.00001998  
Epoch: [1][140/3461] Elapsed 0m 31s (remain 12m 26s) Loss: 1.8684 Grad: nan  LR: 0.00001998  
Epoch: [1][160/3461] Elapsed 0m 35s (remain 12m 17s) Loss: 1.8686 Grad: nan  LR: 0.00001997  
Epoch: [1][180/3461] Elapsed 0m 40s (remain 12m 9s) Loss: 1.8676 Grad: nan  LR: 0.00001997  
Epoch: [1][200/3461] Elapsed 0m 44s (remain 12m 1s) Loss: 1.8687 Grad: 

Validation:   0%|          | 0/866 [00:00<?, ?valid_batch/s]

EVAL: [0/866] Elapsed 0m 0s (remain 1m 15s) Loss: 1.8455 
EVAL: [20/866] Elapsed 0m 1s (remain 0m 51s) Loss: 1.8804 
EVAL: [40/866] Elapsed 0m 2s (remain 0m 50s) Loss: 1.8857 
EVAL: [60/866] Elapsed 0m 3s (remain 0m 49s) Loss: 1.8786 
EVAL: [80/866] Elapsed 0m 4s (remain 0m 47s) Loss: 1.8705 
EVAL: [100/866] Elapsed 0m 6s (remain 0m 46s) Loss: 1.8729 
EVAL: [120/866] Elapsed 0m 7s (remain 0m 45s) Loss: 1.8750 
EVAL: [140/866] Elapsed 0m 8s (remain 0m 43s) Loss: 1.8746 
EVAL: [160/866] Elapsed 0m 9s (remain 0m 42s) Loss: 1.8788 
EVAL: [180/866] Elapsed 0m 10s (remain 0m 41s) Loss: 1.8740 
EVAL: [200/866] Elapsed 0m 12s (remain 0m 40s) Loss: 1.8759 
EVAL: [220/866] Elapsed 0m 13s (remain 0m 38s) Loss: 1.8728 
EVAL: [240/866] Elapsed 0m 14s (remain 0m 37s) Loss: 1.8748 
EVAL: [260/866] Elapsed 0m 15s (remain 0m 36s) Loss: 1.8717 
EVAL: [280/866] Elapsed 0m 16s (remain 0m 35s) Loss: 1.8739 
EVAL: [300/866] Elapsed 0m 18s (remain 0m 34s) Loss: 1.8727 
EVAL: [320/866] Elapsed 0m 19s (remain 

Epoch 1 - avg_train_loss: 1.8690  avg_val_loss: 1.8663  time: 804s
Epoch 1 - Score: 0.0763
Epoch 1 - Save Best Score: 0.0763 Model


EVAL: [865/866] Elapsed 0m 52s (remain 0m 0s) Loss: 1.8663 


Train:   0%|          | 0/3461 [00:00<?, ?train_batch/s]

Epoch: [2][0/3461] Elapsed 0m 0s (remain 13m 52s) Loss: 1.8757 Grad: nan  LR: 0.00001000  
Epoch: [2][20/3461] Elapsed 0m 4s (remain 13m 0s) Loss: 1.8746 Grad: nan  LR: 0.00000990  
Epoch: [2][40/3461] Elapsed 0m 9s (remain 12m 48s) Loss: 1.8674 Grad: nan  LR: 0.00000981  
Epoch: [2][60/3461] Elapsed 0m 13s (remain 12m 27s) Loss: 1.8877 Grad: nan  LR: 0.00000972  
Epoch: [2][80/3461] Elapsed 0m 17s (remain 12m 25s) Loss: 1.8873 Grad: nan  LR: 0.00000963  
Epoch: [2][100/3461] Elapsed 0m 22s (remain 12m 14s) Loss: 1.8803 Grad: nan  LR: 0.00000954  
Epoch: [2][120/3461] Elapsed 0m 26s (remain 12m 10s) Loss: 1.8769 Grad: nan  LR: 0.00000945  
Epoch: [2][140/3461] Elapsed 0m 30s (remain 12m 4s) Loss: 1.8792 Grad: nan  LR: 0.00000936  
Epoch: [2][160/3461] Elapsed 0m 35s (remain 11m 59s) Loss: 1.8755 Grad: nan  LR: 0.00000927  
Epoch: [2][180/3461] Elapsed 0m 39s (remain 11m 52s) Loss: 1.8793 Grad: nan  LR: 0.00000918  
Epoch: [2][200/3461] Elapsed 0m 43s (remain 11m 48s) Loss: 1.8793 Grad:

Validation:   0%|          | 0/866 [00:00<?, ?valid_batch/s]

EVAL: [0/866] Elapsed 0m 0s (remain 1m 6s) Loss: 1.8455 
EVAL: [20/866] Elapsed 0m 1s (remain 0m 51s) Loss: 1.8804 
EVAL: [40/866] Elapsed 0m 2s (remain 0m 50s) Loss: 1.8857 
EVAL: [60/866] Elapsed 0m 3s (remain 0m 49s) Loss: 1.8786 
EVAL: [80/866] Elapsed 0m 4s (remain 0m 48s) Loss: 1.8705 
EVAL: [100/866] Elapsed 0m 6s (remain 0m 46s) Loss: 1.8729 
EVAL: [120/866] Elapsed 0m 7s (remain 0m 45s) Loss: 1.8750 
EVAL: [140/866] Elapsed 0m 8s (remain 0m 44s) Loss: 1.8746 
EVAL: [160/866] Elapsed 0m 9s (remain 0m 42s) Loss: 1.8788 
EVAL: [180/866] Elapsed 0m 10s (remain 0m 41s) Loss: 1.8740 
EVAL: [200/866] Elapsed 0m 12s (remain 0m 40s) Loss: 1.8759 
EVAL: [220/866] Elapsed 0m 13s (remain 0m 39s) Loss: 1.8728 
EVAL: [240/866] Elapsed 0m 14s (remain 0m 37s) Loss: 1.8748 
EVAL: [260/866] Elapsed 0m 15s (remain 0m 36s) Loss: 1.8717 
EVAL: [280/866] Elapsed 0m 17s (remain 0m 35s) Loss: 1.8739 
EVAL: [300/866] Elapsed 0m 18s (remain 0m 34s) Loss: 1.8727 
EVAL: [320/866] Elapsed 0m 19s (remain 0

Epoch 2 - avg_train_loss: 1.8690  avg_val_loss: 1.8663  time: 803s
Epoch 2 - Score: 0.0763


EVAL: [865/866] Elapsed 0m 52s (remain 0m 0s) Loss: 1.8663 


Score: 0.0763


Train:   0%|          | 0/3461 [00:00<?, ?train_batch/s]

Epoch: [1][0/3461] Elapsed 0m 0s (remain 13m 52s) Loss: 1.7622 Grad: inf  LR: 0.00002000  
Epoch: [1][20/3461] Elapsed 0m 4s (remain 11m 55s) Loss: 1.6464 Grad: nan  LR: 0.00002000  
Epoch: [1][40/3461] Elapsed 0m 8s (remain 12m 6s) Loss: 1.6510 Grad: nan  LR: 0.00002000  
Epoch: [1][60/3461] Elapsed 0m 12s (remain 12m 3s) Loss: 1.6759 Grad: nan  LR: 0.00002000  
Epoch: [1][80/3461] Elapsed 0m 17s (remain 11m 57s) Loss: 1.6634 Grad: nan  LR: 0.00001999  
Epoch: [1][100/3461] Elapsed 0m 21s (remain 11m 56s) Loss: 1.6571 Grad: nan  LR: 0.00001999  
Epoch: [1][120/3461] Elapsed 0m 25s (remain 11m 56s) Loss: 1.6602 Grad: nan  LR: 0.00001998  
Epoch: [1][140/3461] Elapsed 0m 30s (remain 11m 56s) Loss: 1.6569 Grad: nan  LR: 0.00001998  
Epoch: [1][160/3461] Elapsed 0m 34s (remain 11m 53s) Loss: 1.6536 Grad: nan  LR: 0.00001997  
Epoch: [1][180/3461] Elapsed 0m 39s (remain 11m 46s) Loss: 1.6531 Grad: nan  LR: 0.00001997  
Epoch: [1][200/3461] Elapsed 0m 43s (remain 11m 44s) Loss: 1.6534 Grad:

Validation:   0%|          | 0/866 [00:00<?, ?valid_batch/s]

EVAL: [0/866] Elapsed 0m 0s (remain 1m 9s) Loss: 2.1280 
EVAL: [20/866] Elapsed 0m 1s (remain 0m 52s) Loss: 1.7387 
EVAL: [40/866] Elapsed 0m 2s (remain 0m 50s) Loss: 1.7028 
EVAL: [60/866] Elapsed 0m 3s (remain 0m 49s) Loss: 1.6866 
EVAL: [80/866] Elapsed 0m 4s (remain 0m 48s) Loss: 1.6911 
EVAL: [100/866] Elapsed 0m 6s (remain 0m 47s) Loss: 1.6757 
EVAL: [120/866] Elapsed 0m 7s (remain 0m 45s) Loss: 1.6681 
EVAL: [140/866] Elapsed 0m 8s (remain 0m 44s) Loss: 1.6581 
EVAL: [160/866] Elapsed 0m 9s (remain 0m 43s) Loss: 1.6559 
EVAL: [180/866] Elapsed 0m 11s (remain 0m 41s) Loss: 1.6556 
EVAL: [200/866] Elapsed 0m 12s (remain 0m 40s) Loss: 1.6528 
EVAL: [220/866] Elapsed 0m 13s (remain 0m 39s) Loss: 1.6505 
EVAL: [240/866] Elapsed 0m 14s (remain 0m 38s) Loss: 1.6565 
EVAL: [260/866] Elapsed 0m 16s (remain 0m 37s) Loss: 1.6587 
EVAL: [280/866] Elapsed 0m 17s (remain 0m 35s) Loss: 1.6608 
EVAL: [300/866] Elapsed 0m 18s (remain 0m 34s) Loss: 1.6606 
EVAL: [320/866] Elapsed 0m 19s (remain 0

Epoch 1 - avg_train_loss: 1.6530  avg_val_loss: 1.6544  time: 805s
Epoch 1 - Score: -0.0151
Epoch 1 - Save Best Score: -0.0151 Model


EVAL: [865/866] Elapsed 0m 52s (remain 0m 0s) Loss: 1.6544 


Train:   0%|          | 0/3461 [00:00<?, ?train_batch/s]

Epoch: [2][0/3461] Elapsed 0m 0s (remain 14m 2s) Loss: 1.7795 Grad: nan  LR: 0.00001000  
Epoch: [2][20/3461] Elapsed 0m 4s (remain 12m 28s) Loss: 1.6806 Grad: nan  LR: 0.00000990  
Epoch: [2][40/3461] Elapsed 0m 9s (remain 12m 33s) Loss: 1.6570 Grad: nan  LR: 0.00000981  
Epoch: [2][60/3461] Elapsed 0m 13s (remain 12m 29s) Loss: 1.6574 Grad: nan  LR: 0.00000972  
Epoch: [2][80/3461] Elapsed 0m 17s (remain 12m 20s) Loss: 1.6596 Grad: nan  LR: 0.00000963  
Epoch: [2][100/3461] Elapsed 0m 22s (remain 12m 15s) Loss: 1.6550 Grad: nan  LR: 0.00000954  
Epoch: [2][120/3461] Elapsed 0m 26s (remain 12m 9s) Loss: 1.6520 Grad: nan  LR: 0.00000945  
Epoch: [2][140/3461] Elapsed 0m 30s (remain 12m 1s) Loss: 1.6414 Grad: nan  LR: 0.00000936  
Epoch: [2][160/3461] Elapsed 0m 34s (remain 11m 54s) Loss: 1.6377 Grad: nan  LR: 0.00000927  
Epoch: [2][180/3461] Elapsed 0m 39s (remain 11m 49s) Loss: 1.6389 Grad: nan  LR: 0.00000918  
Epoch: [2][200/3461] Elapsed 0m 43s (remain 11m 44s) Loss: 1.6405 Grad: 

Validation:   0%|          | 0/866 [00:00<?, ?valid_batch/s]

EVAL: [0/866] Elapsed 0m 0s (remain 1m 11s) Loss: 2.1280 
EVAL: [20/866] Elapsed 0m 1s (remain 0m 52s) Loss: 1.7387 
EVAL: [40/866] Elapsed 0m 2s (remain 0m 51s) Loss: 1.7028 
EVAL: [60/866] Elapsed 0m 3s (remain 0m 49s) Loss: 1.6866 
EVAL: [80/866] Elapsed 0m 4s (remain 0m 48s) Loss: 1.6911 
EVAL: [100/866] Elapsed 0m 6s (remain 0m 46s) Loss: 1.6757 
EVAL: [120/866] Elapsed 0m 7s (remain 0m 45s) Loss: 1.6681 
EVAL: [140/866] Elapsed 0m 8s (remain 0m 44s) Loss: 1.6581 
EVAL: [160/866] Elapsed 0m 9s (remain 0m 43s) Loss: 1.6559 
EVAL: [180/866] Elapsed 0m 11s (remain 0m 41s) Loss: 1.6556 
EVAL: [200/866] Elapsed 0m 12s (remain 0m 40s) Loss: 1.6528 
EVAL: [220/866] Elapsed 0m 13s (remain 0m 39s) Loss: 1.6505 
EVAL: [240/866] Elapsed 0m 14s (remain 0m 38s) Loss: 1.6565 
EVAL: [260/866] Elapsed 0m 15s (remain 0m 36s) Loss: 1.6587 
EVAL: [280/866] Elapsed 0m 17s (remain 0m 35s) Loss: 1.6608 
EVAL: [300/866] Elapsed 0m 18s (remain 0m 34s) Loss: 1.6606 
EVAL: [320/866] Elapsed 0m 19s (remain 

Epoch 2 - avg_train_loss: 1.6530  avg_val_loss: 1.6544  time: 803s
Epoch 2 - Score: -0.0151


EVAL: [865/866] Elapsed 0m 52s (remain 0m 0s) Loss: 1.6544 


Score: -0.0151


Train:   0%|          | 0/3461 [00:00<?, ?train_batch/s]

Epoch: [1][0/3461] Elapsed 0m 0s (remain 14m 26s) Loss: 2.3740 Grad: inf  LR: 0.00002000  
Epoch: [1][20/3461] Elapsed 0m 4s (remain 13m 1s) Loss: 2.1342 Grad: nan  LR: 0.00002000  
Epoch: [1][40/3461] Elapsed 0m 9s (remain 12m 39s) Loss: 2.0829 Grad: nan  LR: 0.00002000  
Epoch: [1][60/3461] Elapsed 0m 13s (remain 12m 28s) Loss: 2.0918 Grad: nan  LR: 0.00002000  
Epoch: [1][80/3461] Elapsed 0m 17s (remain 12m 20s) Loss: 2.0753 Grad: nan  LR: 0.00001999  
Epoch: [1][100/3461] Elapsed 0m 22s (remain 12m 16s) Loss: 2.0739 Grad: nan  LR: 0.00001999  
Epoch: [1][120/3461] Elapsed 0m 26s (remain 12m 9s) Loss: 2.0787 Grad: nan  LR: 0.00001998  
Epoch: [1][140/3461] Elapsed 0m 30s (remain 12m 6s) Loss: 2.0818 Grad: nan  LR: 0.00001998  
Epoch: [1][160/3461] Elapsed 0m 35s (remain 12m 1s) Loss: 2.0782 Grad: nan  LR: 0.00001997  
Epoch: [1][180/3461] Elapsed 0m 39s (remain 11m 55s) Loss: 2.0767 Grad: nan  LR: 0.00001997  
Epoch: [1][200/3461] Elapsed 0m 43s (remain 11m 47s) Loss: 2.0708 Grad: n

Validation:   0%|          | 0/866 [00:00<?, ?valid_batch/s]

EVAL: [0/866] Elapsed 0m 0s (remain 1m 6s) Loss: 2.0380 
EVAL: [20/866] Elapsed 0m 1s (remain 0m 53s) Loss: 2.2079 
EVAL: [40/866] Elapsed 0m 2s (remain 0m 51s) Loss: 2.2035 
EVAL: [60/866] Elapsed 0m 3s (remain 0m 49s) Loss: 2.1643 
EVAL: [80/866] Elapsed 0m 4s (remain 0m 47s) Loss: 2.1423 
EVAL: [100/866] Elapsed 0m 6s (remain 0m 46s) Loss: 2.1204 
EVAL: [120/866] Elapsed 0m 7s (remain 0m 45s) Loss: 2.1131 
EVAL: [140/866] Elapsed 0m 8s (remain 0m 44s) Loss: 2.1071 
EVAL: [160/866] Elapsed 0m 9s (remain 0m 42s) Loss: 2.0984 
EVAL: [180/866] Elapsed 0m 11s (remain 0m 41s) Loss: 2.1022 
EVAL: [200/866] Elapsed 0m 12s (remain 0m 40s) Loss: 2.0930 
EVAL: [220/866] Elapsed 0m 13s (remain 0m 39s) Loss: 2.0893 
EVAL: [240/866] Elapsed 0m 14s (remain 0m 38s) Loss: 2.0887 
EVAL: [260/866] Elapsed 0m 15s (remain 0m 36s) Loss: 2.0833 
EVAL: [280/866] Elapsed 0m 17s (remain 0m 35s) Loss: 2.0820 
EVAL: [300/866] Elapsed 0m 18s (remain 0m 34s) Loss: 2.0831 
EVAL: [320/866] Elapsed 0m 19s (remain 0

Epoch 1 - avg_train_loss: 2.0659  avg_val_loss: 2.0655  time: 802s
Epoch 1 - Score: -0.0738
Epoch 1 - Save Best Score: -0.0738 Model


EVAL: [865/866] Elapsed 0m 52s (remain 0m 0s) Loss: 2.0655 


Train:   0%|          | 0/3461 [00:00<?, ?train_batch/s]

Epoch: [2][0/3461] Elapsed 0m 0s (remain 13m 57s) Loss: 2.1550 Grad: nan  LR: 0.00001000  
Epoch: [2][20/3461] Elapsed 0m 4s (remain 12m 58s) Loss: 2.1089 Grad: nan  LR: 0.00000991  
Epoch: [2][40/3461] Elapsed 0m 9s (remain 12m 35s) Loss: 2.0753 Grad: nan  LR: 0.00000982  
Epoch: [2][60/3461] Elapsed 0m 13s (remain 12m 23s) Loss: 2.0765 Grad: nan  LR: 0.00000973  
Epoch: [2][80/3461] Elapsed 0m 17s (remain 12m 13s) Loss: 2.0773 Grad: nan  LR: 0.00000963  
Epoch: [2][100/3461] Elapsed 0m 21s (remain 12m 11s) Loss: 2.0716 Grad: nan  LR: 0.00000954  
Epoch: [2][120/3461] Elapsed 0m 26s (remain 12m 8s) Loss: 2.0623 Grad: nan  LR: 0.00000945  
Epoch: [2][140/3461] Elapsed 0m 30s (remain 12m 5s) Loss: 2.0594 Grad: nan  LR: 0.00000936  
Epoch: [2][160/3461] Elapsed 0m 35s (remain 12m 0s) Loss: 2.0646 Grad: nan  LR: 0.00000927  
Epoch: [2][180/3461] Elapsed 0m 39s (remain 11m 57s) Loss: 2.0655 Grad: nan  LR: 0.00000918  
Epoch: [2][200/3461] Elapsed 0m 43s (remain 11m 52s) Loss: 2.0594 Grad: 

Validation:   0%|          | 0/866 [00:00<?, ?valid_batch/s]

EVAL: [0/866] Elapsed 0m 0s (remain 1m 9s) Loss: 2.0380 
EVAL: [20/866] Elapsed 0m 1s (remain 0m 53s) Loss: 2.2079 
EVAL: [40/866] Elapsed 0m 2s (remain 0m 51s) Loss: 2.2035 
EVAL: [60/866] Elapsed 0m 3s (remain 0m 49s) Loss: 2.1643 
EVAL: [80/866] Elapsed 0m 4s (remain 0m 47s) Loss: 2.1423 
EVAL: [100/866] Elapsed 0m 6s (remain 0m 46s) Loss: 2.1204 
EVAL: [120/866] Elapsed 0m 7s (remain 0m 45s) Loss: 2.1131 
EVAL: [140/866] Elapsed 0m 8s (remain 0m 44s) Loss: 2.1071 
EVAL: [160/866] Elapsed 0m 9s (remain 0m 42s) Loss: 2.0984 
EVAL: [180/866] Elapsed 0m 10s (remain 0m 41s) Loss: 2.1022 
EVAL: [200/866] Elapsed 0m 12s (remain 0m 40s) Loss: 2.0930 
EVAL: [220/866] Elapsed 0m 13s (remain 0m 39s) Loss: 2.0893 
EVAL: [240/866] Elapsed 0m 14s (remain 0m 38s) Loss: 2.0887 
EVAL: [260/866] Elapsed 0m 15s (remain 0m 36s) Loss: 2.0833 
EVAL: [280/866] Elapsed 0m 17s (remain 0m 35s) Loss: 2.0820 
EVAL: [300/866] Elapsed 0m 18s (remain 0m 34s) Loss: 2.0831 
EVAL: [320/866] Elapsed 0m 19s (remain 0

Epoch 2 - avg_train_loss: 2.0659  avg_val_loss: 2.0655  time: 803s
Epoch 2 - Score: -0.0738


EVAL: [865/866] Elapsed 0m 52s (remain 0m 0s) Loss: 2.0655 


Score: -0.0738


Train:   0%|          | 0/3461 [00:00<?, ?train_batch/s]

Epoch: [1][0/3461] Elapsed 0m 0s (remain 12m 17s) Loss: 1.6658 Grad: inf  LR: 0.00002000  
Epoch: [1][20/3461] Elapsed 0m 4s (remain 12m 11s) Loss: 1.8836 Grad: nan  LR: 0.00002000  
Epoch: [1][40/3461] Elapsed 0m 8s (remain 12m 16s) Loss: 1.9140 Grad: nan  LR: 0.00002000  
Epoch: [1][60/3461] Elapsed 0m 13s (remain 12m 17s) Loss: 1.9120 Grad: nan  LR: 0.00002000  
Epoch: [1][80/3461] Elapsed 0m 17s (remain 12m 17s) Loss: 1.9233 Grad: nan  LR: 0.00001999  
Epoch: [1][100/3461] Elapsed 0m 22s (remain 12m 12s) Loss: 1.9290 Grad: nan  LR: 0.00001999  
Epoch: [1][120/3461] Elapsed 0m 26s (remain 12m 6s) Loss: 1.9364 Grad: nan  LR: 0.00001998  
Epoch: [1][140/3461] Elapsed 0m 30s (remain 12m 0s) Loss: 1.9361 Grad: nan  LR: 0.00001998  
Epoch: [1][160/3461] Elapsed 0m 34s (remain 11m 56s) Loss: 1.9249 Grad: nan  LR: 0.00001997  
Epoch: [1][180/3461] Elapsed 0m 39s (remain 11m 50s) Loss: 1.9283 Grad: nan  LR: 0.00001997  
Epoch: [1][200/3461] Elapsed 0m 43s (remain 11m 46s) Loss: 1.9364 Grad:

Validation:   0%|          | 0/866 [00:00<?, ?valid_batch/s]

EVAL: [0/866] Elapsed 0m 0s (remain 1m 4s) Loss: 1.5529 
EVAL: [20/866] Elapsed 0m 1s (remain 0m 51s) Loss: 2.0396 
EVAL: [40/866] Elapsed 0m 2s (remain 0m 50s) Loss: 1.9471 
EVAL: [60/866] Elapsed 0m 3s (remain 0m 49s) Loss: 1.9398 
EVAL: [80/866] Elapsed 0m 4s (remain 0m 48s) Loss: 1.9250 
EVAL: [100/866] Elapsed 0m 6s (remain 0m 47s) Loss: 1.9101 
EVAL: [120/866] Elapsed 0m 7s (remain 0m 45s) Loss: 1.9069 
EVAL: [140/866] Elapsed 0m 8s (remain 0m 44s) Loss: 1.9117 
EVAL: [160/866] Elapsed 0m 9s (remain 0m 43s) Loss: 1.9181 
EVAL: [180/866] Elapsed 0m 11s (remain 0m 42s) Loss: 1.9275 
EVAL: [200/866] Elapsed 0m 12s (remain 0m 40s) Loss: 1.9215 
EVAL: [220/866] Elapsed 0m 13s (remain 0m 39s) Loss: 1.9246 
EVAL: [240/866] Elapsed 0m 14s (remain 0m 38s) Loss: 1.9272 
EVAL: [260/866] Elapsed 0m 16s (remain 0m 37s) Loss: 1.9297 
EVAL: [280/866] Elapsed 0m 17s (remain 0m 36s) Loss: 1.9252 
EVAL: [300/866] Elapsed 0m 18s (remain 0m 34s) Loss: 1.9206 
EVAL: [320/866] Elapsed 0m 19s (remain 0

Epoch 1 - avg_train_loss: 1.9168  avg_val_loss: 1.9155  time: 806s
Epoch 1 - Score: 0.0262
Epoch 1 - Save Best Score: 0.0262 Model


EVAL: [865/866] Elapsed 0m 53s (remain 0m 0s) Loss: 1.9155 


Train:   0%|          | 0/3461 [00:00<?, ?train_batch/s]

Epoch: [2][0/3461] Elapsed 0m 0s (remain 14m 2s) Loss: 1.5769 Grad: nan  LR: 0.00001000  
Epoch: [2][20/3461] Elapsed 0m 4s (remain 12m 31s) Loss: 1.8922 Grad: nan  LR: 0.00000991  
Epoch: [2][40/3461] Elapsed 0m 9s (remain 12m 36s) Loss: 1.8926 Grad: nan  LR: 0.00000982  
Epoch: [2][60/3461] Elapsed 0m 13s (remain 12m 15s) Loss: 1.9007 Grad: nan  LR: 0.00000973  
Epoch: [2][80/3461] Elapsed 0m 17s (remain 12m 8s) Loss: 1.8871 Grad: nan  LR: 0.00000963  
Epoch: [2][100/3461] Elapsed 0m 21s (remain 12m 6s) Loss: 1.8948 Grad: nan  LR: 0.00000954  
Epoch: [2][120/3461] Elapsed 0m 26s (remain 12m 4s) Loss: 1.8998 Grad: nan  LR: 0.00000945  
Epoch: [2][140/3461] Elapsed 0m 30s (remain 12m 1s) Loss: 1.9014 Grad: nan  LR: 0.00000936  
Epoch: [2][160/3461] Elapsed 0m 35s (remain 11m 57s) Loss: 1.9113 Grad: nan  LR: 0.00000927  
Epoch: [2][180/3461] Elapsed 0m 39s (remain 11m 53s) Loss: 1.9080 Grad: nan  LR: 0.00000918  
Epoch: [2][200/3461] Elapsed 0m 43s (remain 11m 48s) Loss: 1.9089 Grad: na

Validation:   0%|          | 0/866 [00:00<?, ?valid_batch/s]

EVAL: [0/866] Elapsed 0m 0s (remain 1m 8s) Loss: 1.5529 
EVAL: [20/866] Elapsed 0m 1s (remain 0m 51s) Loss: 2.0396 
EVAL: [40/866] Elapsed 0m 2s (remain 0m 50s) Loss: 1.9471 
EVAL: [60/866] Elapsed 0m 3s (remain 0m 49s) Loss: 1.9398 
EVAL: [80/866] Elapsed 0m 4s (remain 0m 47s) Loss: 1.9250 
EVAL: [100/866] Elapsed 0m 6s (remain 0m 46s) Loss: 1.9101 
EVAL: [120/866] Elapsed 0m 7s (remain 0m 45s) Loss: 1.9069 
EVAL: [140/866] Elapsed 0m 8s (remain 0m 44s) Loss: 1.9117 
EVAL: [160/866] Elapsed 0m 9s (remain 0m 43s) Loss: 1.9181 
EVAL: [180/866] Elapsed 0m 11s (remain 0m 41s) Loss: 1.9275 
EVAL: [200/866] Elapsed 0m 12s (remain 0m 40s) Loss: 1.9215 
EVAL: [220/866] Elapsed 0m 13s (remain 0m 39s) Loss: 1.9246 
EVAL: [240/866] Elapsed 0m 14s (remain 0m 38s) Loss: 1.9272 
EVAL: [260/866] Elapsed 0m 16s (remain 0m 37s) Loss: 1.9297 
EVAL: [280/866] Elapsed 0m 17s (remain 0m 35s) Loss: 1.9252 
EVAL: [300/866] Elapsed 0m 18s (remain 0m 34s) Loss: 1.9206 
EVAL: [320/866] Elapsed 0m 19s (remain 0

Epoch 2 - avg_train_loss: 1.9167  avg_val_loss: 1.9155  time: 805s
Epoch 2 - Score: 0.0262


EVAL: [865/866] Elapsed 0m 52s (remain 0m 0s) Loss: 1.9155 


Score: 0.0262


Train:   0%|          | 0/3461 [00:00<?, ?train_batch/s]

Epoch: [1][0/3461] Elapsed 0m 0s (remain 14m 6s) Loss: 1.4993 Grad: inf  LR: 0.00002000  
Epoch: [1][20/3461] Elapsed 0m 4s (remain 12m 58s) Loss: 1.6444 Grad: nan  LR: 0.00002000  
Epoch: [1][40/3461] Elapsed 0m 9s (remain 12m 51s) Loss: 1.6557 Grad: nan  LR: 0.00002000  
Epoch: [1][60/3461] Elapsed 0m 13s (remain 12m 45s) Loss: 1.6391 Grad: nan  LR: 0.00002000  
Epoch: [1][80/3461] Elapsed 0m 18s (remain 12m 36s) Loss: 1.6353 Grad: nan  LR: 0.00001999  
Epoch: [1][100/3461] Elapsed 0m 22s (remain 12m 25s) Loss: 1.6300 Grad: nan  LR: 0.00001999  
Epoch: [1][120/3461] Elapsed 0m 26s (remain 12m 16s) Loss: 1.6384 Grad: nan  LR: 0.00001998  
Epoch: [1][140/3461] Elapsed 0m 31s (remain 12m 12s) Loss: 1.6409 Grad: nan  LR: 0.00001998  
Epoch: [1][160/3461] Elapsed 0m 35s (remain 12m 6s) Loss: 1.6441 Grad: nan  LR: 0.00001997  
Epoch: [1][180/3461] Elapsed 0m 39s (remain 11m 58s) Loss: 1.6433 Grad: nan  LR: 0.00001997  
Epoch: [1][200/3461] Elapsed 0m 43s (remain 11m 53s) Loss: 1.6418 Grad:

Validation:   0%|          | 0/866 [00:00<?, ?valid_batch/s]

EVAL: [0/866] Elapsed 0m 0s (remain 1m 15s) Loss: 1.7488 
EVAL: [20/866] Elapsed 0m 1s (remain 0m 54s) Loss: 1.5532 
EVAL: [40/866] Elapsed 0m 2s (remain 0m 52s) Loss: 1.5821 
EVAL: [60/866] Elapsed 0m 3s (remain 0m 50s) Loss: 1.5704 
EVAL: [80/866] Elapsed 0m 5s (remain 0m 49s) Loss: 1.5926 
EVAL: [100/866] Elapsed 0m 6s (remain 0m 48s) Loss: 1.6081 
EVAL: [120/866] Elapsed 0m 7s (remain 0m 46s) Loss: 1.6160 
EVAL: [140/866] Elapsed 0m 8s (remain 0m 45s) Loss: 1.6185 
EVAL: [160/866] Elapsed 0m 10s (remain 0m 43s) Loss: 1.6194 
EVAL: [180/866] Elapsed 0m 11s (remain 0m 42s) Loss: 1.6221 
EVAL: [200/866] Elapsed 0m 12s (remain 0m 41s) Loss: 1.6196 
EVAL: [220/866] Elapsed 0m 13s (remain 0m 39s) Loss: 1.6261 
EVAL: [240/866] Elapsed 0m 14s (remain 0m 38s) Loss: 1.6269 
EVAL: [260/866] Elapsed 0m 16s (remain 0m 37s) Loss: 1.6304 
EVAL: [280/866] Elapsed 0m 17s (remain 0m 36s) Loss: 1.6301 
EVAL: [300/866] Elapsed 0m 18s (remain 0m 34s) Loss: 1.6315 
EVAL: [320/866] Elapsed 0m 19s (remain

Epoch 1 - avg_train_loss: 1.6291  avg_val_loss: 1.6310  time: 805s
Epoch 1 - Score: 0.0598
Epoch 1 - Save Best Score: 0.0598 Model


EVAL: [865/866] Elapsed 0m 53s (remain 0m 0s) Loss: 1.6310 


Train:   0%|          | 0/3461 [00:00<?, ?train_batch/s]

Epoch: [2][0/3461] Elapsed 0m 0s (remain 14m 23s) Loss: 1.4280 Grad: nan  LR: 0.00001000  
Epoch: [2][20/3461] Elapsed 0m 4s (remain 12m 53s) Loss: 1.6313 Grad: nan  LR: 0.00000991  
Epoch: [2][40/3461] Elapsed 0m 9s (remain 12m 42s) Loss: 1.6190 Grad: nan  LR: 0.00000982  
Epoch: [2][60/3461] Elapsed 0m 13s (remain 12m 37s) Loss: 1.6095 Grad: nan  LR: 0.00000973  
Epoch: [2][80/3461] Elapsed 0m 17s (remain 12m 27s) Loss: 1.6212 Grad: nan  LR: 0.00000963  
Epoch: [2][100/3461] Elapsed 0m 22s (remain 12m 19s) Loss: 1.6179 Grad: nan  LR: 0.00000954  
Epoch: [2][120/3461] Elapsed 0m 26s (remain 12m 9s) Loss: 1.6280 Grad: nan  LR: 0.00000945  
Epoch: [2][140/3461] Elapsed 0m 30s (remain 12m 4s) Loss: 1.6429 Grad: nan  LR: 0.00000936  
Epoch: [2][160/3461] Elapsed 0m 34s (remain 11m 56s) Loss: 1.6443 Grad: nan  LR: 0.00000927  
Epoch: [2][180/3461] Elapsed 0m 39s (remain 11m 51s) Loss: 1.6404 Grad: nan  LR: 0.00000918  
Epoch: [2][200/3461] Elapsed 0m 43s (remain 11m 48s) Loss: 1.6450 Grad:

Validation:   0%|          | 0/866 [00:00<?, ?valid_batch/s]

EVAL: [0/866] Elapsed 0m 0s (remain 1m 10s) Loss: 1.7488 
EVAL: [20/866] Elapsed 0m 1s (remain 0m 53s) Loss: 1.5532 
EVAL: [40/866] Elapsed 0m 2s (remain 0m 51s) Loss: 1.5821 
EVAL: [60/866] Elapsed 0m 3s (remain 0m 49s) Loss: 1.5704 
EVAL: [80/866] Elapsed 0m 5s (remain 0m 48s) Loss: 1.5926 
EVAL: [100/866] Elapsed 0m 6s (remain 0m 47s) Loss: 1.6081 
EVAL: [120/866] Elapsed 0m 7s (remain 0m 45s) Loss: 1.6160 
EVAL: [140/866] Elapsed 0m 8s (remain 0m 44s) Loss: 1.6185 
EVAL: [160/866] Elapsed 0m 9s (remain 0m 43s) Loss: 1.6194 
EVAL: [180/866] Elapsed 0m 11s (remain 0m 41s) Loss: 1.6221 
EVAL: [200/866] Elapsed 0m 12s (remain 0m 40s) Loss: 1.6196 
EVAL: [220/866] Elapsed 0m 13s (remain 0m 39s) Loss: 1.6261 
EVAL: [240/866] Elapsed 0m 14s (remain 0m 38s) Loss: 1.6269 
EVAL: [260/866] Elapsed 0m 15s (remain 0m 36s) Loss: 1.6304 
EVAL: [280/866] Elapsed 0m 17s (remain 0m 35s) Loss: 1.6301 
EVAL: [300/866] Elapsed 0m 18s (remain 0m 34s) Loss: 1.6315 
EVAL: [320/866] Elapsed 0m 19s (remain 

Epoch 2 - avg_train_loss: 1.6290  avg_val_loss: 1.6310  time: 804s
Epoch 2 - Score: 0.0598


EVAL: [865/866] Elapsed 0m 52s (remain 0m 0s) Loss: 1.6310 


Score: 0.0598
Score: -0.0048
