# About this notebook
- Deberta-base starter code
- pip wheels are [here](https://www.kaggle.com/yasufuminakama/nbme-pip-wheels)
- Inference notebook is [here](https://www.kaggle.com/yasufuminakama/nbme-deberta-base-baseline-inference)

If this notebook is helpful, feel free to upvote :)

# Library

In [1]:
# ====================================================
# Library
# ====================================================
import os
import gc
import ast
import time

from utils.utils import (micro_f1, spans_to_binary, span_micro_f1, create_labels_for_scoring,
                        get_char_probs, get_results, get_predictions, get_score, get_logger, seed_everything,
                        get_result)
from utils.load_data import load_data
from utils.dataset import TrainDataset
from utils.dice_loss import DiceLoss
from utils.train import train_fn, valid_fn, get_optimizer_params, get_scheduler
from models import (DebertaBaseLastFourLayer, DebertaBaseLastMLMLastFourLayer,
                    DebertaBaseLastMLM, DebertaBaseVanila)

import pandas as pd
from tqdm.auto import tqdm
from sklearn.model_selection import GroupKFold

import torch
import torch.nn as nn
from torch.optim import AdamW
from torch.utils.data import DataLoader

import transformers
print(f"transformers.__version__: {transformers.__version__}")
from transformers import AutoTokenizer, AutoModel, AutoConfig
from transformers import get_linear_schedule_with_warmup, get_cosine_schedule_with_warmup
%env TOKENIZERS_PARALLELISM=true

# Use the first CUDA GPU when PyTorch reports one, otherwise fall back to the CPU.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

transformers.__version__: 4.19.0.dev0
env: TOKENIZERS_PARALLELISM=true


# CFG

In [2]:
class CFG:
    """Base training configuration shared by every model variant in this notebook.

    Settings are plain class attributes that are read straight off the class
    (e.g. ``CFG.batch_size``); the code in this notebook passes the class
    itself to helpers rather than an instance. Several attributes are not read
    in the notebook cells and are presumably consumed by the imported helpers
    (``utils.train``, ``utils.dataset``, ``models``) - those are marked below.
    """
    # --- logging / data loading ---
    print_freq=100  # presumably the step interval between progress log lines in train_fn/valid_fn - TODO confirm
    num_workers=4  # worker processes for both DataLoaders in train_loop
    # --- learning-rate schedule (not read in this notebook; presumably used by get_scheduler / train_fn) ---
    scheduler='cosine' # ['linear', 'cosine']
    batch_scheduler=True
    num_cycles=0.5
    num_warmup_steps=0
    # --- optimizer ---
    encoder_lr=2e-5  # passed to get_optimizer_params and also used as AdamW's default lr
    decoder_lr=2e-5  # passed to get_optimizer_params
    min_lr=1e-6  # not read in this notebook
    eps=1e-6  # AdamW eps
    betas=(0.9, 0.999)  # AdamW betas
    apex=True  # not read in this notebook; presumably toggles mixed precision in train_fn - TODO confirm
    
    # --- training loop ---
    epochs=15  # number of epochs run per fold in train_loop
    batch_size=4 #32
    fc_dropout=0.2  # not read in this notebook; presumably used by the model classes
    max_len=512  # placeholder: the "Define max_len" cell overwrites this with a value measured from the data
    weight_decay=0.01  # passed to get_optimizer_params
    gradient_accumulation_steps=1  # not read in this notebook; presumably used by train_fn
    max_grad_norm=1000  # not read in this notebook; presumably used by train_fn
    seed=42  # intended random seed for the run
    # --- cross-validation ---
    n_fold=5  # number of GroupKFold splits and upper bound of the fold loop
    trn_fold=[0, 1, 2, 3, 4]  # fold ids that are actually trained
    # Loaded once when the class body executes; subclasses inherit the same tokenizer object.
    tokenizer = AutoTokenizer.from_pretrained("microsoft/deberta-base")
    


# One config subclass per model variant. Each inherits every setting from CFG
# and adds:
#   path  - used to build the output directory name and the checkpoint file name
#   model - the model class that train_loop instantiates via CFG.model(...)
class CFG_DebertaBaseVanila(CFG):
    path="DebertaBaseVanila"
    model=DebertaBaseVanila

class CFG_DebertaBaseLastFourLayer(CFG):
    path="DebertaBaseLastFourLayer"
    model=DebertaBaseLastFourLayer

class CFG_DebertaBaseLastMLM(CFG):
    path="DebertaBaseLastMLM"
    model=DebertaBaseLastMLM

class CFG_DebertaBaseLastMLMLastFourLayer(CFG):
    path="DebertaBaseLastMLMLastFourLayer"
    model=DebertaBaseLastMLMLastFourLayer
    # Only this variant defines dropout_num; presumably read by its model class - TODO confirm.
    dropout_num = 4
    
# Configs are trained in this order by the training cell.
CFGs = [
    CFG_DebertaBaseVanila,
    CFG_DebertaBaseLastFourLayer,
    CFG_DebertaBaseLastMLM,
    CFG_DebertaBaseLastMLMLastFourLayer
]

# Data Loading

In [3]:
# ====================================================
# Data Loading
# ====================================================
# load_data() (from utils.load_data) returns three objects, unpacked here as
# the annotated training rows, the patient notes and the feature definitions.
train, patient_notes, features = load_data()
# Last expression of the cell: displays the first rows of `train`.
train.head()

Unnamed: 0,id,case_num,pn_num,feature_num,annotation,location,feature_text,pn_history,annotation_length
0,00016_000,0,16,0,[dad with recent heart attcak],[696 724],Family-history-of-MI-OR-Family-history-of-myoc...,HPI: 17yo M presents with palpitations. Patien...,1
1,00016_001,0,16,1,"[mom with ""thyroid disease]",[668 693],Family-history-of-thyroid-disorder,HPI: 17yo M presents with palpitations. Patien...,1
2,00016_002,0,16,2,[chest pressure],[203 217],Chest-pressure,HPI: 17yo M presents with palpitations. Patien...,1
3,00016_003,0,16,3,"[intermittent episodes, episode]","[70 91, 176 183]",Intermittent-symptoms,HPI: 17yo M presents with palpitations. Patien...,2
4,00016_004,0,16,4,[felt as if he were going to pass out],[222 258],Lightheaded,HPI: 17yo M presents with palpitations. Patien...,1


In [4]:
# Print one training row (index label 125) field by field to show what a
# single sample looks like.
first = train.loc[125]
example = dict(
    feature_text=first.feature_text,
    pn_history=first.pn_history,
    location_list=first.location,
    annotation_list=first.annotation,
)
for key, value in example.items():
    # field name, field value, then a separator line
    print(key, value, "=" * 100, sep="\n")

feature_text
Caffeine-use
pn_history
17 y/o previously healthy male here with heart pounding. States a few months ago, he started having episodes of feeling like his heart is pound/jumping out of his chest. Denies any anxiety, sweating, shaking, or sense of impending doom during these episodes. States he also had shortness of breath with his last episode. Denies any temperature intolerance, jitteriness, chest pain, abdominal pain, GI symptoms, GU symptoms, or easy bruising/bleeding. States he drinks about 3-4 cups coffee/day and several energy drinks throughout the week. Denies any recent stressors in his life.
ROS: negative except for above
PMH, Surg hx, hospitalizations - none
Meds: occasionally take roommate's adderall
NKDA
FH: mom has thyroid problems, dad had heart attack last year
Social: started college 6-7 months ago, denies tobacco use, drinks 3-4 drinks/day on weekends (neg CAGE screening), tried MJ a few months ago. Sexually active with gf, no STD concerns
location_lis

# CV split

In [5]:
# ====================================================
# CV split
# ====================================================
# Rows are grouped by patient note (pn_num), so all rows of one note land in
# the same fold and a note is never split between training and validation.
if CFG.n_fold == 1:
    train['fold'] = 0
else:
    Fold = GroupKFold(n_splits=CFG.n_fold)
    groups = train['pn_num'].values
    # split() yields *positional* row indices. They are written with .iloc into
    # a Series aligned to train.index, so the assignment stays correct even if
    # `train` does not carry a default 0..n-1 index (label-based .loc would
    # silently mis-assign or add rows in that case).
    fold_ids = pd.Series(-1, index=train.index, dtype=int)
    for n, (train_index, val_index) in enumerate(Fold.split(train, train['location'], groups)):
        fold_ids.iloc[val_index] = n
    train['fold'] = fold_ids
# Last expression of the cell: number of rows per fold.
train.groupby('fold').size()

fold
0    2860
1    2860
2    2860
3    2860
4    2860
dtype: int64

# Dataset

In [6]:
# ====================================================
# Define max_len
# ====================================================
# Token counts are measured without special tokens; those are added back when
# max_len is computed at the end of the cell.
text_col = 'pn_history'
tk0 = tqdm(patient_notes[text_col].fillna("").values, total=len(patient_notes))
pn_history_lengths = [
    len(CFGs[0].tokenizer(text, add_special_tokens=False)['input_ids'])
    for text in tk0
]
print(f'{text_col} max(lengths): {max(pn_history_lengths)}')

text_col = 'feature_text'
tk0 = tqdm(features[text_col].fillna("").values, total=len(features))
features_lengths = [
    len(CFGs[0].tokenizer(text, add_special_tokens=False)['input_ids'])
    for text in tk0
]
print(f'{text_col} max(lengths): {max(features_lengths)}')

# longest note + longest feature text + 3 special tokens (cls & sep & sep)
longest_pair = max(pn_history_lengths) + max(features_lengths)
CFG.max_len = longest_pair + 3
print(f"max_len: {CFG.max_len}")

max_len: 466


# Helper functions

In [7]:
# ====================================================
# train loop
# ====================================================
def train_loop(folds, fold, criterion, OUTPUT_DIR):
    """Train and validate one cross-validation fold and return its out-of-fold predictions.

    Reads the notebook globals ``CFG`` (the active config class), ``LOGGER``
    and ``device``; the calling cell must have set them before this runs.

    Args:
        folds: DataFrame with a ``fold`` column. Rows whose ``fold`` equals
            ``fold`` form the validation set, all other rows the training set.
        fold: Id of the fold to hold out for validation.
        criterion: Loss object forwarded unchanged to ``train_fn`` / ``valid_fn``.
        OUTPUT_DIR: Prefix for the checkpoint file. It is concatenated directly
            with the file name, so it must end with a path separator.

    Returns:
        The validation rows (index reset to 0..n-1) joined with the predictions
        of the best-scoring epoch, one column per prediction position.

    Raises:
        RuntimeError: if no epoch scored above the initial best score, so no
            checkpoint was written during this call.
    """
    LOGGER.info(f"========== {OUTPUT_DIR} fold: {fold} training ==========")

    # ====================================================
    # loader
    # ====================================================
    train_folds = folds[folds['fold'] != fold].reset_index(drop=True)
    valid_folds = folds[folds['fold'] == fold].reset_index(drop=True)

    train_loader = DataLoader(TrainDataset(CFG, train_folds),
                              batch_size=CFG.batch_size,
                              shuffle=True,
                              num_workers=CFG.num_workers, pin_memory=True, drop_last=True)
    valid_loader = DataLoader(TrainDataset(CFG, valid_folds),
                              batch_size=CFG.batch_size,
                              shuffle=False,
                              num_workers=CFG.num_workers, pin_memory=True, drop_last=False)

    # ====================================================
    # model & optimizer
    # ====================================================
    model = CFG.model(CFG, config_path=None, pretrained=True)
    # The model config is stored next to the checkpoints.
    torch.save(model.config, CFG.config_path)

    model.to(device)

    optimizer_parameters = get_optimizer_params(model,
                                                encoder_lr=CFG.encoder_lr,
                                                decoder_lr=CFG.decoder_lr,
                                                weight_decay=CFG.weight_decay)
    optimizer = AdamW(optimizer_parameters, lr=CFG.encoder_lr, eps=CFG.eps, betas=CFG.betas)

    # drop_last=True discards a trailing partial batch, so the number of
    # batches per epoch is len(train_loader), not len(train_folds) / batch_size.
    # NOTE(review): if train_fn steps the scheduler only once per
    # gradient-accumulation cycle, divide by CFG.gradient_accumulation_steps.
    num_train_steps = len(train_loader) * CFG.epochs
    scheduler = get_scheduler(CFG, optimizer, num_train_steps)

    # ====================================================
    # loop
    # ====================================================
    best_score = -0.1
    best_path = OUTPUT_DIR + f"{CFG.path}_fold{fold}_best.pth"
    saved_best = False  # guards against reloading a stale checkpoint from an earlier run

    for epoch in range(CFG.epochs):

        start_time = time.time()

        # train
        avg_loss = train_fn(CFG, train_loader, model, criterion, optimizer, epoch, scheduler, device)

        # eval
        avg_val_loss, predictions = valid_fn(CFG, valid_loader, model, criterion, device)
        # one row of CFG.max_len values per validation sample
        predictions = predictions.reshape((len(valid_folds), CFG.max_len))

        # scoring
        score, max_th = get_result(valid_folds, predictions, CFG)

        elapsed = time.time() - start_time

        LOGGER.info(f'Epoch {epoch+1} - avg_train_loss: {avg_loss:.4f}  avg_val_loss: {avg_val_loss:.4f}  time: {elapsed:.0f}s')
        LOGGER.info(f'Epoch {epoch+1} - Score: {score:.4f}, Th: {max_th:.1f}')

        if best_score < score:
            best_score = score
            LOGGER.info(f'Epoch {epoch+1} - Save Best Score: {best_score:.4f} Model')
            # Weights and the matching validation predictions are stored together.
            torch.save({'model': model.state_dict(),
                        'predictions': predictions},
                       best_path)
            saved_best = True

    if not saved_best:
        raise RuntimeError(
            f"no checkpoint was saved for fold {fold}: no epoch scored above {best_score}"
        )

    # Reload the predictions of the best epoch (the in-memory ones belong to the last epoch).
    predictions = torch.load(best_path,
                             map_location=torch.device('cpu'))['predictions']
    pd1 = pd.DataFrame(predictions)
    valid_folds = valid_folds.join(pd1)

    # Drop the references first so the cached GPU memory can actually be released.
    del model, optimizer, scheduler
    gc.collect()
    torch.cuda.empty_cache()

    return valid_folds

In [8]:
if __name__ == '__main__':

    def get_result_score(oof_df, CFG):
        """Score out-of-fold predictions and log the result.

        Args:
            oof_df: DataFrame holding validation rows plus one prediction
                column per position, named by the integers 0 .. CFG.max_len - 1.
            CFG: Config class providing ``max_len``; also forwarded to ``get_result``.

        Returns:
            The ``(score, max_th)`` pair returned by ``get_result``.
        """
        predictions = oof_df[list(range(CFG.max_len))].values
        score, max_th = get_result(oof_df, predictions, CFG)
        LOGGER.info(f'Score: {score:<.4f}, Th: {max_th:.1f}')
        return score, max_th

    # Losses to train with, keyed by the name used in output paths.
    # A second entry such as 'Dice': DiceLoss() can be added here.
    criterions = {'BCE': nn.BCEWithLogitsLoss(reduction="none")}

    # Running score table; rewritten to disk after every fold so partial
    # results survive an interrupted run.
    SCORES_CSV = '../input/scores.csv'
    df_scores = pd.DataFrame({'Name': [], 'Score': [], 'Th':[]})

    for criterion_name, criterion in criterions.items():
        # NOTE: the loop variable intentionally rebinds the global name CFG,
        # because train_loop reads the active config from that global.
        for CFG in CFGs:
            OUTPUT_DIR = f'../input/{CFG.path}_{criterion_name}/'
            os.makedirs(OUTPUT_DIR, exist_ok=True)
            CFG.config_path = OUTPUT_DIR + 'config.pth'
            LOGGER = get_logger(OUTPUT_DIR + 'train')
            # Use the configured seed instead of a duplicated literal.
            seed_everything(seed=CFG.seed)

            # Per-fold OOF frames are collected and concatenated once.
            fold_frames = []
            for fold in range(CFG.n_fold):
                if fold not in CFG.trn_fold:
                    continue
                _oof_df = train_loop(train, fold, criterion, OUTPUT_DIR)
                fold_frames.append(_oof_df)
                LOGGER.info(f"========== fold: {fold} result ==========")
                score, max_th = get_result_score(_oof_df, CFG)
                df_score = pd.DataFrame([[f'{CFG.path}_{criterion_name}_fold{fold}', score, max_th]], columns=df_scores.columns)
                df_scores = pd.concat([df_scores, df_score], ignore_index=True)
                df_scores.to_csv(SCORES_CSV, index=False)

            if not fold_frames:
                # Nothing was trained for this config, so there is nothing to score.
                LOGGER.info("no folds selected in CFG.trn_fold - skipping CV scoring")
                continue

            oof_df = pd.concat(fold_frames).reset_index(drop=True)
            LOGGER.info("========== CV ==========")
            score, max_th = get_result_score(oof_df, CFG)
            df_score = pd.DataFrame([[f'{CFG.path}_{criterion_name}', score, max_th]], columns=df_scores.columns)
            df_scores = pd.concat([df_scores, df_score], ignore_index=True)
            df_scores.to_csv(SCORES_CSV, index=False)
            oof_df.to_pickle(OUTPUT_DIR+'oof_df.pkl')
        

Some weights of the model checkpoint at microsoft/deberta-base were not used when initializing DebertaModel: ['lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.dense.bias']
- This IS expected if you are initializing DebertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
NVIDIA GeForce RTX 3070 with CUDA capability sm_86 is not compatible with the current PyTorch installation.
The current PyTorch install supports CUDA capabilities sm_52 sm_60 sm_61 sm_70 sm_75 sm_80 compute_80.
If you want to use the NVIDIA

Epoch: [1][0/2860] Elapsed 0m 0s (remain 44m 27s) Loss: 0.7056(0.7056) Grad: 159619.4062  LR: 0.00002000  
Epoch: [1][100/2860] Elapsed 0m 24s (remain 11m 20s) Loss: 0.1400(0.2668) Grad: 66073.7266  LR: 0.00002000  
Epoch: [1][200/2860] Elapsed 0m 47s (remain 10m 33s) Loss: 0.1092(0.1812) Grad: 4833.3818  LR: 0.00002000  
Epoch: [1][300/2860] Elapsed 1m 10s (remain 10m 3s) Loss: 0.0753(0.1486) Grad: 14049.8672  LR: 0.00002000  
Epoch: [1][400/2860] Elapsed 1m 33s (remain 9m 35s) Loss: 0.0825(0.1280) Grad: 43387.6328  LR: 0.00002000  
Epoch: [1][500/2860] Elapsed 1m 56s (remain 9m 10s) Loss: 0.0539(0.1113) Grad: 26501.4238  LR: 0.00001999  
Epoch: [1][600/2860] Elapsed 2m 20s (remain 8m 47s) Loss: 0.0310(0.0997) Grad: 43691.4180  LR: 0.00001999  
Epoch: [1][700/2860] Elapsed 2m 43s (remain 8m 23s) Loss: 0.0163(0.0906) Grad: 39833.2305  LR: 0.00001999  
Epoch: [1][800/2860] Elapsed 3m 6s (remain 7m 59s) Loss: 0.0291(0.0840) Grad: 50525.4609  LR: 0.00001998  
Epoch: [1][900/2860] Elapsed 

Epoch 1 - avg_train_loss: 0.0398  avg_val_loss: 0.0172  time: 737s
Epoch 1 - Score: 0.8223, Th: 0.7
Epoch 1 - Save Best Score: 0.8223 Model


Epoch: [2][0/2860] Elapsed 0m 0s (remain 15m 45s) Loss: 0.0007(0.0007) Grad: 1843.1735  LR: 0.00001978  
Epoch: [2][100/2860] Elapsed 0m 23s (remain 10m 52s) Loss: 0.0256(0.0167) Grad: 63640.4727  LR: 0.00001977  
Epoch: [2][200/2860] Elapsed 0m 47s (remain 10m 25s) Loss: 0.0245(0.0190) Grad: 10791.1260  LR: 0.00001975  
Epoch: [2][300/2860] Elapsed 1m 10s (remain 10m 0s) Loss: 0.0156(0.0198) Grad: 42546.3516  LR: 0.00001973  
Epoch: [2][400/2860] Elapsed 1m 34s (remain 9m 36s) Loss: 0.0058(0.0190) Grad: 25542.5977  LR: 0.00001972  
Epoch: [2][500/2860] Elapsed 1m 57s (remain 9m 13s) Loss: 0.0063(0.0186) Grad: 43732.1602  LR: 0.00001970  
Epoch: [2][600/2860] Elapsed 2m 20s (remain 8m 49s) Loss: 0.0320(0.0185) Grad: 47262.0508  LR: 0.00001968  
Epoch: [2][700/2860] Elapsed 2m 44s (remain 8m 26s) Loss: 0.0245(0.0183) Grad: 62982.9570  LR: 0.00001966  
Epoch: [2][800/2860] Elapsed 3m 7s (remain 8m 2s) Loss: 0.1475(0.0182) Grad: 292471.6562  LR: 0.00001964  
Epoch: [2][900/2860] Elapsed 3

Epoch 2 - avg_train_loss: 0.0169  avg_val_loss: 0.0133  time: 740s
Epoch 2 - Score: 0.7937, Th: 0.5


Epoch: [3][0/2860] Elapsed 0m 0s (remain 17m 21s) Loss: 0.0302(0.0302) Grad: 68537.8906  LR: 0.00001914  
Epoch: [3][100/2860] Elapsed 0m 23s (remain 10m 49s) Loss: 0.0118(0.0135) Grad: 38856.2070  LR: 0.00001911  
Epoch: [3][200/2860] Elapsed 0m 47s (remain 10m 24s) Loss: 0.0101(0.0119) Grad: 31305.6973  LR: 0.00001907  
Epoch: [3][300/2860] Elapsed 1m 10s (remain 10m 0s) Loss: 0.0029(0.0126) Grad: 11728.5703  LR: 0.00001904  
Epoch: [3][400/2860] Elapsed 1m 34s (remain 9m 38s) Loss: 0.0030(0.0125) Grad: 22623.0449  LR: 0.00001901  
Epoch: [3][500/2860] Elapsed 1m 57s (remain 9m 14s) Loss: 0.0015(0.0128) Grad: 25135.0625  LR: 0.00001898  
Epoch: [3][600/2860] Elapsed 2m 21s (remain 8m 50s) Loss: 0.0008(0.0126) Grad: 25621.7012  LR: 0.00001895  
Epoch: [3][700/2860] Elapsed 2m 44s (remain 8m 26s) Loss: 0.0058(0.0124) Grad: 11311.8555  LR: 0.00001891  
Epoch: [3][800/2860] Elapsed 3m 7s (remain 8m 3s) Loss: 0.0485(0.0128) Grad: 270812.0625  LR: 0.00001888  
Epoch: [3][900/2860] Elapsed 

Epoch 3 - avg_train_loss: 0.0124  avg_val_loss: 0.0183  time: 741s
Epoch 3 - Score: 0.8281, Th: 0.3
Epoch 3 - Save Best Score: 0.8281 Model


Epoch: [4][0/2860] Elapsed 0m 0s (remain 16m 40s) Loss: 0.0030(0.0030) Grad: 39162.9883  LR: 0.00001809  
Epoch: [4][100/2860] Elapsed 0m 23s (remain 10m 44s) Loss: 0.0002(0.0126) Grad: 1305.3169  LR: 0.00001805  
Epoch: [4][200/2860] Elapsed 0m 46s (remain 10m 20s) Loss: 0.0094(0.0114) Grad: 19661.0098  LR: 0.00001800  
Epoch: [4][300/2860] Elapsed 1m 10s (remain 9m 59s) Loss: 0.0183(0.0106) Grad: 9998.0020  LR: 0.00001796  
Epoch: [4][400/2860] Elapsed 1m 34s (remain 9m 36s) Loss: 0.0003(0.0096) Grad: 4931.0674  LR: 0.00001791  
Epoch: [4][500/2860] Elapsed 1m 57s (remain 9m 12s) Loss: 0.0068(0.0100) Grad: 26361.3613  LR: 0.00001787  
Epoch: [4][600/2860] Elapsed 2m 20s (remain 8m 49s) Loss: 0.0463(0.0101) Grad: 155126.7031  LR: 0.00001782  
Epoch: [4][700/2860] Elapsed 2m 44s (remain 8m 25s) Loss: 0.0000(0.0100) Grad: 110.8094  LR: 0.00001778  
Epoch: [4][800/2860] Elapsed 3m 7s (remain 8m 1s) Loss: 0.0000(0.0097) Grad: 103.0347  LR: 0.00001773  
Epoch: [4][900/2860] Elapsed 3m 30s 

Epoch 4 - avg_train_loss: 0.0109  avg_val_loss: 0.0154  time: 739s
Epoch 4 - Score: 0.8454, Th: 0.6
Epoch 4 - Save Best Score: 0.8454 Model


Epoch: [5][0/2860] Elapsed 0m 0s (remain 16m 43s) Loss: 0.0012(0.0012) Grad: 13901.9043  LR: 0.00001669  
Epoch: [5][100/2860] Elapsed 0m 23s (remain 10m 46s) Loss: 0.0001(0.0103) Grad: 333.8930  LR: 0.00001664  
Epoch: [5][200/2860] Elapsed 0m 47s (remain 10m 23s) Loss: 0.0000(0.0090) Grad: 167.6672  LR: 0.00001658  
Epoch: [5][300/2860] Elapsed 1m 10s (remain 9m 59s) Loss: 0.0089(0.0090) Grad: 66273.3359  LR: 0.00001653  
Epoch: [5][400/2860] Elapsed 1m 33s (remain 9m 35s) Loss: 0.0116(0.0087) Grad: 13632.8428  LR: 0.00001647  
Epoch: [5][500/2860] Elapsed 1m 57s (remain 9m 11s) Loss: 0.0010(0.0087) Grad: 38599.9648  LR: 0.00001641  
Epoch: [5][600/2860] Elapsed 2m 20s (remain 8m 48s) Loss: 0.0060(0.0084) Grad: 4675.4438  LR: 0.00001636  
Epoch: [5][700/2860] Elapsed 2m 44s (remain 8m 25s) Loss: 0.0008(0.0086) Grad: 2994.9531  LR: 0.00001630  
Epoch: [5][800/2860] Elapsed 3m 7s (remain 8m 1s) Loss: 0.0011(0.0088) Grad: 5929.2090  LR: 0.00001624  
Epoch: [5][900/2860] Elapsed 3m 30s (

Epoch 5 - avg_train_loss: 0.0086  avg_val_loss: 0.0203  time: 739s
Epoch 5 - Score: 0.8482, Th: 0.2
Epoch 5 - Save Best Score: 0.8482 Model


Epoch: [6][0/2860] Elapsed 0m 0s (remain 16m 36s) Loss: 0.0006(0.0006) Grad: 14088.4824  LR: 0.00001500  
Epoch: [6][100/2860] Elapsed 0m 23s (remain 10m 49s) Loss: 0.0002(0.0062) Grad: 801.1408  LR: 0.00001494  
Epoch: [6][200/2860] Elapsed 0m 47s (remain 10m 23s) Loss: 0.0723(0.0074) Grad: 107237.6562  LR: 0.00001487  
Epoch: [6][300/2860] Elapsed 1m 10s (remain 9m 58s) Loss: 0.0189(0.0070) Grad: 39114.2930  LR: 0.00001481  
Epoch: [6][400/2860] Elapsed 1m 33s (remain 9m 34s) Loss: 0.0003(0.0074) Grad: 5209.4150  LR: 0.00001474  
Epoch: [6][500/2860] Elapsed 1m 56s (remain 9m 10s) Loss: 0.0004(0.0076) Grad: 9845.4648  LR: 0.00001468  
Epoch: [6][600/2860] Elapsed 2m 20s (remain 8m 47s) Loss: 0.0001(0.0075) Grad: 436.3958  LR: 0.00001461  
Epoch: [6][700/2860] Elapsed 2m 43s (remain 8m 24s) Loss: 0.0084(0.0072) Grad: 58202.4609  LR: 0.00001455  
Epoch: [6][800/2860] Elapsed 3m 7s (remain 8m 1s) Loss: 0.0057(0.0073) Grad: 34073.2930  LR: 0.00001448  
Epoch: [6][900/2860] Elapsed 3m 30s

Epoch 6 - avg_train_loss: 0.0072  avg_val_loss: 0.0189  time: 738s
Epoch 6 - Score: 0.8542, Th: 0.5
Epoch 6 - Save Best Score: 0.8542 Model


Epoch: [7][0/2860] Elapsed 0m 0s (remain 16m 44s) Loss: 0.0052(0.0052) Grad: 13901.3564  LR: 0.00001309  
Epoch: [7][100/2860] Elapsed 0m 23s (remain 10m 44s) Loss: 0.0052(0.0044) Grad: 5346.3218  LR: 0.00001302  
Epoch: [7][200/2860] Elapsed 0m 46s (remain 10m 20s) Loss: 0.0284(0.0055) Grad: 215502.5156  LR: 0.00001295  
Epoch: [7][300/2860] Elapsed 1m 10s (remain 9m 55s) Loss: 0.0000(0.0054) Grad: 45.9732  LR: 0.00001288  
Epoch: [7][400/2860] Elapsed 1m 33s (remain 9m 32s) Loss: 0.0002(0.0054) Grad: 2400.9236  LR: 0.00001281  
Epoch: [7][500/2860] Elapsed 1m 56s (remain 9m 9s) Loss: 0.0001(0.0054) Grad: 2650.9402  LR: 0.00001274  
Epoch: [7][600/2860] Elapsed 2m 20s (remain 8m 47s) Loss: 0.0069(0.0053) Grad: 8120.1606  LR: 0.00001267  
Epoch: [7][700/2860] Elapsed 2m 43s (remain 8m 23s) Loss: 0.0000(0.0052) Grad: 210.6035  LR: 0.00001260  
Epoch: [7][800/2860] Elapsed 3m 6s (remain 8m 0s) Loss: 0.0000(0.0053) Grad: 637.6508  LR: 0.00001253  
Epoch: [7][900/2860] Elapsed 3m 30s (rema

Epoch 7 - avg_train_loss: 0.0059  avg_val_loss: 0.0212  time: 737s
Epoch 7 - Score: 0.8505, Th: 0.5


Epoch: [8][0/2860] Elapsed 0m 0s (remain 16m 37s) Loss: 0.0000(0.0000) Grad: 16.7096  LR: 0.00001104  
Epoch: [8][100/2860] Elapsed 0m 23s (remain 10m 40s) Loss: 0.0000(0.0046) Grad: 329.2519  LR: 0.00001097  
Epoch: [8][200/2860] Elapsed 0m 46s (remain 10m 18s) Loss: 0.0027(0.0048) Grad: 65267.8320  LR: 0.00001090  
Epoch: [8][300/2860] Elapsed 1m 9s (remain 9m 55s) Loss: 0.0028(0.0043) Grad: 25469.7812  LR: 0.00001083  
Epoch: [8][400/2860] Elapsed 1m 33s (remain 9m 32s) Loss: 0.0001(0.0046) Grad: 2072.0479  LR: 0.00001075  
Epoch: [8][500/2860] Elapsed 1m 56s (remain 9m 9s) Loss: 0.0000(0.0044) Grad: 20.4135  LR: 0.00001068  
Epoch: [8][600/2860] Elapsed 2m 19s (remain 8m 45s) Loss: 0.0000(0.0045) Grad: 13.5852  LR: 0.00001061  
Epoch: [8][700/2860] Elapsed 2m 43s (remain 8m 22s) Loss: 0.0000(0.0045) Grad: 27.4896  LR: 0.00001053  
Epoch: [8][800/2860] Elapsed 3m 6s (remain 7m 59s) Loss: 0.0004(0.0044) Grad: 4816.9360  LR: 0.00001046  
Epoch: [8][900/2860] Elapsed 3m 29s (remain 7m 

Epoch 8 - avg_train_loss: 0.0048  avg_val_loss: 0.0246  time: 736s
Epoch 8 - Score: 0.8548, Th: 0.2
Epoch 8 - Save Best Score: 0.8548 Model


Epoch: [9][0/2860] Elapsed 0m 0s (remain 17m 9s) Loss: 0.0070(0.0070) Grad: 71205.0391  LR: 0.00000895  
Epoch: [9][100/2860] Elapsed 0m 23s (remain 10m 42s) Loss: 0.0038(0.0045) Grad: 73147.9453  LR: 0.00000888  
Epoch: [9][200/2860] Elapsed 0m 46s (remain 10m 18s) Loss: 0.0002(0.0038) Grad: 1553.5858  LR: 0.00000881  
Epoch: [9][300/2860] Elapsed 1m 9s (remain 9m 54s) Loss: 0.0000(0.0035) Grad: 40.9078  LR: 0.00000874  
Epoch: [9][400/2860] Elapsed 1m 33s (remain 9m 31s) Loss: 0.0030(0.0036) Grad: 77796.1562  LR: 0.00000866  
Epoch: [9][500/2860] Elapsed 1m 56s (remain 9m 8s) Loss: 0.0000(0.0037) Grad: 110.4779  LR: 0.00000859  
Epoch: [9][600/2860] Elapsed 2m 19s (remain 8m 45s) Loss: 0.0088(0.0038) Grad: 52422.9883  LR: 0.00000852  
Epoch: [9][700/2860] Elapsed 2m 42s (remain 8m 21s) Loss: 0.0000(0.0035) Grad: 61.5770  LR: 0.00000845  
Epoch: [9][800/2860] Elapsed 3m 6s (remain 7m 58s) Loss: 0.0000(0.0037) Grad: 449.7606  LR: 0.00000837  
Epoch: [9][900/2860] Elapsed 3m 29s (remain

Epoch 9 - avg_train_loss: 0.0037  avg_val_loss: 0.0291  time: 734s
Epoch 9 - Score: 0.8557, Th: 0.3
Epoch 9 - Save Best Score: 0.8557 Model


Epoch: [10][0/2860] Elapsed 0m 0s (remain 20m 39s) Loss: 0.0000(0.0000) Grad: 11.0148  LR: 0.00000691  
Epoch: [10][100/2860] Elapsed 0m 23s (remain 10m 41s) Loss: 0.0000(0.0024) Grad: 20.9104  LR: 0.00000684  
Epoch: [10][200/2860] Elapsed 0m 46s (remain 10m 15s) Loss: 0.0000(0.0033) Grad: 11.7362  LR: 0.00000677  
Epoch: [10][300/2860] Elapsed 1m 9s (remain 9m 52s) Loss: 0.0034(0.0033) Grad: 12321.8037  LR: 0.00000670  
Epoch: [10][400/2860] Elapsed 1m 32s (remain 9m 29s) Loss: 0.0000(0.0029) Grad: 15.0418  LR: 0.00000663  
Epoch: [10][500/2860] Elapsed 1m 56s (remain 9m 6s) Loss: 0.0000(0.0031) Grad: 8.9514  LR: 0.00000656  
Epoch: [10][600/2860] Elapsed 2m 19s (remain 8m 43s) Loss: 0.0000(0.0031) Grad: 76.4012  LR: 0.00000649  
Epoch: [10][700/2860] Elapsed 2m 42s (remain 8m 20s) Loss: 0.0000(0.0032) Grad: 21.6916  LR: 0.00000643  
Epoch: [10][800/2860] Elapsed 3m 5s (remain 7m 56s) Loss: 0.0092(0.0031) Grad: 4179.9673  LR: 0.00000636  
Epoch: [10][900/2860] Elapsed 3m 28s (remain 

Epoch 10 - avg_train_loss: 0.0030  avg_val_loss: 0.0285  time: 733s
Epoch 10 - Score: 0.8577, Th: 0.6
Epoch 10 - Save Best Score: 0.8577 Model


Epoch: [11][0/2860] Elapsed 0m 0s (remain 16m 32s) Loss: 0.0000(0.0000) Grad: 6.5785  LR: 0.00000500  
Epoch: [11][100/2860] Elapsed 0m 23s (remain 10m 40s) Loss: 0.0242(0.0028) Grad: 722823.6250  LR: 0.00000494  
Epoch: [11][200/2860] Elapsed 0m 46s (remain 10m 16s) Loss: 0.0040(0.0023) Grad: 22894.9512  LR: 0.00000487  
Epoch: [11][300/2860] Elapsed 1m 9s (remain 9m 54s) Loss: 0.0009(0.0024) Grad: 50632.8555  LR: 0.00000481  
Epoch: [11][400/2860] Elapsed 1m 33s (remain 9m 30s) Loss: 0.0000(0.0025) Grad: 45.0688  LR: 0.00000475  
Epoch: [11][500/2860] Elapsed 1m 56s (remain 9m 7s) Loss: 0.0000(0.0028) Grad: 14.2535  LR: 0.00000469  
Epoch: [11][600/2860] Elapsed 2m 19s (remain 8m 44s) Loss: 0.0000(0.0029) Grad: 173.7751  LR: 0.00000462  
Epoch: [11][700/2860] Elapsed 2m 42s (remain 8m 20s) Loss: 0.0001(0.0029) Grad: 1283.6940  LR: 0.00000456  
Epoch: [11][800/2860] Elapsed 3m 5s (remain 7m 58s) Loss: 0.0000(0.0027) Grad: 20.6572  LR: 0.00000450  
Epoch: [11][900/2860] Elapsed 3m 29s 

Epoch 11 - avg_train_loss: 0.0026  avg_val_loss: 0.0290  time: 733s
Epoch 11 - Score: 0.8614, Th: 0.8
Epoch 11 - Save Best Score: 0.8614 Model


Epoch: [12][0/2860] Elapsed 0m 0s (remain 16m 48s) Loss: 0.0000(0.0000) Grad: 13.5071  LR: 0.00000331  
Epoch: [12][100/2860] Elapsed 0m 23s (remain 10m 38s) Loss: 0.0147(0.0033) Grad: 58432.5938  LR: 0.00000325  
Epoch: [12][200/2860] Elapsed 0m 46s (remain 10m 18s) Loss: 0.0056(0.0029) Grad: 15272.7197  LR: 0.00000320  
Epoch: [12][300/2860] Elapsed 1m 9s (remain 9m 53s) Loss: 0.0007(0.0024) Grad: 37164.9688  LR: 0.00000315  
Epoch: [12][400/2860] Elapsed 1m 32s (remain 9m 29s) Loss: 0.0000(0.0022) Grad: 16.7798  LR: 0.00000309  
Epoch: [12][500/2860] Elapsed 1m 55s (remain 9m 5s) Loss: 0.0000(0.0020) Grad: 20.6105  LR: 0.00000304  
Epoch: [12][600/2860] Elapsed 2m 18s (remain 8m 42s) Loss: 0.0084(0.0020) Grad: 320448.0938  LR: 0.00000299  
Epoch: [12][700/2860] Elapsed 2m 42s (remain 8m 19s) Loss: 0.0000(0.0019) Grad: 39.7628  LR: 0.00000294  
Epoch: [12][800/2860] Elapsed 3m 5s (remain 7m 56s) Loss: 0.0000(0.0020) Grad: 153.0918  LR: 0.00000288  
Epoch: [12][900/2860] Elapsed 3m 28

Epoch 12 - avg_train_loss: 0.0021  avg_val_loss: 0.0287  time: 732s
Epoch 12 - Score: 0.8626, Th: 0.3
Epoch 12 - Save Best Score: 0.8626 Model


Epoch: [13][0/2860] Elapsed 0m 0s (remain 16m 48s) Loss: 0.0000(0.0000) Grad: 64.3930  LR: 0.00000191  
Epoch: [13][100/2860] Elapsed 0m 23s (remain 10m 36s) Loss: 0.0000(0.0020) Grad: 22.2853  LR: 0.00000187  
Epoch: [13][200/2860] Elapsed 0m 46s (remain 10m 15s) Loss: 0.0221(0.0020) Grad: 133695.5938  LR: 0.00000182  
Epoch: [13][300/2860] Elapsed 1m 9s (remain 9m 51s) Loss: 0.0000(0.0021) Grad: 106.2298  LR: 0.00000178  
Epoch: [13][400/2860] Elapsed 1m 32s (remain 9m 28s) Loss: 0.0000(0.0021) Grad: 1.5977  LR: 0.00000174  
Epoch: [13][500/2860] Elapsed 1m 55s (remain 9m 4s) Loss: 0.0000(0.0020) Grad: 42.2935  LR: 0.00000170  
Epoch: [13][600/2860] Elapsed 2m 18s (remain 8m 41s) Loss: 0.0166(0.0019) Grad: 16531.3379  LR: 0.00000166  
Epoch: [13][700/2860] Elapsed 2m 42s (remain 8m 19s) Loss: 0.0000(0.0019) Grad: 15.3164  LR: 0.00000162  
Epoch: [13][800/2860] Elapsed 3m 5s (remain 7m 56s) Loss: 0.0000(0.0020) Grad: 449.0172  LR: 0.00000158  
Epoch: [13][900/2860] Elapsed 3m 28s (rem

Epoch 13 - avg_train_loss: 0.0019  avg_val_loss: 0.0310  time: 731s
Epoch 13 - Score: 0.8621, Th: 0.8


Epoch: [14][0/2860] Elapsed 0m 0s (remain 16m 26s) Loss: 0.0000(0.0000) Grad: 1.5016  LR: 0.00000086  
Epoch: [14][100/2860] Elapsed 0m 23s (remain 10m 43s) Loss: 0.0000(0.0014) Grad: 6.0807  LR: 0.00000083  
Epoch: [14][200/2860] Elapsed 0m 46s (remain 10m 15s) Loss: 0.0000(0.0019) Grad: 20.4382  LR: 0.00000081  
Epoch: [14][300/2860] Elapsed 1m 9s (remain 9m 51s) Loss: 0.0000(0.0018) Grad: 17.0187  LR: 0.00000078  
Epoch: [14][400/2860] Elapsed 1m 32s (remain 9m 27s) Loss: 0.0000(0.0018) Grad: 31.4425  LR: 0.00000075  
Epoch: [14][500/2860] Elapsed 1m 55s (remain 9m 4s) Loss: 0.0000(0.0018) Grad: 65.5231  LR: 0.00000072  
Epoch: [14][600/2860] Elapsed 2m 18s (remain 8m 41s) Loss: 0.0000(0.0018) Grad: 10.7900  LR: 0.00000069  
Epoch: [14][700/2860] Elapsed 2m 41s (remain 8m 18s) Loss: 0.0000(0.0018) Grad: 80.7932  LR: 0.00000067  
Epoch: [14][800/2860] Elapsed 3m 4s (remain 7m 55s) Loss: 0.0000(0.0017) Grad: 4.4903  LR: 0.00000064  
Epoch: [14][900/2860] Elapsed 3m 28s (remain 7m 32s)

Epoch 14 - avg_train_loss: 0.0018  avg_val_loss: 0.0311  time: 732s
Epoch 14 - Score: 0.8623, Th: 0.6


Epoch: [15][0/2860] Elapsed 0m 0s (remain 16m 39s) Loss: 0.0000(0.0000) Grad: 2.2410  LR: 0.00000022  
Epoch: [15][100/2860] Elapsed 0m 23s (remain 10m 39s) Loss: 0.0000(0.0023) Grad: 1.8989  LR: 0.00000020  
Epoch: [15][200/2860] Elapsed 0m 46s (remain 10m 16s) Loss: 0.0000(0.0022) Grad: 3.5763  LR: 0.00000019  
Epoch: [15][300/2860] Elapsed 1m 9s (remain 9m 51s) Loss: 0.0115(0.0022) Grad: 89480.5547  LR: 0.00000018  
Epoch: [15][400/2860] Elapsed 1m 32s (remain 9m 27s) Loss: 0.0000(0.0020) Grad: 12.0071  LR: 0.00000016  
Epoch: [15][500/2860] Elapsed 1m 55s (remain 9m 3s) Loss: 0.0000(0.0020) Grad: 9.3881  LR: 0.00000015  
Epoch: [15][600/2860] Elapsed 2m 18s (remain 8m 41s) Loss: 0.0000(0.0018) Grad: 8.5498  LR: 0.00000014  
Epoch: [15][700/2860] Elapsed 2m 41s (remain 8m 18s) Loss: 0.0000(0.0017) Grad: 12.8290  LR: 0.00000012  
Epoch: [15][800/2860] Elapsed 3m 4s (remain 7m 55s) Loss: 0.0000(0.0017) Grad: 41.6115  LR: 0.00000011  
Epoch: [15][900/2860] Elapsed 3m 28s (remain 7m 32s

Epoch 15 - avg_train_loss: 0.0016  avg_val_loss: 0.0312  time: 731s
Epoch 15 - Score: 0.8624, Th: 0.7
Score: 0.8626, Th: 0.3
Some weights of the model checkpoint at microsoft/deberta-base were not used when initializing DebertaModel: ['lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.dense.bias']
- This IS expected if you are initializing DebertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Epoch: [1][0/2860] Elapsed 0m 0s (remain 19m 3s) Loss: 0.6455(0.6455) Grad: 153248.9062  LR: 0.00002000  
Epoch: [1][100/2860] Elapsed 0m 23s (remain 10m 43s) Loss: 0.0837(0.1982) Grad: 21382.9375  LR: 0.00002000  
Epoch: [1][200/2860] Elapsed 0m 46s (remain 10m 20s) Loss: 0.0854(0.1390) Grad: 6289.5811  LR: 0.00002000  
Epoch: [1][300/2860] Elapsed 1m 10s (remain 9m 58s) Loss: 0.0581(0.1160) Grad: 5217.1084  LR: 0.00002000  
Epoch: [1][400/2860] Elapsed 1m 33s (remain 9m 35s) Loss: 0.0104(0.0986) Grad: 13821.9941  LR: 0.00002000  
Epoch: [1][500/2860] Elapsed 1m 57s (remain 9m 13s) Loss: 0.0701(0.0878) Grad: 83820.4062  LR: 0.00001999  
Epoch: [1][600/2860] Elapsed 2m 21s (remain 8m 50s) Loss: 0.0345(0.0797) Grad: 81646.2266  LR: 0.00001999  
Epoch: [1][700/2860] Elapsed 2m 44s (remain 8m 27s) Loss: 0.0159(0.0735) Grad: 25174.5117  LR: 0.00001999  
Epoch: [1][800/2860] Elapsed 3m 8s (remain 8m 4s) Loss: 0.0220(0.0680) Grad: 31690.0898  LR: 0.00001998  
Epoch: [1][900/2860] Elapsed 3m 

Epoch 1 - avg_train_loss: 0.0355  avg_val_loss: 0.0184  time: 742s
Epoch 1 - Score: 0.7682, Th: 0.2
Epoch 1 - Save Best Score: 0.7682 Model


Epoch: [2][0/2860] Elapsed 0m 0s (remain 17m 32s) Loss: 0.0062(0.0062) Grad: 15471.0391  LR: 0.00001978  
Epoch: [2][100/2860] Elapsed 0m 23s (remain 10m 47s) Loss: 0.0109(0.0194) Grad: 27649.8301  LR: 0.00001977  
Epoch: [2][200/2860] Elapsed 0m 47s (remain 10m 22s) Loss: 0.0486(0.0171) Grad: 72377.9453  LR: 0.00001975  
Epoch: [2][300/2860] Elapsed 1m 10s (remain 9m 58s) Loss: 0.0072(0.0155) Grad: 128088.9062  LR: 0.00001973  
Epoch: [2][400/2860] Elapsed 1m 33s (remain 9m 35s) Loss: 0.0097(0.0163) Grad: 72029.7109  LR: 0.00001972  
Epoch: [2][500/2860] Elapsed 1m 57s (remain 9m 12s) Loss: 0.0150(0.0167) Grad: 33926.8711  LR: 0.00001970  
Epoch: [2][600/2860] Elapsed 2m 20s (remain 8m 48s) Loss: 0.0186(0.0167) Grad: 61740.2930  LR: 0.00001968  
Epoch: [2][700/2860] Elapsed 2m 43s (remain 8m 25s) Loss: 0.0002(0.0170) Grad: 533.8589  LR: 0.00001966  
Epoch: [2][800/2860] Elapsed 3m 7s (remain 8m 1s) Loss: 0.0081(0.0173) Grad: 18074.4453  LR: 0.00001964  
Epoch: [2][900/2860] Elapsed 3m

Epoch 2 - avg_train_loss: 0.0162  avg_val_loss: 0.0158  time: 740s
Epoch 2 - Score: 0.8371, Th: 0.4
Epoch 2 - Save Best Score: 0.8371 Model


Epoch: [3][0/2860] Elapsed 0m 0s (remain 18m 42s) Loss: 0.0059(0.0059) Grad: 20024.7266  LR: 0.00001914  
Epoch: [3][100/2860] Elapsed 0m 23s (remain 10m 43s) Loss: 0.0153(0.0116) Grad: 31334.8398  LR: 0.00001911  
Epoch: [3][200/2860] Elapsed 0m 46s (remain 10m 19s) Loss: 0.0111(0.0132) Grad: 25096.5430  LR: 0.00001907  
Epoch: [3][300/2860] Elapsed 1m 10s (remain 9m 57s) Loss: 0.0090(0.0126) Grad: 5857.9526  LR: 0.00001904  
Epoch: [3][400/2860] Elapsed 1m 33s (remain 9m 34s) Loss: 0.0001(0.0122) Grad: 237.7055  LR: 0.00001901  
Epoch: [3][500/2860] Elapsed 1m 57s (remain 9m 11s) Loss: 0.0068(0.0120) Grad: 33562.2188  LR: 0.00001898  
Epoch: [3][600/2860] Elapsed 2m 20s (remain 8m 47s) Loss: 0.0018(0.0124) Grad: 27662.9297  LR: 0.00001895  
Epoch: [3][700/2860] Elapsed 2m 43s (remain 8m 24s) Loss: 0.0050(0.0130) Grad: 13495.2500  LR: 0.00001891  
Epoch: [3][800/2860] Elapsed 3m 7s (remain 8m 1s) Loss: 0.0096(0.0133) Grad: 38523.6602  LR: 0.00001888  
Epoch: [3][900/2860] Elapsed 3m 3

Epoch 3 - avg_train_loss: 0.0126  avg_val_loss: 0.0148  time: 738s
Epoch 3 - Score: 0.8411, Th: 0.4
Epoch 3 - Save Best Score: 0.8411 Model


Epoch: [4][0/2860] Elapsed 0m 0s (remain 18m 50s) Loss: 0.0001(0.0001) Grad: 346.9121  LR: 0.00001809  
Epoch: [4][100/2860] Elapsed 0m 23s (remain 10m 44s) Loss: 0.0027(0.0094) Grad: 20101.0117  LR: 0.00001805  
Epoch: [4][200/2860] Elapsed 0m 46s (remain 10m 20s) Loss: 0.0027(0.0093) Grad: 66428.0469  LR: 0.00001800  
Epoch: [4][300/2860] Elapsed 1m 10s (remain 9m 57s) Loss: 0.0294(0.0094) Grad: 22442.9766  LR: 0.00001796  
Epoch: [4][400/2860] Elapsed 1m 33s (remain 9m 34s) Loss: 0.0066(0.0103) Grad: 15954.9189  LR: 0.00001791  
Epoch: [4][500/2860] Elapsed 1m 57s (remain 9m 10s) Loss: 0.0000(0.0103) Grad: 107.6442  LR: 0.00001787  
Epoch: [4][600/2860] Elapsed 2m 20s (remain 8m 47s) Loss: 0.0973(0.0102) Grad: 275084.8125  LR: 0.00001782  
Epoch: [4][700/2860] Elapsed 2m 43s (remain 8m 23s) Loss: 0.0001(0.0109) Grad: 1515.6489  LR: 0.00001778  
Epoch: [4][800/2860] Elapsed 3m 7s (remain 8m 1s) Loss: 0.0091(0.0107) Grad: 9137.7490  LR: 0.00001773  
Epoch: [4][900/2860] Elapsed 3m 30s

Epoch 4 - avg_train_loss: 0.0104  avg_val_loss: 0.0174  time: 738s
Epoch 4 - Score: 0.8545, Th: 0.2
Epoch 4 - Save Best Score: 0.8545 Model


Epoch: [5][0/2860] Elapsed 0m 0s (remain 19m 2s) Loss: 0.0002(0.0002) Grad: 1856.0474  LR: 0.00001669  
Epoch: [5][100/2860] Elapsed 0m 23s (remain 10m 43s) Loss: 0.0001(0.0069) Grad: 838.5149  LR: 0.00001664  
Epoch: [5][200/2860] Elapsed 0m 46s (remain 10m 19s) Loss: 0.0034(0.0089) Grad: 47895.4844  LR: 0.00001658  
Epoch: [5][300/2860] Elapsed 1m 10s (remain 9m 55s) Loss: 0.0001(0.0082) Grad: 328.1581  LR: 0.00001653  
Epoch: [5][400/2860] Elapsed 1m 33s (remain 9m 32s) Loss: 0.0003(0.0084) Grad: 3097.6963  LR: 0.00001647  
Epoch: [5][500/2860] Elapsed 1m 56s (remain 9m 9s) Loss: 0.0001(0.0084) Grad: 850.7562  LR: 0.00001641  
Epoch: [5][600/2860] Elapsed 2m 19s (remain 8m 45s) Loss: 0.0000(0.0082) Grad: 48.8394  LR: 0.00001636  
Epoch: [5][700/2860] Elapsed 2m 43s (remain 8m 23s) Loss: 0.0088(0.0086) Grad: 21741.7012  LR: 0.00001630  
Epoch: [5][800/2860] Elapsed 3m 6s (remain 7m 59s) Loss: 0.0000(0.0087) Grad: 33.5224  LR: 0.00001624  
Epoch: [5][900/2860] Elapsed 3m 29s (remain 7

Epoch 5 - avg_train_loss: 0.0086  avg_val_loss: 0.0202  time: 737s
Epoch 5 - Score: 0.8583, Th: 0.4
Epoch 5 - Save Best Score: 0.8583 Model


Epoch: [6][0/2860] Elapsed 0m 0s (remain 18m 16s) Loss: 0.0035(0.0035) Grad: 10360.8164  LR: 0.00001500  
Epoch: [6][100/2860] Elapsed 0m 23s (remain 10m 47s) Loss: 0.0004(0.0062) Grad: 14811.8711  LR: 0.00001494  
Epoch: [6][200/2860] Elapsed 0m 46s (remain 10m 19s) Loss: 0.0002(0.0078) Grad: 2140.5171  LR: 0.00001487  
Epoch: [6][300/2860] Elapsed 1m 10s (remain 9m 55s) Loss: 0.0026(0.0071) Grad: 11083.4717  LR: 0.00001481  
Epoch: [6][400/2860] Elapsed 1m 33s (remain 9m 32s) Loss: 0.0000(0.0069) Grad: 79.2184  LR: 0.00001474  
Epoch: [6][500/2860] Elapsed 1m 56s (remain 9m 8s) Loss: 0.0000(0.0070) Grad: 696.6137  LR: 0.00001468  
Epoch: [6][600/2860] Elapsed 2m 19s (remain 8m 45s) Loss: 0.0000(0.0072) Grad: 267.7326  LR: 0.00001461  
Epoch: [6][700/2860] Elapsed 2m 43s (remain 8m 22s) Loss: 0.0068(0.0072) Grad: 4065.0786  LR: 0.00001455  
Epoch: [6][800/2860] Elapsed 3m 6s (remain 7m 59s) Loss: 0.0041(0.0072) Grad: 28690.6934  LR: 0.00001448  
Epoch: [6][900/2860] Elapsed 3m 29s (re

Epoch 6 - avg_train_loss: 0.0073  avg_val_loss: 0.0203  time: 736s
Epoch 6 - Score: 0.8538, Th: 0.5


Epoch: [7][0/2860] Elapsed 0m 0s (remain 18m 31s) Loss: 0.0220(0.0220) Grad: 16375.5469  LR: 0.00001309  
Epoch: [7][100/2860] Elapsed 0m 23s (remain 10m 41s) Loss: 0.0001(0.0062) Grad: 1754.3256  LR: 0.00001302  
Epoch: [7][200/2860] Elapsed 0m 46s (remain 10m 16s) Loss: 0.0085(0.0059) Grad: 48959.3594  LR: 0.00001295  
Epoch: [7][300/2860] Elapsed 1m 9s (remain 9m 53s) Loss: 0.0065(0.0059) Grad: 7272.1138  LR: 0.00001288  
Epoch: [7][400/2860] Elapsed 1m 33s (remain 9m 30s) Loss: 0.0001(0.0054) Grad: 1942.8693  LR: 0.00001281  
Epoch: [7][500/2860] Elapsed 1m 56s (remain 9m 8s) Loss: 0.0011(0.0057) Grad: 9526.2676  LR: 0.00001274  
Epoch: [7][600/2860] Elapsed 2m 19s (remain 8m 44s) Loss: 0.0000(0.0057) Grad: 102.0264  LR: 0.00001267  
Epoch: [7][700/2860] Elapsed 2m 42s (remain 8m 21s) Loss: 0.0000(0.0056) Grad: 29.1555  LR: 0.00001260  
Epoch: [7][800/2860] Elapsed 3m 6s (remain 7m 58s) Loss: 0.0000(0.0054) Grad: 35.5125  LR: 0.00001253  
Epoch: [7][900/2860] Elapsed 3m 29s (remain

Epoch 7 - avg_train_loss: 0.0060  avg_val_loss: 0.0235  time: 735s
Epoch 7 - Score: 0.8592, Th: 0.3
Epoch 7 - Save Best Score: 0.8592 Model


Epoch: [8][0/2860] Elapsed 0m 0s (remain 18m 36s) Loss: 0.0006(0.0006) Grad: 8118.7104  LR: 0.00001104  
Epoch: [8][100/2860] Elapsed 0m 23s (remain 10m 40s) Loss: 0.0028(0.0056) Grad: 25978.8887  LR: 0.00001097  
Epoch: [8][200/2860] Elapsed 0m 46s (remain 10m 15s) Loss: 0.0050(0.0050) Grad: 22001.4961  LR: 0.00001090  
Epoch: [8][300/2860] Elapsed 1m 9s (remain 9m 51s) Loss: 0.0000(0.0052) Grad: 39.6412  LR: 0.00001083  
Epoch: [8][400/2860] Elapsed 1m 32s (remain 9m 28s) Loss: 0.0008(0.0053) Grad: 11372.9980  LR: 0.00001075  
Epoch: [8][500/2860] Elapsed 1m 55s (remain 9m 5s) Loss: 0.0000(0.0052) Grad: 5.5666  LR: 0.00001068  
Epoch: [8][600/2860] Elapsed 2m 19s (remain 8m 43s) Loss: 0.0001(0.0052) Grad: 1504.4194  LR: 0.00001061  
Epoch: [8][700/2860] Elapsed 2m 42s (remain 8m 19s) Loss: 0.0427(0.0053) Grad: 47809.8672  LR: 0.00001053  
Epoch: [8][800/2860] Elapsed 3m 5s (remain 7m 56s) Loss: 0.0169(0.0053) Grad: 23132.4102  LR: 0.00001046  
Epoch: [8][900/2860] Elapsed 3m 28s (rem

Epoch 8 - avg_train_loss: 0.0053  avg_val_loss: 0.0229  time: 734s
Epoch 8 - Score: 0.8599, Th: 0.3
Epoch 8 - Save Best Score: 0.8599 Model


Epoch: [9][0/2860] Elapsed 0m 0s (remain 18m 2s) Loss: 0.0000(0.0000) Grad: 315.0605  LR: 0.00000895  
Epoch: [9][100/2860] Elapsed 0m 23s (remain 10m 39s) Loss: 0.0073(0.0031) Grad: 10817.9453  LR: 0.00000888  
Epoch: [9][200/2860] Elapsed 0m 46s (remain 10m 16s) Loss: 0.0000(0.0033) Grad: 151.4070  LR: 0.00000881  
Epoch: [9][300/2860] Elapsed 1m 9s (remain 9m 52s) Loss: 0.0008(0.0039) Grad: 12325.0156  LR: 0.00000874  
Epoch: [9][400/2860] Elapsed 1m 32s (remain 9m 30s) Loss: 0.0000(0.0034) Grad: 9.3104  LR: 0.00000866  
Epoch: [9][500/2860] Elapsed 1m 55s (remain 9m 6s) Loss: 0.0005(0.0033) Grad: 10624.9658  LR: 0.00000859  
Epoch: [9][600/2860] Elapsed 2m 19s (remain 8m 43s) Loss: 0.0003(0.0035) Grad: 19088.4219  LR: 0.00000852  
Epoch: [9][700/2860] Elapsed 2m 42s (remain 8m 19s) Loss: 0.0384(0.0036) Grad: 760624.6250  LR: 0.00000845  
Epoch: [9][800/2860] Elapsed 3m 5s (remain 7m 56s) Loss: 0.0001(0.0035) Grad: 3682.2876  LR: 0.00000837  
Epoch: [9][900/2860] Elapsed 3m 28s (rem

Epoch 9 - avg_train_loss: 0.0039  avg_val_loss: 0.0237  time: 733s
Epoch 9 - Score: 0.8625, Th: 0.3
Epoch 9 - Save Best Score: 0.8625 Model


Epoch: [10][0/2860] Elapsed 0m 0s (remain 18m 18s) Loss: 0.0000(0.0000) Grad: 32.6821  LR: 0.00000691  
Epoch: [10][100/2860] Elapsed 0m 23s (remain 10m 41s) Loss: 0.0020(0.0026) Grad: 115515.4297  LR: 0.00000684  
Epoch: [10][200/2860] Elapsed 0m 46s (remain 10m 16s) Loss: 0.0126(0.0024) Grad: 69638.3438  LR: 0.00000677  
Epoch: [10][300/2860] Elapsed 1m 9s (remain 9m 53s) Loss: 0.0000(0.0028) Grad: 12.3155  LR: 0.00000670  
Epoch: [10][400/2860] Elapsed 1m 33s (remain 9m 30s) Loss: 0.0000(0.0030) Grad: 156.7170  LR: 0.00000663  
Epoch: [10][500/2860] Elapsed 1m 56s (remain 9m 6s) Loss: 0.0000(0.0032) Grad: 13.4462  LR: 0.00000656  
Epoch: [10][600/2860] Elapsed 2m 19s (remain 8m 43s) Loss: 0.0000(0.0032) Grad: 132.3127  LR: 0.00000649  
Epoch: [10][700/2860] Elapsed 2m 42s (remain 8m 19s) Loss: 0.0057(0.0030) Grad: 14975.2568  LR: 0.00000643  
Epoch: [10][800/2860] Elapsed 3m 5s (remain 7m 56s) Loss: 0.0000(0.0029) Grad: 18.1576  LR: 0.00000636  
Epoch: [10][900/2860] Elapsed 3m 28s 

Epoch 10 - avg_train_loss: 0.0032  avg_val_loss: 0.0275  time: 733s
Epoch 10 - Score: 0.8635, Th: 0.6
Epoch 10 - Save Best Score: 0.8635 Model


Epoch: [11][0/2860] Elapsed 0m 0s (remain 18m 33s) Loss: 0.0000(0.0000) Grad: 242.9830  LR: 0.00000500  
Epoch: [11][100/2860] Elapsed 0m 23s (remain 10m 36s) Loss: 0.0000(0.0014) Grad: 2201.7168  LR: 0.00000494  
Epoch: [11][200/2860] Elapsed 0m 46s (remain 10m 12s) Loss: 0.0000(0.0022) Grad: 235.2699  LR: 0.00000487  
Epoch: [11][300/2860] Elapsed 1m 9s (remain 9m 50s) Loss: 0.0000(0.0020) Grad: 15.1661  LR: 0.00000481  
Epoch: [11][400/2860] Elapsed 1m 32s (remain 9m 27s) Loss: 0.0000(0.0023) Grad: 13.2956  LR: 0.00000475  
Epoch: [11][500/2860] Elapsed 1m 55s (remain 9m 3s) Loss: 0.0000(0.0024) Grad: 9.8248  LR: 0.00000469  
Epoch: [11][600/2860] Elapsed 2m 18s (remain 8m 40s) Loss: 0.0000(0.0023) Grad: 48.6894  LR: 0.00000462  
Epoch: [11][700/2860] Elapsed 2m 41s (remain 8m 17s) Loss: 0.0000(0.0024) Grad: 13.4031  LR: 0.00000456  
Epoch: [11][800/2860] Elapsed 3m 4s (remain 7m 55s) Loss: 0.0000(0.0023) Grad: 14.7318  LR: 0.00000450  
Epoch: [11][900/2860] Elapsed 3m 27s (remain 7

Epoch 11 - avg_train_loss: 0.0024  avg_val_loss: 0.0307  time: 731s
Epoch 11 - Score: 0.8624, Th: 0.4


Epoch: [12][0/2860] Elapsed 0m 0s (remain 18m 26s) Loss: 0.0000(0.0000) Grad: 5.4723  LR: 0.00000331  
Epoch: [12][100/2860] Elapsed 0m 23s (remain 10m 39s) Loss: 0.0000(0.0010) Grad: 13.2267  LR: 0.00000325  
Epoch: [12][200/2860] Elapsed 0m 46s (remain 10m 15s) Loss: 0.0000(0.0016) Grad: 13.1563  LR: 0.00000320  
Epoch: [12][300/2860] Elapsed 1m 9s (remain 9m 51s) Loss: 0.0000(0.0017) Grad: 19.0779  LR: 0.00000315  
Epoch: [12][400/2860] Elapsed 1m 32s (remain 9m 27s) Loss: 0.0004(0.0019) Grad: 7010.5835  LR: 0.00000309  
Epoch: [12][500/2860] Elapsed 1m 55s (remain 9m 4s) Loss: 0.0000(0.0020) Grad: 37.9340  LR: 0.00000304  
Epoch: [12][600/2860] Elapsed 2m 18s (remain 8m 41s) Loss: 0.0000(0.0020) Grad: 8.2190  LR: 0.00000299  
Epoch: [12][700/2860] Elapsed 2m 41s (remain 8m 18s) Loss: 0.0000(0.0021) Grad: 1.7715  LR: 0.00000294  
Epoch: [12][800/2860] Elapsed 3m 5s (remain 7m 55s) Loss: 0.0000(0.0021) Grad: 90.5468  LR: 0.00000288  
Epoch: [12][900/2860] Elapsed 3m 28s (remain 7m 32

Epoch 12 - avg_train_loss: 0.0021  avg_val_loss: 0.0315  time: 731s
Epoch 12 - Score: 0.8608, Th: 0.3


Epoch: [13][0/2860] Elapsed 0m 0s (remain 18m 10s) Loss: 0.0000(0.0000) Grad: 23.4444  LR: 0.00000191  
Epoch: [13][100/2860] Elapsed 0m 23s (remain 10m 36s) Loss: 0.0016(0.0019) Grad: 94676.8828  LR: 0.00000187  
Epoch: [13][200/2860] Elapsed 0m 46s (remain 10m 14s) Loss: 0.0000(0.0022) Grad: 66.7749  LR: 0.00000182  
Epoch: [13][300/2860] Elapsed 1m 9s (remain 9m 49s) Loss: 0.0000(0.0020) Grad: 54.3868  LR: 0.00000178  
Epoch: [13][400/2860] Elapsed 1m 32s (remain 9m 25s) Loss: 0.0000(0.0020) Grad: 17.9701  LR: 0.00000174  
Epoch: [13][500/2860] Elapsed 1m 55s (remain 9m 2s) Loss: 0.0000(0.0019) Grad: 4.7382  LR: 0.00000170  
Epoch: [13][600/2860] Elapsed 2m 18s (remain 8m 39s) Loss: 0.0000(0.0018) Grad: 2.2758  LR: 0.00000166  
Epoch: [13][700/2860] Elapsed 2m 41s (remain 8m 16s) Loss: 0.0000(0.0017) Grad: 33.3183  LR: 0.00000162  
Epoch: [13][800/2860] Elapsed 3m 4s (remain 7m 53s) Loss: 0.0083(0.0016) Grad: 96917.8125  LR: 0.00000158  
Epoch: [13][900/2860] Elapsed 3m 27s (remain 

Epoch 13 - avg_train_loss: 0.0018  avg_val_loss: 0.0315  time: 729s
Epoch 13 - Score: 0.8606, Th: 0.5


Epoch: [14][0/2860] Elapsed 0m 0s (remain 18m 11s) Loss: 0.0000(0.0000) Grad: 92.4497  LR: 0.00000086  
Epoch: [14][100/2860] Elapsed 0m 23s (remain 10m 42s) Loss: 0.0000(0.0020) Grad: 1407.7843  LR: 0.00000083  
Epoch: [14][200/2860] Elapsed 0m 46s (remain 10m 16s) Loss: 0.0000(0.0022) Grad: 87.8685  LR: 0.00000081  
Epoch: [14][300/2860] Elapsed 1m 9s (remain 9m 51s) Loss: 0.0000(0.0018) Grad: 9.9329  LR: 0.00000078  
Epoch: [14][400/2860] Elapsed 1m 32s (remain 9m 27s) Loss: 0.0000(0.0017) Grad: 4.3608  LR: 0.00000075  
Epoch: [14][500/2860] Elapsed 1m 55s (remain 9m 4s) Loss: 0.0000(0.0017) Grad: 17.0197  LR: 0.00000072  
Epoch: [14][600/2860] Elapsed 2m 18s (remain 8m 40s) Loss: 0.0000(0.0017) Grad: 8.7665  LR: 0.00000069  
Epoch: [14][700/2860] Elapsed 2m 41s (remain 8m 18s) Loss: 0.0000(0.0017) Grad: 47.7777  LR: 0.00000067  
Epoch: [14][800/2860] Elapsed 3m 4s (remain 7m 55s) Loss: 0.0010(0.0016) Grad: 75120.1484  LR: 0.00000064  
Epoch: [14][900/2860] Elapsed 3m 27s (remain 7m

Epoch 14 - avg_train_loss: 0.0015  avg_val_loss: 0.0318  time: 730s
Epoch 14 - Score: 0.8583, Th: 0.4


Epoch: [15][0/2860] Elapsed 0m 0s (remain 18m 27s) Loss: 0.0000(0.0000) Grad: 7.1820  LR: 0.00000022  
Epoch: [15][100/2860] Elapsed 0m 23s (remain 10m 42s) Loss: 0.0002(0.0018) Grad: 6551.6060  LR: 0.00000020  
Epoch: [15][200/2860] Elapsed 0m 46s (remain 10m 14s) Loss: 0.0000(0.0017) Grad: 276.2313  LR: 0.00000019  
Epoch: [15][300/2860] Elapsed 1m 9s (remain 9m 51s) Loss: 0.0000(0.0015) Grad: 51.2626  LR: 0.00000018  
Epoch: [15][400/2860] Elapsed 1m 32s (remain 9m 27s) Loss: 0.0000(0.0014) Grad: 23.1612  LR: 0.00000016  
Epoch: [15][500/2860] Elapsed 1m 55s (remain 9m 4s) Loss: 0.0000(0.0014) Grad: 24.1890  LR: 0.00000015  
Epoch: [15][600/2860] Elapsed 2m 18s (remain 8m 41s) Loss: 0.0000(0.0014) Grad: 6.0577  LR: 0.00000014  
Epoch: [15][700/2860] Elapsed 2m 41s (remain 8m 17s) Loss: 0.0123(0.0015) Grad: 176011.4688  LR: 0.00000012  
Epoch: [15][800/2860] Elapsed 3m 4s (remain 7m 54s) Loss: 0.0000(0.0015) Grad: 29.1845  LR: 0.00000011  
Epoch: [15][900/2860] Elapsed 3m 27s (remain

Epoch 15 - avg_train_loss: 0.0016  avg_val_loss: 0.0320  time: 730s
Epoch 15 - Score: 0.8582, Th: 0.4
Score: 0.8635, Th: 0.6
Some weights of the model checkpoint at microsoft/deberta-base were not used when initializing DebertaModel: ['lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.dense.bias']
- This IS expected if you are initializing DebertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Epoch: [1][0/2860] Elapsed 0m 0s (remain 18m 50s) Loss: 0.7209(0.7209) Grad: 177294.7031  LR: 0.00002000  
Epoch: [1][100/2860] Elapsed 0m 23s (remain 10m 45s) Loss: 0.1008(0.2057) Grad: 27369.6250  LR: 0.00002000  
Epoch: [1][200/2860] Elapsed 0m 47s (remain 10m 22s) Loss: 0.0472(0.1442) Grad: 16114.9004  LR: 0.00002000  
Epoch: [1][300/2860] Elapsed 1m 10s (remain 9m 59s) Loss: 0.0576(0.1210) Grad: 14991.3350  LR: 0.00002000  
Epoch: [1][400/2860] Elapsed 1m 34s (remain 9m 37s) Loss: 0.0140(0.1038) Grad: 16673.4531  LR: 0.00002000  
Epoch: [1][500/2860] Elapsed 1m 57s (remain 9m 15s) Loss: 0.0877(0.0913) Grad: 32232.2520  LR: 0.00001999  
Epoch: [1][600/2860] Elapsed 2m 21s (remain 8m 51s) Loss: 0.0233(0.0814) Grad: 27358.8828  LR: 0.00001999  
Epoch: [1][700/2860] Elapsed 2m 45s (remain 8m 28s) Loss: 0.0169(0.0747) Grad: 55400.8828  LR: 0.00001999  
Epoch: [1][800/2860] Elapsed 3m 8s (remain 8m 4s) Loss: 0.0215(0.0699) Grad: 40208.4102  LR: 0.00001998  
Epoch: [1][900/2860] Elapsed 

Epoch 1 - avg_train_loss: 0.0355  avg_val_loss: 0.0171  time: 742s
Epoch 1 - Score: 0.8139, Th: 0.5
Epoch 1 - Save Best Score: 0.8139 Model


Epoch: [2][0/2860] Elapsed 0m 0s (remain 19m 28s) Loss: 0.0097(0.0097) Grad: 26750.9023  LR: 0.00001978  
Epoch: [2][100/2860] Elapsed 0m 23s (remain 10m 46s) Loss: 0.0114(0.0197) Grad: 10506.1445  LR: 0.00001977  
Epoch: [2][200/2860] Elapsed 0m 47s (remain 10m 21s) Loss: 0.0714(0.0178) Grad: 179397.5781  LR: 0.00001975  
Epoch: [2][300/2860] Elapsed 1m 10s (remain 9m 57s) Loss: 0.0019(0.0181) Grad: 8240.9932  LR: 0.00001973  
Epoch: [2][400/2860] Elapsed 1m 33s (remain 9m 34s) Loss: 0.0112(0.0196) Grad: 34537.2109  LR: 0.00001972  
Epoch: [2][500/2860] Elapsed 1m 57s (remain 9m 11s) Loss: 0.0116(0.0187) Grad: 18673.0352  LR: 0.00001970  
Epoch: [2][600/2860] Elapsed 2m 20s (remain 8m 48s) Loss: 0.0031(0.0185) Grad: 5695.2188  LR: 0.00001968  
Epoch: [2][700/2860] Elapsed 2m 43s (remain 8m 24s) Loss: 0.0010(0.0180) Grad: 6349.8071  LR: 0.00001966  
Epoch: [2][800/2860] Elapsed 3m 7s (remain 8m 1s) Loss: 0.0207(0.0179) Grad: 27802.4434  LR: 0.00001964  
Epoch: [2][900/2860] Elapsed 3m 

Epoch 2 - avg_train_loss: 0.0162  avg_val_loss: 0.0193  time: 739s
Epoch 2 - Score: 0.8386, Th: 0.3
Epoch 2 - Save Best Score: 0.8386 Model


Epoch: [3][0/2860] Elapsed 0m 0s (remain 19m 9s) Loss: 0.0223(0.0223) Grad: 230719.8594  LR: 0.00001914  
Epoch: [3][100/2860] Elapsed 0m 23s (remain 10m 44s) Loss: 0.0019(0.0109) Grad: 10059.7891  LR: 0.00001911  
Epoch: [3][200/2860] Elapsed 0m 46s (remain 10m 20s) Loss: 0.0168(0.0123) Grad: 107167.7578  LR: 0.00001907  
Epoch: [3][300/2860] Elapsed 1m 10s (remain 9m 56s) Loss: 0.0021(0.0114) Grad: 28724.8379  LR: 0.00001904  
Epoch: [3][400/2860] Elapsed 1m 33s (remain 9m 34s) Loss: 0.0003(0.0115) Grad: 4587.4395  LR: 0.00001901  
Epoch: [3][500/2860] Elapsed 1m 56s (remain 9m 10s) Loss: 0.0426(0.0124) Grad: 98124.1875  LR: 0.00001898  
Epoch: [3][600/2860] Elapsed 2m 20s (remain 8m 47s) Loss: 0.0143(0.0127) Grad: 14236.0039  LR: 0.00001895  
Epoch: [3][700/2860] Elapsed 2m 43s (remain 8m 23s) Loss: 0.0686(0.0136) Grad: 226856.8594  LR: 0.00001891  
Epoch: [3][800/2860] Elapsed 3m 6s (remain 8m 0s) Loss: 0.0007(0.0137) Grad: 12805.9688  LR: 0.00001888  
Epoch: [3][900/2860] Elapsed 

Epoch 3 - avg_train_loss: 0.0130  avg_val_loss: 0.0160  time: 738s
Epoch 3 - Score: 0.8449, Th: 0.4
Epoch 3 - Save Best Score: 0.8449 Model


Epoch: [4][0/2860] Elapsed 0m 0s (remain 18m 59s) Loss: 0.0025(0.0025) Grad: 5810.7310  LR: 0.00001809  
Epoch: [4][100/2860] Elapsed 0m 23s (remain 10m 43s) Loss: 0.0199(0.0084) Grad: 21782.2383  LR: 0.00001805  
Epoch: [4][200/2860] Elapsed 0m 46s (remain 10m 16s) Loss: 0.0167(0.0099) Grad: 51966.4805  LR: 0.00001800  
Epoch: [4][300/2860] Elapsed 1m 10s (remain 9m 56s) Loss: 0.0437(0.0104) Grad: 54176.7500  LR: 0.00001796  
Epoch: [4][400/2860] Elapsed 1m 33s (remain 9m 32s) Loss: 0.0003(0.0100) Grad: 3189.8730  LR: 0.00001791  
Epoch: [4][500/2860] Elapsed 1m 56s (remain 9m 9s) Loss: 0.0000(0.0100) Grad: 41.0260  LR: 0.00001787  
Epoch: [4][600/2860] Elapsed 2m 19s (remain 8m 45s) Loss: 0.0001(0.0102) Grad: 151.0408  LR: 0.00001782  
Epoch: [4][700/2860] Elapsed 2m 43s (remain 8m 22s) Loss: 0.0208(0.0101) Grad: 34347.2539  LR: 0.00001778  
Epoch: [4][800/2860] Elapsed 3m 6s (remain 8m 0s) Loss: 0.0031(0.0098) Grad: 22910.8066  LR: 0.00001773  
Epoch: [4][900/2860] Elapsed 3m 30s (r

Epoch 4 - avg_train_loss: 0.0106  avg_val_loss: 0.0163  time: 737s
Epoch 4 - Score: 0.8407, Th: 0.4


Epoch: [5][0/2860] Elapsed 0m 0s (remain 19m 14s) Loss: 0.0001(0.0001) Grad: 225.4708  LR: 0.00001669  
Epoch: [5][100/2860] Elapsed 0m 23s (remain 10m 46s) Loss: 0.0014(0.0064) Grad: 14495.6816  LR: 0.00001664  
Epoch: [5][200/2860] Elapsed 0m 47s (remain 10m 22s) Loss: 0.0146(0.0077) Grad: 27360.6758  LR: 0.00001658  
Epoch: [5][300/2860] Elapsed 1m 10s (remain 9m 58s) Loss: 0.0001(0.0077) Grad: 688.4828  LR: 0.00001653  
Epoch: [5][400/2860] Elapsed 1m 33s (remain 9m 33s) Loss: 0.0001(0.0080) Grad: 629.1749  LR: 0.00001647  
Epoch: [5][500/2860] Elapsed 1m 56s (remain 9m 9s) Loss: 0.0325(0.0082) Grad: 60688.3125  LR: 0.00001641  
Epoch: [5][600/2860] Elapsed 2m 19s (remain 8m 46s) Loss: 0.0000(0.0082) Grad: 46.3809  LR: 0.00001636  
Epoch: [5][700/2860] Elapsed 2m 43s (remain 8m 23s) Loss: 0.0031(0.0085) Grad: 31381.0352  LR: 0.00001630  
Epoch: [5][800/2860] Elapsed 3m 6s (remain 7m 59s) Loss: 0.0001(0.0085) Grad: 295.9506  LR: 0.00001624  
Epoch: [5][900/2860] Elapsed 3m 29s (rema

Epoch 5 - avg_train_loss: 0.0086  avg_val_loss: 0.0184  time: 736s
Epoch 5 - Score: 0.8647, Th: 0.6
Epoch 5 - Save Best Score: 0.8647 Model


Epoch: [6][0/2860] Elapsed 0m 0s (remain 19m 12s) Loss: 0.0001(0.0001) Grad: 447.6857  LR: 0.00001500  
Epoch: [6][100/2860] Elapsed 0m 23s (remain 10m 43s) Loss: 0.0000(0.0064) Grad: 33.0593  LR: 0.00001494  
Epoch: [6][200/2860] Elapsed 0m 46s (remain 10m 19s) Loss: 0.0033(0.0062) Grad: 128195.9297  LR: 0.00001487  
Epoch: [6][300/2860] Elapsed 1m 10s (remain 9m 55s) Loss: 0.0003(0.0065) Grad: 4424.8179  LR: 0.00001481  
Epoch: [6][400/2860] Elapsed 1m 33s (remain 9m 31s) Loss: 0.0033(0.0068) Grad: 68887.2422  LR: 0.00001474  
Epoch: [6][500/2860] Elapsed 1m 56s (remain 9m 8s) Loss: 0.0020(0.0067) Grad: 10541.4141  LR: 0.00001468  
Epoch: [6][600/2860] Elapsed 2m 19s (remain 8m 46s) Loss: 0.0008(0.0072) Grad: 26084.2715  LR: 0.00001461  
Epoch: [6][700/2860] Elapsed 2m 43s (remain 8m 23s) Loss: 0.0002(0.0071) Grad: 2850.3135  LR: 0.00001455  
Epoch: [6][800/2860] Elapsed 3m 6s (remain 7m 59s) Loss: 0.0076(0.0072) Grad: 27863.3828  LR: 0.00001448  
Epoch: [6][900/2860] Elapsed 3m 29s 

Epoch 6 - avg_train_loss: 0.0072  avg_val_loss: 0.0195  time: 735s
Epoch 6 - Score: 0.8691, Th: 0.8
Epoch 6 - Save Best Score: 0.8691 Model


Epoch: [7][0/2860] Elapsed 0m 0s (remain 19m 0s) Loss: 0.0000(0.0000) Grad: 333.6046  LR: 0.00001309  
Epoch: [7][100/2860] Elapsed 0m 23s (remain 10m 46s) Loss: 0.0007(0.0064) Grad: 38907.8203  LR: 0.00001302  
Epoch: [7][200/2860] Elapsed 0m 46s (remain 10m 19s) Loss: 0.0002(0.0067) Grad: 2055.1404  LR: 0.00001295  
Epoch: [7][300/2860] Elapsed 1m 10s (remain 9m 55s) Loss: 0.0000(0.0060) Grad: 44.4045  LR: 0.00001288  
Epoch: [7][400/2860] Elapsed 1m 33s (remain 9m 31s) Loss: 0.0017(0.0058) Grad: 9249.6846  LR: 0.00001281  
Epoch: [7][500/2860] Elapsed 1m 56s (remain 9m 7s) Loss: 0.0065(0.0056) Grad: 107660.4688  LR: 0.00001274  
Epoch: [7][600/2860] Elapsed 2m 19s (remain 8m 45s) Loss: 0.0070(0.0058) Grad: 35317.2109  LR: 0.00001267  
Epoch: [7][700/2860] Elapsed 2m 42s (remain 8m 21s) Loss: 0.0000(0.0058) Grad: 83.8086  LR: 0.00001260  
Epoch: [7][800/2860] Elapsed 3m 6s (remain 7m 58s) Loss: 0.0000(0.0057) Grad: 112.3415  LR: 0.00001253  
Epoch: [7][900/2860] Elapsed 3m 29s (remai

Epoch 7 - avg_train_loss: 0.0062  avg_val_loss: 0.0209  time: 735s
Epoch 7 - Score: 0.8623, Th: 0.6


Epoch: [8][0/2860] Elapsed 0m 0s (remain 19m 23s) Loss: 0.0000(0.0000) Grad: 476.2170  LR: 0.00001104  
Epoch: [8][100/2860] Elapsed 0m 23s (remain 10m 40s) Loss: 0.0000(0.0040) Grad: 459.7944  LR: 0.00001097  
Epoch: [8][200/2860] Elapsed 0m 46s (remain 10m 15s) Loss: 0.0018(0.0037) Grad: 94215.2109  LR: 0.00001090  
Epoch: [8][300/2860] Elapsed 1m 9s (remain 9m 52s) Loss: 0.0001(0.0042) Grad: 998.2664  LR: 0.00001083  
Epoch: [8][400/2860] Elapsed 1m 32s (remain 9m 28s) Loss: 0.0134(0.0047) Grad: 14515.1641  LR: 0.00001075  
Epoch: [8][500/2860] Elapsed 1m 56s (remain 9m 6s) Loss: 0.0149(0.0048) Grad: 147904.1719  LR: 0.00001068  
Epoch: [8][600/2860] Elapsed 2m 19s (remain 8m 43s) Loss: 0.0076(0.0045) Grad: 10866.5908  LR: 0.00001061  
Epoch: [8][700/2860] Elapsed 2m 42s (remain 8m 20s) Loss: 0.0000(0.0047) Grad: 36.9310  LR: 0.00001053  
Epoch: [8][800/2860] Elapsed 3m 5s (remain 7m 57s) Loss: 0.0075(0.0047) Grad: 65657.9375  LR: 0.00001046  
Epoch: [8][900/2860] Elapsed 3m 28s (re

Epoch 8 - avg_train_loss: 0.0048  avg_val_loss: 0.0226  time: 733s
Epoch 8 - Score: 0.8630, Th: 0.8


Epoch: [9][0/2860] Elapsed 0m 0s (remain 19m 7s) Loss: 0.0000(0.0000) Grad: 32.9894  LR: 0.00000895  
Epoch: [9][100/2860] Elapsed 0m 23s (remain 10m 38s) Loss: 0.0070(0.0021) Grad: 159222.0781  LR: 0.00000888  
Epoch: [9][200/2860] Elapsed 0m 46s (remain 10m 14s) Loss: 0.0000(0.0030) Grad: 18.7339  LR: 0.00000881  
Epoch: [9][300/2860] Elapsed 1m 9s (remain 9m 50s) Loss: 0.0020(0.0028) Grad: 11541.6670  LR: 0.00000874  
Epoch: [9][400/2860] Elapsed 1m 32s (remain 9m 27s) Loss: 0.0000(0.0029) Grad: 332.3193  LR: 0.00000866  
Epoch: [9][500/2860] Elapsed 1m 55s (remain 9m 4s) Loss: 0.0002(0.0028) Grad: 2663.1550  LR: 0.00000859  
Epoch: [9][600/2860] Elapsed 2m 18s (remain 8m 41s) Loss: 0.0000(0.0030) Grad: 51.1986  LR: 0.00000852  
Epoch: [9][700/2860] Elapsed 2m 41s (remain 8m 18s) Loss: 0.0503(0.0029) Grad: 1245815.5000  LR: 0.00000845  
Epoch: [9][800/2860] Elapsed 3m 5s (remain 7m 55s) Loss: 0.0334(0.0030) Grad: 95388.9922  LR: 0.00000837  
Epoch: [9][900/2860] Elapsed 3m 28s (rema

Epoch 9 - avg_train_loss: 0.0038  avg_val_loss: 0.0235  time: 731s
Epoch 9 - Score: 0.8709, Th: 0.6
Epoch 9 - Save Best Score: 0.8709 Model


Epoch: [10][0/2860] Elapsed 0m 0s (remain 19m 11s) Loss: 0.0000(0.0000) Grad: 9.1581  LR: 0.00000691  
Epoch: [10][100/2860] Elapsed 0m 23s (remain 10m 39s) Loss: 0.0000(0.0033) Grad: 47.3571  LR: 0.00000684  
Epoch: [10][200/2860] Elapsed 0m 46s (remain 10m 12s) Loss: 0.0001(0.0030) Grad: 343.4442  LR: 0.00000677  
Epoch: [10][300/2860] Elapsed 1m 9s (remain 9m 48s) Loss: 0.0027(0.0028) Grad: 27613.4336  LR: 0.00000670  
Epoch: [10][400/2860] Elapsed 1m 32s (remain 9m 27s) Loss: 0.0000(0.0032) Grad: 761.7670  LR: 0.00000663  
Epoch: [10][500/2860] Elapsed 1m 55s (remain 9m 3s) Loss: 0.0006(0.0029) Grad: 8351.9482  LR: 0.00000656  
Epoch: [10][600/2860] Elapsed 2m 18s (remain 8m 40s) Loss: 0.0000(0.0030) Grad: 49.9023  LR: 0.00000649  
Epoch: [10][700/2860] Elapsed 2m 41s (remain 8m 17s) Loss: 0.0000(0.0029) Grad: 414.4295  LR: 0.00000643  
Epoch: [10][800/2860] Elapsed 3m 4s (remain 7m 55s) Loss: 0.0129(0.0030) Grad: 129039.0703  LR: 0.00000636  
Epoch: [10][900/2860] Elapsed 3m 28s (

Epoch 10 - avg_train_loss: 0.0032  avg_val_loss: 0.0285  time: 731s
Epoch 10 - Score: 0.8692, Th: 0.6


Epoch: [11][0/2860] Elapsed 0m 0s (remain 18m 23s) Loss: 0.0000(0.0000) Grad: 1.8543  LR: 0.00000500  
Epoch: [11][100/2860] Elapsed 0m 23s (remain 10m 36s) Loss: 0.0000(0.0029) Grad: 20.1165  LR: 0.00000494  
Epoch: [11][200/2860] Elapsed 0m 46s (remain 10m 12s) Loss: 0.0000(0.0025) Grad: 7.7679  LR: 0.00000487  
Epoch: [11][300/2860] Elapsed 1m 9s (remain 9m 49s) Loss: 0.0000(0.0023) Grad: 85.5477  LR: 0.00000481  
Epoch: [11][400/2860] Elapsed 1m 32s (remain 9m 26s) Loss: 0.0000(0.0023) Grad: 29.1909  LR: 0.00000475  
Epoch: [11][500/2860] Elapsed 1m 55s (remain 9m 4s) Loss: 0.0000(0.0024) Grad: 6.1399  LR: 0.00000469  
Epoch: [11][600/2860] Elapsed 2m 18s (remain 8m 41s) Loss: 0.0000(0.0026) Grad: 16.8918  LR: 0.00000462  
Epoch: [11][700/2860] Elapsed 2m 41s (remain 8m 18s) Loss: 0.0098(0.0025) Grad: 55904.8086  LR: 0.00000456  
Epoch: [11][800/2860] Elapsed 3m 5s (remain 7m 56s) Loss: 0.0000(0.0024) Grad: 384.1678  LR: 0.00000450  
Epoch: [11][900/2860] Elapsed 3m 28s (remain 7m 

Epoch 11 - avg_train_loss: 0.0027  avg_val_loss: 0.0284  time: 730s
Epoch 11 - Score: 0.8658, Th: 0.3


Epoch: [12][0/2860] Elapsed 0m 0s (remain 19m 8s) Loss: 0.0000(0.0000) Grad: 123.7990  LR: 0.00000331  
Epoch: [12][100/2860] Elapsed 0m 23s (remain 10m 36s) Loss: 0.0001(0.0018) Grad: 612.1036  LR: 0.00000325  
Epoch: [12][200/2860] Elapsed 0m 46s (remain 10m 12s) Loss: 0.0000(0.0027) Grad: 3.6866  LR: 0.00000320  
Epoch: [12][300/2860] Elapsed 1m 9s (remain 9m 50s) Loss: 0.0000(0.0025) Grad: 4.3848  LR: 0.00000315  
Epoch: [12][400/2860] Elapsed 1m 32s (remain 9m 27s) Loss: 0.0000(0.0024) Grad: 7.4760  LR: 0.00000309  
Epoch: [12][500/2860] Elapsed 1m 55s (remain 9m 3s) Loss: 0.0002(0.0026) Grad: 8025.7266  LR: 0.00000304  
Epoch: [12][600/2860] Elapsed 2m 18s (remain 8m 40s) Loss: 0.0000(0.0027) Grad: 5.5893  LR: 0.00000299  
Epoch: [12][700/2860] Elapsed 2m 41s (remain 8m 18s) Loss: 0.0008(0.0025) Grad: 11483.8994  LR: 0.00000294  
Epoch: [12][800/2860] Elapsed 3m 5s (remain 7m 56s) Loss: 0.0000(0.0024) Grad: 10.0884  LR: 0.00000288  
Epoch: [12][900/2860] Elapsed 3m 28s (remain 7m

Epoch 12 - avg_train_loss: 0.0021  avg_val_loss: 0.0291  time: 731s
Epoch 12 - Score: 0.8704, Th: 0.5


Epoch: [13][0/2860] Elapsed 0m 0s (remain 19m 20s) Loss: 0.0000(0.0000) Grad: 4.1515  LR: 0.00000191  
Epoch: [13][100/2860] Elapsed 0m 23s (remain 10m 37s) Loss: 0.0000(0.0012) Grad: 78.5555  LR: 0.00000187  
Epoch: [13][200/2860] Elapsed 0m 46s (remain 10m 12s) Loss: 0.0000(0.0014) Grad: 6.1307  LR: 0.00000182  
Epoch: [13][300/2860] Elapsed 1m 9s (remain 9m 50s) Loss: 0.0023(0.0017) Grad: 22698.6465  LR: 0.00000178  
Epoch: [13][400/2860] Elapsed 1m 32s (remain 9m 26s) Loss: 0.0000(0.0017) Grad: 5.0995  LR: 0.00000174  
Epoch: [13][500/2860] Elapsed 1m 55s (remain 9m 2s) Loss: 0.0001(0.0018) Grad: 4545.3564  LR: 0.00000170  
Epoch: [13][600/2860] Elapsed 2m 18s (remain 8m 39s) Loss: 0.0102(0.0021) Grad: 102387.2266  LR: 0.00000166  
Epoch: [13][700/2860] Elapsed 2m 41s (remain 8m 17s) Loss: 0.0000(0.0021) Grad: 83.9002  LR: 0.00000162  
Epoch: [13][800/2860] Elapsed 3m 4s (remain 7m 55s) Loss: 0.0000(0.0019) Grad: 5.3550  LR: 0.00000158  
Epoch: [13][900/2860] Elapsed 3m 28s (remain

Epoch 13 - avg_train_loss: 0.0019  avg_val_loss: 0.0292  time: 729s
Epoch 13 - Score: 0.8707, Th: 0.7


Epoch: [14][0/2860] Elapsed 0m 0s (remain 18m 50s) Loss: 0.0000(0.0000) Grad: 1.2787  LR: 0.00000086  
Epoch: [14][100/2860] Elapsed 0m 23s (remain 10m 38s) Loss: 0.0000(0.0011) Grad: 1.3370  LR: 0.00000083  
Epoch: [14][200/2860] Elapsed 0m 46s (remain 10m 14s) Loss: 0.0182(0.0015) Grad: 616314.6875  LR: 0.00000081  
Epoch: [14][300/2860] Elapsed 1m 9s (remain 9m 50s) Loss: 0.0000(0.0015) Grad: 9.2013  LR: 0.00000078  
Epoch: [14][400/2860] Elapsed 1m 32s (remain 9m 27s) Loss: 0.0000(0.0015) Grad: 5.6283  LR: 0.00000075  
Epoch: [14][500/2860] Elapsed 1m 55s (remain 9m 4s) Loss: 0.0000(0.0016) Grad: 487.0037  LR: 0.00000072  
Epoch: [14][600/2860] Elapsed 2m 18s (remain 8m 41s) Loss: 0.0000(0.0016) Grad: 4.3790  LR: 0.00000069  
Epoch: [14][700/2860] Elapsed 2m 42s (remain 8m 19s) Loss: 0.0000(0.0018) Grad: 37.4058  LR: 0.00000067  
Epoch: [14][800/2860] Elapsed 3m 5s (remain 7m 57s) Loss: 0.0000(0.0018) Grad: 123.2911  LR: 0.00000064  
Epoch: [14][900/2860] Elapsed 3m 28s (remain 7m 

Epoch 14 - avg_train_loss: 0.0017  avg_val_loss: 0.0292  time: 729s
Epoch 14 - Score: 0.8702, Th: 0.8


Epoch: [15][0/2860] Elapsed 0m 0s (remain 19m 8s) Loss: 0.0001(0.0001) Grad: 2020.0651  LR: 0.00000022  
Epoch: [15][100/2860] Elapsed 0m 23s (remain 10m 37s) Loss: 0.0000(0.0008) Grad: 115.3217  LR: 0.00000020  
Epoch: [15][200/2860] Elapsed 0m 46s (remain 10m 14s) Loss: 0.0000(0.0010) Grad: 6.7173  LR: 0.00000019  
Epoch: [15][300/2860] Elapsed 1m 9s (remain 9m 49s) Loss: 0.0000(0.0012) Grad: 4.9157  LR: 0.00000018  
Epoch: [15][400/2860] Elapsed 1m 32s (remain 9m 26s) Loss: 0.0000(0.0011) Grad: 1.3102  LR: 0.00000016  
Epoch: [15][500/2860] Elapsed 1m 55s (remain 9m 3s) Loss: 0.0000(0.0012) Grad: 238.7610  LR: 0.00000015  
Epoch: [15][600/2860] Elapsed 2m 18s (remain 8m 40s) Loss: 0.0000(0.0014) Grad: 4.8916  LR: 0.00000014  
Epoch: [15][700/2860] Elapsed 2m 41s (remain 8m 18s) Loss: 0.0000(0.0014) Grad: 29.1280  LR: 0.00000012  
Epoch: [15][800/2860] Elapsed 3m 5s (remain 7m 57s) Loss: 0.0000(0.0013) Grad: 15.6790  LR: 0.00000011  
Epoch: [15][900/2860] Elapsed 3m 28s (remain 7m 33

Epoch 15 - avg_train_loss: 0.0015  avg_val_loss: 0.0293  time: 729s
Epoch 15 - Score: 0.8700, Th: 0.7
Score: 0.8709, Th: 0.6
Some weights of the model checkpoint at microsoft/deberta-base were not used when initializing DebertaModel: ['lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.dense.bias']
- This IS expected if you are initializing DebertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Epoch: [1][0/2860] Elapsed 0m 0s (remain 19m 42s) Loss: 0.6935(0.6935) Grad: 166179.1875  LR: 0.00002000  
Epoch: [1][100/2860] Elapsed 0m 23s (remain 10m 48s) Loss: 0.0627(0.2365) Grad: 59060.5469  LR: 0.00002000  
Epoch: [1][200/2860] Elapsed 0m 47s (remain 10m 23s) Loss: 0.0394(0.1582) Grad: 18814.5586  LR: 0.00002000  
Epoch: [1][300/2860] Elapsed 1m 10s (remain 10m 0s) Loss: 0.0515(0.1314) Grad: 31909.8555  LR: 0.00002000  
Epoch: [1][400/2860] Elapsed 1m 34s (remain 9m 37s) Loss: 0.0807(0.1131) Grad: 93305.4609  LR: 0.00002000  
Epoch: [1][500/2860] Elapsed 1m 57s (remain 9m 14s) Loss: 0.0189(0.1003) Grad: 10844.2812  LR: 0.00001999  
Epoch: [1][600/2860] Elapsed 2m 21s (remain 8m 52s) Loss: 0.0218(0.0895) Grad: 13390.3418  LR: 0.00001999  
Epoch: [1][700/2860] Elapsed 2m 45s (remain 8m 30s) Loss: 0.0121(0.0811) Grad: 13808.0615  LR: 0.00001999  
Epoch: [1][800/2860] Elapsed 3m 9s (remain 8m 6s) Loss: 0.0182(0.0749) Grad: 19721.8867  LR: 0.00001998  
Epoch: [1][900/2860] Elapsed 

Epoch 1 - avg_train_loss: 0.0361  avg_val_loss: 0.0174  time: 741s
Epoch 1 - Score: 0.8066, Th: 0.3
Epoch 1 - Save Best Score: 0.8066 Model


Epoch: [2][0/2860] Elapsed 0m 0s (remain 19m 55s) Loss: 0.0259(0.0259) Grad: 155751.6562  LR: 0.00001978  
Epoch: [2][100/2860] Elapsed 0m 23s (remain 10m 46s) Loss: 0.0115(0.0169) Grad: 44290.0000  LR: 0.00001977  
Epoch: [2][200/2860] Elapsed 0m 46s (remain 10m 20s) Loss: 0.0009(0.0145) Grad: 8377.0342  LR: 0.00001975  
Epoch: [2][300/2860] Elapsed 1m 10s (remain 9m 56s) Loss: 0.0576(0.0154) Grad: 47331.5703  LR: 0.00001973  
Epoch: [2][400/2860] Elapsed 1m 33s (remain 9m 33s) Loss: 0.0092(0.0157) Grad: 3557.2886  LR: 0.00001972  
Epoch: [2][500/2860] Elapsed 1m 57s (remain 9m 11s) Loss: 0.0002(0.0153) Grad: 449.2982  LR: 0.00001970  
Epoch: [2][600/2860] Elapsed 2m 20s (remain 8m 49s) Loss: 0.0068(0.0148) Grad: 35574.5977  LR: 0.00001968  
Epoch: [2][700/2860] Elapsed 2m 44s (remain 8m 26s) Loss: 0.0024(0.0146) Grad: 37230.4141  LR: 0.00001966  
Epoch: [2][800/2860] Elapsed 3m 7s (remain 8m 2s) Loss: 0.0314(0.0145) Grad: 107482.5000  LR: 0.00001964  
Epoch: [2][900/2860] Elapsed 3m 

Epoch 2 - avg_train_loss: 0.0142  avg_val_loss: 0.0147  time: 737s
Epoch 2 - Score: 0.8377, Th: 0.4
Epoch 2 - Save Best Score: 0.8377 Model


Epoch: [3][0/2860] Elapsed 0m 0s (remain 19m 53s) Loss: 0.0171(0.0171) Grad: 47861.1094  LR: 0.00001914  
Epoch: [3][100/2860] Elapsed 0m 23s (remain 10m 45s) Loss: 0.0143(0.0133) Grad: 23674.5039  LR: 0.00001911  
Epoch: [3][200/2860] Elapsed 0m 46s (remain 10m 19s) Loss: 0.0161(0.0131) Grad: 32458.5605  LR: 0.00001907  
Epoch: [3][300/2860] Elapsed 1m 10s (remain 9m 56s) Loss: 0.0009(0.0130) Grad: 7666.3198  LR: 0.00001904  
Epoch: [3][400/2860] Elapsed 1m 33s (remain 9m 33s) Loss: 0.0000(0.0132) Grad: 82.6025  LR: 0.00001901  
Epoch: [3][500/2860] Elapsed 1m 57s (remain 9m 11s) Loss: 0.0153(0.0132) Grad: 33380.9453  LR: 0.00001898  
Epoch: [3][600/2860] Elapsed 2m 20s (remain 8m 49s) Loss: 0.0269(0.0132) Grad: 142372.0312  LR: 0.00001895  
Epoch: [3][700/2860] Elapsed 2m 44s (remain 8m 26s) Loss: 0.0001(0.0129) Grad: 288.5792  LR: 0.00001891  
Epoch: [3][800/2860] Elapsed 3m 7s (remain 8m 2s) Loss: 0.0215(0.0127) Grad: 60634.8906  LR: 0.00001888  
Epoch: [3][900/2860] Elapsed 3m 31s

Epoch 3 - avg_train_loss: 0.0122  avg_val_loss: 0.0170  time: 737s
Epoch 3 - Score: 0.8439, Th: 0.4
Epoch 3 - Save Best Score: 0.8439 Model


Epoch: [4][0/2860] Elapsed 0m 0s (remain 20m 10s) Loss: 0.0013(0.0013) Grad: 12337.6660  LR: 0.00001809  
Epoch: [4][100/2860] Elapsed 0m 23s (remain 10m 43s) Loss: 0.0153(0.0076) Grad: 15345.0010  LR: 0.00001805  
Epoch: [4][200/2860] Elapsed 0m 46s (remain 10m 18s) Loss: 0.0027(0.0086) Grad: 14045.6016  LR: 0.00001800  
Epoch: [4][300/2860] Elapsed 1m 10s (remain 9m 56s) Loss: 0.0002(0.0091) Grad: 626.4117  LR: 0.00001796  
Epoch: [4][400/2860] Elapsed 1m 33s (remain 9m 35s) Loss: 0.1317(0.0093) Grad: 418929.0000  LR: 0.00001791  
Epoch: [4][500/2860] Elapsed 1m 57s (remain 9m 11s) Loss: 0.0343(0.0094) Grad: 138124.7969  LR: 0.00001787  
Epoch: [4][600/2860] Elapsed 2m 20s (remain 8m 49s) Loss: 0.0017(0.0095) Grad: 13758.1709  LR: 0.00001782  
Epoch: [4][700/2860] Elapsed 2m 44s (remain 8m 26s) Loss: 0.0001(0.0097) Grad: 169.6564  LR: 0.00001778  
Epoch: [4][800/2860] Elapsed 3m 7s (remain 8m 2s) Loss: 0.0113(0.0097) Grad: 16603.6094  LR: 0.00001773  
Epoch: [4][900/2860] Elapsed 3m 

Epoch 4 - avg_train_loss: 0.0101  avg_val_loss: 0.0245  time: 736s
Epoch 4 - Score: 0.8130, Th: 0.2


Epoch: [5][0/2860] Elapsed 0m 0s (remain 20m 21s) Loss: 0.0046(0.0046) Grad: 6794.2183  LR: 0.00001669  
Epoch: [5][100/2860] Elapsed 0m 23s (remain 10m 44s) Loss: 0.0000(0.0069) Grad: 64.5003  LR: 0.00001664  
Epoch: [5][200/2860] Elapsed 0m 46s (remain 10m 18s) Loss: 0.0003(0.0084) Grad: 10958.1777  LR: 0.00001658  
Epoch: [5][300/2860] Elapsed 1m 10s (remain 9m 57s) Loss: 0.0049(0.0079) Grad: 26291.2227  LR: 0.00001653  
Epoch: [5][400/2860] Elapsed 1m 33s (remain 9m 34s) Loss: 0.0068(0.0083) Grad: 34707.3125  LR: 0.00001647  
Epoch: [5][500/2860] Elapsed 1m 57s (remain 9m 11s) Loss: 0.0035(0.0082) Grad: 6663.1851  LR: 0.00001641  
Epoch: [5][600/2860] Elapsed 2m 20s (remain 8m 49s) Loss: 0.0000(0.0086) Grad: 93.4201  LR: 0.00001636  
Epoch: [5][700/2860] Elapsed 2m 44s (remain 8m 25s) Loss: 0.0003(0.0083) Grad: 9957.7090  LR: 0.00001630  
Epoch: [5][800/2860] Elapsed 3m 7s (remain 8m 2s) Loss: 0.0079(0.0088) Grad: 81801.9375  LR: 0.00001624  
Epoch: [5][900/2860] Elapsed 3m 30s (re

Epoch 5 - avg_train_loss: 0.0086  avg_val_loss: 0.0199  time: 735s
Epoch 5 - Score: 0.8562, Th: 0.4
Epoch 5 - Save Best Score: 0.8562 Model


Epoch: [6][0/2860] Elapsed 0m 0s (remain 20m 8s) Loss: 0.0000(0.0000) Grad: 37.2227  LR: 0.00001500  
Epoch: [6][100/2860] Elapsed 0m 23s (remain 10m 44s) Loss: 0.0096(0.0072) Grad: 81274.2656  LR: 0.00001494  
Epoch: [6][200/2860] Elapsed 0m 46s (remain 10m 20s) Loss: 0.0017(0.0069) Grad: 23957.6797  LR: 0.00001487  
Epoch: [6][300/2860] Elapsed 1m 10s (remain 9m 56s) Loss: 0.0642(0.0070) Grad: 621895.8750  LR: 0.00001481  
Epoch: [6][400/2860] Elapsed 1m 33s (remain 9m 33s) Loss: 0.0001(0.0070) Grad: 559.6129  LR: 0.00001474  
Epoch: [6][500/2860] Elapsed 1m 57s (remain 9m 12s) Loss: 0.0186(0.0067) Grad: 76571.0938  LR: 0.00001468  
Epoch: [6][600/2860] Elapsed 2m 20s (remain 8m 48s) Loss: 0.0029(0.0067) Grad: 59722.4805  LR: 0.00001461  
Epoch: [6][700/2860] Elapsed 2m 44s (remain 8m 25s) Loss: 0.0000(0.0066) Grad: 119.7721  LR: 0.00001455  
Epoch: [6][800/2860] Elapsed 3m 7s (remain 8m 1s) Loss: 0.0219(0.0069) Grad: 54857.4297  LR: 0.00001448  
Epoch: [6][900/2860] Elapsed 3m 30s (

Epoch 6 - avg_train_loss: 0.0069  avg_val_loss: 0.0251  time: 734s
Epoch 6 - Score: 0.8603, Th: 0.8
Epoch 6 - Save Best Score: 0.8603 Model


Epoch: [7][0/2860] Elapsed 0m 0s (remain 19m 43s) Loss: 0.0222(0.0222) Grad: 327428.3438  LR: 0.00001309  
Epoch: [7][100/2860] Elapsed 0m 23s (remain 10m 41s) Loss: 0.0000(0.0068) Grad: 62.0567  LR: 0.00001302  
Epoch: [7][200/2860] Elapsed 0m 46s (remain 10m 18s) Loss: 0.0000(0.0054) Grad: 70.9763  LR: 0.00001295  
Epoch: [7][300/2860] Elapsed 1m 10s (remain 9m 56s) Loss: 0.0003(0.0057) Grad: 2544.8696  LR: 0.00001288  
Epoch: [7][400/2860] Elapsed 1m 33s (remain 9m 34s) Loss: 0.0000(0.0056) Grad: 67.7267  LR: 0.00001281  
Epoch: [7][500/2860] Elapsed 1m 56s (remain 9m 10s) Loss: 0.0325(0.0058) Grad: 10697.0068  LR: 0.00001274  
Epoch: [7][600/2860] Elapsed 2m 20s (remain 8m 47s) Loss: 0.0012(0.0057) Grad: 5908.2700  LR: 0.00001267  
Epoch: [7][700/2860] Elapsed 2m 43s (remain 8m 23s) Loss: 0.0000(0.0056) Grad: 16.1644  LR: 0.00001260  
Epoch: [7][800/2860] Elapsed 3m 6s (remain 8m 0s) Loss: 0.0050(0.0056) Grad: 25596.8145  LR: 0.00001253  
Epoch: [7][900/2860] Elapsed 3m 30s (remain

Epoch 7 - avg_train_loss: 0.0056  avg_val_loss: 0.0246  time: 734s
Epoch 7 - Score: 0.8578, Th: 0.4


Epoch: [8][0/2860] Elapsed 0m 0s (remain 20m 4s) Loss: 0.0033(0.0033) Grad: 8736.7031  LR: 0.00001104  
Epoch: [8][100/2860] Elapsed 0m 23s (remain 10m 48s) Loss: 0.0000(0.0037) Grad: 3.6677  LR: 0.00001097  
Epoch: [8][200/2860] Elapsed 0m 46s (remain 10m 21s) Loss: 0.0000(0.0035) Grad: 57.6137  LR: 0.00001090  
Epoch: [8][300/2860] Elapsed 1m 10s (remain 9m 57s) Loss: 0.0000(0.0038) Grad: 47.3242  LR: 0.00001083  
Epoch: [8][400/2860] Elapsed 1m 33s (remain 9m 35s) Loss: 0.0000(0.0037) Grad: 610.2460  LR: 0.00001075  
Epoch: [8][500/2860] Elapsed 1m 57s (remain 9m 11s) Loss: 0.0247(0.0041) Grad: 101855.6719  LR: 0.00001068  
Epoch: [8][600/2860] Elapsed 2m 20s (remain 8m 47s) Loss: 0.0048(0.0041) Grad: 10432.9648  LR: 0.00001061  
Epoch: [8][700/2860] Elapsed 2m 43s (remain 8m 24s) Loss: 0.0006(0.0043) Grad: 9076.8281  LR: 0.00001053  
Epoch: [8][800/2860] Elapsed 3m 6s (remain 7m 59s) Loss: 0.0001(0.0043) Grad: 1063.0195  LR: 0.00001046  
Epoch: [8][900/2860] Elapsed 3m 29s (remain 

Epoch 8 - avg_train_loss: 0.0048  avg_val_loss: 0.0252  time: 732s
Epoch 8 - Score: 0.8535, Th: 0.8


Epoch: [9][0/2860] Elapsed 0m 0s (remain 19m 52s) Loss: 0.0000(0.0000) Grad: 109.5571  LR: 0.00000895  
Epoch: [9][100/2860] Elapsed 0m 23s (remain 10m 49s) Loss: 0.0000(0.0031) Grad: 84.8771  LR: 0.00000888  
Epoch: [9][200/2860] Elapsed 0m 46s (remain 10m 20s) Loss: 0.0000(0.0036) Grad: 19.5865  LR: 0.00000881  
Epoch: [9][300/2860] Elapsed 1m 10s (remain 9m 58s) Loss: 0.0000(0.0034) Grad: 43.8203  LR: 0.00000874  
Epoch: [9][400/2860] Elapsed 1m 33s (remain 9m 34s) Loss: 0.0000(0.0040) Grad: 10.0818  LR: 0.00000866  
Epoch: [9][500/2860] Elapsed 1m 56s (remain 9m 10s) Loss: 0.0032(0.0040) Grad: 4322.5112  LR: 0.00000859  
Epoch: [9][600/2860] Elapsed 2m 20s (remain 8m 47s) Loss: 0.0004(0.0039) Grad: 4097.6792  LR: 0.00000852  
Epoch: [9][700/2860] Elapsed 2m 43s (remain 8m 23s) Loss: 0.0099(0.0041) Grad: 30512.3066  LR: 0.00000845  
Epoch: [9][800/2860] Elapsed 3m 6s (remain 7m 59s) Loss: 0.0250(0.0040) Grad: 57889.0195  LR: 0.00000837  
Epoch: [9][900/2860] Elapsed 3m 29s (remain 7

Epoch 9 - avg_train_loss: 0.0040  avg_val_loss: 0.0301  time: 732s
Epoch 9 - Score: 0.8577, Th: 0.7


Epoch: [10][0/2860] Elapsed 0m 0s (remain 19m 51s) Loss: 0.0000(0.0000) Grad: 165.6911  LR: 0.00000691  
Epoch: [10][100/2860] Elapsed 0m 23s (remain 10m 48s) Loss: 0.0000(0.0022) Grad: 95.5330  LR: 0.00000684  
Epoch: [10][200/2860] Elapsed 0m 47s (remain 10m 23s) Loss: 0.0006(0.0025) Grad: 19519.7324  LR: 0.00000677  
Epoch: [10][300/2860] Elapsed 1m 10s (remain 10m 0s) Loss: 0.0000(0.0023) Grad: 348.4499  LR: 0.00000670  
Epoch: [10][400/2860] Elapsed 1m 33s (remain 9m 34s) Loss: 0.0000(0.0026) Grad: 16.2244  LR: 0.00000663  
Epoch: [10][500/2860] Elapsed 1m 57s (remain 9m 11s) Loss: 0.0001(0.0027) Grad: 311.5181  LR: 0.00000656  
Epoch: [10][600/2860] Elapsed 2m 20s (remain 8m 46s) Loss: 0.0009(0.0025) Grad: 20373.8477  LR: 0.00000649  
Epoch: [10][700/2860] Elapsed 2m 43s (remain 8m 22s) Loss: 0.0000(0.0026) Grad: 25.7200  LR: 0.00000643  
Epoch: [10][800/2860] Elapsed 3m 6s (remain 7m 58s) Loss: 0.0023(0.0025) Grad: 89582.1172  LR: 0.00000636  
Epoch: [10][900/2860] Elapsed 3m 29

Epoch 10 - avg_train_loss: 0.0029  avg_val_loss: 0.0326  time: 730s
Epoch 10 - Score: 0.8561, Th: 0.6


Epoch: [11][0/2860] Elapsed 0m 0s (remain 19m 29s) Loss: 0.0000(0.0000) Grad: 86.0277  LR: 0.00000500  
Epoch: [11][100/2860] Elapsed 0m 23s (remain 10m 43s) Loss: 0.0000(0.0022) Grad: 152.2357  LR: 0.00000494  
Epoch: [11][200/2860] Elapsed 0m 46s (remain 10m 19s) Loss: 0.0000(0.0024) Grad: 13.9080  LR: 0.00000487  
Epoch: [11][300/2860] Elapsed 1m 10s (remain 9m 56s) Loss: 0.0000(0.0022) Grad: 3.2522  LR: 0.00000481  
Epoch: [11][400/2860] Elapsed 1m 33s (remain 9m 32s) Loss: 0.0000(0.0022) Grad: 23.3210  LR: 0.00000475  
Epoch: [11][500/2860] Elapsed 1m 56s (remain 9m 8s) Loss: 0.0041(0.0024) Grad: 9558.4697  LR: 0.00000469  
Epoch: [11][600/2860] Elapsed 2m 19s (remain 8m 44s) Loss: 0.0000(0.0024) Grad: 45.2911  LR: 0.00000462  
Epoch: [11][700/2860] Elapsed 2m 42s (remain 8m 20s) Loss: 0.0000(0.0024) Grad: 28.0275  LR: 0.00000456  
Epoch: [11][800/2860] Elapsed 3m 5s (remain 7m 56s) Loss: 0.0000(0.0023) Grad: 53.3553  LR: 0.00000450  
Epoch: [11][900/2860] Elapsed 3m 28s (remain 7

Epoch 11 - avg_train_loss: 0.0024  avg_val_loss: 0.0325  time: 730s
Epoch 11 - Score: 0.8570, Th: 0.2


Epoch: [12][0/2860] Elapsed 0m 0s (remain 20m 1s) Loss: 0.0000(0.0000) Grad: 19.1886  LR: 0.00000331  
Epoch: [12][100/2860] Elapsed 0m 23s (remain 10m 46s) Loss: 0.0000(0.0028) Grad: 23.2661  LR: 0.00000325  
Epoch: [12][200/2860] Elapsed 0m 47s (remain 10m 24s) Loss: 0.0000(0.0024) Grad: 43.3163  LR: 0.00000320  
Epoch: [12][300/2860] Elapsed 1m 10s (remain 9m 57s) Loss: 0.0000(0.0028) Grad: 18.5521  LR: 0.00000315  
Epoch: [12][400/2860] Elapsed 1m 33s (remain 9m 33s) Loss: 0.0000(0.0028) Grad: 93.6470  LR: 0.00000309  
Epoch: [12][500/2860] Elapsed 1m 56s (remain 9m 8s) Loss: 0.0286(0.0026) Grad: 20125.3594  LR: 0.00000304  
Epoch: [12][600/2860] Elapsed 2m 19s (remain 8m 44s) Loss: 0.0000(0.0026) Grad: 368.4036  LR: 0.00000299  
Epoch: [12][700/2860] Elapsed 2m 42s (remain 8m 20s) Loss: 0.0275(0.0025) Grad: 92621.1250  LR: 0.00000294  
Epoch: [12][800/2860] Elapsed 3m 5s (remain 7m 56s) Loss: 0.0000(0.0024) Grad: 20.0614  LR: 0.00000288  
Epoch: [12][900/2860] Elapsed 3m 28s (rema

Epoch 12 - avg_train_loss: 0.0018  avg_val_loss: 0.0332  time: 729s
Epoch 12 - Score: 0.8583, Th: 0.7


Epoch: [13][0/2860] Elapsed 0m 0s (remain 19m 6s) Loss: 0.0000(0.0000) Grad: 22.7191  LR: 0.00000191  
Epoch: [13][100/2860] Elapsed 0m 23s (remain 10m 49s) Loss: 0.0000(0.0022) Grad: 31.8849  LR: 0.00000187  
Epoch: [13][200/2860] Elapsed 0m 47s (remain 10m 23s) Loss: 0.0002(0.0027) Grad: 18367.9961  LR: 0.00000182  
Epoch: [13][300/2860] Elapsed 1m 10s (remain 9m 57s) Loss: 0.0044(0.0028) Grad: 63022.2500  LR: 0.00000178  
Epoch: [13][400/2860] Elapsed 1m 33s (remain 9m 32s) Loss: 0.0000(0.0024) Grad: 20.3881  LR: 0.00000174  
Epoch: [13][500/2860] Elapsed 1m 56s (remain 9m 7s) Loss: 0.0000(0.0025) Grad: 33.3490  LR: 0.00000170  
Epoch: [13][600/2860] Elapsed 2m 19s (remain 8m 43s) Loss: 0.0000(0.0023) Grad: 34.5809  LR: 0.00000166  
Epoch: [13][700/2860] Elapsed 2m 42s (remain 8m 20s) Loss: 0.0001(0.0023) Grad: 1601.5258  LR: 0.00000162  
Epoch: [13][800/2860] Elapsed 3m 5s (remain 7m 56s) Loss: 0.0000(0.0024) Grad: 51.9570  LR: 0.00000158  
Epoch: [13][900/2860] Elapsed 3m 28s (rem

Epoch 13 - avg_train_loss: 0.0017  avg_val_loss: 0.0338  time: 729s
Epoch 13 - Score: 0.8584, Th: 0.6


Epoch: [14][0/2860] Elapsed 0m 0s (remain 19m 48s) Loss: 0.0143(0.0143) Grad: 256178.4219  LR: 0.00000086  
Epoch: [14][100/2860] Elapsed 0m 23s (remain 10m 48s) Loss: 0.0000(0.0019) Grad: 46.7364  LR: 0.00000083  
Epoch: [14][200/2860] Elapsed 0m 46s (remain 10m 21s) Loss: 0.0000(0.0016) Grad: 105.4760  LR: 0.00000081  
Epoch: [14][300/2860] Elapsed 1m 9s (remain 9m 54s) Loss: 0.0001(0.0018) Grad: 1414.2671  LR: 0.00000078  
Epoch: [14][400/2860] Elapsed 1m 33s (remain 9m 31s) Loss: 0.0000(0.0018) Grad: 41.0678  LR: 0.00000075  
Epoch: [14][500/2860] Elapsed 1m 56s (remain 9m 6s) Loss: 0.0004(0.0018) Grad: 6853.3721  LR: 0.00000072  
Epoch: [14][600/2860] Elapsed 2m 18s (remain 8m 42s) Loss: 0.0000(0.0017) Grad: 13.0553  LR: 0.00000069  
Epoch: [14][700/2860] Elapsed 2m 41s (remain 8m 18s) Loss: 0.0000(0.0018) Grad: 1.3935  LR: 0.00000067  
Epoch: [14][800/2860] Elapsed 3m 4s (remain 7m 55s) Loss: 0.0000(0.0017) Grad: 7.9249  LR: 0.00000064  
Epoch: [14][900/2860] Elapsed 3m 28s (rema

Epoch 14 - avg_train_loss: 0.0015  avg_val_loss: 0.0340  time: 728s
Epoch 14 - Score: 0.8590, Th: 0.7


Epoch: [15][0/2860] Elapsed 0m 0s (remain 19m 46s) Loss: 0.0037(0.0037) Grad: 123362.8828  LR: 0.00000022  
Epoch: [15][100/2860] Elapsed 0m 23s (remain 10m 54s) Loss: 0.0106(0.0011) Grad: 126786.3984  LR: 0.00000020  
Epoch: [15][200/2860] Elapsed 0m 46s (remain 10m 20s) Loss: 0.0000(0.0017) Grad: 20.3795  LR: 0.00000019  
Epoch: [15][300/2860] Elapsed 1m 10s (remain 9m 55s) Loss: 0.0004(0.0014) Grad: 4775.7339  LR: 0.00000018  
Epoch: [15][400/2860] Elapsed 1m 33s (remain 9m 30s) Loss: 0.0000(0.0012) Grad: 13.7246  LR: 0.00000016  
Epoch: [15][500/2860] Elapsed 1m 55s (remain 9m 5s) Loss: 0.0000(0.0013) Grad: 168.4799  LR: 0.00000015  
Epoch: [15][600/2860] Elapsed 2m 18s (remain 8m 41s) Loss: 0.0000(0.0013) Grad: 5.1989  LR: 0.00000014  
Epoch: [15][700/2860] Elapsed 2m 41s (remain 8m 17s) Loss: 0.0000(0.0013) Grad: 18.3367  LR: 0.00000012  
Epoch: [15][800/2860] Elapsed 3m 4s (remain 7m 54s) Loss: 0.0001(0.0013) Grad: 365.9675  LR: 0.00000011  
Epoch: [15][900/2860] Elapsed 3m 27s 

Epoch 15 - avg_train_loss: 0.0015  avg_val_loss: 0.0342  time: 729s
Epoch 15 - Score: 0.8591, Th: 0.6
Score: 0.8603, Th: 0.8
Some weights of the model checkpoint at microsoft/deberta-base were not used when initializing DebertaModel: ['lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.dense.bias']
- This IS expected if you are initializing DebertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Epoch: [1][0/2860] Elapsed 0m 0s (remain 19m 59s) Loss: 0.7217(0.7217) Grad: 155482.9688  LR: 0.00002000  
Epoch: [1][100/2860] Elapsed 0m 24s (remain 10m 57s) Loss: 0.1164(0.2228) Grad: 21017.1211  LR: 0.00002000  
Epoch: [1][200/2860] Elapsed 0m 47s (remain 10m 29s) Loss: 0.1585(0.1530) Grad: 29715.8145  LR: 0.00002000  
Epoch: [1][300/2860] Elapsed 1m 11s (remain 10m 6s) Loss: 0.0233(0.1288) Grad: 18960.6602  LR: 0.00002000  
Epoch: [1][400/2860] Elapsed 1m 34s (remain 9m 41s) Loss: 0.0951(0.1142) Grad: 66358.7656  LR: 0.00002000  
Epoch: [1][500/2860] Elapsed 1m 58s (remain 9m 17s) Loss: 0.0964(0.1014) Grad: 157286.3594  LR: 0.00001999  
Epoch: [1][600/2860] Elapsed 2m 21s (remain 8m 53s) Loss: 0.0315(0.0923) Grad: 46437.7148  LR: 0.00001999  
Epoch: [1][700/2860] Elapsed 2m 45s (remain 8m 29s) Loss: 0.0432(0.0850) Grad: 37874.4961  LR: 0.00001999  
Epoch: [1][800/2860] Elapsed 3m 9s (remain 8m 6s) Loss: 0.0813(0.0784) Grad: 37759.8086  LR: 0.00001998  
Epoch: [1][900/2860] Elapsed

Epoch 1 - avg_train_loss: 0.0385  avg_val_loss: 0.0165  time: 741s
Epoch 1 - Score: 0.8166, Th: 0.4
Epoch 1 - Save Best Score: 0.8166 Model


Epoch: [2][0/2860] Elapsed 0m 0s (remain 19m 54s) Loss: 0.0036(0.0036) Grad: 8373.6045  LR: 0.00001978  
Epoch: [2][100/2860] Elapsed 0m 23s (remain 10m 53s) Loss: 0.0121(0.0155) Grad: 34387.8359  LR: 0.00001977  
Epoch: [2][200/2860] Elapsed 0m 47s (remain 10m 27s) Loss: 0.0066(0.0154) Grad: 17532.2383  LR: 0.00001975  
Epoch: [2][300/2860] Elapsed 1m 10s (remain 10m 1s) Loss: 0.0144(0.0155) Grad: 24936.5117  LR: 0.00001973  
Epoch: [2][400/2860] Elapsed 1m 34s (remain 9m 36s) Loss: 0.0006(0.0150) Grad: 3978.7732  LR: 0.00001972  
Epoch: [2][500/2860] Elapsed 1m 57s (remain 9m 12s) Loss: 0.0092(0.0150) Grad: 23414.2324  LR: 0.00001970  
Epoch: [2][600/2860] Elapsed 2m 20s (remain 8m 48s) Loss: 0.0019(0.0145) Grad: 6442.1724  LR: 0.00001968  
Epoch: [2][700/2860] Elapsed 2m 44s (remain 8m 25s) Loss: 0.0110(0.0144) Grad: 114169.4375  LR: 0.00001966  
Epoch: [2][800/2860] Elapsed 3m 7s (remain 8m 2s) Loss: 0.0002(0.0148) Grad: 1898.5476  LR: 0.00001964  
Epoch: [2][900/2860] Elapsed 3m 3

Epoch 2 - avg_train_loss: 0.0148  avg_val_loss: 0.0142  time: 739s
Epoch 2 - Score: 0.8428, Th: 0.4
Epoch 2 - Save Best Score: 0.8428 Model


Epoch: [3][0/2860] Elapsed 0m 0s (remain 19m 36s) Loss: 0.0065(0.0065) Grad: 18114.9453  LR: 0.00001914  
Epoch: [3][100/2860] Elapsed 0m 23s (remain 10m 51s) Loss: 0.0131(0.0099) Grad: 34736.4570  LR: 0.00001911  
Epoch: [3][200/2860] Elapsed 0m 47s (remain 10m 22s) Loss: 0.0092(0.0112) Grad: 80949.0938  LR: 0.00001907  
Epoch: [3][300/2860] Elapsed 1m 10s (remain 9m 58s) Loss: 0.0282(0.0119) Grad: 38955.5352  LR: 0.00001904  
Epoch: [3][400/2860] Elapsed 1m 33s (remain 9m 33s) Loss: 0.0001(0.0121) Grad: 445.6589  LR: 0.00001901  
Epoch: [3][500/2860] Elapsed 1m 56s (remain 9m 10s) Loss: 0.0001(0.0121) Grad: 347.5888  LR: 0.00001898  
Epoch: [3][600/2860] Elapsed 2m 20s (remain 8m 46s) Loss: 0.0098(0.0118) Grad: 13349.7412  LR: 0.00001895  
Epoch: [3][700/2860] Elapsed 2m 43s (remain 8m 23s) Loss: 0.0001(0.0119) Grad: 1128.3588  LR: 0.00001891  
Epoch: [3][800/2860] Elapsed 3m 6s (remain 8m 0s) Loss: 0.0018(0.0120) Grad: 9321.6484  LR: 0.00001888  
Epoch: [3][900/2860] Elapsed 3m 30s 

Epoch 3 - avg_train_loss: 0.0123  avg_val_loss: 0.0140  time: 737s
Epoch 3 - Score: 0.8565, Th: 0.7
Epoch 3 - Save Best Score: 0.8565 Model


Epoch: [4][0/2860] Elapsed 0m 0s (remain 20m 0s) Loss: 0.0096(0.0096) Grad: 26995.0020  LR: 0.00001809  
Epoch: [4][100/2860] Elapsed 0m 23s (remain 10m 46s) Loss: 0.0206(0.0082) Grad: 33588.0664  LR: 0.00001805  
Epoch: [4][200/2860] Elapsed 0m 46s (remain 10m 20s) Loss: 0.0231(0.0081) Grad: 105257.6328  LR: 0.00001800  
Epoch: [4][300/2860] Elapsed 1m 10s (remain 9m 56s) Loss: 0.0001(0.0084) Grad: 265.2169  LR: 0.00001796  
Epoch: [4][400/2860] Elapsed 1m 33s (remain 9m 32s) Loss: 0.0001(0.0089) Grad: 390.7101  LR: 0.00001791  
Epoch: [4][500/2860] Elapsed 1m 56s (remain 9m 9s) Loss: 0.0000(0.0089) Grad: 1265.0457  LR: 0.00001787  
Epoch: [4][600/2860] Elapsed 2m 20s (remain 8m 46s) Loss: 0.0001(0.0090) Grad: 2921.5911  LR: 0.00001782  
Epoch: [4][700/2860] Elapsed 2m 43s (remain 8m 22s) Loss: 0.0080(0.0096) Grad: 26397.3438  LR: 0.00001778  
Epoch: [4][800/2860] Elapsed 3m 6s (remain 7m 59s) Loss: 0.0105(0.0100) Grad: 13122.4287  LR: 0.00001773  
Epoch: [4][900/2860] Elapsed 3m 29s 

Epoch 4 - avg_train_loss: 0.0101  avg_val_loss: 0.0183  time: 737s
Epoch 4 - Score: 0.8524, Th: 0.4


Epoch: [5][0/2860] Elapsed 0m 0s (remain 19m 49s) Loss: 0.0025(0.0025) Grad: 22442.5820  LR: 0.00001669  
Epoch: [5][100/2860] Elapsed 0m 23s (remain 10m 45s) Loss: 0.0344(0.0087) Grad: 23922.4316  LR: 0.00001664  
Epoch: [5][200/2860] Elapsed 0m 46s (remain 10m 20s) Loss: 0.0209(0.0082) Grad: 90719.8594  LR: 0.00001658  
Epoch: [5][300/2860] Elapsed 1m 9s (remain 9m 55s) Loss: 0.0005(0.0079) Grad: 9802.6543  LR: 0.00001653  
Epoch: [5][400/2860] Elapsed 1m 33s (remain 9m 31s) Loss: 0.0017(0.0084) Grad: 8953.6201  LR: 0.00001647  
Epoch: [5][500/2860] Elapsed 1m 56s (remain 9m 8s) Loss: 0.1394(0.0087) Grad: 720199.0625  LR: 0.00001641  
Epoch: [5][600/2860] Elapsed 2m 19s (remain 8m 45s) Loss: 0.0000(0.0086) Grad: 553.8911  LR: 0.00001636  
Epoch: [5][700/2860] Elapsed 2m 42s (remain 8m 21s) Loss: 0.0089(0.0084) Grad: 120361.0938  LR: 0.00001630  
Epoch: [5][800/2860] Elapsed 3m 6s (remain 7m 58s) Loss: 0.0000(0.0083) Grad: 43.3686  LR: 0.00001624  
Epoch: [5][900/2860] Elapsed 3m 29s 

Epoch 5 - avg_train_loss: 0.0087  avg_val_loss: 0.0201  time: 736s
Epoch 5 - Score: 0.8430, Th: 0.3


Epoch: [6][0/2860] Elapsed 0m 0s (remain 19m 40s) Loss: 0.0000(0.0000) Grad: 174.0874  LR: 0.00001500  
Epoch: [6][100/2860] Elapsed 0m 23s (remain 10m 43s) Loss: 0.0267(0.0057) Grad: 76669.7188  LR: 0.00001494  
Epoch: [6][200/2860] Elapsed 0m 46s (remain 10m 18s) Loss: 0.0128(0.0055) Grad: 36956.5859  LR: 0.00001487  
Epoch: [6][300/2860] Elapsed 1m 9s (remain 9m 54s) Loss: 0.0039(0.0073) Grad: 17892.6035  LR: 0.00001481  
Epoch: [6][400/2860] Elapsed 1m 33s (remain 9m 31s) Loss: 0.0001(0.0066) Grad: 718.8654  LR: 0.00001474  
Epoch: [6][500/2860] Elapsed 1m 56s (remain 9m 8s) Loss: 0.0008(0.0064) Grad: 11491.0420  LR: 0.00001468  
Epoch: [6][600/2860] Elapsed 2m 19s (remain 8m 44s) Loss: 0.0003(0.0067) Grad: 3955.9871  LR: 0.00001461  
Epoch: [6][700/2860] Elapsed 2m 42s (remain 8m 20s) Loss: 0.0069(0.0067) Grad: 35355.4531  LR: 0.00001455  
Epoch: [6][800/2860] Elapsed 3m 5s (remain 7m 57s) Loss: 0.0000(0.0068) Grad: 75.0188  LR: 0.00001448  
Epoch: [6][900/2860] Elapsed 3m 29s (re

Epoch 6 - avg_train_loss: 0.0072  avg_val_loss: 0.0212  time: 734s
Epoch 6 - Score: 0.8611, Th: 0.4
Epoch 6 - Save Best Score: 0.8611 Model


Epoch: [7][0/2860] Elapsed 0m 0s (remain 19m 51s) Loss: 0.0014(0.0014) Grad: 12320.5537  LR: 0.00001309  
Epoch: [7][100/2860] Elapsed 0m 23s (remain 10m 40s) Loss: 0.0001(0.0045) Grad: 2112.2808  LR: 0.00001302  
Epoch: [7][200/2860] Elapsed 0m 46s (remain 10m 16s) Loss: 0.0415(0.0043) Grad: 314604.7188  LR: 0.00001295  
Epoch: [7][300/2860] Elapsed 1m 9s (remain 9m 54s) Loss: 0.0103(0.0049) Grad: 25714.9453  LR: 0.00001288  
Epoch: [7][400/2860] Elapsed 1m 33s (remain 9m 30s) Loss: 0.0259(0.0048) Grad: 41886.7422  LR: 0.00001281  
Epoch: [7][500/2860] Elapsed 1m 56s (remain 9m 7s) Loss: 0.0000(0.0053) Grad: 23.2252  LR: 0.00001274  
Epoch: [7][600/2860] Elapsed 2m 19s (remain 8m 43s) Loss: 0.0003(0.0054) Grad: 3550.1372  LR: 0.00001267  
Epoch: [7][700/2860] Elapsed 2m 42s (remain 8m 20s) Loss: 0.0000(0.0054) Grad: 408.7957  LR: 0.00001260  
Epoch: [7][800/2860] Elapsed 3m 5s (remain 7m 57s) Loss: 0.0125(0.0055) Grad: 63202.8242  LR: 0.00001253  
Epoch: [7][900/2860] Elapsed 3m 29s (

Epoch 7 - avg_train_loss: 0.0057  avg_val_loss: 0.0241  time: 735s
Epoch 7 - Score: 0.8635, Th: 0.7
Epoch 7 - Save Best Score: 0.8635 Model


Epoch: [8][0/2860] Elapsed 0m 0s (remain 19m 54s) Loss: 0.0069(0.0069) Grad: 70446.0625  LR: 0.00001104  
Epoch: [8][100/2860] Elapsed 0m 23s (remain 10m 38s) Loss: 0.0044(0.0052) Grad: 20027.0098  LR: 0.00001097  
Epoch: [8][200/2860] Elapsed 0m 46s (remain 10m 13s) Loss: 0.0000(0.0040) Grad: 24.1573  LR: 0.00001090  
Epoch: [8][300/2860] Elapsed 1m 9s (remain 9m 52s) Loss: 0.0001(0.0037) Grad: 2039.9407  LR: 0.00001083  
Epoch: [8][400/2860] Elapsed 1m 32s (remain 9m 28s) Loss: 0.0063(0.0039) Grad: 21798.1895  LR: 0.00001075  
Epoch: [8][500/2860] Elapsed 1m 55s (remain 9m 5s) Loss: 0.0092(0.0040) Grad: 8264.6709  LR: 0.00001068  
Epoch: [8][600/2860] Elapsed 2m 18s (remain 8m 41s) Loss: 0.0000(0.0045) Grad: 40.2182  LR: 0.00001061  
Epoch: [8][700/2860] Elapsed 2m 41s (remain 8m 18s) Loss: 0.0009(0.0046) Grad: 6882.5234  LR: 0.00001053  
Epoch: [8][800/2860] Elapsed 3m 5s (remain 7m 56s) Loss: 0.0000(0.0046) Grad: 25.9545  LR: 0.00001046  
Epoch: [8][900/2860] Elapsed 3m 28s (remain

Epoch 8 - avg_train_loss: 0.0050  avg_val_loss: 0.0242  time: 733s
Epoch 8 - Score: 0.8696, Th: 0.8
Epoch 8 - Save Best Score: 0.8696 Model


Epoch: [9][0/2860] Elapsed 0m 0s (remain 20m 2s) Loss: 0.0000(0.0000) Grad: 17.4583  LR: 0.00000895  
Epoch: [9][100/2860] Elapsed 0m 23s (remain 10m 38s) Loss: 0.0000(0.0035) Grad: 18.6554  LR: 0.00000888  
Epoch: [9][200/2860] Elapsed 0m 46s (remain 10m 14s) Loss: 0.0000(0.0038) Grad: 37.8640  LR: 0.00000881  
Epoch: [9][300/2860] Elapsed 1m 9s (remain 9m 50s) Loss: 0.0000(0.0031) Grad: 5.9754  LR: 0.00000874  
Epoch: [9][400/2860] Elapsed 1m 32s (remain 9m 26s) Loss: 0.0424(0.0032) Grad: 177791.7969  LR: 0.00000866  
Epoch: [9][500/2860] Elapsed 1m 55s (remain 9m 4s) Loss: 0.0002(0.0033) Grad: 17734.5430  LR: 0.00000859  
Epoch: [9][600/2860] Elapsed 2m 18s (remain 8m 41s) Loss: 0.0000(0.0032) Grad: 1371.1324  LR: 0.00000852  
Epoch: [9][700/2860] Elapsed 2m 41s (remain 8m 18s) Loss: 0.0003(0.0032) Grad: 4557.3701  LR: 0.00000845  
Epoch: [9][800/2860] Elapsed 3m 5s (remain 7m 55s) Loss: 0.0162(0.0031) Grad: 53841.4141  LR: 0.00000837  
Epoch: [9][900/2860] Elapsed 3m 28s (remain 7m

Epoch 9 - avg_train_loss: 0.0035  avg_val_loss: 0.0252  time: 732s
Epoch 9 - Score: 0.8680, Th: 0.4


Epoch: [10][0/2860] Elapsed 0m 0s (remain 19m 19s) Loss: 0.0000(0.0000) Grad: 2.2461  LR: 0.00000691  
Epoch: [10][100/2860] Elapsed 0m 23s (remain 10m 40s) Loss: 0.0072(0.0024) Grad: 96721.8750  LR: 0.00000684  
Epoch: [10][200/2860] Elapsed 0m 46s (remain 10m 14s) Loss: 0.0000(0.0027) Grad: 235.8577  LR: 0.00000677  
Epoch: [10][300/2860] Elapsed 1m 9s (remain 9m 50s) Loss: 0.0002(0.0031) Grad: 3557.7991  LR: 0.00000670  
Epoch: [10][400/2860] Elapsed 1m 32s (remain 9m 27s) Loss: 0.0055(0.0029) Grad: 69602.3594  LR: 0.00000663  
Epoch: [10][500/2860] Elapsed 1m 55s (remain 9m 4s) Loss: 0.0000(0.0027) Grad: 44.4277  LR: 0.00000656  
Epoch: [10][600/2860] Elapsed 2m 18s (remain 8m 40s) Loss: 0.0000(0.0028) Grad: 31.8599  LR: 0.00000649  
Epoch: [10][700/2860] Elapsed 2m 41s (remain 8m 17s) Loss: 0.0000(0.0026) Grad: 13.4210  LR: 0.00000643  
Epoch: [10][800/2860] Elapsed 3m 4s (remain 7m 54s) Loss: 0.0000(0.0026) Grad: 35.9345  LR: 0.00000636  
Epoch: [10][900/2860] Elapsed 3m 27s (rem

Epoch 10 - avg_train_loss: 0.0027  avg_val_loss: 0.0299  time: 732s
Epoch 10 - Score: 0.8677, Th: 0.3


Epoch: [11][0/2860] Elapsed 0m 0s (remain 20m 29s) Loss: 0.0000(0.0000) Grad: 5.3502  LR: 0.00000500  
Epoch: [11][100/2860] Elapsed 0m 23s (remain 10m 45s) Loss: 0.0101(0.0028) Grad: 10285.0068  LR: 0.00000494  
Epoch: [11][200/2860] Elapsed 0m 46s (remain 10m 16s) Loss: 0.0001(0.0018) Grad: 5730.5464  LR: 0.00000487  
Epoch: [11][300/2860] Elapsed 1m 9s (remain 9m 51s) Loss: 0.0000(0.0019) Grad: 65.7553  LR: 0.00000481  
Epoch: [11][400/2860] Elapsed 1m 32s (remain 9m 27s) Loss: 0.0000(0.0019) Grad: 13.7274  LR: 0.00000475  
Epoch: [11][500/2860] Elapsed 1m 55s (remain 9m 3s) Loss: 0.0000(0.0019) Grad: 640.3650  LR: 0.00000469  
Epoch: [11][600/2860] Elapsed 2m 18s (remain 8m 40s) Loss: 0.0000(0.0018) Grad: 3.6943  LR: 0.00000462  
Epoch: [11][700/2860] Elapsed 2m 41s (remain 8m 17s) Loss: 0.0000(0.0019) Grad: 62.4531  LR: 0.00000456  
Epoch: [11][800/2860] Elapsed 3m 4s (remain 7m 54s) Loss: 0.0000(0.0020) Grad: 605.8786  LR: 0.00000450  
Epoch: [11][900/2860] Elapsed 3m 27s (remain

Epoch 11 - avg_train_loss: 0.0022  avg_val_loss: 0.0289  time: 731s
Epoch 11 - Score: 0.8635, Th: 0.4


Epoch: [12][0/2860] Elapsed 0m 0s (remain 19m 38s) Loss: 0.0000(0.0000) Grad: 6.6163  LR: 0.00000331  
Epoch: [12][100/2860] Elapsed 0m 23s (remain 10m 39s) Loss: 0.0000(0.0010) Grad: 187.3898  LR: 0.00000325  
Epoch: [12][200/2860] Elapsed 0m 46s (remain 10m 14s) Loss: 0.0000(0.0012) Grad: 13.9846  LR: 0.00000320  
Epoch: [12][300/2860] Elapsed 1m 9s (remain 9m 50s) Loss: 0.0000(0.0016) Grad: 17.9245  LR: 0.00000315  
Epoch: [12][400/2860] Elapsed 1m 32s (remain 9m 26s) Loss: 0.0000(0.0015) Grad: 11.6896  LR: 0.00000309  
Epoch: [12][500/2860] Elapsed 1m 55s (remain 9m 3s) Loss: 0.0000(0.0015) Grad: 5.6762  LR: 0.00000304  
Epoch: [12][600/2860] Elapsed 2m 18s (remain 8m 41s) Loss: 0.0015(0.0017) Grad: 21433.8008  LR: 0.00000299  
Epoch: [12][700/2860] Elapsed 2m 41s (remain 8m 17s) Loss: 0.0000(0.0016) Grad: 12.5887  LR: 0.00000294  
Epoch: [12][800/2860] Elapsed 3m 4s (remain 7m 54s) Loss: 0.0000(0.0016) Grad: 5.8091  LR: 0.00000288  
Epoch: [12][900/2860] Elapsed 3m 27s (remain 7m 

Epoch 12 - avg_train_loss: 0.0018  avg_val_loss: 0.0313  time: 731s
Epoch 12 - Score: 0.8684, Th: 0.7


Epoch: [13][0/2860] Elapsed 0m 0s (remain 19m 15s) Loss: 0.0000(0.0000) Grad: 11.7803  LR: 0.00000191  
Epoch: [13][100/2860] Elapsed 0m 23s (remain 10m 38s) Loss: 0.0000(0.0015) Grad: 1.8848  LR: 0.00000187  
Epoch: [13][200/2860] Elapsed 0m 46s (remain 10m 13s) Loss: 0.0000(0.0011) Grad: 5.7519  LR: 0.00000182  
Epoch: [13][300/2860] Elapsed 1m 9s (remain 9m 48s) Loss: 0.0000(0.0010) Grad: 162.9175  LR: 0.00000178  
Epoch: [13][400/2860] Elapsed 1m 32s (remain 9m 24s) Loss: 0.0000(0.0013) Grad: 6.2059  LR: 0.00000174  
Epoch: [13][500/2860] Elapsed 1m 55s (remain 9m 2s) Loss: 0.0000(0.0012) Grad: 5.7435  LR: 0.00000170  
Epoch: [13][600/2860] Elapsed 2m 18s (remain 8m 39s) Loss: 0.0000(0.0011) Grad: 118.2631  LR: 0.00000166  
Epoch: [13][700/2860] Elapsed 2m 41s (remain 8m 16s) Loss: 0.0000(0.0013) Grad: 24.5248  LR: 0.00000162  
Epoch: [13][800/2860] Elapsed 3m 4s (remain 7m 53s) Loss: 0.0000(0.0013) Grad: 8.0158  LR: 0.00000158  
Epoch: [13][900/2860] Elapsed 3m 27s (remain 7m 30s)

Epoch 13 - avg_train_loss: 0.0015  avg_val_loss: 0.0313  time: 730s
Epoch 13 - Score: 0.8679, Th: 0.5


Epoch: [14][0/2860] Elapsed 0m 0s (remain 19m 39s) Loss: 0.0000(0.0000) Grad: 22.7737  LR: 0.00000086  
Epoch: [14][100/2860] Elapsed 0m 23s (remain 10m 38s) Loss: 0.0156(0.0015) Grad: 20809.4102  LR: 0.00000083  
Epoch: [14][200/2860] Elapsed 0m 46s (remain 10m 12s) Loss: 0.0000(0.0015) Grad: 4.0594  LR: 0.00000081  
Epoch: [14][300/2860] Elapsed 1m 9s (remain 9m 47s) Loss: 0.0000(0.0013) Grad: 20.2979  LR: 0.00000078  
Epoch: [14][400/2860] Elapsed 1m 32s (remain 9m 24s) Loss: 0.0000(0.0013) Grad: 3.0234  LR: 0.00000075  
Epoch: [14][500/2860] Elapsed 1m 55s (remain 9m 2s) Loss: 0.0009(0.0013) Grad: 53748.6523  LR: 0.00000072  
Epoch: [14][600/2860] Elapsed 2m 18s (remain 8m 39s) Loss: 0.0000(0.0012) Grad: 12.9259  LR: 0.00000069  
Epoch: [14][700/2860] Elapsed 2m 41s (remain 8m 16s) Loss: 0.0000(0.0014) Grad: 4.5043  LR: 0.00000067  
Epoch: [14][800/2860] Elapsed 3m 4s (remain 7m 53s) Loss: 0.0000(0.0015) Grad: 6.3702  LR: 0.00000064  
Epoch: [14][900/2860] Elapsed 3m 27s (remain 7m

Epoch 14 - avg_train_loss: 0.0016  avg_val_loss: 0.0313  time: 727s
Epoch 14 - Score: 0.8680, Th: 0.7


Epoch: [15][0/2860] Elapsed 0m 0s (remain 19m 44s) Loss: 0.0000(0.0000) Grad: 10.3647  LR: 0.00000022  
Epoch: [15][100/2860] Elapsed 0m 23s (remain 10m 33s) Loss: 0.0180(0.0023) Grad: 119601.8828  LR: 0.00000020  
Epoch: [15][200/2860] Elapsed 0m 45s (remain 10m 7s) Loss: 0.0000(0.0015) Grad: 1.5572  LR: 0.00000019  
Epoch: [15][300/2860] Elapsed 1m 8s (remain 9m 44s) Loss: 0.0000(0.0015) Grad: 0.9534  LR: 0.00000018  
Epoch: [15][400/2860] Elapsed 1m 31s (remain 9m 20s) Loss: 0.0000(0.0014) Grad: 5.7390  LR: 0.00000016  
Epoch: [15][500/2860] Elapsed 1m 54s (remain 8m 57s) Loss: 0.0002(0.0013) Grad: 9826.6162  LR: 0.00000015  
Epoch: [15][600/2860] Elapsed 2m 16s (remain 8m 34s) Loss: 0.0081(0.0013) Grad: 19829.9434  LR: 0.00000014  
Epoch: [15][700/2860] Elapsed 2m 39s (remain 8m 11s) Loss: 0.0000(0.0013) Grad: 9.3813  LR: 0.00000012  
Epoch: [15][800/2860] Elapsed 3m 2s (remain 7m 48s) Loss: 0.0053(0.0014) Grad: 12722.1807  LR: 0.00000011  
Epoch: [15][900/2860] Elapsed 3m 24s (rem

Epoch 15 - avg_train_loss: 0.0013  avg_val_loss: 0.0312  time: 716s
Epoch 15 - Score: 0.8677, Th: 0.5
Score: 0.8696, Th: 0.8
Score: 0.8654, Th: 0.5
