# About this notebook
- This notebook is a modified version of the PyTorch pipeline from Y.Nakama's starter NLP notebook from Feedback Prize 3 competition [here](https://www.kaggle.com/code/yasufuminakama/fb3-deberta-v3-base-baseline-train). Don't forget to upvote his work!
- Inference notebook is [here](https://www.kaggle.com/mohammad2012191/debertav3-pytorch-baseline-inference-cv-0-467)

# Directory settings

In [1]:
# ====================================================
# Directory settings
# ====================================================
import os

OUTPUT_DIR = './'
if not os.path.exists(OUTPUT_DIR):
    os.makedirs(OUTPUT_DIR)

# CFG

In [2]:
# ====================================================
# CFG
# ====================================================
class CFG:
    wandb=False
    competition='FB3'
    _wandb_kernel='nakama'
    debug=False
    apex=True
    print_freq=20
    num_workers=4
    model="microsoft/deberta-v3-base"
    gradient_checkpointing=True
    scheduler='cosine' # ['linear', 'cosine']
    batch_scheduler=True
    num_cycles=0.5
    num_warmup_steps=0
    epochs=4
    encoder_lr=2e-5
    decoder_lr=2e-5
    min_lr=1e-6
    eps=1e-6
    betas=(0.9, 0.999)
    batch_size=8
    max_len=512
    weight_decay=0.01
    gradient_accumulation_steps=1
    max_grad_norm=1000
    target_cols=['content', 'wording']
    seed=42
    n_fold=4
    trn_fold=[0, 1, 2, 3]
    train=True
    
if CFG.debug:
    CFG.epochs = 2
    CFG.trn_fold = [0]

In [3]:
# ====================================================
# wandb
# ====================================================
if CFG.wandb:
    
    import wandb

    try:
        from kaggle_secrets import UserSecretsClient
        user_secrets = UserSecretsClient()
        secret_value_0 = user_secrets.get_secret("wandb_api")
        wandb.login(key=secret_value_0)
        anony = None
    except:
        anony = "must"
        print('If you want to use your W&B account, go to Add-ons -> Secrets and provide your W&B access token. Use the Label name as wandb_api. \nGet your W&B access token from here: https://wandb.ai/authorize')


    def class2dict(f):
        return dict((name, getattr(f, name)) for name in dir(f) if not name.startswith('__'))

    run = wandb.init(project='FB3-Public', 
                     name=CFG.model,
                     config=class2dict(CFG),
                     group=CFG.model,
                     job_type="train",
                     anonymous=anony)

# Library

In [4]:
# ====================================================
# Library
# ====================================================
import os
import gc
import re
import ast
import sys
import copy
import json
import time
import math
import string
import pickle
import random
import joblib
import itertools
import warnings
warnings.filterwarnings("ignore")

import scipy as sp
import numpy as np
import pandas as pd
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
from tqdm.auto import tqdm
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import StratifiedKFold, GroupKFold, KFold

os.system('pip install iterative-stratification==0.1.7')
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold

import torch
import torch.nn as nn
from torch.nn import Parameter
import torch.nn.functional as F
from torch.optim import Adam, SGD, AdamW
from torch.utils.data import DataLoader, Dataset

os.system('pip install -q transformers')
os.system('pip install -q tokenizers')
import tokenizers
import transformers
print(f"tokenizers.__version__: {tokenizers.__version__}")
print(f"transformers.__version__: {transformers.__version__}")
from transformers import AutoTokenizer, AutoModel, AutoConfig
from transformers import get_linear_schedule_with_warmup, get_cosine_schedule_with_warmup
%env TOKENIZERS_PARALLELISM=true

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

Collecting iterative-stratification==0.1.7

  Downloading iterative_stratification-0.1.7-py3-none-any.whl (8.5 kB)






Installing collected packages: iterative-stratification

Successfully installed iterative-stratification-0.1.7

tokenizers.__version__: 0.13.3

transformers.__version__: 4.30.2

env: TOKENIZERS_PARALLELISM=true


# Utils

In [5]:
# ====================================================
# Utils
# ====================================================
def MCRMSE(y_trues, y_preds):
    scores = []
    idxes = y_trues.shape[1]
    for i in range(idxes):
        y_true = y_trues[:,i]
        y_pred = y_preds[:,i]
        score = mean_squared_error(y_true, y_pred, squared=False) # RMSE
        scores.append(score)
    mcrmse_score = np.mean(scores)
    return mcrmse_score, scores


def get_score(y_trues, y_preds):
    mcrmse_score, scores = MCRMSE(y_trues, y_preds)
    return mcrmse_score, scores


def get_logger(filename=OUTPUT_DIR+'train'):
    from logging import getLogger, INFO, StreamHandler, FileHandler, Formatter
    logger = getLogger(__name__)
    logger.setLevel(INFO)
    handler1 = StreamHandler()
    handler1.setFormatter(Formatter("%(message)s"))
    handler2 = FileHandler(filename=f"{filename}.log")
    handler2.setFormatter(Formatter("%(message)s"))
    logger.addHandler(handler1)
    logger.addHandler(handler2)
    return logger

LOGGER = get_logger()


def seed_everything(seed=42):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    
seed_everything(seed=42)

# Data Loading

In [6]:
# ====================================================
# Data Loading
# ====================================================
train = pd.read_csv('../input/commonlit-evaluate-student-summaries/summaries_train.csv')
test = pd.read_csv('../input/commonlit-evaluate-student-summaries/summaries_test.csv')
submission = pd.read_csv('/kaggle/input/commonlit-evaluate-student-summaries/sample_submission.csv')

print(f"train.shape: {train.shape}")
display(train.head())
print(f"test.shape: {test.shape}")
display(test.head())
print(f"submission.shape: {submission.shape}")
display(submission.head())

train.shape: (7165, 5)


Unnamed: 0,student_id,prompt_id,text,content,wording
0,000e8c3c7ddb,814d6b,The third wave was an experimentto see how peo...,0.205683,0.380538
1,0020ae56ffbf,ebad26,They would rub it up with soda to make the sme...,-0.548304,0.506755
2,004e978e639e,3b9047,"In Egypt, there were many occupations and soci...",3.128928,4.231226
3,005ab0199905,3b9047,The highest class was Pharaohs these people we...,-0.210614,-0.471415
4,0070c9e7af47,814d6b,The Third Wave developed rapidly because the ...,3.272894,3.219757


test.shape: (4, 3)


Unnamed: 0,student_id,prompt_id,text
0,000000ffffff,abc123,Example text 1
1,111111eeeeee,def789,Example text 2
2,222222cccccc,abc123,Example text 3
3,333333dddddd,def789,Example text 4


submission.shape: (4, 3)


Unnamed: 0,student_id,content,wording
0,000000ffffff,0.0,0.0
1,111111eeeeee,0.0,0.0
2,222222cccccc,0.0,0.0
3,333333dddddd,0.0,0.0


# CV split

In [7]:
# ====================================================
# CV split
# ====================================================
Fold = MultilabelStratifiedKFold(n_splits=CFG.n_fold, shuffle=True, random_state=CFG.seed)
for n, (train_index, val_index) in enumerate(Fold.split(train, train[CFG.target_cols])):
    train.loc[val_index, 'fold'] = int(n)
train['fold'] = train['fold'].astype(int)
display(train.groupby('fold').size())

fold
0    1791
1    1791
2    1792
3    1791
dtype: int64

In [8]:
if CFG.debug:
    display(train.groupby('fold').size())
    train = train.sample(n=1000, random_state=0).reset_index(drop=True)
    display(train.groupby('fold').size())

# tokenizer

In [9]:
# ====================================================
# tokenizer
# ====================================================
tokenizer = AutoTokenizer.from_pretrained(CFG.model)
tokenizer.save_pretrained(OUTPUT_DIR+'tokenizer/')
CFG.tokenizer = tokenizer

Downloading (…)okenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/579 [00:00<?, ?B/s]

Downloading spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


# Dataset

In [10]:
# ====================================================
# Define max_len
# ====================================================
lengths = []
tk0 = tqdm(train['text'].fillna("").values, total=len(train))
for text in tk0:
    length = len(tokenizer(text, add_special_tokens=False)['input_ids'])
    lengths.append(length)
CFG.max_len = max(lengths) + 2 # cls & sep
LOGGER.info(f"max_len: {CFG.max_len}")

  0%|          | 0/7165 [00:00<?, ?it/s]

max_len: 822


In [11]:
# ====================================================
# Dataset
# ====================================================
def prepare_input(cfg, text):
    inputs = cfg.tokenizer.encode_plus(
        text, 
        return_tensors=None, 
        add_special_tokens=True, 
        max_length=CFG.max_len,
        pad_to_max_length=True,
        truncation=True
    )
    for k, v in inputs.items():
        inputs[k] = torch.tensor(v, dtype=torch.long)
    return inputs


class TrainDataset(Dataset):
    def __init__(self, cfg, df):
        self.cfg = cfg
        self.texts = df['text'].values
        self.labels = df[cfg.target_cols].values

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, item):
        inputs = prepare_input(self.cfg, self.texts[item])
        label = torch.tensor(self.labels[item], dtype=torch.float)
        return inputs, label
    

def collate(inputs):
    mask_len = int(inputs["attention_mask"].sum(axis=1).max())
    for k, v in inputs.items():
        inputs[k] = inputs[k][:,:mask_len]
    return inputs

# Model

In [12]:
# ====================================================
# Model
# ====================================================
class MeanPooling(nn.Module):
    def __init__(self):
        super(MeanPooling, self).__init__()
        
    def forward(self, last_hidden_state, attention_mask):
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
        sum_embeddings = torch.sum(last_hidden_state * input_mask_expanded, 1)
        sum_mask = input_mask_expanded.sum(1)
        sum_mask = torch.clamp(sum_mask, min=1e-9)
        mean_embeddings = sum_embeddings / sum_mask
        return mean_embeddings
    

class CustomModel(nn.Module):
    def __init__(self, cfg, config_path=None, pretrained=False):
        super().__init__()
        self.cfg = cfg
        if config_path is None:
            self.config = AutoConfig.from_pretrained(cfg.model, output_hidden_states=True)
            self.config.hidden_dropout = 0.
            self.config.hidden_dropout_prob = 0.
            self.config.attention_dropout = 0.
            self.config.attention_probs_dropout_prob = 0.
            LOGGER.info(self.config)
        else:
            self.config = torch.load(config_path)
        if pretrained:
            self.model = AutoModel.from_pretrained(cfg.model, config=self.config)
        else:
            self.model = AutoModel(self.config)
        if self.cfg.gradient_checkpointing:
            self.model.gradient_checkpointing_enable()
        self.pool = MeanPooling()
        self.fc = nn.Linear(self.config.hidden_size, 2)
        self._init_weights(self.fc)
        
    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
        
    def feature(self, inputs):
        outputs = self.model(**inputs)
        last_hidden_states = outputs[0]
        feature = self.pool(last_hidden_states, inputs['attention_mask'])
        return feature

    def forward(self, inputs):
        feature = self.feature(inputs)
        output = self.fc(feature)
        return output

# Loss

In [13]:
# ====================================================
# Loss
# ====================================================
class RMSELoss(nn.Module):
    def __init__(self, reduction='mean', eps=1e-9):
        super().__init__()
        self.mse = nn.MSELoss(reduction='none')
        self.reduction = reduction
        self.eps = eps

    def forward(self, y_pred, y_true):
        loss = torch.sqrt(self.mse(y_pred, y_true) + self.eps)
        if self.reduction == 'none':
            loss = loss
        elif self.reduction == 'sum':
            loss = loss.sum()
        elif self.reduction == 'mean':
            loss = loss.mean()
        return loss

# Helpler functions

In [14]:
# ====================================================
# Helper functions
# ====================================================
class AverageMeter(object):
    """Computes and stores the average and current value"""
    def __init__(self):
        self.reset()

    def reset(self):
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count


def asMinutes(s):
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)


def timeSince(since, percent):
    now = time.time()
    s = now - since
    es = s / (percent)
    rs = es - s
    return '%s (remain %s)' % (asMinutes(s), asMinutes(rs))


def train_fn(fold, train_loader, model, criterion, optimizer, epoch, scheduler, device):
    model.train()
    scaler = torch.cuda.amp.GradScaler(enabled=CFG.apex)
    losses = AverageMeter()
    start = end = time.time()
    global_step = 0
    for step, (inputs, labels) in enumerate(train_loader):
        inputs = collate(inputs)
        for k, v in inputs.items():
            inputs[k] = v.to(device)
        labels = labels.to(device)
        batch_size = labels.size(0)
        with torch.cuda.amp.autocast(enabled=CFG.apex):
            y_preds = model(inputs)
            loss = criterion(y_preds, labels)
        if CFG.gradient_accumulation_steps > 1:
            loss = loss / CFG.gradient_accumulation_steps
        losses.update(loss.item(), batch_size)
        scaler.scale(loss).backward()
        grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), CFG.max_grad_norm)
        if (step + 1) % CFG.gradient_accumulation_steps == 0:
            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad()
            global_step += 1
            if CFG.batch_scheduler:
                scheduler.step()
        end = time.time()
        if step % CFG.print_freq == 0 or step == (len(train_loader)-1):
            print('Epoch: [{0}][{1}/{2}] '
                  'Elapsed {remain:s} '
                  'Loss: {loss.val:.4f}({loss.avg:.4f}) '
                  'Grad: {grad_norm:.4f}  '
                  'LR: {lr:.8f}  '
                  .format(epoch+1, step, len(train_loader), 
                          remain=timeSince(start, float(step+1)/len(train_loader)),
                          loss=losses,
                          grad_norm=grad_norm,
                          lr=scheduler.get_lr()[0]))
        if CFG.wandb:
            wandb.log({f"[fold{fold}] loss": losses.val,
                       f"[fold{fold}] lr": scheduler.get_lr()[0]})
    return losses.avg


def valid_fn(valid_loader, model, criterion, device):
    losses = AverageMeter()
    model.eval()
    preds = []
    start = end = time.time()
    for step, (inputs, labels) in enumerate(valid_loader):
        inputs = collate(inputs)
        for k, v in inputs.items():
            inputs[k] = v.to(device)
        labels = labels.to(device)
        batch_size = labels.size(0)
        with torch.no_grad():
            y_preds = model(inputs)
            loss = criterion(y_preds, labels)
        if CFG.gradient_accumulation_steps > 1:
            loss = loss / CFG.gradient_accumulation_steps
        losses.update(loss.item(), batch_size)
        preds.append(y_preds.to('cpu').numpy())
        end = time.time()
        if step % CFG.print_freq == 0 or step == (len(valid_loader)-1):
            print('EVAL: [{0}/{1}] '
                  'Elapsed {remain:s} '
                  'Loss: {loss.val:.4f}({loss.avg:.4f}) '
                  .format(step, len(valid_loader),
                          loss=losses,
                          remain=timeSince(start, float(step+1)/len(valid_loader))))
    predictions = np.concatenate(preds)
    return losses.avg, predictions

# train loop

In [15]:
# ====================================================
# train loop
# ====================================================
def train_loop(folds, fold):
    
    LOGGER.info(f"========== fold: {fold} training ==========")

    # ====================================================
    # loader
    # ====================================================
    train_folds = folds[folds['fold'] != fold].reset_index(drop=True)
    valid_folds = folds[folds['fold'] == fold].reset_index(drop=True)
    valid_labels = valid_folds[CFG.target_cols].values
    
    train_dataset = TrainDataset(CFG, train_folds)
    valid_dataset = TrainDataset(CFG, valid_folds)

    train_loader = DataLoader(train_dataset,
                              batch_size=CFG.batch_size,
                              shuffle=True,
                              num_workers=CFG.num_workers, pin_memory=True, drop_last=True)
    valid_loader = DataLoader(valid_dataset,
                              batch_size=CFG.batch_size * 2,
                              shuffle=False,
                              num_workers=CFG.num_workers, pin_memory=True, drop_last=False)

    # ====================================================
    # model & optimizer
    # ====================================================
    model = CustomModel(CFG, config_path=None, pretrained=True)
    torch.save(model.config, OUTPUT_DIR+'config.pth')
    model.to(device)
    
    def get_optimizer_params(model, encoder_lr, decoder_lr, weight_decay=0.0):
        param_optimizer = list(model.named_parameters())
        no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
        optimizer_parameters = [
            {'params': [p for n, p in model.model.named_parameters() if not any(nd in n for nd in no_decay)],
             'lr': encoder_lr, 'weight_decay': weight_decay},
            {'params': [p for n, p in model.model.named_parameters() if any(nd in n for nd in no_decay)],
             'lr': encoder_lr, 'weight_decay': 0.0},
            {'params': [p for n, p in model.named_parameters() if "model" not in n],
             'lr': decoder_lr, 'weight_decay': 0.0}
        ]
        return optimizer_parameters

    optimizer_parameters = get_optimizer_params(model,
                                                encoder_lr=CFG.encoder_lr, 
                                                decoder_lr=CFG.decoder_lr,
                                                weight_decay=CFG.weight_decay)
    optimizer = AdamW(optimizer_parameters, lr=CFG.encoder_lr, eps=CFG.eps, betas=CFG.betas)
    
    # ====================================================
    # scheduler
    # ====================================================
    def get_scheduler(cfg, optimizer, num_train_steps):
        if cfg.scheduler == 'linear':
            scheduler = get_linear_schedule_with_warmup(
                optimizer, num_warmup_steps=cfg.num_warmup_steps, num_training_steps=num_train_steps
            )
        elif cfg.scheduler == 'cosine':
            scheduler = get_cosine_schedule_with_warmup(
                optimizer, num_warmup_steps=cfg.num_warmup_steps, num_training_steps=num_train_steps, num_cycles=cfg.num_cycles
            )
        return scheduler
    
    num_train_steps = int(len(train_folds) / CFG.batch_size * CFG.epochs)
    scheduler = get_scheduler(CFG, optimizer, num_train_steps)

    # ====================================================
    # loop
    # ====================================================
    criterion = nn.SmoothL1Loss(reduction='mean') # RMSELoss(reduction="mean")
    
    best_score = np.inf

    for epoch in range(CFG.epochs):

        start_time = time.time()

        # train
        avg_loss = train_fn(fold, train_loader, model, criterion, optimizer, epoch, scheduler, device)

        # eval
        avg_val_loss, predictions = valid_fn(valid_loader, model, criterion, device)
        
        # scoring
        score, scores = get_score(valid_labels, predictions)

        elapsed = time.time() - start_time

        LOGGER.info(f'Epoch {epoch+1} - avg_train_loss: {avg_loss:.4f}  avg_val_loss: {avg_val_loss:.4f}  time: {elapsed:.0f}s')
        LOGGER.info(f'Epoch {epoch+1} - Score: {score:.4f}  Scores: {scores}')
        if CFG.wandb:
            wandb.log({f"[fold{fold}] epoch": epoch+1, 
                       f"[fold{fold}] avg_train_loss": avg_loss, 
                       f"[fold{fold}] avg_val_loss": avg_val_loss,
                       f"[fold{fold}] score": score})
        
        if best_score > score:
            best_score = score
            LOGGER.info(f'Epoch {epoch+1} - Save Best Score: {best_score:.4f} Model')
            torch.save({'model': model.state_dict(),
                        'predictions': predictions},
                        OUTPUT_DIR+f"{CFG.model.replace('/', '-')}_fold{fold}_best.pth")

    predictions = torch.load(OUTPUT_DIR+f"{CFG.model.replace('/', '-')}_fold{fold}_best.pth", 
                             map_location=torch.device('cpu'))['predictions']
    valid_folds[[f"pred_{c}" for c in CFG.target_cols]] = predictions

    torch.cuda.empty_cache()
    gc.collect()
    
    return valid_folds

In [16]:
if __name__ == '__main__':
    
    def get_result(oof_df):
        labels = oof_df[CFG.target_cols].values
        preds = oof_df[[f"pred_{c}" for c in CFG.target_cols]].values
        score, scores = get_score(labels, preds)
        LOGGER.info(f'Score: {score:<.4f}  Scores: {scores}')
    
    if CFG.train:
        oof_df = pd.DataFrame()
        for fold in range(CFG.n_fold):
            if fold in CFG.trn_fold:
                _oof_df = train_loop(train, fold)
                oof_df = pd.concat([oof_df, _oof_df])
                LOGGER.info(f"========== fold: {fold} result ==========")
                get_result(_oof_df)
        oof_df = oof_df.reset_index(drop=True)
        LOGGER.info(f"========== CV ==========")
        get_result(oof_df)
        oof_df.to_pickle(OUTPUT_DIR+'oof_df.pkl')
        
    if CFG.wandb:
        wandb.finish()


DebertaV2Config {

  "_name_or_path": "microsoft/deberta-v3-base",

  "attention_dropout": 0.0,

  "attention_probs_dropout_prob": 0.0,

  "hidden_act": "gelu",

  "hidden_dropout": 0.0,

  "hidden_dropout_prob": 0.0,

  "hidden_size": 768,

  "initializer_range": 0.02,

  "intermediate_size": 3072,

  "layer_norm_eps": 1e-07,

  "max_position_embeddings": 512,

  "max_relative_positions": -1,

  "model_type": "deberta-v2",

  "norm_rel_ebd": "layer_norm",

  "num_attention_heads": 12,

  "num_hidden_layers": 12,

  "output_hidden_states": true,

  "pad_token_id": 0,

  "pooler_dropout": 0,

  "pooler_hidden_act": "gelu",

  "pooler_hidden_size": 768,

  "pos_att_type": [

    "p2c",

    "c2p"

  ],

  "position_biased_input": false,

  "position_buckets": 256,

  "relative_attention": true,

  "share_att_key": true,

  "transformers_version": "4.30.2",

  "type_vocab_size": 0,

  "vocab_size": 128100

}




Downloading pytorch_model.bin:   0%|          | 0.00/371M [00:00<?, ?B/s]

Some weights of the model checkpoint at microsoft/deberta-v3-base were not used when initializing DebertaV2Model: ['lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'mask_predictions.LayerNorm.bias', 'mask_predictions.classifier.weight', 'mask_predictions.classifier.bias', 'lm_predictions.lm_head.dense.weight', 'mask_predictions.dense.bias', 'mask_predictions.dense.weight', 'lm_predictions.lm_head.LayerNorm.bias', 'mask_predictions.LayerNorm.weight']

- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).

- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Epoch: [1][0/671] Elapsed 0m 2s (remain 26m 47s) Loss: 0.9646(0.9646) Grad: inf  LR: 0.00002000  

Epoch: [1][20/671] Elapsed 0m 8s (remain 4m 23s) Loss: 0.7998(0.5248) Grad: 28478.7754  LR: 0.00002000  

Epoch: [1][40/671] Elapsed 0m 14s (remain 3m 45s) Loss: 0.2424(0.4071) Grad: 26163.5176  LR: 0.00001999  

Epoch: [1][60/671] Elapsed 0m 20s (remain 3m 27s) Loss: 0.2787(0.3422) Grad: 30125.7031  LR: 0.00001997  

Epoch: [1][80/671] Elapsed 0m 28s (remain 3m 25s) Loss: 0.2080(0.3032) Grad: 30702.1699  LR: 0.00001996  

Epoch: [1][100/671] Elapsed 0m 35s (remain 3m 18s) Loss: 0.1903(0.2720) Grad: 30658.7305  LR: 0.00001993  

Epoch: [1][120/671] Elapsed 0m 41s (remain 3m 6s) Loss: 0.1274(0.2548) Grad: 12393.4619  LR: 0.00001990  

Epoch: [1][140/671] Elapsed 0m 46s (remain 2m 55s) Loss: 0.2329(0.2422) Grad: 41183.6797  LR: 0.00001986  

Epoch: [1][160/671] Elapsed 0m 53s (remain 2m 48s) Loss: 0.0709(0.2325) Grad: 9546.2188  LR: 0.00001982  

Epoch: [1][180/671] Elapsed 0m 59s (remain 2

Epoch 1 - avg_train_loss: 0.1678  avg_val_loss: 0.1190  time: 240s

Epoch 1 - Score: 0.4971  Scores: [0.434792971075031, 0.5593829924712586]

Epoch 1 - Save Best Score: 0.4971 Model


EVAL: [111/112] Elapsed 0m 19s (remain 0m 0s) Loss: 0.0866(0.1190) 

Epoch: [2][0/671] Elapsed 0m 0s (remain 6m 22s) Loss: 0.0417(0.0417) Grad: inf  LR: 0.00001707  

Epoch: [2][20/671] Elapsed 0m 7s (remain 3m 42s) Loss: 0.0600(0.0982) Grad: 29621.4844  LR: 0.00001690  

Epoch: [2][40/671] Elapsed 0m 13s (remain 3m 21s) Loss: 0.0369(0.0898) Grad: 19586.9434  LR: 0.00001673  

Epoch: [2][60/671] Elapsed 0m 19s (remain 3m 11s) Loss: 0.0643(0.0868) Grad: 31344.2812  LR: 0.00001656  

Epoch: [2][80/671] Elapsed 0m 25s (remain 3m 6s) Loss: 0.1410(0.0883) Grad: 55203.1875  LR: 0.00001638  

Epoch: [2][100/671] Elapsed 0m 32s (remain 3m 1s) Loss: 0.1262(0.0878) Grad: 31514.5215  LR: 0.00001620  

Epoch: [2][120/671] Elapsed 0m 38s (remain 2m 56s) Loss: 0.0645(0.0897) Grad: 22231.8867  LR: 0.00001601  

Epoch: [2][140/671] Elapsed 0m 45s (remain 2m 51s) Loss: 0.2540(0.0916) Grad: 44957.6992  LR: 0.00001582  

Epoch: [2][160/671] Elapsed 0m 53s (remain 2m 48s) Loss: 0.0924(0.0922) Grad: 36994.

Epoch 2 - avg_train_loss: 0.0892  avg_val_loss: 0.1140  time: 239s

Epoch 2 - Score: 0.4824  Scores: [0.41742258865130405, 0.5474339073594124]

Epoch 2 - Save Best Score: 0.4824 Model


EVAL: [111/112] Elapsed 0m 19s (remain 0m 0s) Loss: 0.0640(0.1140) 

Epoch: [3][0/671] Elapsed 0m 0s (remain 10m 4s) Loss: 0.1634(0.1634) Grad: inf  LR: 0.00001001  

Epoch: [3][20/671] Elapsed 0m 7s (remain 3m 46s) Loss: 0.0455(0.0645) Grad: 54268.9961  LR: 0.00000977  

Epoch: [3][40/671] Elapsed 0m 12s (remain 3m 15s) Loss: 0.0349(0.0579) Grad: 25245.5430  LR: 0.00000954  

Epoch: [3][60/671] Elapsed 0m 19s (remain 3m 19s) Loss: 0.0735(0.0610) Grad: 39862.5742  LR: 0.00000930  

Epoch: [3][80/671] Elapsed 0m 25s (remain 3m 8s) Loss: 0.0419(0.0610) Grad: 53565.8984  LR: 0.00000907  

Epoch: [3][100/671] Elapsed 0m 32s (remain 3m 4s) Loss: 0.0460(0.0617) Grad: 42450.5000  LR: 0.00000884  

Epoch: [3][120/671] Elapsed 0m 38s (remain 2m 55s) Loss: 0.1108(0.0632) Grad: 56406.2656  LR: 0.00000861  

Epoch: [3][140/671] Elapsed 0m 45s (remain 2m 49s) Loss: 0.0758(0.0654) Grad: 49386.3594  LR: 0.00000838  

Epoch: [3][160/671] Elapsed 0m 51s (remain 2m 43s) Loss: 0.0246(0.0645) Grad: 20020.

Epoch 3 - avg_train_loss: 0.0595  avg_val_loss: 0.1057  time: 237s

Epoch 3 - Score: 0.4624  Scores: [0.3935299661762431, 0.531308450676688]

Epoch 3 - Save Best Score: 0.4624 Model


EVAL: [111/112] Elapsed 0m 19s (remain 0m 0s) Loss: 0.0697(0.1057) 

Epoch: [4][0/671] Elapsed 0m 0s (remain 4m 51s) Loss: 0.0252(0.0252) Grad: 142049.8438  LR: 0.00000294  

Epoch: [4][20/671] Elapsed 0m 6s (remain 3m 22s) Loss: 0.0447(0.0446) Grad: 39098.4492  LR: 0.00000278  

Epoch: [4][40/671] Elapsed 0m 12s (remain 3m 8s) Loss: 0.0436(0.0454) Grad: 42670.3906  LR: 0.00000262  

Epoch: [4][60/671] Elapsed 0m 18s (remain 3m 6s) Loss: 0.0422(0.0443) Grad: 54295.9375  LR: 0.00000246  

Epoch: [4][80/671] Elapsed 0m 26s (remain 3m 11s) Loss: 0.0558(0.0476) Grad: 40335.4180  LR: 0.00000231  

Epoch: [4][100/671] Elapsed 0m 33s (remain 3m 9s) Loss: 0.0428(0.0486) Grad: 55368.7305  LR: 0.00000216  

Epoch: [4][120/671] Elapsed 0m 39s (remain 3m 0s) Loss: 0.0745(0.0481) Grad: 56571.7188  LR: 0.00000202  

Epoch: [4][140/671] Elapsed 0m 46s (remain 2m 54s) Loss: 0.0464(0.0480) Grad: 31220.4102  LR: 0.00000188  

Epoch: [4][160/671] Elapsed 0m 52s (remain 2m 44s) Loss: 0.0371(0.0480) Grad: 

Epoch 4 - avg_train_loss: 0.0466  avg_val_loss: 0.1059  time: 238s

Epoch 4 - Score: 0.4623  Scores: [0.39076072702467185, 0.5337408548430956]

Epoch 4 - Save Best Score: 0.4623 Model


EVAL: [111/112] Elapsed 0m 19s (remain 0m 0s) Loss: 0.0766(0.1059) 



Score: 0.4623  Scores: [0.39076072702467185, 0.5337408548430956]


DebertaV2Config {

  "_name_or_path": "microsoft/deberta-v3-base",

  "attention_dropout": 0.0,

  "attention_probs_dropout_prob": 0.0,

  "hidden_act": "gelu",

  "hidden_dropout": 0.0,

  "hidden_dropout_prob": 0.0,

  "hidden_size": 768,

  "initializer_range": 0.02,

  "intermediate_size": 3072,

  "layer_norm_eps": 1e-07,

  "max_position_embeddings": 512,

  "max_relative_positions": -1,

  "model_type": "deberta-v2",

  "norm_rel_ebd": "layer_norm",

  "num_attention_heads": 12,

  "num_hidden_layers": 12,

  "output_hidden_states": true,

  "pad_token_id": 0,

  "pooler_dropout": 0,

  "pooler_hidden_act": "gelu",

  "pooler_hidden_size": 768,

  "pos_att_type": [

    "p2c",

    "c2p"

  ],

  "position_biased_input": false,

  "position_buckets": 256,

  "relative_attention": true,

  "share_att_key": true,

  "transformers_version": "4.30.2",

  "type_vocab_size": 0,

  "vocab_size": 128100

}



Some weigh

Epoch: [1][0/671] Elapsed 0m 0s (remain 5m 3s) Loss: 0.3736(0.3736) Grad: inf  LR: 0.00002000  

Epoch: [1][20/671] Elapsed 0m 7s (remain 3m 44s) Loss: 0.2846(0.3195) Grad: 105162.5156  LR: 0.00002000  

Epoch: [1][40/671] Elapsed 0m 13s (remain 3m 30s) Loss: 0.2152(0.2704) Grad: 20428.9180  LR: 0.00001999  

Epoch: [1][60/671] Elapsed 0m 20s (remain 3m 28s) Loss: 0.2072(0.2503) Grad: 33920.7930  LR: 0.00001997  

Epoch: [1][80/671] Elapsed 0m 26s (remain 3m 16s) Loss: 0.2529(0.2445) Grad: 33269.1680  LR: 0.00001996  

Epoch: [1][100/671] Elapsed 0m 33s (remain 3m 8s) Loss: 0.0958(0.2258) Grad: 60973.0508  LR: 0.00001993  

Epoch: [1][120/671] Elapsed 0m 39s (remain 2m 57s) Loss: 0.3469(0.2235) Grad: 105130.7188  LR: 0.00001990  

Epoch: [1][140/671] Elapsed 0m 45s (remain 2m 50s) Loss: 0.0516(0.2140) Grad: 30114.9688  LR: 0.00001986  

Epoch: [1][160/671] Elapsed 0m 51s (remain 2m 42s) Loss: 0.1913(0.2071) Grad: 109001.5312  LR: 0.00001982  

Epoch: [1][180/671] Elapsed 0m 57s (remain

Epoch 1 - avg_train_loss: 0.1575  avg_val_loss: 0.1239  time: 237s

Epoch 1 - Score: 0.5013  Scores: [0.43166249366631043, 0.570917939046402]

Epoch 1 - Save Best Score: 0.5013 Model


EVAL: [111/112] Elapsed 0m 19s (remain 0m 0s) Loss: 0.0882(0.1239) 

Epoch: [2][0/671] Elapsed 0m 0s (remain 7m 48s) Loss: 0.1656(0.1656) Grad: inf  LR: 0.00001707  

Epoch: [2][20/671] Elapsed 0m 7s (remain 3m 45s) Loss: 0.0863(0.1051) Grad: 102126.2266  LR: 0.00001690  

Epoch: [2][40/671] Elapsed 0m 15s (remain 4m 0s) Loss: 0.0575(0.0938) Grad: 44754.5508  LR: 0.00001673  

Epoch: [2][60/671] Elapsed 0m 22s (remain 3m 47s) Loss: 0.1011(0.0914) Grad: 51946.6992  LR: 0.00001656  

Epoch: [2][80/671] Elapsed 0m 28s (remain 3m 27s) Loss: 0.0573(0.0931) Grad: 46262.1328  LR: 0.00001638  

Epoch: [2][100/671] Elapsed 0m 34s (remain 3m 16s) Loss: 0.0417(0.0920) Grad: 21955.0703  LR: 0.00001620  

Epoch: [2][120/671] Elapsed 0m 40s (remain 3m 4s) Loss: 0.1000(0.0921) Grad: 33674.2422  LR: 0.00001601  

Epoch: [2][140/671] Elapsed 0m 46s (remain 2m 54s) Loss: 0.0553(0.0930) Grad: 16432.9316  LR: 0.00001582  

Epoch: [2][160/671] Elapsed 0m 53s (remain 2m 48s) Loss: 0.0671(0.0922) Grad: 16763

Epoch 2 - avg_train_loss: 0.0988  avg_val_loss: 0.1065  time: 237s

Epoch 2 - Score: 0.4612  Scores: [0.3915531404985714, 0.5308464326548927]

Epoch 2 - Save Best Score: 0.4612 Model


EVAL: [111/112] Elapsed 0m 19s (remain 0m 0s) Loss: 0.1183(0.1065) 

Epoch: [3][0/671] Elapsed 0m 0s (remain 4m 42s) Loss: 0.0707(0.0707) Grad: inf  LR: 0.00001001  

Epoch: [3][20/671] Elapsed 0m 7s (remain 3m 41s) Loss: 0.1105(0.0723) Grad: 54441.4062  LR: 0.00000977  

Epoch: [3][40/671] Elapsed 0m 15s (remain 3m 51s) Loss: 0.0377(0.0703) Grad: 54212.7969  LR: 0.00000954  

Epoch: [3][60/671] Elapsed 0m 21s (remain 3m 31s) Loss: 0.0927(0.0698) Grad: 84356.9453  LR: 0.00000930  

Epoch: [3][80/671] Elapsed 0m 28s (remain 3m 29s) Loss: 0.0635(0.0684) Grad: 30714.3145  LR: 0.00000907  

Epoch: [3][100/671] Elapsed 0m 34s (remain 3m 14s) Loss: 0.1304(0.0682) Grad: 45489.7852  LR: 0.00000884  

Epoch: [3][120/671] Elapsed 0m 40s (remain 3m 5s) Loss: 0.0542(0.0713) Grad: 60328.2070  LR: 0.00000861  

Epoch: [3][140/671] Elapsed 0m 47s (remain 2m 58s) Loss: 0.0655(0.0693) Grad: 75994.6250  LR: 0.00000838  

Epoch: [3][160/671] Elapsed 0m 54s (remain 2m 51s) Loss: 0.0791(0.0699) Grad: 72124

Epoch 3 - avg_train_loss: 0.0649  avg_val_loss: 0.1087  time: 239s

Epoch 3 - Score: 0.4667  Scores: [0.39778638570107927, 0.5356141443866703]


EVAL: [111/112] Elapsed 0m 19s (remain 0m 0s) Loss: 0.1208(0.1087) 

Epoch: [4][0/671] Elapsed 0m 0s (remain 5m 1s) Loss: 0.0442(0.0442) Grad: 155564.0469  LR: 0.00000294  

Epoch: [4][20/671] Elapsed 0m 6s (remain 3m 31s) Loss: 0.0399(0.0493) Grad: 27813.7480  LR: 0.00000278  

Epoch: [4][40/671] Elapsed 0m 13s (remain 3m 33s) Loss: 0.0749(0.0525) Grad: 37908.5430  LR: 0.00000262  

Epoch: [4][60/671] Elapsed 0m 20s (remain 3m 21s) Loss: 0.0278(0.0495) Grad: 53335.8125  LR: 0.00000246  

Epoch: [4][80/671] Elapsed 0m 26s (remain 3m 11s) Loss: 0.0444(0.0485) Grad: 28541.8750  LR: 0.00000231  

Epoch: [4][100/671] Elapsed 0m 33s (remain 3m 6s) Loss: 0.0223(0.0469) Grad: 44563.9805  LR: 0.00000216  

Epoch: [4][120/671] Elapsed 0m 38s (remain 2m 54s) Loss: 0.1206(0.0463) Grad: 39751.7969  LR: 0.00000202  

Epoch: [4][140/671] Elapsed 0m 46s (remain 2m 53s) Loss: 0.0434(0.0471) Grad: 44738.1641  LR: 0.00000188  

Epoch: [4][160/671] Elapsed 0m 52s (remain 2m 46s) Loss: 0.0490(0.0456) Grad

Epoch 4 - avg_train_loss: 0.0441  avg_val_loss: 0.1065  time: 240s

Epoch 4 - Score: 0.4618  Scores: [0.39123384322580024, 0.5323950354920308]


EVAL: [111/112] Elapsed 0m 19s (remain 0m 0s) Loss: 0.1185(0.1065) 



Score: 0.4612  Scores: [0.3915531404985714, 0.5308464326548927]


DebertaV2Config {

  "_name_or_path": "microsoft/deberta-v3-base",

  "attention_dropout": 0.0,

  "attention_probs_dropout_prob": 0.0,

  "hidden_act": "gelu",

  "hidden_dropout": 0.0,

  "hidden_dropout_prob": 0.0,

  "hidden_size": 768,

  "initializer_range": 0.02,

  "intermediate_size": 3072,

  "layer_norm_eps": 1e-07,

  "max_position_embeddings": 512,

  "max_relative_positions": -1,

  "model_type": "deberta-v2",

  "norm_rel_ebd": "layer_norm",

  "num_attention_heads": 12,

  "num_hidden_layers": 12,

  "output_hidden_states": true,

  "pad_token_id": 0,

  "pooler_dropout": 0,

  "pooler_hidden_act": "gelu",

  "pooler_hidden_size": 768,

  "pos_att_type": [

    "p2c",

    "c2p"

  ],

  "position_biased_input": false,

  "position_buckets": 256,

  "relative_attention": true,

  "share_att_key": true,

  "transformers_version": "4.30.2",

  "type_vocab_size": 0,

  "vocab_size": 128100

}



Some weight

Epoch: [1][0/671] Elapsed 0m 0s (remain 6m 2s) Loss: 0.6931(0.6931) Grad: inf  LR: 0.00002000  

Epoch: [1][20/671] Elapsed 0m 6s (remain 3m 19s) Loss: 0.2606(0.3963) Grad: 34165.6211  LR: 0.00002000  

Epoch: [1][40/671] Elapsed 0m 13s (remain 3m 30s) Loss: 0.1662(0.3288) Grad: 56741.6055  LR: 0.00001999  

Epoch: [1][60/671] Elapsed 0m 19s (remain 3m 15s) Loss: 0.3094(0.2822) Grad: 78726.8750  LR: 0.00001997  

Epoch: [1][80/671] Elapsed 0m 26s (remain 3m 13s) Loss: 0.2001(0.2586) Grad: 62206.6211  LR: 0.00001996  

Epoch: [1][100/671] Elapsed 0m 32s (remain 3m 5s) Loss: 0.1039(0.2395) Grad: 25654.5723  LR: 0.00001993  

Epoch: [1][120/671] Elapsed 0m 39s (remain 2m 59s) Loss: 0.1146(0.2303) Grad: 61796.9414  LR: 0.00001990  

Epoch: [1][140/671] Elapsed 0m 45s (remain 2m 52s) Loss: 0.1640(0.2160) Grad: 53275.7969  LR: 0.00001986  

Epoch: [1][160/671] Elapsed 0m 52s (remain 2m 46s) Loss: 0.1052(0.2122) Grad: 25878.7539  LR: 0.00001982  

Epoch: [1][180/671] Elapsed 0m 58s (remain 2m

Epoch 1 - avg_train_loss: 0.1573  avg_val_loss: 0.1373  time: 237s

Epoch 1 - Score: 0.5342  Scores: [0.5098404292781592, 0.5586359594049851]

Epoch 1 - Save Best Score: 0.5342 Model


Epoch: [2][0/671] Elapsed 0m 0s (remain 8m 22s) Loss: 0.0848(0.0848) Grad: inf  LR: 0.00001707  

Epoch: [2][20/671] Elapsed 0m 5s (remain 2m 52s) Loss: 0.0518(0.1130) Grad: 73935.4141  LR: 0.00001690  

Epoch: [2][40/671] Elapsed 0m 11s (remain 2m 53s) Loss: 0.0432(0.1026) Grad: 71008.0547  LR: 0.00001673  

Epoch: [2][60/671] Elapsed 0m 17s (remain 2m 59s) Loss: 0.1234(0.1009) Grad: 52334.8047  LR: 0.00001655  

Epoch: [2][80/671] Elapsed 0m 25s (remain 3m 4s) Loss: 0.0730(0.0945) Grad: 54970.8047  LR: 0.00001637  

Epoch: [2][100/671] Elapsed 0m 31s (remain 3m 0s) Loss: 0.0806(0.0962) Grad: 67059.7734  LR: 0.00001619  

Epoch: [2][120/671] Elapsed 0m 37s (remain 2m 50s) Loss: 0.1935(0.0968) Grad: 43097.0547  LR: 0.00001601  

Epoch: [2][140/671] Elapsed 0m 43s (remain 2m 43s) Loss: 0.1307(0.0974) Grad: 66373.2656  LR: 0.00001582  

Epoch: [2][160/671] Elapsed 0m 50s (remain 2m 38s) Loss: 0.0716(0.0968) Grad: 66120.9297  LR: 0.00001563  

Epoch: [2][180/671] Elapsed 0m 55s (remain 2m

Epoch 2 - avg_train_loss: 0.0964  avg_val_loss: 0.1176  time: 237s

Epoch 2 - Score: 0.4897  Scores: [0.4399299397146032, 0.5393973342294365]

Epoch 2 - Save Best Score: 0.4897 Model


EVAL: [111/112] Elapsed 0m 20s (remain 0m 0s) Loss: 0.1415(0.1176) 

Epoch: [3][0/671] Elapsed 0m 0s (remain 5m 48s) Loss: 0.0560(0.0560) Grad: inf  LR: 0.00001000  

Epoch: [3][20/671] Elapsed 0m 6s (remain 3m 31s) Loss: 0.0780(0.0798) Grad: 46113.8086  LR: 0.00000977  

Epoch: [3][40/671] Elapsed 0m 13s (remain 3m 24s) Loss: 0.0403(0.0825) Grad: 55498.3438  LR: 0.00000953  

Epoch: [3][60/671] Elapsed 0m 20s (remain 3m 20s) Loss: 0.0898(0.0805) Grad: 72155.8672  LR: 0.00000930  

Epoch: [3][80/671] Elapsed 0m 25s (remain 3m 9s) Loss: 0.0586(0.0755) Grad: 43661.3906  LR: 0.00000907  

Epoch: [3][100/671] Elapsed 0m 33s (remain 3m 8s) Loss: 0.1726(0.0736) Grad: 76698.6875  LR: 0.00000883  

Epoch: [3][120/671] Elapsed 0m 40s (remain 3m 2s) Loss: 0.0398(0.0734) Grad: 45022.7734  LR: 0.00000860  

Epoch: [3][140/671] Elapsed 0m 46s (remain 2m 54s) Loss: 0.0639(0.0725) Grad: 77711.5781  LR: 0.00000837  

Epoch: [3][160/671] Elapsed 0m 52s (remain 2m 45s) Loss: 0.0861(0.0714) Grad: 104927.

Epoch 3 - avg_train_loss: 0.0665  avg_val_loss: 0.1116  time: 236s

Epoch 3 - Score: 0.4732  Scores: [0.4039846242234985, 0.5423233771004446]

Epoch 3 - Save Best Score: 0.4732 Model


EVAL: [111/112] Elapsed 0m 20s (remain 0m 0s) Loss: 0.1387(0.1116) 

Epoch: [4][0/671] Elapsed 0m 0s (remain 6m 36s) Loss: 0.0554(0.0554) Grad: 217263.3438  LR: 0.00000293  

Epoch: [4][20/671] Elapsed 0m 7s (remain 3m 37s) Loss: 0.0668(0.0511) Grad: 34368.8945  LR: 0.00000277  

Epoch: [4][40/671] Elapsed 0m 14s (remain 3m 37s) Loss: 0.0478(0.0514) Grad: 70164.7578  LR: 0.00000261  

Epoch: [4][60/671] Elapsed 0m 20s (remain 3m 26s) Loss: 0.1097(0.0509) Grad: 46946.5352  LR: 0.00000245  

Epoch: [4][80/671] Elapsed 0m 26s (remain 3m 13s) Loss: 0.0369(0.0492) Grad: 54206.7734  LR: 0.00000230  

Epoch: [4][100/671] Elapsed 0m 32s (remain 3m 5s) Loss: 0.0309(0.0486) Grad: 26452.5020  LR: 0.00000216  

Epoch: [4][120/671] Elapsed 0m 40s (remain 3m 3s) Loss: 0.0553(0.0490) Grad: 32940.1172  LR: 0.00000201  

Epoch: [4][140/671] Elapsed 0m 47s (remain 2m 58s) Loss: 0.0641(0.0483) Grad: 28089.7461  LR: 0.00000187  

Epoch: [4][160/671] Elapsed 0m 53s (remain 2m 48s) Loss: 0.0295(0.0482) Grad

Epoch 4 - avg_train_loss: 0.0501  avg_val_loss: 0.1125  time: 237s

Epoch 4 - Score: 0.4753  Scores: [0.4068700782138248, 0.543703168217574]


EVAL: [111/112] Elapsed 0m 20s (remain 0m 0s) Loss: 0.1378(0.1125) 



Score: 0.4732  Scores: [0.4039846242234985, 0.5423233771004446]


DebertaV2Config {

  "_name_or_path": "microsoft/deberta-v3-base",

  "attention_dropout": 0.0,

  "attention_probs_dropout_prob": 0.0,

  "hidden_act": "gelu",

  "hidden_dropout": 0.0,

  "hidden_dropout_prob": 0.0,

  "hidden_size": 768,

  "initializer_range": 0.02,

  "intermediate_size": 3072,

  "layer_norm_eps": 1e-07,

  "max_position_embeddings": 512,

  "max_relative_positions": -1,

  "model_type": "deberta-v2",

  "norm_rel_ebd": "layer_norm",

  "num_attention_heads": 12,

  "num_hidden_layers": 12,

  "output_hidden_states": true,

  "pad_token_id": 0,

  "pooler_dropout": 0,

  "pooler_hidden_act": "gelu",

  "pooler_hidden_size": 768,

  "pos_att_type": [

    "p2c",

    "c2p"

  ],

  "position_biased_input": false,

  "position_buckets": 256,

  "relative_attention": true,

  "share_att_key": true,

  "transformers_version": "4.30.2",

  "type_vocab_size": 0,

  "vocab_size": 128100

}



Some weight

Epoch: [1][0/671] Elapsed 0m 0s (remain 5m 29s) Loss: 0.5084(0.5084) Grad: inf  LR: 0.00002000  

Epoch: [1][20/671] Elapsed 0m 6s (remain 3m 23s) Loss: 0.3439(0.4188) Grad: 32316.9844  LR: 0.00002000  

Epoch: [1][40/671] Elapsed 0m 12s (remain 3m 15s) Loss: 0.2337(0.3669) Grad: 32981.0742  LR: 0.00001999  

Epoch: [1][60/671] Elapsed 0m 20s (remain 3m 26s) Loss: 0.2587(0.3135) Grad: 57121.4492  LR: 0.00001997  

Epoch: [1][80/671] Elapsed 0m 27s (remain 3m 22s) Loss: 0.2747(0.2782) Grad: 42730.0039  LR: 0.00001996  

Epoch: [1][100/671] Elapsed 0m 34s (remain 3m 14s) Loss: 0.0847(0.2515) Grad: 22637.1074  LR: 0.00001993  

Epoch: [1][120/671] Elapsed 0m 40s (remain 3m 6s) Loss: 0.2394(0.2343) Grad: 41158.3398  LR: 0.00001990  

Epoch: [1][140/671] Elapsed 0m 47s (remain 2m 57s) Loss: 0.2182(0.2286) Grad: 33114.7539  LR: 0.00001986  

Epoch: [1][160/671] Elapsed 0m 53s (remain 2m 48s) Loss: 0.0753(0.2203) Grad: 15545.2158  LR: 0.00001982  

Epoch: [1][180/671] Elapsed 0m 59s (remain 2

Epoch 1 - avg_train_loss: 0.1583  avg_val_loss: 0.1338  time: 236s

Epoch 1 - Score: 0.5263  Scores: [0.4562947393159735, 0.5962788203038255]

Epoch 1 - Save Best Score: 0.5263 Model


Epoch: [2][0/671] Elapsed 0m 0s (remain 9m 14s) Loss: 0.2316(0.2316) Grad: inf  LR: 0.00001707  

Epoch: [2][20/671] Elapsed 0m 5s (remain 3m 5s) Loss: 0.0684(0.1127) Grad: 49813.9297  LR: 0.00001690  

Epoch: [2][40/671] Elapsed 0m 12s (remain 3m 5s) Loss: 0.0974(0.1068) Grad: 70137.3906  LR: 0.00001673  

Epoch: [2][60/671] Elapsed 0m 18s (remain 3m 5s) Loss: 0.2525(0.1028) Grad: 77012.6641  LR: 0.00001656  

Epoch: [2][80/671] Elapsed 0m 25s (remain 3m 3s) Loss: 0.1593(0.0998) Grad: 53601.2305  LR: 0.00001638  

Epoch: [2][100/671] Elapsed 0m 31s (remain 3m 0s) Loss: 0.0811(0.0979) Grad: 39415.5273  LR: 0.00001620  

Epoch: [2][120/671] Elapsed 0m 39s (remain 3m 0s) Loss: 0.1185(0.0982) Grad: 53806.4102  LR: 0.00001601  

Epoch: [2][140/671] Elapsed 0m 46s (remain 2m 54s) Loss: 0.0256(0.0993) Grad: 23795.6113  LR: 0.00001582  

Epoch: [2][160/671] Elapsed 0m 52s (remain 2m 46s) Loss: 0.0904(0.0971) Grad: 29049.5059  LR: 0.00001563  

Epoch: [2][180/671] Elapsed 0m 58s (remain 2m 38s

Epoch 2 - avg_train_loss: 0.0994  avg_val_loss: 0.1170  time: 236s

Epoch 2 - Score: 0.4879  Scores: [0.41843396973978864, 0.5573273112109701]

Epoch 2 - Save Best Score: 0.4879 Model


EVAL: [111/112] Elapsed 0m 20s (remain 0m 0s) Loss: 0.0858(0.1170) 

Epoch: [3][0/671] Elapsed 0m 0s (remain 6m 39s) Loss: 0.0433(0.0433) Grad: inf  LR: 0.00001001  

Epoch: [3][20/671] Elapsed 0m 7s (remain 4m 7s) Loss: 0.0578(0.0706) Grad: 65442.2656  LR: 0.00000977  

Epoch: [3][40/671] Elapsed 0m 14s (remain 3m 47s) Loss: 0.0860(0.0658) Grad: 78936.7500  LR: 0.00000954  

Epoch: [3][60/671] Elapsed 0m 21s (remain 3m 34s) Loss: 0.1080(0.0728) Grad: 68353.2656  LR: 0.00000930  

Epoch: [3][80/671] Elapsed 0m 27s (remain 3m 19s) Loss: 0.0367(0.0704) Grad: 38981.8008  LR: 0.00000907  

Epoch: [3][100/671] Elapsed 0m 33s (remain 3m 11s) Loss: 0.0673(0.0702) Grad: 47475.3594  LR: 0.00000884  

Epoch: [3][120/671] Elapsed 0m 41s (remain 3m 7s) Loss: 0.0507(0.0705) Grad: 34966.3125  LR: 0.00000861  

Epoch: [3][140/671] Elapsed 0m 48s (remain 3m 2s) Loss: 0.0944(0.0724) Grad: 76960.4297  LR: 0.00000838  

Epoch: [3][160/671] Elapsed 0m 54s (remain 2m 52s) Loss: 0.0718(0.0719) Grad: 74934.3

Epoch 3 - avg_train_loss: 0.0642  avg_val_loss: 0.1098  time: 237s

Epoch 3 - Score: 0.4719  Scores: [0.40624884349553214, 0.5375453819913976]

Epoch 3 - Save Best Score: 0.4719 Model


EVAL: [111/112] Elapsed 0m 20s (remain 0m 0s) Loss: 0.0986(0.1098) 

Epoch: [4][0/671] Elapsed 0m 0s (remain 4m 56s) Loss: 0.0180(0.0180) Grad: 115517.8125  LR: 0.00000294  

Epoch: [4][20/671] Elapsed 0m 5s (remain 3m 3s) Loss: 0.0458(0.0479) Grad: 47687.4180  LR: 0.00000278  

Epoch: [4][40/671] Elapsed 0m 12s (remain 3m 11s) Loss: 0.0140(0.0496) Grad: 27247.0469  LR: 0.00000262  

Epoch: [4][60/671] Elapsed 0m 18s (remain 3m 8s) Loss: 0.0307(0.0484) Grad: 29133.2383  LR: 0.00000246  

Epoch: [4][80/671] Elapsed 0m 25s (remain 3m 6s) Loss: 0.0685(0.0490) Grad: 53436.0938  LR: 0.00000231  

Epoch: [4][100/671] Elapsed 0m 32s (remain 3m 1s) Loss: 0.0135(0.0475) Grad: 26636.8750  LR: 0.00000216  

Epoch: [4][120/671] Elapsed 0m 38s (remain 2m 55s) Loss: 0.0229(0.0470) Grad: 20958.5410  LR: 0.00000202  

Epoch: [4][140/671] Elapsed 0m 44s (remain 2m 47s) Loss: 0.0207(0.0469) Grad: 32815.7383  LR: 0.00000188  

Epoch: [4][160/671] Elapsed 0m 50s (remain 2m 39s) Loss: 0.0263(0.0468) Grad: 

Epoch 4 - avg_train_loss: 0.0482  avg_val_loss: 0.1099  time: 238s

Epoch 4 - Score: 0.4712  Scores: [0.4054069534502577, 0.5370317731230411]

Epoch 4 - Save Best Score: 0.4712 Model


EVAL: [111/112] Elapsed 0m 20s (remain 0m 0s) Loss: 0.0939(0.1099) 



Score: 0.4712  Scores: [0.4054069534502577, 0.5370317731230411]


Score: 0.4670  Scores: [0.3979851971315948, 0.5360034540441694]
