In [1]:
import os

run_type = os.environ.get('KAGGLE_KERNEL_RUN_TYPE', '')
from sklearn.metrics import f1_score, log_loss
import logging
from types import SimpleNamespace
from pathlib import Path
from datetime import datetime
import math
import torch
print(f"torch.__version__: {torch.__version__}")
import torch.nn as nn
from torch.nn import Parameter
import torch.nn.functional as F
from torch.optim import Adam, SGD, AdamW
from torch.utils.data import DataLoader, Dataset
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.model_selection import StratifiedKFold, GroupKFold
from sklearn.metrics import log_loss
from torch.utils.data import DataLoader, Dataset
from transformers import AutoModelForSequenceClassification, AutoTokenizer, AutoConfig, AutoModel
from transformers import TrainingArguments, Trainer
from tqdm import tqdm
from scipy.special import softmax
from IPython.core.display import display, HTML
from transformers import get_linear_schedule_with_warmup, get_cosine_schedule_with_warmup
import time
from transformers import DataCollatorWithPadding
#from datasets import Dataset, load_metric

# From this Gist: https://gist.github.com/ihoromi4/b681a9088f348942b01711f251e5f964
def seed_everything(seed: int):
    import random, os
    import numpy as np
    import torch
    
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = True

torch.__version__: 1.9.0+cu111


In [2]:
train_df = pd.read_csv('D:/feedback/train.csv')

topic_pred_df = pd.read_csv('D:/feedback/topic_model_feedback.csv')
topic_pred_df = topic_pred_df.drop(columns={'prob'})
topic_pred_df = topic_pred_df.rename(columns={'id': 'essay_id'})

topic_meta_df = pd.read_csv('D:/feedback/topic_model_metadata.csv')
topic_meta_df = topic_meta_df.rename(columns={'Topic': 'topic', 'Name': 'topic_name'}).drop(columns=['Count'])
topic_meta_df.topic_name = topic_meta_df.topic_name.apply(lambda n: ' '.join(n.split('_')[1:]))

topic_pred_df = topic_pred_df.merge(topic_meta_df, on='topic', how='left')

train_df = train_df.merge(topic_pred_df, on='essay_id', how='left')

In [3]:
labels = ['Adequate', 'Effective', 'Ineffective']

In [4]:
config = SimpleNamespace()

config.seed = 420
config.model = 'microsoft/deberta-v3-large'
config.output_path = Path('./')
config.input_path = Path('D:/feedback')

config.n_folds = 5
config.lr = 2e-5
config.weight_decay = 0.01
config.epochs = 4
config.batch_size = 4
config.gradient_accumulation_steps = 1
config.warm_up_ratio = 0.1
config.max_len = 512
config.hidden_dropout_prob = 0.2
config.label_smoothing_factor = 0.
config.eval_per_epoch = 2

logging.disable(logging.WARNING)

seed_everything(config.seed)

In [5]:
from text_unidecode import unidecode
from typing import Dict, List, Tuple
import codecs

def replace_encoding_with_utf8(error: UnicodeError) -> Tuple[bytes, int]:
    return error.object[error.start : error.end].encode("utf-8"), error.end


def replace_decoding_with_cp1252(error: UnicodeError) -> Tuple[str, int]:
    return error.object[error.start : error.end].decode("cp1252"), error.end

# Register the encoding and decoding error handlers for `utf-8` and `cp1252`.
codecs.register_error("replace_encoding_with_utf8", replace_encoding_with_utf8)
codecs.register_error("replace_decoding_with_cp1252", replace_decoding_with_cp1252)

def resolve_encodings_and_normalize(text: str) -> str:
    """Resolve the encoding problems and normalize the abnormal characters."""
    text = (
        text.encode("raw_unicode_escape")
        .decode("utf-8", errors="replace_decoding_with_cp1252")
        .encode("cp1252", errors="replace_encoding_with_utf8")
        .decode("utf-8", errors="replace_decoding_with_cp1252")
    )
    text = unidecode(text)
    return text

In [6]:
def get_essay(essay_id, is_train=True):
    parent_path = 'D:/feedback/train'
    essay_path = os.path.join(parent_path, f"{essay_id}.txt")
    essay_text = open(essay_path, 'r').read()
    return essay_text


def softmax(z):
    assert len(z.shape) == 2
    s = np.max(z, axis=1)
    s = s[:, np.newaxis] # necessary step to do broadcasting
    e_x = np.exp(z - s)
    div = np.sum(e_x, axis=1)
    div = div[:, np.newaxis] # dito
    return e_x / div

def get_score(y_true, y_pred):
    y_pred = softmax(y_pred)
    score = log_loss(y_true, y_pred)
    return round(score, 5)

In [7]:
tokenizer = AutoTokenizer.from_pretrained(config.model, use_fast=True)
tokenizer.model_max_length = config.max_len



In [8]:
train_df['discourse_text'] = train_df['discourse_text'].apply(lambda x : resolve_encodings_and_normalize(x))
train_df['essay_text']  = train_df['essay_id'].apply(lambda x: get_essay(x, is_train=True))
train_df['essay_text'] = train_df['essay_text'].apply(lambda x : resolve_encodings_and_normalize(x))
#test['text'] = test['discourse_type'] + ' ' + test['discourse_text'] + SEP + test['essay_text']
def add_inputs(df):
    df['inputs'] = df.discourse_type.str.lower() + ' ' + df.discourse_text.str.lower() + tokenizer.sep_token + df.essay_text
    return df
train_df['label'] = train_df['discourse_effectiveness'].map({'Ineffective': 0, 'Adequate': 1, 'Effective': 2})
train_df = add_inputs(train_df)
#cv = GroupKFold(n_splits=config.n_folds)
cv = StratifiedKFold(n_splits=config.n_folds, shuffle=True, random_state=42)
train_df['fold'] = -1
for fold_num, (train_idxs, test_idxs) in enumerate(cv.split(train_df.index, train_df.discourse_effectiveness, train_df.essay_id)):
    train_df.loc[test_idxs, ['fold']] = fold_num

In [9]:
#train_df.to_csv('D:/feedback/trainsplit.csv',index=False)#

In [10]:
#train_df=pd.read_csv('D:/feedback/trainsplit.csv')

In [11]:
import sys
sys.path.append('D:/feedback/')
from datasetv1 import TrainDataset

In [12]:
config.tokenizer = tokenizer

In [13]:
class Collate:
    def __init__(self, tokenizer, isTrain=True):
        self.tokenizer = tokenizer
        self.isTrain = isTrain
        # self.args = args

    def __call__(self, batch):
        output = dict()
        output["input_ids"] = [sample["input_ids"] for sample in batch]
        output["attention_mask"] = [sample["attention_mask"] for sample in batch]
        if self.isTrain:
            output["target"] = [sample["target"] for sample in batch]

        # calculate max token length of this batch
        batch_max = max([len(ids) for ids in output["input_ids"]])

        # add padding
        if self.tokenizer.padding_side == "right":
            output["input_ids"] = [s + (batch_max - len(s)) * [self.tokenizer.pad_token_id] for s in output["input_ids"]]
            output["attention_mask"] = [s + (batch_max - len(s)) * [0] for s in output["attention_mask"]]
        else:
            output["input_ids"] = [(batch_max - len(s)) * [self.tokenizer.pad_token_id] + s for s in output["input_ids"]]
            output["attention_mask"] = [(batch_max - len(s)) * [0] + s for s in output["attention_mask"]]

        # convert to tensors
        output["input_ids"] = torch.tensor(output["input_ids"], dtype=torch.long)
        output["attention_mask"] = torch.tensor(output["attention_mask"], dtype=torch.long)
        if self.isTrain:
            output["target"] = torch.tensor(output["target"], dtype=torch.long)

        return output

collate_fn = Collate(tokenizer, isTrain=True)

In [14]:
from torch.cuda.amp import autocast
class CustomModel(nn.Module):
    def __init__(self, cfg, config_path=None, pretrained=False):
        super().__init__()
        self.cfg = cfg
        if config_path is None:
            self.config = AutoConfig.from_pretrained(cfg.model, output_hidden_states=True)
        else:
            self.config = torch.load(config_path)
        
        if pretrained:
            self.model = AutoModel.from_pretrained(cfg.model, config=self.config)
        else:
            self.model = AutoModel.from_config(self.config)
        
        
            
        
        # self.pooler = MeanPooling()
        
        self.bilstm = nn.LSTM(self.config.hidden_size, (self.config.hidden_size) // 2, num_layers=2, 
                              dropout=self.config.hidden_dropout_prob, batch_first=True,
                              bidirectional=True)
        
        self.dropout = nn.Dropout(0.2)
        self.dropout1 = nn.Dropout(0.1)
        self.dropout2 = nn.Dropout(0.2)
        self.dropout3 = nn.Dropout(0.3)
        self.dropout4 = nn.Dropout(0.4)
        self.dropout5 = nn.Dropout(0.5)
        
        self.output = nn.Sequential(
            nn.Linear(self.config.hidden_size, 3))
        

        
    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

    def forward(self, inputs):
        transformer_out = self.model(**inputs)
        
        # LSTM/GRU header
#         all_hidden_states = torch.stack(transformer_out[1])
#         sequence_output = self.pooler(all_hidden_states)
        
        # simple CLS
        sequence_output = transformer_out[0][:, 0, :]

        
        # Main task
        logits1 = self.output(self.dropout1(sequence_output))
        logits2 = self.output(self.dropout2(sequence_output))
        logits3 = self.output(self.dropout3(sequence_output))
        logits4 = self.output(self.dropout4(sequence_output))
        logits5 = self.output(self.dropout5(sequence_output))
        logits = (logits1 + logits2 + logits3 + logits4 + logits5) / 5
        
        return logits
    
def inplace_relu(m):
    classname = m.__class__.__name__
    if classname.find('ReLU') != -1:
        m.inplace=True

In [15]:
from torch import Tensor
from torch.nn import Module
from transformers import AutoModel, AutoConfig
from torch import Tensor
from torch.nn import Module
from torch.optim import Optimizer
from torch.nn.modules.loss import _Loss

class AWP:
    def __init__(
        self,
        model: Module,
        criterion: _Loss,
        optimizer: Optimizer,
        apex: bool,
        adv_param: str="weight",
        adv_lr: float=1.0,
        adv_eps: float=0.01
    ) -> None:
        self.model = model
        self.criterion = nn.CrossEntropyLoss()
        self.optimizer = optimizer
        self.adv_param = adv_param
        self.adv_lr = adv_lr
        self.adv_eps = adv_eps
        self.apex = True
        self.backup = {}
        self.backup_eps = {}

    def attack_backward(self, inputs,label) -> Tensor:
        with torch.cuda.amp.autocast(enabled=self.apex):
            self._save()
            self._attack_step() # モデルを近傍の悪い方へ改変
            preds = self.model(inputs)
            loss = self.criterion(preds, label)
            self.optimizer.zero_grad()
        return loss

    def _attack_step(self) -> None:
        e = 1e-6
        for name, param in self.model.named_parameters():
            if param.requires_grad and param.grad is not None and self.adv_param in name:
                norm1 = torch.norm(param.grad)
                norm2 = torch.norm(param.data.detach())
                if norm1 != 0 and not torch.isnan(norm1):
                    # 直前に損失関数に通してパラメータの勾配を取得できるようにしておく必要あり
                    r_at = self.adv_lr * param.grad / (norm1 + e) * (norm2 + e)
                    param.data.add_(r_at)
                    param.data = torch.min(
                        torch.max(
                            param.data, self.backup_eps[name][0]), self.backup_eps[name][1]
                    )

    def _save(self) -> None:
        for name, param in self.model.named_parameters():
            if param.requires_grad and param.grad is not None and self.adv_param in name:
                if name not in self.backup:
                    self.backup[name] = param.data.clone()
                    grad_eps = self.adv_eps * param.abs().detach()
                    self.backup_eps[name] = (
                        self.backup[name] - grad_eps,
                        self.backup[name] + grad_eps,
                    )

    def _restore(self) -> None:
        for name, param in self.model.named_parameters():
            if name in self.backup:
                param.data = self.backup[name]
        self.backup = {}
        self.backup_eps = {}

In [16]:


# ====================================================
# Helper functions
# ====================================================
class AverageMeter(object):
    """Computes and stores the average and current value"""
    def __init__(self):
        self.reset()

    def reset(self):
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count


def asMinutes(s):
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)


def timeSince(since, percent):
    now = time.time()
    s = now - since
    es = s / (percent)
    rs = es - s
    return '%s (remain %s)' % (asMinutes(s), asMinutes(rs))


def train_fn(fold, train_loader, model, criterion, optimizer, epoch, scheduler, device, awp):
    model.train()
    scaler = torch.cuda.amp.GradScaler(enabled=CFG.apex)
    losses = AverageMeter()
    start = end = time.time()
    global_step = 0
    for step, (inputs, labels) in enumerate(train_loader):
        for k, v in inputs.items():
            inputs[k] = v.to(device)
        labels = labels.to(device)
        batch_size = labels.size(0)
        with torch.cuda.amp.autocast(enabled=CFG.apex):
            y_preds = model(inputs)
            loss = criterion(y_preds, labels)
        
        if CFG.gradient_accumulation_steps > 1:
            loss = loss / CFG.gradient_accumulation_steps
        losses.update(loss.item(), batch_size)
        scaler.scale(loss).backward()
        grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), CFG.max_grad_norm)
        if CFG.nth_awp_start_epoch <= epoch+1:
            loss = awp.attack_backward(inputs, labels)
            scaler.scale(loss).backward()
            awp._restore()
        if (step + 1) % CFG.gradient_accumulation_steps == 0:
            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad()
            global_step += 1
            if CFG.batch_scheduler:
                scheduler.step()
        end = time.time()
        if step % CFG.print_freq == 0 or step == (len(train_loader)-1):
            print('Epoch: [{0}][{1}/{2}] '
                  'Elapsed {remain:s} '
                  'Loss: {loss.val:.4f}({loss.avg:.4f}) '
                  'Grad: {grad_norm:.4f}  '
                  'LR: {lr:.8f}  '
                  .format(epoch+1, step, len(train_loader), 
                          remain=timeSince(start, float(step+1)/len(train_loader)),
                          loss=losses,
                          grad_norm=grad_norm,
                          lr=scheduler.get_lr()[0]))

    return losses.avg


def valid_fn(valid_loader, model, criterion, device):
    losses = AverageMeter()
    model.eval()
    #preds = np.array([])
    preds = []
    start = end = time.time()
    for step, (inputs, labels) in enumerate(valid_loader):
        for k, v in inputs.items():
            inputs[k] = v.to(device)
        labels = labels.to(device)
        batch_size = labels.size(0)
        with torch.no_grad():
            y_preds = model(inputs)
        loss = criterion(y_preds, labels)
        if CFG.gradient_accumulation_steps > 1:
            loss = loss / CFG.gradient_accumulation_steps
        losses.update(loss.item(), batch_size)
        preds.append(y_preds.to('cpu').numpy())
        
#         if step > 0:
#             preds = np.row_stack((preds, y_preds.sigmoid().to('cpu').numpy()))
#         else:
#             preds = y_preds.sigmoid().to('cpu').numpy()

        end = time.time()
        if step % CFG.print_freq == 0 or step == (len(valid_loader)-1):
            print('EVAL: [{0}/{1}] '
                  'Elapsed {remain:s} '
                  'Loss: {loss.val:.4f}({loss.avg:.4f}) '
                  .format(step, len(valid_loader),
                          loss=losses,
                          remain=timeSince(start, float(step+1)/len(valid_loader))))
    predictions = np.concatenate(preds)
    #print(preds.shape, predictions.shape)
#     predictions = np.concatenate(predictions)
    return losses.avg, predictions


def inference_fn(test_loader, model, device):
    preds = []
    model.eval()
    model.to(device)
    tk0 = tqdm(test_loader, total=len(test_loader))
    for inputs in tk0:
        for k, v in inputs.items():
            inputs[k] = v.to(device)
        with torch.no_grad():
            y_preds = model(inputs)
        preds.append(y_preds.to('cpu').numpy())
    predictions = np.concatenate(preds)
    return predictions




In [17]:
class CFG:
    debug=False
    apex=True
    print_freq=100
    num_workers=0
    model="microsoft/deberta-v3-large"
    scheduler='cosine' # ['linear', 'cosine']
    batch_scheduler=True
    num_cycles=.5
    num_warmup_steps=0
    epochs=3
    encoder_lr=5e-7
    decoder_lr=5e-7
    min_lr=1e-8 
    eps=1e-6
    betas=(0.9, 0.999)
    batch_size=4
    fc_dropout=0.2
    target_size=1
    max_len=512
    weight_decay=0.01
    gradient_accumulation_steps=1
    max_grad_norm=1000
    seed=42
    n_fold=5
    trn_fold=[0, 1, 2, 3, 4]
    train=True
    early_stop=4
    nth_awp_start_epoch=2
    awp_lr=1e-4
    awp_eps=1e-2

In [18]:
import gc

In [19]:
def train_loop(folds, fold):
    LOGGER.info(f"========== fold: {fold} training ==========")
    train_folds = folds[folds['fold'] != fold].reset_index(drop=True)
    valid_folds = folds[folds['fold'] == fold].reset_index(drop=True)
    
    train_dataset = TrainDataset(config, train_folds)
    valid_dataset = TrainDataset(config, valid_folds)

    train_loader = DataLoader(train_dataset,
                              batch_size=CFG.batch_size,
                              shuffle=True,
                              num_workers=CFG.num_workers, pin_memory=True, drop_last=True)
    valid_loader = DataLoader(valid_dataset,
                              batch_size=1,
                              shuffle=False,
                              num_workers=CFG.num_workers, pin_memory=True, drop_last=False)

    # ====================================================
    # model & optimizer
    # ====================================================
    model = CustomModel(config)
    torch.save(model.config, OUTPUT_DIR+'config.pth')
    model.to(device)
    
    def get_optimizer_params(model, encoder_lr, decoder_lr, weight_decay=0.0):
        param_optimizer = list(model.named_parameters())
        no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
        optimizer_parameters = [
            {'params': [p for n, p in model.model.named_parameters() if not any(nd in n for nd in no_decay)],
             'lr': encoder_lr, 'weight_decay': weight_decay},
            {'params': [p for n, p in model.model.named_parameters() if any(nd in n for nd in no_decay)],
             'lr': encoder_lr, 'weight_decay': 0.0},
            {'params': [p for n, p in model.named_parameters() if "model" not in n],
             'lr': decoder_lr, 'weight_decay': 0.0}
        ]
        return optimizer_parameters

    optimizer_parameters = get_optimizer_params(model,
                                                encoder_lr=CFG.encoder_lr, 
                                                decoder_lr=CFG.decoder_lr,
                                                weight_decay=CFG.weight_decay)
    optimizer = AdamW(optimizer_parameters, lr=CFG.encoder_lr, eps=CFG.eps, betas=CFG.betas)
    
    # ====================================================
    # scheduler
    # ====================================================
    def get_scheduler(cfg, optimizer, num_train_steps):
        if cfg.scheduler == 'linear':
            scheduler = get_linear_schedule_with_warmup(
                optimizer, num_warmup_steps=cfg.num_warmup_steps, num_training_steps=num_train_steps
            )
        elif cfg.scheduler == 'cosine':
            scheduler = get_cosine_schedule_with_warmup(
                optimizer, num_warmup_steps=cfg.num_warmup_steps, num_training_steps=num_train_steps, num_cycles=cfg.num_cycles
            )
        return scheduler
    
    num_train_steps = int(len(train_folds) / CFG.batch_size * CFG.epochs)
    scheduler = get_scheduler(CFG, optimizer, num_train_steps)
    best_score = 100.0
    es=0
    criterion = nn.CrossEntropyLoss()
    
    awp = AWP(
            model, 
            criterion, 
            optimizer,
            True,
            adv_lr=2e-5, 
            adv_eps=1e-2
        )
    for epoch in range(CFG.epochs):
        start_time = time.time()
        # train
        # train
        avg_loss = train_fn(fold, train_loader, model, criterion, optimizer, epoch, scheduler, device,awp)

        # eval
        avg_val_loss, predictions = valid_fn(valid_loader, model, criterion, device)
        score = get_score(valid_dataset.target, predictions)
        # eval
        elapsed = time.time() - start_time

        LOGGER.info(f'Epoch {epoch+1} - avg_train_loss: {avg_loss:.4f}  - avg_val_loss: {avg_val_loss:.4f}  time: {elapsed:.0f}s')
       # LOGGER.info(f'Epoch {epoch+1} - Score: {avg_val_metric:.4f}')
        if score < best_score:
            best_score = score
            print(f'Epoch {epoch+1} - Save Best Score: {best_score:.4f} Model')
            torch.save({'model': model.state_dict(),
                        'predictions': predictions},
                        OUTPUT_DIR+f"{CFG.model.replace('/', '-')}_fold{fold}_best.pth")
        else:
            es+=1
            if es == CFG.early_stop:
                break
        gc.collect()
    predictions = torch.load(OUTPUT_DIR+f"{CFG.model.replace('/', '-')}_fold{fold}_best.pth", 
                             map_location=torch.device('cpu'))['predictions']
    deb_ineffective = []
    deb_effective = []
    deb_adequate = []

    
    deb_ineffective.append(predictions[:, 0])
    deb_adequate.append(predictions[:, 1])
    deb_effective.append(predictions[:, 2])
    # list -> dataframe
    deb_ineffective = pd.DataFrame(deb_ineffective).T
    deb_adequate = pd.DataFrame(deb_adequate).T
    deb_effective = pd.DataFrame(deb_effective).T
    
    valid_folds['oof_ineffective']=deb_ineffective
    valid_folds['oof_adequate']=deb_adequate
    valid_folds['oof_effective']=deb_effective
    torch.cuda.empty_cache()
    gc.collect()
    return valid_folds

In [20]:
OUTPUT_DIR = 'D:/feedback/output/'
if not os.path.exists(OUTPUT_DIR):
    os.makedirs(OUTPUT_DIR)

In [21]:
device = 'cuda'

In [22]:
def get_logger(filename=OUTPUT_DIR+'train'):
    from logging import getLogger, INFO, StreamHandler, FileHandler, Formatter
    logger = getLogger(__name__)
    logger.setLevel(INFO)
    handler1 = StreamHandler()
    handler1.setFormatter(Formatter("%(message)s"))
    handler2 = FileHandler(filename=f"{filename}.log")
    handler2.setFormatter(Formatter("%(message)s"))
    logger.addHandler(handler1)
    logger.addHandler(handler2)
    return logger

LOGGER = get_logger()

In [23]:
if __name__ == '__main__':    
        oof_df = pd.DataFrame()
        for fold in range(CFG.n_fold):
            if fold in CFG.trn_fold:
                _oof_df = train_loop(train_df, fold)
                oof_df = pd.concat([oof_df, _oof_df])
                #LOGGER.info(f"========== fold: {fold} result ==========")
                #get_result(_oof_df)
        oof_df = oof_df.reset_index(drop=True)
        #LOGGER.info(f"========== CV ==========")
        #get_result(oof_df)
        oof_df.to_pickle(OUTPUT_DIR+'oof_df.pkl')

  grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), CFG.max_grad_norm)


Epoch: [1][0/7353] Elapsed 0m 1s (remain 136m 59s) Loss: 1.3826(1.3826) Grad: inf  LR: 0.00000050  
Epoch: [1][100/7353] Elapsed 0m 41s (remain 50m 3s) Loss: 1.1263(1.0505) Grad: 621882.6250  LR: 0.00000050  
Epoch: [1][200/7353] Elapsed 1m 22s (remain 49m 6s) Loss: 1.1527(1.0404) Grad: 589467.8125  LR: 0.00000050  
Epoch: [1][300/7353] Elapsed 2m 3s (remain 48m 18s) Loss: 0.9888(1.0437) Grad: 605109.1250  LR: 0.00000050  
Epoch: [1][400/7353] Elapsed 2m 44s (remain 47m 31s) Loss: 0.5361(1.0314) Grad: 675627.0625  LR: 0.00000050  
Epoch: [1][500/7353] Elapsed 3m 25s (remain 46m 49s) Loss: 0.6528(1.0164) Grad: 453738.8125  LR: 0.00000050  
Epoch: [1][600/7353] Elapsed 4m 6s (remain 46m 6s) Loss: 1.4624(1.0157) Grad: 777399.6250  LR: 0.00000050  
Epoch: [1][700/7353] Elapsed 4m 47s (remain 45m 24s) Loss: 0.7860(1.0084) Grad: 654568.7500  LR: 0.00000050  
Epoch: [1][800/7353] Elapsed 5m 27s (remain 44m 42s) Loss: 0.6813(1.0030) Grad: 802901.0625  LR: 0.00000050  
Epoch: [1][900/7353] Elap

  grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), CFG.max_grad_norm)


Epoch: [2][0/7353] Elapsed 0m 0s (remain 101m 14s) Loss: 1.9287(1.9287) Grad: inf  LR: 0.00000037  
Epoch: [2][100/7353] Elapsed 1m 23s (remain 100m 20s) Loss: 0.9570(0.9599) Grad: 389343.3750  LR: 0.00000037  
Epoch: [2][200/7353] Elapsed 2m 46s (remain 98m 58s) Loss: 0.4670(0.9244) Grad: 207897.5156  LR: 0.00000037  
Epoch: [2][300/7353] Elapsed 4m 9s (remain 97m 25s) Loss: 0.7526(0.9085) Grad: 390935.6875  LR: 0.00000037  
Epoch: [2][400/7353] Elapsed 5m 32s (remain 95m 59s) Loss: 0.8688(0.8953) Grad: 212119.6875  LR: 0.00000036  
Epoch: [2][500/7353] Elapsed 6m 54s (remain 94m 33s) Loss: 0.8435(0.8851) Grad: 635593.2500  LR: 0.00000036  
Epoch: [2][600/7353] Elapsed 8m 17s (remain 93m 10s) Loss: 1.4818(0.8816) Grad: 723017.2500  LR: 0.00000036  
Epoch: [2][700/7353] Elapsed 9m 40s (remain 91m 45s) Loss: 0.7508(0.8802) Grad: 296538.8438  LR: 0.00000035  
Epoch: [2][800/7353] Elapsed 11m 2s (remain 90m 21s) Loss: 1.4698(0.8793) Grad: 487359.2812  LR: 0.00000035  
Epoch: [2][900/7353]

  grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), CFG.max_grad_norm)


Epoch: [3][100/7353] Elapsed 1m 23s (remain 99m 49s) Loss: 1.2926(0.7598) Grad: 500159.1562  LR: 0.00000012  
Epoch: [3][200/7353] Elapsed 2m 46s (remain 98m 32s) Loss: 0.8946(0.7805) Grad: 264112.9062  LR: 0.00000012  
Epoch: [3][300/7353] Elapsed 4m 9s (remain 97m 14s) Loss: 0.8224(0.7907) Grad: 251634.5781  LR: 0.00000012  
Epoch: [3][400/7353] Elapsed 5m 31s (remain 95m 49s) Loss: 1.0052(0.7923) Grad: 333527.3438  LR: 0.00000011  
Epoch: [3][500/7353] Elapsed 6m 54s (remain 94m 25s) Loss: 1.1322(0.8054) Grad: 366649.1250  LR: 0.00000011  
Epoch: [3][600/7353] Elapsed 8m 16s (remain 93m 1s) Loss: 1.1505(0.8137) Grad: 483448.9375  LR: 0.00000011  
Epoch: [3][700/7353] Elapsed 9m 39s (remain 91m 39s) Loss: 0.6620(0.8128) Grad: 238343.6875  LR: 0.00000010  
Epoch: [3][800/7353] Elapsed 11m 2s (remain 90m 15s) Loss: 0.8829(0.8107) Grad: 221946.6250  LR: 0.00000010  
Epoch: [3][900/7353] Elapsed 12m 24s (remain 88m 53s) Loss: 0.7355(0.8100) Grad: 178962.3906  LR: 0.00000010  
Epoch: [3][

  grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), CFG.max_grad_norm)


Epoch: [1][0/7353] Elapsed 0m 0s (remain 44m 5s) Loss: 0.9629(0.9629) Grad: inf  LR: 0.00000050  
Epoch: [1][100/7353] Elapsed 0m 41s (remain 49m 5s) Loss: 0.6503(1.0273) Grad: 454671.7500  LR: 0.00000050  
Epoch: [1][200/7353] Elapsed 1m 21s (remain 48m 29s) Loss: 1.1749(0.9971) Grad: 615997.3125  LR: 0.00000050  
Epoch: [1][300/7353] Elapsed 2m 2s (remain 47m 53s) Loss: 0.7081(1.0037) Grad: 472586.9688  LR: 0.00000050  
Epoch: [1][400/7353] Elapsed 2m 43s (remain 47m 13s) Loss: 1.0322(0.9951) Grad: 695097.2500  LR: 0.00000050  
Epoch: [1][500/7353] Elapsed 3m 24s (remain 46m 34s) Loss: 0.7720(0.9882) Grad: 581345.1875  LR: 0.00000050  
Epoch: [1][600/7353] Elapsed 4m 4s (remain 45m 52s) Loss: 0.3571(0.9755) Grad: 521273.6562  LR: 0.00000050  
Epoch: [1][700/7353] Elapsed 4m 45s (remain 45m 12s) Loss: 1.4233(0.9754) Grad: 808323.3750  LR: 0.00000050  
Epoch: [1][800/7353] Elapsed 5m 26s (remain 44m 32s) Loss: 0.8393(0.9756) Grad: 498836.6250  LR: 0.00000050  
Epoch: [1][900/7353] Elap

  grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), CFG.max_grad_norm)


Epoch: [2][0/7353] Elapsed 0m 0s (remain 101m 58s) Loss: 0.6777(0.6777) Grad: inf  LR: 0.00000037  
Epoch: [2][100/7353] Elapsed 1m 23s (remain 99m 58s) Loss: 1.1419(0.8288) Grad: 340843.7812  LR: 0.00000037  
Epoch: [2][200/7353] Elapsed 2m 46s (remain 98m 41s) Loss: 1.4247(0.8507) Grad: 950418.0000  LR: 0.00000037  
Epoch: [2][300/7353] Elapsed 4m 9s (remain 97m 16s) Loss: 0.6057(0.8611) Grad: 348337.0938  LR: 0.00000037  
Epoch: [2][400/7353] Elapsed 5m 31s (remain 95m 51s) Loss: 0.3461(0.8554) Grad: 300258.7812  LR: 0.00000036  
Epoch: [2][500/7353] Elapsed 6m 54s (remain 94m 29s) Loss: 1.2875(0.8522) Grad: 630457.3750  LR: 0.00000036  
Epoch: [2][600/7353] Elapsed 8m 17s (remain 93m 6s) Loss: 0.9023(0.8545) Grad: 283211.0000  LR: 0.00000036  
Epoch: [2][700/7353] Elapsed 9m 40s (remain 91m 51s) Loss: 1.0517(0.8549) Grad: 334680.5625  LR: 0.00000035  
Epoch: [2][800/7353] Elapsed 11m 3s (remain 90m 29s) Loss: 0.8859(0.8545) Grad: 315763.5625  LR: 0.00000035  
Epoch: [2][900/7353] E

  grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), CFG.max_grad_norm)


Epoch: [3][0/7353] Elapsed 0m 0s (remain 87m 52s) Loss: 0.9719(0.9719) Grad: inf  LR: 0.00000012  
Epoch: [3][100/7353] Elapsed 1m 23s (remain 99m 37s) Loss: 0.5045(0.7818) Grad: 186376.5781  LR: 0.00000012  
Epoch: [3][200/7353] Elapsed 2m 45s (remain 98m 22s) Loss: 1.0174(0.8170) Grad: 307359.5938  LR: 0.00000012  
Epoch: [3][300/7353] Elapsed 4m 8s (remain 97m 5s) Loss: 0.5466(0.8022) Grad: 380228.3750  LR: 0.00000012  
Epoch: [3][400/7353] Elapsed 5m 31s (remain 95m 47s) Loss: 0.8998(0.8067) Grad: 299002.9375  LR: 0.00000011  
Epoch: [3][500/7353] Elapsed 6m 54s (remain 94m 24s) Loss: 1.0682(0.8034) Grad: 382363.9062  LR: 0.00000011  
Epoch: [3][600/7353] Elapsed 8m 16s (remain 93m 2s) Loss: 0.8063(0.8037) Grad: 258571.5312  LR: 0.00000011  
Epoch: [3][700/7353] Elapsed 9m 39s (remain 91m 41s) Loss: 0.4827(0.7955) Grad: 203530.7344  LR: 0.00000010  
Epoch: [3][800/7353] Elapsed 11m 2s (remain 90m 18s) Loss: 0.7687(0.7953) Grad: 375046.7500  LR: 0.00000010  
Epoch: [3][900/7353] Ela

  grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), CFG.max_grad_norm)


Epoch: [1][0/7353] Elapsed 0m 0s (remain 42m 7s) Loss: 0.9698(0.9698) Grad: inf  LR: 0.00000050  
Epoch: [1][100/7353] Elapsed 0m 41s (remain 49m 3s) Loss: 1.5286(0.9957) Grad: 997102.6875  LR: 0.00000050  
Epoch: [1][200/7353] Elapsed 1m 21s (remain 48m 26s) Loss: 1.7183(1.0101) Grad: 980012.8125  LR: 0.00000050  
Epoch: [1][300/7353] Elapsed 2m 2s (remain 47m 47s) Loss: 0.6753(1.0160) Grad: 649928.6875  LR: 0.00000050  
Epoch: [1][400/7353] Elapsed 2m 43s (remain 47m 7s) Loss: 0.9193(1.0114) Grad: 1004403.9375  LR: 0.00000050  
Epoch: [1][500/7353] Elapsed 3m 23s (remain 46m 28s) Loss: 0.8080(1.0113) Grad: 475682.0938  LR: 0.00000050  
Epoch: [1][600/7353] Elapsed 4m 4s (remain 45m 49s) Loss: 1.5771(1.0100) Grad: 883028.9375  LR: 0.00000050  
Epoch: [1][700/7353] Elapsed 4m 45s (remain 45m 8s) Loss: 0.5206(1.0027) Grad: 652313.6250  LR: 0.00000050  
Epoch: [1][800/7353] Elapsed 5m 26s (remain 44m 28s) Loss: 0.5433(0.9950) Grad: 377651.5000  LR: 0.00000050  
Epoch: [1][900/7353] Elaps

  grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), CFG.max_grad_norm)


Epoch: [2][0/7353] Elapsed 0m 0s (remain 100m 52s) Loss: 1.6050(1.6050) Grad: inf  LR: 0.00000037  
Epoch: [2][100/7353] Elapsed 1m 23s (remain 99m 51s) Loss: 0.4471(1.0665) Grad: 367252.8125  LR: 0.00000037  
Epoch: [2][200/7353] Elapsed 2m 46s (remain 98m 32s) Loss: 0.5869(0.9970) Grad: 219467.4844  LR: 0.00000037  
Epoch: [2][300/7353] Elapsed 4m 8s (remain 97m 8s) Loss: 0.8706(0.9633) Grad: 426527.4688  LR: 0.00000037  
Epoch: [2][400/7353] Elapsed 5m 31s (remain 95m 47s) Loss: 1.1405(0.9384) Grad: 556436.5625  LR: 0.00000036  
Epoch: [2][500/7353] Elapsed 6m 54s (remain 94m 25s) Loss: 0.6469(0.9185) Grad: 190175.0938  LR: 0.00000036  
Epoch: [2][600/7353] Elapsed 8m 16s (remain 93m 0s) Loss: 0.5782(0.9025) Grad: 190287.7031  LR: 0.00000036  
Epoch: [2][700/7353] Elapsed 9m 39s (remain 91m 38s) Loss: 1.5349(0.8903) Grad: 558875.6875  LR: 0.00000035  
Epoch: [2][800/7353] Elapsed 11m 2s (remain 90m 16s) Loss: 0.8328(0.8927) Grad: 350791.3125  LR: 0.00000035  
Epoch: [2][900/7353] El

  grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), CFG.max_grad_norm)


Epoch: [3][0/7353] Elapsed 0m 0s (remain 88m 6s) Loss: 0.4741(0.4741) Grad: inf  LR: 0.00000012  
Epoch: [3][100/7353] Elapsed 1m 23s (remain 99m 40s) Loss: 0.8476(0.7151) Grad: 465134.0625  LR: 0.00000012  
Epoch: [3][200/7353] Elapsed 2m 46s (remain 98m 33s) Loss: 0.5623(0.7572) Grad: 416200.8438  LR: 0.00000012  
Epoch: [3][300/7353] Elapsed 4m 8s (remain 97m 12s) Loss: 0.9869(0.7702) Grad: 285476.7812  LR: 0.00000012  
Epoch: [3][400/7353] Elapsed 5m 31s (remain 95m 50s) Loss: 0.5526(0.7793) Grad: 285068.7500  LR: 0.00000011  
Epoch: [3][500/7353] Elapsed 6m 54s (remain 94m 27s) Loss: 0.9599(0.7809) Grad: 411572.3750  LR: 0.00000011  
Epoch: [3][600/7353] Elapsed 8m 17s (remain 93m 3s) Loss: 1.0723(0.7803) Grad: 178829.7500  LR: 0.00000011  
Epoch: [3][700/7353] Elapsed 9m 39s (remain 91m 41s) Loss: 1.1949(0.7841) Grad: 220651.3281  LR: 0.00000010  
Epoch: [3][800/7353] Elapsed 11m 2s (remain 90m 19s) Loss: 0.7791(0.7880) Grad: 125251.5547  LR: 0.00000010  
Epoch: [3][900/7353] Ela

  grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), CFG.max_grad_norm)


Epoch: [1][0/7353] Elapsed 0m 0s (remain 45m 33s) Loss: 1.2886(1.2886) Grad: inf  LR: 0.00000050  
Epoch: [1][100/7353] Elapsed 0m 42s (remain 51m 5s) Loss: 0.7612(1.0517) Grad: 492514.3125  LR: 0.00000050  
Epoch: [1][200/7353] Elapsed 1m 24s (remain 49m 57s) Loss: 0.8907(1.0130) Grad: 458244.1250  LR: 0.00000050  
Epoch: [1][300/7353] Elapsed 2m 5s (remain 49m 11s) Loss: 1.3458(1.0090) Grad: 782875.0625  LR: 0.00000050  
Epoch: [1][400/7353] Elapsed 2m 47s (remain 48m 26s) Loss: 0.8739(1.0001) Grad: 481011.8438  LR: 0.00000050  


KeyboardInterrupt: 

In [24]:
oof_df

Unnamed: 0,discourse_id,essay_id,discourse_text,discourse_type,discourse_effectiveness,essay_text,label,inputs,fold,oof_ineffective,oof_adequate,oof_effective
0,0013cc385424,007ACE74B050,"Hi, i'm Isaac, i'm going to be writing about h...",Lead,Adequate,"Hi, i'm Isaac, i'm going to be writing about h...",1,"lead hi, i'm isaac, i'm going to be writing ab...",0,0.312385,1.141884,-2.849747
1,6efd9102298b,00944C693682,the environment suffers greatly from the many ...,Claim,Effective,Limiting the usage of cars has personal and pr...,2,claim the environment suffers greatly from the...,0,-2.063072,0.061954,0.578710
2,8df2da9994bf,00944C693682,It is also worth noting that cities that have ...,Claim,Effective,Limiting the usage of cars has personal and pr...,2,claim it is also worth noting that cities that...,0,-2.059776,0.069159,0.562960
3,af494e4e0b4f,00BD97EA4041,To conclusion computers in school shouldn't de...,Concluding Statement,Adequate,Should computers read the emotional expression...,1,concluding statement to conclusion computers i...,0,-0.197987,0.835998,-2.118340
4,095847b91399,00C6E82FE5BA,People use face reconition alot and it can be ...,Evidence,Adequate,I think that it wouldn't be valueable to have ...,1,evidence people use face reconition alot and i...,0,-0.517432,0.617753,-1.679376
...,...,...,...,...,...,...,...,...,...,...,...,...
7348,4c649f487587,FE3CA06DDCA1,they are give you as much detail as they have ...,Claim,Adequate,Why is it when someone asks you for advice the...,1,claim they are give you as much detail as they...,2,-0.083238,1.245852,-0.878734
7349,2a26c07f620e,FEF42864AE28,due to the distractions for students at home,Claim,Effective,"During a long day at school, have you ever tho...",2,claim due to the distractions for students at ...,2,-1.978429,0.612321,1.213933
7350,625808403363,FF9E0379CD98,if you need highlights or pensil for write som...,Evidence,Adequate,Some school offer distence learning as a optio...,1,evidence if you need highlights or pensil for ...,2,0.364349,1.507439,-1.426399
7351,74c58fcc7ef8,FF9E0379CD98,you cant work or cant study after school with ...,Evidence,Adequate,Some school offer distence learning as a optio...,1,evidence you cant work or cant study after sch...,2,0.312942,1.485517,-1.370835


In [25]:
mll = log_loss(
            oof_df['label'],
            oof_df[['oof_ineffective'	,'oof_adequate',	'oof_effective']],
            labels=[0, 1, 2],
        )

In [26]:
mll

5.487588889049747