In [1]:
# Install Hugging Face transformers quietly (-q); no version is pinned here.
!pip install -q transformers



References:

https://colab.research.google.com/drive/19loLGUDjxGKy4ulZJ1m3hALq2ozNyEGe#scrollTo=oJFsRo_vGDYU

https://www.kaggle.com/piantic/pytorch-tpu

In [2]:
# for TPU: download the PyTorch/XLA environment setup script from the
# pytorch/xla master branch (unpinned) and run it, also installing the
# libomp5 and libopenblas-dev apt packages.
!curl https://raw.githubusercontent.com/pytorch/xla/master/contrib/scripts/env-setup.py -o pytorch-xla-env-setup.py
!python pytorch-xla-env-setup.py --apt-packages libomp5 libopenblas-dev

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  5116  100  5116    0     0  30094      0 --:--:-- --:--:-- --:--:-- 30094
Updating... This may take around 2 minutes.
Updating TPU runtime to pytorch-dev20200515 ...
Found existing installation: torch 1.7.0
Uninstalling torch-1.7.0:
Done updating TPU runtime
  Successfully uninstalled torch-1.7.0
Found existing installation: torchvision 0.8.1
Uninstalling torchvision-0.8.1:
  Successfully uninstalled torchvision-0.8.1
Copying gs://tpu-pytorch/wheels/torch-nightly+20200515-cp37-cp37m-linux_x86_64.whl...

Operation completed over 1 objects/91.0 MiB.                                     
Copying gs://tpu-pytorch/wheels/torch_xla-nightly+20200515-cp37-cp37m-linux_x86_64.whl...

Operation completed over 1 objects/119.5 MiB.                                    
Copying gs://tpu-pytorch/wheels/torchvision-nightly+202

In [3]:
import os 
import sys

import math
import random
import time
import warnings

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import transformers as T
from sklearn.metrics import fbeta_score
from sklearn.model_selection import StratifiedKFold
from torch.utils.data import DataLoader, Dataset
from tqdm.notebook import tqdm

import seaborn as sns

In [4]:
# Silence every Python warning for the rest of the session.
# NOTE(review): this also hides library deprecation warnings — confirm that is intended.
warnings.filterwarnings("ignore")

In [5]:
# imports the torch_xla package
import torch_xla
import torch_xla.core.xla_model as xm

# Acquire the XLA device; models and batches are moved onto it later in the notebook.
device = xm.xla_device()
# Make float32 CPU tensors the default type for newly created tensors.
torch.set_default_tensor_type('torch.FloatTensor')


In [6]:
# GPU/CPU fallback kept for reference (disabled while the XLA device is in use):
#device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Show which device was selected.
print(device)

xla:1


In [7]:
# Input directory of the competition files and the directory that receives
# logs, checkpoints and submission files. Paths are built by plain string
# concatenation elsewhere in this notebook, so both must end with "/".
DATA_DIR = "../input/signate-spws-2/"
OUTPUT_DIR = "./"

# e.g. ../input/signate-spws-2/train.csv

In [8]:
# Set DEBUG = True for a quick smoke run on 20 randomly sampled rows per file.
DEBUG = False

train = pd.read_csv(DATA_DIR + "train.csv")
if DEBUG:
    train = train.sample(20).reset_index(drop=True)

test = pd.read_csv(DATA_DIR + "test.csv")
if DEBUG:
    test = test.sample(20).reset_index(drop=True)

# sample_submit.csv is read without a header row; the columns are named below.
sub = pd.read_csv(DATA_DIR + "sample_submit.csv", header=None)
if DEBUG:
    sub = sub.sample(20).reset_index(drop=True)
sub.columns = ["id", "judgement"]

In [9]:
# Model input text: title, one space, then the abstract (a missing abstract
# becomes the literal string "NaN").
train["text"] = train["title"] + " " + train["abstract"].fillna("NaN")
# Number of space-separated pieces in each text.
train["text_len"] = train["text"].map(lambda text: len(text.split(" ")))

In [10]:
# Data correction: force the label of these two training ids to 0.
train.loc[train["id"].isin([2488, 7708]), "judgement"] = 0

In [11]:
class config:
    """Run configuration, read everywhere as class attributes (e.g. ``config.MAX_LEN``).

    The class body is evaluated once, so the values depend on the module-level
    ``DEBUG`` flag and on ``train`` as they are when this cell runs.
    """

    # Share of positive labels in `train`; used as the default decision threshold.
    border = len(train[train["judgement"] == 1]) / len(train["judgement"])
    seed = 89
    # Token length every text is padded / truncated to.
    MAX_LEN = 400
    TRAIN_BATCH_SIZE = 16
    VALID_BATCH_SIZE = 16
    EPOCH = 3

    if DEBUG:
        # FILENAME used to be missing from the debug branch, which made the
        # logger default and every checkpoint/submission path raise
        # AttributeError as soon as DEBUG was switched on.
        FILENAME = 'debug_scibert_scivocab_uncased'
        NUM_SPLITS = 5
        #MODEL_NAME = "bert-base-uncased"
        # NOTE(review): this checkpoint is loaded through the Electra tokenizer and
        # model classes used by BaseDataset/BaseModel — confirm that is acceptable
        # for a debug run.
        MODEL_NAME = "allenai/scibert_scivocab_uncased"
    else:
        FILENAME = 'bioelectra-base-discriminator-pubmed-pmc-lt_fold7'
        NUM_SPLITS = 7
        #MODEL_NAME = "microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext"
        #MODEL_NAME = "sultan/BioM-ELECTRA-Base-Discriminator"
        #MODEL_NAME = "gsarti/biobert-nli"
        MODEL_NAME = "kamalkraj/bioelectra-base-discriminator-pubmed-pmc-lt"

In [12]:
def init_logger(log_file=OUTPUT_DIR + f"{config.FILENAME}_train.log"):
    """Return this module's logger, writing bare messages to the console and to a file.

    Args:
        log_file: Path of the log file. The default is computed once, when the
            function is defined, from ``OUTPUT_DIR`` and ``config.FILENAME``.

    Returns:
        The ``logging.Logger`` named after this module, set to INFO, carrying
        exactly one stream handler and one file handler.
    """
    from logging import INFO, FileHandler, Formatter, StreamHandler, getLogger

    logger = getLogger(__name__)
    logger.setLevel(INFO)

    # getLogger hands back the same object on every call, so re-running this
    # cell would otherwise stack handlers and print every message several times.
    for stale_handler in list(logger.handlers):
        logger.removeHandler(stale_handler)
        stale_handler.close()

    message_only = Formatter("%(message)s")
    for handler in (StreamHandler(), FileHandler(filename=log_file)):
        handler.setFormatter(message_only)
        logger.addHandler(handler)
    return logger


LOGGER = init_logger()

In [13]:
def seed_torch(seed=42):
    """Seed the Python, NumPy and PyTorch (CPU and CUDA) RNGs with `seed`.

    Also exports the seed as PYTHONHASHSEED and turns on deterministic cuDNN.
    """
    os.environ["PYTHONHASHSEED"] = str(seed)

    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed)

    torch.backends.cudnn.deterministic = True

# Module-level seed taken from the config; get_train_data also reads this
# global as the StratifiedKFold random_state.
seed = config.seed
seed_torch(seed)

In [14]:
from scipy.optimize import minimize, minimize_scalar
def optimize_threshold(y_true, y_pred, beta=7, bounds=(0, 0.5)):
    """Search for the decision threshold that maximises the F-beta score.

    A prediction counts as positive when ``y_pred >= threshold``.

    Args:
        y_true: Array-like of binary ground-truth labels.
        y_pred: Array-like of predicted scores, same length as `y_true`.
        beta: Beta of the F-beta score (default 7, the value used before this
            became a parameter).
        bounds: ``(low, high)`` interval the threshold is searched in
            (default ``(0, 0.5)``, the interval used before this became a parameter).

    Returns:
        The threshold found by the bounded scalar search, as a Python float.
    """
    # Accept lists as well as arrays: the comparison below must be elementwise.
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)

    def negative_fbeta(threshold):
        # minimize_scalar minimises, so negate the score to maximise it.
        return -fbeta_score(y_true, y_pred >= threshold, beta=beta)

    # NOTE(review): F-beta is piecewise constant in the threshold, so the bounded
    # search can settle on a plateau that is not the global optimum.
    result = minimize_scalar(negative_fbeta, bounds=bounds, method="bounded")

    # float() works for NumPy scalars and plain Python floats alike, whereas
    # .item() exists only on NumPy objects.
    return float(result.x)


In [15]:
def get_train_data(train):
    """Attach a stratified cross-validation fold id to every row of `train`.

    Folds are stratified on the "judgement" column, using ``config.NUM_SPLITS``
    splits and the module-level ``seed`` as random state.

    Args:
        train: DataFrame with a "judgement" column. It is modified in place:
            a uint8 "fold" column is added (or overwritten).

    Returns:
        The same DataFrame object, now carrying the "fold" column.
    """
    # Assign fold numbers for cross-validation.
    splitter = StratifiedKFold(n_splits=config.NUM_SPLITS, shuffle=True, random_state=seed)

    # split() yields positional row indices. Filling a positional array (rather
    # than label-based .loc writes) stays correct even when the frame's index is
    # not the default 0..n-1 range.
    fold_ids = np.zeros(len(train), dtype=np.uint8)
    for n, (_, val_index) in enumerate(splitter.split(train, train["judgement"])):
        fold_ids[val_index] = n

    train["fold"] = fold_ids
    return train

In [16]:
# Add the "fold" column (get_train_data updates the frame in place and returns it).
train = get_train_data(train)

In [17]:
class BaseDataset(Dataset):
    """Dataset of tokenised "title + abstract" texts, optionally with labels.

    All texts are tokenised once in the constructor, padded/truncated to
    ``config.MAX_LEN``. Items are ``(input_ids, attention_mask)`` tensors,
    followed by a float label tensor when `include_labels` is true.
    """

    def __init__(self, df, model_name, include_labels=True):
        tokenizer = T.ElectraTokenizer.from_pretrained(model_name)

        self.df = df
        self.include_labels = include_labels

        # Writes (or overwrites) the "text" column on the caller's DataFrame;
        # a missing abstract becomes the literal string "NaN".
        df["text"] = df["title"] + " " + df["abstract"].fillna("NaN")
        self.text = df["text"].tolist()

        self.encoded = tokenizer.batch_encode_plus(
            self.text,
            padding="max_length",
            max_length=config.MAX_LEN,
            truncation=True,
            return_attention_mask=True,
        )

        if self.include_labels:
            self.labels = df["judgement"].values

    def __len__(self):
        """Number of rows in the wrapped DataFrame."""
        return len(self.df)

    def __getitem__(self, idx):
        """Return the tensors for row `idx`."""
        features = tuple(
            torch.tensor(self.encoded[key][idx]) for key in ("input_ids", "attention_mask")
        )
        if not self.include_labels:
            return features

        label = torch.tensor(self.labels[idx]).float()
        return (*features, label)

In [18]:
class BaseModel(nn.Module):
    """ELECTRA sequence classifier producing one sigmoid probability per example."""

    def __init__(self, model_name):
        """Load the pretrained checkpoint `model_name` with a single-logit head."""
        super().__init__()

        self.model = T.ElectraForSequenceClassification.from_pretrained(model_name, num_labels=1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, input_ids, attention_mask):
        """Return a 1-D tensor with one probability per example in the batch.

        Args:
            input_ids: Token-id tensor for the batch.
            attention_mask: Attention-mask tensor matching `input_ids`.
        """
        out = self.model(input_ids=input_ids, attention_mask=attention_mask)
        # Squeeze only the trailing label axis. A bare squeeze() would also drop
        # the batch axis for a single-example batch, yielding a 0-d tensor that
        # breaks the loss shape check and np.concatenate downstream.
        out = self.sigmoid(out.logits).squeeze(-1)

        return out

In [19]:
class AverageMeter(object):
    """Tracks the latest value and the running weighted average of a series."""

    def __init__(self):
        self.reset()

    def reset(self):
        """Zero the latest value, the average, the weighted sum and the count."""
        self.val = self.avg = self.sum = self.count = 0

    def update(self, val, n=1):
        """Record `val` as observed `n` times and refresh the running average."""
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count


def asMinutes(s):
    """Format a duration of `s` seconds as "<minutes>m <seconds>s" (whole numbers)."""
    minutes = math.floor(s / 60)
    leftover = s - minutes * 60
    # int() truncates toward zero, matching what "%d" does with a float.
    return f"{minutes}m {int(leftover)}s"


def timeSince(since, percent):
    """Describe elapsed time since `since` and the projected time remaining.

    Args:
        since: Start timestamp as returned by ``time.time()``.
        percent: Fraction of the work completed so far; must be non-zero.

    Returns:
        A string of the form "<elapsed> (remain <remaining>)".
    """
    elapsed = time.time() - since
    # Projected total is elapsed / fraction done; the remainder is what is left.
    remaining = elapsed / percent - elapsed
    return f"{asMinutes(elapsed)} (remain {asMinutes(remaining)})"

In [20]:
def train_fn(train_loader, model, criterion, optimizer, epoch, device):
    """Run one training epoch and return the sample-weighted mean loss.

    Args:
        train_loader: Iterable of ``(input_ids, attention_mask, labels)`` batches.
        model: Module called as ``model(input_ids, attention_mask)``.
        criterion: Loss called as ``criterion(predictions, labels)``.
        optimizer: Optimizer stepped through ``xm.optimizer_step``.
        epoch: Zero-based epoch index, used only in the progress line.
        device: Device the batch tensors are moved to.
    """
    started = time.time()
    meter = AverageMeter()
    n_steps = len(train_loader)
    final_step = n_steps - 1

    # switch to train mode
    model.train()

    for step, batch in enumerate(train_loader):
        optimizer.zero_grad()

        input_ids, attention_mask, labels = (tensor.to(device) for tensor in batch)

        y_preds = model(input_ids, attention_mask)
        loss = criterion(y_preds, labels)

        # record loss (weighted by the batch size), then backpropagate
        meter.update(loss.item(), labels.size(0))
        loss.backward()

        # XLA-aware optimizer step
        xm.optimizer_step(optimizer, barrier=True)

        if step % 100 == 0 or step == final_step:
            print(
                f"Epoch: [{epoch + 1}][{step}/{n_steps}] "
                f"Elapsed {timeSince(started, float(step + 1) / n_steps):s} "
                f"Loss: {meter.avg:.4f} "
            )

    return meter.avg

In [21]:
def valid_fn(valid_loader, model, criterion, device):
    """Evaluate `model` on `valid_loader` without gradient tracking.

    Args:
        valid_loader: Iterable of ``(input_ids, attention_mask, labels)`` batches.
        model: Module called as ``model(input_ids, attention_mask)``.
        criterion: Loss called as ``criterion(predictions, labels)``.
        device: Device the batch tensors are moved to.

    Returns:
        ``(mean_loss, predictions)``: the sample-weighted mean loss and a 1-D
        NumPy array of the model outputs in loader order.
    """
    start = time.time()
    losses = AverageMeter()
    n_steps = len(valid_loader)

    # switch to evaluation mode
    model.eval()
    preds = []

    for step, (input_ids, attention_mask, labels) in enumerate(valid_loader):
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)
        labels = labels.to(device)
        batch_size = labels.size(0)

        # compute predictions and loss
        with torch.no_grad():
            # reshape(-1) guarantees one entry per example: a model that squeezes
            # a single-example batch down to a 0-d tensor would otherwise fail
            # the loss shape check and the np.concatenate below.
            y_preds = model(input_ids, attention_mask).reshape(-1)
            loss = criterion(y_preds, labels)
        losses.update(loss.item(), batch_size)

        # record predictions
        preds.append(y_preds.to("cpu").numpy())

        if step % 100 == 0 or step == (n_steps - 1):
            print(
                f"EVAL: [{step}/{n_steps}] "
                f"Elapsed {timeSince(start, float(step + 1) / n_steps):s} "
                f"Loss: {losses.avg:.4f} "
            )

    predictions = np.concatenate(preds)
    return losses.avg, predictions


In [22]:
def inference(threshold):
    """Predict the test set with every fold's best checkpoint and average the folds.

    Args:
        threshold: Sequence with one decision threshold per fold, indexed by
            fold number.

    Returns:
        ``(predictions, predictions2)``: the per-example mean of the fold
        probabilities, and the per-example mean of the fold 0/1 votes, where a
        fold votes 1 when its probability is not below that fold's threshold.
    """
    fold_probabilities = []
    fold_votes = []

    test_dataset = BaseDataset(test, config.MODEL_NAME, include_labels=False)
    test_loader = DataLoader(
        test_dataset,
        batch_size=config.VALID_BATCH_SIZE,  # was a hard-coded 16, the configured value
        shuffle=False,
        num_workers=4,
        pin_memory=True,
    )

    for fold in range(config.NUM_SPLITS):
        LOGGER.info(f"========== model: {config.FILENAME} fold: {fold} inference ==========")
        model = BaseModel(config.MODEL_NAME)
        model.to(device)
        model.load_state_dict(torch.load(OUTPUT_DIR + f"{config.FILENAME}_fold{fold}_best.pth")["model"])
        model.eval()

        preds = []
        for input_ids, attention_mask in tqdm(test_loader, total=len(test_loader)):
            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)
            with torch.no_grad():
                y_preds = model(input_ids, attention_mask)
            # atleast_1d keeps a single-example final batch concatenable even if
            # the model returned it as a 0-d tensor.
            preds.append(np.atleast_1d(y_preds.to("cpu").numpy()))
        preds = np.concatenate(preds)

        fold_probabilities.append(preds)
        fold_votes.append(np.where(preds < threshold[fold], 0, 1))

    predictions = np.mean(fold_probabilities, axis=0)
    predictions2 = np.mean(fold_votes, axis=0)

    return predictions, predictions2

In [23]:
def train_loop(train, fold):
    """Train one cross-validation fold and return its validation rows with predictions.

    Rows whose "fold" column equals `fold` form the validation set; all other
    rows are used for training. After every epoch the validation F-beta score
    (beta=7, thresholded at ``config.border``) is computed, and whenever it
    improves the model weights and validation predictions are written to
    ``<OUTPUT_DIR><config.FILENAME>_fold<fold>_best.pth``.

    Args:
        train: DataFrame with "fold" and "judgement" columns plus the text
            columns BaseDataset reads.
        fold: Fold number to hold out for validation.

    Returns:
        The validation rows (index reset) with a "preds" column holding the
        predictions stored in the best checkpoint.
    """

    LOGGER.info(f"========== fold: {fold} training ==========")

    # ====================================================
    # Data Loader
    # ====================================================
    trn_idx = train[train["fold"] != fold].index
    val_idx = train[train["fold"] == fold].index

    # reset_index gives each subset its own 0..n-1 index.
    train_folds = train.loc[trn_idx].reset_index(drop=True)
    valid_folds = train.loc[val_idx].reset_index(drop=True)

    train_dataset = BaseDataset(train_folds, config.MODEL_NAME)
    valid_dataset = BaseDataset(valid_folds, config.MODEL_NAME)

    # Training batches are shuffled and an incomplete final batch is dropped.
    train_loader = DataLoader(
        train_dataset,
        batch_size=config.TRAIN_BATCH_SIZE,
        shuffle=True,
        num_workers=4,
        pin_memory=True,
        drop_last=True,
    )
    # Validation keeps the row order and every example, so the predictions line
    # up with valid_folds.
    valid_loader = DataLoader(
        valid_dataset,
        batch_size=config.VALID_BATCH_SIZE,
        shuffle=False,
        num_workers=4,
        pin_memory=True,
        drop_last=False,
    )

    # ====================================================
    # Model
    # ====================================================
    model = BaseModel(config.MODEL_NAME)
    model.to(device)

    optimizer = T.AdamW(model.parameters(), lr=2e-5)

    # The model already applies a sigmoid, hence BCELoss on probabilities.
    criterion = nn.BCELoss()

    # ====================================================
    # Loop
    # ====================================================
    best_score = -1
    # NOTE(review): best_loss is never read in this function.
    best_loss = np.inf

    for epoch in range(config.EPOCH):
        start_time = time.time()
        
        # train
        avg_loss = train_fn(train_loader, model, criterion, optimizer, epoch, device)

        # eval
        avg_val_loss, preds = valid_fn(valid_loader, model, criterion, device)
        valid_labels = valid_folds["judgement"].values

        # scoring: predictions below config.border count as class 0, the rest as class 1
        score = fbeta_score(valid_labels, np.where(preds < config.border, 0, 1), beta=7.0)

        elapsed = time.time() - start_time
        LOGGER.info(
            f"Epoch {epoch+1} - avg_train_loss: {avg_loss:.4f}  avg_val_loss: {avg_val_loss:.4f}  time: {elapsed:.0f}s"
        )
        LOGGER.info(f"Epoch {epoch+1} - Score: {score}")

        # Keep only the best-scoring epoch; its validation predictions are stored
        # next to the weights so they can be read back after the loop.
        if score > best_score:
            best_score = score
            LOGGER.info(f"Epoch {epoch+1} - Save Best Score: {best_score:.4f} Model")
            torch.save(
                {"model": model.state_dict(), "preds": preds}, OUTPUT_DIR + f"{config.FILENAME}_fold{fold}_best.pth"
            )

    # Reload the best checkpoint to recover the predictions of the best epoch.
    check_point = torch.load(OUTPUT_DIR + f"{config.FILENAME}_fold{fold}_best.pth")

    valid_folds["preds"] = check_point["preds"]

    return valid_folds

In [24]:
def get_result(result_df):
    """Log the F-beta score (beta=7) before and after threshold tuning.

    Args:
        result_df: DataFrame with "preds" (predicted scores) and "judgement"
            (true labels) columns.

    Returns:
        The threshold returned by ``optimize_threshold`` for these predictions.
    """
    preds = result_df["preds"].values
    labels = result_df["judgement"].values

    def fbeta_at(cutoff):
        # Scores below the cutoff are class 0, everything else class 1.
        return fbeta_score(labels, np.where(preds < cutoff, 0, 1), beta=7.0)

    LOGGER.info(f"Score: {fbeta_at(config.border):<.5f}")

    best_threshold = optimize_threshold(labels, preds)
    LOGGER.info(f"Best threshold : {best_threshold:<.5f}")
    LOGGER.info(f"After optimizing score: {fbeta_at(best_threshold):<.5f}")

    return best_threshold

In [25]:
def main():
    """Train every fold, report CV scores, and write OOF and submission files.

    Files written under ``OUTPUT_DIR``: the out-of-fold predictions CSV, three
    0/1 submissions (``config.border`` threshold, per-fold majority vote, mean
    of the tuned fold thresholds) and one CSV with the averaged probabilities.
    """

    def write_submission(values, filename):
        # Every submission reuses the module-level `sub` frame: overwrite its
        # "judgement" column, then dump it without index or header.
        sub["judgement"] = values
        sub.to_csv(OUTPUT_DIR + f"./{filename}", index=False, header=False)

    # Training
    fold_frames = []
    threshold = []
    for fold in range(config.NUM_SPLITS):
        _oof_df = train_loop(train, fold)
        fold_frames.append(_oof_df)
        LOGGER.info(f"========== fold: {fold} result ==========")
        threshold.append(get_result(_oof_df))

    # Concatenate once instead of growing a frame inside the loop, which also
    # avoids concatenating with an empty DataFrame.
    oof_df = pd.concat(fold_frames)

    # CV result
    LOGGER.info("========== CV ==========")
    get_result(oof_df)

    # Save OOF result
    oof_df.to_csv(OUTPUT_DIR + f"{config.FILENAME}_oof_df.csv", index=False)

    # Inference
    proba_predictions, majo_pred = inference(threshold)

    # submission: averaged probabilities thresholded at the positive-class share
    write_submission(
        np.where(proba_predictions < config.border, 0, 1),
        f"sub_{config.FILENAME}.csv",
    )

    # submission: majority vote over the per-fold thresholded predictions
    write_submission(
        np.where(majo_pred < 0.5, 0, 1),
        f"sub_{config.FILENAME}_fold_majority.csv",
    )

    # submission: averaged probabilities thresholded at the mean tuned threshold
    write_submission(
        np.where(proba_predictions < np.mean(threshold), 0, 1),
        f"sub_mean_thr_{config.FILENAME}.csv",
    )

    # raw averaged probabilities
    write_submission(proba_predictions, f"stack_{config.FILENAME}.csv")

In [26]:
# Run the whole pipeline (training, CV report, inference, submissions) when
# this code executes as the main module.
if __name__ == "__main__":
  main()



Downloading:   0%|          | 0.00/225k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/665 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/438M [00:00<?, ?B/s]

Some weights of the model checkpoint at kamalkraj/bioelectra-base-discriminator-pubmed-pmc-lt were not used when initializing ElectraForSequenceClassification: ['discriminator_predictions.dense.bias', 'discriminator_predictions.dense.weight', 'discriminator_predictions.dense_prediction.bias', 'discriminator_predictions.dense_prediction.weight']
- This IS expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at kamalkraj/bioelectra-base-discriminator-pubmed-pmc-lt a

Epoch: [1][0/1454] Elapsed 0m 32s (remain 786m 56s) Loss: 0.6736 
Epoch: [1][100/1454] Elapsed 1m 48s (remain 24m 11s) Loss: 0.2262 
Epoch: [1][200/1454] Elapsed 2m 31s (remain 15m 42s) Loss: 0.1665 
Epoch: [1][300/1454] Elapsed 3m 14s (remain 12m 23s) Loss: 0.1506 
Epoch: [1][400/1454] Elapsed 3m 56s (remain 10m 21s) Loss: 0.1354 
Epoch: [1][500/1454] Elapsed 4m 39s (remain 8m 51s) Loss: 0.1220 
Epoch: [1][600/1454] Elapsed 5m 22s (remain 7m 37s) Loss: 0.1114 
Epoch: [1][700/1454] Elapsed 6m 5s (remain 6m 32s) Loss: 0.1056 
Epoch: [1][800/1454] Elapsed 6m 48s (remain 5m 32s) Loss: 0.1014 
Epoch: [1][900/1454] Elapsed 7m 31s (remain 4m 36s) Loss: 0.0977 
Epoch: [1][1000/1454] Elapsed 8m 13s (remain 3m 43s) Loss: 0.0944 
Epoch: [1][1100/1454] Elapsed 8m 56s (remain 2m 52s) Loss: 0.0916 
Epoch: [1][1200/1454] Elapsed 9m 39s (remain 2m 2s) Loss: 0.0876 
Epoch: [1][1300/1454] Elapsed 10m 22s (remain 1m 13s) Loss: 0.0857 
Epoch: [1][1400/1454] Elapsed 11m 5s (remain 0m 25s) Loss: 0.0828 
Ep

Epoch 1 - avg_train_loss: 0.0824  avg_val_loss: 0.0507  time: 741s
Epoch 1 - Score: 0.8331529971867561
Epoch 1 - Save Best Score: 0.8332 Model


EVAL: [242/243] Elapsed 0m 53s (remain 0m 0s) Loss: 0.0507 
Epoch: [2][0/1454] Elapsed 0m 0s (remain 11m 31s) Loss: 0.0189 
Epoch: [2][100/1454] Elapsed 0m 43s (remain 9m 41s) Loss: 0.0416 
Epoch: [2][200/1454] Elapsed 1m 25s (remain 8m 56s) Loss: 0.0413 
Epoch: [2][300/1454] Elapsed 2m 8s (remain 8m 12s) Loss: 0.0402 
Epoch: [2][400/1454] Elapsed 2m 51s (remain 7m 29s) Loss: 0.0426 
Epoch: [2][500/1454] Elapsed 3m 34s (remain 6m 47s) Loss: 0.0407 
Epoch: [2][600/1454] Elapsed 4m 16s (remain 6m 4s) Loss: 0.0400 
Epoch: [2][700/1454] Elapsed 4m 59s (remain 5m 21s) Loss: 0.0396 
Epoch: [2][800/1454] Elapsed 5m 42s (remain 4m 39s) Loss: 0.0398 
Epoch: [2][900/1454] Elapsed 6m 24s (remain 3m 56s) Loss: 0.0405 
Epoch: [2][1000/1454] Elapsed 7m 7s (remain 3m 13s) Loss: 0.0404 
Epoch: [2][1100/1454] Elapsed 7m 50s (remain 2m 30s) Loss: 0.0397 
Epoch: [2][1200/1454] Elapsed 8m 33s (remain 1m 48s) Loss: 0.0403 
Epoch: [2][1300/1454] Elapsed 9m 15s (remain 1m 5s) Loss: 0.0402 
Epoch: [2][1400/14

Epoch 2 - avg_train_loss: 0.0405  avg_val_loss: 0.0446  time: 656s
Epoch 2 - Score: 0.8777633289986998
Epoch 2 - Save Best Score: 0.8778 Model


EVAL: [242/243] Elapsed 0m 34s (remain 0m 0s) Loss: 0.0446 
Epoch: [3][0/1454] Elapsed 0m 0s (remain 11m 41s) Loss: 0.0017 
Epoch: [3][100/1454] Elapsed 0m 43s (remain 9m 40s) Loss: 0.0346 
Epoch: [3][200/1454] Elapsed 1m 26s (remain 8m 56s) Loss: 0.0303 
Epoch: [3][300/1454] Elapsed 2m 8s (remain 8m 13s) Loss: 0.0287 
Epoch: [3][400/1454] Elapsed 2m 51s (remain 7m 30s) Loss: 0.0290 
Epoch: [3][500/1454] Elapsed 3m 34s (remain 6m 47s) Loss: 0.0287 
Epoch: [3][600/1454] Elapsed 4m 17s (remain 6m 4s) Loss: 0.0302 
Epoch: [3][700/1454] Elapsed 5m 0s (remain 5m 22s) Loss: 0.0298 
Epoch: [3][800/1454] Elapsed 5m 42s (remain 4m 39s) Loss: 0.0293 
Epoch: [3][900/1454] Elapsed 6m 25s (remain 3m 56s) Loss: 0.0289 
Epoch: [3][1000/1454] Elapsed 7m 8s (remain 3m 14s) Loss: 0.0283 
Epoch: [3][1100/1454] Elapsed 7m 52s (remain 2m 31s) Loss: 0.0280 
Epoch: [3][1200/1454] Elapsed 8m 35s (remain 1m 48s) Loss: 0.0280 
Epoch: [3][1300/1454] Elapsed 9m 18s (remain 1m 5s) Loss: 0.0287 
Epoch: [3][1400/145

Epoch 3 - avg_train_loss: 0.0283  avg_val_loss: 0.0610  time: 660s
Epoch 3 - Score: 0.7646276595744682


EVAL: [242/243] Elapsed 0m 35s (remain 0m 0s) Loss: 0.0610 


Score: 0.87776
Best threshold : 0.01986
After optimizing score: 0.87700
Some weights of the model checkpoint at kamalkraj/bioelectra-base-discriminator-pubmed-pmc-lt were not used when initializing ElectraForSequenceClassification: ['discriminator_predictions.dense.bias', 'discriminator_predictions.dense.weight', 'discriminator_predictions.dense_prediction.bias', 'discriminator_predictions.dense_prediction.weight']
- This IS expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForSequenceClassification were not initialized from the mod

Epoch: [1][0/1454] Elapsed 0m 24s (remain 586m 18s) Loss: 0.6932 
Epoch: [1][100/1454] Elapsed 1m 8s (remain 15m 13s) Loss: 0.2477 
Epoch: [1][200/1454] Elapsed 1m 52s (remain 11m 38s) Loss: 0.1766 
Epoch: [1][300/1454] Elapsed 2m 36s (remain 9m 57s) Loss: 0.1491 
Epoch: [1][400/1454] Elapsed 3m 19s (remain 8m 44s) Loss: 0.1318 
Epoch: [1][500/1454] Elapsed 4m 3s (remain 7m 43s) Loss: 0.1157 
Epoch: [1][600/1454] Elapsed 4m 47s (remain 6m 48s) Loss: 0.1098 
Epoch: [1][700/1454] Elapsed 5m 31s (remain 5m 56s) Loss: 0.1044 
Epoch: [1][800/1454] Elapsed 6m 15s (remain 5m 6s) Loss: 0.0998 
Epoch: [1][900/1454] Elapsed 6m 59s (remain 4m 17s) Loss: 0.0954 
Epoch: [1][1000/1454] Elapsed 7m 43s (remain 3m 29s) Loss: 0.0921 
Epoch: [1][1100/1454] Elapsed 8m 27s (remain 2m 42s) Loss: 0.0886 
Epoch: [1][1200/1454] Elapsed 9m 11s (remain 1m 56s) Loss: 0.0837 
Epoch: [1][1300/1454] Elapsed 9m 55s (remain 1m 9s) Loss: 0.0820 
Epoch: [1][1400/1454] Elapsed 10m 39s (remain 0m 24s) Loss: 0.0787 
Epoch:

Epoch 1 - avg_train_loss: 0.0781  avg_val_loss: 0.0464  time: 699s
Epoch 1 - Score: 0.8193186718413109
Epoch 1 - Save Best Score: 0.8193 Model


EVAL: [242/243] Elapsed 0m 35s (remain 0m 0s) Loss: 0.0464 
Epoch: [2][0/1454] Elapsed 0m 0s (remain 12m 57s) Loss: 0.3571 
Epoch: [2][100/1454] Elapsed 0m 44s (remain 10m 2s) Loss: 0.0543 
Epoch: [2][200/1454] Elapsed 1m 29s (remain 9m 15s) Loss: 0.0426 
Epoch: [2][300/1454] Elapsed 2m 13s (remain 8m 31s) Loss: 0.0418 
Epoch: [2][400/1454] Elapsed 2m 57s (remain 7m 46s) Loss: 0.0407 
Epoch: [2][500/1454] Elapsed 3m 41s (remain 7m 2s) Loss: 0.0375 
Epoch: [2][600/1454] Elapsed 4m 26s (remain 6m 17s) Loss: 0.0399 
Epoch: [2][700/1454] Elapsed 5m 10s (remain 5m 33s) Loss: 0.0385 
Epoch: [2][800/1454] Elapsed 5m 54s (remain 4m 49s) Loss: 0.0392 
Epoch: [2][900/1454] Elapsed 6m 38s (remain 4m 4s) Loss: 0.0376 
Epoch: [2][1000/1454] Elapsed 7m 23s (remain 3m 20s) Loss: 0.0369 
Epoch: [2][1100/1454] Elapsed 8m 7s (remain 2m 36s) Loss: 0.0365 
Epoch: [2][1200/1454] Elapsed 8m 51s (remain 1m 51s) Loss: 0.0364 
Epoch: [2][1300/1454] Elapsed 9m 35s (remain 1m 7s) Loss: 0.0374 
Epoch: [2][1400/14

Epoch 2 - avg_train_loss: 0.0367  avg_val_loss: 0.0418  time: 679s
Epoch 2 - Score: 0.9134406263592867
Epoch 2 - Save Best Score: 0.9134 Model


EVAL: [242/243] Elapsed 0m 35s (remain 0m 0s) Loss: 0.0418 
Epoch: [3][0/1454] Elapsed 0m 1s (remain 29m 22s) Loss: 0.0450 
Epoch: [3][100/1454] Elapsed 0m 46s (remain 10m 25s) Loss: 0.0258 
Epoch: [3][200/1454] Elapsed 1m 30s (remain 9m 26s) Loss: 0.0276 
Epoch: [3][300/1454] Elapsed 2m 15s (remain 8m 37s) Loss: 0.0276 
Epoch: [3][400/1454] Elapsed 2m 59s (remain 7m 51s) Loss: 0.0252 
Epoch: [3][500/1454] Elapsed 3m 44s (remain 7m 6s) Loss: 0.0231 
Epoch: [3][600/1454] Elapsed 4m 29s (remain 6m 22s) Loss: 0.0223 
Epoch: [3][700/1454] Elapsed 5m 13s (remain 5m 37s) Loss: 0.0225 
Epoch: [3][800/1454] Elapsed 5m 58s (remain 4m 52s) Loss: 0.0230 
Epoch: [3][900/1454] Elapsed 6m 43s (remain 4m 7s) Loss: 0.0231 
Epoch: [3][1000/1454] Elapsed 7m 27s (remain 3m 22s) Loss: 0.0242 
Epoch: [3][1100/1454] Elapsed 8m 12s (remain 2m 37s) Loss: 0.0238 
Epoch: [3][1200/1454] Elapsed 8m 57s (remain 1m 53s) Loss: 0.0234 
Epoch: [3][1300/1454] Elapsed 9m 41s (remain 1m 8s) Loss: 0.0240 
Epoch: [3][1400/

Epoch 3 - avg_train_loss: 0.0246  avg_val_loss: 0.0362  time: 688s
Epoch 3 - Score: 0.9217089568423336
Epoch 3 - Save Best Score: 0.9217 Model


EVAL: [242/243] Elapsed 0m 36s (remain 0m 0s) Loss: 0.0362 


Score: 0.92171
Best threshold : 0.02787
After optimizing score: 0.92311
Some weights of the model checkpoint at kamalkraj/bioelectra-base-discriminator-pubmed-pmc-lt were not used when initializing ElectraForSequenceClassification: ['discriminator_predictions.dense.bias', 'discriminator_predictions.dense.weight', 'discriminator_predictions.dense_prediction.bias', 'discriminator_predictions.dense_prediction.weight']
- This IS expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForSequenceClassification were not initialized from the mod

Epoch: [1][0/1454] Elapsed 0m 0s (remain 12m 32s) Loss: 0.7375 
Epoch: [1][100/1454] Elapsed 0m 45s (remain 10m 5s) Loss: 0.2437 
Epoch: [1][200/1454] Elapsed 1m 29s (remain 9m 20s) Loss: 0.1824 
Epoch: [1][300/1454] Elapsed 2m 14s (remain 8m 37s) Loss: 0.1554 
Epoch: [1][400/1454] Elapsed 2m 59s (remain 7m 52s) Loss: 0.1456 
Epoch: [1][500/1454] Elapsed 3m 44s (remain 7m 7s) Loss: 0.1315 
Epoch: [1][600/1454] Elapsed 4m 29s (remain 6m 22s) Loss: 0.1182 
Epoch: [1][700/1454] Elapsed 5m 14s (remain 5m 37s) Loss: 0.1134 
Epoch: [1][800/1454] Elapsed 5m 59s (remain 4m 53s) Loss: 0.1068 
Epoch: [1][900/1454] Elapsed 6m 46s (remain 4m 9s) Loss: 0.1018 
Epoch: [1][1000/1454] Elapsed 7m 30s (remain 3m 24s) Loss: 0.0969 
Epoch: [1][1100/1454] Elapsed 8m 15s (remain 2m 38s) Loss: 0.0931 
Epoch: [1][1200/1454] Elapsed 9m 0s (remain 1m 53s) Loss: 0.0892 
Epoch: [1][1300/1454] Elapsed 9m 45s (remain 1m 8s) Loss: 0.0872 
Epoch: [1][1400/1454] Elapsed 10m 30s (remain 0m 23s) Loss: 0.0841 
Epoch: [1]

Epoch 1 - avg_train_loss: 0.0827  avg_val_loss: 0.0487  time: 691s
Epoch 1 - Score: 0.8637443316778233
Epoch 1 - Save Best Score: 0.8637 Model


EVAL: [242/243] Elapsed 0m 36s (remain 0m 0s) Loss: 0.0487 
Epoch: [2][0/1454] Elapsed 0m 0s (remain 13m 12s) Loss: 0.0734 
Epoch: [2][100/1454] Elapsed 0m 45s (remain 10m 4s) Loss: 0.0322 
Epoch: [2][200/1454] Elapsed 1m 29s (remain 9m 18s) Loss: 0.0373 
Epoch: [2][300/1454] Elapsed 2m 14s (remain 8m 33s) Loss: 0.0354 
Epoch: [2][400/1454] Elapsed 2m 58s (remain 7m 48s) Loss: 0.0353 
Epoch: [2][500/1454] Elapsed 3m 42s (remain 7m 3s) Loss: 0.0351 
Epoch: [2][600/1454] Elapsed 4m 27s (remain 6m 19s) Loss: 0.0362 
Epoch: [2][700/1454] Elapsed 5m 11s (remain 5m 34s) Loss: 0.0361 
Epoch: [2][800/1454] Elapsed 5m 56s (remain 4m 50s) Loss: 0.0374 
Epoch: [2][900/1454] Elapsed 6m 40s (remain 4m 5s) Loss: 0.0366 
Epoch: [2][1000/1454] Elapsed 7m 24s (remain 3m 21s) Loss: 0.0355 
Epoch: [2][1100/1454] Elapsed 8m 9s (remain 2m 36s) Loss: 0.0354 
Epoch: [2][1200/1454] Elapsed 8m 53s (remain 1m 52s) Loss: 0.0352 
Epoch: [2][1300/1454] Elapsed 9m 38s (remain 1m 8s) Loss: 0.0346 
Epoch: [2][1400/14

Epoch 2 - avg_train_loss: 0.0351  avg_val_loss: 0.0348  time: 683s
Epoch 2 - Score: 0.9110396570203645
Epoch 2 - Save Best Score: 0.9110 Model


EVAL: [242/243] Elapsed 0m 36s (remain 0m 0s) Loss: 0.0348 
Epoch: [3][0/1454] Elapsed 0m 0s (remain 17m 37s) Loss: 0.0462 
Epoch: [3][100/1454] Elapsed 0m 45s (remain 10m 6s) Loss: 0.0159 
Epoch: [3][200/1454] Elapsed 1m 29s (remain 9m 20s) Loss: 0.0168 
Epoch: [3][300/1454] Elapsed 2m 13s (remain 8m 33s) Loss: 0.0223 
Epoch: [3][400/1454] Elapsed 2m 58s (remain 7m 47s) Loss: 0.0212 
Epoch: [3][500/1454] Elapsed 3m 42s (remain 7m 2s) Loss: 0.0213 
Epoch: [3][600/1454] Elapsed 4m 26s (remain 6m 18s) Loss: 0.0220 
Epoch: [3][700/1454] Elapsed 5m 10s (remain 5m 33s) Loss: 0.0222 
Epoch: [3][800/1454] Elapsed 5m 55s (remain 4m 49s) Loss: 0.0229 
Epoch: [3][900/1454] Elapsed 6m 39s (remain 4m 5s) Loss: 0.0225 
Epoch: [3][1000/1454] Elapsed 7m 24s (remain 3m 21s) Loss: 0.0219 
Epoch: [3][1100/1454] Elapsed 8m 8s (remain 2m 36s) Loss: 0.0214 
Epoch: [3][1200/1454] Elapsed 8m 52s (remain 1m 52s) Loss: 0.0211 
Epoch: [3][1300/1454] Elapsed 9m 37s (remain 1m 7s) Loss: 0.0219 
Epoch: [3][1400/14

Epoch 3 - avg_train_loss: 0.0216  avg_val_loss: 0.0523  time: 682s
Epoch 3 - Score: 0.8487486398258978


EVAL: [242/243] Elapsed 0m 36s (remain 0m 0s) Loss: 0.0523 


Score: 0.91104
Best threshold : 0.10231
After optimizing score: 0.90830
Some weights of the model checkpoint at kamalkraj/bioelectra-base-discriminator-pubmed-pmc-lt were not used when initializing ElectraForSequenceClassification: ['discriminator_predictions.dense.bias', 'discriminator_predictions.dense.weight', 'discriminator_predictions.dense_prediction.bias', 'discriminator_predictions.dense_prediction.weight']
- This IS expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForSequenceClassification were not initialized from the mod

Epoch: [1][0/1454] Elapsed 0m 0s (remain 13m 33s) Loss: 0.7205 
Epoch: [1][100/1454] Elapsed 0m 45s (remain 10m 8s) Loss: 0.2449 
Epoch: [1][200/1454] Elapsed 1m 30s (remain 9m 22s) Loss: 0.1824 
Epoch: [1][300/1454] Elapsed 2m 14s (remain 8m 37s) Loss: 0.1605 
Epoch: [1][400/1454] Elapsed 2m 59s (remain 7m 52s) Loss: 0.1460 
Epoch: [1][500/1454] Elapsed 3m 44s (remain 7m 7s) Loss: 0.1357 
Epoch: [1][600/1454] Elapsed 4m 29s (remain 6m 22s) Loss: 0.1263 
Epoch: [1][700/1454] Elapsed 5m 14s (remain 5m 37s) Loss: 0.1194 
Epoch: [1][800/1454] Elapsed 5m 59s (remain 4m 52s) Loss: 0.1139 
Epoch: [1][900/1454] Elapsed 6m 43s (remain 4m 7s) Loss: 0.1086 
Epoch: [1][1000/1454] Elapsed 7m 28s (remain 3m 22s) Loss: 0.1032 
Epoch: [1][1100/1454] Elapsed 8m 13s (remain 2m 38s) Loss: 0.0992 
Epoch: [1][1200/1454] Elapsed 8m 58s (remain 1m 53s) Loss: 0.0956 
Epoch: [1][1300/1454] Elapsed 9m 43s (remain 1m 8s) Loss: 0.0942 
Epoch: [1][1400/1454] Elapsed 10m 28s (remain 0m 23s) Loss: 0.0908 
Epoch: [1

Epoch 1 - avg_train_loss: 0.0895  avg_val_loss: 0.0644  time: 689s
Epoch 1 - Score: 0.6179651291105716
Epoch 1 - Save Best Score: 0.6180 Model


EVAL: [242/243] Elapsed 0m 37s (remain 0m 0s) Loss: 0.0644 
Epoch: [2][0/1454] Elapsed 0m 0s (remain 18m 36s) Loss: 0.0470 
Epoch: [2][100/1454] Elapsed 0m 45s (remain 10m 11s) Loss: 0.0590 
Epoch: [2][200/1454] Elapsed 1m 30s (remain 9m 23s) Loss: 0.0502 
Epoch: [2][300/1454] Elapsed 2m 15s (remain 8m 39s) Loss: 0.0509 
Epoch: [2][400/1454] Elapsed 3m 1s (remain 7m 55s) Loss: 0.0505 
Epoch: [2][500/1454] Elapsed 3m 45s (remain 7m 9s) Loss: 0.0496 
Epoch: [2][600/1454] Elapsed 4m 30s (remain 6m 24s) Loss: 0.0495 
Epoch: [2][700/1454] Elapsed 5m 15s (remain 5m 38s) Loss: 0.0498 
Epoch: [2][800/1454] Elapsed 6m 0s (remain 4m 53s) Loss: 0.0500 
Epoch: [2][900/1454] Elapsed 6m 45s (remain 4m 8s) Loss: 0.0500 
Epoch: [2][1000/1454] Elapsed 7m 29s (remain 3m 23s) Loss: 0.0494 
Epoch: [2][1100/1454] Elapsed 8m 14s (remain 2m 38s) Loss: 0.0497 
Epoch: [2][1200/1454] Elapsed 8m 59s (remain 1m 53s) Loss: 0.0500 
Epoch: [2][1300/1454] Elapsed 9m 44s (remain 1m 8s) Loss: 0.0501 
Epoch: [2][1400/14

Epoch 2 - avg_train_loss: 0.0491  avg_val_loss: 0.0562  time: 690s
Epoch 2 - Score: 0.7526881720430108
Epoch 2 - Save Best Score: 0.7527 Model


EVAL: [242/243] Elapsed 0m 36s (remain 0m 0s) Loss: 0.0562 
Epoch: [3][0/1454] Elapsed 0m 1s (remain 27m 16s) Loss: 0.0027 
Epoch: [3][100/1454] Elapsed 0m 47s (remain 10m 32s) Loss: 0.0410 
Epoch: [3][200/1454] Elapsed 1m 31s (remain 9m 31s) Loss: 0.0383 
Epoch: [3][300/1454] Elapsed 2m 16s (remain 8m 42s) Loss: 0.0349 
Epoch: [3][400/1454] Elapsed 3m 0s (remain 7m 54s) Loss: 0.0370 
Epoch: [3][500/1454] Elapsed 3m 45s (remain 7m 8s) Loss: 0.0357 
Epoch: [3][600/1454] Elapsed 4m 30s (remain 6m 23s) Loss: 0.0369 
Epoch: [3][700/1454] Elapsed 5m 14s (remain 5m 38s) Loss: 0.0379 
Epoch: [3][800/1454] Elapsed 5m 59s (remain 4m 52s) Loss: 0.0357 
Epoch: [3][900/1454] Elapsed 6m 43s (remain 4m 7s) Loss: 0.0371 
Epoch: [3][1000/1454] Elapsed 7m 28s (remain 3m 22s) Loss: 0.0374 
Epoch: [3][1100/1454] Elapsed 8m 12s (remain 2m 38s) Loss: 0.0367 
Epoch: [3][1200/1454] Elapsed 8m 57s (remain 1m 53s) Loss: 0.0362 
Epoch: [3][1300/1454] Elapsed 9m 41s (remain 1m 8s) Loss: 0.0359 
Epoch: [3][1400/1

Epoch 3 - avg_train_loss: 0.0365  avg_val_loss: 0.0508  time: 688s
Epoch 3 - Score: 0.7596832869676866
Epoch 3 - Save Best Score: 0.7597 Model


EVAL: [242/243] Elapsed 0m 36s (remain 0m 0s) Loss: 0.0508 


Score: 0.75968
Best threshold : 0.06267
After optimizing score: 0.76990
Some weights of the model checkpoint at kamalkraj/bioelectra-base-discriminator-pubmed-pmc-lt were not used when initializing ElectraForSequenceClassification: ['discriminator_predictions.dense.bias', 'discriminator_predictions.dense.weight', 'discriminator_predictions.dense_prediction.bias', 'discriminator_predictions.dense_prediction.weight']
- This IS expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForSequenceClassification were not initialized from the mod

Epoch: [1][0/1454] Elapsed 0m 0s (remain 12m 52s) Loss: 0.7177 
Epoch: [1][100/1454] Elapsed 0m 46s (remain 10m 23s) Loss: 0.2546 
Epoch: [1][200/1454] Elapsed 1m 31s (remain 9m 28s) Loss: 0.1882 
Epoch: [1][300/1454] Elapsed 2m 16s (remain 8m 41s) Loss: 0.1606 
Epoch: [1][400/1454] Elapsed 3m 0s (remain 7m 55s) Loss: 0.1366 
Epoch: [1][500/1454] Elapsed 3m 45s (remain 7m 9s) Loss: 0.1264 
Epoch: [1][600/1454] Elapsed 4m 30s (remain 6m 24s) Loss: 0.1142 
Epoch: [1][700/1454] Elapsed 5m 16s (remain 5m 40s) Loss: 0.1078 
Epoch: [1][800/1454] Elapsed 6m 1s (remain 4m 54s) Loss: 0.1011 
Epoch: [1][900/1454] Elapsed 6m 46s (remain 4m 9s) Loss: 0.0986 
Epoch: [1][1000/1454] Elapsed 7m 31s (remain 3m 24s) Loss: 0.0962 
Epoch: [1][1100/1454] Elapsed 8m 16s (remain 2m 39s) Loss: 0.0926 
Epoch: [1][1200/1454] Elapsed 9m 0s (remain 1m 53s) Loss: 0.0890 
Epoch: [1][1300/1454] Elapsed 9m 45s (remain 1m 8s) Loss: 0.0856 
Epoch: [1][1400/1454] Elapsed 10m 31s (remain 0m 23s) Loss: 0.0835 
Epoch: [1][

Epoch 1 - avg_train_loss: 0.0820  avg_val_loss: 0.0461  time: 692s
Epoch 1 - Score: 0.9102065862139497
Epoch 1 - Save Best Score: 0.9102 Model


EVAL: [242/243] Elapsed 0m 37s (remain 0m 0s) Loss: 0.0461 
Epoch: [2][0/1454] Elapsed 0m 0s (remain 14m 56s) Loss: 0.0259 
Epoch: [2][100/1454] Elapsed 0m 46s (remain 10m 18s) Loss: 0.0453 
Epoch: [2][200/1454] Elapsed 1m 31s (remain 9m 29s) Loss: 0.0428 
Epoch: [2][300/1454] Elapsed 2m 16s (remain 8m 42s) Loss: 0.0415 
Epoch: [2][400/1454] Elapsed 3m 1s (remain 7m 56s) Loss: 0.0379 
Epoch: [2][500/1454] Elapsed 3m 46s (remain 7m 10s) Loss: 0.0381 
Epoch: [2][600/1454] Elapsed 4m 31s (remain 6m 25s) Loss: 0.0376 
Epoch: [2][700/1454] Elapsed 5m 16s (remain 5m 39s) Loss: 0.0372 
Epoch: [2][800/1454] Elapsed 6m 1s (remain 4m 54s) Loss: 0.0358 
Epoch: [2][900/1454] Elapsed 6m 46s (remain 4m 9s) Loss: 0.0375 
Epoch: [2][1000/1454] Elapsed 7m 31s (remain 3m 24s) Loss: 0.0379 
Epoch: [2][1100/1454] Elapsed 8m 16s (remain 2m 39s) Loss: 0.0372 
Epoch: [2][1200/1454] Elapsed 9m 1s (remain 1m 54s) Loss: 0.0370 
Epoch: [2][1300/1454] Elapsed 9m 46s (remain 1m 9s) Loss: 0.0368 
Epoch: [2][1400/14

Epoch 2 - avg_train_loss: 0.0371  avg_val_loss: 0.0365  time: 693s
Epoch 2 - Score: 0.9045875511522722


EVAL: [242/243] Elapsed 0m 37s (remain 0m 0s) Loss: 0.0365 
Epoch: [3][0/1454] Elapsed 0m 0s (remain 12m 50s) Loss: 0.0008 
Epoch: [3][100/1454] Elapsed 0m 45s (remain 10m 11s) Loss: 0.0213 
Epoch: [3][200/1454] Elapsed 1m 30s (remain 9m 24s) Loss: 0.0196 
Epoch: [3][300/1454] Elapsed 2m 15s (remain 8m 39s) Loss: 0.0202 
Epoch: [3][400/1454] Elapsed 3m 0s (remain 7m 55s) Loss: 0.0217 
Epoch: [3][500/1454] Elapsed 3m 45s (remain 7m 9s) Loss: 0.0210 
Epoch: [3][600/1454] Elapsed 4m 31s (remain 6m 24s) Loss: 0.0213 
Epoch: [3][700/1454] Elapsed 5m 16s (remain 5m 39s) Loss: 0.0217 
Epoch: [3][800/1454] Elapsed 6m 1s (remain 4m 54s) Loss: 0.0219 
Epoch: [3][900/1454] Elapsed 6m 46s (remain 4m 9s) Loss: 0.0217 
Epoch: [3][1000/1454] Elapsed 7m 32s (remain 3m 24s) Loss: 0.0215 
Epoch: [3][1100/1454] Elapsed 8m 17s (remain 2m 39s) Loss: 0.0221 
Epoch: [3][1200/1454] Elapsed 9m 2s (remain 1m 54s) Loss: 0.0220 
Epoch: [3][1300/1454] Elapsed 9m 47s (remain 1m 9s) Loss: 0.0219 
Epoch: [3][1400/145

Epoch 3 - avg_train_loss: 0.0223  avg_val_loss: 0.0427  time: 694s
Epoch 3 - Score: 0.8703220191470844


EVAL: [242/243] Elapsed 0m 36s (remain 0m 0s) Loss: 0.0427 


Score: 0.91021
Best threshold : 0.04509
After optimizing score: 0.91579
Some weights of the model checkpoint at kamalkraj/bioelectra-base-discriminator-pubmed-pmc-lt were not used when initializing ElectraForSequenceClassification: ['discriminator_predictions.dense.bias', 'discriminator_predictions.dense.weight', 'discriminator_predictions.dense_prediction.bias', 'discriminator_predictions.dense_prediction.weight']
- This IS expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForSequenceClassification were not initialized from the mod

Epoch: [1][0/1454] Elapsed 0m 0s (remain 14m 3s) Loss: 0.7190 
Epoch: [1][100/1454] Elapsed 0m 46s (remain 10m 16s) Loss: 0.2314 
Epoch: [1][200/1454] Elapsed 1m 31s (remain 9m 28s) Loss: 0.1683 
Epoch: [1][300/1454] Elapsed 2m 16s (remain 8m 44s) Loss: 0.1457 
Epoch: [1][400/1454] Elapsed 3m 2s (remain 7m 58s) Loss: 0.1305 
Epoch: [1][500/1454] Elapsed 3m 47s (remain 7m 13s) Loss: 0.1174 
Epoch: [1][600/1454] Elapsed 4m 33s (remain 6m 27s) Loss: 0.1072 
Epoch: [1][700/1454] Elapsed 5m 18s (remain 5m 42s) Loss: 0.0997 
Epoch: [1][800/1454] Elapsed 6m 4s (remain 4m 57s) Loss: 0.0952 
Epoch: [1][900/1454] Elapsed 6m 50s (remain 4m 11s) Loss: 0.0935 
Epoch: [1][1000/1454] Elapsed 7m 35s (remain 3m 26s) Loss: 0.0883 
Epoch: [1][1100/1454] Elapsed 8m 20s (remain 2m 40s) Loss: 0.0851 
Epoch: [1][1200/1454] Elapsed 9m 6s (remain 1m 55s) Loss: 0.0822 
Epoch: [1][1300/1454] Elapsed 9m 51s (remain 1m 9s) Loss: 0.0785 
Epoch: [1][1400/1454] Elapsed 10m 36s (remain 0m 24s) Loss: 0.0774 
Epoch: [1]

Epoch 1 - avg_train_loss: 0.0767  avg_val_loss: 0.0407  time: 698s
Epoch 1 - Score: 0.9245942058763099
Epoch 1 - Save Best Score: 0.9246 Model


EVAL: [242/243] Elapsed 0m 37s (remain 0m 0s) Loss: 0.0407 
Epoch: [2][0/1454] Elapsed 0m 0s (remain 15m 4s) Loss: 0.0102 
Epoch: [2][100/1454] Elapsed 0m 46s (remain 10m 20s) Loss: 0.0306 
Epoch: [2][200/1454] Elapsed 1m 32s (remain 9m 36s) Loss: 0.0311 
Epoch: [2][300/1454] Elapsed 2m 18s (remain 8m 49s) Loss: 0.0357 
Epoch: [2][400/1454] Elapsed 3m 3s (remain 8m 3s) Loss: 0.0372 
Epoch: [2][500/1454] Elapsed 3m 49s (remain 7m 16s) Loss: 0.0354 
Epoch: [2][600/1454] Elapsed 4m 35s (remain 6m 30s) Loss: 0.0338 
Epoch: [2][700/1454] Elapsed 5m 20s (remain 5m 44s) Loss: 0.0336 
Epoch: [2][800/1454] Elapsed 6m 7s (remain 4m 59s) Loss: 0.0340 
Epoch: [2][900/1454] Elapsed 6m 53s (remain 4m 13s) Loss: 0.0335 
Epoch: [2][1000/1454] Elapsed 7m 39s (remain 3m 28s) Loss: 0.0338 
Epoch: [2][1100/1454] Elapsed 8m 25s (remain 2m 42s) Loss: 0.0351 
Epoch: [2][1200/1454] Elapsed 9m 11s (remain 1m 56s) Loss: 0.0352 
Epoch: [2][1300/1454] Elapsed 9m 57s (remain 1m 10s) Loss: 0.0348 
Epoch: [2][1400/1

Epoch 2 - avg_train_loss: 0.0347  avg_val_loss: 0.0519  time: 706s
Epoch 2 - Score: 0.8155168613621334


EVAL: [242/243] Elapsed 0m 37s (remain 0m 0s) Loss: 0.0519 
Epoch: [3][0/1454] Elapsed 0m 0s (remain 14m 1s) Loss: 0.0005 
Epoch: [3][100/1454] Elapsed 0m 46s (remain 10m 28s) Loss: 0.0256 
Epoch: [3][200/1454] Elapsed 1m 32s (remain 9m 39s) Loss: 0.0233 
Epoch: [3][300/1454] Elapsed 2m 18s (remain 8m 51s) Loss: 0.0229 
Epoch: [3][400/1454] Elapsed 3m 4s (remain 8m 5s) Loss: 0.0229 
Epoch: [3][500/1454] Elapsed 3m 50s (remain 7m 19s) Loss: 0.0237 
Epoch: [3][600/1454] Elapsed 4m 37s (remain 6m 34s) Loss: 0.0244 
Epoch: [3][700/1454] Elapsed 5m 24s (remain 5m 48s) Loss: 0.0244 
Epoch: [3][800/1454] Elapsed 6m 10s (remain 5m 2s) Loss: 0.0250 
Epoch: [3][900/1454] Elapsed 6m 57s (remain 4m 16s) Loss: 0.0240 
Epoch: [3][1000/1454] Elapsed 7m 44s (remain 3m 30s) Loss: 0.0236 
Epoch: [3][1100/1454] Elapsed 8m 31s (remain 2m 43s) Loss: 0.0243 
Epoch: [3][1200/1454] Elapsed 9m 18s (remain 1m 57s) Loss: 0.0237 
Epoch: [3][1300/1454] Elapsed 10m 5s (remain 1m 11s) Loss: 0.0240 
Epoch: [3][1400/1

Epoch 3 - avg_train_loss: 0.0242  avg_val_loss: 0.0435  time: 716s
Epoch 3 - Score: 0.836635843240863


EVAL: [242/243] Elapsed 0m 38s (remain 0m 0s) Loss: 0.0435 


Score: 0.92459
Best threshold : 0.17430
After optimizing score: 0.91185
Some weights of the model checkpoint at kamalkraj/bioelectra-base-discriminator-pubmed-pmc-lt were not used when initializing ElectraForSequenceClassification: ['discriminator_predictions.dense.bias', 'discriminator_predictions.dense.weight', 'discriminator_predictions.dense_prediction.bias', 'discriminator_predictions.dense_prediction.weight']
- This IS expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForSequenceClassification were not initialized from the mod

Epoch: [1][0/1454] Elapsed 0m 0s (remain 13m 12s) Loss: 0.7379 
Epoch: [1][100/1454] Elapsed 0m 44s (remain 10m 1s) Loss: 0.2304 
Epoch: [1][200/1454] Elapsed 1m 29s (remain 9m 15s) Loss: 0.1772 
Epoch: [1][300/1454] Elapsed 2m 13s (remain 8m 31s) Loss: 0.1581 
Epoch: [1][400/1454] Elapsed 2m 57s (remain 7m 47s) Loss: 0.1426 
Epoch: [1][500/1454] Elapsed 3m 42s (remain 7m 3s) Loss: 0.1308 
Epoch: [1][600/1454] Elapsed 4m 26s (remain 6m 18s) Loss: 0.1197 
Epoch: [1][700/1454] Elapsed 5m 11s (remain 5m 34s) Loss: 0.1129 
Epoch: [1][800/1454] Elapsed 5m 55s (remain 4m 50s) Loss: 0.1045 
Epoch: [1][900/1454] Elapsed 6m 40s (remain 4m 5s) Loss: 0.1008 
Epoch: [1][1000/1454] Elapsed 7m 24s (remain 3m 21s) Loss: 0.0972 
Epoch: [1][1100/1454] Elapsed 8m 8s (remain 2m 36s) Loss: 0.0925 
Epoch: [1][1200/1454] Elapsed 8m 53s (remain 1m 52s) Loss: 0.0897 
Epoch: [1][1300/1454] Elapsed 9m 37s (remain 1m 7s) Loss: 0.0867 
Epoch: [1][1400/1454] Elapsed 10m 22s (remain 0m 23s) Loss: 0.0848 
Epoch: [1]

Epoch 1 - avg_train_loss: 0.0839  avg_val_loss: 0.0544  time: 691s
Epoch 1 - Score: 0.8547829427583559
Epoch 1 - Save Best Score: 0.8548 Model


EVAL: [242/243] Elapsed 0m 45s (remain 0m 0s) Loss: 0.0544 
Epoch: [2][0/1454] Elapsed 0m 0s (remain 13m 5s) Loss: 0.0560 
Epoch: [2][100/1454] Elapsed 0m 44s (remain 9m 58s) Loss: 0.0415 
Epoch: [2][200/1454] Elapsed 1m 28s (remain 9m 12s) Loss: 0.0393 
Epoch: [2][300/1454] Elapsed 2m 12s (remain 8m 28s) Loss: 0.0478 
Epoch: [2][400/1454] Elapsed 2m 56s (remain 7m 43s) Loss: 0.0484 
Epoch: [2][500/1454] Elapsed 3m 40s (remain 6m 59s) Loss: 0.0458 
Epoch: [2][600/1454] Elapsed 4m 24s (remain 6m 15s) Loss: 0.0428 
Epoch: [2][700/1454] Elapsed 5m 8s (remain 5m 31s) Loss: 0.0421 
Epoch: [2][800/1454] Elapsed 5m 52s (remain 4m 47s) Loss: 0.0407 
Epoch: [2][900/1454] Elapsed 6m 36s (remain 4m 3s) Loss: 0.0402 
Epoch: [2][1000/1454] Elapsed 7m 20s (remain 3m 19s) Loss: 0.0395 
Epoch: [2][1100/1454] Elapsed 8m 4s (remain 2m 35s) Loss: 0.0396 
Epoch: [2][1200/1454] Elapsed 8m 48s (remain 1m 51s) Loss: 0.0391 
Epoch: [2][1300/1454] Elapsed 9m 32s (remain 1m 7s) Loss: 0.0385 
Epoch: [2][1400/145

Epoch 2 - avg_train_loss: 0.0377  avg_val_loss: 0.0338  time: 676s
Epoch 2 - Score: 0.9118541033434651
Epoch 2 - Save Best Score: 0.9119 Model


EVAL: [242/243] Elapsed 0m 36s (remain 0m 0s) Loss: 0.0338 
Epoch: [3][0/1454] Elapsed 0m 0s (remain 12m 26s) Loss: 0.0006 
Epoch: [3][100/1454] Elapsed 0m 44s (remain 9m 55s) Loss: 0.0255 
Epoch: [3][200/1454] Elapsed 1m 28s (remain 9m 11s) Loss: 0.0237 
Epoch: [3][300/1454] Elapsed 2m 12s (remain 8m 27s) Loss: 0.0235 
Epoch: [3][400/1454] Elapsed 2m 56s (remain 7m 43s) Loss: 0.0231 
Epoch: [3][500/1454] Elapsed 3m 40s (remain 6m 59s) Loss: 0.0236 
Epoch: [3][600/1454] Elapsed 4m 24s (remain 6m 14s) Loss: 0.0241 
Epoch: [3][700/1454] Elapsed 5m 8s (remain 5m 30s) Loss: 0.0256 
Epoch: [3][800/1454] Elapsed 5m 51s (remain 4m 46s) Loss: 0.0235 
Epoch: [3][900/1454] Elapsed 6m 35s (remain 4m 2s) Loss: 0.0251 
Epoch: [3][1000/1454] Elapsed 7m 19s (remain 3m 18s) Loss: 0.0255 
Epoch: [3][1100/1454] Elapsed 8m 3s (remain 2m 35s) Loss: 0.0253 
Epoch: [3][1200/1454] Elapsed 8m 47s (remain 1m 51s) Loss: 0.0244 
Epoch: [3][1300/1454] Elapsed 9m 31s (remain 1m 7s) Loss: 0.0249 
Epoch: [3][1400/14

Epoch 3 - avg_train_loss: 0.0251  avg_val_loss: 0.0327  time: 675s
Epoch 3 - Score: 0.919661733615222
Epoch 3 - Save Best Score: 0.9197 Model


EVAL: [242/243] Elapsed 0m 36s (remain 0m 0s) Loss: 0.0327 


Score: 0.91966
Best threshold : 0.04589
After optimizing score: 0.93750
Score: 0.88959
Best threshold : 0.02588
After optimizing score: 0.88877
Some weights of the model checkpoint at kamalkraj/bioelectra-base-discriminator-pubmed-pmc-lt were not used when initializing ElectraForSequenceClassification: ['discriminator_predictions.dense.bias', 'discriminator_predictions.dense.weight', 'discriminator_predictions.dense_prediction.bias', 'discriminator_predictions.dense_prediction.weight']
- This IS expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weigh

  0%|          | 0/2553 [00:00<?, ?it/s]

Some weights of the model checkpoint at kamalkraj/bioelectra-base-discriminator-pubmed-pmc-lt were not used when initializing ElectraForSequenceClassification: ['discriminator_predictions.dense.bias', 'discriminator_predictions.dense.weight', 'discriminator_predictions.dense_prediction.bias', 'discriminator_predictions.dense_prediction.weight']
- This IS expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at kamalkraj/bioelectra-base-discriminator-pubmed-pmc-lt a

  0%|          | 0/2553 [00:00<?, ?it/s]

Some weights of the model checkpoint at kamalkraj/bioelectra-base-discriminator-pubmed-pmc-lt were not used when initializing ElectraForSequenceClassification: ['discriminator_predictions.dense.bias', 'discriminator_predictions.dense.weight', 'discriminator_predictions.dense_prediction.bias', 'discriminator_predictions.dense_prediction.weight']
- This IS expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at kamalkraj/bioelectra-base-discriminator-pubmed-pmc-lt a

  0%|          | 0/2553 [00:00<?, ?it/s]

Some weights of the model checkpoint at kamalkraj/bioelectra-base-discriminator-pubmed-pmc-lt were not used when initializing ElectraForSequenceClassification: ['discriminator_predictions.dense.bias', 'discriminator_predictions.dense.weight', 'discriminator_predictions.dense_prediction.bias', 'discriminator_predictions.dense_prediction.weight']
- This IS expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at kamalkraj/bioelectra-base-discriminator-pubmed-pmc-lt a

  0%|          | 0/2553 [00:00<?, ?it/s]

Some weights of the model checkpoint at kamalkraj/bioelectra-base-discriminator-pubmed-pmc-lt were not used when initializing ElectraForSequenceClassification: ['discriminator_predictions.dense.bias', 'discriminator_predictions.dense.weight', 'discriminator_predictions.dense_prediction.bias', 'discriminator_predictions.dense_prediction.weight']
- This IS expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at kamalkraj/bioelectra-base-discriminator-pubmed-pmc-lt a

  0%|          | 0/2553 [00:00<?, ?it/s]

Some weights of the model checkpoint at kamalkraj/bioelectra-base-discriminator-pubmed-pmc-lt were not used when initializing ElectraForSequenceClassification: ['discriminator_predictions.dense.bias', 'discriminator_predictions.dense.weight', 'discriminator_predictions.dense_prediction.bias', 'discriminator_predictions.dense_prediction.weight']
- This IS expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at kamalkraj/bioelectra-base-discriminator-pubmed-pmc-lt a

  0%|          | 0/2553 [00:00<?, ?it/s]

Some weights of the model checkpoint at kamalkraj/bioelectra-base-discriminator-pubmed-pmc-lt were not used when initializing ElectraForSequenceClassification: ['discriminator_predictions.dense.bias', 'discriminator_predictions.dense.weight', 'discriminator_predictions.dense_prediction.bias', 'discriminator_predictions.dense_prediction.weight']
- This IS expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at kamalkraj/bioelectra-base-discriminator-pubmed-pmc-lt a

  0%|          | 0/2553 [00:00<?, ?it/s]