# Directory settings

In [1]:
import os

# Directory that receives the log file, tokenizer files, checkpoints and OOF csv.
OUTPUT_DIR = './'
# exist_ok=True makes this a no-op when the directory is already present, so no
# separate exists() check (which could race with another process) is needed.
os.makedirs(OUTPUT_DIR, exist_ok=True)

# CFG

In [2]:
# Experiment configuration table
class CFG:
    """Static settings read by the data, model and training cells."""

    # --- run mode / logging ---
    debug = False        # debug mode: the loading cell samples 100 rows
    train = True         # run the training driver cell
    apex = True          # mixed precision: enables GradScaler / autocast
    print_freq = 50      # print a training log line every N steps
    num_workers = 4      # worker count handed to the DataLoaders
    seed = 42            # random seed (used for the debug sample)

    # --- model ---
    model = "bert-base-uncased"  # pretrained checkpoint name
    fc_dropout = 0.      # dropout applied before the final linear layer
    target_size = 1      # output size of the head

    # --- data / tokenisation ---
    text = "text"        # name of the text column in single-text frames
    target = "target"    # target column name (not read by the cells shown here)
    head = 32            # tokens kept from the start of a long sequence
    tail = 32            # tokens kept from the end of a long sequence
    max_len = head + tail  # fixed length of every encoded sequence

    # --- optimiser ---
    encoder_lr = 1e-5    # learning rate for the encoder (BERT backbone)
    decoder_lr = 1e-5    # learning rate for the parameters outside the backbone
    min_lr = 1e-6        # minimum learning rate
    eps = 1e-6           # AdamW eps
    betas = (0.9, 0.999)  # AdamW betas
    weight_decay = 0.01  # weight decay
    gradient_accumulation_steps = 1  # micro-batches per optimizer step
    max_grad_norm = 1000  # gradient-norm clipping threshold
    margin = 0.5         # margin of MarginRankingLoss

    # --- learning-rate schedule ---
    scheduler = 'cosine'  # one of ['linear', 'cosine']
    batch_scheduler = True  # step the scheduler per batch rather than per epoch
    num_cycles = 0.5     # num_cycles argument of the cosine schedule
    num_warmup_steps = 0  # number of warm-up steps

    # --- training length / cross-validation ---
    epochs = 15          # epochs per fold
    batch_size = 64      # batch size
    n_fold = 5           # number of cross-validation folds
    trn_fold = [0, 1, 2, 3, 4]  # folds that are actually trained

# Library

In [3]:
# ====================================================
# 导入库
# ====================================================
import os
import gc
import re
import sys
import json
import time
import math
import string
import pickle
import random
import joblib
import itertools
import warnings
warnings.filterwarnings("ignore")

import scipy as sp
import numpy as np
import pandas as pd
pd.set_option('display.max_rows', 500) # 最多显示500行
pd.set_option('display.max_columns', 500) # 最多显示500列
pd.set_option('display.width', 1000)
from tqdm.auto import tqdm
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold, GroupKFold, KFold

import torch
import torch.nn as nn
from torch.nn import Parameter
import torch.nn.functional as F
from torch.optim import Adam, SGD, AdamW
from torch.utils.data import DataLoader, Dataset

#### Offline re-install of pinned Hugging Face packages from local wheel files.
# Each group below copies a wheel from ../input into a scratch folder and installs it
# with --no-index --find-links, i.e. from that folder only, without contacting an index.
# The return codes of os.system are ignored, so a failed step does not stop the cell.
# NOTE(review): the recorded output of this cell reports tokenizers 0.11.6 and
# transformers 4.16.2, not the 0.10.3 / 4.7.0 wheels named here — presumably these
# commands did not take effect in that run; confirm which versions are intended.

# Remove the currently installed versions first.
os.system('pip uninstall -q transformers -y')
os.system('pip uninstall -q tokenizers -y')
os.system('pip uninstall -q huggingface_hub -y')

# tokenizers 0.10.3 (cp37 wheel)
os.system('mkdir -p /tmp/pip/cache-tokenizers/')
os.system('cp ../input/tokenizers-0103/tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl /tmp/pip/cache-tokenizers/')
os.system('pip install -q --no-index --find-links /tmp/pip/cache-tokenizers/ tokenizers')

# huggingface_hub 0.0.8
os.system('mkdir -p /tmp/pip/cache-huggingface-hub/')
os.system('cp ../input/huggingface-hub-008/huggingface_hub-0.0.8-py3-none-any.whl /tmp/pip/cache-huggingface-hub/')
os.system('pip install -q --no-index --find-links /tmp/pip/cache-huggingface-hub/ huggingface_hub')

# transformers 4.7.0
os.system('mkdir -p /tmp/pip/cache-transformers/')
os.system('cp ../input/transformers-470/transformers-4.7.0-py3-none-any.whl /tmp/pip/cache-transformers/')
os.system('pip install -q --no-index --find-links /tmp/pip/cache-transformers/ transformers')
####

import tokenizers # 分词器
import transformers
print(f"tokenizers.__version__: {tokenizers.__version__}")
print(f"transformers.__version__: {transformers.__version__}")
from transformers import BertTokenizer, BertModel, BertConfig
from transformers import RobertaTokenizer, RobertaConfig, RobertaModel
from transformers import get_linear_schedule_with_warmup, get_cosine_schedule_with_warmup

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

tokenizers.__version__: 0.11.6
transformers.__version__: 4.16.2


# Utils

In [4]:
# ====================================================
# Utils
# ====================================================
def get_score(df):
    '''
    Pairwise ranking accuracy.

    Counts the rows whose 'less_toxic_pred' is strictly below 'more_toxic_pred'
    and divides by the total number of rows in df.
    '''
    ranked_correctly = df['less_toxic_pred'] < df['more_toxic_pred']
    n_correct = int(ranked_correctly.sum())
    n_total = len(df)
    return n_correct / n_total


def get_logger(filename=OUTPUT_DIR+'train'):
    '''
    Build the notebook logger.

    Messages at INFO level and above are written, unformatted, both to the
    console and to "<filename>.log".

    getLogger(__name__) returns the same logger object on every call, so handlers
    attached by an earlier call (for example when this cell is re-run in the same
    kernel) are removed and closed first. Without that, each re-run adds another
    pair of handlers and every message is emitted several times.

    Args:
        filename: path of the log file without the ".log" suffix.

    Returns:
        The configured logging.Logger.
    '''
    from logging import getLogger, INFO, StreamHandler, FileHandler, Formatter
    logger = getLogger(__name__)
    logger.setLevel(INFO)

    # Start from a clean logger so repeated calls do not duplicate output.
    for stale_handler in list(logger.handlers):
        logger.removeHandler(stale_handler)
        stale_handler.close()

    plain = Formatter("%(message)s")
    for handler in (StreamHandler(), FileHandler(filename=f"{filename}.log")):
        handler.setFormatter(plain)
        logger.addHandler(handler)
    return logger

LOGGER = get_logger()

def seed_everything(seed=42):
    '''
    Seed every random number generator used by the notebook.

    Seeds Python's random module, NumPy and PyTorch (CPU and all CUDA devices),
    exports PYTHONHASHSEED, and puts cuDNN in deterministic mode.

    Args:
        seed: integer seed applied to all generators.
    '''
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    # Also seed the non-current GPUs; this is a no-op when CUDA is unavailable.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # The cuDNN auto-tuner may select a different algorithm from run to run,
    # so it has to be switched off for runs to be repeatable.
    torch.backends.cudnn.benchmark = False

# NOTE(review): this seed differs from CFG.seed (42) — confirm which one is intended.
seed_everything(seed=2345)

# Data Loading

In [6]:
# ====================================================
# Load the official competition data
# ====================================================
# Folder holding the competition csv files. The default keeps the original
# location; set the DATA_DIR environment variable to run on another machine.
DATA_DIR = os.environ.get('DATA_DIR', 'F:/data/')

train = pd.read_csv(os.path.join(DATA_DIR, 'validation_data.csv'))
if CFG.debug:
    # Debug mode works on a fixed 100-row sample.
    train = train.sample(n=100, random_state=CFG.seed).reset_index(drop=True)
test = pd.read_csv(os.path.join(DATA_DIR, 'comments_to_score.csv'))
submission = pd.read_csv(os.path.join(DATA_DIR, 'sample_submission.csv'))
print(train.shape)
print(test.shape, submission.shape)
display(train.head())
display(test.head())
display(submission.head())

(30108, 3)
(7537, 2) (7537, 2)


Unnamed: 0,worker,less_toxic,more_toxic
0,313,This article sucks \n\nwoo woo wooooooo,WHAT!!!!!!!!?!?!!?!?!!?!?!?!?!!!!!!!!!!!!!!!!!...
1,188,"""And yes, people should recognize that but the...",Daphne Guinness \n\nTop of the mornin' my fav...
2,82,"Western Media?\n\nYup, because every crime in...","""Atom you don't believe actual photos of mastu..."
3,347,And you removed it! You numbskull! I don't car...,You seem to have sand in your vagina.\n\nMight...
4,539,smelly vagina \n\nBluerasberry why don't you ...,"hey \n\nway to support nazis, you racist"


Unnamed: 0,comment_id,text
0,114890,"""\n \n\nGjalexei, you asked about whether ther..."
1,732895,"Looks like be have an abuser , can you please ..."
2,1139051,I confess to having complete (and apparently b...
3,1434512,"""\n\nFreud's ideas are certainly much discusse..."
4,2084821,It is not just you. This is a laundry list of ...


Unnamed: 0,comment_id,score
0,114890,0.5
1,732895,0.5
2,1139051,0.5
3,1434512,0.5
4,2084821,0.5


# CV split

In [7]:
"""
整体逻辑（从for循环开始）：
K折切分训练集和验证集
给验证集打上标签
标签变成整型
train输出特征列:worker+less_toxic+more_toxic+fold
"""
from tqdm import tqdm
import numpy as np
import pandas as pd
from sklearn.model_selection import GroupKFold # K-fold cross validation

# Take the split count from the config so it always matches the folds that the
# training driver iterates over (range(CFG.n_fold)).
n_splits = CFG.n_fold
nrows = None

df = pd.read_csv("F:/data/validation_data.csv", nrows=nrows)
if CFG.debug:
    # `train` is rebuilt from `df` at the end of this cell, so the debug sample
    # has to be drawn here as well; otherwise debug mode trains on the full data.
    df = df.sample(n=min(100, len(df)), random_state=CFG.seed).reset_index(drop=True)

# Plain (unshuffled) K-fold split.
kfold = KFold(n_splits=n_splits)
# split() yields positional indices; the default RangeIndex makes them valid labels for .loc.
for fold, (trn, val) in enumerate(kfold.split(df)):
    # Every row is tagged with the number of the fold in which it is a validation row.
    df.loc[val, "fold"] = fold

df["fold"] = df["fold"].astype(int)

display(df.groupby('fold').size())

# Drop any 'fold' column left on `train` by an earlier run of this cell so that
# re-running does not produce two 'fold' columns.
base_columns = [col for col in train.columns if col != 'fold']
train = df[base_columns + ['fold']].copy()
train

fold
0    6022
1    6022
2    6022
3    6021
4    6021
dtype: int64

Unnamed: 0,worker,less_toxic,more_toxic,fold
0,313,This article sucks \n\nwoo woo wooooooo,WHAT!!!!!!!!?!?!!?!?!!?!?!?!?!!!!!!!!!!!!!!!!!...,0
1,188,"""And yes, people should recognize that but the...",Daphne Guinness \n\nTop of the mornin' my fav...,0
2,82,"Western Media?\n\nYup, because every crime in...","""Atom you don't believe actual photos of mastu...",0
3,347,And you removed it! You numbskull! I don't car...,You seem to have sand in your vagina.\n\nMight...,0
4,539,smelly vagina \n\nBluerasberry why don't you ...,"hey \n\nway to support nazis, you racist",0
...,...,...,...,...
30103,461,I'm sorry. I'm not an admin. I will give you t...,get out my large penis,4
30104,527,I'm sorry. I'm not an admin. I will give you t...,get out my large penis,4
30105,352,"wow...\nare you out of your mind, how was my e...",Piss off you slant eyed-gook,4
30106,311,"wow...\nare you out of your mind, how was my e...",Piss off you slant eyed-gook,4


# tokenizer

In [8]:
# ====================================================
# tokenizer
# ====================================================
# Load the tokenizer that belongs to CFG.model. The lower-casing option of
# BertTokenizer is named `do_lower_case`; the previous `lowercase=True` is not a
# BertTokenizer argument.
tokenizer = BertTokenizer.from_pretrained(CFG.model, do_lower_case=True)
# Keep a copy of the tokenizer files next to the other outputs.
tokenizer.save_pretrained(OUTPUT_DIR+'tokenizer/')
# prepare_input() reads the tokenizer from the config object.
CFG.tokenizer = tokenizer

Downloading: 100%|██████████| 28.0/28.0 [00:00<00:00, 14.3kB/s]
Downloading: 100%|██████████| 226k/226k [00:01<00:00, 198kB/s]  
Downloading: 100%|██████████| 455k/455k [00:02<00:00, 207kB/s]  
Downloading: 100%|██████████| 570/570 [00:00<00:00, 190kB/s]


# Dataset

In [9]:
# ====================================================
# Dataset
# ====================================================
def prepare_input(text, cfg):
    """Encode one text into fixed-length tensors of size cfg.max_len.

    Two modes, selected by cfg.tail:

    * cfg.tail == 0: the tokenizer truncates and pads to cfg.max_len itself.
    * otherwise: the text is encoded without padding; a sequence longer than
      cfg.max_len is reduced to its first cfg.head and last cfg.tail entries,
      and shorter sequences are right-padded (pad_token_id for 'input_ids',
      0 for every other field).

    Args:
        text: the string to encode.
        cfg: object providing `tokenizer`, `max_len`, `head` and `tail`.

    Returns:
        The tokenizer output with every field replaced by a torch.long tensor.
    """
    if cfg.tail == 0:
        inputs = cfg.tokenizer.encode_plus(text,
                                           return_tensors=None,
                                           add_special_tokens=True,
                                           max_length=cfg.max_len,
                                           # replaces the deprecated pad_to_max_length=True
                                           padding='max_length',
                                           truncation=True)
        # k is the field name (e.g. 'input_ids'), v the list of values for that field.
        for k, v in inputs.items():
            inputs[k] = torch.tensor(v, dtype=torch.long)
    else:
        # truncation=True without max_length is passed through unchanged from the
        # original call; the head/tail cut is done below.
        # NOTE(review): presumably this truncates to the model's maximum length, in
        # which case the "tail" is the tail of the truncated sequence — confirm.
        inputs = cfg.tokenizer.encode_plus(text,
                                           return_tensors=None,
                                           add_special_tokens=True,
                                           truncation=True)
        for k, v in inputs.items():
            if len(v) > cfg.max_len:
                # Keep both ends of the sequence and drop the middle.
                v = np.hstack([v[:cfg.head], v[-cfg.tail:]])
            fill_value = cfg.tokenizer.pad_token_id if k == 'input_ids' else 0
            padded = np.full(cfg.max_len, fill_value, dtype=np.int64)
            # Use the length after the head/tail cut; the pre-cut length made this
            # assignment fail whenever cfg.max_len was larger than head + tail.
            padded[:len(v)] = v
            inputs[k] = torch.tensor(padded, dtype=torch.long)
    return inputs


class TrainDataset(Dataset):
    '''
    Dataset of (less toxic, more toxic) comment pairs used for training.

    Missing texts in either column are replaced by the string "none".
    '''
    def __init__(self, cfg, df):
        self.cfg = cfg
        self.less_toxic, self.more_toxic = (
            df[column].fillna("none").values
            for column in ('less_toxic', 'more_toxic')
        )

    def __len__(self):
        return len(self.less_toxic)

    def __getitem__(self, item):
        # Encode both sides of the pair with the shared preprocessing routine.
        encoded_pair = [
            prepare_input(str(texts[item]), self.cfg)
            for texts in (self.less_toxic, self.more_toxic)
        ]
        # The label is the constant 1 for every pair.
        label = torch.tensor(1, dtype=torch.float)
        return encoded_pair[0], encoded_pair[1], label


class TestDataset(Dataset):
    '''
    Dataset of single texts, read from the column named by cfg.text.

    Missing texts are replaced by the string "none".
    '''
    def __init__(self, cfg, df):
        self.cfg = cfg
        text_column = df[cfg.text]
        self.text = text_column.fillna("none").values

    def __len__(self):
        return len(self.text)

    def __getitem__(self, item):
        # Returns only the encoded inputs; there is no label.
        return prepare_input(str(self.text[item]), self.cfg)

# Model

In [10]:
# ====================================================
# Model
# ====================================================
class CustomModel(nn.Module):
    """BERT encoder followed by mean pooling, dropout and one linear layer.

    Args:
        cfg: config object; `model`, `fc_dropout` and `target_size` are read.
        config_path: optional path to a config object saved with torch.save;
            when None the config is fetched with BertConfig.from_pretrained.
        pretrained: if True the encoder is created with
            BertModel.from_pretrained(cfg.model); otherwise it is built from the
            config alone (no checkpoint is loaded by this class in that case).
    """
    def __init__(self, cfg, config_path=None, pretrained=False):
        super().__init__()
        self.cfg = cfg

        if config_path is not None:
            # NOTE(review): torch.load unpickles the file — only load configs you trust.
            self.config = torch.load(config_path)
        else:
            self.config = BertConfig.from_pretrained(cfg.model, output_hidden_states=True)

        self.model = (
            BertModel.from_pretrained(cfg.model, config=self.config)
            if pretrained
            else BertModel(self.config)
        )
        self.fc_dropout = nn.Dropout(cfg.fc_dropout)
        self.fc = nn.Linear(self.config.hidden_size, cfg.target_size)

    def feature(self, inputs):
        """Run the encoder and average its first output over dimension 1."""
        encoder_outputs = self.model(**inputs)  # the inputs mapping is unpacked into keyword arguments
        # NOTE(review): the plain mean does not use the attention mask, so padded
        # positions presumably contribute to the average — confirm this is intended.
        return encoder_outputs[0].mean(dim=1)

    def forward(self, inputs):
        """Return the linear head's output for the pooled feature."""
        pooled = self.feature(inputs)
        return self.fc(self.fc_dropout(pooled))

# Helper functions

In [11]:
# ====================================================
# Helper functions
# ====================================================
class AverageMeter(object):
    """Tracks the latest value and the weighted running average of a metric."""

    def __init__(self):
        self.reset()

    def reset(self):
        """Set the latest value, average, weighted sum and count back to zero."""
        self.val = self.avg = self.sum = self.count = 0

    def update(self, val, n=1):
        """Record `val` as observed `n` times and refresh the running average."""
        self.val = val
        self.count += n
        self.sum += val * n
        self.avg = self.sum / self.count


def asMinutes(s):
    '''
    Format a duration given in seconds as "<minutes>m <seconds>s".
    '''
    whole_minutes = math.floor(s / 60)
    leftover_seconds = s - whole_minutes * 60
    return '%dm %ds' % (whole_minutes, leftover_seconds)


def timeSince(since, percent):
    '''
    Report elapsed time and a remaining-time estimate.

    `since` is a time.time() timestamp and `percent` the fraction of the work
    already done; the estimated total is elapsed / percent.
    '''
    elapsed = time.time() - since
    remaining = elapsed / percent - elapsed
    return '%s (remain %s)' % (asMinutes(elapsed), asMinutes(remaining))

# One training epoch: read a batch, run the model, update the loss meter,
# back-propagate, then update the optimizer / scheduler.
def train_fn(fold, train_loader, model, criterion, optimizer, epoch, scheduler, device):
    """Train `model` for one epoch on pairs of (less toxic, more toxic) inputs.

    Args:
        fold: fold number (only referenced by the commented-out logging lines).
        train_loader: yields (less_toxic_inputs, more_toxic_inputs, labels).
        model: the network; it is called once for each side of the pair.
        criterion: ranking loss called as criterion(more, less, labels).
        optimizer: optimizer stepped every CFG.gradient_accumulation_steps batches.
        epoch: zero-based epoch index, used for the progress line.
        scheduler: learning-rate scheduler, stepped here when CFG.batch_scheduler is set.
        device: device the batches are moved to.

    Returns:
        The sample-weighted average loss over the epoch.
    """
    model.train()
    scaler = torch.cuda.amp.GradScaler(enabled=CFG.apex)  # loss scaling for mixed precision
    losses = AverageMeter()
    start = time.time()
    grad_norm = 0.0  # shown in the progress line until the first optimizer step
    last_step = len(train_loader) - 1

    for step, (less_toxic_inputs, more_toxic_inputs, labels) in enumerate(train_loader):
        for k, v in less_toxic_inputs.items():
            less_toxic_inputs[k] = v.to(device)
        for k, v in more_toxic_inputs.items():
            more_toxic_inputs[k] = v.to(device)
        labels = labels.to(device)
        batch_size = labels.size(0)

        with torch.cuda.amp.autocast(enabled=CFG.apex):
            # Flatten the (batch, 1) scores so they have the same shape as `labels`;
            # the loss otherwise relies on broadcasting (batch, 1) against (batch,).
            less_toxic_y_preds = model(less_toxic_inputs).reshape(-1)
            more_toxic_y_preds = model(more_toxic_inputs).reshape(-1)
            loss = criterion(more_toxic_y_preds, less_toxic_y_preds, labels)
        losses.update(loss.item(), batch_size)

        if CFG.gradient_accumulation_steps > 1:
            # Gradient accumulation: average the loss over the accumulated micro-batches.
            loss = loss / CFG.gradient_accumulation_steps
        scaler.scale(loss).backward()

        if (step + 1) % CFG.gradient_accumulation_steps == 0:
            # Gradients are still multiplied by the loss scale here; unscale them so
            # that CFG.max_grad_norm is compared against the true gradient norm.
            scaler.unscale_(optimizer)
            grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), CFG.max_grad_norm)
            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad()
            if CFG.batch_scheduler:
                scheduler.step()

        # Print the training progress.
        if step % CFG.print_freq == 0 or step == last_step:
            print('Epoch: [{0}][{1}/{2}] '
                  'Elapsed {remain:s} '
                  'Loss: {loss.val:.4f}({loss.avg:.4f}) '
                  'Grad: {grad_norm:.4f}  '
                  'LR: {lr:.8f}  '
                  .format(epoch+1, step, len(train_loader),
                          remain=timeSince(start, float(step+1)/len(train_loader)),
                          loss=losses,
                          grad_norm=grad_norm,
                          # get_last_lr() is the accessor meant for reading the current rate
                          lr=scheduler.get_last_lr()[0]))
        #wandb.log({f"[fold{fold}] loss": losses.val,
        #           f"[fold{fold}] lr": scheduler.get_last_lr()[0]})
    return losses.avg

# Prediction pass, intended for inference on Kaggle.
def inference_fn(test_loader, model, device):
    """Run `model` over `test_loader` and return the sigmoid of its outputs.

    The model is put in eval mode and moved to `device`; every batch is moved to
    `device` as well and evaluated without gradient tracking. The per-batch
    results are concatenated into one NumPy array.
    """
    model.eval()
    model.to(device)

    batch_outputs = []
    for inputs in tqdm(test_loader, total=len(test_loader)):
        for name, tensor in list(inputs.items()):
            inputs[name] = tensor.to(device)
        with torch.no_grad():
            raw_scores = model(inputs)
        batch_outputs.append(raw_scores.sigmoid().to('cpu').numpy())

    return np.concatenate(batch_outputs)

In [12]:
# ====================================================
# train loop
# ====================================================
def train_loop(folds, fold):
    """Train and validate one cross-validation fold.

    Rows of `folds` whose 'fold' value equals `fold` form the validation pairs;
    all other rows are the training pairs. After every epoch each distinct
    validation text is scored, the scores are merged back onto the pairs, and
    the pairwise accuracy from get_score decides whether the checkpoint is saved.

    Args:
        folds: DataFrame with 'less_toxic', 'more_toxic' and 'fold' columns.
        fold: number of the fold used for validation.

    Returns:
        The validation pairs with 'less_toxic_pred' and 'more_toxic_pred'
        columns taken from the best-scoring epoch.
    """
    LOGGER.info(f"========== fold: {fold} training ==========")

    # ====================================================
    # loader
    # ====================================================
    is_validation = folds['fold'] == fold
    # reset_index gives both subsets a clean 0..n-1 index after the row selection.
    train_folds = folds[~is_validation].reset_index(drop=True)
    validation = folds[is_validation].reset_index(drop=True)
    # Every distinct text of the validation pairs is scored once; the scores are
    # merged back onto the pairs by attach_predictions below.
    unique_texts = sorted(set(validation['less_toxic'].unique()) | set(validation['more_toxic'].unique()))
    # The column is named by CFG.text because TestDataset and the merges read it by that name.
    valid_folds = pd.DataFrame({CFG.text: unique_texts}).reset_index()

    def attach_predictions(pairs, text_preds):
        """Return `pairs` with fresh less/more prediction columns for `text_preds`."""
        # The model emits one column per text; flatten it to one score per row.
        valid_folds['pred'] = np.asarray(text_preds).reshape(-1)
        pairs = pairs.drop(columns=['less_toxic_pred', 'more_toxic_pred'], errors='ignore')
        for side in ('less_toxic', 'more_toxic'):
            lookup = valid_folds[[CFG.text, 'pred']].rename(
                columns={CFG.text: side, 'pred': f'{side}_pred'})
            pairs = pairs.merge(lookup, on=side, how='left')
        return pairs

    train_dataset = TrainDataset(CFG, train_folds)
    valid_dataset = TestDataset(CFG, valid_folds)

    train_loader = DataLoader(train_dataset,
                              batch_size=CFG.batch_size,
                              shuffle=True,
                              num_workers=CFG.num_workers, pin_memory=True, drop_last=True)
    valid_loader = DataLoader(valid_dataset,
                              batch_size=CFG.batch_size,
                              shuffle=False,
                              num_workers=CFG.num_workers, pin_memory=True, drop_last=False)

    # ====================================================
    # model & optimizer
    # ====================================================
    model = CustomModel(CFG, config_path=None, pretrained=True)
    torch.save(model.config, OUTPUT_DIR+'config.pth')
    model.to(device)

    def get_optimizer_params(model, encoder_lr, decoder_lr, weight_decay=0.0):
        """Build the parameter groups: backbone with / without weight decay, then the rest."""
        no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
        optimizer_parameters = [
            {'params': [p for n, p in model.model.named_parameters() if not any(nd in n for nd in no_decay)],
             'lr': encoder_lr, 'weight_decay': weight_decay},
            {'params': [p for n, p in model.model.named_parameters() if any(nd in n for nd in no_decay)],
             'lr': encoder_lr, 'weight_decay': 0.0},
            {'params': [p for n, p in model.named_parameters() if "model" not in n],
             'lr': decoder_lr, 'weight_decay': 0.0}
        ]
        return optimizer_parameters

    optimizer_parameters = get_optimizer_params(model,
                                                encoder_lr=CFG.encoder_lr,
                                                decoder_lr=CFG.decoder_lr,
                                                weight_decay=CFG.weight_decay)
    optimizer = AdamW(optimizer_parameters, lr=CFG.encoder_lr, eps=CFG.eps, betas=CFG.betas)

    # ====================================================
    # scheduler
    # ====================================================
    def get_scheduler(cfg, optimizer, num_train_steps):
        """Create the warm-up schedule named by cfg.scheduler."""
        if cfg.scheduler == 'linear':
            return get_linear_schedule_with_warmup(
                optimizer, num_warmup_steps=cfg.num_warmup_steps, num_training_steps=num_train_steps
            )
        if cfg.scheduler == 'cosine':
            return get_cosine_schedule_with_warmup(
                optimizer, num_warmup_steps=cfg.num_warmup_steps, num_training_steps=num_train_steps, num_cycles=cfg.num_cycles
            )
        # Fail with a clear message instead of an UnboundLocalError.
        raise ValueError(f"Unknown scheduler: {cfg.scheduler!r} (expected 'linear' or 'cosine')")

    if CFG.batch_scheduler:
        # One scheduler step per optimizer step: len(train_loader) already reflects
        # drop_last=True, and accumulation divides the number of optimizer steps.
        num_train_steps = len(train_loader) // CFG.gradient_accumulation_steps * CFG.epochs
    else:
        # One scheduler step per epoch (made in the loop below).
        num_train_steps = CFG.epochs
    scheduler = get_scheduler(CFG, optimizer, num_train_steps)

    # ====================================================
    # loop
    # ====================================================
    criterion = nn.MarginRankingLoss(margin=CFG.margin)

    # Start below any possible score so the first epoch always writes a checkpoint;
    # the reload after the loop depends on that file existing.
    best_score = float('-inf')
    best_path = OUTPUT_DIR+f"{CFG.model.replace('/', '-')}_fold{fold}_best.pth"

    for epoch in range(CFG.epochs):

        start_time = time.time()

        # train
        avg_loss = train_fn(fold, train_loader, model, criterion, optimizer, epoch, scheduler, device)
        if not CFG.batch_scheduler:
            # train_fn only steps the scheduler in per-batch mode.
            scheduler.step()

        # eval
        preds = inference_fn(valid_loader, model, device)

        # scoring: map the per-text scores back onto the (less, more) pairs so the
        # ordering of each pair can be checked.
        validation = attach_predictions(validation, preds)
        score = get_score(validation)

        elapsed = time.time() - start_time

        LOGGER.info(f'Epoch {epoch+1} - avg_train_loss: {avg_loss:.4f}  time: {elapsed:.0f}s')
        LOGGER.info(f'Epoch {epoch+1} - Score: {score:.4f}')

        if score > best_score:
            best_score = score
            LOGGER.info(f'Epoch {epoch+1} - Save Best Score: {score:.4f} Model')
            torch.save({'model': model.state_dict(),
                        'preds': preds},
                        best_path)

    # Rebuild the pair-level predictions from the best epoch's stored scores.
    preds = torch.load(best_path, map_location=torch.device('cpu'))['preds']
    validation = attach_predictions(validation, preds)

    # Drop the references first so the collector and the CUDA cache can release the memory.
    del model, optimizer, scheduler
    gc.collect()
    torch.cuda.empty_cache()

    return validation

In [13]:
if __name__ == '__main__':

    def get_result(oof_df):
        """Log the pairwise ranking score of a frame of out-of-fold predictions."""
        score = get_score(oof_df)
        LOGGER.info(f'Score: {score:<.4f}')

    if CFG.train:
        # Each fold returns its own validation pairs with predictions; stacked
        # together they are the out-of-fold predictions for the trained folds.
        fold_frames = []
        for fold in range(CFG.n_fold):
            if fold not in CFG.trn_fold:
                continue
            _oof_df = train_loop(train, fold)
            fold_frames.append(_oof_df)
            LOGGER.info(f"========== fold: {fold} result ==========")
            get_result(_oof_df)

        if fold_frames:
            # Concatenate once instead of growing a DataFrame inside the loop.
            oof_df = pd.concat(fold_frames).reset_index(drop=True)
            # CV result
            LOGGER.info(f"========== CV ==========")
            get_result(oof_df)
            # save result
            oof_df.to_csv(OUTPUT_DIR+'oof_df.csv', index=False)
        else:
            # Nothing was trained, so there is nothing to score or save.
            oof_df = pd.DataFrame()
            LOGGER.info("No fold from CFG.trn_fold was trained; skipping the CV score.")

Downloading: 100%|██████████| 420M/420M [01:38<00:00, 4.49MB/s]   
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
