In [1]:
# ====================================================
# Library
# ====================================================
import os
import gc
import re
import ast
import sys
import copy
import json
import time
import math
import shutil
import string
import pickle
import random
import joblib
import itertools
from pathlib import Path
import warnings
warnings.filterwarnings("ignore")

import os
import scipy as sp
import numpy as np
import pandas as pd
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
from tqdm.auto import tqdm
from sklearn.metrics import f1_score
from sklearn.model_selection import StratifiedKFold, GroupKFold, KFold
from collections import OrderedDict
import torch
print(f"torch.__version__: {torch.__version__}")
import torch.nn as nn
from torch.nn import Parameter
import torch.nn.functional as F
from torch.optim import Adam, SGD, AdamW
from torch.utils.data import DataLoader, Dataset
from torch.optim.swa_utils import AveragedModel, SWALR
from mixout.mixout import MixLinear
from util import *
import tokenizers
import transformers
print(f"tokenizers.__version__: {tokenizers.__version__}")
print(f"transformers.__version__: {transformers.__version__}")
from transformers import AutoTokenizer, AutoModel, AutoConfig
from transformers import get_linear_schedule_with_warmup, get_cosine_schedule_with_warmup
%env TOKENIZERS_PARALLELISM=true

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

torch.__version__: 1.9.0
tokenizers.__version__: 0.10.3
transformers.__version__: 4.14.1
env: TOKENIZERS_PARALLELISM=true


# CFG

In [2]:
# ====================================================
# CFG
# ====================================================
class CFG:
    wandb=False
    competition='PPPM'
    debug=False
    apex=True
    print_freq=100
    num_workers=32
    model="facebook/bart-large-cnn"#
    scheduler='cosine'# ['linear', 'cosine']
    batch_scheduler=True
    num_cycles=0.5
    num_warmup_steps=0
    epochs=5
    encoder_lr=5e-6
    decoder_lr=5e-6
    min_lr=5e-7
    eps=1e-6
    betas=(0.9, 0.999)
    batch_size=64   # base 64 large 38 xlarge 18
    fc_dropout=0.2
    target_size=1
    max_len=512
    weight_decay=0.01
    gradient_accumulation_steps=2
    max_grad_norm=1000
    seed=1989
    n_fold=5
    trn_fold=[i for i in range(5)]
    train=True 
    cut_mix=False 
    rand_mask=False  
    val_freq = 10
    rdrop=False 
    mixout=False 
    pgd=False 
    special_tokens=False 
    reinit_layers=False 
    layer= 'new'# attention, last_cls, transformer, TextCNN, new,  working:poolatt 
    output_dir = '../outputs/new_bart_large/'

    


INPUT_DIR = '../data/'
OUTPUT_DIR = CFG.output_dir
if not os.path.exists(OUTPUT_DIR):
    os.makedirs(OUTPUT_DIR)
    
if CFG.debug:
    CFG.epochs = 2
    CFG.trn_fold = [0]
    
if CFG.model == "microsoft/cocolm-large": 
    from cocolm.modeling_cocolm import COCOLMSCLHead as AutoModel
    from cocolm.configuration_cocolm import COCOLMConfig as AutoConfig
    from cocolm.tokenization_cocolm import COCOLMTokenizer as AutoTokenizer

In [3]:
# ====================================================
# wandb
# ====================================================
if CFG.wandb:
    
    import wandb

    wandb.login(key='2a353fd7e7ae96e46cacafa4bd6db033c0649977')

    def class2dict(f):
        return dict((name, getattr(f, name)) for name in dir(f) if not name.startswith('__'))

    run = wandb.init(project='PPPM', 
                     name=CFG.model,
                     config=class2dict(CFG),
                     group=CFG.model,
                     job_type="train",
                     anonymous=None)

# Utils

In [4]:
# ====================================================
# Utils
# ====================================================
def get_score(y_true, y_pred):
    score = sp.stats.pearsonr(y_true, y_pred)[0]
    return score


def get_logger(filename=OUTPUT_DIR+'train'):
    from logging import getLogger, INFO, StreamHandler, FileHandler, Formatter
    logger = getLogger(__name__)
    logger.setLevel(INFO)
    handler1 = StreamHandler()
    handler1.setFormatter(Formatter("%(message)s"))
    handler2 = FileHandler(filename=f"{filename}.log")
    handler2.setFormatter(Formatter("%(message)s"))
    logger.addHandler(handler1)
    logger.addHandler(handler2)
    return logger

LOGGER = get_logger()

def seed_everything(seed=42):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    
seed_everything(seed=42)

# Data Loading

In [5]:
# ====================================================
# Data Loading
# ====================================================
train = pd.read_csv(INPUT_DIR+'train.csv')
test = pd.read_csv(INPUT_DIR+'test.csv')
submission = pd.read_csv(INPUT_DIR+'sample_submission.csv')
print(f"train.shape: {train.shape}")
print(f"test.shape: {test.shape}")
print(f"submission.shape: {submission.shape}")

train.shape: (36473, 5)
test.shape: (36, 4)
submission.shape: (36, 2)


In [6]:
# ====================================================
# CPC Data
# ====================================================
def get_cpc_texts():
    contexts = []
    pattern = '[A-Z]\d+'
    for file_name in os.listdir('../data/CPCSchemeXML202105'):
        result = re.findall(pattern, file_name)
        if result:
            contexts.append(result)
    contexts = sorted(set(sum(contexts, [])))
    results = {}
    for cpc in ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'Y']:
        with open(f'../data/CPCTitleList202202/cpc-section-{cpc}_20220201.txt') as f:
            s = f.read()
        pattern = f'{cpc}\t\t.+'
        result = re.findall(pattern, s)
        cpc_result = result[0].lstrip(pattern)
        for context in [c for c in contexts if c[0] == cpc]:
            pattern = f'{context}\t\t.+'
            result = re.findall(pattern, s)
            results[context] = cpc_result + ". " + result[0].lstrip(pattern)
    return results


cpc_texts = get_cpc_texts()
torch.save(cpc_texts, OUTPUT_DIR+"cpc_texts.pth")
train['context_text'] = train['context'].map(cpc_texts)
test['context_text'] = test['context'].map(cpc_texts)
display(train.head())
display(test.head())

Unnamed: 0,id,anchor,target,context,score,context_text
0,37d61fd2272659b1,abatement,abatement of pollution,A47,0.5,HUMAN NECESSITIES. FURNITURE; DOMESTIC ARTICLE...
1,7b9652b17b68b7a4,abatement,act of abating,A47,0.75,HUMAN NECESSITIES. FURNITURE; DOMESTIC ARTICLE...
2,36d72442aefd8232,abatement,active catalyst,A47,0.25,HUMAN NECESSITIES. FURNITURE; DOMESTIC ARTICLE...
3,5296b0c19e1ce60e,abatement,eliminating process,A47,0.5,HUMAN NECESSITIES. FURNITURE; DOMESTIC ARTICLE...
4,54c1e3b9184cb5b6,abatement,forest region,A47,0.0,HUMAN NECESSITIES. FURNITURE; DOMESTIC ARTICLE...


Unnamed: 0,id,anchor,target,context,context_text
0,4112d61851461f60,opc drum,inorganic photoconductor drum,G02,PHYSICS. OPTICS
1,09e418c93a776564,adjust gas flow,altering gas flow,F23,MECHANICAL ENGINEERING; LIGHTING; HEATING; WEA...
2,36baf228038e314b,lower trunnion,lower locating,B60,PERFORMING OPERATIONS; TRANSPORTING. VEHICLES ...
3,1f37ead645e7f0c8,cap component,upper portion,D06,TEXTILES; PAPER. TREATMENT OF TEXTILES OR THE ...
4,71a5b6ad068d531f,neural stimulation,artificial neural network,H04,ELECTRICITY. ELECTRIC COMMUNICATION TECHNIQUE


In [7]:
if CFG.special_tokens:
    train['text'] = train['anchor'] + '[SEP]' + train['target'] + '[cpc]'  + train['context_text']
    test['text'] = test['anchor'] + '[SEP]' + test['target'] + '[cpc]'  + test['context_text']
else:
    train['text'] = train['anchor'] + '[SEP]' + train['target'] + '[SEP]'  + train['context_text']
    test['text'] = test['anchor'] + '[SEP]' + test['target'] + '[SEP]'  + test['context_text']

# CV split

In [8]:
# def create_folds(data_frame, targets, groups, folds=5, seed=42, shuffle=True, fold_column="fold"):
#     cv_strategy = StratifiedGroupKFold(n_splits=folds, random_state=seed, shuffle=shuffle)
#     folds = cv_strategy.split(X=data_frame, y=targets, groups=groups)
#     for fold, (train_indexes, validation_indexes) in enumerate(folds):
#         data_frame.loc[validation_indexes, fold_column] =  int(fold)
        
#     data_frame[fold_column] = data_frame[fold_column].astype(int)
    
#     return data_frame
# from sklearn.model_selection import StratifiedGroupKFold
# train["score_bin"] = pd.cut(train["score"], bins=5, labels=False)
# train = create_folds(data_frame=train, 
#                      targets=train["score_bin"].values,
#                      groups=train["anchor"].values,
#                      folds=CFG.n_fold, 
#                      seed=42, 
#                      shuffle=True)

In [9]:
# # ====================================================
# # CV split
# # ====================================================
# from iterstrat.ml_stratifiers import MultilabelStratifiedKFold

# dfx = pd.get_dummies(train, columns=["score"]).groupby(["anchor"], as_index=False).sum()
# cols = [c for c in dfx.columns if c.startswith("score_") or c == "anchor"]
# dfx = dfx[cols]

# mskf = MultilabelStratifiedKFold(n_splits=CFG.n_fold, shuffle=True, random_state=42)
# labels = [c for c in dfx.columns if c != "anchor"]
# dfx_labels = dfx[labels]
# dfx["fold"] = -1

# for fold, (trn_, val_) in enumerate(mskf.split(dfx, dfx_labels)):
#     print(len(trn_), len(val_))
#     dfx.loc[val_, "fold"] = fold

# train = train.merge(dfx[["anchor", "fold"]], on="anchor", how="left")
# ====================================================
# CV split
# ====================================================
train['score_map'] = train['score'].map({0.00: 0, 0.25: 1, 0.50: 2, 0.75: 3, 1.00: 4})
Fold = StratifiedKFold(n_splits=CFG.n_fold, shuffle=True, random_state=CFG.seed)
for n, (train_index, val_index) in enumerate(Fold.split(train, train['score_map'])):
    train.loc[val_index, 'fold'] = int(n)
train['fold'] = train['fold'].astype(int)
display(train.groupby('fold').size())


fold
0    7295
1    7295
2    7295
3    7294
4    7294
dtype: int64

In [10]:
# import pickle 
# pse = pd.read_csv('../data/add_dataset.csv')
# with open('pse_fold0.pkl','rb') as f:
#     pse_score = pickle.load(f)
# pse['context_text'] = pse['context'].map(cpc_texts)
# pse['score'] = pse_score[0]
# pse['score'] =pse['score'].astype('float32')
# pse['fold'] = 0
# pse['text'] = pse['anchor'] + '[SEP]' + pse['target'] + '[cpc]'  + pse['context_text']
# pse['id'] = 'aaa'
# train = pd.concat((train[train['fold'] != 0].reset_index(),pse),0)

In [11]:
if CFG.debug:
    display(train.groupby('fold').size())
    train = train.sample(n=1000, random_state=0).reset_index(drop=True)
    display(train.groupby('fold').size())

# tokenizer

In [12]:
# ====================================================
# tokenizer
# ====================================================
if CFG.special_tokens:
    tokenizer = AutoTokenizer.from_pretrained(CFG.model,
                                              additional_special_tokens = ['[cpc]']
    )
else:
    tokenizer = AutoTokenizer.from_pretrained(CFG.model)
#tokenizer.save_pretrained(OUTPUT_DIR+'tokenizer/')
CFG.tokenizer = tokenizer

# Dataset

In [13]:
# ====================================================
# Define max_len
# ====================================================
lengths_dict = {}

lengths = []
tk0 = tqdm(cpc_texts.values(), total=len(cpc_texts))
for text in tk0:
    length = len(tokenizer(text, add_special_tokens=False)['input_ids'])
    lengths.append(length)
lengths_dict['context_text'] = lengths

for text_col in ['anchor', 'target']:
    lengths = []
    tk0 = tqdm(train[text_col].fillna("").values, total=len(train))
    for text in tk0:
        length = len(tokenizer(text, add_special_tokens=False)['input_ids'])
        lengths.append(length)
    lengths_dict[text_col] = lengths
    
CFG.max_len = max(lengths_dict['anchor']) + max(lengths_dict['target'])\
                + max(lengths_dict['context_text']) + 4 # CLS + SEP + SEP + SEP
LOGGER.info(f"max_len: {CFG.max_len}")

  0%|          | 0/136 [00:00<?, ?it/s]

  0%|          | 0/36473 [00:00<?, ?it/s]

  0%|          | 0/36473 [00:00<?, ?it/s]

max_len: 175


In [14]:
# ====================================================
# Dataset
# ====================================================
def prepare_input(cfg, text):
    inputs = cfg.tokenizer(text,
                           add_special_tokens=True,
                           max_length=cfg.max_len,
                           padding="max_length",
                           return_offsets_mapping=False)
    for k, v in inputs.items():
        inputs[k] = torch.tensor(v, dtype=torch.long)
    return inputs


class TrainDataset(Dataset):
    def __init__(self, cfg, df):
        self.cfg = cfg
        self.texts = df['text'].values
        self.labels = df['score'].values

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, item):
        inputs = prepare_input(self.cfg, self.texts[item])
        label = torch.tensor(self.labels[item], dtype=torch.float)
        return inputs, label

In [15]:

# train_dataset = TrainDataset(CFG, train)
# inputs, label = train_dataset[0]
# print(inputs)
# print(label)



# Model

In [16]:
# class CustomModel(nn.Module):
#     def __init__(self, cfg, config_path=None, pretrained=False):
#         super().__init__()
#         self.cfg = cfg
#         if config_path is None:
#             self.config = AutoConfig.from_pretrained(cfg.model, output_hidden_states=True)
#         else:
#             self.config = torch.load(config_path)
#         if pretrained:
#             self.model = AutoModel.from_pretrained(cfg.model, config=self.config)
#         else:
#             self.model = AutoModel.from_config(self.config)
#         self.fc_dropout = nn.Dropout(cfg.fc_dropout)
#         self.fc = nn.Linear(self.config.hidden_size, self.cfg.target_size)
#         self._init_weights(self.fc)
#         self.attention = nn.Sequential(
#             nn.Linear(self.config.hidden_size, 512),
#             nn.Tanh(),
#             nn.Linear(512, 1),
#             nn.Softmax(dim=1)
#         )
#         self._init_weights(self.attention)
#         self.high_dropout = nn.Dropout(p=0.5)
# #         self.reinit_layers(n=3, 
# #                    layers=self.model.encoder.layer, 
# #                    std=self.config.initializer_range)
        
# #     def reinit_layers(self, layers, n=0, std=0.02):
# #         if n > 0:
# #             for layer in layers[-n:]:
# #                 for name, module in layer.named_modules():
# #                     self._init_weights(module)
            
# #             print(f"Reinitializated last {n} layers.")
#     def _init_weights(self, module):
#         if isinstance(module, nn.Linear):
#             module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
#             if module.bias is not None:
#                 module.bias.data.zero_()
#         elif isinstance(module, nn.Embedding):
#             module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
#             if module.padding_idx is not None:
#                 module.weight.data[module.padding_idx].zero_()
#         elif isinstance(module, nn.LayerNorm):
#             module.bias.data.zero_()
#             module.weight.data.fill_(1.0)

            
#     def feature(self, inputs):
#         outputs = self.model(**inputs)
#         cls_outputs = torch.stack(
#                     [self.fc_dropout(layer) for layer in outputs['hidden_states'][-24:]], dim=0
#                 ).sum(0)
        
#         logits = torch.mean(
#             torch.stack(
#                 [torch.sum(self.attention(self.high_dropout(cls_outputs)) * cls_outputs, dim=1) for _ in range(5)],
#                 dim=0,
#             ),
#             dim=0,
#         )
        
#         return logits

#     def forward(self, inputs):
#         feature = self.feature(inputs)
#         output = self.fc(feature)
#         return output


# ====================================================
# Model
# ====================================================
class CustomModel(nn.Module):
    def __init__(self, cfg, config_path=None, pretrained=False):
        super().__init__()
        self.cfg = cfg
        
        if config_path is None:
            self.config = AutoConfig.from_pretrained(cfg.model, output_hidden_states=True, return_dict=True)
        else:
            self.config = torch.load(config_path)
        if pretrained:
            self.model = AutoModel.from_pretrained(cfg.model, config=self.config)
        else:
            self.model = AutoModel.from_config(self.config)

        
        self.fc_dropout = nn.Dropout(cfg.fc_dropout)

        #### optional ####
        if CFG.reinit_layers:
            self.reinit_layers(n=CFG.reinit_layers, 
                       layers=self.model.encoder.layer, 
                       std=self.config.initializer_range)    
            
    
        if CFG.layer == 'attention':
            self.attention = nn.Sequential(
                nn.Linear(self.config.hidden_size, 512),
                nn.Tanh(),
                nn.Linear(512, 1),
                nn.Softmax(dim=1)
            )
            self._init_weights(self.attention)
            self.fc = nn.Linear(self.config.hidden_size, self.cfg.target_size)
            
        if CFG.layer == 'transformer':
            self.attention = TransformerHead(in_features=self.config.hidden_size, 
                                             max_length=CFG.max_len, 
                                             num_layers=1, 
                                             nhead=8, num_targets=1)
            self._init_weights(self.attention)
            self.fc = nn.Linear(self.attention.out_features, self.cfg.target_size)  
            
        if CFG.layer == 'poolatt':    
            self.pool = AttentionPool(self.config.hidden_size)
            self.fc = nn.Linear(self.config.hidden_size, self.cfg.target_size)  
            
        if CFG.layer == 'last_cls':
            self.fc = nn.Linear(self.config.hidden_size*2, self.cfg.target_size) 
            if 'deberta' in CFG.model: 
                self.pooler = ContextPooler(hidden_size=self.config.hidden_size, dropout=0.1)
        
        if CFG.layer == 'TextCNN':
            filter_num = 128
            filter_sizes = [2,3,4]
            self.convs = nn.ModuleList(
                [nn.Conv2d(1, filter_num, (size, self.config.hidden_size)) for size in filter_sizes])
            self.dropout = nn.Dropout(0.5)
            self.relu = nn.ReLU()
            self.bn = nn.BatchNorm1d(CFG.max_len)   
            self.linear_a = nn.Linear(len(filter_sizes) * filter_num,int(self.config.hidden_size/2)) 
            self.fc = nn.Linear(int(self.config.hidden_size/2), self.cfg.target_size)   
        if CFG.layer == 'new':            
            self.fc = nn.Linear(self.config.hidden_size, self.cfg.target_size)   
            self.layer_norm1 = nn.LayerNorm(self.config.hidden_size)
            self.fc_dropout = nn.Dropout(0)
        
        self._init_weights(self.fc)  
        
        
    def reinit_layers(self, layers, n=0, std=0.02):
        if n > 0:
            for layer in layers[-n:]:
                for name, module in layer.named_modules():
                    self._init_weights(module)
                    
    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.config.init_std)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.init_std)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
        
    def feature(self, inputs):
        outputs = self.model(**inputs)
        
        if CFG.layer == 'attention':
            last_hidden_states = outputs[0]
            weights = self.attention(last_hidden_states)
            feature = torch.sum(weights * last_hidden_states, dim=1)
        if CFG.layer == 'transformer':
            last_hidden_states = outputs[0]
            feature = self.attention(last_hidden_states)
        if CFG.layer == 'last_cls':
            sequence_output = outputs.last_hidden_state
            hidden_states = outputs.hidden_states
            if self.pooler:
                pooler_output = self.pooler(outputs[0])
            else:
                pooler_output = outputs[1]
            seq_avg = torch.mean(sequence_output, dim=1)
            feature = torch.cat((seq_avg, pooler_output), dim=1)
        if CFG.layer == 'TextCNN':    
            output = self.dropout(outputs[0])
            tcnn_input = output.unsqueeze(1)
            tcnn_output = [F.relu(conv(tcnn_input)).squeeze(3) for conv in self.convs]
            tcnn_output = [F.max_pool1d(item, item.size(2)).squeeze(2) for item in tcnn_output]
            tcnn_output = torch.cat(tcnn_output, 1)
            feature = self.dropout(tcnn_output)
            feature = self.linear_a(feature)
            outputs = self.model(**inputs)
            
        if CFG.layer == 'new':                   
            last_hidden_state = outputs[0]
            input_mask_expanded = inputs["attention_mask"].unsqueeze(-1).expand(last_hidden_state.size()).float()
            sum_embeddings = torch.sum(last_hidden_state * input_mask_expanded, 1)
            sum_mask = input_mask_expanded.sum(1)
            sum_mask = torch.clamp(sum_mask, min=1e-9)
            out = sum_embeddings / sum_mask
            feature = self.layer_norm1(out)
            
        return feature

    def forward(self, inputs):
        feature = self.feature(inputs)
        output = self.fc(self.fc_dropout(feature))
        return output




# Helpler functions

In [17]:
# ====================================================
# Helper functions
# ====================================================
class AverageMeter(object):
    """Computes and stores the average and current value"""
    def __init__(self):
        self.reset()

    def reset(self):
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count


def asMinutes(s):
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)


def timeSince(since, percent):
    now = time.time()
    s = now - since
    es = s / (percent)
    rs = es - s
    return '%s (remain %s)' % (asMinutes(s), asMinutes(rs))

def compute_kl_loss( p, q, pad_mask=None):
    
    p_loss = F.kl_div(F.log_softmax(p, dim=-1), F.softmax(q, dim=-1), reduction='none')
    q_loss = F.kl_div(F.log_softmax(q, dim=-1), F.softmax(p, dim=-1), reduction='none')
    
    # pad_mask is for seq-level tasks
    if pad_mask is not None:
        p_loss.masked_fill_(pad_mask, 0.)
        q_loss.masked_fill_(pad_mask, 0.)

    # You can choose whether to use function "sum" and "mean" depending on your task
    p_loss = p_loss.sum()
    q_loss = q_loss.sum()

    loss = (p_loss + q_loss) / 2
    return loss
def train_fn(fold, train_loader, model, criterion, optimizer, epoch, scheduler, device,
             valid_loader, best_score,valid_labels):
    model.train()
    scaler = torch.cuda.amp.GradScaler(enabled=CFG.apex)
    losses = AverageMeter()
    start = end = time.time()
    global_step = 0
    if CFG.pgd:    
        pgd = PGD(model,emb_name='word_embeddings.',epsilon=0.2,alpha=0.15)
        K = 5      
    for step, (inputs, labels) in enumerate(train_loader):
        for k, v in inputs.items():
            inputs[k] = v.to(device)
        labels = labels.to(device)
        batch_size = labels.size(0)
        if CFG.cut_mix:
            inputs = cut_mix(inputs)
        if CFG.rand_mask:
            inputs = rand_mask(inputs,tokenizer,device = 'cuda')   
        with torch.cuda.amp.autocast(enabled=CFG.apex):
            y_preds_1 = model(inputs)
            if CFG.rdrop:
                y_preds_2 = model(inputs)
            
        loss_1 = criterion(y_preds_1.view(-1, 1), labels.view(-1, 1))
        if CFG.rdrop:
            loss_2 = criterion(y_preds_2.view(-1, 1), labels.view(-1, 1))
            loss = (loss_1 + loss_2 )*0.5
            kl_loss = compute_kl_loss(loss_1,loss_2)
            loss = loss + 5 * kl_loss
        else:
            loss = loss_1
        if CFG.gradient_accumulation_steps > 1:
            loss = loss / CFG.gradient_accumulation_steps
            
        losses.update(loss.item(), batch_size)
        scaler.scale(loss).backward()
        
        if CFG.pgd:    
            pgd.backup_grad()
            # 对抗训练
            for t in range(K):
                pgd.attack(is_first_attack=(t==0)) # 在embedding上添加对抗扰动, first attack时备份param.processor
                if t != K-1:
                    model.zero_grad()
                else:
                    pgd.restore_grad()
                with torch.cuda.amp.autocast(enabled=CFG.apex):
                    loss_adv = model(inputs).view(-1, 1)
                loss_adv = criterion(loss_adv, labels.view(-1, 1))
                if CFG.gradient_accumulation_steps > 1:
                    loss_adv = loss_adv / CFG.gradient_accumulation_steps
                scaler.scale(loss_adv).backward() # 反向传播，并在正常的grad基础上，累加对抗训练的梯度
            pgd.restore()


        grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), CFG.max_grad_norm)
        if (step + 1) % CFG.gradient_accumulation_steps == 0:
            scaler.step(optimizer)
            scaler.update()
            
            optimizer.zero_grad()
            global_step += 1
            if CFG.batch_scheduler and scheduler.get_lr()[0] > CFG.min_lr:
                scheduler.step()
                
        end = time.time()
        if step % CFG.print_freq == 0 or step == (len(train_loader)-1):
            print('Epoch: [{0}][{1}/{2}] '
                  'Elapsed {remain:s} '
                  'Loss: {loss.val:.4f}({loss.avg:.4f}) '
                  #'Grad: {grad_norm:.4f}  '
                  'LR: {lr:.8f}  '
                  .format(epoch+1, step, len(train_loader), 
                          remain=timeSince(start, float(step+1)/len(train_loader)),
                          loss=losses,
                          #grad_norm=grad_norm,
                          lr=scheduler.get_lr()[0]))
        if CFG.wandb:
            wandb.log({f"[fold{fold}] loss": losses.val,
                       f"[fold{fold}] lr": scheduler.get_lr()[0]})
        if epoch >=3  or step == (len(train_loader)-1) : 
            if step % CFG.val_freq == 0 or step == (len(train_loader)-1) : 

                    avg_val_loss, predictions = valid_fn(valid_loader, 
                                                             model, 
                                                             criterion, device)                    

                    score = get_score(valid_labels, predictions)

                    if best_score < score:
                        LOGGER.info(f'Best Score {best_score:.5f} updating {score:.5f}')
                        best_score = score
                        torch.save({'model': model.state_dict(),
                                    'predictions': predictions},
                                    OUTPUT_DIR+f"{CFG.model.replace('/', '-')}_fold{fold}_best_{best_score}.pth")    
                    else:
                        LOGGER.info(f'Score: {score:.5f} ' )

    return losses.avg, best_score, avg_val_loss, predictions, score


def valid_fn(valid_loader, model, criterion, device):
    losses = AverageMeter()
    model.eval()
    preds = []
    start = end = time.time()
    for step, (inputs, labels) in enumerate(valid_loader):
        for k, v in inputs.items():
            inputs[k] = v.to(device)
        labels = labels.to(device)
        batch_size = labels.size(0)
        with torch.no_grad():
            y_preds = model(inputs)
        loss = criterion(y_preds.view(-1, 1), labels.view(-1, 1))
        if CFG.gradient_accumulation_steps > 1:
            loss = loss / CFG.gradient_accumulation_steps
        losses.update(loss.item(), batch_size)
        preds.append(y_preds.sigmoid().to('cpu').numpy())
        end = time.time()
        if step % CFG.print_freq == 0 or step == (len(valid_loader)-1):
            print('EVAL: [{0}/{1}] '
                  'Elapsed {remain:s} '
                  'Loss: {loss.val:.4f}({loss.avg:.4f}) '
                  .format(step, len(valid_loader),
                          loss=losses,
                          remain=timeSince(start, float(step+1)/len(valid_loader))))
    predictions = np.concatenate(preds)
    predictions = np.concatenate(predictions)
    return losses.avg, predictions


def inference_fn(test_loader, model, device):
    preds = []
    model.eval()
    model.to(device)
    tk0 = tqdm(test_loader, total=len(test_loader))
    for inputs in tk0:
        for k, v in inputs.items():
            inputs[k] = v.to(device)
        with torch.no_grad():
            y_preds = model(inputs)
        preds.append(y_preds.sigmoid().to('cpu').numpy())
    predictions = np.concatenate(preds)
    return predictions

In [18]:
# ====================================================
# train loop
# ====================================================
def train_loop(folds, fold):
    
    LOGGER.info(f"========== fold: {fold} training ==========")

    # ====================================================
    # loader
    # ====================================================
    train_folds = folds[folds['fold'] != fold].reset_index(drop=True)
    
    valid_folds = folds[folds['fold'] == fold].reset_index(drop=True)
    valid_labels = valid_folds['score'].values
    
    train_dataset = TrainDataset(CFG, train_folds)
    valid_dataset = TrainDataset(CFG, valid_folds)

    train_loader = DataLoader(train_dataset,
                              batch_size=CFG.batch_size,
                              shuffle=True,
                              num_workers=CFG.num_workers, pin_memory=False, drop_last=True)
    valid_loader = DataLoader(valid_dataset,
                              batch_size=int(CFG.batch_size*1.2),
                              shuffle=False,
                              num_workers=CFG.num_workers, pin_memory=False, drop_last=False)

    # ====================================================
    # model & optimizer
    # ====================================================
    model = CustomModel(CFG, config_path=None, pretrained=True)
    if CFG.mixout:
        for name, module in model.named_modules():
            if name in ['drop1', 'drop2'] and isinstance(module, nn.Dropout):
                setattr(model, name, nn.Dropout(0))
            if name in ['linear2', 'linear3'] and isinstance(module, nn.Linear):
                target_state_dict = module.state_dict()
                bias = True if module.bias is not None else False
                new_module = MixLinear(module.in_features, module.out_features, 
                                       bias, target_state_dict['weight'], 0.9)
                new_module.load_state_dict(target_state_dict)
                setattr(model, name, new_module)
            
    torch.save(model.config, OUTPUT_DIR+'config.pth')
    model.to(device)
    
    def create_optimizer(model):
        parameters = []
        lr = 3e-5
        for layer in range(23,-1,-1):
            
            layer_params = {
            'params': [p for n,p in model.named_parameters() if f'encoder.layer.{layer}.' in n],
                "weight_decay": 0.01,
            'lr': lr
                
        }
            parameters.append(layer_params)
            lr *= 0.975
        classifier_params = {
        'params': [p for n,p in model.named_parameters() if 'layer_norm' in n or 'linear' in n 
                   or 'pooling' in n],
            "weight_decay": 0.0,
        'lr': 3e-5
    }
        parameters.append(classifier_params)
        return AdamW(parameters)
    

    optimizer = create_optimizer(model)
    
    # ====================================================
    # scheduler
    # ====================================================
    def get_scheduler(cfg, optimizer, num_train_steps):
        num_warmup_steps = int(num_train_steps * 0.1)
        if cfg.scheduler == 'linear':
            scheduler = get_linear_schedule_with_warmup(
                optimizer, 
                num_warmup_steps=num_warmup_steps,
                num_training_steps=num_train_steps
            )
        elif cfg.scheduler == 'cosine':
            scheduler = get_cosine_schedule_with_warmup(
                optimizer, 
                num_warmup_steps=cfg.num_warmup_steps, 
                num_training_steps=num_train_steps,
                num_cycles=cfg.num_cycles
            )
            
        return scheduler
    
    num_train_steps = len(train_loader) * CFG.epochs
    #num_train_steps = int(len(train_folds) / CFG.batch_size * CFG.epochs)
    scheduler = get_scheduler(CFG, optimizer, num_train_steps)

    # ====================================================
    # loop
    # ====================================================
#     criterion = DiceLoss(
#                     with_logits=True,
#                     smooth=1.0,
#                     ohem_ratio=0.8,
#                     alpha=0.01,
#                     square_denominator=True,
#                     index_label_position=True,
#                     reduction="mean"
#                 )
    criterion = nn.BCEWithLogitsLoss(reduction="mean")
    best_score = 0.
    for epoch in range(CFG.epochs):

        start_time = time.time()

        # train
        avg_loss,best_score, avg_val_loss, predictions, score = train_fn(fold, 
                                                                         train_loader,
                                                                         model, 
                                                                         criterion, 
                                                                         optimizer,
                                                                         epoch, 
                                                                         scheduler, 
                                                                         device, 
                                                                         valid_loader, 
                                                                         best_score,
                                                                         valid_labels,

                                                                        )

#         # eval
#         avg_val_loss, predictions = valid_fn(valid_loader, model, criterion, device)
        
#         # scoring
#         score = get_score(valid_labels, predictions)

        elapsed = time.time() - start_time

        LOGGER.info(f'Epoch {epoch+1} - avg_train_loss: {avg_loss:.4f}  avg_val_loss: {avg_val_loss:.4f}  time: {elapsed:.0f}s')
        LOGGER.info(f'Epoch {epoch+1} - best_score: {best_score:.4f}')
        if CFG.wandb:
            wandb.log({f"[fold{fold}] epoch": epoch+1, 
                       f"[fold{fold}] avg_train_loss": avg_loss, 
                       f"[fold{fold}] avg_val_loss": avg_val_loss,
                       f"[fold{fold}] Best_score": best_score})
        


    predictions = torch.load(OUTPUT_DIR+f"{CFG.model.replace('/', '-')}_fold{fold}_best_{best_score}.pth", 
                             map_location=torch.device('cpu'))['predictions']
    valid_folds['pred'] = predictions

    torch.cuda.empty_cache()
    gc.collect()
    
    return valid_folds

In [19]:
if __name__ == '__main__':
    
    def get_result(oof_df):
        labels = oof_df['score'].values
        preds = oof_df['pred'].values
        score = get_score(labels, preds)
        LOGGER.info(f'Score: {score:<.4f}')
    
    if CFG.train:
        oof_df = pd.DataFrame()
        for fold in range(CFG.n_fold):
            if fold in CFG.trn_fold:
                _oof_df = train_loop(train, fold)
                oof_df = pd.concat([oof_df, _oof_df])
                LOGGER.info(f"========== fold: {fold} result ==========")
                get_result(_oof_df)
        oof_df = oof_df.reset_index(drop=True)
        LOGGER.info(f"========== CV ==========")
        get_result(oof_df)
        oof_df.to_pickle(OUTPUT_DIR+'oof_df.pkl')
        
    if CFG.wandb:
        wandb.finish()



Epoch: [1][0/455] Elapsed 0m 6s (remain 45m 51s) Loss: 0.5370(0.5370) LR: 0.00003000  
Epoch: [1][100/455] Elapsed 0m 57s (remain 3m 23s) Loss: 0.3573(0.3624) LR: 0.00002996  
Epoch: [1][200/455] Elapsed 1m 49s (remain 2m 18s) Loss: 0.3304(0.3455) LR: 0.00002986  
Epoch: [1][300/455] Elapsed 2m 41s (remain 1m 22s) Loss: 0.3215(0.3401) LR: 0.00002968  
Epoch: [1][400/455] Elapsed 3m 33s (remain 0m 28s) Loss: 0.3239(0.3370) LR: 0.00002943  
Epoch: [1][454/455] Elapsed 4m 1s (remain 0m 0s) Loss: 0.3469(0.3358) LR: 0.00002927  
EVAL: [0/96] Elapsed 0m 4s (remain 7m 40s) Loss: 0.3201(0.3201) 


Best Score 0.00000 updating 0.10372


EVAL: [95/96] Elapsed 0m 23s (remain 0m 0s) Loss: 0.3328(0.3267) 


Epoch 1 - avg_train_loss: 0.3358  avg_val_loss: 0.3267  time: 269s
Epoch 1 - best_score: 0.1037


Epoch: [2][0/455] Elapsed 0m 5s (remain 38m 33s) Loss: 0.3228(0.3228) LR: 0.00002927  
Epoch: [2][100/455] Elapsed 0m 56s (remain 3m 19s) Loss: 0.3277(0.3267) LR: 0.00002892  
Epoch: [2][200/455] Elapsed 1m 48s (remain 2m 17s) Loss: 0.3154(0.3268) LR: 0.00002850  
Epoch: [2][300/455] Elapsed 2m 40s (remain 1m 22s) Loss: 0.3430(0.3262) LR: 0.00002801  
Epoch: [2][400/455] Elapsed 3m 32s (remain 0m 28s) Loss: 0.3206(0.3257) LR: 0.00002747  
Epoch: [2][454/455] Elapsed 4m 0s (remain 0m 0s) Loss: 0.3208(0.3257) LR: 0.00002715  
EVAL: [0/96] Elapsed 0m 4s (remain 7m 45s) Loss: 0.3196(0.3196) 


Best Score 0.10372 updating 0.24394


EVAL: [95/96] Elapsed 0m 23s (remain 0m 0s) Loss: 0.3298(0.3238) 


Epoch 2 - avg_train_loss: 0.3257  avg_val_loss: 0.3238  time: 268s
Epoch 2 - best_score: 0.2439


Epoch: [3][0/455] Elapsed 0m 5s (remain 38m 33s) Loss: 0.3113(0.3113) LR: 0.00002715  
Epoch: [3][100/455] Elapsed 0m 57s (remain 3m 19s) Loss: 0.3224(0.3238) LR: 0.00002651  
Epoch: [3][200/455] Elapsed 1m 48s (remain 2m 17s) Loss: 0.3187(0.3241) LR: 0.00002582  
Epoch: [3][300/455] Elapsed 2m 40s (remain 1m 22s) Loss: 0.3146(0.3233) LR: 0.00002508  
Epoch: [3][400/455] Elapsed 3m 32s (remain 0m 28s) Loss: 0.3416(0.3229) LR: 0.00002429  
Epoch: [3][454/455] Elapsed 4m 0s (remain 0m 0s) Loss: 0.3263(0.3228) LR: 0.00002384  
EVAL: [0/96] Elapsed 0m 4s (remain 7m 39s) Loss: 0.3181(0.3181) 


Best Score 0.24394 updating 0.36635


EVAL: [95/96] Elapsed 0m 23s (remain 0m 0s) Loss: 0.3254(0.3198) 


Epoch 3 - avg_train_loss: 0.3228  avg_val_loss: 0.3198  time: 268s
Epoch 3 - best_score: 0.3663


Epoch: [4][0/455] Elapsed 0m 5s (remain 38m 46s) Loss: 0.3293(0.3293) LR: 0.00002384  
EVAL: [0/96] Elapsed 0m 4s (remain 7m 33s) Loss: 0.3181(0.3181) 


Score: 0.36635 


EVAL: [95/96] Elapsed 0m 22s (remain 0m 0s) Loss: 0.3254(0.3198) 
EVAL: [0/96] Elapsed 0m 4s (remain 7m 37s) Loss: 0.3179(0.3179) 


Best Score 0.36635 updating 0.36941


EVAL: [95/96] Elapsed 0m 23s (remain 0m 0s) Loss: 0.3252(0.3196) 
EVAL: [0/96] Elapsed 0m 4s (remain 7m 43s) Loss: 0.3171(0.3171) 


Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x7f0450fe1af0>
Traceback (most recent call last):
  File "/nfs/home/wangmingjie/.conda/envs/baseline/lib/python3.8/site-packages/torch/utils/data/dataloader.py", line 1328, in __del__
    self._shutdown_workers()
  File "/nfs/home/wangmingjie/.conda/envs/baseline/lib/python3.8/site-packages/torch/utils/data/dataloader.py", line 1301, in _shutdown_workers
    w.join(timeout=_utils.MP_STATUS_CHECK_INTERVAL)
  File "/nfs/home/wangmingjie/.conda/envs/baseline/lib/python3.8/multiprocessing/process.py", line 149, in join
    res = self._popen.wait(timeout)
  File "/nfs/home/wangmingjie/.conda/envs/baseline/lib/python3.8/multiprocessing/popen_fork.py", line 44, in wait
    if not wait([self.sentinel], timeout):
  File "/nfs/home/wangmingjie/.conda/envs/baseline/lib/python3.8/multiprocessing/connection.py", line 931, in wait
    ready = selector.select(timeout)
  File "/nfs/home/wangmingjie/.conda/envs/baseline/lib/pyth

Traceback (most recent call last):
  File "/nfs/home/wangmingjie/.conda/envs/baseline/lib/python3.8/site-packages/IPython/core/interactiveshell.py", line 3441, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "/tmp/ipykernel_51964/4252808389.py", line 13, in <module>
    _oof_df = train_loop(train, fold)
  File "/tmp/ipykernel_51964/666696628.py", line 116, in train_loop
    avg_loss,best_score, avg_val_loss, predictions, score = train_fn(fold,
  File "/tmp/ipykernel_51964/1325986819.py", line 135, in train_fn
    avg_val_loss, predictions = valid_fn(valid_loader,
  File "/tmp/ipykernel_51964/1325986819.py", line 164, in valid_fn
    y_preds = model(inputs)
  File "/nfs/home/wangmingjie/.conda/envs/baseline/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1051, in _call_impl
    return forward_call(*input, **kwargs)
  File "/tmp/ipykernel_51964/1337847800.py", line 203, in forward
    feature = self.feature(inputs)
  File "/tmp/ipykernel_51964/13378

TypeError: object of type 'NoneType' has no len()