In [1]:
import os
import pathlib
import gc
import cv2
import time
import random
import pandas as pd
import numpy as np
import copy

## Pytorch Import
import torch 
import torch.nn as nn
## PyTorch optimizers, used to update the model parameters
import torch.optim as optim
## Learning-rate schedulers that lower the learning rate as training progresses
from torch.optim import lr_scheduler
from torch.utils.data import Dataset, DataLoader
from torch.optim.lr_scheduler import OneCycleLR

from tqdm import tqdm
from collections import defaultdict

## transformer 
from transformers import AdamW,AutoTokenizer,AutoModel,AutoConfig

## import Scikit
from sklearn.model_selection import KFold,StratifiedKFold
from transformers import  XLNetTokenizer, XLNetModel, TFXLNetModel, XLNetLMHeadModel, XLNetConfig, XLNetForSequenceClassification
## color 
from colorama import Fore,Back,Style

# from DeBERTa import deberta


# ANSI colour codes used to highlight console messages:
# `red` marks validation-loss improvements, `blue` marks fold headers,
# and `set_all` resets the terminal styling afterwards.
red = Fore.RED
blue = Fore.BLUE
set_all = Style.RESET_ALL

import warnings 
# Silence every warning category for the rest of the session.
warnings.filterwarnings('ignore')

import sys
# Make the local copy of the `tez` library importable.
# NOTE(review): nothing in the visible cells imports from it - confirm it is still needed.
sys.path.append("../input/tez-lib/")

# For descriptive error messages
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"

# Pair-wise training data; the dataset class reads its `more_toxic` and
# `less_toxic` columns, and a `kfold` column is added to it further down.
vali = pd.read_csv('classification-pseudo-label/classification_pairs_891.csv')
# tmp_valid = pd.read_csv('../input/jigsaw-toxic-severity-rating/validation_data.csv')

# vali = pd.merge(vali, tmp_valid, how='left', left_on=['less_toxic', 'more_toxic'], right_on=['less_toxic', 'more_toxic'])

2021-12-20 04:03:23.157737: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.11.0


In [2]:
class Config:
    """Central collection of hyper-parameters and paths read by the other cells.

    Everything is a plain class attribute, so values are accessed as
    ``Config.<name>`` without instantiating the class.
    """

    # ---- pretrained backbone ----
    model_name_or_path = '../input/xlnet-base-cased'
    hidden_size = 768
    num_hidden_layers = 12

    # ---- data, batching and cross-validation ----
    max_length = 128
    train_bs = 32
    valid_bs = 64
    n_fold = 5
    seed = 42

    # ---- optimisation ----
    epochs = 2
    lr = 1e-4
    min_lr = 1e-7
    weight_decay = 1e-4  # weight decay (L2-style regularisation) to reduce over-fitting
    # Gradient-norm ceiling meant to tame exploding gradients: gradients whose
    # L2 norm exceeds it would be scaled down proportionally.
    max_grad_norm = 1.0
    # Number of batches accumulated per optimizer step. The misspelling is
    # kept on purpose because other cells read the attribute under this name.
    n_accululate = 1
    epsilon = 1e-6

    # ---- learning-rate schedule (decay strategy) ----
    scheduler = 'CosineAnnealingLR'
    T_max = 500
    T_0 = 5

    # ---- loss and regression head ----
    margin = 0.5
    num_classes = 1
    dropout = 0.2

    # ---- hardware ----
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    
# Shared tokenizer handed to every dataset instance.
# NOTE(review): it is loaded from a different directory than
# Config.model_name_or_path - confirm both hold the same XLNet vocabulary.
tokenizer = XLNetTokenizer.from_pretrained('../input/xlnetbasecased/xlnet_cased_L-12_H-768_A-12')
# NOTE(review): `config` is not read by any live code in the visible cells.
config = XLNetConfig.from_pretrained('../input/xlnet-base-cased')

In [3]:
def set_seed(seed=42):
    '''Seed every random number generator the notebook relies on.

    Covers Python's built-in ``random`` module, NumPy's legacy global
    generator and PyTorch (CPU plus every CUDA device), and switches cuDNN to
    deterministic kernels so repeated runs produce the same numbers.

    Args:
        seed: Integer seed applied to all generators.
    '''
    # Python's own generator was previously left unseeded.
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seeds every visible GPU, not only the current one; a no-op without CUDA.
    torch.cuda.manual_seed_all(seed)
    # When running on the CuDNN backend, two further options must be set
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    # Set a fixed value for the hash seed. Python reads this variable at
    # interpreter start-up, so it only affects processes launched from here
    # on, not the hashing of the already-running kernel.
    os.environ['PYTHONHASHSEED'] = str(seed)
    
# Seed the generators once, before the fold split and any model construction below.
set_seed(Config.seed)

In [4]:
# Assign every row of `vali` to one of Config.n_fold validation folds.
# kf = StratifiedKFold(n_splits=Config.n_fold, shuffle=True, random_state=Config.seed)
kf = KFold(n_splits=Config.n_fold, shuffle=True, random_state=Config.seed)
for fold, ( _, val_) in enumerate(kf.split(X=vali)):
    # `val_` holds positional row numbers while `.loc` selects by label, so this
    # relies on `vali` still having the default 0..n-1 index from read_csv.
    vali.loc[val_ , "kfold"] = int(fold)
    
# The column is created by partial assignment above; cast it to plain integers.
vali["kfold"] = vali["kfold"].astype(int)

In [5]:
class JDataset(Dataset):
    """Pair-wise dataset: each item is one (more toxic, less toxic) comment pair.

    Both comments are tokenized on access with the same settings (truncated
    and padded to ``max_length``) and returned as ``torch.long`` tensors,
    together with a constant target of 1.

    Args:
        vali: DataFrame providing ``more_toxic`` and ``less_toxic`` columns.
        tokenizer: Object exposing ``encode_plus``; its result must contain
            ``input_ids``, ``attention_mask`` and ``token_type_ids``.
        max_length: Length every encoded comment is truncated/padded to.
    """

    def __init__(self, vali, tokenizer, max_length):
        self.vali = vali
        self.max_len = max_length
        self.tokenizer = tokenizer
        self.more_toxic = vali['more_toxic'].values
        self.less_toxic = vali['less_toxic'].values

    def __len__(self):
        return len(self.vali)

    def _encode(self, text):
        """Tokenize one comment with the settings shared by both sides of a pair."""
        return self.tokenizer.encode_plus(
            text,
            truncation=True,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            return_attention_mask=True,
            return_token_type_ids=True,
        )

    def __getitem__(self, index):
        more = self._encode(self.more_toxic[index])
        less = self._encode(self.less_toxic[index])

        def as_long(values):
            return torch.tensor(values, dtype=torch.long)

        # The target is always 1: the first element of the pair is the one
        # that should receive the higher score.
        return {
            'more_toxic_ids': as_long(more['input_ids']),
            'more_toxic_mask': as_long(more['attention_mask']),
            'more_toxic_token_type_ids': as_long(more['token_type_ids']),
            'less_toxic_ids': as_long(less['input_ids']),
            'less_toxic_mask': as_long(less['attention_mask']),
            'less_toxic_token_type_ids': as_long(less['token_type_ids']),
            'target': as_long(1),
        }

In [6]:
class XLNetBaseModel(nn.Module):
    """XLNet backbone followed by a small head that emits one score per input.

    Pipeline: backbone -> mean over dimension 1 of the backbone's first
    output -> LayerNorm -> Dropout -> Linear(hidden, 256) -> LayerNorm ->
    LeakyReLU -> Dropout -> Linear(256, 1).

    Args:
        checkpoint: Path or name passed to ``XLNetModel.from_pretrained``.
    """

    def __init__(self, checkpoint=Config.model_name_or_path):
        super().__init__()
        self.checkpoint = checkpoint
        # return_dict=False makes the backbone return a tuple instead of a
        # ModelOutput object; pool_hidden_state() indexes element 0 of it.
        self.xlnet = XLNetModel.from_pretrained(checkpoint, return_dict=False)
        self.layer_norm = nn.LayerNorm(Config.hidden_size)
        self.dropout = nn.Dropout(Config.dropout)
        head_layers = [
            nn.Linear(Config.hidden_size, 256),
            nn.LayerNorm(256),
            nn.LeakyReLU(),
            nn.Dropout(Config.dropout),
            nn.Linear(256, 1),
        ]
        self.dense = nn.Sequential(*head_layers)

    def pool_hidden_state(self, last_hidden_state):
        '''
        Average the first element of the backbone output over dimension 1.

        The attention mask is not consulted here, so every position
        contributes equally to the mean.
        '''
        token_states = last_hidden_state[0]
        return token_states.mean(dim=1)

    def forward(self, input_ids, token_type_ids, attention_mask):
        """Return the head's output for a batch (last dimension has size 1)."""
        backbone_out = self.xlnet(
            input_ids=input_ids,
            token_type_ids=token_type_ids,
            attention_mask=attention_mask,
        )
        features = self.pool_hidden_state(backbone_out)
        features = self.dropout(self.layer_norm(features))
        return self.dense(features)

In [7]:
def criterion(output1, output2, targets):
    """Margin ranking loss between two batches of scores.

    Args:
        output1: Scores expected to rank higher where ``targets`` is 1.
        output2: Scores expected to rank lower where ``targets`` is 1.
        targets: One entry per pair, 1 or -1.

    Returns:
        Scalar tensor: mean of ``max(0, -target * (output1 - output2) + margin)``
        with ``margin`` taken from ``Config.margin``.
    """
    # The model emits scores with a trailing singleton dimension while the
    # targets are 1-D. Left as-is the loss would broadcast every score
    # against every target (a batch x batch matrix), which is only harmless
    # while all targets are identical, and recent PyTorch releases reject the
    # rank mismatch. Flattening pairs each score with its own target.
    scores_first = output1.reshape(-1)
    scores_second = output2.reshape(-1)
    signs = targets.reshape(-1).to(scores_first.dtype)
    return nn.MarginRankingLoss(margin=Config.margin)(scores_first, scores_second, signs)

In [8]:
class MetricMonitor:
    """Tracks a running sum, update count and mean for any number of named metrics.

    Args:
        float_precision: Number of decimals used when rendering the means.
    """

    def __init__(self, float_precision=4):
        self.float_precision = float_precision
        self.reset()

    def reset(self):
        """Forget everything recorded so far."""
        def fresh_entry():
            # "val" accumulates the sum of all reported values.
            return {"val": 0, "count": 0, "avg": 0}

        self.metrics = defaultdict(fresh_entry)

    def update(self, metric_name, val):
        """Add ``val`` to ``metric_name`` and refresh its running mean."""
        entry = self.metrics[metric_name]
        entry["val"] += val
        entry["count"] += 1
        entry["avg"] = entry["val"] / entry["count"]

    def __str__(self):
        template = "{metric_name}: {avg:.{float_precision}f}"
        rendered = (
            template.format(
                metric_name=name,
                avg=entry["avg"],
                float_precision=self.float_precision,
            )
            for name, entry in self.metrics.items()
        )
        return " | ".join(rendered)

In [9]:
def train_one_epoch(model, optimizer, scheduler, dataloader, device, epoch):
    """Train ``model`` for one pass over ``dataloader``.

    Each batch holds a (more toxic, less toxic) pair; both sides are scored by
    the same model and compared with ``criterion``. Gradients are accumulated
    over ``Config.n_accululate`` batches before every optimizer step.

    Args:
        model: Called as ``model(input_ids, token_type_ids, attention_mask)``.
        optimizer: Optimizer updating ``model``'s parameters.
        scheduler: Stepped after every optimizer step; may be ``None``.
        dataloader: Sized iterable of batches keyed like ``JDataset`` items.
        device: Device the batch tensors are moved to.
        epoch: Epoch number, only shown in the progress bar.

    Returns:
        Sample-weighted mean of the (unscaled) batch losses, or ``0.0`` when
        the dataloader yields no batches.
    """
    # Training mode: enables dropout (and batch-norm updates, if any).
    model.train()

    accumulate = Config.n_accululate
    num_batches = len(dataloader)

    dataset_size = 0
    running_loss = 0.0
    # Defined up front so an empty dataloader returns 0.0 instead of failing
    # on an unbound variable.
    epoch_loss = 0.0

    bar = tqdm(enumerate(dataloader), total=num_batches)

    for step, data in bar:
        more_toxic_ids = data['more_toxic_ids'].to(device, dtype = torch.long)
        more_toxic_mask = data['more_toxic_mask'].to(device, dtype = torch.long)
        more_token_type_ids = data['more_toxic_token_type_ids'].to(device, dtype = torch.long)

        less_toxic_ids = data['less_toxic_ids'].to(device, dtype = torch.long)
        less_toxic_mask = data['less_toxic_mask'].to(device, dtype = torch.long)
        less_token_type_ids = data['less_toxic_token_type_ids'].to(device, dtype = torch.long)

        targets = data['target'].to(device, dtype=torch.long)

        batch_size = more_toxic_ids.size(0)

        more_toxic_outputs = model(more_toxic_ids, more_token_type_ids, more_toxic_mask)
        less_toxic_outputs = model(less_toxic_ids, less_token_type_ids, less_toxic_mask)

        loss = criterion(more_toxic_outputs, less_toxic_outputs, targets)

        # Gradient accumulation: only the back-propagated value is scaled, so
        # the loss reported below stays comparable across accumulation settings.
        (loss / accumulate).backward()

        # Step on every full accumulation window, and also on the final batch
        # so a trailing partial window is applied instead of leaking its
        # gradients into the next epoch.
        window_complete = (step + 1) % accumulate == 0
        is_last_batch = (step + 1) == num_batches
        if window_complete or is_last_batch:
            optimizer.step()

            # zero the parameter gradients
            optimizer.zero_grad()

            if scheduler is not None:
                scheduler.step()

        running_loss += (loss.item() * batch_size)
        dataset_size += batch_size

        epoch_loss = running_loss / dataset_size

        bar.set_postfix(Epoch=epoch, Train_Loss=epoch_loss,
                        LR=optimizer.param_groups[0]['lr'])

    # NOTE(review): Config.max_grad_norm is defined but no gradient clipping is
    # applied in this loop - confirm whether clipping was intended.
    gc.collect()

    return epoch_loss

In [10]:
@torch.no_grad()
def valid_one_epoch(model, dataloader, device, epoch, optimizer=None):
    """Evaluate ``model`` on ``dataloader`` without tracking gradients.

    Args:
        model: Called as ``model(input_ids, token_type_ids, attention_mask)``.
        dataloader: Sized iterable of batches keyed like ``JDataset`` items.
        device: Device the batch tensors are moved to.
        epoch: Epoch number, only shown in the progress bar.
        optimizer: Optional. When given, its current learning rate is shown
            in the progress bar. This used to be read from an undeclared
            global named ``optimizer``, which made the function fail with a
            NameError whenever that global did not exist.

    Returns:
        Sample-weighted mean of the batch losses, or ``0.0`` when the
        dataloader yields no batches.
    """
    # Evaluation mode: dropout is disabled and normalization layers use their
    # stored statistics.
    model.eval()

    dataset_size = 0
    running_loss = 0.0
    # Defined up front so an empty dataloader returns 0.0 instead of failing
    # on an unbound variable.
    epoch_loss = 0.0

    bar = tqdm(enumerate(dataloader), total=len(dataloader))

    for step, data in bar:
        more_toxic_ids = data['more_toxic_ids'].to(device, dtype = torch.long)
        more_toxic_mask = data['more_toxic_mask'].to(device, dtype = torch.long)
        more_token_type_ids = data['more_toxic_token_type_ids'].to(device, dtype = torch.long)

        less_toxic_ids = data['less_toxic_ids'].to(device, dtype = torch.long)
        less_toxic_mask = data['less_toxic_mask'].to(device, dtype = torch.long)
        less_token_type_ids = data['less_toxic_token_type_ids'].to(device, dtype = torch.long)
        targets = data['target'].to(device, dtype=torch.long)

        batch_size = more_toxic_ids.size(0)

        more_toxic_outputs = model(more_toxic_ids, more_token_type_ids, more_toxic_mask)
        less_toxic_outputs = model(less_toxic_ids, less_token_type_ids, less_toxic_mask)

        loss = criterion(more_toxic_outputs, less_toxic_outputs, targets)

        running_loss += (loss.item() * batch_size)
        dataset_size += batch_size

        epoch_loss = running_loss / dataset_size

        postfix = dict(Epoch=epoch, Valid_Loss=epoch_loss)
        if optimizer is not None:
            postfix['LR'] = optimizer.param_groups[0]['lr']
        bar.set_postfix(**postfix)

    gc.collect()

    return epoch_loss

In [11]:
# NOTE(review): nothing in the visible cells appends to or reads this list;
# the best weights of each fold are written to disk by run_training instead.
best_models_of_each_fold = []

In [12]:
def run_training(model, optimizer, scheduler, device, num_epochs, fold,
                 train_loader=None, valid_loader=None):
    """Train for ``num_epochs`` epochs and keep the weights with the lowest validation loss.

    After every epoch the validation loss is compared with the best so far;
    on improvement (or a tie) the weights are snapshotted in memory and saved
    to ``Loss-Fold-{fold}.bin`` in the current directory. The best snapshot is
    loaded back into ``model`` before returning.

    Args:
        model: Model to train (modified in place).
        optimizer: Optimizer forwarded to ``train_one_epoch``.
        scheduler: Scheduler forwarded to ``train_one_epoch``; may be ``None``.
        device: Device forwarded to the train/validation helpers. It used to
            be ignored in favour of ``Config.device``.
        num_epochs: Number of epochs to run.
        fold: Fold identifier, used only in the checkpoint file name.
        train_loader: Training batches. When omitted, the module-level
            ``train_loader`` is used, which keeps older call sites working.
        valid_loader: Validation batches, with the same fallback rule.

    Returns:
        ``(model, history)`` where ``history`` maps ``'Train Loss'`` and
        ``'Valid Loss'`` to lists with one entry per epoch.

    Raises:
        ValueError: If a loader is neither passed in nor defined globally.
    """
    # Prefer explicit arguments; fall back to the notebook-level globals the
    # original implementation depended on.
    module_scope = globals()
    if train_loader is None:
        train_loader = module_scope.get("train_loader")
    if valid_loader is None:
        valid_loader = module_scope.get("valid_loader")
    if train_loader is None or valid_loader is None:
        raise ValueError(
            "run_training needs train_loader and valid_loader, either as "
            "arguments or as module-level variables"
        )

    if torch.cuda.is_available():
        print("[INFO] Using GPU: {}\n".format(torch.cuda.get_device_name()))

    start = time.time()
    best_model_wts = copy.deepcopy(model.state_dict())
    best_epoch_loss = np.inf
    history = defaultdict(list)

    for epoch in range(1, num_epochs + 1):
        gc.collect()
        train_epoch_loss = train_one_epoch(model, optimizer, scheduler,
                                           dataloader=train_loader,
                                           device=device, epoch=epoch)

        val_epoch_loss = valid_one_epoch(model, valid_loader, device=device,
                                         epoch=epoch)

        history['Train Loss'].append(train_epoch_loss)
        history['Valid Loss'].append(val_epoch_loss)

        # Snapshot the weights whenever validation loss does not get worse.
        if val_epoch_loss <= best_epoch_loss:
            print(f"{red}Validation Loss Improved ({best_epoch_loss} ---> {val_epoch_loss})")
            best_epoch_loss = val_epoch_loss
            best_model_wts = copy.deepcopy(model.state_dict())
            PATH = f"Loss-Fold-{fold}.bin"
            # Written relative to the current working directory.
            torch.save(model.state_dict(), PATH)
            print(f"Model Saved{set_all}")

        print()

    end = time.time()
    time_elapsed = end - start
    print('Training complete in {:.0f}h {:.0f}m {:.0f}s'.format(
        time_elapsed // 3600, (time_elapsed % 3600) // 60, (time_elapsed % 3600) % 60))
    print("Best Loss: {:.4f}".format(best_epoch_loss))

    # load best model weights
    model.load_state_dict(best_model_wts)

    return model, history

In [13]:
def prepare_loaders(fold):
    """Build the training and validation DataLoaders for one cross-validation fold.

    Rows of the global ``vali`` frame whose ``kfold`` equals ``fold`` form the
    validation split; every other row is used for training.

    Args:
        fold: Value of the ``kfold`` column to hold out.

    Returns:
        ``(train_loader, valid_loader)``. The training loader shuffles and
        drops the last incomplete batch; the validation loader does neither.
    """
    held_out = vali[vali.kfold == fold].reset_index(drop=True)
    remaining = vali[vali.kfold != fold].reset_index(drop=True)

    def as_dataset(frame):
        return JDataset(frame, tokenizer=tokenizer, max_length=Config.max_length)

    # Settings shared by both loaders.
    shared = dict(num_workers=8, pin_memory=True)

    train_loader = DataLoader(as_dataset(remaining), batch_size=Config.train_bs,
                              shuffle=True, drop_last=True, **shared)
    valid_loader = DataLoader(as_dataset(held_out), batch_size=Config.valid_bs,
                              shuffle=False, **shared)

    return train_loader, valid_loader

In [14]:
def fetch_scheduler(optimizer):
    """Create the learning-rate scheduler selected by ``Config.scheduler``.

    Supported values:
        * ``'CosineAnnealingLR'`` - uses ``Config.T_max`` and ``Config.min_lr``.
        * ``'CosineAnnealingWarmRestarts'`` - uses ``Config.T_0`` and ``Config.min_lr``.
        * ``None`` - no scheduler.

    Args:
        optimizer: Optimizer whose learning rate the scheduler will drive.

    Returns:
        The scheduler instance, or ``None`` when scheduling is disabled.

    Raises:
        ValueError: If ``Config.scheduler`` is not one of the supported values.
    """
    name = Config.scheduler

    # Checked first so that "no scheduler" never touches the other branches.
    if name is None:
        return None

    if name == 'CosineAnnealingLR':
        return lr_scheduler.CosineAnnealingLR(optimizer, T_max=Config.T_max,
                                              eta_min=Config.min_lr)

    # This branch previously referenced the misspelled name `Configscheduler`
    # and hard-coded T_0 instead of reading Config.T_0.
    if name == 'CosineAnnealingWarmRestarts':
        return lr_scheduler.CosineAnnealingWarmRestarts(optimizer, T_0=Config.T_0,
                                                        eta_min=Config.min_lr)

    raise ValueError(f"Unsupported Config.scheduler value: {name!r}")

In [15]:
def create_optimizer(model):
    """Build an AdamW optimizer with layer-wise learning rates for the backbone.

    Parameters are grouped by *name* rather than by position in
    ``named_parameters()``. The earlier positional slices (``[:197]``,
    ``[199:203]``, ``[203:]`` and the 69/133 thresholds) silently left
    parameters 197 and 198 out of the optimizer and presumably targeted a
    different architecture, so they did not line up with this model's layers.

    Grouping rules:
        * Backbone parameters (names starting with ``"xlnet."``) each get their
          own group. The learning rate is ``Config.lr`` for the first third of
          the backbone layers (and for parameters outside any numbered layer),
          ``2.5 * Config.lr`` for the middle third and ``5 * Config.lr`` for
          the last third. Weight decay is 0.0 when the name contains
          ``"bias"`` and 0.01 otherwise.
        * All remaining (head) parameters share one group that uses AdamW's
          default settings.

    Args:
        model: Module whose backbone is registered under the attribute ``xlnet``.

    Returns:
        A ``torch.optim.AdamW`` instance covering every parameter of ``model``.
    """
    backbone_prefix = "xlnet."
    lr_multipliers = (1.0, 2.5, 5.0)

    def layer_index(name):
        # Finds N in names shaped like "<...>.layer.N.<...>"; None if absent.
        parts = name.split(".")
        for position, part in enumerate(parts[:-1]):
            if part == "layer" and parts[position + 1].isdigit():
                return int(parts[position + 1])
        return None

    backbone, head = [], []
    for name, params in model.named_parameters():
        target = backbone if name.startswith(backbone_prefix) else head
        target.append((name, params))

    found_layers = [layer_index(name) for name, _ in backbone]
    found_layers = [index for index in found_layers if index is not None]
    num_layers = max(found_layers) + 1 if found_layers else 0

    parameters = []
    if head:
        parameters.append({"params": [params for _, params in head]})

    for name, params in backbone:
        index = layer_index(name)
        if index is None or num_layers == 0:
            multiplier = lr_multipliers[0]
        else:
            # Maps layer indices onto thirds of the stack: 0, 1 or 2.
            multiplier = lr_multipliers[min(index * 3 // num_layers, 2)]

        parameters.append({"params": [params],
                           "weight_decay": 0.0 if "bias" in name else 0.01,
                           "lr": Config.lr * multiplier})

    return optim.AdamW(parameters)

In [16]:
%%time
# Train one fresh model per cross-validation fold.
for fold in range(0, Config.n_fold):
    print(f"{blue}====== Fold: {fold} ======{set_all}")

    # Create Dataloaders
    # Keep these exact variable names: helper functions defined in earlier
    # cells may look up `train_loader`, `valid_loader` and `optimizer` as
    # module-level globals.
    train_loader, valid_loader = prepare_loaders(fold=fold)
#     config  = AutoConfig.from_pretrained(Config.pretrained_model_path)
#     transformer = AutoModel.from_pretrained(Config.pretrained_model_path, config=config) 
    
    # A new model is built for every fold so no weights carry over between folds.
    model = XLNetBaseModel(Config.model_name_or_path)
    model.to(Config.device)
    
    # Define Optimizer and Scheduler
    # `AdamW` here is the class imported from `transformers`; create_optimizer()
    # from the earlier cell is not used.
    optimizer = AdamW(model.parameters(), lr=Config.lr, weight_decay=Config.weight_decay)
    # NOTE(review): the scheduler is stepped once per optimizer step while
    # Config.T_max is 500, and the logged epochs have 3750 batches - confirm
    # the resulting learning-rate curve is the intended one.
    scheduler = fetch_scheduler(optimizer)
    
    model, history = run_training(model, optimizer, scheduler,
                                  device=Config.device,
                                  num_epochs=Config.epochs,
                                  fold=fold)
    
    # Drop references before the next fold so their memory can be reclaimed.
    del model, history, train_loader, valid_loader
    _ = gc.collect()
    print()

[INFO] Using GPU: NVIDIA GeForce RTX 3090



100%|█| 3750/3750 [29:25<00:00,  2.12it/s, 
100%|█| 469/469 [02:25<00:00,  3.22it/s, Ep


[31mValidation Loss Improved (inf ---> 0.1133230497876803)
Model Saved[0m



100%|█| 3750/3750 [29:25<00:00,  2.12it/s, 
100%|█| 469/469 [02:25<00:00,  3.22it/s, Ep


[31mValidation Loss Improved (0.1133230497876803 ---> 0.09854290266036987)
Model Saved[0m

Training complete in 1h 3m 49s
Best Loss: 0.0985

[INFO] Using GPU: NVIDIA GeForce RTX 3090



100%|█| 3750/3750 [29:26<00:00,  2.12it/s, 
100%|█| 469/469 [02:25<00:00,  3.22it/s, Ep


[31mValidation Loss Improved (inf ---> 0.10242218701839446)
Model Saved[0m



100%|█| 3750/3750 [29:24<00:00,  2.12it/s, 
100%|█| 469/469 [02:25<00:00,  3.22it/s, Ep


[31mValidation Loss Improved (0.10242218701839446 ---> 0.09556384704113007)
Model Saved[0m

Training complete in 1h 3m 49s
Best Loss: 0.0956

[INFO] Using GPU: NVIDIA GeForce RTX 3090



100%|█| 3750/3750 [29:27<00:00,  2.12it/s, 
100%|█| 469/469 [02:26<00:00,  3.21it/s, Ep


[31mValidation Loss Improved (inf ---> 0.099626016664505)
Model Saved[0m



100%|█| 3750/3750 [29:30<00:00,  2.12it/s, 
100%|█| 469/469 [02:25<00:00,  3.22it/s, Ep



Training complete in 1h 3m 56s
Best Loss: 0.0996

[INFO] Using GPU: NVIDIA GeForce RTX 3090



100%|█| 3750/3750 [29:25<00:00,  2.12it/s, 
100%|█| 469/469 [02:25<00:00,  3.21it/s, Ep


[31mValidation Loss Improved (inf ---> 0.10262608381708463)
Model Saved[0m



100%|█| 3750/3750 [29:28<00:00,  2.12it/s, 
100%|█| 469/469 [02:25<00:00,  3.21it/s, Ep



Training complete in 1h 3m 52s
Best Loss: 0.1026

[INFO] Using GPU: NVIDIA GeForce RTX 3090



100%|█| 3750/3750 [29:26<00:00,  2.12it/s, 
100%|█| 469/469 [02:31<00:00,  3.10it/s, Ep


[31mValidation Loss Improved (inf ---> 0.09828754281202952)
Model Saved[0m



100%|█| 3750/3750 [29:29<00:00,  2.12it/s, 
100%|█| 469/469 [02:31<00:00,  3.10it/s, Ep



Training complete in 1h 4m 4s
Best Loss: 0.0983

CPU times: user 10h 43min 46s, sys: 4min 44s, total: 10h 48min 31s
Wall time: 5h 19min 41s
